上篇文章发了salt-minion的监控代码 http://6252961.blog.51cto.com/6242961/1710977 ,在监控跑完并列出全部有问题的客户端列表以后,如果手动一个个去修复,很费事,服务器太多,因此写了这个自动修复的代码,解放双手。
代码逻辑:
一、首先从数据库读取minion端有问题的服务器,如果数量超过100,则中止自动修复,没有超过则继续(这个没有在代码中实现,不过也很简单,只需要判断一下列表长度即可)。
二、检测服务器的ping,如果ping通,继续,否则保存错误信息,中止自动修复。
三、检测服务器的ssh登录状态,如果可以登录并且命令‘date’执行成功,继续,否则保存错误信息,中止自动修复。
四、检查服务器的nfs挂载状态,如果挂载异常,先卸载nfs,再继续执行(因为服务器有几千台,经常会出现服务器挂载的nfs的ip不通的问题,造成yum在执行的过程中卡死,无法完成任务也无法退出任务,具体原因没有细究);如果nfs挂载正常,则继续下一步;如果卸载失败,则中止修复。
五、对服务器yum进行修复,就是初始化yum的过程,初始化完之后执行 yum list | grep salt,如果执行成功,则继续,否则保存错误信息,中止自动修复。
六、卸载服务器原有salt-minion客户端,卸载之后检查有没有卸载成功,如果成功,则继续,否则保存错误信息,中止自动修复。
七、重新安装最新salt-minion客户端,检查有没有安装成功,如果成功,则继续,否则保存错误信息,中止自动修复。
八、启动salt-minion客户端,检查启动状态,如果成功,则继续,否则保存错误信息,中止自动修复。
九、登录master端执行简单命令,确认master与修复后的minion通信是否成功,如果成功,则修改最新数据库的对应信息;如果报错,则更新最新信息中对应的报错信息。
注:
不少地方都是用的公司通道机获取的json格式的返回数据,函数run_cmd,如:
{"RETURN":"{\"sub_task_id\":\"******\",\"ip\":\"10.75.4.43\",\"user\":\"****\",\"result\":\"10.75.19.1**\\n\"}"}
代码:
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Automatically repair broken salt-minion clients.

For every host reported as faulty by the salt monitoring table the script
runs, in order: ping check, ssh check, NFS mount check (stale mounts are
lazily unmounted), yum re-initialisation, removal of the old salt packages,
installation of salt-minion, minion restart, and finally a master -> minion
round trip.  The first failing step aborts the repair of that host.

Status convention used by the step functions: 0 means success, 1 means
failure (``salt_exist_check`` is the exception, see its docstring).

The code targets Python 2 (urllib2, MySQLdb, ``reload(sys)``).
"""
__author__ = _author__ = 'mujibin'  # `_author__` kept as an alias of the original (misspelt) name

# import python lib
import random
import urllib
import datetime
import time
import MySQLdb
import os
import re
import urllib2
import json
import socket
import string
import subprocess
import sys
import paramiko

try:  # shell quoting helper: shlex.quote on Python 3, pipes.quote on Python 2
    from shlex import quote as _shell_quote
except ImportError:
    from pipes import quote as _shell_quote

# add path
sys.path.append("/data1/salt/mysqlapi/salt/")

# import salt repaire function
from multiprocessing import *
import logging
from salt_minion_list import *
from init_server import *
from check_salt import *
#from check_salt_bak import *
from salt_repair_ssh import *

reload(sys)
sys.setdefaultencoding('utf8')

H3303 = '*****.cn'
H3304m = '******.cn'
P3303 = 3303
P3304 = 3304
dp_admin = 'dp_admin'
HOST_PORT = '3303'
HOST_USER = 'mysqlha'
HOST_PASSED = '********'
db = 'test'
port = '*******'
c_date = time.strftime("%Y%m%d", time.localtime())
c_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

# The original file-based logging setup (/data1/dbatemp/salt/logs) was disabled.
# Errors are now reported through this logger; handlers are only configured when
# the file is run as a script, so importing the module stays silent.
logger = logging.getLogger("salt_repair")
logger.addHandler(logging.NullHandler())

##########################################################
salt_yes = datetime.date.today()
##########################################################
# ssh api argument
method = "sync"
output = "json"
ignore_error = "true"
timeout = "28"
##########################################################
slat_minion_check_CONSTANT = "salt-minion"
##########################################################
SALT = "salt"
VERSION = "5.3"

# Port used for every ssh connection made by this script.
SSH_PORT = 26387

# Shell snippet that kills yum processes left over from earlier, hung runs.
_KILL_STALE_YUM = (
    "([ `ps -ef | grep yum | grep -v grep | wc -l` -ne 0 ] && "
    "sudo ps -ef | grep '/usr/bin/yum' | grep -v grep | awk '{print $2}' | "
    "xargs kill -9 || echo '')"
)


###########################################################
# master dns transfor to ip
###########################################################
def getIp(domain):
    """Resolve *domain* and return the first address given by getaddrinfo."""
    return socket.getaddrinfo(domain, 'http')[0][4][0]


MASTERDNS = "******.cn"
MASTERIP = getIp(MASTERDNS)


##########################################################
def ssh_connect_bak(host):
    """Open an ssh connection as root using the key in /root/.ssh/id_rsa.

    Returns the connected ``paramiko.SSHClient``; the caller must close it.
    """
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    privatekeyfile = os.path.expanduser('/root/.ssh/id_rsa')
    mykey = paramiko.RSAKey.from_private_key_file(privatekeyfile)
    try:
        client.connect(host.strip(), SSH_PORT, username='root', timeout=2, pkey=mykey)
    except Exception:
        client.close()  # do not leak the half-open client
        raise
    return client


def ssh_connect(host, timeout=10):
    """Open an ssh connection to *host* as root.

    :param host: host name or IP; surrounding whitespace is stripped.
    :param timeout: connect timeout in seconds (default 10, as before).
    :returns: a connected ``paramiko.SSHClient``; the caller must close it.
    :raises: whatever ``SSHClient.connect`` raises.
    """
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        client.connect(host.strip(), SSH_PORT, username='root', timeout=timeout)
    except Exception:
        client.close()  # do not leak the half-open client
        raise
    return client


def ssh_cmd(host, cmd):
    """Run *cmd* on *host* over ssh.

    :returns: the stripped stdout of the command, or ``None`` when the
        connection or the execution failed (callers rely on ``None``).
    """
    client = None
    try:
        client = ssh_connect(host)
        _stdin, stdout, _stderr = client.exec_command(cmd)
        return stdout.read().strip()
    except Exception:
        logger.exception("The host:%s and cmd:%s execute exception.", host, cmd)
        return None
    finally:
        if client is not None:
            client.close()


def ssh_cmd_check(host, cmd1):
    """Check that *host* accepts ssh logins and that *cmd1* succeeds there.

    :returns: 0 when the login worked and *cmd1* exited with status 0,
        otherwise 1.
    """
    client = None
    try:
        client = ssh_connect(host, timeout=20)
        _stdin, stdout, _stderr = client.exec_command(cmd1)
        stdout.read()  # drain the output so the exit status becomes available
        if stdout.channel.recv_exit_status() != 0:
            return 1
        return 0
    except Exception:
        logger.exception("ssh check failed for host:%s", host)
        return 1
    finally:
        if client is not None:
            client.close()


def run_cmd(ips, cmd, method, output, ignore_error, timeout):
    """Run *cmd* on *ips* through the company channel-machine HTTP API.

    The arguments are forwarded unchanged as form fields.

    :returns: the raw response body (JSON text when ``output="json"``).
    :raises: the underlying urllib2/socket error when the API call fails.
        (The original swallowed it and then died with an UnboundLocalError,
        so callers always had to expect an exception here.)
    """
    _user_ = '***'
    _key_ = '*****'
    url = '*****p.php'
    argument = {
        'user': _user_,
        'method': method,
        'output': output,
        'ignore_error': ignore_error,
        'key': _key_,
        'timeout': timeout,
        'ip': ips,
        'cmd': cmd,
    }
    data = urllib.urlencode(argument)
    try:
        response = urllib2.urlopen(url, data)
    except Exception:
        logger.exception("Call the api function error!")
        raise
    try:
        return response.read()
    finally:
        response.close()


def _api_result_lines(raw):
    """Return the non-empty lines of the 'result' field of an API JSON reply."""
    payload = json.loads(json.loads(raw)["RETURN"])
    return [line.strip() for line in payload['result'].split('\n') if line.strip()]


def select_in_3303(sql, host, user, port, passwd, db):
    """Run a query and return the first column of every row as strings.

    :returns: list of ``str``; an empty list when the query failed.
    """
    conn = None
    try:
        conn = MySQLdb.connect(host=host, user=user, port=port, passwd=passwd,
                               db=db, connect_timeout=5, charset="utf8")
        cursor = conn.cursor()
        cursor.execute(sql)
        return [str(row[0]) for row in cursor.fetchall()]
    except Exception:
        logger.exception("query failed on %s:%s", host, port)
        return []
    finally:
        if conn is not None:
            conn.close()


# Insert helper; currently unused because nothing writes results back yet.
def sql_insert(sql, port=3304, domain='*****', db='*****'):
    """Execute and commit *sql*; roll back on error.

    NOTE(review): the *db* parameter is ignored, the schema 'swordfish' is
    hard-coded exactly as in the original - confirm which one is intended.
    """
    conn = None
    try:
        conn = MySQLdb.connect(host=domain, user=HOST_USER, port=port,
                               passwd=HOST_PASSED, db='swordfish',
                               connect_timeout=3, charset="utf8")
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
    except Exception:
        logger.exception("insert failed on %s:%s", domain, port)
        if conn is not None:  # connect itself may have failed
            conn.rollback()
    finally:
        if conn is not None:
            conn.close()


def fix_list_salt():
    """Return the inner IPs of all hosts flagged as faulty in the latest run.

    :returns: list of IP strings; empty when nothing is faulty or the query
        failed (never ``None``, so the result can be fed to ``Pool.map``).
    """
    saltsql = ("select ip_in from salt_mon where salt_mon_value != 'ok' "
               "and salt_mon_info not like '%None%' and ctime = "
               "(select ctime from salt_mon order by ctime desc limit 1);")
    try:
        return select_in_3303(sql=saltsql, host=H3304m, user=HOST_USER,
                              port=P3304, passwd=HOST_PASSED, db='swordfish')
    except Exception as e:
        print(e)
        return []


def salt_exist_check(host):
    """Tell whether any salt rpm is installed on *host*.

    :returns: 0 when no salt package is installed, 1 when at least one is
        installed, ``None`` when the check itself failed.
        (The original comment described these values the other way round;
        this is what the code and its callers actually use.)
    """
    try:
        count = ssh_cmd(host, "rpm -qa | grep salt | wc -l")
        return 0 if int(count) == 0 else 1
    except Exception:
        logger.exception("The function salt_exist_check execute failed with host:%s", host)
        return None


def salt_minion_restart(host):
    """Drop the cached master key on *host* and restart salt-minion.

    :returns: 0 when a salt-minion process is running afterwards, else 1.
    """
    try:
        rm_key_cmd = """sudo rm -f /etc/salt/pki/minion/minion_master.pub"""
        restart_cmd = """sudo /etc/init.d/salt-minion restart"""
        run_cmd(host, rm_key_cmd, method="sync", output="text", ignore_error="true", timeout=10)
        time.sleep(5)
        run_cmd(host, restart_cmd, method="sync", output="text", ignore_error="true", timeout=10)
        time.sleep(5)  # give the daemon time to come up before checking
        return 0 if salt_check(host) == 0 else 1
    except Exception:
        logger.exception("The host:%s restart minion failed!", host)
        return 1


def remove_salt_minion(host):
    """Stop salt-minion and remove every installed salt rpm from *host*.

    :returns: 0 when no salt package is left afterwards, else 1.
    """
    try:
        version_cmd = "sudo rpm -qa | grep salt| grep -v grep"
        raw = run_cmd(host, version_cmd, method="sync", output="json",
                      ignore_error="true", timeout=10)
        # Blank lines are filtered instead of blindly dropping the last entry,
        # so the last package is not skipped when there is no trailing newline.
        packages = _api_result_lines(raw)
        ssh_cmd(host, '/etc/init.d/salt-minion stop > /dev/null 2>&1 ')
        for package in packages:
            rm_cmd = "sudo yum remove -y %s > /dev/null 2>&1 " % _shell_quote(package)
            rm_res = ssh_cmd(host, rm_cmd)
            time.sleep(4)
            logger.debug("%s : %s -> %r", host, rm_cmd, rm_res)
        res = 0 if salt_exist_check(host) == 0 else 1
        print('res:%s' % res)
        return res
    except Exception:
        logger.exception("The function remove_salt_minion_qa execute failed with host:%s", host)
        return 1


def yum_check(host):
    """Check that yum on *host* lists the salt packages to be installed.

    :returns: 0 when ``yum list`` shows at least one matching salt entry,
        else 1.
    """
    try:
        check_cmd = "sudo yum list | grep salt | grep 2015 | wc -l"
        # ssh_cmd returns a string: convert before comparing, otherwise the
        # comparison with 0 is always true.
        return 0 if int(ssh_cmd(host, check_cmd)) != 0 else 1
    except Exception:
        logger.exception("The host:%s check the yum error!", host)
        return 1


def yum_repaire(host):
    """Re-initialise yum on *host* and verify it with ``yum_check``.

    :returns: 0 when yum works afterwards, else 1.
    """
    try:
        repair_steps = (
            _KILL_STALE_YUM,
            "(cd /var/lib/rpm/ && sudo rm -f __db.00*)",
            "(sudo rpm --rebuilddb)",
            "(sudo yum clean all)",
            "(sudo chattr -i /etc/yum.conf)",
            "(sudo echo 'include=http://****/conf/yumconf.php' > /etc/yum.conf)",
            "(sudo rm -rf /etc/yum.repos.d/*)",
            "(sudo yum -y remove ****dbp > /dev/null 2>&1)",
            "(sudo yum -y install ****dbp > /dev/null 2>&1)",
        )
        ssh_cmd(host, " && ".join(repair_steps))
        time.sleep(60)
        return 0 if yum_check(host) == 0 else 1
    except Exception:
        logger.exception("The host:%s try to repaire yum failed!", host)
        return 1


def salt_check(host):
    """Check whether a salt-minion process is running on *host*.

    :returns: 0 when at least one process is found, else 1.
    """
    try:
        check_cmd = "ps -ef | grep salt-minion | grep -v grep | wc -l"
        # ssh_cmd returns a string: convert before comparing with 0.
        return 0 if int(ssh_cmd(host, check_cmd)) != 0 else 1
    except Exception:
        logger.exception("The host:%s salt check error!", host)
        return 1


def install_salt_minion(host):
    """Install salt and salt-minion on *host* through yum.

    :returns: 0 when a salt rpm is installed afterwards, else 1.
    """
    try:
        install_cmd = " && ".join((
            _KILL_STALE_YUM,
            "(sudo yum clean all)",
            "(sudo yum -y install salt.noarch salt-minion.noarch)",
        ))
        ssh_cmd(host, install_cmd)
        time.sleep(20)
        return 0 if salt_exist_check(host) == 1 else 1
    except Exception:
        logger.exception("The host:%s install minion failed!", host)
        return 1


def ping_mon_by_host(host):
    """Ping *host* once (2 second deadline).

    :returns: 0 when the ping succeeded, else 1.
    """
    try:
        # Argument list instead of a shell string: the host comes from the
        # database and must not be interpreted by a shell.
        with open(os.devnull, 'w') as devnull:
            ret = subprocess.call(["ping", "-c", "1", "-w", "2", host.strip()],
                                  stdout=devnull)
        return 0 if ret == 0 else 1
    except Exception:
        logger.exception("The host %s: ping_mon_by_host failed!", host)
        return 1


def check_salt_minion(host):
    """Ask the salt master to run ``uptime`` on the repaired minion.

    :returns: always a dict ``{'status': 0|1, 'message': str}``; status 0
        means the minion answered (the reply contains 'load').
    """
    try:
        cmd = "salt %s -t 7 cmd.run 'uptime'" % _shell_quote(host.strip())
        ret = ssh_cmd(MASTERIP, cmd)
        if ret and 'load' in ret:
            return {'status': 0, 'message': 'ok'}
        if not ret:
            return {'status': 1, 'message': 'no response from master'}
        parts = ret.split(':')
        # salt prints "<id>: <error text>"; fall back to the raw reply otherwise
        msg = parts[1].strip() if len(parts) > 1 else ret
        return {'status': 1, 'message': msg}
    except Exception as e:
        logger.exception("master-minion check failed for host:%s", host)
        return {'status': 1, 'message': str(e)}


def nfs_check(host):
    """Detect knfs mounts whose server is unreachable and unmount them.

    When any mounted NFS server does not answer a ping from *host*, all
    knfs mount points are lazily unmounted (a hanging mount makes yum hang).

    :returns: 0 when no knfs mount exists, all servers answer, or the
        unmount succeeded; 1 when mounts are left or the check failed.
    """
    count_cmd = "mount | grep 'knfs'| wc -l"
    mount_data_cmd = "mount | grep 'knfs' | awk -F ' ' '{print $3}'"
    mount_ip_cmd = "mount | grep 'knfs' | awk -F ':' '{print $1}'"
    try:
        if int(ssh_cmd(host, count_cmd)) == 0:
            return 0

        raw_ips = run_cmd(host, mount_ip_cmd, method="sync", output="json",
                          ignore_error="true", timeout=10)
        logger.debug("%s knfs servers: %s", host, raw_ips)

        unreachable = False
        for server_ip in sorted(set(_api_result_lines(raw_ips))):
            ping_cmd = ("ping -c 1 -w 1 %s | grep '0 received' | wc -l"
                        % _shell_quote(server_ip))
            if int(ssh_cmd(host, ping_cmd)) != 0:
                unreachable = True
                break
        if not unreachable:
            return 0

        raw_points = run_cmd(host, mount_data_cmd, method="sync", output="json",
                             ignore_error="true", timeout=10)
        for mount_point in _api_result_lines(raw_points):
            ssh_cmd(host, "umount -l %s > /dev/null 2>&1 " % _shell_quote(mount_point))
            time.sleep(2)

        return 0 if int(ssh_cmd(host, count_cmd)) == 0 else 1
    except Exception:
        logger.exception("nfs check failed for host:%s", host)
        return 1


def _run_repair_steps(host):
    """Run the repair steps for *host* in order; return the outcome message."""
    if ping_mon_by_host(host) != 0:
        return "%s: The host can not ping!" % host
    if ssh_cmd_check(host, 'date') != 0:
        return "%s: bad ssh,go failed!" % host
    if nfs_check(host) != 0:
        return "%s:nfs err" % host

    print('yum_repair')
    if yum_repaire(host) != 0:
        return "%s: yum occur error!" % host

    print('remove salt')
    if remove_salt_minion(host) != 0:
        return "%s:remove salt minion failed!" % host

    print('install salt')
    if install_salt_minion(host) != 0:
        return "%s:install salt minion failed!" % host

    print('start salt')
    if salt_minion_restart(host) != 0:
        return "%s:salt minion restart error!" % host

    print('master-minion check')
    minion_status = check_salt_minion(host)
    if minion_status["status"] == 0:
        return '%s:ok' % host
    return '%s:%s' % (host, minion_status["message"])


def salt_repaire(host):
    """Repair salt-minion on *host* and report the outcome.

    The outcome message is printed and returned ('<host>:ok' on success,
    otherwise the description of the first failed step).  The function
    never raises, so it is safe to use with ``Pool.map``.
    """
    try:
        msg = _run_repair_steps(host)
    except Exception:
        logger.exception("Salt repaire failed with host:%s ", host)
        msg = "Salt repaire failed with host:%s " % host
    print(msg)
    # TODO: persist the result (salt_minion_mon / salt_info) once available.
    return msg


def scheduler_repaire():
    """Repair all currently faulty hosts, eight at a time."""
    minion_list = fix_list_salt()
    if not minion_list:
        return
    pool = Pool(8)
    try:
        pool.map(salt_repaire, minion_list)
    finally:
        pool.close()
        pool.join()


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(funcName)s :%(lineno)d] %(message)s")
    scheduler_repaire()