class ActionHandler(object):
    """Analyse a trigger that reached its alert condition and send alerts.

    Which alerts are sent, to whom and how often is driven by the Action
    records associated with the trigger.
    """

    def __init__(self, trigger_data, alert_counter_dic):
        """Store the trigger payload and the shared alert-counter mapping.

        :param trigger_data: dict describing the fired trigger; this class
            reads the keys 'trigger_id', 'host_id', 'service_item' and 'msg'.
        :param alert_counter_dic: mutable mapping shaped like
            ``{action_id: {host_id: {'counter': int, 'last_alert': float}}}``.
            It is updated in place by :meth:`trigger_process`.
        """
        self.trigger_data = trigger_data
        self.alert_counter_dic = alert_counter_dic

    def record_log(self, action_obj, action_operation, host_id, trigger_data):
        """Record an alert event in the database.

        :param action_obj: action that caused the alert (not used here).
        :param action_operation: operation that was executed (not used here).
        :param host_id: id of the host the alert is about.
        :param trigger_data: trigger payload; stored as the log body.
        """
        models.EventLog.objects.create(
            event_type=0,
            host_id=host_id,
            trigger_id=trigger_data.get('trigger_id'),
            log=trigger_data,
        )

    def action_email(self, action_obj, action_operation_obj, host_id, trigger_data):
        """Send the alert e-mail to everyone configured on the operation.

        :param action_obj: action object that fired this alert.
        :param action_operation_obj: operation object describing the alert to
            perform; supplies the notifiers and the message body.
        :param host_id: id of the host being alerted on.
        :param trigger_data: trigger payload used to build the subject.
        :return: None
        """
        print("要发报警的数据:", self.alert_counter_dic[action_obj.id][host_id])
        print("action email:", action_operation_obj.action_type,
              action_operation_obj.notifiers, trigger_data)

        notifier_mail_list = [obj.email for obj in action_operation_obj.notifiers.all()]
        subject = '级别:%s -- 主机:%s -- 服务:%s' % (
            trigger_data.get('trigger_id'),
            trigger_data.get('host_id'),
            trigger_data.get('service_item'),
        )
        send_mail(
            subject,
            action_operation_obj.msg_format,
            settings.DEFAULT_FROM_EMAIL,
            notifier_mail_list,
        )

    @staticmethod
    def _action_covers_host(action, host_id):
        """Return True if *action* applies to *host_id*, via a group or directly."""
        for host_group in action.host_groups.select_related():
            if any(h.id == host_id for h in host_group.host_set.select_related()):
                return True
        return any(host.id == host_id for host in action.hosts.select_related())

    def _update_alert_counter(self, action, host_id):
        """Create or advance the (action, host) entry in ``alert_counter_dic``."""
        # First time this action is hit at all: create its per-host mapping.
        host_counters = self.alert_counter_dic.setdefault(action.id, {})
        print("action, ", id(action))

        counter = host_counters.get(host_id)
        if counter is None:
            # First time this host triggers this action: start counting now.
            host_counters[host_id] = {'counter': 0, 'last_alert': time.time()}
            return

        elapsed = time.time() - counter['last_alert']
        if elapsed >= action.interval:
            # The alert interval has elapsed since the last alert: count it.
            counter['counter'] += 1
        else:
            print("没达到alert interval时间,不报警", action.interval, elapsed)

    def trigger_process(self):
        """Analyse the trigger and run the alert operations that are due.

        Processing steps:

        1. The first time an action is hit, an empty counter dict is created
           for it.
        2. The first time a host triggers that action, a
           ``{'counter': 0, 'last_alert': now}`` entry is created for it.
        3. On later triggers, if the action's interval has elapsed since
           ``last_alert``, the counter is incremented by one.
        4. When the interval has elapsed, the action's operations are run.

        An alert is not sent every time the trigger fires; it is only sent
        once the alert interval has elapsed.

        Actions link to hosts and host groups directly (in addition to
        trigger -> template -> host) because one trigger can be attached to
        several templates, so a fired trigger alone does not say which
        template's host caused it.

        :return: None
        :raises AttributeError: if an operation's ``action_type`` has no
            matching ``action_<type>`` method on this class.
        """
        print('Action Processing'.center(50, '-'))

        trigger_id = self.trigger_data.get('trigger_id')
        if trigger_id is None:
            print(self.trigger_data)
            if self.trigger_data.get('msg'):
                # No trigger id: the message is meant to go straight to the
                # administrator.
                print(self.trigger_data.get('msg'))
            else:
                print("\033[41;1mInvalid trigger data %s\033[0m" % self.trigger_data)
            return

        # A regular trigger: work out which actions apply to the host.
        print("\033[33;1m%s\033[0m" % self.trigger_data)
        host_id = self.trigger_data.get('host_id')
        trigger_obj = models.Trigger.objects.get(id=trigger_id)
        actions_set = trigger_obj.action_set.select_related()
        print("actions_set:", actions_set)

        matched_action_list = set()
        for action in actions_set:
            # The counter is updated once per matched action and is always
            # keyed by the triggering host's id, whichever way it matched.
            if self._action_covers_host(action, host_id):
                matched_action_list.add(action)
                self._update_alert_counter(action, host_id)

        print("alert_counter_dic:", self.alert_counter_dic)
        print("matched_action_list:", matched_action_list)

        for action_obj in matched_action_list:
            counter = self.alert_counter_dic[action_obj.id][host_id]
            elapsed = time.time() - counter['last_alert']
            if elapsed < action_obj.interval:
                continue

            # Time to alert.
            print("该报警了.......", elapsed, action_obj.interval)
            for action_operation in action_obj.operations.select_related().order_by('-step'):
                # NOTE(review): operations run while step > counter, so fewer
                # run as the counter grows -- confirm this is the intended
                # escalation rule.
                if action_operation.step > counter['counter']:
                    print("##################alert action:%s" % action_operation.action_type,
                          action_operation.notifiers)
                    action_func = getattr(self, 'action_%s' % action_operation.action_type)
                    action_func(action_obj, action_operation, host_id, self.trigger_data)
                    # Refresh the alert time so the interval starts over.
                    counter['last_alert'] = time.time()
                    self.record_log(action_obj, action_operation, host_id, self.trigger_data)