diff --git a/collectd-extensions/src/fm_notifier.py b/collectd-extensions/src/fm_notifier.py index 8948c24..b4a3776 100755 --- a/collectd-extensions/src/fm_notifier.py +++ b/collectd-extensions/src/fm_notifier.py @@ -105,8 +105,9 @@ debug_lists = False want_state_audit = False want_vswitch = False -# number of notifier loops before the state is object dumped -DEBUG_AUDIT = 2 +# Number of notifier loop between each audit. +# @ 30 sec interval audit rate is every 5 minutes +AUDIT_RATE = 10 # write a 'value' log on a the resource sample change of more than this amount LOG_STEP = 10 @@ -207,6 +208,10 @@ ALARM_ID_LIST = [ALARM_ID__CPU, ALARM_ID__VSWITCH_PORT, ALARM_ID__VSWITCH_IFACE] +AUDIT_ALARM_ID_LIST = [ALARM_ID__CPU, + ALARM_ID__MEM, + ALARM_ID__DF] + # ADD_NEW_PLUGIN: add plugin name definition # WARNING: This must line up exactly with the plugin # filename without the extension. @@ -616,15 +621,17 @@ class fmAlarmObject: # total notification count self.count = 0 - # Debug: state audit controls - self.audit_threshold = 0 - self.audit_count = 0 + # audit counters + self.alarm_audit_threshold = 0 + self.state_audit_count = 0 # For plugins that have multiple instances like df (filesystem plugin) # we need to create an instance of this object for each one. # This dictionary is used to associate an instance with its object. 
self.instance_objects = {} + self.fault = None + def _ilog(self, string): """Create a collectd notifier info log with the string param""" collectd.info('%s %s : %s' % (PLUGIN, self.plugin, string)) @@ -658,18 +665,18 @@ class fmAlarmObject: if self.id == ALARM_ID__CPU: _print_state() - self.audit_count += 1 + self.state_audit_count += 1 if self.warnings: collectd.info("%s AUDIT %d: %s warning list %s:%s" % (PLUGIN, - self.audit_count, + self.state_audit_count, self.plugin, location, self.warnings)) if self.failures: collectd.info("%s AUDIT %d: %s failure list %s:%s" % (PLUGIN, - self.audit_count, + self.state_audit_count, self.plugin, location, self.failures)) @@ -1461,7 +1468,7 @@ def _print_obj(obj): collectd.info("%s %s %s - %s - %s\n" % (PLUGIN, prefix, obj.resource_name, obj.plugin, obj.id)) - + collectd.info("%s %s fault obj: %s\n" % (PLUGIN, prefix, obj.fault)) collectd.info("%s %s entity id: %s\n" % (PLUGIN, prefix, obj.entity_id)) collectd.info("%s %s degrade_id: %s\n" % (PLUGIN, prefix, obj.degrade_id)) @@ -1817,7 +1824,7 @@ def notifier_func(nObject): if eid.split(base_eid)[1]: want_alarm_clear = True - collectd.info('%s found %s %s alarm [%s]' % + collectd.info('%s alarm %s:%s:%s found at startup' % (PLUGIN, alarm.severity, alarm_id, @@ -1825,8 +1832,9 @@ def notifier_func(nObject): if want_alarm_clear is True: if clear_alarm(alarm_id, eid) is False: - collectd.error("%s %s:%s clear failed" % - (PLUGIN, + collectd.error("%s alarm %s:%s:%s clear " + "failed" % + (PLUGIN, alarm.severity, alarm_id, eid)) continue @@ -1982,15 +1990,6 @@ def notifier_func(nObject): # if obj.warnings or obj.failures: # _print_state(obj) - # If want_state_audit is True then run the audit. 
- # Primarily used for debug - # default state is False - if want_state_audit: - obj.audit_threshold += 1 - if obj.audit_threshold == DEBUG_AUDIT: - obj.audit_threshold = 0 - obj._state_audit("audit") - # manage reading value change ; store last and log if gt obj.step action = obj.manage_change(nObject) if action == "done": @@ -2013,6 +2012,83 @@ def notifier_func(nObject): if len(mtcDegradeObj.degrade_list): mtcDegradeObj.remove_degrade_for_missing_filesystems() + obj.alarm_audit_threshold += 1 + if obj.alarm_audit_threshold >= AUDIT_RATE: + if want_state_audit: + obj._state_audit("audit") + obj.alarm_audit_threshold = 0 + + ################################################################# + # + # Audit Asserted Alarms + # + # Loop over the list of auditable alarm ids building two + # dictionaries, one containing warning (major) and the other + # failure (critical) with alarm info needed to detect and + # correct stale, missing or severity mismatched alarms for + # the listed alarm ids <100.xxx>. + # + # Note: Conversion in terminology from + # warning -> major and + # failures -> critical + # is done because fm speaks in terms of major and critical + # while the plugin speaks in terms of warning and failure. + # + major_alarm_dict = {} + critical_alarm_dict = {} + for alarm_id in AUDIT_ALARM_ID_LIST: + + tmp_base_obj = get_base_object(alarm_id) + if tmp_base_obj is None: + collectd.error("%s audit %s base object lookup failed" % + (PLUGIN, alarm_id)) + continue + + # Build 2 dictionaries containing current alarmed info. + # Dictionary entries are indexed by entity id to fetch the + # alarm id and last fault object used to create the alarm + # for the mismatch and missing case handling. + # + # { eid : { alarm : , fault : }}, ... 
} + + # major list for base object from warnings list + if tmp_base_obj.entity_id in tmp_base_obj.warnings: + info = {} + info[pc.AUDIT_INFO_ALARM] = alarm_id + info[pc.AUDIT_INFO_FAULT] = tmp_base_obj.fault + major_alarm_dict[tmp_base_obj.entity_id] = info + + # major list for instance objects from warnings list + for _inst_obj in tmp_base_obj.instance_objects: + inst_obj = tmp_base_obj.instance_objects[_inst_obj] + if inst_obj.entity_id in tmp_base_obj.warnings: + info = {} + info[pc.AUDIT_INFO_ALARM] = alarm_id + info[pc.AUDIT_INFO_FAULT] = inst_obj.fault + major_alarm_dict[inst_obj.entity_id] = info + + # critical list for base object from failures list + if tmp_base_obj.entity_id in tmp_base_obj.failures: + info = {} + info[pc.AUDIT_INFO_ALARM] = alarm_id + info[pc.AUDIT_INFO_FAULT] = tmp_base_obj.fault + critical_alarm_dict[tmp_base_obj.entity_id] = info + + # critical list for instance objects from failures list + for _inst_obj in tmp_base_obj.instance_objects: + inst_obj = tmp_base_obj.instance_objects[_inst_obj] + if inst_obj.entity_id in tmp_base_obj.failures: + info = {} + info[pc.AUDIT_INFO_ALARM] = alarm_id + info[pc.AUDIT_INFO_FAULT] = inst_obj.fault + critical_alarm_dict[inst_obj.entity_id] = info + + pluginObject.alarms_audit(api, AUDIT_ALARM_ID_LIST, + major_alarm_dict, + critical_alarm_dict) + # end alarms audit + ################################################################# + # exit early if there is no alarm update to be made if obj.debounce(base_obj, obj.entity_id, @@ -2053,7 +2129,7 @@ def notifier_func(nObject): reason = obj.reason_warning # build the alarm object - fault = fm_api.Fault( + obj.fault = fm_api.Fault( alarm_id=obj.id, alarm_state=_alarm_state, entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, @@ -2067,7 +2143,7 @@ def notifier_func(nObject): suppression=base_obj.suppression) try: - alarm_uuid = api.set_fault(fault) + alarm_uuid = api.set_fault(obj.fault) if pc.is_uuid_like(alarm_uuid) is False: collectd.error("%s 'set_fault' 
failed ; %s:%s ; %s" % (PLUGIN, diff --git a/collectd-extensions/src/plugin_common.py b/collectd-extensions/src/plugin_common.py index fdb3568..5242f3a 100644 --- a/collectd-extensions/src/plugin_common.py +++ b/collectd-extensions/src/plugin_common.py @@ -98,6 +98,9 @@ RESERVED_CPULIST_KEY = 'PLATFORM_CPU_LIST' PLUGIN_PASS = 0 PLUGIN_FAIL = 1 +AUDIT_INFO_ALARM = 'alarm' +AUDIT_INFO_FAULT = 'fault' + class PluginObject(object): @@ -162,8 +165,10 @@ class PluginObject(object): def init_completed(self): """Declare plugin init complete""" - - collectd.info("%s initialization completed" % self.plugin) + self.hostname = self.gethostname() + self.base_eid = 'host=' + self.hostname + collectd.info("%s %s initialization completed" % + (self.plugin, self.hostname)) self.init_complete = True ########################################################################### @@ -349,6 +354,230 @@ class PluginObject(object): return True + ##################################################################### + # + # Name : clear_alarm + # + # Description: Clear the specified alarm. + # + # Returns : True if operation succeeded + # False if there was an error exception. + # + # Assumptions: Caller can decide to retry based on return status. + # + ##################################################################### + def clear_alarm(self, fm, alarm_id, eid): + """Clear the specified alarm:eid + + :param fm The Fault Manager's API Object + :param alarm_id The alarm identifier , ie 100.103 + :param eid The entity identifier ; host=. 
+ """ + + try: + if fm.clear_fault(alarm_id, eid) is True: + collectd.info("%s %s:%s alarm cleared" % + (self.plugin, alarm_id, eid)) + else: + collectd.info("%s %s:%s alarm already cleared" % + (self.plugin, alarm_id, eid)) + return True + + except Exception as ex: + collectd.error("%s 'clear_fault' exception ; %s:%s ; %s" % + (self.plugin, alarm_id, eid, ex)) + return False + + ######################################################################### + # + # Name : __missing_or_mismatch_alarm_handler + # + # Purpose: Find and correct missing or mismatch alarms + # + # Scope: Private + # + ######################################################################### + def __missing_or_mismatch_alarm_handler(self, + fm, + alarms, + alarm_id, + severity, + sev_alarm_dict): + """Find and correct missing or mismatch alarms + + :param fm The Fault Manager's API Object + :param alarms List of database alarms for alarm id and this host + :param alarm_id The alarm id in context + :param severity Specifies the severity level of sev_alarm_dict + :param sev_alarm_dict An alarm dictionary for either (not both) major + or critical alarms + """ + plugin_prefix = self.plugin + ' audit' + for eid in sev_alarm_dict: + found = False + if alarm_id == sev_alarm_dict[eid].get(AUDIT_INFO_ALARM): + error_case = "missing" + if alarms: + for alarm in alarms: + if alarm.entity_instance_id == eid: + if alarm.severity == severity: + collectd.info("%s alarm %s:%s:%s is correct" % + (plugin_prefix, severity, + alarm_id, eid)) + found = True + else: + error_case = "mismatch" + break + + if found is False: + + fault = sev_alarm_dict[eid].get(AUDIT_INFO_FAULT) + if fault: + collectd.info("%s alarm %s:%s:%s %s ; refreshing" % + (plugin_prefix, + severity, alarm_id, eid, error_case)) + fm.set_fault(fault) + else: + collectd.info("%s alarm %s:%s:%s %s" % + (plugin_prefix, + severity, alarm_id, eid, error_case)) + + ######################################################################### + # + # Name: 
alarms_audit + # + # Purpose: Ensure the alarm state in the FM database matches the plugin + # + # Description: Query FM for the specified alarm id list. Handle missing, + # stale or severity mismatched alarms. + # + # Algorithm : Each alarm id is queried and the response is filtered by + # current host. The plugin's running state takes precedence. + # This audit will only ever raise, modify or clear alarms in + # the database, never change the alarm state of the plugin. + # + # - clear any asserted alarms that have a clear state + # in the plugin. + # - raise an alarm that is cleared in fm but asserted + # in the plugin. + # - correct alarm severity in fm database to align with + # the plugin. + # + # Assumptions: The severity dictionary arguments (major and critical) + # are used to detect severity mismatches and support alarm + # ids with varying entity ids. + # + # Each dictionary maps eid to {'alarm': aid, 'fault': fault} + # - alarm id as 'aid' + # - entity_id as 'eid' + # + # No need to check for fm api call success and retry on + # failure. Stale alarm clear will be retried on next audit. 
+ # + ######################################################################### + def alarms_audit(self, + fm, + audit_alarm_id_list, + major_alarm_dict, + critical_alarm_dict): + """Audit the fm database for this plugin's alarms state + + :param fm The Fault Manager's API Object + :param audit_alarm_id_list A list of alarm ids to query + :param major_alarm_dict A dictionary of major alarms by aid:eid + :param critical_alarm_dict A dictionary of critical alarms by aid:eid + """ + + if len(audit_alarm_id_list) == 0: + return + + plugin_prefix = self.plugin + ' audit' + + if len(major_alarm_dict): + collectd.debug("%s major_alarm_dict: %s" % + (plugin_prefix, major_alarm_dict)) + + if len(critical_alarm_dict): + collectd.debug("%s critical_alarm_dict: %s" % + (plugin_prefix, critical_alarm_dict)) + + for alarm_id in audit_alarm_id_list: + collectd.debug("%s searching for all '%s' alarms" % + (plugin_prefix, alarm_id)) + try: + database_alarms = [] + tmp = fm.get_faults_by_id(alarm_id) + if tmp is not None: + database_alarms = tmp + + # database alarms might contain same alarm id for other + # hosts and needs to be filtered + alarms = [] + for alarm in database_alarms: + base_eid = alarm.entity_instance_id.split('.')[0] + if self.base_eid == base_eid: + collectd.debug("%s alarm %s:%s:%s in fm" % + (plugin_prefix, + alarm.severity, alarm_id, + alarm.entity_instance_id)) + alarms.append(alarm) + + except Exception as ex: + collectd.error("%s get_faults_by_id %s failed " + "with exception ; %s" % + (plugin_prefix, alarm_id, ex)) + continue + + # Service database alarms case + + # Stale database alarms handling case + remove_alarms_list = [] + if alarms: + for alarm in alarms: + found = False + for eid in major_alarm_dict: + if alarm.entity_instance_id == eid: + found = True + break + if found is False: + for eid in critical_alarm_dict: + if alarm.entity_instance_id == eid: + found = True + break + + if found is False: + collectd.info("%s alarm %s:%s:%s is stale ; 
clearing" % + (plugin_prefix, + alarm.severity, alarm_id, + alarm.entity_instance_id)) + + # clear stale alarm. + self.clear_alarm(fm, alarm_id, + alarm.entity_instance_id) + remove_alarms_list.append(alarm) + for alarm in remove_alarms_list: + alarms.remove(alarm) + else: + collectd.debug("%s database has no %s alarms" % + (plugin_prefix, alarm_id)) + + # If major alarms exist then check for + # missing or mismatch state in fm database + if len(major_alarm_dict): + self.__missing_or_mismatch_alarm_handler(fm, + alarms, + alarm_id, + 'major', + major_alarm_dict) + # If critical alarms exist then check for + # missing or mismatch state in fm database. + if len(critical_alarm_dict): + self.__missing_or_mismatch_alarm_handler(fm, + alarms, + alarm_id, + 'critical', + critical_alarm_dict) + ########################################################################### # # Name : make_http_request