diff --git a/tools/collector/debian-scripts/collect b/tools/collector/debian-scripts/collect index bdae3f35..aa3bc6a9 100755 --- a/tools/collector/debian-scripts/collect +++ b/tools/collector/debian-scripts/collect @@ -2941,6 +2941,56 @@ collect_subclouds() fi } +############################################################################ +# +# Name : get_report_tool +# +# Purpose : Fetch report tool from current host +# +# Parameters: $1 - local path destination +# +############################################################################ + +function get_report_tool() +{ + local local_dest=${1} + + mkdir -p ${local_dest} + cp -r /usr/local/bin/report/tool ${local_dest} + + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to get report tool from /usr/local/bin" ${rc} + else + ilog "copied report tool from host" + fi +} + +############################################################################ +# +# Name : get_report_plugins +# +# Purpose : Fetch plugins for report tool from current host +# +# Parameters: $1 - local path destination +# +############################################################################ + +function get_report_plugins() +{ + local local_dest=${1} + + mkdir -p ${local_dest} + cp -r /etc/collect/plugins ${local_dest} + + local rc=${?} + if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to get report plugins from /etc/collect" ${rc} + else + ilog "copied plugins for report tool from host" + fi +} + ############################################################################ # # Handle subcloud and system hosts batched collect @@ -3031,6 +3081,17 @@ echo -n "creating ${COLLECT_TYPE} tarball ${TARBALL_NAME} ... " remove_file_local ${COLLECT_ERROR_LOG} remove_file_local ${HOST_COLLECT_ERROR_LOG} +get_report_tool ${COLLECT_DIR}/report +get_report_plugins ${COLLECT_DIR}/report + +cd ${COLLECT_DIR} +tar -czf report_tool.tgz report +rc=${?} +if [ ${rc} -ne ${PASS} ] ; then + report_error "failed to tar report tool" ${rc} +else + rm -r report +fi /usr/bin/expect << EOF log_user ${USER_LOG_MODE} diff --git a/tools/collector/debian-scripts/report/README b/tools/collector/debian-scripts/report/README new file mode 100644 index 00000000..23d7a580 --- /dev/null +++ b/tools/collector/debian-scripts/report/README @@ -0,0 +1,81 @@ +Refer to report.py file header for a description of the tool + +Example: + +Consider the following collect bundle structure + +SELECT_NODES_20220527.193605 +├── controller-0_20220527.193605 +│  ├── etc +│  ├── root +│  └── var +├── controller-1_20220527.193605 +│  ├── etc +│  ├── root +│  └── var +├── report + ├── plugins (where the plugin files will be placed) + │  ├── alarm + │  ├── substring + │  └── ... + ├── tool (where the tool will be placed) + └── output (where the output files will be placed) + + +> cat plugins/alarm + +algorithm=alarm +alarm_exclude=400., 800. +entity_exclude=subsystem=vim + +> cat plugins/substring + +algorithm=substring +files=var/log/mtcAgent.log, var/log/sm.log +hosts=controllers +substring=operation failed +substring=Failed to send message + +> report/tool/report.py --start 20220501 --end 20220530 + +Running the command above will populate the report folder with output files. +The tool also provides default values, more details are in 'report.py -h'. + +The substring algorithm creates an output file for every host of the +specified host type. The files will contain log events within the +provided date range containing the substring 'operation failed' and 'Failed +to send message'. 
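+
+The same search can also be run ad hoc, without a plugin file, by naming
+the algorithm and its labels directly on the command line (a sketch; the
+file paths and substrings below are only illustrative):
+
+> report/tool/report.py substring --files var/log/mtcAgent.log var/log/sm.log --hosts controllers --substring "operation failed" "Failed to send message"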
+
+The alarm algorithm creates two output files: 'log' and 'alarm'.
+'log' contains customer log messages created within the provided date range,
+and 'alarm' contains system alarms created within the provided date range, as
+long as the alarm ids and entity ids are not listed in the alarm plugin file.
+
+For more detailed information about an algorithm use 'report.py <algorithm> -h'.
+
+Here is the report directory after running the above command:
+
+report
+├── output
+│   └── SELECT_NODES_20220527.193605 (collect bundle that the report tool was run on)
+│       ├── plugins (output files for plugins)
+│       │   ├── alarm
+│       │   └── ...
+│       ├── correlator_failures
+│       ├── correlator_events
+│       ├── correlator_state_changes
+│       ├── report.log (log file for report tool)
+│       └── untar.log (log file for untarring collect bundle and host tar files)
+├── plugins (where the plugin files are)
+└── tool (where the report tool is)
+
+The report tool also allows users to point it at any collect bundle and
+have it automatically extract the tarball and tar files for each host
+before running.
+
+> report/tool/report.py -d CGTS-19143
+
+Users may specify if they want the correlator to only find events
+and state changes for a specific host.
+
+> report/tool/report.py --hostname controller-0
diff --git a/tools/collector/report/algorithms.py b/tools/collector/debian-scripts/report/algorithms.py
similarity index 58%
rename from tools/collector/report/algorithms.py
rename to tools/collector/debian-scripts/report/algorithms.py
index 2b206455..c65c67e2 100644
--- a/tools/collector/report/algorithms.py
+++ b/tools/collector/debian-scripts/report/algorithms.py
@@ -9,8 +9,12 @@
 # Algorithm string constants
 ALARM = "alarm"
 AUDIT = "audit"
-PROCESS_FAILURE = "process_failure"
-PUPPET = "puppet"
+DAEMON_FAILURES = "daemon_failures"
+HEARTBEAT_LOSS = "heartbeat_loss"
+MAINTENANCE_ERR = "maintenance_errors"
+PROCESS_FAILURES = "process_failures"
+PUPPET_ERRORS = "puppet_errors"
+STATE_CHANGES = "state_changes"
 SUBSTRING = "substring"
-SWACT = "swact"
+SWACT_ACTIVITY = "swact_activity"
 SYSTEM_INFO = "system_info"
diff --git a/tools/collector/debian-scripts/report/correlator.py b/tools/collector/debian-scripts/report/correlator.py
new file mode 100755
index 00000000..eb217150
--- /dev/null
+++ b/tools/collector/debian-scripts/report/correlator.py
@@ -0,0 +1,583 @@
+########################################################################
+#
+# Copyright (c) 2022 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+########################################################################
+#
+# This file contains the Correlator class.
+# The Correlator class contains algorithms that search for failures.
+#
+# The Correlator class reads through all the output files created by
+# the plugins and detects failures. A summary of the failures and their
+# causes is printed to standard output, and an output file is created
+# in the report directory.
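+#
+# A minimal usage sketch (the output path is hypothetical; see run()):
+#
+#   correlator = Correlator("report/output/<bundle>/plugins")
+#   failures, events, alarms, states = correlator.run("all")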
+# +# TODO: Modularize code and separate methods into their own files +# +######################################################################## + +from datetime import datetime +from datetime import timedelta +import logging +import os +import re + + +logger = logging.getLogger(__name__) + + +class Correlator: + def __init__(self, plugin_output_dir): + """Constructor for the Correlator class + + Parameters: + plugin_output_dir (string): Path to directory with output files + from plugins + """ + self.plugin_output_dir = plugin_output_dir + + def run(self, hostname): + """Searches through the output files created by the plugins for + failures and determines their causes, as well as extracts significant + events and state changes + + Errors: + FileNotFoundError + """ + failures = [] + try: + failures += self.uncontrolled_swact() + except FileNotFoundError as e: + logger.error(e) + + try: + failures += self.mtc_errors() + except FileNotFoundError as e: + logger.error(e) + + events = [] + try: + events += self.get_events(hostname) + except FileNotFoundError as e: + logger.error(e) + + alarms = [] + try: + alarms += self.get_alarms(hostname) + except FileNotFoundError as e: + logger.error(e) + + state_changes = [] + try: + state_changes += self.get_state_changes(hostname) + except FileNotFoundError as e: + logger.error(e) + + return (sorted(failures), sorted(events), sorted(alarms), + sorted(state_changes)) + + def uncontrolled_swact(self): + """Searches through the output file created by the swact activity + plugin for uncontrolled swacts and determines their causes through + other indicators, like the log "Neighbour [..] is now in the down" + + Errors: + FileNotFoundError + """ + data = [] + + # Variables to keep track of indicators for failure causes + start_time = end_time = svc_failed = None + ctrlr_down = None # Active controller that went down, causing swact + ctrlr_svc_fail = None # Active controller where service failed + ctrlr_link_down = None # Orig. 
active controller when link went down + hb_loss = active_failed = go_active_failed = link_down = False + + # Open output file from swact activity plugin and read it + file_path = os.path.join(self.plugin_output_dir, "swact_activity") + + with open(file_path, "r") as swact_activity: + for line in swact_activity: + if "Uncontrolled swact" in line and not start_time: + start_time = datetime.strptime(line[0:19], + "%Y-%m-%dT%H:%M:%S") + if ("Host from active to failed, Peer from standby to " + "active" in line): + link_down = True + ctrlr_link_down = re.findall( + r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) " + "sm:", line)[0] + elif (re.search("Neighbor (.+) is now in the down", line) + and start_time and not ctrlr_down): + ctrlr_down = re.findall( + r"Neighbor \((.+)\) received event", line)[0] + elif (re.search("Service (.+) is failed and has reached max " + "failures", line) and not svc_failed): + svc_failed = re.findall( + r"Service \((.+)\) is failed", line)[0] + ctrlr_svc_fail = re.findall( + r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:", + line)[0] + elif (svc_failed and re.search( + "active-failed\\s+\\| disabling-failed\\s+\\| " + + svc_failed, line)): + if re.search(r"\| go-active-failed\s+\|", line): + go_active_failed = True + else: + active_failed = True + elif "Swact update" in line and start_time and not end_time: + end_time = datetime.strptime(line[0:19], + "%Y-%m-%dT%H:%M:%S") + if ctrlr_down: + try: + hb_loss = self.search_hb_loss( + start_time, end_time, ctrlr_down) + except FileNotFoundError as e: + logger.error(e) + + start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S") + end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S") + if link_down: + data.append(start_time + " to " + end_time + + " Uncontrolled swact, refer to SM logs " + "for in-depth analysis, original active " + "controller: " + ctrlr_link_down + "\n") + elif ctrlr_down: + if hb_loss: + data.append(start_time + " to " + end_time + + " Uncontrolled swact due to " + "spontaneous reset of active " + "controller " + ctrlr_down + "\n") + else: + data.append(start_time + " to " + end_time + + " Uncontrolled swact likely due to " + "spontaneous reset of active " + "controller " + ctrlr_down + "\n") + elif svc_failed: + if active_failed and go_active_failed: + data.append(start_time + " to " + end_time + + " Uncontrolled swact due to service " + "failure (" + svc_failed + ") twice " + "in 2 minutes was unsuccessful so " + "\"bounced back\" to original active " + "controller " + ctrlr_svc_fail + "\n") + elif active_failed: + data.append(start_time + " to " + end_time + + " Uncontrolled swact due to service " + "failure (" + svc_failed + ") twice " + "in 2 minutes on active controller " + + ctrlr_svc_fail + "\n") + else: + data.append(start_time + " to " + end_time + + " Uncontrolled swact likely due to " + "service failure (" + svc_failed + + ") twice in 2 minutes on active " + "controller " + ctrlr_svc_fail + "\n") + + start_time = end_time = svc_failed = None + ctrlr_down = ctrlr_svc_fail = ctrlr_link_down = None + hb_loss = active_failed = go_active_failed = False + link_down = False + + return data + + def mtc_errors(self): + """Searches through the output file created by the maintenance errors + plugin for failures and determines their causes through other + indicators, like the log "Loss Of Communication for 5 seconds" + + Errors: + FileNotFoundError + """ + data = [] + + # Variables to keep track of indicators for failure causes + goenable_start = goenable_end = goenable_host = None + goenable_tst_f 
= config_tst_f = None # Tests failed + config_start = config_end = config_host = puppet_error = None + hb_loss_start = hb_loss_end = hb_loss_host = None + daemon_fail = comm_loss = auto_recov_dis = False + + # Open output file from maintenance errors plugin and read it + file_path = os.path.join(self.plugin_output_dir, "maintenance_errors") + + with open(file_path, "r") as mtc: + for line in mtc: + if "auto recovery disabled" in line and not auto_recov_dis: + # Check if previous failure recorded was go-enable, + # configuration or heartbeat failure + if (data and + re.search(r"Go-enable|[cC]onfiguration|Heartbeat", + data[-1])): + host = re.findall(r"failure on ([^\s]+)", data[-1]) + # Check if host in auto recovery disabled mode is same + # as host with previous failure + if (host and re.search( + host[0] + " auto recovery disabled", line)): + old = data[-1].split("due", 1) + if len(old) == 1: + data[-1] = (data[-1][:-1] + + " (auto recovery disabled)\n") + else: + data[-1] = (old[0] + + "(auto recovery disabled) due" + + old[1]) + auto_recov_dis = True + elif "GOENABLED Failed" in line and not goenable_start: + goenable_start, auto_recov_dis = line[0:19], False + goenable_host = re.findall( + "Error : (.+) got GOENABLED Failed", line)[0] + elif ("configuration failed or incomplete" in line + and not config_start): + config_start = datetime.strptime(line[0:19], + "%Y-%m-%dT%H:%M:%S") + auto_recov_dis = False + config_host = re.findall( + "Error : (.+) configuration failed", line)[0] + elif "Heartbeat Loss" in line: + # Check if previous failure recorded was heartbeat loss + # due to missing heartbeat messages + if ("(during recovery soak)" in line and data and + re.search("missing heartbeat messages", data[-1])): + host = re.findall( + "failure on (.+) due to", data[-1])[0] + # Check if host with hearbeat loss failure is the same + # as host with previous failure + if (re.search(host + " (.+) Heartbeat Loss (.+) " + "\\(during recovery soak\\)", line)): + old = data[-1] + data[-1] = (old[0:23] + line[0:19] + old[42:-1] + + " (recovery over disabled due to " + "heartbeat soak failure)\n") + else: + hb_loss_start = line[0:19] + comm_loss = auto_recov_dis = False + hb_loss_host = re.findall("Error : (.+) [CM]", line)[0] + # Check if previous failure recorded was heartbeat loss due to + # missing heartbeat messages + elif ("regained MTCALIVE from host that has rebooted" in line + and data and re.search(r"Heartbeat loss failure (.+) " + r"\(recovery over disabled\)", + data[-1])): + host = re.findall("failure on (.+) due to", data[-1])[0] + if re.search(host + " regained MTCALIVE", line): + old = data[-1].split("due", 1)[0] + data[-1] = (old[0:23] + line[0:19] + old[42:] + + "due to uncontrolled reboot\n") + elif (hb_loss_start and not comm_loss and hb_loss_host and + re.search(hb_loss_host + " Loss Of Communication for 5 " + "seconds", line)): + comm_loss = True + elif re.search("mtcClient --- (.+)Error : FAILED:", line): + if goenable_start and not goenable_tst_f: + goenable_tst_f = re.findall( + r"Error : FAILED: (.+) \(\d", line)[0] + elif config_start and not config_tst_f: + config_tst_f = re.findall( + r"Error : FAILED: (.+) \(\d", line)[0] + elif (goenable_host and not goenable_end and + re.search(goenable_host + " Task: In-Test Failure, " + "threshold reached", line)): + goenable_end = line[0:19] + if goenable_tst_f: + data.append(goenable_start + " to " + goenable_end + + " Go-enable test failure on " + + goenable_host + " due to failing of " + + goenable_tst_f + "\n") + else: + 
data.append(goenable_start + " to " + goenable_end + + " Go-enable test failure on " + + goenable_host + " due to unknown test " + "failing\n") + + goenable_start = goenable_end = goenable_host = None + goenable_tst_f = None + elif (config_host and not config_end and + re.search(config_host + " Task: Configuration failure, " + "threshold reached", line)): + config_end = datetime.strptime(line[0:19], + "%Y-%m-%dT%H:%M:%S") + if (config_tst_f + != "/etc/goenabled.d/config_goenabled_check.sh"): + try: + daemon_fail = self.search_daemon_fail( + config_start, config_end, config_host) + except FileNotFoundError as e: + logger.error(e) + + if (config_tst_f == + "/etc/goenabled.d/config_goenabled_check.sh" + or daemon_fail): + try: + puppet_error = self.search_puppet_error( + config_start, config_end) + except FileNotFoundError as e: + logger.error(e) + + config_start = config_start.strftime( + "%Y-%m-%dT%H:%M:%S") + config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S") + if puppet_error: + data.append(config_start + " to " + config_end + + " Configuration failure on " + + config_host + " due to:\n" + + puppet_error) + else: + data.append(config_start + " to " + config_end + + " Configuration failure on " + + config_host + + " due to unknown cause\n") + else: + config_start = config_start.strftime( + "%Y-%m-%dT%H:%M:%S") + config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S") + data.append(config_start + " to " + config_end + + " Possible configuration failure on " + + config_host + "\n") + + config_start = config_end = config_host = None + config_tst_f = puppet_error = None + daemon_fail = False + elif (hb_loss_start and not hb_loss_end and hb_loss_host and + re.search(hb_loss_host + " Connectivity Recovered ", + line)): + hb_loss_end = line[0:19] + data.append(hb_loss_start + " to " + hb_loss_end + + " Heartbeat loss failure on " + hb_loss_host + + " due to too many missing heartbeat " + "messages\n") + + hb_loss_start = hb_loss_end = hb_loss_host = None + comm_loss = False + elif (hb_loss_start and comm_loss and not hb_loss_end and + hb_loss_host and re.search( + hb_loss_host + " Graceful Recovery Wait", line)): + hb_loss_end = line[0:19] + data.append(hb_loss_start + " to " + hb_loss_end + + " Heartbeat loss failure on " + hb_loss_host + + " due to too many missing heartbeat " + "messages (recovery over disabled)\n") + + hb_loss_start = hb_loss_end = hb_loss_host = None + comm_loss = False + + return data + + def search_hb_loss(self, start_time, end_time, host): + """Searches through the output file created by the heartbeat loss + plugin for "Heartbeat Loss" message from host between one minute before + start_time and end_time + + Errors: + FileNotFoundError + """ + hb_loss = False + + # Open output file from heartbeat loss plugin and read it + file_path = os.path.join(self.plugin_output_dir, "heartbeat_loss") + + with open(file_path, "r") as heartbeat_loss: + for line in heartbeat_loss: + if (re.search("Error : " + host + " (.+) Heartbeat Loss ", + line)): + date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + if (date >= start_time - timedelta(minutes=1) + and date <= end_time): + hb_loss = True + break + + return hb_loss + + def search_daemon_fail(self, start_time, end_time, host): + """Searches through the output file created by the daemon failures + plugin for "Failed to run the puppet manifest" message from host + between 10 seconds before start_time and end_time + + Errors: + FileNotFoundError + """ + daemon_fail = False + + # Open output file from daemon failures plugin and read 
it + file_path = os.path.join(self.plugin_output_dir, "daemon_failures") + + with open(file_path, "r") as daemon_failures: + for line in daemon_failures: + if (re.search("\\d " + host + + " (.+) Failed to run the puppet manifest", + line)): + date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + if (date >= start_time - timedelta(seconds=10) + and date <= end_time): + daemon_fail = True + break + + return daemon_fail + + def search_puppet_error(self, start_time, end_time): + """Searches through the output file created by the puppet errors + plugin for "Error:" message between 10 seconds before start_time and + end_time and returns it + + Errors: + FileNotFoundError + """ + puppet_log = None + + # Open output file from puppet errors plugin and read it + file_path = os.path.join(self.plugin_output_dir, "puppet_errors") + + with open(file_path, "r") as puppet_errors: + for line in puppet_errors: + if "Error: " in line: + date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + if (date >= start_time - timedelta(seconds=10) + and date <= end_time): + puppet_log = line + break + + return puppet_log + + def get_events(self, hostname): + """Searches through the output files created by the plugins for + significant events and summarizes them, such as "force failed by SM" + + Errors: + FileNotFoundError + """ + data = [] + + # Variables to keep track of details for events + mnfa_start, mnfa_hist = None, "" + + # Open output file from maintenance errors plugin and read it + file_path = os.path.join(self.plugin_output_dir, "maintenance_errors") + + with open(file_path, "r") as mtc: + for line in mtc: + if "force failed by SM" in line: + host = re.findall("Error : (.+) is being", line)[0] + if hostname == "all" or host == hostname: + data.append(line[0:19] + " " + host + + " force failed by SM\n") + elif "Graceful Recovery Failed" in line: + host = re.findall("Info : (.+) Task:", line)[0] + if hostname == "all" or host == hostname: + data.append(line[0:19] + " " + host + + " graceful recovery failed\n") + elif "MNFA ENTER" in line: + mnfa_start = datetime.strptime(line[0:19], + "%Y-%m-%dT%H:%M:%S") + elif "MNFA POOL" in line: + pool_hosts = len(line.split("MNFA POOL: ")[1].split()) + if mnfa_start: + mnfa_hist += (" " + str(pool_hosts)) + else: + data_len = len(data) + for n in range(0, data_len): + event = data[data_len - 1 - n] + if "Multi-node failure" in event: + temp = " " + str(pool_hosts) + ")\n" + data[data_len - 1 - n] = event[:-2] + temp + break + elif "MNFA EXIT" in line: + mnfa_duration = datetime.strptime(line[0:19], + "%Y-%m-%dT%H:%M:%S") + mnfa_duration -= mnfa_start + mnfa_start = mnfa_start.strftime("%Y-%m-%dT%H:%M:%S") + data.append(mnfa_start + " Multi-node failure avoidance " + + "(duration: " + str(mnfa_duration) + + "; history:" + mnfa_hist + ")\n") + + mnfa_start, mnfa_hist = None, "" + + # Open output file from swact activity plugin and read it + file_path = os.path.join(self.plugin_output_dir, "swact_activity") + + with open(file_path, "r") as swact_activity: + for line in swact_activity: + if (re.search("Service (.+) is failed and has reached max " + "failures", line)): + host = re.findall( + r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:", + line)[0] + svc_failed = re.findall( + r"Service \((.+)\) is failed", line)[0] + if hostname == "all" or host == hostname: + data.append(line[0:19] + " " + host + + " service failure (" + svc_failed + + ")\n") + + return data + + def get_alarms(self, hostname): + """Searches through the 'alarm' output file created by the 
alarm plugin + and summarizes which alarms were found as well as the number of times + they were set and cleared + + Errors: + FileNotFoundError + """ + data = [] + + # Open 'alarm' output file from alarm plugin and read it + file_path = os.path.join(self.plugin_output_dir, "alarm") + + with open(file_path, "r") as alarm: + extract = False + for line in alarm: + if re.search(" \\d", line) and extract: + if line.split()[2] == "set": + data[-1]["set"] += 1 + else: + data[-1]["clear"] += 1 + elif hostname == "all" or hostname in line: + extract = True + alarm = { + "name": line[:-1], + "set": 0, + "clear": 0, + } + + data.append(alarm) + else: + extract = False + + temp = [] + for entry in data: + temp.append(entry["name"] + " - set: " + str(entry["set"]) + + ", clear: " + str(entry["clear"]) + "\n") + data = temp + + return data + + def get_state_changes(self, hostname): + """Searches through the output files created by the state changes + plugin and summarizes the changes of state of the hosts, such as + "is ENABLED" + + Errors: + FileNotFoundError + """ + data = [] + + # Open output file from state changes plugin and read it + file_path = os.path.join(self.plugin_output_dir, "state_changes") + + with open(file_path, "r") as state_changes: + for line in state_changes: + if "is ENABLED" in line: + host = re.findall("Info : (.+) is ENABLED", line)[0] + state = re.findall("is (.+)\n", line)[0].lower() + if hostname == "all" or hostname in host: + data.append(line[0:19] + " " + host + " " + state + + "\n") + elif "locked-disabled" in line: + host = re.findall( + "Info : (.+) u?n?locked-disabled", line)[0] + if hostname == "all" or host == hostname: + data.append(line[0:19] + " " + host + " disabled\n") + + return data diff --git a/tools/collector/debian-scripts/report/execution_engine.py b/tools/collector/debian-scripts/report/execution_engine.py new file mode 100755 index 00000000..bfcd3c60 --- /dev/null +++ b/tools/collector/debian-scripts/report/execution_engine.py @@ -0,0 +1,869 @@ +######################################################################## +# +# Copyright (c) 2022 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +######################################################################## +# +# This file contains the ExecutionEngine class. +# The ExecutionEngine class contains all the available algorithms. +# +# The ExecutionEngine class runs plugins and gathers relevant logs and +# information, creating output files in the report directory. 
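+#
+# A minimal usage sketch (names are hypothetical; see execute()):
+#
+#   engine = ExecutionEngine(opts, output_dir)
+#   engine.execute(plugins, output_dir)
+#
+# where opts carries the parsed command line options and plugins is a
+# list of Plugin objects.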
+# +# TODO: Modularize code and separate plugin algorithms into their own +# files +# +######################################################################## + +from datetime import datetime +import gzip +import logging +import os +import re +import shutil +import subprocess +import tarfile + +import algorithms +from correlator import Correlator + + +logger = logging.getLogger(__name__) + + +class ExecutionEngine: + def __init__(self, opts, output_directory): + """Constructor for the ExecutionEngine class + + Parameters: + opts (dictionary): Options from command line + """ + self.opts = opts + self.hosts = {"controllers": {}, "workers": {}, "storages": {}} + self.active_controller_directory = None + + # Uncompresses host tar files if not already done + with open(os.path.join(output_directory, "untar.log"), "a") as logfile: + for obj in (os.scandir(self.opts.directory)): + info = os.path.splitext(obj.name) + if (obj.is_file() and obj.name != "report_tool.tgz" and + tarfile.is_tarfile(obj.path) and not + os.path.isdir(os.path.join(self.opts.directory, + info[0]))): + try: + subprocess.run(["tar", "xzfC", obj.path, + self.opts.directory], + stderr=logfile, check=True) + subprocess.run(["echo", "uncompressed", obj.name], + check=True) + except subprocess.CalledProcessError as e: + logger.error(e) + + for folder in (f.path for f in os.scandir(self.opts.directory)): + database_path = os.path.join(folder, "var", "extra", "database") + host_info_path = os.path.join(folder, "var", "extra", "host.info") + + if os.path.isdir(database_path) and os.listdir(database_path): + self.active_controller_directory = folder + + if os.path.exists(host_info_path): + hostname, subfunction = self._extract_subfunction( + host_info_path) + if "controller" in subfunction: + self.hosts["controllers"][hostname] = folder + elif "worker" in subfunction: + self.hosts["workers"][hostname] = folder + elif "storage" in subfunction: + self.hosts["storages"][hostname] = folder + + if not self.active_controller_directory: + raise ValueError("Active controller not found") + + def execute(self, plugins, output_directory): + """Run a list of plugins + + Parameters: + plugins (Plugin list): List of plugins to run + + Errors: + FileNotFoundError + """ + plugin_output_dir = os.path.join(output_directory, "plugins") + os.makedirs(plugin_output_dir, exist_ok=True) + + for plugin in plugins: + processing = "Processing plugin: " + os.path.basename(plugin.file) + hosts = {} + if ( + plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1 + ): # if host list is given + logger.info( + f"Processing plugin: {os.path.basename(plugin.file)}") + + for h in plugin.state["hosts"]: + if h == "all": + hosts.update(self.hosts["workers"]) + hosts.update(self.hosts["storages"]) + hosts.update(self.hosts["controllers"]) + else: + hosts.update(self.hosts[h]) + + for hostname, folderpath in hosts.items(): + + events = [] + if plugin.state["algorithm"] == algorithms.SUBSTRING: + events = self.substring( + plugin.state["substring"], + [ + os.path.join(folderpath, file) + for file in plugin.state["files"] + ], + ) + + # creating output file + output_file = os.path.join( + plugin_output_dir, + f"substring_{hostname}", + ) + if self.opts.verbose: + logger.info("output at " + + os.path.relpath(output_file)) + with open(output_file, "w") as file: + file.write( + f"Date range: {self.opts.start} until " + f"{self.opts.end}\n" + ) + file.write( + f"substrings: " + f"{' '.join(plugin.state['substring'])}\n" + ) + for line in events: + if line[-1] == "\n": + 
file.write(line) + else: + file.write(line + "\n") + else: + if plugin.state["algorithm"] == algorithms.SYSTEM_INFO: + info = self.system_info() + system_info_output = os.path.join(plugin_output_dir, + "system_info") + with open(system_info_output, "w") as file: + for i in info: + file.write(i + "\n") + + for k, v in self.hosts.items(): + file.write(f"{k}: {','.join(v.keys())}\n") + if self.opts.verbose: + logger.info(processing + ", output at " + + os.path.relpath(system_info_output)) + else: + logger.info(processing) + + elif plugin.state["algorithm"] == algorithms.AUDIT: + hosts = {} + hosts.update(self.hosts["workers"]) + hosts.update(self.hosts["storages"]) + hosts.update(self.hosts["controllers"]) + + for hostname, folderpath in hosts.items(): + self._create_output_file( + f"{hostname}_audit", + plugin_output_dir, + self.audit( + plugin.state["start"], + plugin.state["end"], + os.path.join( + folderpath, "var", "log", "dcmanager", + "audit.log" + ), + ), + processing, + ) + + elif plugin.state["algorithm"] == algorithms.SWACT_ACTIVITY: + self._create_output_file( + "swact_activity", plugin_output_dir, + self.swact_activity(), processing + ) + + elif plugin.state["algorithm"] == algorithms.PUPPET_ERRORS: + self._create_output_file( + "puppet_errors", plugin_output_dir, + self.puppet_errors(), processing + ) + + elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURES: + self._create_output_file( + "process_failures", plugin_output_dir, + self.process_failures(), processing + ) + + elif plugin.state["algorithm"] == algorithms.ALARM: + alarms, logs = self.alarm( + plugin.state["alarm_exclude"], + plugin.state["entity_exclude"] + ) + alarm_output = os.path.join(plugin_output_dir, "alarm") + log_output = os.path.join(plugin_output_dir, "log") + + # creating output alarm file + with open(alarm_output, "w") as file: + for k, v in alarms.items(): + file.write(f"{k}:\n") + for date in v["dates"]: + file.write(f" {date}\n") + + # creating output log file + with open(log_output, "w") as file: + for k, v in logs.items(): + file.write(f"{k}: {v['count']}\n") + file.write("\n") + for k, v in logs.items(): + file.write(f"{k}:\n") + for date in v["dates"]: + file.write(f" {date}\n") + if self.opts.verbose: + logger.info(processing + ", output at " + + os.path.relpath(alarm_output) + + ", " + os.path.relpath(log_output)) + else: + logger.info(processing) + elif plugin.state["algorithm"] == algorithms.HEARTBEAT_LOSS: + self._create_output_file( + "heartbeat_loss", plugin_output_dir, + self.heartbeat_loss(), processing + ) + elif plugin.state["algorithm"] == algorithms.MAINTENANCE_ERR: + self._create_output_file( + "maintenance_errors", plugin_output_dir, + self.maintenance_errors(), processing + ) + elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES: + self._create_output_file( + "daemon_failures", plugin_output_dir, + self.daemon_failures(), processing + ) + elif plugin.state["algorithm"] == algorithms.STATE_CHANGES: + self._create_output_file( + "state_changes", plugin_output_dir, + self.state_changes(), processing + ) + + if not self.opts.verbose: + logger.info("Output files for plugins can be found at " + + os.path.relpath(plugin_output_dir)) + + # Running the correlator and printing the output from it + self.run_correlator(output_directory, plugin_output_dir) + + # Built-in algorithms ------------------------------ + def alarm(self, alarm_exclude=[], entity_exclude=[]): + """Alarm algorithm + Gathers list of alarms and customer logs + + Parameters: + alarm_exclude (string list) : 
List of alarm id patterns to not + search for + entity_exclude (string list): List of entity id patterns to not + search for + """ + alarm_data = {} + log_data = {} + + with open( + os.path.join( + self.active_controller_directory, + "var", "extra", "database", "fm.db.sql.txt" + ) + ) as file: + start = False + for line in file: + # start of event log + if re.search(r"COPY (public\.)?event_log", line): + start = True + elif start and line == "\\.\n": + break + elif start: + entry = re.split(r"\t", line) + + INDEX_ALARM_ID = 5 + INDEX_ACTION = 6 + INDEX_ENTITY_ID = 8 + INDEX_ALARM_DATE = 9 + INDEX_SEVERITY = 10 + + alarm_id = entry[INDEX_ALARM_ID] + entity_id = entry[INDEX_ENTITY_ID] + action = entry[INDEX_ACTION] + severity = entry[INDEX_SEVERITY] + alarm_date = entry[INDEX_ALARM_DATE] + + entry_date = alarm_date.replace( + " ", "T" + ) # making time format of alarm the same + if (self.opts.start <= entry_date + and entry_date <= self.opts.end): + cont = True + # Checks if the alarm is in the user specified list of + # alarm or entity ids + for id in alarm_exclude: + if id in alarm_id: + cont = False + break + + for entity in entity_exclude: + if entity in entity_id: + cont = False + break + + if not cont: + continue + + try: + if action == "log": + log_info = log_data[ + f"{alarm_id} {entity_id} {severity}" + ] + log_info["count"] += 1 + log_info["dates"].append(alarm_date) + else: + alarm_info = alarm_data[ + f"{alarm_id} {entity_id} {severity}" + ] + alarm_info["dates"].append( + f"{alarm_date} {action}") + except KeyError: + if entry[6] != "log": + alarm_data[ + f"{alarm_id} {entity_id} {severity}" + ] = { + "dates": [f"{alarm_date} {action}"], + } + else: + log_data[ + f"{alarm_id} {entity_id} {severity}" + ] = { + "count": 1, + "dates": [alarm_date], + } + + for _, v in alarm_data.items(): + v["dates"] = sorted(v["dates"]) + temp = [] + temp.append(v["dates"][0]) + for i in range(1, len(v["dates"])): + if v["dates"][i].split()[2] != v["dates"][i-1].split()[2]: + temp.append(v["dates"][i]) + v["dates"] = temp + + for _, v in log_data.items(): + v["dates"] = sorted(v["dates"]) + + return alarm_data, log_data + + def substring(self, substr, files): + """Substring algorithm + Looks for substrings within files + + Parameters: + substr (string list): List of substrings to look for + files (string list): List of absolute filepaths to search in + + Errors: + FileNotFoundError + """ + # don't analyze older files, continue with current file + CONTINUE_CURRENT = 0 + # analyze older files, continue with current file + CONTINUE_CURRENT_OLD = 1 + + data = [] + for file in files: + try: + if not os.path.exists(file): + if (re.search("controller-1_(.+)/var/log/mtcAgent.log", + file)): + continue + raise FileNotFoundError(f"File not found: {file}") + cont = True + # Searching through file + command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """ + f"""{file} 2>/dev/null""") + status = self._continue(file) + + if (status == CONTINUE_CURRENT + or status == CONTINUE_CURRENT_OLD): + # continue with current file + if status == CONTINUE_CURRENT: + cont = False + self._evaluate_substring(data, command) + + # Searching through rotated log files that aren't compressed + n = 1 + while os.path.exists(f"{file}.{n}") and cont: + command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """ + f"""{file}.{n} 2>/dev/null""") + status = self._continue(f"{file}.{n}") + + if (status == CONTINUE_CURRENT + or status == CONTINUE_CURRENT_OLD): + if status == CONTINUE_CURRENT: + cont = False + 
self._evaluate_substring(data, command) + + n += 1 + + # Searching through rotated log files + while os.path.exists(f"{file}.{n}.gz") and cont: + command = (f"""zgrep -E "{'|'.join(s for s in substr)}" """ + f"""{file}.{n}.gz 2>/dev/null""") + status = self._continue(f"{file}.{n}.gz", compressed=True) + + if (status == CONTINUE_CURRENT + or status == CONTINUE_CURRENT_OLD): + if status == CONTINUE_CURRENT: + cont = False + self._evaluate_substring(data, command) + + n += 1 + + except FileNotFoundError as e: + logger.error(e) + continue + + return sorted(data) + + def system_info(self): + """System info algorithm + Presents basic information about the system + """ + data = [] + with open( + os.path.join( + self.active_controller_directory, "etc", "platform", + "platform.conf" + ) + ) as file: + for line in file: + if "system_mode" in line: + data.append( + f"System Mode: " + f"{re.match('^system_mode=(.*)', line).group(1)}" + ) + elif "system_type" in line: + data.append( + f"System Type: " + f"{re.match('^system_type=(.*)', line).group(1)}" + ) + elif "distributed_cloud_role" in line: + role = re.match('^distributed_cloud_role=(.*)', + line).group(1) + data.append(f"Distributed cloud role: {role}") + elif "sw_version" in line: + data.append( + f"SW Version: " + f"{re.match('^sw_version=(.*)', line).group(1)}" + ) + with open( + os.path.join(self.active_controller_directory, "etc", "build.info") + ) as file: + for line in file: + if "BUILD_TYPE" in line: + data.append( + f"Build Type: " + f"{re.match('^BUILD_TYPE=(.*)', line).group(1)}" + ) + elif re.match("^OS=(.*)", line): + data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}") + + return data + + def swact_activity(self): + """Swact activity algorithm + Presents all swacting activity in the system + """ + data = [] + sm_files = [] + sm_customer_files = [] + swact_start = None + swact_in_progress = False + swact_end = None + + for _, folder in self.hosts["controllers"].items(): + sm_path = os.path.join(folder, "var", "log", "sm.log") + sm_files.append(sm_path) + sm_customer_path = os.path.join(folder, "var", "log", + "sm-customer.log") + sm_customer_files.append(sm_customer_path) + + sm_substrings = ["Uncontrolled swact", "Swact has started,", + "Neighbor (.+) is now in the down", + "Service (.+) has reached max failures", + "Swact update"] + data = self.substring(sm_substrings, sm_files) + + for i, line in enumerate(data): + if "Swact has started," in line and not swact_in_progress: + swact_in_progress = True + swact_start = datetime.strptime(line[0:19], + "%Y-%m-%dT%H:%M:%S") + elif "Swact update" in line and swact_in_progress: + swact_in_progress = False + swact_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + line += f" SWACT TOOK {swact_end - swact_start} \n" + data[i] = line + + sm_customer_substrings = [ + "swact", "active-failed\\s+\\| disabling-failed\\s+\\|" + ] + data += self.substring(sm_customer_substrings, sm_customer_files) + + return sorted(data) + + def puppet_errors(self): + """Puppet errors algorithm + Presents log errors from puppet logs + """ + data = [] + for host_type in self.hosts.keys(): + for _, folder in self.hosts[host_type].items(): + puppet_folder = os.path.join(folder, "var", "log", "puppet") + command = (f"""grep -rh "[m ]Error: " {puppet_folder} """ + f"""2>/dev/null""") + self._evaluate_substring(data, command) + return sorted(data) + + def process_failures(self): + """Process failures algorithm + Presents log errors from pmond + """ + data = [] + files = [] + for host_type in 
self.hosts.keys(): + for _, folder in self.hosts[host_type].items(): + pmond = os.path.join(folder, "var", "log", "pmond.log") + files.append(pmond) + + data = self.substring(["Error :"], files) + + return data + + def heartbeat_loss(self): + """Heartbeat loss algorithm + Presents all heartbeat loss error messages in the system + """ + data = [] + hb_files = [] + + for _, folder in self.hosts["controllers"].items(): + hb_path = os.path.join(folder, "var", "log", "hbsAgent.log") + hb_files.append(hb_path) + + hb_substrings = ["Heartbeat Loss"] + data = self.substring(hb_substrings, hb_files) + + return sorted(data) + + def maintenance_errors(self): + """Maintenance errors algorithm + Presents maintenance errors and other relevant log messages in system + """ + data = [] + mtc_files = [] + + for _, folder in self.hosts["controllers"].items(): + agent = os.path.join(folder, "var", "log", "mtcAgent.log") + mtc_files.append(agent) + + for host_type in self.hosts.keys(): + for _, folder in self.hosts[host_type].items(): + client = os.path.join(folder, "var", "log", "mtcClient.log") + mtc_files.append(client) + + mtc_substrings = ["Error : ", "Configuration failure", + "In-Test Failure", "Loss Of Communication", + "Graceful Recovery Wait ", + "regained MTCALIVE from host that has rebooted", + "Connectivity Recovered ; ", + "auto recovery disabled", "Graceful Recovery Failed", + "MNFA ENTER", "MNFA EXIT", "MNFA POOL"] + data = self.substring(mtc_substrings, mtc_files) + + return sorted(data) + + def daemon_failures(self): + """Daemon failures algorithm + Presents all failed puppet manifest messages in the system + """ + data = [] + daemon_files = [] + + for host_type in self.hosts.keys(): + for _, folder in self.hosts[host_type].items(): + daemon_path = os.path.join(folder, "var", "log", "daemon.log") + daemon_files.append(daemon_path) + + daemon_substrings = ["Failed to run the puppet manifest"] + data = self.substring(daemon_substrings, daemon_files) + + return sorted(data) + + def state_changes(self): + """State changes algorithm + Presents all messages in the system regarding the state of hosts + """ + data = [] + sc_files = [] + + for _, folder in self.hosts["controllers"].items(): + sc_path = os.path.join(folder, "var", "log", "mtcAgent.log") + sc_files.append(sc_path) + + sc_substrings = ["is ENABLED", "allStateChange (.+)locked-disabled"] + data = self.substring(sc_substrings, sc_files) + + return sorted(data) + + def audit(self, start, end, audit_log_path): + """Counts audit events in dcmanager within a specified date range + + Parameters: + start (string) : start date in YYYY-MM-DD HH:MM:SS format + end (string) : end date in YYYY-MM-DD HH:MM:SS format + audit_log_path (string) : absolute path of augit log file + """ + if not shutil.which("lnav"): + raise ValueError("Lnav program not found") + + SECONDS_PER_HOUR = 3600 + fmt = "%Y-%m-%d %H:%M:%S" + + d1 = datetime.strptime(start, fmt) + d2 = datetime.strptime(end, fmt) + seconds = (d2 - d1).total_seconds() + + log_texts = [ + "Triggered subcloud audit%", + "Trigger patch audit%", + "Trigger load audit%", + "Triggered firmware audit%", + "Triggered kubernetes audit%", + # Counts sum of audits from all subclouds + ] + INDEX_MIDDLE_WORD = 1 + data = [("These rates and totals represent the sum of audits from " + + "all subclouds")] + + def command(text): + + return ( + f'lnav -R -n -c ";SELECT count(log_body) AS ' + f'{text.split(" ")[INDEX_MIDDLE_WORD]}_total from ' + f'openstack_log WHERE ' + f'(log_time > \\"{start}\\" AND not log_time > 
\\"{end}\\")' + f' AND log_body like \\"{text}\\"" "{audit_log_path}"' + ) + + for text in log_texts: + p = subprocess.Popen(command(text), shell=True, + stdout=subprocess.PIPE) + for line in p.stdout: + line = line.decode("utf-8").strip() + if line.isnumeric(): + data.append( + f"rate " + f"{round((int(line)/seconds * SECONDS_PER_HOUR), 3)} " + f"per hour. total: {line}" + ) + else: + data.append(line) + return data + + # ----------------------------------- + + def run_correlator(self, output_directory, plugin_output_dir): + """Runs the correlator and prints the results differently based on if + the tool was run with or without the verbose option + + Parameters: + output_directory (string) : directory to place output files from + correlator + plugin_output_dir (string) : directory with output files from + plugins + """ + correlator = Correlator(plugin_output_dir) + failures, events, alarms, state_changes = correlator.run( + self.opts.hostname) + failures_len, events_len = len(failures), len(events) + alarms_len, state_changes_len = len(alarms), len(state_changes) + failures.append("\nTotal failures found: " + str(failures_len) + "\n") + events.append("\nTotal events found: " + str(events_len) + "\n") + alarms.append("\nTotal alarms found: " + str(alarms_len) + "\n") + state_changes.append("\nTotal state changes found: " + + str(state_changes_len) + "\n") + + logger.info("\nRunning correlator...") + self._create_output_file("correlator_failures", output_directory, + failures, "") + self._create_output_file("correlator_events", output_directory, + events, "") + self._create_output_file("correlator_alarms", output_directory, + alarms, "") + self._create_output_file("correlator_state_changes", output_directory, + state_changes, "") + + if not self.opts.verbose: + logger.info("Output can be found at " + + os.path.relpath(output_directory) + "\n") + logger.info("Failures: " + str(failures_len)) + for f in failures[:-1]: + if "Uncontrolled swact" in f: + logger.info(f[0:19] + " " + + re.findall("active controller:? 
(.+)\n", + f)[0] + " uncontrolled swact") + elif "failure on" in f: + host = re.findall(r"failure on ([^\s]+) ", f)[0] + logger.info(f[0:19] + " " + host + " " + + re.findall("^(.+) failure on ", + f[43:])[0].lower() + " failure") + else: + logger.info(f[:-1]) + if failures_len != 0: + logger.info("\nEvents: " + str(events_len)) + else: + logger.info("Events: " + str(events_len)) + logger.info("Alarms: " + str(alarms_len)) + logger.info("State Changes: " + str(state_changes_len)) + else: + logger.info("\nFailures: " + str(failures_len)) + for f in failures[:-1]: + logger.info(f[:-1]) + + # Dictionary to keep track of number of times events happens on + # each host + events_summ = {} + for e in events[:-1]: + k = e[20:-1].split(" (", 1)[0] + if not events_summ.get(k): + events_summ[k] = 1 + else: + events_summ[k] += 1 + + if failures_len != 0: + logger.info("\nEvents: " + str(events_len)) + else: + logger.info("Events: " + str(events_len)) + for k, v in sorted(events_summ.items()): + logger.info(k + ": " + str(v) + " time(s)") + + if events_len != 0: + logger.info("\nAlarms: " + str(alarms_len)) + else: + logger.info("Alarms: " + str(alarms_len)) + logger.info("The full list of alarms can be found at " + + os.path.relpath(output_directory) + + "/correlator_alarms") + + # Dictionary to keep track of number of times state changes + # happens on each host + state_changes_summ = {} + for s in state_changes[:-1]: + k = s[20:-1] + if "enabled" in k: + k = k.split("enabled", 1)[0] + "enabled" + if not state_changes_summ.get(k): + state_changes_summ[k] = 1 + else: + state_changes_summ[k] += 1 + + if alarms_len != 0: + logger.info("\nState Changes: " + str(state_changes_len)) + else: + logger.info("State Changes: " + str(state_changes_len)) + for k, v in sorted(state_changes_summ.items()): + logger.info(k + ": " + str(v) + " time(s)") + + def _continue(self, file, compressed=False): + # don't analyze older files, continue with current file + CONTINUE_CURRENT = 0 + # analyze older files, continue with current file + CONTINUE_CURRENT_OLD = 1 + # don't analyze current file, continue to older files + CONTINUE_OLD = 2 + + # check date of first log event and compare with provided + # start, end dates + first = "" + + if not compressed: + with open(file) as f: + line = f.readline() + first = line[0:19] + else: + with gzip.open(file, "rb") as f: + line = f.readline().decode("utf-8") + first = line[0:19] + try: + datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + first = line[0:19] + except ValueError: + return CONTINUE_CURRENT_OLD + + if first < self.opts.start: + return CONTINUE_CURRENT + elif first < self.opts.end and first > self.opts.start: + return CONTINUE_CURRENT_OLD + elif first > self.opts.end: + return CONTINUE_OLD + + def _evaluate_substring(self, data, command): + p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + for line in p.stdout: + line = line.decode("utf-8") + # different date locations for log events + dates = [line[0:19], line[2:21]] + for date in dates: + try: + datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") + if date > self.opts.start and date < self.opts.end: + if line[0] == "|": # sm-customer.log edge case + line = line[1:].strip() + line = re.sub("\\s+", " ", line) + data.append(line) + break + except ValueError: + if date == dates[-1]: + data.append(line) + + def _extract_subfunction(self, host_info_path): + GROUP_ONE = 1 + with open(host_info_path) as file: + for line in file: + hostname_match = re.match( + r"\s*hostname =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line) 
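+                # host.info stores values in 'key => "value"' form, e.g.
+                #   hostname => "controller-0",
+                # and subfunction below follows the same pattern shape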
+ subfunction_match = re.match( + r"\s*subfunction =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line) + if subfunction_match: + subfunction = subfunction_match.group(GROUP_ONE) + if hostname_match: + hostname = hostname_match.group(GROUP_ONE) + return hostname, subfunction + + def _create_output_file(self, filename, directory, data, processing): + with open(os.path.join(directory, filename), "w") as file: + for i in data: + if i[-1] == "\n": + file.write(i) + else: + file.write(i + "\n") + if self.opts.verbose: + output = ("output at " + + os.path.relpath(os.path.join(directory, filename))) + if processing == "": + logger.info(output) + else: + logger.info(processing + ", " + output) + elif processing != "": + logger.info(processing) diff --git a/tools/collector/report/plugin.py b/tools/collector/debian-scripts/report/plugin.py similarity index 69% rename from tools/collector/report/plugin.py rename to tools/collector/debian-scripts/report/plugin.py index 1ecf40e0..a9f4a739 100755 --- a/tools/collector/report/plugin.py +++ b/tools/collector/debian-scripts/report/plugin.py @@ -40,8 +40,8 @@ class Plugin: "files": [], "hosts": [], "substring": [], - "alarm_ids": [], - "entity_ids": [], + "alarm_exclude": [], + "entity_exclude": [], "start": None, "end": None, } @@ -93,10 +93,11 @@ class Plugin: self.state["substring"].append(data[1]) elif label == "hosts": self.state["hosts"] = value.replace(" ", "").split(",") - elif label == "alarm_ids": - self.state["alarm_ids"] = value.replace(" ", "").split(",") - elif label == "entity_ids": - self.state["entity_ids"] = value.replace(" ", "").split(",") + elif label == "alarm_exclude": + self.state["alarm_exclude"] = value.replace(" ", "").split(",") + elif label == "entity_exclude": + self.state["entity_exclude"] = value.replace( + " ", "").split(",") elif label == "files": self.state["files"] = value.replace(" ", "").split(",") elif label == "start": @@ -117,74 +118,77 @@ class Plugin: """ plugin_name = os.path.basename(self.file) + HOSTS_ERR = f"plugin: {plugin_name} should not have hosts specified" if self.state["algorithm"] == algorithms.SUBSTRING: - if len(self.state["files"]) == 0: - raise ValueError( - f"plugin: {plugin_name} needs files specified for substring algorithm" - ) - if len(self.state["hosts"]) == 0: - raise ValueError( - f"plugin: {plugin_name} needs hosts specified for substring algorithm" - ) - if len(self.state["substring"]) == 0: - raise ValueError( - f"plugin: {plugin_name} need substring specified for substring algorithm" - ) + self.validate_state(plugin_name, "files") + self.validate_state(plugin_name, "hosts") + self.validate_state(plugin_name, "substring") elif self.state["algorithm"] == algorithms.ALARM: if len(self.state["hosts"]) > 0: - raise ValueError( - f"plugin: {plugin_name} should not have hosts to be specified" - ) + raise ValueError(HOSTS_ERR) elif self.state["algorithm"] == algorithms.SYSTEM_INFO: if len(self.state["hosts"]) > 0: - raise ValueError( - f"plugin: {plugin_name} should not have hosts to be specified" - ) - elif self.state["algorithm"] == algorithms.SWACT: + raise ValueError(HOSTS_ERR) + elif self.state["algorithm"] == algorithms.SWACT_ACTIVITY: if len(self.state["hosts"]) > 0: - raise ValueError( - f"plugin: {plugin_name} should not have hosts to be specified" - ) - elif self.state["algorithm"] == algorithms.PUPPET: + raise ValueError(HOSTS_ERR) + elif self.state["algorithm"] == algorithms.PUPPET_ERRORS: if len(self.state["hosts"]) > 0: - raise ValueError( - f"plugin: {plugin_name} should not have hosts to be 
specified" - ) - elif self.state["algorithm"] == algorithms.PROCESS_FAILURE: + raise ValueError(HOSTS_ERR) + elif self.state["algorithm"] == algorithms.PROCESS_FAILURES: if len(self.state["hosts"]) > 0: - raise ValueError( - f"plugin: {plugin_name} should not have hosts to be specified" - ) + raise ValueError(HOSTS_ERR) + elif self.state["algorithm"] == algorithms.HEARTBEAT_LOSS: + if len(self.state["hosts"]) > 0: + raise ValueError(HOSTS_ERR) + elif self.state["algorithm"] == algorithms.MAINTENANCE_ERR: + if len(self.state["hosts"]) > 0: + raise ValueError(HOSTS_ERR) + elif self.state["algorithm"] == algorithms.DAEMON_FAILURES: + if len(self.state["hosts"]) > 0: + raise ValueError(HOSTS_ERR) + elif self.state["algorithm"] == algorithms.STATE_CHANGES: + if len(self.state["hosts"]) > 0: + raise ValueError(HOSTS_ERR) elif self.state["algorithm"] == algorithms.AUDIT: if len(self.state["hosts"]) > 0: - raise ValueError( - f"plugin: {plugin_name} should not have hosts to be specified" - ) + raise ValueError(HOSTS_ERR) try: datetime.strptime(self.state["start"], "%Y-%m-%d %H:%M:%S") except: raise ValueError( - f"plugin : {plugin_name} needs a start time in YYYY-MM-DD HH:MM:SS format" + f"plugin : {plugin_name} needs a start time in YYYY-MM-DD " + f"HH:MM:SS format" ) try: datetime.strptime(self.state["end"], "%Y-%m-%d %H:%M:%S") except: raise ValueError( - f"plugin : {plugin_name} needs an end time in YYYY-MM-DD HH:MM:SS format" + f"plugin : {plugin_name} needs an end time in YYYY-MM-DD " + f"HH:MM:SS format" ) else: raise ValueError( - f"plugin: {plugin_name} unknown algorithm {self.state['algorithm']}" + f"plugin: {plugin_name} unknown algorithm " + f"{self.state['algorithm']}" ) for host in self.state["hosts"]: if host not in ["controllers", "workers", "storages", "all"]: raise ValueError( - f"host not recognized: '{host}', accepted hosts are 'controllers', 'workers', 'storages', 'all'" + f"host not recognized: '{host}', accepted hosts are " + f"'controllers', 'workers', 'storages', 'all'" ) + def validate_state(self, plugin_name, key): + if len(self.state[key]) == 0: + raise ValueError( + f"plugin: {plugin_name} needs {key} specified for " + f"substring algorithm" + ) + def __str__(self) -> str: return f"{json.dumps(self.state)} File: {self.file}" diff --git a/tools/collector/debian-scripts/report/plugins/alarm b/tools/collector/debian-scripts/report/plugins/alarm new file mode 100755 index 00000000..2d4d416c --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/alarm @@ -0,0 +1,3 @@ +algorithm=alarm +alarm_exclude=400., 800. 
+entity_exclude=subsystem=vim diff --git a/tools/collector/debian-scripts/report/plugins/daemon_failures b/tools/collector/debian-scripts/report/plugins/daemon_failures new file mode 100755 index 00000000..4d5f7404 --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/daemon_failures @@ -0,0 +1 @@ +algorithm=daemon_failures diff --git a/tools/collector/debian-scripts/report/plugins/heartbeat_loss b/tools/collector/debian-scripts/report/plugins/heartbeat_loss new file mode 100755 index 00000000..2e5b13be --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/heartbeat_loss @@ -0,0 +1 @@ +algorithm=heartbeat_loss diff --git a/tools/collector/debian-scripts/report/plugins/maintenance_errors b/tools/collector/debian-scripts/report/plugins/maintenance_errors new file mode 100755 index 00000000..1f5bc144 --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/maintenance_errors @@ -0,0 +1 @@ +algorithm=maintenance_errors diff --git a/tools/collector/debian-scripts/report/plugins/process_failures b/tools/collector/debian-scripts/report/plugins/process_failures new file mode 100755 index 00000000..1dcdf8a8 --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/process_failures @@ -0,0 +1 @@ +algorithm=process_failures diff --git a/tools/collector/debian-scripts/report/plugins/puppet_errors b/tools/collector/debian-scripts/report/plugins/puppet_errors new file mode 100755 index 00000000..0eb6f0e4 --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/puppet_errors @@ -0,0 +1 @@ +algorithm=puppet_errors diff --git a/tools/collector/debian-scripts/report/plugins/state_changes b/tools/collector/debian-scripts/report/plugins/state_changes new file mode 100644 index 00000000..7d3fd2a1 --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/state_changes @@ -0,0 +1 @@ +algorithm=state_changes diff --git a/tools/collector/debian-scripts/report/plugins/substring b/tools/collector/debian-scripts/report/plugins/substring new file mode 100755 index 00000000..bae11679 --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/substring @@ -0,0 +1,5 @@ +algorithm=substring +files=var/log/mtcAgent.log, var/log/sm.log +hosts=controllers +substring=operation failed +substring=Failed to send message diff --git a/tools/collector/debian-scripts/report/plugins/swact_activity b/tools/collector/debian-scripts/report/plugins/swact_activity new file mode 100755 index 00000000..215570ae --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/swact_activity @@ -0,0 +1 @@ +algorithm=swact_activity diff --git a/tools/collector/debian-scripts/report/plugins/system_info b/tools/collector/debian-scripts/report/plugins/system_info new file mode 100755 index 00000000..0736b730 --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/system_info @@ -0,0 +1 @@ +algorithm=system_info diff --git a/tools/collector/debian-scripts/report/report.py b/tools/collector/debian-scripts/report/report.py new file mode 100755 index 00000000..b3c02878 --- /dev/null +++ b/tools/collector/debian-scripts/report/report.py @@ -0,0 +1,360 @@ +#!/usr/bin/env python3 +######################################################################## +# +# Copyright (c) 2022 Wind River Systems, Inc. +# +# SPDX-License-Identifier: Apache-2.0 +# +######################################################################## +# +# Description: The Report tool is used to gather relevant log events +# and information about the system from a collect bundle. 
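+#
+# Note: the collect tool packages this tool and its plugins into each
+# collect bundle as report_tool.tgz (see get_report_tool in the collect
+# script), so the tool can be run directly from an extracted bundle.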
+#
+#
+# The report tool allows user-created plugins, which decide relevance
+# for log events. Plugins contain an algorithm label that instructs the
+# tool what information to search for and how to search for it.
+#
+# The report tool requires the collect bundle and host tarballs to be
+# untarred.
+#
+# The report tool reads user plugins from the report directory in the
+# top level of the collect bundle, and outputs files containing
+# relevant logs to this directory as well.
+#
+# Typical Usage:
+#  command line                       functionality
+#  -------------------------------    ----------------------------------
+# > report.py                         - Run all plugins in directory
+# > report.py [plugin ...]            - Run only specified plugins
+# > report.py [algorithm] [labels]    - Run algorithm with labels
+# > report.py --help                  - help message
+# > report.py [algorithm] --help      - algorithm specific help
+#
+#    See --help output for a complete list of full and abbreviated
+#    command line options and examples of plugins.
+#
+# Refer to README file for more usage and output examples
+#######################################################################
+
+import argparse
+from cmath import log
+from datetime import datetime
+from datetime import timedelta
+from datetime import timezone
+import logging
+import os
+import subprocess
+import sys
+import time
+
+from execution_engine import ExecutionEngine
+from plugin import Plugin
+
+
+now = datetime.now(timezone.utc)
+base_dir = os.path.dirname(os.path.realpath(__file__))
+parent_dir = os.path.dirname(base_dir)
+default_path = os.path.dirname(parent_dir)
+plugins = []
+
+parser = argparse.ArgumentParser(
+    description="Log Event Reporter",
+    epilog="Place plugins in 'plugins' directory found in 'report' directory "
+    "at top level of collect bundle.\nOutput files will be placed in 'report' "
+    "directory.\nThis tool will create report.log and untar.log files along "
+    "with the other output files.",
+)
+parser.add_argument(
+    "-v",
+    "--verbose",
+    action="store_true",
+    help="Verbose output",
+)
+parser.add_argument(
+    "-s",
+    "--start",
+    default="20000101",
+    help="Specify a start date in YYYYMMDD format for analysis "
+    "(default: 20000101)",
+)
+parser.add_argument(
+    "-e",
+    "--end",
+    default=datetime.strftime(now + timedelta(days=1), "%Y%m%d"),
+    help="Specify an end date in YYYYMMDD format for analysis "
+    "(default: day after current date, so current-day logs are included)",
+)
+parser.add_argument(
+    "-p",
+    "--plugin",
+    default=None,
+    nargs="*",
+    help="Specify what plugins to run (default: runs every plugin in plugins "
+    "folder)",
+)
+parser.add_argument(
+    "-d",
+    "--directory",
+    default=default_path,
+    help="Specify top level of collect bundle to analyze "
+    "(default: two levels above tool directory)",
+)
+parser.add_argument(
+    "--hostname",
+    default="all",
+    help="Specify host for correlator to find significant events and state "
+    "changes for (default: all hosts)",
+)
+subparsers = parser.add_subparsers(help="algorithms", dest="algorithm")
+
+# substring algorithm arguments
+parser_substring = subparsers.add_parser(
+    "substring",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="""Searches through specified files for lines containing specified
+    substring. 
There will be an output file for each host of the host
+    type specified.""",
+    epilog="Plugin file example:\n"
+    "   algorithm=substring\n"
+    "   files=var/log/mtcAgent.log, var/log/sm.log\n"
+    "   hosts=controllers\n"
+    "   substring=operation failed\n"
+    "   substring=Failed to send message",
+)
+substring_required = parser_substring.add_argument_group("required arguments")
+substring_required.add_argument(
+    "--files",
+    required=True,
+    nargs="+",
+    help="Files to perform substring analysis on (required)",
+)
+substring_required.add_argument(
+    "--substring", nargs="+", required=True,
+    help="Substrings to search for (required)"
+)
+substring_required.add_argument(
+    "--hosts",
+    choices=["controllers", "workers", "storages", "all"],
+    required=True,
+    nargs="+",
+    help="Host types to perform analysis on (required)",
+)
+
+
+# alarm algorithm arguments
+parser_alarm = subparsers.add_parser(
+    "alarm",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Searches through fm.db.sql.txt for alarms and logs, skipping those "
+    "excluded by the plugin. There are 2 output files: 'alarm' and 'log'",
+    epilog="Plugin file example:\n"
+    "   algorithm=alarm\n"
+    "   alarm_exclude=400., 800.\n"
+    "   entity_exclude=subsystem=vim\n",
+)
+parser_alarm.add_argument(
+    "--alarm_exclude",
+    nargs="+",
+    required=False,
+    default=[],
+    help="Alarm id patterns to exclude from the search (not required)",
+)
+parser_alarm.add_argument(
+    "--entity_exclude",
+    nargs="+",
+    required=False,
+    default=[],
+    help="Entity id patterns to exclude from the search (not required)",
+)
+
+# system info algorithm
+parser_system_info = subparsers.add_parser(
+    "system_info",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents information about the system",
+    epilog="Plugin file example:\n" "   algorithm=system_info\n",
+)
+
+# swact activity algorithm
+parser_swact_activity = subparsers.add_parser(
+    "swact_activity",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents system swacting activity",
+    epilog="Plugin file example:\n" "   algorithm=swact_activity\n",
+)
+
+# puppet errors algorithm
+parser_puppet_errors = subparsers.add_parser(
+    "puppet_errors",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents any puppet errors",
+    epilog="Plugin file example:\n" "   algorithm=puppet_errors\n",
+)
+
+# process failures algorithm
+parser_process_failures = subparsers.add_parser(
+    "process_failures",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents any process failures from pmond.log",
+    epilog="Plugin file example:\n" "   algorithm=process_failures\n",
+)
+
+# daemon failures algorithm
+parser_daemon_failures = subparsers.add_parser(
+    "daemon_failures",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents any puppet manifest failures from daemon.log",
+    epilog="Plugin file example:\n" "   algorithm=daemon_failures\n",
+)
+
+# heartbeat loss algorithm
+parser_heartbeat_loss = subparsers.add_parser(
+    "heartbeat_loss",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents any heartbeat loss error messages from hbsAgent.log",
+    epilog="Plugin file example:\n" "   algorithm=heartbeat_loss\n",
+)
+
+# maintenance errors algorithm
+parser_maintenance_errors = subparsers.add_parser(
+    "maintenance_errors",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents errors and other relevant messages from mtcAgent.log and "
+    "mtcClient.log",
+    epilog="Plugin file example:\n" "   algorithm=maintenance_errors\n",
+)
+
+# state changes algorithm
+parser_state_changes = subparsers.add_parser(
+    
"state_changes", + formatter_class=argparse.RawTextHelpFormatter, + help="Presents any messages from mtcAgent.log regarding the state of " + "hosts, such as enabled/disabled", + epilog="Plugin file example:\n" " algorithm=state_changes\n", +) + +# audit algorithm +parser_audit = subparsers.add_parser( + "audit", + formatter_class=argparse.RawTextHelpFormatter, + help="Presents information about audit events in dcmanager.\n" + "The rates and totals represents the sum of audits on all subclouds ", + epilog="Plugin file example:\n" + " algorithm=audit\n" + " start=2022-06-01 10:00:00\n" + " end=2022-06-01 04:00:00\n", +) +parser_audit.add_argument( + "--start", + required=False, + default=datetime.strftime(now - timedelta(days=7), "%Y-%m-%d %H:%M:%S"), + type=str, + help="Specify a start date in YYYY-MM-DD HH:MM:SS format for analysis " + "(not required, default: 1 week ago)" +) +parser_audit.add_argument( + "--end", + required=False, + default=datetime.strftime(now, "%Y-%m-%d %H:%M:%S"), + type=str, + help="Specify an end date in YYYY-MM-DD HH:MM:SS format for analysis " + "(not required, default: today)" +) + + +args = parser.parse_args() +args.start = datetime.strptime(args.start, "%Y%m%d").strftime( + "%Y-%m-%dT%H:%M:%S") +args.end = datetime.strptime(args.end, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S") + +if args.directory.endswith("/"): + output_directory = os.path.join( + default_path, "report", "output", + os.path.basename(os.path.dirname(args.directory)) + ) +else: + output_directory = os.path.join( + default_path, "report", "output", os.path.basename(args.directory) + ) + +# creating report log +os.makedirs(output_directory, exist_ok=True) +open(os.path.join(output_directory, "report.log"), "w").close() + +# setting up logger +formatter = logging.Formatter("%(message)s") +logger = logging.getLogger() + +logging.basicConfig( + filename=os.path.join(output_directory, "report.log"), + level=logging.INFO, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y-%m-%dT%H:%M:%S", +) +logging.Formatter.converter = time.gmtime + +ch = logging.StreamHandler() +ch.setLevel(logging.INFO) +ch.setFormatter(formatter) + +logger.addHandler(ch) + +if not os.path.isdir(args.directory): + sys.exit("Top level of collect bundle given to analyze is not a directory") +else: + for obj in (os.scandir(args.directory)): + info = os.path.splitext(obj.name) + + # TODO: ask user which file to report on if more than one tarball in + # directory + # Check if collect tarball is in given directory and extracts it if + # not already done + if (obj.is_file() and info[1] == ".tar"): + try: + result = subprocess.check_output(["tar", "tf", obj.path], + encoding="UTF-8") + result = result.split("\n", 1) + if not os.path.isdir(os.path.join(args.directory, + os.path.dirname(result[0]))): + subprocess.run(["tar", "xfC", obj.path, args.directory], + check=True) + subprocess.run(["echo", "extracted", obj.name], check=True) + args.directory = os.path.join(args.directory, + os.path.dirname(result[0])) + break + except subprocess.CalledProcessError as e: + logger.error(e) + +try: + engine = ExecutionEngine(args, output_directory) +except ValueError as e: + logger.error(str(e)) + sys.exit("Confirm you are running the report tool on a collect bundle") + +if args.algorithm: + plugins.append(Plugin(opts=vars(args))) +else: + if args.plugin: + for p in args.plugin: + path = os.path.join(default_path, "report", "plugins", p) + if os.path.exists(path): + try: + plugins.append(Plugin(path)) + except Exception as e: + 
logger.error(str(e)) + + else: + logger.warning(f"{p} plugin does not exist") + else: + path = os.path.join(default_path, "report", "plugins") + if not os.path.exists(path): + os.mkdir(path) + logger.error("Plugins folder is empty") + else: + for file in os.listdir(path): + try: + plugins.append(Plugin(os.path.join(path, file))) + except Exception as e: + logger.error(str(e)) + +engine.execute(plugins, output_directory) diff --git a/tools/collector/debian/deb_folder/collector.install b/tools/collector/debian/deb_folder/collector.install index 05221956..142bfa80 100644 --- a/tools/collector/debian/deb_folder/collector.install +++ b/tools/collector/debian/deb_folder/collector.install @@ -3,3 +3,4 @@ etc/collect.d/* /etc/collect.d usr/local/sbin/* /usr/local/sbin usr/local/bin/collect /usr/local/bin usr/sbin/collect /usr/sbin +/usr/local/bin/report/* /usr/local/bin/report diff --git a/tools/collector/debian/deb_folder/rules b/tools/collector/debian/deb_folder/rules index 6f7a99a7..56d346dc 100755 --- a/tools/collector/debian/deb_folder/rules +++ b/tools/collector/debian/deb_folder/rules @@ -13,8 +13,10 @@ override_dh_auto_install: install -m 755 -d $(SYSCONFDIR)/collect.d install -m 755 -d $(SYSCONFDIR)/collect + install -m 755 -d $(SYSCONFDIR)/collect/plugins # Report Tool install -m 755 -d $(ROOT)/usr/local/sbin install -m 755 -d $(ROOT)/usr/local/bin + install -m 755 -d $(ROOT)/usr/local/bin/report/tool # Report Tool install -m 755 -d $(SBINDIR) install -m 755 -p collect $(ROOT)/usr/local/sbin/collect @@ -26,6 +28,24 @@ override_dh_auto_install: install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli + # Report Tool + install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/tool/report.py + install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/tool/execution_engine.py + install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/tool/algorithms.py + install -m 755 -p report/plugin.py $(ROOT)/usr/local/bin/report/tool/plugin.py + install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/tool/correlator.py + install -m 755 -p report/README $(ROOT)/usr/local/bin/report/tool/README + install -m 755 -p report/plugins/alarm $(SYSCONFDIR)/collect/plugins/alarm + install -m 755 -p report/plugins/daemon_failures $(SYSCONFDIR)/collect/plugins/daemon_failures + install -m 755 -p report/plugins/heartbeat_loss $(SYSCONFDIR)/collect/plugins/heartbeat_loss + install -m 755 -p report/plugins/maintenance_errors $(SYSCONFDIR)/collect/plugins/maintenance_errors + install -m 755 -p report/plugins/process_failures $(SYSCONFDIR)/collect/plugins/process_failures + install -m 755 -p report/plugins/puppet_errors $(SYSCONFDIR)/collect/plugins/puppet_errors + install -m 755 -p report/plugins/state_changes $(SYSCONFDIR)/collect/plugins/state_changes + install -m 755 -p report/plugins/substring $(SYSCONFDIR)/collect/plugins/substring + install -m 755 -p report/plugins/swact_activity $(SYSCONFDIR)/collect/plugins/swact_activity + install -m 755 -p report/plugins/system_info $(SYSCONFDIR)/collect/plugins/system_info + install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb diff --git a/tools/collector/report/README b/tools/collector/report/README deleted file mode 100644 index 12a8bd49..00000000 --- a/tools/collector/report/README 
+++ /dev/null @@ -1,62 +0,0 @@ -Refer to report.py file header for a description of the tool - -Example: - -Consider the following collect bundle structure - -SELECT_NODES_20220527.193605 -├── controller-0_20220527.193605 -│  ├── etc -│  ├── root -│  └── var -├── controller-1_20220527.193605 -│  ├── etc -│  ├── root -│  └── var -├── plugins (where the plugin files will be placed) -│  ├── alarm_plugin_example -│  └── substring_plugin_example -├── report - └── tool (where the tool will be placed) - └── output (where the output files will be placed) - - -> cat plugins/alarm_plugin_example - -algorithm=alarm -alarm_ids=400.,401. -entity_ids = host=controller-0 - -> cat plugins/substring_plugin_example - -algorithm=substring -files=var/log/mtcAgent.log -hosts=controllers -substring=operation failed - -> report/tool/report.py --start 20220501 --end 20220530 - -Running the command above will populate the report folder with output files. -The tool also provides default values, more details are in 'report.py -h'. - -The substring algorithm creates an output file for every host of the -specified host type. The files will contain log events within the -provided date range containing the substring 'operation failed'. - -The alarm algorithm creates two output file: 'log' and 'alarm' -'log' contains customer log messages created within the provided date range, -and 'alarm' contains system alarms created within the provided date range. - -For more detailed information about an algorithm use 'report.py -h'. - -Here is the report directory after running the above command - -report -├── output -│ └── 20220815.140008 (time in utc when tool was ran) -│ ├── alarm -│ ├── controller-0_substring_plugin_example_substring -│ ├── controller-1_substring_plugin_example_substring -│ ├── report.log (log file for report tool) -│ └── log -└── tool (where the report tool is) diff --git a/tools/collector/report/execution_engine.py b/tools/collector/report/execution_engine.py deleted file mode 100755 index 3e12abc4..00000000 --- a/tools/collector/report/execution_engine.py +++ /dev/null @@ -1,546 +0,0 @@ -######################################################################## -# -# Copyright (c) 2022 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -######################################################################## -# -# This file contains the ExecutionEngine class. -# The ExecutionEngine class contains all the available algorithms. -# -# The ExecutionEngine class runs plugins and gathers relevant logs and -# information, creating output files in the report directory. 
-# -######################################################################## - -from datetime import datetime -import gzip -import logging -import os -import re -import shutil -import subprocess - -import algorithms - - -logger = logging.getLogger(__name__) - - -class ExecutionEngine: - def __init__(self, opts): - """Constructor for the ExecutionEngine class - - Parameters: - opts (dictionary): Options from command line - """ - self.opts = opts - self.hosts = {"controllers": {}, "workers": {}, "storages": {}} - self.active_controller_directory = None - - for folder in (f.path for f in os.scandir(self.opts.directory)): - database_path = os.path.join(folder, "var", "extra", "database") - host_info_path = os.path.join(folder, "var", "extra", "host.info") - - if os.path.isdir(database_path) and os.listdir(database_path): - self.active_controller_directory = folder - - if os.path.exists(host_info_path): - hostname, subfunction = self._extract_subfunction(host_info_path) - if "controller" in subfunction: - self.hosts["controllers"][hostname] = folder - elif "worker" in subfunction: - self.hosts["workers"][hostname] = folder - elif "storage" in subfunction: - self.hosts["storages"][hostname] = folder - - if not self.active_controller_directory: - raise ValueError("Active controller not found") - - def execute(self, plugins, output_directory): - """Run a list of plugins - - Parameters: - plugins (Plugin list): List of plugins to run - - Errors: - FileNotFoundError - """ - - for plugin in plugins: - logger.info(f"Processing plugin: {os.path.basename(plugin.file)}") - hosts = {} - if ( - plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1 - ): # if host list is given - for h in plugin.state["hosts"]: - if h == "all": - hosts.update(self.hosts["workers"]) - hosts.update(self.hosts["storages"]) - hosts.update(self.hosts["controllers"]) - else: - hosts.update(self.hosts[h]) - - for hostname, folderpath in hosts.items(): - - events = [] - if plugin.state["algorithm"] == algorithms.SUBSTRING: - try: - events = self.substring( - plugin.state["substring"], - [ - os.path.join(folderpath, file) - for file in plugin.state["files"] - ], - ) - except FileNotFoundError as e: - logger.error(e) - continue - - # creating output file - output_file = os.path.join( - output_directory, - f"{hostname}_{os.path.basename(plugin.file)}_{plugin.state['algorithm']}", - ) - logger.info("output at " + output_file) - with open(output_file, "w") as file: - file.write( - f"Date range: {self.opts.start} until {self.opts.end}\n" - ) - file.write( - f"substrings: {' '.join(plugin.state['substring'])}\n" - ) - for line in events: - file.write(line + "\n") - else: - if plugin.state["algorithm"] == algorithms.SYSTEM_INFO: - info = self.system_info() - system_info_output = os.path.join(output_directory, "system_info") - with open(system_info_output, "w") as file: - for i in info: - file.write(i + "\n") - - for k, v in self.hosts.items(): - file.write(f"{k}: {','.join(v.keys())}\n") - logger.info("output at " + system_info_output) - - elif plugin.state["algorithm"] == algorithms.AUDIT: - hosts = {} - hosts.update(self.hosts["workers"]) - hosts.update(self.hosts["storages"]) - hosts.update(self.hosts["controllers"]) - - for hostname, folderpath in hosts.items(): - self._create_output_file( - f"{hostname}_audit", - output_directory, - self.audit( - plugin.state["start"], - plugin.state["end"], - os.path.join( - folderpath, "var", "log", "dcmanager", "audit.log" - ), - ), - ) - - elif plugin.state["algorithm"] == algorithms.SWACT: - 
self._create_output_file( - "swact_activity", output_directory, self.swact() - ) - - elif plugin.state["algorithm"] == algorithms.PUPPET: - self._create_output_file( - "puppet_errors", output_directory, self.puppet() - ) - - elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURE: - self._create_output_file( - "process_failures", output_directory, self.process_failure() - ) - - elif plugin.state["algorithm"] == algorithms.ALARM: - alarms, logs = self.alarm( - plugin.state["alarm_ids"], plugin.state["entity_ids"] - ) - alarm_output = os.path.join(output_directory, "alarm") - log_output = os.path.join(output_directory, "log") - os.makedirs(os.path.dirname(log_output), exist_ok=True) - - # creating output alarm file - with open(alarm_output, "w") as file: - for k, v in alarms.items(): - file.write(f"{k} {v['count']}\n") - file.write("\n") - for k, v in alarms.items(): - file.write(f"{k}\n") - for date in v["dates"]: - file.write(f" {date}\n") - - # creating output log file - with open(log_output, "w") as file: - for k, v in logs.items(): - file.write(f"{k} {v['count']}\n") - file.write("\n") - for k, v in logs.items(): - file.write(f"{k}\n") - for date in v["dates"]: - file.write(f" {date}\n") - logger.info("output at " + alarm_output) - logger.info("output at " + log_output) - - # Built-in algorithms ------------------------------ - def alarm(self, alarm_ids=[], entity_ids=[]): - """Alarm algorithm - Gathers list of alarms and customer logs - - Parameters: - alarm_ids (string list) : List of alarm id patterns to search for - entity_ids (string list): List of entity id patterns to search for - """ - alarm_data = {} - log_data = {} - with open( - os.path.join( - self.active_controller_directory, - "var", - "extra", - "database", - "fm.db.sql.txt", - ) - ) as file: - start = False - for line in file: - # start of event log - if "COPY event_log" in line: - start = True - elif start and line == "\\.\n": - break - elif start: - entry = re.split(r"\t", line) - - INDEX_ALARM_ID = 5 - INDEX_ACTION = 6 - INDEX_ENTITY_ID = 8 - INDEX_ALARM_DATE = 9 - INDEX_SEVERITY = 10 - - alarm_id = entry[INDEX_ALARM_ID] - entity_id = entry[INDEX_ENTITY_ID] - action = entry[INDEX_ACTION] - severity = entry[INDEX_SEVERITY] - alarm_date = entry[INDEX_ALARM_DATE] - - entry_date = alarm_date.replace( - " ", "T" - ) # making time format of alarm the same - if self.opts.start <= entry_date and entry_date <= self.opts.end: - # if the alarm is not in the user specified list of alarm or entity ids - for id in alarm_ids: - if id in alarm_id: - break - else: - if len(alarm_ids) > 0: - continue - - for entity in entity_ids: - if entity in entity_id: - break - else: - if len(entity_ids) > 0: - continue - - try: - if action == "log": - log_info = log_data[ - f"{alarm_id} {entity_id} {severity}" - ] - log_info["count"] += 1 - log_info["dates"].append(alarm_date) - else: - alarm_info = alarm_data[ - f"{alarm_id} {entity_id} {severity}" - ] - alarm_info["count"] += 1 - alarm_info["dates"].append(f"{alarm_date} {action}") - except KeyError: - if entry[6] != "log": - alarm_data[f"{alarm_id} {entity_id} {severity}"] = { - "count": 1, - "dates": [f"{alarm_date} {action}"], - } - else: - log_data[f"{alarm_id} {entity_id} {severity}"] = { - "count": 1, - "dates": [alarm_date], - } - - for _, v in alarm_data.items(): - v["dates"] = sorted(v["dates"]) - - for _, v in log_data.items(): - v["dates"] = sorted(v["dates"]) - - return alarm_data, log_data - - def substring(self, substr, files): - """Substring algorithm - Looks for substrings 
within files - - Parameters: - substr (string list): List of substrings to look for - files (string list): List of absolute filepaths to search in - - Errors: - FileNotFoundError - """ - CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file - CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file - - data = [] - for file in files: - if not os.path.exists(file): - raise FileNotFoundError(f"File not found: {file}") - cont = True - # Searching through file - command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file}""" - status = self._continue(file) - - if ( - status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD - ): # continue with current file - if status == CONTINUE_CURRENT: - cont = False - self._evaluate_substring(data, command) - - # Searching through rotated log files - n = 1 - while os.path.exists(f"{file}.{n}.gz") and cont: - command = f"""zgrep -E "{'|'.join(s for s in substr)}" {file}.{n}.gz""" - status = self._continue(f"{file}.{n}.gz", compressed=True) - - if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD: - if status == CONTINUE_CURRENT: - cont = False - self._evaluate_substring(data, command) - - n += 1 - - return sorted(data) - - def system_info(self): - """System info algorithm - Presents basic information about the system - """ - data = [] - with open( - os.path.join( - self.active_controller_directory, "etc", "platform", "platform.conf" - ) - ) as file: - for line in file: - if "system_mode" in line: - data.append( - f"System Mode: {re.match('^system_mode=(.*)', line).group(1)}" - ) - elif "system_type" in line: - data.append( - f"System Type: {re.match('^system_type=(.*)', line).group(1)}" - ) - elif "distributed_cloud_role" in line: - data.append( - f"Distributed cloud role: {re.match('^distributed_cloud_role=(.*)', line).group(1)}" - ) - elif "sw_version" in line: - data.append( - f"SW Version: {re.match('^sw_version=(.*)', line).group(1)}" - ) - with open( - os.path.join(self.active_controller_directory, "etc", "build.info") - ) as file: - for line in file: - if "BUILD_TYPE" in line: - data.append( - f"Build Type: {re.match('^BUILD_TYPE=(.*)', line).group(1)}" - ) - elif re.match("^OS=(.*)", line): - data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}") - - return data - - def swact(self): - """Swact activity algorithm - Presents all swacting activity in the system - """ - data = [] - sm_files = [] - sm_customer_files = [] - swact_start = None - swact_in_progress = False - swact_end = None - - for _, folder in self.hosts["controllers"].items(): - sm_path = os.path.join(folder, "var", "log", "sm.log") - sm_files.append(sm_path) - - sm_substrings = ["Swact has started,", "Swact update"] - data = self.substring(sm_substrings, sm_files) - - for i, line in enumerate(data): - if "Swact has started," in line and not swact_in_progress: - swact_in_progress = True - swact_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - elif "Swact update" in line and swact_in_progress: - swact_in_progress = False - swact_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - line += f" SWACT TOOK {swact_end - swact_start} \n" - data[i] = line - - for _, folder in self.hosts["controllers"].items(): - sm_customer_path = os.path.join(folder, "var", "log", "sm-customer.log") - sm_customer_files.append(sm_customer_path) - - sm_customer_substrings = ["swact"] - data += self.substring(sm_customer_substrings, sm_customer_files) - - return sorted(data) - - def puppet(self): - """Puppet error algorithm - Presents log 
errors from puppet logs - """ - data = [] - for _, folder in self.hosts["controllers"].items(): - puppet_folder = os.path.join(folder, "var", "log", "puppet") - command = f"grep -rh 'Error:' {puppet_folder}" - self._evaluate_substring(data, command) - return sorted(data) - - def process_failure(self): - """Process failure algorithm - Presents log errors from pmond - """ - data = [] - files = [] - for host_type in self.hosts.keys(): - for _, folder in self.hosts[host_type].items(): - pmond = os.path.join(folder, "var", "log", "pmond.log") - files.append(pmond) - data = self.substring(["Error :"], files) - return data - - def audit(self, start, end, audit_log_path): - """Counts audit events in dcmanager within a specified date range - - Parameters: - start (string) : start date in YYYY-MM-DD HH:MM:SS format - end (string) : end date in YYYY-MM-DD HH:MM:SS format - audit_log_path (string) : absolute path of augit log file - """ - if not shutil.which("lnav"): - raise ValueError("Lnav program not found") - - SECONDS_PER_HOUR = 3600 - fmt = "%Y-%m-%d %H:%M:%S" - - d1 = datetime.strptime(start, fmt) - d2 = datetime.strptime(end, fmt) - seconds = (d2 - d1).total_seconds() - - log_texts = [ - "Triggered subcloud audit%", - "Trigger patch audit%", - "Trigger load audit%", - "Triggered firmware audit%", - "Triggered kubernetes audit%", - # Counts sum of audits from all subclouds - ] - INDEX_MIDDLE_WORD = 1 - data = ["These rates and totals represent the sum of audits from all subclouds"] - - def command(text): - - return ( - f'lnav -R -n -c ";SELECT count(log_body) AS {text.split(" ")[INDEX_MIDDLE_WORD]}_total' - f' from openstack_log WHERE (log_time > \\"{start}\\" AND not log_time > \\"{end}\\")' - f' AND log_body like \\"{text}\\"" "{audit_log_path}"' - ) - - for text in log_texts: - p = subprocess.Popen(command(text), shell=True, stdout=subprocess.PIPE) - for line in p.stdout: - line = line.decode("utf-8").strip() - if line.isnumeric(): - data.append( - f"rate {round((int(line)/seconds * SECONDS_PER_HOUR), 3)} per hour. 
total: {line}" - ) - else: - data.append(line) - return data - - # ----------------------------------- - - def _continue(self, file, compressed=False): - CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file - CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file - CONTINUE_OLD = 2 # don't analyze current file, continue to older files - - # check date of first log event and compare with provided start end dates - first = "" - - if not compressed: - with open(file) as f: - line = f.readline() - first = line[0:19] - else: - with gzip.open(file, "rb") as f: - line = f.readline().decode("utf-8") - first = line[0:19] - try: - datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - first = line[0:19] - except ValueError: - return CONTINUE_CURRENT_OLD - - if first < self.opts.start: - return CONTINUE_CURRENT - elif first < self.opts.end and first > self.opts.start: - return CONTINUE_CURRENT_OLD - elif first > self.opts.end: - return CONTINUE_OLD - - def _evaluate_substring(self, data, command): - p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) - for line in p.stdout: - line = line.decode("utf-8") - dates = [line[0:19], line[2:21]] # different date locations for log events - for date in dates: - try: - datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") - if date > self.opts.start and date < self.opts.end: - if line[0] == "|": # sm-customer.log edge case - line = line.replace("|", "").strip() - line = re.sub("\s+", " ", line) - data.append(line) - break - except ValueError: - if date == dates[-1]: - data.append(line) - - def _extract_subfunction(self, host_info_path): - GROUP_ONE = 1 - with open(host_info_path) as file: - for line in file: - hostname_match = re.match("^hostname => (.+)", line) - subfunction_match = re.match("^subfunction => (.+)", line) - if subfunction_match: - subfunction = subfunction_match.group(GROUP_ONE) - if hostname_match: - hostname = hostname_match.group(GROUP_ONE) - return hostname, subfunction - - def _create_output_file(self, filename, directory, events): - with open(os.path.join(directory, filename), "w") as file: - for i in events: - file.write(i + "\n") - logger.info("output at " + os.path.join(directory, filename)) diff --git a/tools/collector/report/report.py b/tools/collector/report/report.py deleted file mode 100755 index 520771cf..00000000 --- a/tools/collector/report/report.py +++ /dev/null @@ -1,257 +0,0 @@ -#!/usr/bin/env python3 -######################################################################## -# -# Copyright (c) 2022 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# -######################################################################## -# -# Description: The Report tool is used to gather relevant log events -# and information about the system from a collect bundle. -# -# The report tool allows user created plugins which decides relevance -# for log events. Plugins contain an algorithm label which instructs the -# tool what information to search and how to search for it. -# -# The report tool requires the collect bundle and host tarballs to be -# untarred. -# -# The report tool reads user plugins from a plugins directory in the -# top level of the collect bundle, and outputs files containing -# relevant logs to a report directory in the top level as well. -# -# Typical Usage: -# command line functionality -# ------------------------------- ---------------------------------- -# > report.py - Run all plugins in directory -# > report.py [plugin ...] 
- Run only specified plugins -# > report.py [labels] - Run algorithm with labels -# > report.py --help - help message -# > report.py --help - algorithm specific help -# -# See --help output for a complete list of full and abbreviated -# command line options and examples of plugins. -# -# Refer to README file for more usage and output examples -####################################################################### - -import argparse -from cmath import log -from datetime import datetime -from datetime import timezone -import logging -import os -import time - -from execution_engine import ExecutionEngine -from plugin import Plugin - - -now = datetime.now(timezone.utc) -base_dir = os.path.realpath(__file__) -default_path = os.path.join(os.path.dirname(base_dir), "..", "..") -plugins = [] - -parser = argparse.ArgumentParser( - description="Log Event Reporter", - epilog="Place plugins in 'plugins' directory at top level of collect bundle. Output files will be placed in 'report' directory." - "\nThis tool will create a report.log file along with other output files", -) -parser.add_argument( - "-s", - "--start", - default="20000101", - help="Specify a start date in YYYYMMDD format for analysis (default:20000101)", -) -parser.add_argument( - "-e", - "--end", - default=datetime.strftime(now, "%Y%m%d"), - help="Specify an end date in YYYYMMDD format for analysis (default: current date)", -) -parser.add_argument( - "-p", - "--plugin", - default=None, - nargs="*", - help="Specify what plugins to run (default: runs every plugin in plugins folder)", -) -parser.add_argument( - "-d", - "--directory", - default=default_path, - help="Specify top level of collect bundle to analyze (default: two levels above current location)", -) -subparsers = parser.add_subparsers(help="algorithms", dest="algorithm") - -# substring algorithm arguments -parser_substring = subparsers.add_parser( - "substring", - formatter_class=argparse.RawTextHelpFormatter, - help="""Searches through specified files for lines containing specified substring. - There will be an output file for each host of the host type specified.""", - epilog="Plugin file example:\n" - " algorithm=substring\n" - " files=mtcAgent.log, sm.log\n" - " hosts=controllers, workers\n" - " substring=Swact in progress\n" - " substring=Swact update", -) -substring_required = parser_substring.add_argument_group("required arguments") -substring_required.add_argument( - "--files", - required=True, - nargs="+", - help="Files to perform substring analysis on (required)", -) -substring_required.add_argument( - "--substring", nargs="+", required=True, help="Substrings to search for (required)" -) -substring_required.add_argument( - "--hosts", - choices=["controllers", "workers", "storages", "all"], - required=True, - nargs="+", - help="Host types to perform analysis on (required)", -) - - -# alarm algorithm arguments -parser_alarm = subparsers.add_parser( - "alarm", - formatter_class=argparse.RawTextHelpFormatter, - help="Searches through fm.db.sql.txt for alarms and logs. 
There are 2 output files: 'alarm', and 'log'", - epilog="Plugin file example:\n" - " algorithm=alarm\n" - " alarm_ids=400.005,200.004\n" - " entity_ids= host=controller-0,host=controller-1\n", -) -parser_alarm.add_argument( - "--alarm_ids", - nargs="+", - required=False, - default=[], - help="Alarm id patterns to search for (not required)", -) -parser_alarm.add_argument( - "--entity_ids", - nargs="+", - required=False, - default=[], - help="Entity id patterns to search for (not required)", -) - -# system info algorithm -parser_system_info = subparsers.add_parser( - "system_info", - formatter_class=argparse.RawTextHelpFormatter, - help="Presents information about the system", - epilog="Plugin file example:\n" " algorithm=system_info\n", -) - -# swact activity algorithm -parser_swact = subparsers.add_parser( - "swact", - formatter_class=argparse.RawTextHelpFormatter, - help="Presents system swacting activity", - epilog="Plugin file example:\n" " algorithm=swact\n", -) - -# puppet errors algorithm -parser_puppet = subparsers.add_parser( - "puppet", - formatter_class=argparse.RawTextHelpFormatter, - help="Presents any puppet errors", - epilog="Plugin file example:\n" " algorithm=puppet\n", -) - -# process failure algorithm -parser_process_failure = subparsers.add_parser( - "process_failure", - formatter_class=argparse.RawTextHelpFormatter, - help="Presents any process failures from pmond.log", - epilog="Plugin file example:\n" " algorithm=process_failure\n", -) - -# audit algorithm -parser_audit = subparsers.add_parser( - "audit", - formatter_class=argparse.RawTextHelpFormatter, - help="Presents information about audit events in dcmanager.\n" - "The rates and totals represents the sum of audits on all subclouds ", - epilog="Plugin file example:\n" - " algorithm=audit\n" - " start=2022-06-01 10:00:00\n" - " end=2022-06-01 04:00:00\n", -) -parser_audit_required = parser_audit.add_argument_group("required arguments") -parser_audit_required.add_argument("--start", required=True) -parser_audit_required.add_argument( - "--end", - required=True, -) - - -args = parser.parse_args() -args.start = datetime.strptime(args.start, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S") -args.end = datetime.strptime(args.end, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S") - -output_directory = os.path.join( - args.directory, "report", "output", now.strftime("%Y%m%d.%H%M%S") -) - -# creating report log -os.makedirs(output_directory) -open(os.path.join(output_directory, "report.log"), "w").close() - -# setting up logger -formatter = logging.Formatter("%(message)s") -logger = logging.getLogger() - -logging.basicConfig( - filename=os.path.join(output_directory, "report.log"), - level=logging.INFO, - format="%(asctime)s %(levelname)s: %(message)s", - datefmt="%Y-%m-%dT%H:%M:%S", -) -logging.Formatter.converter = time.gmtime - -ch = logging.StreamHandler() -ch.setLevel(logging.INFO) -ch.setFormatter(formatter) - -logger.addHandler(ch) - -try: - engine = ExecutionEngine(args) -except ValueError as e: - logger.error(str(e)) - -if args.algorithm: - plugins.append(Plugin(opts=vars(args))) -else: - if args.plugin: - for p in args.plugin: - path = os.path.join(args.directory, "plugins", p) - if os.path.exists(path): - try: - plugins.append(Plugin(path)) - except Exception as e: - logger.error(str(e)) - - else: - logger.warning(f"{p} plugin does not exist") - else: - path = os.path.join(args.directory, "plugins") - if not os.path.exists(path): - os.mkdir(path) - logger.error("Plugins folder is empty") - else: - for file in os.listdir(path): - 
try: - plugins.append(Plugin(os.path.join(path, file))) - except Exception as e: - logger.error(str(e)) - -engine.execute(plugins, output_directory)
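
As the plugin.py validation above shows, algorithms such as daemon_failures,
heartbeat_loss, maintenance_errors, process_failures and state_changes reject
a hosts label with HOSTS_ERR, which is why the shipped plugin files for them
are a single line each, for example:

> cat report/plugins/state_changes

algorithm=state_changes

Adding a line such as hosts=controllers to one of these files would make the
Plugin constructor raise ValueError when the plugin is loaded.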
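The subcommands registered in the new report.py can also be run directly,
without writing a plugin file. A hypothetical invocation of the substring
algorithm against an already-extracted bundle (the bundle path here is
illustrative only):

> report/tool/report.py -d /scratch/SELECT_NODES_20220527.193605 \
      substring --files var/log/mtcAgent.log var/log/sm.log \
      --hosts controllers --substring "operation failed"

Because args.algorithm is set in this mode, report.py wraps the command line
options in a single Plugin(opts=vars(args)) and runs only that algorithm.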
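When report.py is pointed at a directory holding a still-tarred collect
bundle, the scandir loop above lists the tarball to learn the bundle
directory name, extracts it only if that directory is absent, and then
descends into it. A rough shell equivalent, using a hypothetical tarball
name:

> tar tf SELECT_NODES_20220527.193605.tar | head -1
> tar xfC SELECT_NODES_20220527.193605.tar .

The first command prints the bundle directory as the first member of the
archive; the second, which the tool skips when that directory already
exists, extracts the bundle beside the tarball. The loop breaks after the
first .tar file it finds; the TODO in the code covers prompting the user
when a directory contains more than one bundle.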