Eric MacDonald a7c75ad8de Fix utf-8 decode traceback in the report tool strings algorithm
This update adds a try/except handler for UnicodeDecodeError to the
utf-8 decode case in the _evaluate_substring function of the
strings algorithm.

Also added a dropped logs file that records the logs that fail utf-8
conversion. The dropped logs file is stored as 'dropped_logs' in the
report analysis plugins output folder.
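
The guarded decode follows this pattern (a minimal sketch; the helper
name decode_line and its arguments are illustrative, not the actual
_evaluate_substring signature):

    def decode_line(raw_line, dropped_logs_file):
        """Return the utf-8 decoded line, or None if it fails to decode."""
        try:
            return raw_line.decode("utf-8")
        except UnicodeDecodeError:
            # non utf-8 compliant log: record it and keep going
            with open(dropped_logs_file, "a") as dropped:
                dropped.write(repr(raw_line) + "\n")
            return None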

Test Plan:

PASS: Verify on collect bundle that exhibited the issue.
PASS: Verify dropped logs are recorded in new
      report_analysis/plugins/dropped_logs file.
PASS: Verify no newly introduced pep8 errors or warnings.

Closes-Bug: 2067952
Change-Id: I82cecd6bb537a459255467f6ced7e742c5771d97
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
2024-07-04 17:00:39 +00:00

########################################################################
#
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
########################################################################
#
# This file contains the ExecutionEngine class.
# The ExecutionEngine class contains all the available algorithms.
#
# The ExecutionEngine class runs plugins and gathers relevant logs and
# information, creating output files in the report directory.
#
# Futures:
#
# 1. Improve how report determines the active controller.
# Specifically what controller was active at the time of detected
# failure rather than what controller was active when collect ran.
#
# 2. Consider running plugins in parallel threads
#
########################################################################
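
# Typical usage (an illustrative sketch, not code from this module;
# the opts and plugins objects come from the report tool's command
# line parser and plugin loader):
#
#   engine = ExecutionEngine(opts, input_dir, output_dir)
#   engine.execute(plugins, output_dir)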

import logging
import mmap
import os
import re
import subprocess
import sys
import tarfile

import algorithms
from correlator import Correlator
from plugin_algs.alarm import alarm
from plugin_algs.audit import audit
from plugin_algs.daemon_failures import daemon_failures
from plugin_algs.heartbeat_loss import heartbeat_loss
from plugin_algs.maintenance_errors import maintenance_errors
from plugin_algs.process_failures import process_failures
from plugin_algs.puppet_errors import puppet_errors
from plugin_algs.state_changes import state_changes
from plugin_algs.substring import substring
from plugin_algs.swact_activity import swact_activity
from plugin_algs.system_info import system_info

# don't generate __pycache__ dir and files
sys.dont_write_bytecode = True

logger = logging.getLogger(__name__)

# regex used to get the hostname from the host dir name
# eg: chops '_20221201.213332' off of controller-0_20221201.213332
regex_chop_bundle_date = r"_\d{8}\.\d{6}"
regex_get_bundle_date = r".*_\d{8}\.\d{6}$"


class ExecutionEngine:
    def __init__(self, opts, input_dir, output_dir):
        """Constructor for the ExecutionEngine class

        Parameters:
            opts (dictionary): Options from command line
            input_dir (string): directory containing the host bundles
            output_dir (string): directory to put output files
        """
        # don't generate __pycache__ dir and files
        sys.dont_write_bytecode = True

        self.opts = opts
        self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
        self.active_controller_directory = None
        self.active_controller_hostname = None
        self.host_dirs = []
        self.hostnames = []

        if not os.path.isdir(input_dir):
            logger.error("Error: Invalid input directory: %s", input_dir)
            sys.exit("... exiting")
        self.input_dir = input_dir

        if not os.path.isdir(output_dir):
            logger.error("Error: Invalid output directory: %s", output_dir)
            sys.exit("... exiting")
        self.output_dir = output_dir

        # Uncompresses host tar files if not already done
        with open(os.path.join(output_dir, "untar.log"), "a") as logfile:
            # Now extract the tarballs
            for obj in (os.scandir(self.input_dir)):
                # files to ignore
                if obj.name == "report_tool.tgz":
                    continue

                info = os.path.splitext(obj.name)
                if (obj.is_file() and tarfile.is_tarfile(obj.path) and not
                        os.path.isdir(os.path.join(self.input_dir, info[0]))):
                    try:
                        logger.info("extracting : %s", obj.path)
                        subprocess.run(["tar", "xzfC", obj.path,
                                        self.input_dir],
                                       stderr=logfile, check=True)
                    except subprocess.CalledProcessError as e:
                        logger.error(e)

        # Determine the active controller and load system info from it.
        for folder in (f.path for f in os.scandir(self.input_dir)):
            basename = os.path.basename(folder)

            # Ignore all directories that are not a properly dated
            # collect file
            if not re.match(regex_get_bundle_date, basename):
                continue

            # skip over files (the tarballs)
            if not os.path.isdir(folder):
                continue

            logger.debug("base folder: %s", os.path.basename(folder))

            # Get the hostname from the host folder
            hostname = re.sub(regex_chop_bundle_date, "", basename)
            self.hostnames.append(hostname)

            host_dir = folder
            self.host_dirs.append(host_dir)
            logger.debug("Host Dirs: %s", self.host_dirs)

            # Build up a list of hosts; save the host folder path
            # based on nodetype.
            hostinfo_path = os.path.join(host_dir, "var/extra/host.info")
            if os.path.exists(hostinfo_path):
                hostname, subfunction = self._extract_subfunction(
                    hostinfo_path)
                if "controller" in subfunction:
                    self.hosts["controllers"][hostname] = folder
                elif "worker" in subfunction:
                    self.hosts["workers"][hostname] = folder
                elif "storage" in subfunction:
                    self.hosts["storages"][hostname] = folder

            # skip non controller hosts since they cannot be active
            if hostname[0:10] != "controller":
                continue

            logger.debug("searching for active controller: %s" % hostname)

            if os.path.isdir(host_dir):
                logger.debug("... checking %s" % hostname)
                extra_path = os.path.join(host_dir, "var/extra")

                # don't analyse a directory that doesn't contain
                # a 'var/extra' dir.
                if not os.path.exists(extra_path):
                    logger.warning("%s is missing var/extra" % hostname)
                    continue

                database_path = os.path.join(host_dir, "var/extra/database")
                if os.path.exists(database_path):
                    if os.listdir(database_path):
                        logger.info("Active Ctrl: %s" % hostname)
                        self.active_controller_directory = folder
                        self.active_controller_hostname = hostname

        if not self.host_dirs:
            logger.error("Error: No host bundles found in %s" % input_dir)
            files = []
            for folder in (f.path for f in os.scandir(input_dir)):
                files.append(os.path.basename(folder))
            if files:
                logger.error("... content: %s" % files)
            sys.exit("")

        if not self.active_controller_directory:
            logger.warning("Active Ctrl: NOT FOUND")

    def execute(self, plugins, output_dir):
        """Run a list of plugins

        Parameters:
            plugins (Plugin list): List of plugins to run
            output_dir (string): directory to put output files

        Errors:
            FileNotFoundError
        """
        plugin_output_dir = os.path.join(output_dir, "plugins")
        os.makedirs(plugin_output_dir, exist_ok=True)

        # track non utf-8 compliant logs in a dropped_logs file.
        dropped_logs_file = os.path.join(plugin_output_dir, "dropped_logs")

        if self.opts.verbose:
            logger.info("Output files for plugins can be found at " +
                        os.path.abspath(plugin_output_dir))
        if self.opts.debug:
            logger.debug("Processing Plugins for hosts: %s", self.host_dirs)

        for plugin in plugins:
            processing = "Processing plugin: " + os.path.basename(plugin.file)
            hosts = {}
            if (
                plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1
            ):  # if host list is given
                if self.opts.debug:
                    logger.debug(processing)

                for h in plugin.state["hosts"]:
                    if h == "all":
                        hosts.update(self.hosts["workers"])
                        hosts.update(self.hosts["storages"])
                        hosts.update(self.hosts["controllers"])
                    else:
                        hosts.update(self.hosts[h])

                for hostname, folderpath in hosts.items():
                    events = []
                    if plugin.state["algorithm"] == algorithms.SUBSTRING:
                        events = substring(
                            self.opts.start, self.opts.end,
                            plugin.state["substring"],
                            [
                                os.path.join(folderpath, file)
                                for file in plugin.state["files"]
                            ],
                            plugin.state["exclude"],
                            dropped_logs=dropped_logs_file,
                        )

                        # creating output file
                        output_file = os.path.join(
                            plugin_output_dir,
                            f"substring_{hostname}",
                        )
                        if self.opts.verbose:
                            logger.info("... output at " +
                                        os.path.abspath(output_file))

                        if events:
                            with open(output_file, "w") as file:
                                file.write(
                                    f"Date range: {self.opts.start} until "
                                    f"{self.opts.end}\n"
                                )
                                file.write(
                                    f"substrings: "
                                    f"{' '.join(plugin.state['substring'])}\n"
                                )
                                for line in events:
                                    if line[-1] == "\n":
                                        file.write(line)
                                    else:
                                        file.write(line + "\n")
            else:
                if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
                    # Get system info of the active controller first
                    # and then put the system info of each host in the
                    # system info output folder.
                    system_info_output = os.path.join(plugin_output_dir,
                                                      "system_info")
                    if os.path.exists(system_info_output):
                        os.remove(system_info_output)

                    hostname = None
                    host_dir = None
                    if self.active_controller_directory is None:
                        hostname = re.sub(
                            regex_chop_bundle_date, "",
                            os.path.basename(self.host_dirs[0]))
                        host_dir = self.host_dirs[0]
                    else:
                        hostname = self.active_controller_hostname
                        host_dir = self.active_controller_directory
                    system_info(hostname, host_dir,
                                system_info_output,
                                self.hosts, True)
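                    # start_index relies on bool -> int conversion:
                    # when no active controller was found, host_dirs[0]
                    # was already reported above, so start at index 1
                    # (True); otherwise start at index 0 (False).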
                    start_index = self.active_controller_directory is None
                    for host_dir in self.host_dirs[start_index:]:
                        if host_dir != self.active_controller_directory:
                            hostname = re.sub(regex_chop_bundle_date, "",
                                              os.path.basename(host_dir))
                            system_info(hostname,
                                        host_dir,
                                        system_info_output,
                                        None,
                                        False)
                elif plugin.state["algorithm"] == algorithms.AUDIT:
                    hosts = {}
                    hosts.update(self.hosts["workers"])
                    hosts.update(self.hosts["storages"])
                    hosts.update(self.hosts["controllers"])

                    for hostname, folderpath in hosts.items():
                        self._create_output_file(
                            f"{hostname}_audit",
                            plugin_output_dir,
                            audit(
                                plugin.state["start"],
                                plugin.state["end"],
                                os.path.join(
                                    folderpath, "var", "log", "dcmanager",
                                    "audit.log"
                                ),
                            ),
                            processing,
                        )
                elif plugin.state["algorithm"] == algorithms.SWACT_ACTIVITY:
                    self._create_output_file(
                        "swact_activity", plugin_output_dir,
                        swact_activity(self.hosts, self.opts.start,
                                       self.opts.end,
                                       dropped_logs=dropped_logs_file),
                        processing
                    )
                elif plugin.state["algorithm"] == algorithms.PUPPET_ERRORS:
                    self._create_output_file(
                        "puppet_errors", plugin_output_dir,
                        puppet_errors(self.hosts, self.opts.start,
                                      self.opts.end,
                                      dropped_logs=dropped_logs_file),
                        processing
                    )
                elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURES:
                    self._create_output_file(
                        "process_failures", plugin_output_dir,
                        process_failures(self.hosts, self.opts.start,
                                         self.opts.end,
                                         dropped_logs=dropped_logs_file),
                        processing
                    )
                elif plugin.state["algorithm"] == algorithms.ALARM:
                    for host_dir in self.host_dirs:
                        alarms, logs = alarm(
                            host_dir,
                            self.opts.start, self.opts.end,
                            plugin.state["alarm_exclude"],
                            plugin.state["entity_exclude"]
                        )
                        if alarms is None and logs is None:
                            continue

                        alarm_output = os.path.join(plugin_output_dir,
                                                    "alarm")
                        log_output = os.path.join(plugin_output_dir, "log")

                        # creating output alarm file
                        with open(alarm_output, "w") as file:
                            for k, v in alarms.items():
                                file.write(f"{k}:\n")
                                for date in v["dates"]:
                                    file.write(f" {date}\n")

                        # creating output log file
                        with open(log_output, "w") as file:
                            for k, v in logs.items():
                                file.write(f"{k}: {v['count']}\n")
                            file.write("\n")
                            for k, v in logs.items():
                                file.write(f"{k}:\n")
                                for date in v["dates"]:
                                    file.write(f" {date}\n")

                        if self.opts.verbose:
                            logger.info(processing)
                        elif self.opts.debug:
                            logger.debug(processing + ", output at " +
                                         os.path.abspath(alarm_output) +
                                         ", " + os.path.abspath(log_output))
                elif plugin.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
                    self._create_output_file(
                        "heartbeat_loss", plugin_output_dir,
                        heartbeat_loss(self.hosts, self.opts.start,
                                       self.opts.end,
                                       dropped_logs=dropped_logs_file),
                        processing
                    )
                elif plugin.state["algorithm"] == algorithms.MAINTENANCE_ERR:
                    self._create_output_file(
                        "maintenance_errors", plugin_output_dir,
                        maintenance_errors(self.hosts, self.opts.start,
                                           self.opts.end,
                                           plugin.state["exclude"],
                                           dropped_logs=dropped_logs_file),
                        processing
                    )
                elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES:
                    self._create_output_file(
                        "daemon_failures", plugin_output_dir,
                        daemon_failures(self.hosts, self.opts.start,
                                        self.opts.end,
                                        plugin.state["exclude"],
                                        dropped_logs=dropped_logs_file),
                        processing
                    )
                elif plugin.state["algorithm"] == algorithms.STATE_CHANGES:
                    self._create_output_file(
                        "state_changes", plugin_output_dir,
                        state_changes(self.hosts, self.opts.start,
                                      self.opts.end,
                                      dropped_logs=dropped_logs_file),
                        processing
                    )

        # Dump a summary of data found by the plugins
        if os.path.exists(plugin_output_dir):
            # Print a summary of the logs/data gathered by the plugins
            empty_files = ""
            logger.info("")
            logger.info("Plugin Results:")
            logger.info("")
            lines = []
            for fn in os.listdir(plugin_output_dir):
                filename = os.path.join(plugin_output_dir, fn)
                with open(filename, "r+") as f:
                    # Show how much data is in each plugin's output file
                    if os.path.isfile(filename) and os.path.getsize(filename):
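                        # mmap lets us count lines without reading the
                        # whole (possibly large) output file into memory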
                        buf = mmap.mmap(f.fileno(), 0)
                        entries = 0
                        readline = buf.readline
                        while readline():
                            entries += 1
                        lines.append("%5d %s" % (entries, filename))
                    else:
                        empty_files += fn + " "

            # Sort the lines based on the numeric value
            sorted_lines = sorted(lines, key=lambda x: int(x.split()[0]),
                                  reverse=True)
            if sorted_lines:
                for line in sorted_lines:
                    logger.info(line)
            else:
                sys.exit("no plugin data found ; "
                         "nothing to correlate ... exiting")

            if empty_files:
                logger.info("")
                logger.info("... nothing found by plugins: %s" % empty_files)
        else:
            logger.error("Error: Plugin output dir missing: %s" %
                         plugin_output_dir)
            sys.exit("... exiting")

        # Running the correlator and printing the output from it
        self.run_correlator(output_dir, plugin_output_dir)

    # -----------------------------------
    def run_correlator(self, output_dir, plugin_output_dir):
        """Runs the correlator and prints the results differently based on
        whether the tool was run with or without the verbose option

        Parameters:
            output_dir (string) : directory to place output files from
                                  correlator
            plugin_output_dir (string) : directory with output files from
                                         plugins
        """
        correlator = Correlator(plugin_output_dir)
        failures, events, alarms, state_changes = correlator.run(
            self.opts.hostname)
        failures_len, events_len = len(failures), len(events)
        alarms_len, state_changes_len = len(alarms), len(state_changes)
        failures.append("\nTotal failures found: " + str(failures_len) + "\n")
        events.append("\nTotal events found: " + str(events_len) + "\n")
        alarms.append("\nTotal alarms found: " + str(alarms_len) + "\n")
        state_changes.append("\nTotal state changes found: " +
                             str(state_changes_len) + "\n")

        logger.info("")
        logger.info("Correlated Results:")
        logger.info("")
        self._create_output_file("failures", output_dir,
                                 failures, "")
        self._create_output_file("events", output_dir,
                                 events, "")
        self._create_output_file("alarms", output_dir,
                                 alarms, "")
        self._create_output_file("state_changes", output_dir,
                                 state_changes, "")
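
        # Find the widest count so the summary lines below can align
        # their output paths in a single column.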
        width = 0
        for sl in [events_len, alarms_len, state_changes_len, failures_len]:
            if len(str(sl)) > width:
                width = len(str(sl))

        if not self.opts.verbose:
            logger.info("Events : " + str(events_len) +
                        " " * (width - len(str(events_len))) +
                        " " + output_dir + "/events")
            logger.info("Alarms : " + str(alarms_len) +
                        " " * (width - len(str(alarms_len))) +
                        " " + output_dir + "/alarms")
            logger.info("State Changes: " + str(state_changes_len) +
                        " " * (width - len(str(state_changes_len))) +
                        " " + output_dir + "/state_changes")
            logger.info("Failures : " + str(failures_len) +
                        " " * (width - len(str(failures_len))) +
                        " " + output_dir + "/failures")
            for f in failures[:-1]:
                if "Uncontrolled swact" in f:
                    logger.info(f[0:19] + " " +
                                re.findall("active controller:? (.+)\n",
                                           f)[0] + " uncontrolled swact")
                elif "failure on" in f:
                    host = re.findall(r"failure on ([^\s]+) ", f)[0]
                    logger.info(f[0:19] + " " + host + " " +
                                re.findall("^(.+) failure on ",
                                           f[43:])[0].lower() + " failure")
                else:
                    logger.info(f[:-1])
        else:
            # Dictionary to keep track of the number of times events
            # happen on each host
            events_summ = {}
            for e in events[:-1]:
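                # e[20:-1] drops the timestamp prefix and trailing
                # newline; the split drops any parenthesized detail so
                # identical events group together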
                k = e[20:-1].split(" (", 1)[0]
                if not events_summ.get(k):
                    events_summ[k] = 1
                else:
                    events_summ[k] += 1

            logger.info("\nEvents: " + str(events_len))
            for k, v in sorted(events_summ.items()):
                logger.info(k + ": " + str(v) + " time(s)")

            logger.info("\nAlarms: " + str(alarms_len))
            logger.info("The full list of alarms can be found at " +
                        os.path.abspath(output_dir) +
                        "/alarms")

            # Dictionary to keep track of the number of times state
            # changes happen on each host
            state_changes_summ = {}
            for s in state_changes[:-1]:
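                # s[20:-1] drops the timestamp prefix and trailing
                # newline; 'enabled' states are truncated at the word
                # 'enabled' so their sub-states group together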
                k = s[20:-1]
                if "enabled" in k:
                    k = k.split("enabled", 1)[0] + "enabled"
                if not state_changes_summ.get(k):
                    state_changes_summ[k] = 1
                else:
                    state_changes_summ[k] += 1

            logger.info("\nState Changes: " + str(state_changes_len))
            for k, v in sorted(state_changes_summ.items()):
                logger.info(k + ": " + str(v) + " time(s)")

            logger.info("\nFailures : " + str(failures_len))
            for f in failures[:-1]:
                logger.info(f[:-1])

    def _extract_subfunction(self, host_info_path):
        GROUP_ONE = 1
        with open(host_info_path) as file:
            for line in file:
                hostname_match = re.match(
                    r"\s*hostname =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
                subfunction_match = re.match(
                    r"\s*subfunction =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
                if subfunction_match:
                    subfunction = subfunction_match.group(GROUP_ONE)
                if hostname_match:
                    hostname = hostname_match.group(GROUP_ONE)
        return hostname, subfunction

    def _create_output_file(self, filename, directory, data, processing):
        with open(os.path.join(directory, filename), "w") as file:
            for i in data:
                if i[-1] == "\n":
                    file.write(i)
                else:
                    file.write(i + "\n")
        if self.opts.verbose:
            output = ("... output at " +
                      os.path.abspath(os.path.join(directory, filename)))
            if processing == "":
                logger.info(output)
            else:
                logger.info(processing + ", " + output)