
This update packages the report tool and plugin files into Debian and
bundles them with the collect tool so that they are added to 'collect'
tarballs at the time of creation. The report tool now allows users to
point it at any collect bundle and have it automatically extract the
tarball and the tar files for each host before running. This update
also adds heartbeat loss, maintenance errors, daemon failures, and
state changes plugin algorithms to the report tool. Some of the
existing algorithms were enhanced to extract more relevant log events.
The alarm algorithm was updated to only track when alarms switch
between set and clear. Lastly, a correlator function was implemented
in the tool that determines failures in collect bundles and their root
causes, and finds significant events and state changes in the log
files. The number of times alarms are set and cleared is also counted
and printed by the correlator. Results are written to output files and
summaries are printed on the command line. Users can also restrict the
correlator to finding events, alarm transitions, and state changes for
a specific host only.

Test Plan:
PASS: Verify tool is packaged in Debian
PASS: Verify tool is inserted into 'collect' tarballs
PASS: Verify tool extracts tarballs and host tarfiles
PASS: Verify tool can point at any collect bundle and run successfully
PASS: Verify substring plugin algorithm is working
PASS: Verify swact activity plugin algorithm is working
PASS: Verify heartbeat loss plugin algorithm is working
PASS: Verify maintenance errors plugin algorithm is working
PASS: Verify daemon failures plugin algorithm is working
PASS: Verify state changes plugin algorithm is working
PASS: Verify alarm plugin algorithm is working
PASS: Verify failures and correct root causes are found by correlator
PASS: Verify significant events are found by correlator
PASS: Verify alarm transitions are found by correlator
PASS: Verify state changes are found by correlator
PASS: Verify failures/events/alarms/state changes are printed into files
PASS: Verify tool prints correct info onto command line
PASS: Verify correlator only finds events for specified host
PASS: Verify correlator only finds alarm transitions for specified host
PASS: Verify correlator only finds state changes for specified host

Story: 2010166
Task: 46177

Signed-off-by: Angela Mao <Angela.Mao@windriver.com>
Change-Id: I02e28edf16b342abf2224cc98325d77ba0678055
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
########################################################################
#
# This file contains the ExecutionEngine class.
# The ExecutionEngine class contains all the available algorithms.
#
# The ExecutionEngine class runs plugins and gathers relevant logs and
# information, creating output files in the report directory.
#
# TODO: Modularize code and separate plugin algorithms into their own
# files
#
########################################################################

from datetime import datetime
import gzip
import logging
import os
import re
import shutil
import subprocess
import tarfile

import algorithms
from correlator import Correlator

logger = logging.getLogger(__name__)

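# A minimal usage sketch (illustrative only; this is not the tool's CLI).
# The engine only needs an options object exposing the attributes read
# below (directory, start, end, verbose, hostname) and a list of loaded
# Plugin objects:
#
#   from types import SimpleNamespace
#   opts = SimpleNamespace(directory="/tmp/bundle",
#                          start="2022-08-01T00:00:00",
#                          end="2022-08-02T00:00:00",
#                          verbose=False, hostname="all")
#   engine = ExecutionEngine(opts, "/tmp/report_output")
#   engine.execute(plugins, "/tmp/report_output")
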
class ExecutionEngine:
    def __init__(self, opts, output_directory):
        """Constructor for the ExecutionEngine class

        Parameters:
            opts (dictionary): Options from command line
        """
        self.opts = opts
        self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
        self.active_controller_directory = None

        # Uncompresses host tar files if not already done
        with open(os.path.join(output_directory, "untar.log"), "a") as logfile:
            for obj in (os.scandir(self.opts.directory)):
                info = os.path.splitext(obj.name)
                if (obj.is_file() and obj.name != "report_tool.tgz" and
                        tarfile.is_tarfile(obj.path) and not
                        os.path.isdir(os.path.join(self.opts.directory,
                                                   info[0]))):
                    try:
                        subprocess.run(["tar", "xzfC", obj.path,
                                        self.opts.directory],
                                       stderr=logfile, check=True)
                        subprocess.run(["echo", "uncompressed", obj.name],
                                       check=True)
                    except subprocess.CalledProcessError as e:
                        logger.error(e)

        for folder in (f.path for f in os.scandir(self.opts.directory)):
            database_path = os.path.join(folder, "var", "extra", "database")
            host_info_path = os.path.join(folder, "var", "extra", "host.info")

            # A non-empty database directory marks the host that was the
            # active controller when the bundle was collected
            if os.path.isdir(database_path) and os.listdir(database_path):
                self.active_controller_directory = folder

            if os.path.exists(host_info_path):
                hostname, subfunction = self._extract_subfunction(
                    host_info_path)
                if "controller" in subfunction:
                    self.hosts["controllers"][hostname] = folder
                elif "worker" in subfunction:
                    self.hosts["workers"][hostname] = folder
                elif "storage" in subfunction:
                    self.hosts["storages"][hostname] = folder

        if not self.active_controller_directory:
            raise ValueError("Active controller not found")

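    # Illustrative shape of the host map built by the constructor (the
    # folder names here are hypothetical; actual collect folder names
    # vary by bundle):
    #
    #   self.hosts = {
    #       "controllers": {"controller-0": "<bundle>/controller-0_..."},
    #       "workers": {"compute-0": "<bundle>/compute-0_..."},
    #       "storages": {},
    #   }
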
    def execute(self, plugins, output_directory):
        """Run a list of plugins

        Parameters:
            plugins (Plugin list): List of plugins to run

        Errors:
            FileNotFoundError
        """
        plugin_output_dir = os.path.join(output_directory, "plugins")
        os.makedirs(plugin_output_dir, exist_ok=True)

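        # Each plugin names its algorithm in plugin.state["algorithm"].
        # The host-aware substring algorithm runs once per selected host;
        # the other algorithms below run once per bundle.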
        for plugin in plugins:
            processing = "Processing plugin: " + os.path.basename(plugin.file)
            hosts = {}
            if (
                plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1
            ):  # if host list is given
                logger.info(
                    f"Processing plugin: {os.path.basename(plugin.file)}")

                for h in plugin.state["hosts"]:
                    if h == "all":
                        hosts.update(self.hosts["workers"])
                        hosts.update(self.hosts["storages"])
                        hosts.update(self.hosts["controllers"])
                    else:
                        hosts.update(self.hosts[h])

                for hostname, folderpath in hosts.items():

                    events = []
                    if plugin.state["algorithm"] == algorithms.SUBSTRING:
                        events = self.substring(
                            plugin.state["substring"],
                            [
                                os.path.join(folderpath, file)
                                for file in plugin.state["files"]
                            ],
                        )

                        # creating output file
                        output_file = os.path.join(
                            plugin_output_dir,
                            f"substring_{hostname}",
                        )
                        if self.opts.verbose:
                            logger.info("output at "
                                        + os.path.relpath(output_file))
                        with open(output_file, "w") as file:
                            file.write(
                                f"Date range: {self.opts.start} until "
                                f"{self.opts.end}\n"
                            )
                            file.write(
                                f"substrings: "
                                f"{' '.join(plugin.state['substring'])}\n"
                            )
                            for line in events:
                                if line[-1] == "\n":
                                    file.write(line)
                                else:
                                    file.write(line + "\n")
            else:
                if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
                    info = self.system_info()
                    system_info_output = os.path.join(plugin_output_dir,
                                                      "system_info")
                    with open(system_info_output, "w") as file:
                        for i in info:
                            file.write(i + "\n")

                        for k, v in self.hosts.items():
                            file.write(f"{k}: {','.join(v.keys())}\n")
                    if self.opts.verbose:
                        logger.info(processing + ", output at "
                                    + os.path.relpath(system_info_output))
                    else:
                        logger.info(processing)

                elif plugin.state["algorithm"] == algorithms.AUDIT:
                    hosts = {}
                    hosts.update(self.hosts["workers"])
                    hosts.update(self.hosts["storages"])
                    hosts.update(self.hosts["controllers"])

                    for hostname, folderpath in hosts.items():
                        self._create_output_file(
                            f"{hostname}_audit",
                            plugin_output_dir,
                            self.audit(
                                plugin.state["start"],
                                plugin.state["end"],
                                os.path.join(
                                    folderpath, "var", "log", "dcmanager",
                                    "audit.log"
                                ),
                            ),
                            processing,
                        )

                elif plugin.state["algorithm"] == algorithms.SWACT_ACTIVITY:
                    self._create_output_file(
                        "swact_activity", plugin_output_dir,
                        self.swact_activity(), processing
                    )

                elif plugin.state["algorithm"] == algorithms.PUPPET_ERRORS:
                    self._create_output_file(
                        "puppet_errors", plugin_output_dir,
                        self.puppet_errors(), processing
                    )

                elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURES:
                    self._create_output_file(
                        "process_failures", plugin_output_dir,
                        self.process_failures(), processing
                    )

                elif plugin.state["algorithm"] == algorithms.ALARM:
                    alarms, logs = self.alarm(
                        plugin.state["alarm_exclude"],
                        plugin.state["entity_exclude"]
                    )
                    alarm_output = os.path.join(plugin_output_dir, "alarm")
                    log_output = os.path.join(plugin_output_dir, "log")

                    # creating output alarm file
                    with open(alarm_output, "w") as file:
                        for k, v in alarms.items():
                            file.write(f"{k}:\n")
                            for date in v["dates"]:
                                file.write(f"  {date}\n")

                    # creating output log file
                    with open(log_output, "w") as file:
                        for k, v in logs.items():
                            file.write(f"{k}: {v['count']}\n")
                        file.write("\n")
                        for k, v in logs.items():
                            file.write(f"{k}:\n")
                            for date in v["dates"]:
                                file.write(f"  {date}\n")
                    if self.opts.verbose:
                        logger.info(processing + ", output at "
                                    + os.path.relpath(alarm_output)
                                    + ", " + os.path.relpath(log_output))
                    else:
                        logger.info(processing)
                elif plugin.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
                    self._create_output_file(
                        "heartbeat_loss", plugin_output_dir,
                        self.heartbeat_loss(), processing
                    )
                elif plugin.state["algorithm"] == algorithms.MAINTENANCE_ERR:
                    self._create_output_file(
                        "maintenance_errors", plugin_output_dir,
                        self.maintenance_errors(), processing
                    )
                elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES:
                    self._create_output_file(
                        "daemon_failures", plugin_output_dir,
                        self.daemon_failures(), processing
                    )
                elif plugin.state["algorithm"] == algorithms.STATE_CHANGES:
                    self._create_output_file(
                        "state_changes", plugin_output_dir,
                        self.state_changes(), processing
                    )

        if not self.opts.verbose:
            logger.info("Output files for plugins can be found at " +
                        os.path.relpath(plugin_output_dir))

        # Running the correlator and printing the output from it
        self.run_correlator(output_directory, plugin_output_dir)

    # Built-in algorithms ------------------------------
    def alarm(self, alarm_exclude=[], entity_exclude=[]):
        """Alarm algorithm
        Gathers list of alarms and customer logs

        Parameters:
            alarm_exclude (string list) : List of alarm id patterns to not
                                          search for
            entity_exclude (string list): List of entity id patterns to not
                                          search for
        """
        alarm_data = {}
        log_data = {}

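        # The event log is read from a PostgreSQL dump: records sit between
        # a "COPY ... event_log" header and a terminating "\." line, one
        # tab-separated record per line. Illustrative record (columns
        # abridged, values hypothetical):
        #
        #   ...\t400.005\tset\t...\thost=compute-0\t2022-08-02 01:02:03\tmajor\t...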
        with open(
            os.path.join(
                self.active_controller_directory,
                "var", "extra", "database", "fm.db.sql.txt"
            )
        ) as file:
            start = False
            for line in file:
                # start of event log
                if re.search(r"COPY (public\.)?event_log", line):
                    start = True
                elif start and line == "\\.\n":
                    break
                elif start:
                    entry = re.split(r"\t", line)

                    INDEX_ALARM_ID = 5
                    INDEX_ACTION = 6
                    INDEX_ENTITY_ID = 8
                    INDEX_ALARM_DATE = 9
                    INDEX_SEVERITY = 10

                    alarm_id = entry[INDEX_ALARM_ID]
                    entity_id = entry[INDEX_ENTITY_ID]
                    action = entry[INDEX_ACTION]
                    severity = entry[INDEX_SEVERITY]
                    alarm_date = entry[INDEX_ALARM_DATE]

                    entry_date = alarm_date.replace(
                        " ", "T"
                    )  # making time format of alarm the same
                    if (self.opts.start <= entry_date
                            and entry_date <= self.opts.end):
                        cont = True
                        # Skip alarms that match the user-specified
                        # alarm or entity id exclusion patterns
                        for id in alarm_exclude:
                            if id in alarm_id:
                                cont = False
                                break

                        for entity in entity_exclude:
                            if entity in entity_id:
                                cont = False
                                break

                        if not cont:
                            continue

                        try:
                            if action == "log":
                                log_info = log_data[
                                    f"{alarm_id} {entity_id} {severity}"
                                ]
                                log_info["count"] += 1
                                log_info["dates"].append(alarm_date)
                            else:
                                alarm_info = alarm_data[
                                    f"{alarm_id} {entity_id} {severity}"
                                ]
                                alarm_info["dates"].append(
                                    f"{alarm_date} {action}")
                        except KeyError:
                            if action != "log":
                                alarm_data[
                                    f"{alarm_id} {entity_id} {severity}"
                                ] = {
                                    "dates": [f"{alarm_date} {action}"],
                                }
                            else:
                                log_data[
                                    f"{alarm_id} {entity_id} {severity}"
                                ] = {
                                    "count": 1,
                                    "dates": [alarm_date],
                                }

        # Keep only the entries where an alarm transitions between set
        # and clear, dropping repeats of the same action
        for _, v in alarm_data.items():
            v["dates"] = sorted(v["dates"])
            temp = []
            temp.append(v["dates"][0])
            for i in range(1, len(v["dates"])):
                if v["dates"][i].split()[2] != v["dates"][i-1].split()[2]:
                    temp.append(v["dates"][i])
            v["dates"] = temp

        for _, v in log_data.items():
            v["dates"] = sorted(v["dates"])

        return alarm_data, log_data

    def substring(self, substr, files):
        """Substring algorithm
        Looks for substrings within files

        Parameters:
            substr (string list): List of substrings to look for
            files (string list): List of absolute filepaths to search in

        Errors:
            FileNotFoundError
        """
        # don't analyze older files, continue with current file
        CONTINUE_CURRENT = 0
        # analyze older files, continue with current file
        CONTINUE_CURRENT_OLD = 1

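        # Files are searched newest to oldest until _continue() reports
        # that a file begins before the requested start date:
        #   file -> file.1 ... file.n (rotated, plain)
        #        -> file.n.gz ...     (rotated, compressed)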
        data = []
        for file in files:
            try:
                if not os.path.exists(file):
                    # mtcAgent logs are typically only present on the
                    # active controller, so a missing controller-1 copy
                    # is not treated as an error
                    if (re.search("controller-1_(.+)/var/log/mtcAgent.log",
                                  file)):
                        continue
                    raise FileNotFoundError(f"File not found: {file}")
                cont = True
                # Searching through file
                command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
                           f"""{file} 2>/dev/null""")
                status = self._continue(file)

                if (status == CONTINUE_CURRENT
                        or status == CONTINUE_CURRENT_OLD):
                    # continue with current file
                    if status == CONTINUE_CURRENT:
                        cont = False
                    self._evaluate_substring(data, command)

                # Searching through rotated log files that aren't compressed
                n = 1
                while os.path.exists(f"{file}.{n}") and cont:
                    command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
                               f"""{file}.{n} 2>/dev/null""")
                    status = self._continue(f"{file}.{n}")

                    if (status == CONTINUE_CURRENT
                            or status == CONTINUE_CURRENT_OLD):
                        if status == CONTINUE_CURRENT:
                            cont = False
                        self._evaluate_substring(data, command)

                    n += 1

                # Searching through compressed rotated log files
                while os.path.exists(f"{file}.{n}.gz") and cont:
                    command = (f"""zgrep -E "{'|'.join(s for s in substr)}" """
                               f"""{file}.{n}.gz 2>/dev/null""")
                    status = self._continue(f"{file}.{n}.gz", compressed=True)

                    if (status == CONTINUE_CURRENT
                            or status == CONTINUE_CURRENT_OLD):
                        if status == CONTINUE_CURRENT:
                            cont = False
                        self._evaluate_substring(data, command)

                    n += 1

            except FileNotFoundError as e:
                logger.error(e)
                continue

        return sorted(data)

    def system_info(self):
        """System info algorithm
        Presents basic information about the system
        """
        data = []
        with open(
            os.path.join(
                self.active_controller_directory, "etc", "platform",
                "platform.conf"
            )
        ) as file:
            for line in file:
                if "system_mode" in line:
                    data.append(
                        f"System Mode: "
                        f"{re.match('^system_mode=(.*)', line).group(1)}"
                    )
                elif "system_type" in line:
                    data.append(
                        f"System Type: "
                        f"{re.match('^system_type=(.*)', line).group(1)}"
                    )
                elif "distributed_cloud_role" in line:
                    role = re.match('^distributed_cloud_role=(.*)',
                                    line).group(1)
                    data.append(f"Distributed cloud role: {role}")
                elif "sw_version" in line:
                    data.append(
                        f"SW Version: "
                        f"{re.match('^sw_version=(.*)', line).group(1)}"
                    )
        with open(
            os.path.join(self.active_controller_directory, "etc", "build.info")
        ) as file:
            for line in file:
                if "BUILD_TYPE" in line:
                    data.append(
                        f"Build Type: "
                        f"{re.match('^BUILD_TYPE=(.*)', line).group(1)}"
                    )
                elif re.match("^OS=(.*)", line):
                    data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}")

        return data

    def swact_activity(self):
        """Swact activity algorithm
        Presents all swacting activity in the system
        """
        data = []
        sm_files = []
        sm_customer_files = []
        swact_start = None
        swact_in_progress = False
        swact_end = None

        for _, folder in self.hosts["controllers"].items():
            sm_path = os.path.join(folder, "var", "log", "sm.log")
            sm_files.append(sm_path)
            sm_customer_path = os.path.join(folder, "var", "log",
                                            "sm-customer.log")
            sm_customer_files.append(sm_customer_path)

        sm_substrings = ["Uncontrolled swact", "Swact has started,",
                         "Neighbor (.+) is now in the down",
                         "Service (.+) has reached max failures",
                         "Swact update"]
        data = self.substring(sm_substrings, sm_files)

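        # Pair each "Swact has started," line with the next "Swact update"
        # line and annotate the latter with the elapsed duration, e.g.
        # "... SWACT TOOK 0:00:12"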
        for i, line in enumerate(data):
            if "Swact has started," in line and not swact_in_progress:
                swact_in_progress = True
                swact_start = datetime.strptime(line[0:19],
                                                "%Y-%m-%dT%H:%M:%S")
            elif "Swact update" in line and swact_in_progress:
                swact_in_progress = False
                swact_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
                line += f" SWACT TOOK {swact_end - swact_start} \n"
                data[i] = line

        sm_customer_substrings = [
            "swact", "active-failed\\s+\\| disabling-failed\\s+\\|"
        ]
        data += self.substring(sm_customer_substrings, sm_customer_files)

        return sorted(data)

    def puppet_errors(self):
        """Puppet errors algorithm
        Presents log errors from puppet logs
        """
        data = []
        for host_type in self.hosts.keys():
            for _, folder in self.hosts[host_type].items():
                puppet_folder = os.path.join(folder, "var", "log", "puppet")
                # "[m ]Error: " matches lines where "Error: " follows either
                # a space or the trailing "m" of an ANSI color escape
                command = (f"""grep -rh "[m ]Error: " {puppet_folder} """
                           f"""2>/dev/null""")
                self._evaluate_substring(data, command)
        return sorted(data)

    def process_failures(self):
        """Process failures algorithm
        Presents log errors from pmond
        """
        data = []
        files = []
        for host_type in self.hosts.keys():
            for _, folder in self.hosts[host_type].items():
                pmond = os.path.join(folder, "var", "log", "pmond.log")
                files.append(pmond)

        data = self.substring(["Error :"], files)

        return data

    def heartbeat_loss(self):
        """Heartbeat loss algorithm
        Presents all heartbeat loss error messages in the system
        """
        data = []
        hb_files = []

        for _, folder in self.hosts["controllers"].items():
            hb_path = os.path.join(folder, "var", "log", "hbsAgent.log")
            hb_files.append(hb_path)

        hb_substrings = ["Heartbeat Loss"]
        data = self.substring(hb_substrings, hb_files)

        return sorted(data)

    def maintenance_errors(self):
        """Maintenance errors algorithm
        Presents maintenance errors and other relevant log messages in system
        """
        data = []
        mtc_files = []

        for _, folder in self.hosts["controllers"].items():
            agent = os.path.join(folder, "var", "log", "mtcAgent.log")
            mtc_files.append(agent)

        for host_type in self.hosts.keys():
            for _, folder in self.hosts[host_type].items():
                client = os.path.join(folder, "var", "log", "mtcClient.log")
                mtc_files.append(client)

        mtc_substrings = ["Error : ", "Configuration failure",
                          "In-Test Failure", "Loss Of Communication",
                          "Graceful Recovery Wait ",
                          "regained MTCALIVE from host that has rebooted",
                          "Connectivity Recovered ; ",
                          "auto recovery disabled", "Graceful Recovery Failed",
                          "MNFA ENTER", "MNFA EXIT", "MNFA POOL"]
        data = self.substring(mtc_substrings, mtc_files)

        return sorted(data)

    def daemon_failures(self):
        """Daemon failures algorithm
        Presents all failed puppet manifest messages in the system
        """
        data = []
        daemon_files = []

        for host_type in self.hosts.keys():
            for _, folder in self.hosts[host_type].items():
                daemon_path = os.path.join(folder, "var", "log", "daemon.log")
                daemon_files.append(daemon_path)

        daemon_substrings = ["Failed to run the puppet manifest"]
        data = self.substring(daemon_substrings, daemon_files)

        return sorted(data)

    def state_changes(self):
        """State changes algorithm
        Presents all messages in the system regarding the state of hosts
        """
        data = []
        sc_files = []

        for _, folder in self.hosts["controllers"].items():
            sc_path = os.path.join(folder, "var", "log", "mtcAgent.log")
            sc_files.append(sc_path)

        sc_substrings = ["is ENABLED", "allStateChange (.+)locked-disabled"]
        data = self.substring(sc_substrings, sc_files)

        return sorted(data)

    def audit(self, start, end, audit_log_path):
        """Counts audit events in dcmanager within a specified date range

        Parameters:
            start (string)          : start date in YYYY-MM-DD HH:MM:SS format
            end (string)            : end date in YYYY-MM-DD HH:MM:SS format
            audit_log_path (string) : absolute path of audit log file
        """
        if not shutil.which("lnav"):
            raise ValueError("Lnav program not found")

        SECONDS_PER_HOUR = 3600
        fmt = "%Y-%m-%d %H:%M:%S"

        d1 = datetime.strptime(start, fmt)
        d2 = datetime.strptime(end, fmt)
        seconds = (d2 - d1).total_seconds()

        log_texts = [
            "Triggered subcloud audit%",
            "Trigger patch audit%",
            "Trigger load audit%",
            "Triggered firmware audit%",
            "Triggered kubernetes audit%",
            # Counts sum of audits from all subclouds
        ]
        INDEX_MIDDLE_WORD = 1
        data = [("These rates and totals represent the sum of audits from "
                 + "all subclouds")]

        def command(text):
            return (
                f'lnav -R -n -c ";SELECT count(log_body) AS '
                f'{text.split(" ")[INDEX_MIDDLE_WORD]}_total from '
                f'openstack_log WHERE '
                f'(log_time > \\"{start}\\" AND not log_time > \\"{end}\\")'
                f' AND log_body like \\"{text}\\"" "{audit_log_path}"'
            )

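        # lnav runs headless here: -n suppresses the UI, -R also loads
        # rotated logs, and the ";SELECT ..." command queries lnav's
        # virtual log table with SQLite syntax, printing one count per
        # audit type. The rate below normalizes that count to events per
        # hour, e.g. a count of 150 over a 24-hour range gives
        # 150/86400*3600 = 6.25 per hour.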
        for text in log_texts:
            p = subprocess.Popen(command(text), shell=True,
                                 stdout=subprocess.PIPE)
            for line in p.stdout:
                line = line.decode("utf-8").strip()
                if line.isnumeric():
                    data.append(
                        f"rate "
                        f"{round((int(line)/seconds * SECONDS_PER_HOUR), 3)} "
                        f"per hour. total: {line}"
                    )
                else:
                    data.append(line)
        return data

    # -----------------------------------

    def run_correlator(self, output_directory, plugin_output_dir):
        """Runs the correlator and prints the results differently based on
        whether the tool was run with or without the verbose option

        Parameters:
            output_directory (string)  : directory to place output files from
                                         correlator
            plugin_output_dir (string) : directory with output files from
                                         plugins
        """
        correlator = Correlator(plugin_output_dir)
        failures, events, alarms, state_changes = correlator.run(
            self.opts.hostname)
        failures_len, events_len = len(failures), len(events)
        alarms_len, state_changes_len = len(alarms), len(state_changes)
        failures.append("\nTotal failures found: " + str(failures_len) + "\n")
        events.append("\nTotal events found: " + str(events_len) + "\n")
        alarms.append("\nTotal alarms found: " + str(alarms_len) + "\n")
        state_changes.append("\nTotal state changes found: "
                             + str(state_changes_len) + "\n")

        logger.info("\nRunning correlator...")
        self._create_output_file("correlator_failures", output_directory,
                                 failures, "")
        self._create_output_file("correlator_events", output_directory,
                                 events, "")
        self._create_output_file("correlator_alarms", output_directory,
                                 alarms, "")
        self._create_output_file("correlator_state_changes", output_directory,
                                 state_changes, "")

        if not self.opts.verbose:
            logger.info("Output can be found at "
                        + os.path.relpath(output_directory) + "\n")
            logger.info("Failures: " + str(failures_len))
            for f in failures[:-1]:
                # f[0:19] is the timestamp prefix of the entry
                if "Uncontrolled swact" in f:
                    logger.info(f[0:19] + " "
                                + re.findall("active controller:? (.+)\n",
                                             f)[0] + " uncontrolled swact")
                elif "failure on" in f:
                    host = re.findall(r"failure on ([^\s]+) ", f)[0]
                    logger.info(f[0:19] + " " + host + " "
                                + re.findall("^(.+) failure on ",
                                             f[43:])[0].lower() + " failure")
                else:
                    logger.info(f[:-1])
            if failures_len != 0:
                logger.info("\nEvents: " + str(events_len))
            else:
                logger.info("Events: " + str(events_len))
            logger.info("Alarms: " + str(alarms_len))
            logger.info("State Changes: " + str(state_changes_len))
        else:
            logger.info("\nFailures: " + str(failures_len))
            for f in failures[:-1]:
                logger.info(f[:-1])

            # Dictionary to keep track of number of times events happen on
            # each host
            events_summ = {}
            for e in events[:-1]:
                k = e[20:-1].split(" (", 1)[0]
                if not events_summ.get(k):
                    events_summ[k] = 1
                else:
                    events_summ[k] += 1

            if failures_len != 0:
                logger.info("\nEvents: " + str(events_len))
            else:
                logger.info("Events: " + str(events_len))
            for k, v in sorted(events_summ.items()):
                logger.info(k + ": " + str(v) + " time(s)")

            if events_len != 0:
                logger.info("\nAlarms: " + str(alarms_len))
            else:
                logger.info("Alarms: " + str(alarms_len))
            logger.info("The full list of alarms can be found at "
                        + os.path.relpath(output_directory)
                        + "/correlator_alarms")

            # Dictionary to keep track of number of times state changes
            # happen on each host
            state_changes_summ = {}
            for s in state_changes[:-1]:
                k = s[20:-1]
                if "enabled" in k:
                    k = k.split("enabled", 1)[0] + "enabled"
                if not state_changes_summ.get(k):
                    state_changes_summ[k] = 1
                else:
                    state_changes_summ[k] += 1

            if alarms_len != 0:
                logger.info("\nState Changes: " + str(state_changes_len))
            else:
                logger.info("State Changes: " + str(state_changes_len))
            for k, v in sorted(state_changes_summ.items()):
                logger.info(k + ": " + str(v) + " time(s)")

    def _continue(self, file, compressed=False):
        # don't analyze older files, continue with current file
        CONTINUE_CURRENT = 0
        # analyze older files, continue with current file
        CONTINUE_CURRENT_OLD = 1
        # don't analyze current file, continue to older files
        CONTINUE_OLD = 2

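        # Decision, based on the first timestamp in the file:
        #   first < start          -> CONTINUE_CURRENT      (nothing older
        #                             can be in range; stop after this file)
        #   start <= first <= end  -> CONTINUE_CURRENT_OLD  (keep digging)
        #   first > end            -> CONTINUE_OLD          (skip this file)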
        # check date of first log event and compare with provided
        # start, end dates
        if not compressed:
            with open(file) as f:
                line = f.readline()
        else:
            with gzip.open(file, "rb") as f:
                line = f.readline().decode("utf-8")

        # treat files whose first line has no timestamp as in range
        try:
            datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
            first = line[0:19]
        except ValueError:
            return CONTINUE_CURRENT_OLD

        if first < self.opts.start:
            return CONTINUE_CURRENT
        elif first <= self.opts.end:
            return CONTINUE_CURRENT_OLD
        else:
            return CONTINUE_OLD

    def _evaluate_substring(self, data, command):
        p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        for line in p.stdout:
            line = line.decode("utf-8")
            # different date locations for log events
            dates = [line[0:19], line[2:21]]
            for date in dates:
                try:
                    datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
                    if date > self.opts.start and date < self.opts.end:
                        if line[0] == "|":  # sm-customer.log edge case
                            line = line[1:].strip()
                            line = re.sub("\\s+", " ", line)
                        data.append(line)
                    break
                except ValueError:
                    if date == dates[-1]:
                        data.append(line)

    def _extract_subfunction(self, host_info_path):
        GROUP_ONE = 1
        with open(host_info_path) as file:
            for line in file:
                hostname_match = re.match(
                    r"\s*hostname =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
                subfunction_match = re.match(
                    r"\s*subfunction =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
                if subfunction_match:
                    subfunction = subfunction_match.group(GROUP_ONE)
                if hostname_match:
                    hostname = hostname_match.group(GROUP_ONE)
        return hostname, subfunction

    def _create_output_file(self, filename, directory, data, processing):
        with open(os.path.join(directory, filename), "w") as file:
            for i in data:
                if i[-1] == "\n":
                    file.write(i)
                else:
                    file.write(i + "\n")
        if self.opts.verbose:
            output = ("output at "
                      + os.path.relpath(os.path.join(directory, filename)))
            if processing == "":
                logger.info(output)
            else:
                logger.info(processing + ", " + output)
        elif processing != "":
            logger.info(processing)