
The collect tool executes various Linux and system commands to gather data
into an archived collect bundle. System administrators often need to run
collect on busy, in-service systems, and have reported excessive CPU usage
and undesirable CPU spikes caused by certain collect operations. While the
collect tool already throttles ssh and scp, its data collection and
archiving commands lacked similar safeguards.

This update introduces the following enhancements to mitigate CPU spikes
and improve performance on heavily loaded in-service servers:

- removed one unnecessary tar archive operation.
- added tar archive checkpoint option support with an action handler
  (sketched after this message).
- removed one redundant kubelet api-resources call in the containerization
  plugin.
- added --chunk-size=50 support to the all-in-one kubelet get api-resources
  command to help throttle this long-running, heavyweight command. 50 seems
  to yield the lowest k8s api latency as measured with the k8smetrics tool.
- launch collect plugins with 'nice' and 'ionice' attributes.
- added 'nice' and 'ionice' attributes to select commands.
- added sleep delays after known cpu intensive data collection commands.
- removed the unnecessary -v (verbose) option from all tar commands.
- added a run_command utility that times the execution of commands and
  adaptively adds a small post-execution delay based on how long that
  command took to run.
- reduced the cpu impact of the containerization plugin by adding periodic
  delays.
- added a few periodic delays in other long-running or cpu intensive
  plugins.
- created a collect command timing log that is added to each host collect
  tarball. The timing log records how long each plugin took to run as well
  as each command called through the new run_command function.
- fixed an issue in the networking plugin.
- added a 60 second timeout for the 'lsof' heavyweight command.
- fixed the delimiter string hostname in all plugins.
- increased the default global timeout from 20 to 30 minutes.
- increased the default collect_host timeout from 600 to 900 seconds.
- incremented the tool minor version.

These improvements aim to minimize the performance impact of running
collect on busy in-service systems.

Note: When a process is started with nice, its CPU priority is inherited
by all threads spawned by that process. However, nice does not restrict
the total CPU time a process or its threads can use when there is no
contention.

Test Plan:

PASS: Verify build and install of collect package.
PASS: Verify collect runtime is not substantially longer.
PASS: Verify tar checkpoint handling on busy system where checkpoint
      action handler detects and invokes system overload handling.
PASS: Verify some CPU spike reduction compared to before update.

Regression:

PASS: Compare collect bundle size and contents before and after update.
PASS: Soak collect on busy/overloaded AIO SX system.
PASS: Verify report tool reports the same data before/after update.
PASS: Verify multi-node collect.

Closes-Bug: 2090923
Change-Id: If698d5f275f4482de205fa4a37e0398b19800777
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
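
For illustration only: tar invokes the checkpoint action handler each time a
checkpoint is reached. A minimal sketch of such a handler, assuming a made-up
load-average policy (the real /usr/local/sbin/collect_checkpoint logic is not
shown in this file), might look like:

    #!/bin/bash
    # Invoked by tar via --checkpoint-action=exec=<handler> at each checkpoint.
    # Briefly sleep when the system looks overloaded so the archive
    # operation yields CPU and IO to in-service workloads.
    loadavg=$(cut -d ' ' -f 1 /proc/loadavg)
    cores=$(nproc)
    # hypothetical policy: throttle when 1-minute load exceeds core count
    if (( $(echo "${loadavg} > ${cores}" | bc -l) )) ; then
        sleep 1
    fi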
#! /bin/bash
#
# Copyright (c) 2013-2019, 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

##########################################################################################

source /etc/collect/collect_timeouts

hostname="${HOSTNAME}"

DEBUG=false
redirect="/dev/null"

# Fail Codes
PASS=0
FAIL=1
RETRY=2

FAIL_NODETYPE=3

FAIL_TIMEOUT_GLOBAL=10
FAIL_TIMEOUT_OPERATION=11
FAIL_TIMEOUT_OPERATION_SCP=12
FAIL_TIMEOUT_OPERATION_SSH=13
FAIL_TIMEOUT_HOST_ACCESS=14
FAIL_TIMEOUT_HOST=15
FAIL_TIMEOUT_PW=16
FAIL_TIMEOUT_SCP=17
FAIL_TIMEOUT_SSH=18
FAIL_TIMEOUT_SUBCLOUD_ACCESS=19
FAIL_TIMEOUT_SUBCLOUD=20

FAIL_PASSWORD=30
FAIL_PERMISSION=31
FAIL_CLEANUP=32
FAIL_UNREACHABLE=33
FAIL_HOSTNAME=34
FAIL_INACTIVE=35
FAIL_PERMISSION_REMOTE=36
FAIL_OUT_OF_SPACE=37
FAIL_INSUFFICIENT_SPACE=38
FAIL_INTERNAL=39
FAIL_NO_TARDIR=40
FAIL_NO_TARBALLS=41
FAIL_NO_FILE_SPECIFIED=42
FAIL_FILE_NOT_FOUND=43
FAIL_FILE_EMPTY=44
FAIL_PASSWORD_PROMPT=45
FAIL_MISSING_PARAMETER=46
FAIL_DATE_FORMAT=47
FAIL_NO_HOSTS=48
FAIL_FILE_COPY=49
FAIL_SUBCLOUD=50
FAIL_CONTINUE=51
FAIL_SUBCLOUDNAME=52
FAIL_NO_SUBCLOUDS=53
FAIL_NOT_SYSTEMCONTROLLER=54
FAIL_NAME_TOO_LONG=55
FAIL_INVALID_START_DATE=56
FAIL_INVALID_END_DATE=57
FAIL_INVALID_DATE_RANGE=58
FAIL_TIMEOUT_ARG=59
FAIL_NOT_SUDOER=60
FAIL_NOT_SUDOER_REMOTE=61
FAIL_PASSWORDLESS=65
FAIL_PASSWORDLESS_REMOTE=66
FAIL_INSUFFICIENT_SPACE_REMOTE=70
FAIL_NOT_ENOUGH_SPACE_REMOTE=71
FAIL_OUT_OF_SPACE_REMOTE=72

# Warnings are above 200
WARN_WARNING=200
WARN_HOSTNAME=201
WARN_SUBCLOUD=202

COLLECT_ERROR="Error:"
COLLECT_DEBUG="Debug:"
COLLECT_WARN="Warning:"

# common permission error strings
pw_error="orry, try again"
ac_error="ermission denied"
su_error="not in the sudoers"

# Failure Strings
FAIL_NOT_ENOUGH_SPACE_STR="Not enough /scratch filesystem space"
FAIL_NOT_ENOUGH_SPACE_REMOTE_STR="Not enough remote /scratch filesystem space"
FAIL_OUT_OF_SPACE_STR="No space left on device"
FAIL_OUT_OF_SPACE_REMOTE_STR="No space left on remote device"
FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable"
FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device"
FAIL_INSUFFICIENT_SPACE_REMOTE_STR="Not enough space on remote device"

FAIL_TIMEOUT_STR="operation timeout"
FAIL_TIMEOUT_ARG_STR="out-of-range timeout"

# Operational timeouts
FAIL_TIMEOUT_GLOBAL_STR="global collect timeout"
FAIL_TIMEOUT_PW_STR="password prompt timeout"
FAIL_TIMEOUT_SCP_STR="scp timeout"
FAIL_TIMEOUT_SSH_STR="ssh timeout"
FAIL_TIMEOUT_OPERATION_STR="linux operation timeout"
FAIL_TIMEOUT_OPERATION_SSH_STR="ssh operation timeout"
FAIL_TIMEOUT_OPERATION_SCP_STR="scp operation timeout"

# host and subcloud timeouts
FAIL_TIMEOUT_HOST_ACCESS_STR="host access timeout"
FAIL_TIMEOUT_HOST_STR="host collect timeout"
FAIL_TIMEOUT_SUBCLOUD_ACCESS_STR="subcloud access timeout"
FAIL_TIMEOUT_SUBCLOUD_STR="subcloud collect timeout"

FAIL_NO_FILE_SPECIFIED_STR="no file specified"
FAIL_FILE_NOT_FOUND_STR="no such file or directory"
FAIL_FILE_EMPTY_STR="file is empty"
FAIL_PASSWORD_PROMPT_STR="password for"
FAIL_PASSWORDLESS_STR="timeout waiting for password prompt"
FAIL_PASSWORDLESS_REMOTE_STR="timeout waiting for remote password prompt"
FAIL_NOT_SUDOER_STR="collect requires sudo on host"
FAIL_NOT_SUDOER_REMOTE_STR="collect requires sudo on remote host"
FAIL_INVALID_PASSWORD_STR="invalid password"
FAIL_PERMISSION_STR="permission error"
FAIL_DATE_FORMAT_STR="date format"
FAIL_INACTIVE_STR="not active"
FAIL_NO_HOSTS_STR="empty host list"
FAIL_NO_SUBCLOUDS_STR="empty subcloud list"
FAIL_MISSING_PARAMETER_STR="missing parameter"
FAIL_FILE_COPY_STR="failed to copy"
FAIL_CONTINUE_STR="cannot continue"
FAIL_UNREACHABLE_STR="Unreachable"
FAIL_PERMISSION_REMOTE_STR="remote permission error"
FAIL_UNSPECIFIED_CAUSE_STR="unspecified cause"

# The maximum allowed %-used of /scratch for collect to proceed ;
# collect fails if usage is at or above this percentage.
MIN_PERCENT_SPACE_REQUIRED=75

# Subcloud collect stops when avail scratch drops below this threshold.
# Use collect -sc --continue to tell collect to continue collecting subclouds
# from where it left off.
declare -i COLLECT_BASE_DIR_FULL_THRESHOLD=2147484 # 2GiB in K blocks rounded up

# Log file path/names
COLLECT_LOG=collect.log
COLLECT_ERROR_LOG=/tmp/$(whoami)_collect_error.log
HOST_COLLECT_ERROR_LOG="/tmp/$(whoami)_host_collect_error.log"
COLLECT_CMD_TIMING_LOG="/tmp/$(whoami)_collect_cmd_timing.log"
HOST_COLLECT_CMD_TIMING_LOG="collect_cmd_timing.log"
DCROLE_SYSTEMCONTROLLER="systemcontroller"
DCROLE_SUBCLOUD="subcloud"

# Returns 0 if this controller's 'controller-services' service group
# is NOT active, and 1 if it is active.
function is_active_controller
{
    active=`sm-query service-group controller-services | grep "controller-services active"`
    if [ -z "$active" ] ; then
        return 0
    else
        return 1
    fi
}

function source_openrc_if_needed
{
    # get the node and subfunction types
    nodetype=""
    subfunction=""
    PLATFORM_CONF=/etc/platform/platform.conf
    if [ -e ${PLATFORM_CONF} ] ; then
        source ${PLATFORM_CONF}
    fi

    if [ "${nodetype}" != "controller" -a "${nodetype}" != "worker" -a "${nodetype}" != "storage" ] ; then
        wlog "could not identify nodetype ($nodetype)"
        exit $FAIL_NODETYPE
    fi

    ACTIVE=false
    if [ "$nodetype" == "controller" ] ; then
        # get local host activity state
        OPENRC="/etc/platform/openrc"
        if [ -e "${OPENRC}" ] ; then
            OS_PASSWORD=""
            source ${OPENRC} 2>/dev/null 1>/dev/null
            if [ "${OS_PASSWORD}" != "" ] ; then
                ACTIVE=true
            fi
        fi
    fi
}

# Setup an expect command completion file.
# This is used to force serialization of expect
# sequences and highlight command completion
collect_done="collect done"
cmd_done_sig="expect done"
cmd_done_file="/usr/local/sbin/expect_done"
EXPECT_LOG_FILE="/tmp/collect_expect"

# Compression Commands
TAR_ZIP_CMD="tar -czf"
TAR_UZIP_CMD="tar -xzf"
TAR_CMD="tar -chf"
TAR_CMD_APPEND="tar -rhf"
UNTAR_CMD="tar -xf"
ZIP_CMD="gzip"
NICE_CMD="/usr/bin/nice -n19"
IONICE_CMD="/usr/bin/ionice -c2 -n7"
COLLECT_TAG="COLLECT"

# Checkpoint definitions
# Default is 512 bytes per block
# Example 10000*512 = 5MBytes
CHECKPOINT_BLOCKS=10000
CHECKPOINT_CMD="--checkpoint=${CHECKPOINT_BLOCKS} --checkpoint-action=exec=/usr/local/sbin/collect_checkpoint"
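
# Example (illustrative only ; the paths are hypothetical): collection code
# would typically combine these helpers, prefixing a checkpointed archive
# command with the nice/ionice attributes, e.g.
#
#    ${NICE_CMD} ${IONICE_CMD} ${TAR_CMD} ${CHECKPOINT_CMD} /tmp/example.tar ${paths}
#
# ... so that tar runs at low CPU/IO priority and execs the
# collect_checkpoint action handler at each checkpoint, giving the handler
# a chance to throttle the archive operation on a busy system.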

STARTDATE_OPTION="--start-date"
ENDDATE_OPTION="--end-date"

DATE_FORMAT="+%H:%M:%S.%3N"

PROCESS_DETAIL_CMD="ps -e -H --forest -o ruser,tid,pid,ppid,flags,stat,policy,rtprio,nice,priority,rss:10,vsz:10,sz:10,psr,stime,etime,cputime,wchan:14,tty,cmd"
BUILD_INFO_CMD="cat /etc/build.info"

################################################################################
# Log Debug, Info or Error log message to syslog
################################################################################
function log
{
    logger -t ${COLLECT_TAG} "$(whoami) $@"
}

function ilog
{
    echo "$@"
    logger -t ${COLLECT_TAG} "$(whoami) $@"
}

function elog
{
    echo "${COLLECT_ERROR} $@"
    logger -t ${COLLECT_TAG} "$(whoami) ${COLLECT_ERROR} $@"
}

function wlog
{
    echo "${COLLECT_WARN} $@"
    logger -t ${COLLECT_TAG} "$(whoami) ${COLLECT_WARN} $@"
}

function set_debug_mode()
{
    DEBUG=${1}
}

function dlog()
{
    if [ "$DEBUG" == true ] ; then
        logger -t ${COLLECT_TAG} $(whoami) "${COLLECT_DEBUG} $@"
        echo "$(date) ${COLLECT_DEBUG} $@"
    fi
}

function delimiter()
{
    echo "--------------------------------------------------------------------" >> ${1} 2>>${COLLECT_ERROR_LOG}
    echo "`date` : ${hostname} : ${2}" >> ${1} 2>>${COLLECT_ERROR_LOG}
    echo "--------------------------------------------------------------------" >> ${1} 2>>${COLLECT_ERROR_LOG}
}

function get_time_delta () {
    start_epoch=$(date -d "1970-01-01 $1" +%s%3N)
    stop_epoch=$(date -d "1970-01-01 $2" +%s%3N)
    delta=$((${stop_epoch}-${start_epoch}))
    secs=$((delta / 1000))
    msecs=$((delta % 1000))
    # zero pad the milliseconds so that, say, 1 sec and 5 msecs
    # reads 1.005 rather than 1.5
    printf "%d.%03d\n" ${secs} ${msecs}
}

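# Illustrative usage of get_time_delta (hypothetical timestamps):
#
#    get_time_delta "14:01:02.100" "14:01:03.350"   # prints 1.250
#
# Both arguments are expected in the DATE_FORMAT (+%H:%M:%S.%3N) form
# produced by 'date'.
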
###############################################################################
#
# Name:        run_command
#
# Purpose:     Run the specified command and direct the output of
#              that command to the specified log.
#
# Assumptions: Requires 2 and only 2 arguments
#
# Arguments:   $1 - string command to execute
#              $2 - string path/name of file to direct command output to
#
# Warning:     Command is not executed unless there are only 2 arguments
#              supplied. This check helps identify code errors in command
#              execution and output redirection. Error is logged to the error
#              log as well as the execution timing summary log.
#
###############################################################################

function run_command () {
    if [ "$#" -ne 2 ]; then
        echo "Error: run_command requires 2 arguments only ; saw $# Argument(s): '$@'" >> ${COLLECT_CMD_TIMING_LOG}
        return 1
    fi

    local cmd="${1}"
    local log="${2}"
    local start=$(date ${DATE_FORMAT})
    delimiter "${log}" "${cmd}"
    ${cmd} >> ${log} 2>>${COLLECT_ERROR_LOG}
    rc=$?
    local stop=$(date ${DATE_FORMAT})
    duration=$(get_time_delta "${start}" "${stop}")

    # perform a short sleep based on how long this command took
    # if COLLECT_RUN_COMMAND_ADAPTIVE_DELAY handling is true
    # Example:
    #    sleep 0.75 if command took 3 seconds or longer
    #    sleep 0.50 if command took 1 second or longer
    #    sleep 0.2  if command took 100 milliseconds or longer
    #    sleep 0.1  if command took 50 milliseconds or longer
    sleep_time=0
    sleep_time_str="0"
    if [ "${COLLECT_RUN_COMMAND_ADAPTIVE_DELAY}" = true ] ; then

        secs=${duration%%.*}

        if [ ${secs} -ge ${COLLECT_RUNCMD_XLARGE_THRESHOLD} ] ; then
            sleep_time=${COLLECT_RUNCMD_XLARGE_DELAY}
            sleep_time_str="${COLLECT_RUNCMD_XLARGE_DELAY}"

        elif [ ${secs} -gt ${COLLECT_RUNCMD_LARGE_THRESHOLD} ] ; then
            sleep_time=${COLLECT_RUNCMD_LARGE_DELAY}
            sleep_time_str="${COLLECT_RUNCMD_LARGE_DELAY}"

        else
            msec=${duration#*.}

            if [ ${msec} -gt ${COLLECT_RUNCMD_MEDIUM_THRESHOLD} ] ; then
                sleep_time=${COLLECT_RUNCMD_MEDIUM_DELAY}
                sleep_time_str="${COLLECT_RUNCMD_MEDIUM_DELAY}"

            elif [ ${msec} -gt ${COLLECT_RUNCMD_SMALL_THRESHOLD} ] ; then
                sleep_time=${COLLECT_RUNCMD_SMALL_DELAY}
                sleep_time_str="${COLLECT_RUNCMD_SMALL_DELAY}"
            fi
        fi
    fi

    if [ "${sleep_time}" != "0" ] ; then
        sleep ${sleep_time}
    fi
    echo "$(date ${DATE_FORMAT}): ${duration}->${sleep_time_str} - ${log} - ${cmd}" >> ${COLLECT_CMD_TIMING_LOG}
    return ${rc}
}

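# Illustrative usage of run_command (hypothetical log path):
#
#    run_command "${BUILD_INFO_CMD}" "/tmp/example.log"
#
# The command output is appended to the log under a delimiter banner and a
# timing record is appended to ${COLLECT_CMD_TIMING_LOG} ; on long running
# commands a short adaptive post-execution delay may follow.
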
function log_slabinfo()
{
    PAGE_SIZE=$(getconf PAGE_SIZE)
    ${IONICE_CMD} ${NICE_CMD} cat /proc/slabinfo | awk -v page_size_B=${PAGE_SIZE} '
        BEGIN {page_KiB = page_size_B/1024; TOT_KiB = 0;}
        (NF == 17) {
            gsub(/[<>]/, "");
            printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8s\n",
                   $2, $3, $4, $5, $6, $7, $8, $10, $11, $12, $13, $15, $16, $17, "KiB");
        }
        (NF == 16) {
            num_objs=$3; obj_per_slab=$5; pages_per_slab=$6;
            KiB = (obj_per_slab > 0) ? page_KiB*num_objs/obj_per_slab*pages_per_slab : 0;
            TOT_KiB += KiB;
            printf("%-22s %11d %8d %8d %10d %12d %1s %5d %10d %12d %1s %12d %9d %11d %8d\n",
                   $1, $2, $3, $4, $5, $6, $7, $9, $10, $11, $12, $14, $15, $16, KiB);
        }
        END {
            printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8d\n",
                   "TOTAL", "-", "-", "-", "-", "-", ":", "-", "-", "-", ":", "-", "-", "-", TOT_KiB);
        }
    ' >> ${1} 2>>${COLLECT_ERROR_LOG}
}
###########################################################################
#
# Name       : collect_errors
#
# Description: search COLLECT_ERROR_LOG for "No space left on device" logs
#              Return 0 if no such logs are found.
#              Return a non-zero out-of-space failure code if such logs
#              are found.
#
# Assumptions: Caller should assume a non-zero return as an indication of
#              a corrupt or incomplete collect log
#
# Create logs and screen echos that record the error for the user.
#
# May look for other errors in the future
#
###########################################################################

listOfOutOfSpaceErrors=(
    "${FAIL_OUT_OF_SPACE_STR}"
    "${FAIL_TAR_OUT_OF_SPACE_STR}"
    "${FAIL_INSUFFICIENT_SPACE_STR}"
)

function collect_errors()
{
    local host="${1}"
    local rc=0

    if [ -e "${COLLECT_ERROR_LOG}" ] ; then

        ## now loop through known space related error strings
        index=0
        while [ "x${listOfOutOfSpaceErrors[index]}" != "x" ] ; do
            grep -q "${listOfOutOfSpaceErrors[index]}" ${COLLECT_ERROR_LOG}
            if [ "$?" == "0" ] ; then
                wlog "Out of space error(s) found in ${host}:${COLLECT_ERROR_LOG}"
                if [ "${REMOTE_HOST}" = false ] ; then
                    rc=${FAIL_OUT_OF_SPACE}
                else
                    rc=${FAIL_OUT_OF_SPACE_REMOTE}
                fi
                # return error code
                break
            fi
            index=$(($index+1))
        done
    fi
    return ${rc}
}

############################################################################
#
# Name       : space_precheck
#
# Description: Fail (or warn for a remote host) and exit if the %-used
#              of the collect base dir is at or above
#              MIN_PERCENT_SPACE_REQUIRED.
#
############################################################################

function space_precheck()
{
    HOSTNAME=${1}
    COLLECT_BASE_DIR=${2}
    COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}"

    space="`${COLLECT_DIR_PCENT_CMD}`"
    space1=`echo "${space}" | grep -v Use`
    size=`echo ${space1} | cut -f 1 -d '%'`
    if [ ${size} -ge 0 -a ${size} -le 100 ] ; then
        if [ ${size} -ge ${MIN_PERCENT_SPACE_REQUIRED} ] ; then
            if [ "${REMOTE_HOST}" = false ] ; then
                elog "${HOSTNAME}:${COLLECT_BASE_DIR} ${FAIL_INSUFFICIENT_SPACE_STR}"
            else
                wlog "${HOSTNAME}:${COLLECT_BASE_DIR} ${FAIL_INSUFFICIENT_SPACE_REMOTE_STR}"
            fi
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} at ${size}% ; usage must be below ${MIN_PERCENT_SPACE_REQUIRED}%"
            wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation"
            exit ${FAIL_INSUFFICIENT_SPACE}
        fi
    else
        wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output"
    fi
}

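# Illustrative: 'df --output=pcent /scratch' prints a header line plus the
# usage percentage, e.g.
#
#    Use%
#     64%
#
# ... grep -v Use drops the header and cut -f 1 -d '%' reduces the
# remainder to the bare integer "64" compared above.
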
############################################################################
#
# Name       : remove_common_paths
#
# Description: Remove from the first 'include' file any paths that
#              also appear in the second 'exclude' file.
#
# Parameters : $1 include list file
#              $2 exclude list file
#
# Updates    : include list file
#
############################################################################

function remove_common_paths()
{
    local include_file=${1}
    local exclude_file=${2}
    local output_file=$(mktemp /tmp/collect_include_list.XXXXXX)

    # Ensure both files exist
    if [[ ! -f "${exclude_file}" || ! -f "${include_file}" ]] ; then
        return 1
    fi

    # Read exclude path patterns into an array
    mapfile -t exclude_patterns < "${exclude_file}"

    # Use awk to filter out of include_file any paths that
    # match one of the exclude_patterns.
    awk -v exclude="${exclude_patterns[*]}" '
        BEGIN {split(exclude, patterns, " ")}
        {
            exclude_flag = 0
            for (i in patterns)
            {
                if (index($0, patterns[i]) == 1)
                {
                    exclude_flag = 1
                    break
                }
            }
            if (!exclude_flag)
            {
                print
            }
        }
    ' "${include_file}" > "${output_file}"
    rc=$?

    # replace the caller's include file with the filtered file
    if [[ ${rc} -eq 0 && -f ${output_file} ]] ; then
        cp -a ${output_file} ${include_file}
        rm -f ${output_file}
    fi
    return 0
}
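
# Illustrative usage of remove_common_paths (hypothetical file names):
# given an include list containing /var/log and /var/log/secure, and an
# exclude list containing /var/log/secure, the include list is rewritten
# in place without the excluded path. Note the prefix match: an exclude
# entry also removes any include paths nested under it.
#
#    remove_common_paths /tmp/example_include.list /tmp/example_exclude.list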