utilities/tools/collector/debian-scripts/collect_utils

#! /bin/bash
#
# Copyright (c) 2013-2019, 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

##########################################################################################

source /etc/collect/collect_timeouts

hostname="${HOSTNAME}"

DEBUG=false
redirect="/dev/null"

# Fail Codes
PASS=0
FAIL=1
RETRY=2

FAIL_NODETYPE=3

FAIL_TIMEOUT_GLOBAL=10
FAIL_TIMEOUT_OPERATION=11
FAIL_TIMEOUT_OPERATION_SCP=12
FAIL_TIMEOUT_OPERATION_SSH=13
FAIL_TIMEOUT_HOST_ACCESS=14
FAIL_TIMEOUT_HOST=15
FAIL_TIMEOUT_PW=16
FAIL_TIMEOUT_SCP=17
FAIL_TIMEOUT_SSH=18
FAIL_TIMEOUT_SUBCLOUD_ACCESS=19
FAIL_TIMEOUT_SUBCLOUD=20

FAIL_PASSWORD=30
FAIL_PERMISSION=31
FAIL_CLEANUP=32
FAIL_UNREACHABLE=33
FAIL_HOSTNAME=34
FAIL_INACTIVE=35
FAIL_PERMISSION_REMOTE=36
FAIL_OUT_OF_SPACE=37
FAIL_INSUFFICIENT_SPACE=38
FAIL_INTERNAL=39
FAIL_NO_TARDIR=40
FAIL_NO_TARBALLS=41
FAIL_NO_FILE_SPECIFIED=42
FAIL_FILE_NOT_FOUND=43
FAIL_FILE_EMPTY=44
FAIL_PASSWORD_PROMPT=45
FAIL_MISSING_PARAMETER=46
FAIL_DATE_FORMAT=47
FAIL_NO_HOSTS=48
FAIL_FILE_COPY=49
FAIL_SUBCLOUD=50
FAIL_CONTINUE=51
FAIL_SUBCLOUDNAME=52
FAIL_NO_SUBCLOUDS=53
FAIL_NOT_SYSTEMCONTROLLER=54
FAIL_NAME_TOO_LONG=55
FAIL_INVALID_START_DATE=56
FAIL_INVALID_END_DATE=57
FAIL_INVALID_DATE_RANGE=58
FAIL_TIMEOUT_ARG=59
FAIL_NOT_SUDOER=60
FAIL_NOT_SUDOER_REMOTE=61
FAIL_PASSWORDLESS=65
FAIL_PASSWORDLESS_REMOTE=66
FAIL_INSUFFICIENT_SPACE_REMOTE=70
FAIL_NOT_ENOUGH_SPACE_REMOTE=71
FAIL_OUT_OF_SPACE_REMOTE=72

# Warnings are above 200
WARN_WARNING=200
WARN_HOSTNAME=201
WARN_SUBCLOUD=202

COLLECT_ERROR="Error:"
COLLECT_DEBUG="Debug:"
COLLECT_WARN="Warning:"

# common permission error strings
pw_error="orry, try again"
ac_error="ermission denied"
su_error="not in the sudoers"

# Failure Strings
FAIL_NOT_ENOUGH_SPACE_STR="Not enough /scratch filesystem space"
FAIL_NOT_ENOUGH_SPACE_REMOTE_STR="Not enough remote /scratch filesystem space"
FAIL_OUT_OF_SPACE_STR="No space left on device"
FAIL_OUT_OF_SPACE_REMOTE_STR="No space left on remote device"
FAIL_TAR_OUT_OF_SPACE_STR="tar: Error is not recoverable"
FAIL_INSUFFICIENT_SPACE_STR="Not enough space on device"
FAIL_INSUFFICIENT_SPACE_REMOTE_STR="Not enough space on remote device"

FAIL_TIMEOUT_STR="operation timeout"
FAIL_TIMEOUT_ARG_STR="out-of-range timeout"

# Operational timeouts
FAIL_TIMEOUT_GLOBAL_STR="global collect timeout"
FAIL_TIMEOUT_PW_STR="password prompt timeout"
FAIL_TIMEOUT_SCP_STR="scp timeout"
FAIL_TIMEOUT_SSH_STR="ssh timeout"
FAIL_TIMEOUT_OPERATION_STR="linux operation timeout"
FAIL_TIMEOUT_OPERATION_SSH_STR="ssh operation timeout"
FAIL_TIMEOUT_OPERATION_SCP_STR="linux operation timeout"

# host and subcloud timeouts
FAIL_TIMEOUT_HOST_ACCESS_STR="host access timeout"
FAIL_TIMEOUT_HOST_STR="host collect timeout"
FAIL_TIMEOUT_SUBCLOUD_ACCESS_STR="subcloud access timeout"
FAIL_TIMEOUT_SUBCLOUD_STR="subcloud collect timeout"

FAIL_NO_FILE_SPECIFIED_STR="no file specified"
FAIL_FILE_NOT_FOUND_STR="no such file or directory"
FAIL_FILE_EMPTY_STR="file is empty"
FAIL_PASSWORD_PROMPT_STR="password for"
FAIL_PASSWORDLESS_STR="timeout waiting for password prompt"
FAIL_PASSWORDLESS_REMOTE_STR="timeout waiting for remote password prompt"
FAIL_NOT_SUDOER_STR="collect requires sudo on host"
FAIL_NOT_SUDOER_REMOTE_STR="collect requires sudo on remote host"
FAIL_INVALID_PASSWORD_STR="invalid password"
FAIL_PERMISSION_STR="permission error"
FAIL_DATE_FORMAT_STR="date format"
FAIL_INACTIVE_STR="not active"
FAIL_NO_HOSTS_STR="empty host list"
FAIL_NO_SUBCLOUDS_STR="empty subcloud list"
FAIL_MISSING_PARAMETER_STR="missing parameter"
FAIL_FILE_COPY_STR="failed to copy"
FAIL_CONTINUE_STR="cannot continue"
FAIL_UNREACHABLE_STR="Unreachable"
FAIL_PERMISSION_REMOTE_STR="remote permission error"
FAIL_UNSPECIFIED_CAUSE_STR="unspecified cause"

# The minimum amount of % free space on /scratch to allow collect to proceed
MIN_PERCENT_SPACE_REQUIRED=75

# Subcloud collect stops when avail scratch drops below this threshold.
# Use collect -sc --continue to tell collect to continue collecting subclouds
# from where it left off.
# 2Gib in K blocks rounded up
declare -i COLLECT_BASE_DIR_FULL_THRESHOLD=2147484 # 2Gib in K blocks rounded up

# Log file path/names
COLLECT_LOG=collect.log
COLLECT_ERROR_LOG=/tmp/$(whoami)_collect_error.log
HOST_COLLECT_ERROR_LOG="/tmp/$(whoami)_host_collect_error.log"
COLLECT_CMD_TIMING_LOG="/tmp/$(whoami)_collect_cmd_timing.log"
HOST_COLLECT_CMD_TIMING_LOG="collect_cmd_timing.log"
DCROLE_SYSTEMCONTROLLER="systemcontroller"
DCROLE_SUBCLOUD="subcloud"

function is_active_controller
{
    active=`sm-query service-group controller-services | grep "controller-services active"`
    if [ -z "$active" ] ; then
        return 0
    else
        return 1
    fi
}

function source_openrc_if_needed
{
    # get the node and subfunction types
    nodetype=""
    subfunction=""
    PLATFORM_CONF=/etc/platform/platform.conf
    if [ -e ${PLATFORM_CONF} ] ; then
        source ${PLATFORM_CONF}
    fi

    if [ "${nodetype}" != "controller" -a "${nodetype}" != "worker" -a "${nodetype}" != "storage" ] ; then
        wlog "could not identify nodetype ($nodetype)"
        exit $FAIL_NODETYPE
    fi

    ACTIVE=false
    if [ "$nodetype" == "controller" ] ; then
        # get local host activity state
        OPENRC="/etc/platform/openrc"
        if [ -e "${OPENRC}" ] ; then
            OS_PASSWORD=""
            source ${OPENRC} 2>/dev/null 1>/dev/null
            if [ "${OS_PASSWORD}" != "" ] ; then
                ACTIVE=true
            fi
        fi
    fi
}

# Setup an expect command completion file.
# This is used to force serialization of expect
# sequences and highlight command completion
collect_done="collect done"
cmd_done_sig="expect done"
cmd_done_file="/usr/local/sbin/expect_done"
EXPECT_LOG_FILE="/tmp/collect_expect"

# Compression Commands
TAR_ZIP_CMD="tar -czf"
TAR_UZIP_CMD="tar -xzf"
TAR_CMD="tar -chf"
TAR_CMD_APPEND="tar -rhf"
UNTAR_CMD="tar -xf"
ZIP_CMD="gzip"
NICE_CMD="/usr/bin/nice -n19"
IONICE_CMD="/usr/bin/ionice -c2 -n7"
COLLECT_TAG="COLLECT"

# Checkpoint definitions
# Default is 512 bytes per block
# Example 10000*512 = 5MBytes
CHECKPOINT_BLOCKS=10000
CHECKPOINT_CMD="--checkpoint=${CHECKPOINT_BLOCKS} --checkpoint-action=exec=/usr/local/sbin/collect_checkpoint"

STARTDATE_OPTION="--start-date"
ENDDATE_OPTION="--end-date"

DATE_FORMAT="+%H:%M:%S.%3N"

PROCESS_DETAIL_CMD="ps -e -H --forest -o ruser,tid,pid,ppid,flags,stat,policy,rtprio,nice,priority,rss:10,vsz:10,sz:10,psr,stime,etime,cputime,wchan:14,tty,cmd"
BUILD_INFO_CMD="cat /etc/build.info"

################################################################################
# Log Debug, Info or Error log message to syslog
################################################################################
function log
{
    logger -t ${COLLECT_TAG} "$(whoami) $@"
}

function ilog
{
    echo "$@"
    logger -t ${COLLECT_TAG} "$(whoami) $@"
}

function elog
{
    echo "${COLLECT_ERROR} $@"
    logger -t ${COLLECT_TAG} "$(whoami) ${COLLECT_ERROR} $@"
}

function wlog
{
    echo "${COLLECT_WARN} $@"
    logger -t ${COLLECT_TAG} "$(whoami) ${COLLECT_WARN} $@"
}

function set_debug_mode()
{
    DEBUG=${1}
}

function dlog()
{
    if [ "$DEBUG" == true ] ; then
        logger -t ${COLLECT_TAG} $(whoami) "${COLLECT_DEBUG} $@"
        echo "$(date) ${COLLECT_DEBUG} $@"
    fi
}


function delimiter()
{
    echo "--------------------------------------------------------------------" >> ${1} 2>>${COLLECT_ERROR_LOG}
    echo "`date` : ${hostname} : ${2}" >> ${1} 2>>${COLLECT_ERROR_LOG}
    echo "--------------------------------------------------------------------" >> ${1} 2>>${COLLECT_ERROR_LOG}
}


function get_time_delta () {
    start_epoch=$(date -d "1970-01-01 $1" +%s%3N)
    stop_epoch=$(date -d "1970-01-01 $2" +%s%3N)
    delta=$((${stop_epoch}-${start_epoch}))
    secs=$((delta / 1000))
    msecs=$((delta % 1000))
    echo "${secs}.${msecs}"
}

###############################################################################
#
# Name:        run_command
#
# Purpose:     Run the specified command and direct the output of
#              that command to the specified log.
#
# Assumptions: Requires 2 and only 2 arguments
#
# Arguments:   $1 - string command to execute
#              $2 - string path/name of file to direct command output to
#
# Warning:     Command is not executed unless there are only 2 arguments
#              supplied. This check helps identify code errors in command
#              execution and output redirection. Error is logged to the error
#              log as well as the execution timing summary log.
#
###############################################################################

function run_command () {
    if [ "$#" -ne 2 ]; then
        echo "Error: run_command requires 2 arguments only ; saw $# Argument(s): '$@'" >> ${COLLECT_CMD_TIMING_LOG}
        return 1
    fi

    local cmd="${1}"
    local log="${2}"
    local start=$(date ${DATE_FORMAT})
    delimiter "${log}" "${cmd}"
    ${cmd} >> ${log} 2>>${COLLECT_ERROR_LOG}
    rc=$?
    local stop=$(date ${DATE_FORMAT})
    duration=$(get_time_delta "${start}" "${stop}")
    # echo "$(date ${DATE_FORMAT}): ${duration} - ${log} - ${cmd}" >> ${COLLECT_CMD_TIMING_LOG}
    # return ${rc}

    # perform a short sleep based on how long this command took
    # if COLLECT_RUN_COMMAND_ADAPTIVE_DELAY handling is true
    # Example:
    # sleep 0.75 if command took 3 seconds or longer
    # sleep 0.50 if command took 1 second or longer
    # sleep 0.2  if command took 100 milliseconds or longer
    # sleep 0.1  if command took 50 milliseconds or longer
    sleep_time=0
    sleep_time_str="0"
    if [ "${COLLECT_RUN_COMMAND_ADAPTIVE_DELAY}" = true ] ; then

        secs=${duration%%.*}

        if [ ${secs} -ge ${COLLECT_RUNCMD_XLARGE_THRESHOLD} ] ; then
            sleep_time ${COLLECT_RUNCMD_XLARGE_DELAY}
            sleep_time_str="${COLLECT_RUNCMD_XLARGE_DELAY}"

        elif [ ${secs} -gt ${COLLECT_RUNCMD_LARGE_THRESHOLD} ] ; then
            sleep_time ${COLLECT_RUNCMD_LARGE_DELAY}
            sleep_time_str="${COLLECT_RUNCMD_LARGE_DELAY}"

        else
            msec=${duration#*.}

            if [ ${msec} -gt ${COLLECT_RUNCMD_MEDIUM_THRESHOLD} ] ; then
                sleep_time ${COLLECT_RUNCMD_MEDIUM_DELAY}
                sleep_time_str="${COLLECT_RUNCMD_MEDIUM_DELAY}"

            elif [ ${msec} -gt ${COLLECT_RUNCMD_SMALL_THRESHOLD} ] ; then
                sleep_time ${COLLECT_RUNCMD_SMALL_DELAY}
                sleep_time_str="${COLLECT_RUNCMD_SMALL_DELAY}"
            fi
        fi
    fi

    if [ "${sleep_time}" != "0" ] ; then
        sleep ${sleep_time}
    fi
    echo "$(date ${DATE_FORMAT}): ${duration}->${sleep_time_str} - ${log} - ${cmd}" >> ${COLLECT_CMD_TIMING_LOG}
    return ${rc}
}

function log_slabinfo()
{
    PAGE_SIZE=$(getconf PAGE_SIZE)
    ${IONICE_CMD} ${NICE_CMD} cat /proc/slabinfo | awk -v page_size_B=${PAGE_SIZE} '
    BEGIN {page_KiB = page_size_B/1024; TOT_KiB = 0;}
    (NF == 17) {
        gsub(/[<>]/, "");
        printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8s\n",
        $2, $3, $4, $5, $6, $7, $8, $10, $11, $12, $13, $15, $16, $17, "KiB");
    }
    (NF == 16) {
        num_objs=$3; obj_per_slab=$5; pages_per_slab=$6;
        KiB = (obj_per_slab > 0) ? page_KiB*num_objs/obj_per_slab*pages_per_slab : 0;
        TOT_KiB += KiB;
        printf("%-22s %11d %8d %8d %10d %12d %1s %5d %10d %12d %1s %12d %9d %11d %8d\n",
        $1, $2, $3, $4, $5, $6, $7, $9, $10, $11, $12, $14, $15, $16, KiB);
    }
    END {
        printf("%-22s %11s %8s %8s %10s %12s %1s %5s %10s %12s %1s %12s %9s %11s %8d\n",
        "TOTAL", "-", "-", "-", "-", "-", ":", "-", "-", "-", ":", "-", "-", "-", TOT_KiB);
    }
    ' >> ${1} 2>>${COLLECT_ERROR_LOG}
}
###########################################################################
#
# Name       : collect_errors
#
# Description: search COLLECT_ERROR_LOG for "No space left on device" logs
#              Return 0 if no such logs are found.
#              Return 1 if such logs are found
#
# Assumptions: Caller should assume a non-zero return as an indication of
#              a corrupt or incomplete collect log
#
# Create logs and screen echos that record the error for the user.
#
# May look for other errors in the future
#
###########################################################################

listOfOutOfSpaceErrors=(
"${FAIL_OUT_OF_SPACE_STR}"
"${FAIL_TAR_OUT_OF_SPACE_STR}"
"${FAIL_INSUFFICIENT_SPACE_STR}"
)

function collect_errors()
{
    local host="${1}"
    local rc=0

    if [ -e "${COLLECT_ERROR_LOG}" ] ; then

        ## now loop through known space related error strings
        index=0
        while [ "x${listOfOutOfSpaceErrors[index]}" != "x" ] ; do
            grep -q "${listOfOutOfSpaceErrors[index]}" ${COLLECT_ERROR_LOG}
            if [ "$?" == "0" ] ; then
                wlog "Out of space error(s) found in ${host}:${COLLECT_ERROR_LOG}"
                if [ "${REMOTE_HOST}" = false ] ; then
                    rc=${FAIL_OUT_OF_SPACE}
                else
                    rc=${FAIL_OUT_OF_SPACE_REMOTE}
                fi
                # return error code
                break
            fi
            index=$(($index+1))
        done
    fi
    return ${rc}
}

############################################################################
#
# Name       : space_precheck
#
# Description:
#
############################################################################

function space_precheck()
{
    HOSTNAME=${1}
    COLLECT_BASE_DIR=${2}
    COLLECT_DIR_PCENT_CMD="df --output=pcent ${COLLECT_BASE_DIR}"

    space="`${COLLECT_DIR_PCENT_CMD}`"
    space1=`echo "${space}" | grep -v Use`
    size=`echo ${space1} | cut -f 1 -d '%'`
    if [ ${size} -ge 0 -a ${size} -le 100 ] ; then
        if [ ${size} -ge ${MIN_PERCENT_SPACE_REQUIRED} ] ; then
            if [ "${REMOTE_HOST}" = false ] ; then
                elog "${HOSTNAME}:${COLLECT_BASE_DIR} ${FAIL_INSUFFICIENT_SPACE_STR}"
            else
                wlog "${HOSTNAME}:${COLLECT_BASE_DIR} ${FAIL_INSUFFICIENT_SPACE_REMOTE_STR}"
            fi
            wlog "${HOSTNAME}:${COLLECT_BASE_DIR} at ${size}% ; usage must be below ${MIN_PERCENT_SPACE_REQUIRED}%"
            wlog "Increase available space in ${HOSTNAME}:${COLLECT_BASE_DIR} and retry operation"
            exit ${FAIL_INSUFFICIENT_SPACE}
        fi
    else
        wlog "unable to parse available space from '${COLLECT_DIR_PCENT_CMD}' output"
    fi
}


############################################################################
#
# Name       : remove_common_paths
#
# Description: Remove common strings from first 'include' file that
#              exists in second 'exclude' file.
#
# Parameters : $1 include list file
#              $2 exclude list file
#
# Updates    : include list file
#
############################################################################

function remove_common_paths()
{
    local include_file=${1}
    local exclude_file=${2}
    local output_file=$(mktemp /tmp/collect_include_list.XXXXXX)

    # Ensure both files exist
    if [[ ! -f "${exclude_file}" || ! -f "${include_file}" ]] ; then
        return 1
    fi

    # Read exclude path patterns into an array
    mapfile -t exclude_patterns < "${exclude_file}"

    # Use awk to filter out paths that are in exclude_file from
    # include_file that match any exclude_patterns.
    awk -v exclude="${exclude_patterns[*]}" '
        BEGIN {split(exclude, patterns, " ")}
        {
            exclude_flag = 0
            for (i in patterns)
            {
                if (index($0, patterns[i]) == 1)
                {
                    exclude_flag = 1
                    break
                }
            }
            if (!exclude_flag)
            {
                print
            }
        }
    ' "${include_file}" > "${output_file}"

    # replace the callers include file with the filtered file
    rc=$?
    if [[ ${rc} -eq 0 && -f ${output_file} ]] ; then
        cp -a ${output_file} ${include_file}
        rm -f ${output_file}
    fi
    return 0
}