From 0b079b4804d635fb80a8eafeeb0a8b61ab486951 Mon Sep 17 00:00:00 2001 From: Eric MacDonald Date: Sun, 12 Feb 2023 15:15:43 -0500 Subject: [PATCH] Add --timeout option to collect tool This update adds a new --timeout command line option to the collect tool so that users can extend collect's global timeout. Prior to this update the collect tool had a fixed 1000 second (about 16.7 minute) timeout. Collecting from hosts in large, busy systems can take an unpredictably long time, sometimes longer than 1000 seconds. This can be particularly true when collecting from the active controller deploying and managing lots of pods across many hosts. This new timeout option allows the user to specify a timeout in minutes, between 10 and 120, while defaulting to 20 minutes. The default or user specified global timeout is passed to subclouds for subcloud collect as well. Test Plan: PASS: Verify new --timeout or -t options at command line arg level PASS: Verify --timeout parse; error, in and out of bounds PASS: Verify timeout option is described in collect help PASS: Verify 110 minute collect with --timeout 120 PASS: Verify 45 minute collect times out with --timeout 40 PASS: Verify 2 minute collect with --timeout 10 PASS: Verify default timeout is 20 minutes PASS: Verify default or specified timeout is displayed PASS: Verify default or specified timeout is shared with the subcloud PASS: Verify timeout error handling. PASS: Verify collect error handling behavior if --timeout or -t is specified but the number of minutes is missing. 
Regression: PASS: Verify collect system and subcloud handling PASS: Verify system and subcloud dated collects ; verified content PASS: Verify collect with a variety of options Closes-Bug: 2004666 Signed-off-by: Eric MacDonald Change-Id: Ib68b78f7c810f43fc8d13cbf291ac00f08c3c4f4 --- tools/collector/debian-scripts/collect | 42 ++++++++++++++++++-- tools/collector/debian-scripts/collect_utils | 1 + 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/tools/collector/debian-scripts/collect b/tools/collector/debian-scripts/collect index 534d0a66..e37b4b35 100644 --- a/tools/collector/debian-scripts/collect +++ b/tools/collector/debian-scripts/collect @@ -342,6 +342,11 @@ function print_help() echo " collect -a -sc [--inline | -in] ... collect logs for all subclouds one after the other" echo " collect --subcloud --continue ... continue a suspended subcloud collect" echo "" + echo "Collect Timeout" + echo "" + echo "collect [--timeout | -t] <minutes> ... collect with user specified timeout" + echo " valid change range is 10-120 minutes" + echo " default: 20 mins" echo "Dated Collect:" echo "" echo "collect [--start-date | -s] YYYYMMDD ... 
collection of logs on and after this date" @@ -415,10 +420,16 @@ COLLECT_CONTINUE_MSG_NEEDED=false SUBCLOUD_COLLECT_CONTINUE=false SUBCLOUD_COLLECT_CONTINUE_LIST_FILE="/tmp/collect_continue.lst" +declare -i TIMEOUT_MIN_MINS=10 +declare -i TIMEOUT_MAX_MINS=120 +declare -i TIMEOUT_DEF_MINS=20 +declare -i TIMEOUT_MIN_SECS=$(($TIMEOUT_MIN_MINS*60)) # 10 minutes +declare -i TIMEOUT_MAX_SECS=$(($TIMEOUT_MAX_MINS*60)) # 120 minutes +declare -i TIMEOUT_DEF_SECS=$(($TIMEOUT_DEF_MINS*60)) # 20 minutes + # overall collect timeout -TIMEOUT=1000 +declare -i TIMEOUT=${TIMEOUT_DEF_SECS} SECONDS=0 -let UNTIL=${SECONDS}+${TIMEOUT} COLLECT_NAME="" @@ -707,6 +718,22 @@ while [[ ${#} -gt 0 ]] ; do clear_variable_args ;; + -t|--timeout) + if [[ ${2} =~ ^[0-9]+$ ]] ; then + if [ ${2} -lt ${TIMEOUT_MIN_MINS} -o \ + ${2} -gt ${TIMEOUT_MAX_MINS} ] ; then + elog "timeout must be between ${TIMEOUT_MIN_MINS} and ${TIMEOUT_MAX_MINS} minutes" + collect_exit ${FAIL_TIMEOUT_ARG} + else + TIMEOUT="$((10#${2}*60))" # force base 10 ; a leading 0 would otherwise be parsed as octal + fi + else + elog "timeout value must be an integer" + collect_exit ${FAIL_TIMEOUT_ARG} + fi + shift + ;; + --skip-mask) SKIP_MASK=true shift @@ -758,6 +785,9 @@ while [[ ${#} -gt 0 ]] ; do shift # past argument or value done +# The default TIMEOUT may have been revised with the --timeout option. +# Update UNTIL with updated global timeout time in secs. 
+let UNTIL=${SECONDS}+${TIMEOUT} date -d $STARTDATE > /dev/null 2>/dev/null rc_start_date=${?} @@ -1093,6 +1123,8 @@ pw=${pw/\[/\\\[} # replace '[' with '\[' pw=${pw/$/\\$} # replace '$' with '\$' pw=${pw/\"/\\\"} # replace '"' with '\"' +ilog "collect bundle timeout set to $((${TIMEOUT}/60)) minutes" + ########################################################################### # # Name : passwordless_sudo_test @@ -1908,6 +1940,10 @@ function collect_subcloud_run() collect_cmd+=("-v") fi + # pass the timeout to the subcloud ; TIMEOUT is in secs, -t takes minutes + collect_cmd+=("-t $((${TIMEOUT}/60))") + + # pass the date range to the subcloud collect_cmd+=("--start-date ${STARTDATE}") collect_cmd+=("--end-date $ENDDATE") @@ -3068,7 +3104,7 @@ if [ "${SUBCLOUD_COLLECT}" = true ] ; then if [ ${SUBCLOUDS} -gt ${TIMEOUT_THRESHOLD_FACTOR} -a "${PARALLEL_COLLECT_MODE}" = true ] ; then # adjust overall timeout to account for the large number of subclouds let UNTIL=$(((SUBCLOUDS*SUBCLOUDS_TIMEOUT_BOOST)+TIMEOUT)) - ilog "adjusted subcloud collect timout from ${TIMEOUT} to ${UNTIL} secs to account for ${SUBCLOUDS} subclouds" + ilog "adjusted subcloud collect timeout from ${TIMEOUT} to ${UNTIL} secs to account for ${SUBCLOUDS} subclouds" fi if [ "${ALLHOSTS}" = true ] ; then if [ ${SUBCLOUDS} -gt ${MAX_LIST_PRINT} ] ; then diff --git a/tools/collector/debian-scripts/collect_utils b/tools/collector/debian-scripts/collect_utils index ba9af14d..f0b495e0 100755 --- a/tools/collector/debian-scripts/collect_utils +++ b/tools/collector/debian-scripts/collect_utils @@ -58,6 +58,7 @@ FAIL_NAME_TOO_LONG=55 FAIL_INVALID_START_DATE=56 FAIL_INVALID_END_DATE=57 FAIL_INVALID_DATE_RANGE=58 +FAIL_TIMEOUT_ARG=59 # Warnings are above 200 WARN_WARNING=200