From 6ca8aa0f46d34ca93b5dc6e27b27007a6d026d6b Mon Sep 17 00:00:00 2001 From: Patrick Van der Veken Date: Sat, 28 Apr 2018 21:46:14 +0200 Subject: [PATCH] Added script options: --timeout && --show-stats --- sources/bin/check_health.sh | 56 ++++++++++++++++--- sources/lib/core/include_core.sh | 92 ++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 8 deletions(-) diff --git a/sources/bin/check_health.sh b/sources/bin/check_health.sh index 698af64..5c8a1d9 100644 --- a/sources/bin/check_health.sh +++ b/sources/bin/check_health.sh @@ -37,7 +37,7 @@ # ------------------------- CONFIGURATION starts here ------------------------- # define the version (YYYY-MM-DD) -typeset -r SCRIPT_VERSION="2018-03-15" +typeset -r SCRIPT_VERSION="2018-04-28" # location of parent directory containing KSH functions/HC plugins typeset -r FPATH_PARENT="/opt/hc/lib" # location of custom HC configuration files @@ -80,6 +80,7 @@ typeset HC_FAIL_ID="" typeset HC_FILE_LINE="" typeset HC_NOW="" typeset HC_TIME_OUT=60 +typeset HC_MIN_TIME_OUT=30 typeset HC_STDOUT_LOG="" typeset HC_STDERR_LOG="" typeset LINUX_DISTRO="" @@ -111,6 +112,7 @@ typeset ARG_MONITOR=1 # killing long running HC processes is on by def typeset ARG_NOTIFY="" # notification of problems is off by default typeset ARG_REVERSE=0 # show report in reverse date order is off by default typeset ARG_REPORT="" # report of HC events is off by default +typeset ARG_TIME_OUT=0 # custom timeout is off by default typeset ARG_TERSE=0 # show terse help is off by default typeset ARG_TODAY=0 # report today's events typeset ARG_VERBOSE=1 # STDOUT is on by default @@ -353,7 +355,7 @@ then then case "${ARG_HC}" in *,*) - print -u2 "ERROR: you can only specify one value for '--hc' in combination with '--show'" + print -u2 "ERROR: you can only specify a value for '--hc' in combination with '--show'" exit 1 ;; esac @@ -362,7 +364,7 @@ then then case "${ARG_HC}" in *,*) - print -u2 "ERROR: you can only specify one value for '--hc' in combination with '--archive'" + print -u2 "ERROR: you can only specify a value for '--hc' in combination with '--archive'" exit 1 ;; esac @@ -371,12 +373,34 @@ else # host checking has no other messages to display ARG_VERBOSE=0 fi -# --list -if (( ARG_ACTION == 9 )) +# --list/--show-stats +if (( ARG_ACTION == 9 || ARG_ACTION == 11 )) then ARG_VERBOSE=0 ARG_LOG=0 fi +# --timeout +if (( ARG_TIME_OUT > 0 )) +then + if (( ARG_ACTION == 4 )) + then + # keep timeout to a sensible value + if (( ARG_TIME_OUT < HC_MIN_TIME_OUT )) + then + print -u2 "ERROR: you cannot specify a value for '--timeout' smaller than ${HC_MIN_TIME_OUT} (see \$HC_MIN_TIME_OUT})" + exit 1 + fi + if (( ARG_TIME_OUT < HC_TIME_OUT )) + then + print -u2 "ERROR: you cannot specify a value for '--timeout' smaller than ${HC_TIME_OUT} (see ${CONFIG_FILE})" + exit 1 + fi + HC_TIME_OUT=${ARG_TIME_OUT} + else + print -u2 "ERROR: you can only specify a value for '--timeout' in combination with '--run'" + exit 1 + fi +fi # check log location if (( ARG_LOG != 0 )) @@ -475,8 +499,8 @@ cat << EOT Execute/report simple health checks (HC) on UNIX hosts. Syntax: ${SCRIPT_DIR}/${SCRIPT_NAME} [--help] | [--help-terse] | [--version] | - [--list=] | [--list-core] | [--fix-symlinks] | (--disable-all | enable-all) | - (--check-host | ((--archive | --check | --enable | --disable | --run | --show) --hc= [--config-file=] [hc-args="])) + [--list=] | [--list-core] | [--fix-symlinks] | [--show-stats] | (--disable-all | enable-all) | + (--check-host | ((--archive | --check | --enable | --disable | --run [--timeout=] | --show) --hc= [--config-file=] [hc-args="])) [--display=] ([--debug] [--debug-level=]) [--no-monitor] [--no-log] [--no-lock] [--flip-rc] [--notify=] [--mail-to=] [--sms-to= --sms-provider=] [--report= ( ([--last] | [--today]) | ([--reverse] [--id= [--detail]] [--with-history]) ) ] @@ -524,8 +548,10 @@ Parameters: --reverse : show the report in reverse date order (newest events first) --run : execute HC(s). --show : show information/documentation on a HC +--show-stats : show statistics on HC events (current & archived) --sms-provider : name of a supported SMS provider (see \$SMS_PROVIDERS) [requires SMS core plugin] --sms-to : name of person or group to which a sms alert will be send to [requires SMS core plugin] +--timeout : maximum runtime of a HC plugin in seconds (overrides \$HC_TIME_OUT) --today : show today's events (HC and their combined STC value) --version : show the timestamp of the script. --with-history : also include events that have been archived already (reporting) @@ -811,6 +837,9 @@ do ARG_LOG=0 ARG_VERBOSE=0 ;; + -show-stats|--show-stats) + ARG_ACTION=11 + ;; -sms-provider=*) ARG_SMS_PROVIDER="${CMD_PARAMETER#-sms-provider=}" ;; @@ -823,6 +852,12 @@ do --sms-to=*) ARG_SMS_TO="${CMD_PARAMETER#--sms-to=}" ;; + -timeout=*) + ARG_TIME_OUT="${CMD_PARAMETER#-timeout=}" + ;; + --timeout=*) + ARG_TIME_OUT="${CMD_PARAMETER#--timeout=}" + ;; -today|--today) ARG_TODAY=1 ;; @@ -1000,7 +1035,7 @@ case ${ARG_ACTION} in RUN_CONFIG_FILE=$(grep -i -E -e "^hc:${HC_RUN}:" ${HOST_CONFIG_FILE} 2>/dev/null | cut -f3 -d':') [[ -n "${RUN_CONFIG_FILE}" ]] && ARG_CONFIG_FILE="${CONFIG_DIR}/${RUN_CONFIG_FILE}" fi - + # run HC with or without monitor if (( ARG_MONITOR == 0 )) then @@ -1144,6 +1179,11 @@ case ${ARG_ACTION} in ;; esac ;; + 11) # show HC event statistics + show_statistics + ;; + + esac # finish up work diff --git a/sources/lib/core/include_core.sh b/sources/lib/core/include_core.sh index e840d2c..99d316c 100644 --- a/sources/lib/core/include_core.sh +++ b/sources/lib/core/include_core.sh @@ -1325,6 +1325,98 @@ print "${HC_STC}%%${HC_NOW}%%${HC_MSG}%%${HC_MSG_CUR_VAL}%%${HC_MSG_EXP_VAL}" \ return 0 } +# ----------------------------------------------------------------------------- +# @(#) FUNCTION: show_statistics) +# DOES: show statistics about HC events +# EXPECTS: n/a +# RETURNS: n/a +# REQUIRES: n/a +function show_statistics +{ +(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}" +typeset _ARCHIVE_FILE="" + +# current events +print +print -R "--- CURRENT events --" +print +print "${HC_LOG}:" +awk -F"${SEP}" '{ + # all entries + total_count[$2]++ + # set zero when empty + if (ok_count[$2] == "") { ok_count[$2]=0 } + if (nok_count[$2] == "") { nok_count[$2]=0 } + # count STCs + if ($3 == 0) { + ok_count[$2]++ + } else { + nok_count[$2]++ + } + # record first entry + if (first_entry[$2] == "" ) { + first_entry[$2]=$1 + } + # pile up last entry + last_entry[$2]=$1 + last_failid[$2]=$5 + } + + END { + for (hc in total_count) { + printf ("\t%s:\n", hc) + printf ("\t\t# entries: %s\n", total_count[hc]) + printf ("\t\t# STC==0 : %s\n", ok_count[hc]) + printf ("\t\t# STC<>0 : %s\n", nok_count[hc]) + printf ("\t\tfirst : %s\n", first_entry[hc]) + printf ("\t\tlast : %s\n", last_entry[hc]) + } + } + ' <${HC_LOG} 2>/dev/null + +# archived events +print; print +print -R "--- ARCHIVED events --" +print +find ${ARCHIVE_DIR} -type f -name "hc.*.log" 2>/dev/null | while read _ARCHIVE_FILE +do + print "${_ARCHIVE_FILE}:" + awk -F"${SEP}" '{ + # all entries + total_count[$2]++ + # set zero when empty + if (ok_count[$2] == "") { ok_count[$2]=0 } + if (nok_count[$2] == "") { nok_count[$2]=0 } + # count STCs + if ($3 == 0) { + ok_count[$2]++; + } else { + nok_count[$2]++ + } + # record first entry + if (first_entry[$2] == "" ) { + first_entry[$2]=$1 + } + # pile up last entry + last_entry[$2]=$1 + } + + END { + for (hc in total_count) { + printf ("\t%s:\n", hc) + printf ("\t\t# entries: %s\n", total_count[hc]) + printf ("\t\t# STC==0 : %s\n", ok_count[hc]) + printf ("\t\t# STC<>0 : %s\n", nok_count[hc]) + printf ("\t\tfirst : %s\n", first_entry[hc]) + printf ("\t\tlast : %s\n", last_entry[hc]) + } + } + ' <${_ARCHIVE_FILE} 2>/dev/null +done + +return 0 +} + # ----------------------------------------------------------------------------- # @(#) FUNCTION: stat_hc() # DOES: retrieve status of a HC