diff --git a/sources/bin/check_health.sh b/sources/bin/check_health.sh index 2569f43..73802c8 100644 --- a/sources/bin/check_health.sh +++ b/sources/bin/check_health.sh @@ -37,7 +37,7 @@ # ------------------------- CONFIGURATION starts here ------------------------- # define the version (YYYY-MM-DD) -typeset -r SCRIPT_VERSION="2017-12-22" +typeset -r SCRIPT_VERSION="2017-12-26" # location of parent directory containing KSH functions/HC plugins typeset -r FPATH_PARENT="/opt/hc/lib" # location of custom HC configuration files @@ -60,6 +60,8 @@ typeset -r LOCK_DIR="${TMP_DIR}/.${SCRIPT_NAME}.lock" typeset -r HC_MSG_FILE="${TMP_DIR}/.${SCRIPT_NAME}.hc.msg.$$" # plugin messages files typeset -r SEP="|" typeset -r LOG_DIR="/var/opt/hc" +typeset -r LOG_FILE="${LOG_DIR}/${SCRIPT_NAME}.log" +typeset -r ARCHIVE_DIR="${LOG_DIR}/archive" typeset -r EVENTS_DIR="${LOG_DIR}/events" typeset -r STATE_DIR="${LOG_DIR}/state" typeset -r STATE_PERM_DIR="${STATE_DIR}/persistent" @@ -82,6 +84,9 @@ typeset HC_STDOUT_LOG="" typeset HC_STDERR_LOG="" typeset LINUX_DISTRO="" typeset LINUX_RELEASE="" +typeset ARCHIVE_RC=0 +typeset DISABLE_RC=0 +typeset ENABLE_RC=0 typeset RUN_RC=0 typeset SORT_CMD="" typeset DEBUG_OPTS="" @@ -96,6 +101,7 @@ typeset ARG_DISPLAY="" # display is STDOUT by default typeset ARG_FAIL_ID="" typeset ARG_HC="" typeset ARG_HC_ARGS="" # no extra arguments to HC plug-in by default +typeset ARG_HISTORY=0 # include historical events is off by default typeset ARG_LAST=0 # report last events typeset ARG_LIST="" # list all by default typeset ARG_LOCK=1 # lock for concurrent script executions is on by default @@ -159,6 +165,18 @@ then print -u2 "ERROR: you must define a value for the EXEC_USER setting in $0" exit 1 fi +# SCRIPT_VERSION +if [[ -z "${SCRIPT_VERSION}" ]] +then + print -u2 "ERROR: you must define a value for the SCRIPT_VERSION setting in $0" + exit 1 +fi +# TMP_DIR +if [[ -z "${TMP_DIR}" ]] +then + print -u2 "ERROR: you must define a value for the TMP_DIR setting in 
$0" + exit 1 +fi # FPATH_PARENT if [[ -z "${FPATH_PARENT}" ]] then @@ -211,6 +229,11 @@ else fi # check for core directories +[[ -d ${ARCHIVE_DIR} ]] || mkdir -p "${ARCHIVE_DIR}" >/dev/null 2>&1 +if [[ ! -d "${ARCHIVE_DIR}" ]] || [[ ! -w "${ARCHIVE_DIR}" ]] +then + print -u2 "ERROR: unable to access the archive directory at ${ARCHIVE_DIR}" +fi [[ -d ${EVENTS_DIR} ]] || mkdir -p "${EVENTS_DIR}" >/dev/null 2>&1 if [[ ! -d "${EVENTS_DIR}" ]] || [[ ! -w "${EVENTS_DIR}" ]] then @@ -297,7 +320,7 @@ then exit 1 fi fi -# --check-host,--check/--disable/--enable/--run/--show,--hc +# --check-host,--check/--disable/--enable/--run/--show/--archive,--hc if [[ -n "${ARG_HC}" ]] && (( ARG_ACTION == 0 )) then print -u2 "ERROR: you must specify an action for the HC (--check/--disable/--enable/--run/--show)" @@ -305,7 +328,7 @@ then fi if (( ARG_CHECK_HOST == 0 )) then - if (( ARG_ACTION < 6 )) && [[ -z "${ARG_HC}" ]] + if (( ARG_ACTION < 6 || ARG_ACTION == 10 )) && [[ -z "${ARG_HC}" ]] then print -u2 "ERROR: you specify a value for parameter '--hc'" exit 1 @@ -319,6 +342,15 @@ then ;; esac fi + if (( ARG_ACTION == 10 )) || [[ -n "${ARG_HC_ARGS}" ]] + then + case "${ARG_HC}" in + *,*) + print -u2 "ERROR: you can only specify one value for '--hc' in combination with '--archive'" + exit 1 + ;; + esac + fi else # host checking has no other messages to display ARG_VERBOSE=0 @@ -329,8 +361,8 @@ then ARG_VERBOSE=0 ARG_LOG=0 fi -# --log-dir -LOG_FILE="${LOG_DIR}/${SCRIPT_NAME}.log" + +# check log location if (( ARG_LOG != 0 )) then if [[ ! -d "${LOG_DIR}" ]] || [[ ! -w "${LOG_DIR}" ]] @@ -428,10 +460,10 @@ Execute/report simple health checks (HC) on UNIX hosts. 
Syntax: ${SCRIPT_DIR}/${SCRIPT_NAME} [--help] | [--help-terse] | [--version] | [--list=] | [--list-core] | [--fix-symlinks] | (--disable-all | enable-all) | - (--check-host | ((--check | --enable | --disable | --run | --show) --hc= [--config-file=] [hc-args="])) + (--check-host | ((--archive | --check | --enable | --disable | --run | --show) --hc= [--config-file=] [hc-args="])) [--display=] ([--debug] [--debug-level=]) [--no-monitor] [--no-log] [--no-lock] [--notify=] [--mail-to=] [--sms-to= --sms-provider=] - [--report= ( ([--last] | [--today]) | ([--reverse] [--id= [--detail]]) ) ] + [--report= ( ([--last] | [--today]) | ([--reverse] [--id= [--detail]] [--with-history]) ) ] EOT @@ -440,6 +472,7 @@ then cat << EOT Parameters: +--archive : move events from the HC log file into archive log files --check : display HC state. --check-host : execute all configured HC(s) (see check_host.conf) --config-file : custom configuration file for a HC, may only be specified when executing a single HC plugin. @@ -477,6 +510,7 @@ Parameters: --sms-to : name of person or group to which a sms alert will be send to [requires SMS core plugin] --today : show today's events (HC and their combined STC value) --version : show the timestamp of the script. 
+--with-history : also include events that have been archived already (reporting)
 
 EOT
 fi
@@ -598,6 +632,9 @@ CMD_LINE="$*"
 for CMD_PARAMETER in ${CMD_LINE}
 do
     case ${CMD_PARAMETER} in
+        -archive|--archive)
+            ARG_ACTION=10
+            ;;
         -check|--check)
             ARG_ACTION=1
             ;;
@@ -647,6 +684,15 @@ do
         -enable-all|--enable-all)
             ARG_ACTION=7
             ;;
+        -f|-fix-symlinks|--fix-symlinks)
+            read_config
+            check_config
+            build_fpath
+            check_shell
+            check_user
+            fix_symlinks
+            exit 0
+            ;;
         -hc=*)
             ARG_HC="${CMD_PARAMETER#-hc=}"
             ;;
@@ -659,14 +705,8 @@ do
         --hc-args=*)
             ARG_HC_ARGS="${CMD_PARAMETER#--hc-args=}"
             ;;
-        -f|-fix-symlinks|--fix-symlinks)
-            read_config
-            check_config
-            build_fpath
-            check_shell
-            check_user
-            fix_symlinks
-            exit 0
+        -with-history|--with-history)
+            ARG_HISTORY=1
             ;;
         -id=*)
             ARG_FAIL_ID="${CMD_PARAMETER#-id=}"
@@ -1028,7 +1068,8 @@ case ${ARG_ACTION} in
             exists_hc "${HC_DISABLE}" && die "cannot find HC: ${HC_DISABLE}"
             log "disabling HC: ${HC_DISABLE}"
             touch "${STATE_PERM_DIR}/${HC_DISABLE}.disabled" >/dev/null 2>&1
-            if (( $? == 0 ))
+            DISABLE_RC=$?
+            if (( DISABLE_RC == 0 ))
             then
                 log "successfully disabled HC: ${HC_DISABLE}"
             else
@@ -1046,7 +1087,8 @@ case ${ARG_ACTION} in
             [[ -d ${STATE_PERM_DIR} ]] || \
                 die "state directory does not exist, all HC(s) are enabled"
             rm -f "${STATE_PERM_DIR}/${HC_ENABLE}.disabled" >/dev/null 2>&1
-            if (( $? == 0 ))
+            ENABLE_RC=$?
+            if (( ENABLE_RC == 0 ))
             then
                 log "successfully enabled HC: ${HC_ENABLE}"
             else
@@ -1061,6 +1103,24 @@ case ${ARG_ACTION} in
     9)  # list HC plugins
         list_hc "" "${ARG_LIST}"
         ;;
+    10) # archive log entries
+        exists_hc "${ARG_HC}" && die "cannot find HC: ${ARG_HC}"
+        log "archiving log entries for ${ARG_HC}..."
+        archive_hc "${ARG_HC}"
+        ARCHIVE_RC=$?
+        case ${ARCHIVE_RC} in
+            0)
+                log "no archiving needed for ${ARG_HC}"
+                ;;
+            1)
+                log "successfully archived log entries for ${ARG_HC}"
+                ;;
+            2)
+                log "failed to archive log entries for ${ARG_HC} [RC=${ARCHIVE_RC}]"
+                EXIT_CODE=1
+                ;;
+        esac
+        ;;
 esac
 
 # finish up work
diff --git a/sources/lib/core/include_core.sh b/sources/lib/core/include_core.sh
index ad7d8a1..2e33c14 100644
--- a/sources/lib/core/include_core.sh
+++ b/sources/lib/core/include_core.sh
@@ -23,6 +23,77 @@
 # DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING!
 #******************************************************************************
 
+# -----------------------------------------------------------------------------
+# @(#) FUNCTION: archive_hc()
+# DOES: archive log entries for a given HC
+# EXPECTS: HC name [string]
+# RETURNS: 0=no archiving needed; 1=archiving OK; 2=archiving NOK
+# REQUIRES: log(), warn()
+function archive_hc
+{
+(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
+typeset HC_NAME="$1"
+typeset ARCHIVE_FILE=""
+typeset YEAR_MONTH=""
+typeset LOG_COUNT=0
+typeset ARCHIVE_RC=0
+typeset SAVE_HC_LOG="${HC_LOG}.$$"
+typeset TMP_FILE="${TMP_DIR}/.$0.tmp.archive.$$"
+
+# set local trap for cleanup; an interrupted run must report NOK (2), not OK (1)
+trap "rm -f ${TMP_FILE} ${SAVE_HC_LOG} >/dev/null 2>&1; return 2" 1 2 3 15
+
+# isolate messages from HC, find unique %Y-%m combinations
+# (ARCHIVE_RC is set inside the loop: this relies on ksh running the last
+# stage of a pipeline in the current shell)
+grep ".*${SEP}${HC_NAME}${SEP}" ${HC_LOG} 2>/dev/null |\
+    cut -f1 -d"${SEP}" | cut -f1 -d' ' | cut -f1-2 -d'-' | sort -u |\
+    while read YEAR_MONTH
+do
+    # find all messages for that YEAR-MONTH combination (timestamp leads the line)
+    grep "^${YEAR_MONTH}.*${SEP}${HC_NAME}${SEP}" ${HC_LOG} >${TMP_FILE}
+    LOG_COUNT=$(( $(wc -l <${TMP_FILE}) ))
+    log "# of new entries to archive: ${LOG_COUNT}"
+
+    # combine existing archived messages and resort; 'sort -o' may name one of
+    # its own inputs, whereas '>${ARCHIVE_FILE}' would truncate the archive
+    # before it has been read
+    ARCHIVE_FILE="${ARCHIVE_DIR}/hc.${YEAR_MONTH}.log"
+    [[ -f ${ARCHIVE_FILE} ]] || : >${ARCHIVE_FILE}
+    sort -u -o ${ARCHIVE_FILE} ${ARCHIVE_FILE} ${TMP_FILE} 2>/dev/null
+    if (( $? != 0 ))
+    then
+        warn "unable to update ${ARCHIVE_FILE}, ${HC_LOG} left untouched"
+        ARCHIVE_RC=2
+        continue
+    fi
+    LOG_COUNT=$(( $(wc -l <${ARCHIVE_FILE}) ))
+    log "# entries in ${ARCHIVE_FILE} now: ${LOG_COUNT}"
+
+    # remove archived messages from the $HC_LOG (but create a backup first!)
+    # grep -v RC: 0=entries remain, 1=nothing remains, >1=error; selecting the
+    # complement of the archived lines does not need a sorted $HC_LOG
+    cp -p ${HC_LOG} ${SAVE_HC_LOG} 2>/dev/null
+    grep -v "^${YEAR_MONTH}.*${SEP}${HC_NAME}${SEP}" ${HC_LOG} >${TMP_FILE} 2>/dev/null
+    if (( $? < 2 )) && mv ${TMP_FILE} ${HC_LOG} 2>/dev/null
+    then
+        LOG_COUNT=$(( $(wc -l <${HC_LOG}) ))
+        log "# entries in ${HC_LOG} now: ${LOG_COUNT}"
+        # never let a later success mask an earlier failure
+        (( ARCHIVE_RC < 2 )) && ARCHIVE_RC=1
+    else
+        warn "a problem occurred. Rolling back archival"
+        mv ${SAVE_HC_LOG} ${HC_LOG} 2>/dev/null
+        ARCHIVE_RC=2
+    fi
+done
+
+# clean up temporary file(s)
+rm -f ${TMP_FILE} ${SAVE_HC_LOG} >/dev/null 2>&1
+
+return ${ARCHIVE_RC}
+}
+
 # -----------------------------------------------------------------------------
 # @(#) FUNCTION: debug()
 # DOES: handle debug messages
@@ -391,7 +462,7 @@ if (( DO_NOTIFY_SMS != 0 )) && [[ -z "${ARG_SMS_PROVIDER}" ]]
 then
     die "you cannot specify '--notify=sms' without '--sms-provider'"
 fi
-# --report/--detail/--id/--reverse/--last/--today
+# --report/--detail/--id/--reverse/--last/--today/--with-history
 if (( DO_REPORT_STD != 0 ))
 then
     if (( ARG_DETAIL != 0 )) && [[ -z "${ARG_FAIL_ID}" ]]
@@ -426,6 +497,11 @@ then
     then
         die "you cannot specify '--today' with '--id'"
     fi
+    # switch on history for --last & --today
+    if (( ARG_LAST != 0 )) || (( ARG_TODAY != 0 ))
+    then
+        ARG_HISTORY=1
+    fi
 fi
 if (( DO_REPORT_STD == 0 )) && (( ARG_LAST != 0 ))
 then
@@ -439,6 +515,10 @@ if (( DO_REPORT_STD == 0 )) && (( ARG_DETAIL != 0 ))
 then
     die "you cannot specify '--detail' without '--report'"
 fi
+if (( DO_REPORT_STD == 0 )) && (( ARG_HISTORY != 0 ))
+then
+    die "you cannot specify '--with-history' without '--report'"
+fi
 if (( DO_REPORT_STD == 0 )) && [[ -n "${ARG_FAIL_ID}" ]]
 then
     die "you cannot specify '--id' without '--report'"
diff --git a/sources/lib/core/report_std.sh b/sources/lib/core/report_std.sh
index d3c318f..4ac0314 100644
--- a/sources/lib/core/report_std.sh
+++ b/sources/lib/core/report_std.sh
@@ -30,7 +30,7 @@ function report_std
 {
 # -------------------------
CONFIGURATION starts here -------------------------
-typeset _VERSION="2017-12-15" # YYYY-MM-DD
+typeset _VERSION="2017-12-26" # YYYY-MM-DD
 typeset _SUPPORTED_PLATFORMS="AIX,HP-UX,Linux" # uname -s match
 # ------------------------- CONFIGURATION ends here ---------------------------
 
@@ -52,9 +52,19 @@ typeset _HC_LAST_FAIL_ID="-"
 typeset _HC_LAST_EVENT_FAIL_ID=0
 typeset _HC_LAST_EVENT_STC=""
 typeset _ID_NEEDLE=""
+typeset _LOG_STASH=""
 typeset _REPORT_LINE=""
 typeset _SORT_CMD=""
 
+# which files do we need to examine
+if (( ARG_HISTORY != 0 ))
+then
+    set +f # file globbing must be on
+    _LOG_STASH="${HC_LOG} ${ARCHIVE_DIR}/hc.*.log"
+else
+    _LOG_STASH="${HC_LOG}"
+fi
+
 # --last report
 if (( ARG_LAST != 0 ))
 then
@@ -68,14 +78,14 @@ then
         _HC_LAST_FAIL_ID="-"
         # find last event or block of events (same timestamp)
         # (but unfortunately this is only accurate to events within the SAME second!)
-        _HC_LAST_TIME="$(grep ${_HC_LAST} ${HC_LOG} 2>/dev/null | sort -n | cut -f1 -d${SEP} | uniq | tail -1)"
+        _HC_LAST_TIME="$(grep -h ${_HC_LAST} ${_LOG_STASH} 2>/dev/null | sort -n | cut -f1 -d${SEP} | uniq | tail -1)"
         if [[ -z "${_HC_LAST_TIME}" ]]
         then
             _HC_LAST_TIME="-"
             _HC_LAST_STC="-"
         else
             # find all STC codes for the last event and add them up
-            grep "${_HC_LAST_TIME}${SEP}${HC_LAST}" ${HC_LOG} 2>/dev/null |\
+            grep -h "${_HC_LAST_TIME}${SEP}${_HC_LAST}" ${_LOG_STASH} 2>/dev/null |\
                 while read -r _REPORT_LINE
             do
                 _HC_LAST_EVENT_STC=$(print "${_REPORT_LINE}" | cut -f3 -d"${SEP}")
@@ -99,7 +109,7 @@ else
     (( ARG_TODAY != 0 )) && _ID_NEEDLE="$(date '+%Y%m%d')" # refers to timestamp of HC FAIL_ID
 
     # check fail count (look for unique IDs in the 5th field of the HC log)
-    _FAIL_COUNT=$(cut -f5 -d"${SEP}" ${HC_LOG} 2>/dev/null | grep -E -e "${_ID_NEEDLE}" | uniq | wc -l)
+    _FAIL_COUNT=$(cut -f5 -d"${SEP}" ${_LOG_STASH} 2>/dev/null | grep -E -e "${_ID_NEEDLE}" | uniq | wc -l)
     if (( _FAIL_COUNT != 0 ))
     then
         # check for detail or not?
@@ -123,7 +133,7 @@ else # print failed events # no extended grep here and no end $SEP! - grep ".*${SEP}.*${SEP}.*${SEP}.*${SEP}${_ID_NEEDLE}" ${HC_LOG} 2>/dev/null |\ + grep -h ".*${SEP}.*${SEP}.*${SEP}.*${SEP}${_ID_NEEDLE}" ${_LOG_STASH} 2>/dev/null |\ ${_SORT_CMD} | while read -r _REPORT_LINE do _FAIL_F1=$(print "${_REPORT_LINE}" | cut -f1 -d"${SEP}") @@ -141,7 +151,7 @@ else _EVENT_COUNT=1 _DIR_PREFIX="$(expr substr ${ARG_FAIL_ID} 1 4)-$(expr substr ${ARG_FAIL_ID} 5 2)" # no extended grep here! - grep ".*${SEP}.*${SEP}.*${SEP}.*${SEP}${_ID_NEEDLE}${SEP}" ${HC_LOG} 2>/dev/null |\ + grep -h ".*${SEP}.*${SEP}.*${SEP}.*${SEP}${_ID_NEEDLE}${SEP}" ${_LOG_STASH} 2>/dev/null |\ ${_SORT_CMD} | while read -r _REPORT_LINE do _FAIL_F1=$(print "${_REPORT_LINE}" | cut -f1 -d"${SEP}")