* Fix for rogue log entries (via fix_logs()). When not using run locks (--use-lock) it is possible that log entries are not always written in a completely atomic manner.

* Other fixes
This commit is contained in:
Patrick Van der Veken 2018-05-29 21:17:39 +02:00
parent dad23525e3
commit 4554d25073
3 changed files with 381 additions and 53 deletions

View File

@ -37,7 +37,7 @@
# ------------------------- CONFIGURATION starts here ------------------------- # ------------------------- CONFIGURATION starts here -------------------------
# define the version (YYYY-MM-DD) # define the version (YYYY-MM-DD)
typeset -r SCRIPT_VERSION="2018-05-20" typeset -r SCRIPT_VERSION="2018-05-29"
# location of parent directory containing KSH functions/HC plugins # location of parent directory containing KSH functions/HC plugins
typeset -r FPATH_PARENT="/opt/hc/lib" typeset -r FPATH_PARENT="/opt/hc/lib"
# location of custom HC configuration files # location of custom HC configuration files
@ -58,9 +58,10 @@ typeset -r HOST_NAME="$(hostname)"
typeset -r OS_NAME="$(uname -s)" typeset -r OS_NAME="$(uname -s)"
typeset -r LOCK_DIR="${TMP_DIR}/.${SCRIPT_NAME}.lock" typeset -r LOCK_DIR="${TMP_DIR}/.${SCRIPT_NAME}.lock"
typeset -r HC_MSG_FILE="${TMP_DIR}/.${SCRIPT_NAME}.hc.msg.$$" # plugin messages files typeset -r HC_MSG_FILE="${TMP_DIR}/.${SCRIPT_NAME}.hc.msg.$$" # plugin messages files
typeset -r LOG_SEP="|" # single character only typeset -r LOG_SEP="|" # single character only
typeset -r MSG_SEP="%" # single character only typeset -r MSG_SEP="%" # single character only
typeset -r MAGIC_QUOTE="!_!" # magic quote typeset -t NUM_LOG_FIELDS=6 # current number of fields in $HC_LOG + 1
typeset -r MAGIC_QUOTE="!_!" # magic quote
typeset -r LOG_DIR="/var/opt/hc" typeset -r LOG_DIR="/var/opt/hc"
typeset -r LOG_FILE="${LOG_DIR}/${SCRIPT_NAME}.log" typeset -r LOG_FILE="${LOG_DIR}/${SCRIPT_NAME}.log"
typeset -r ARCHIVE_DIR="${LOG_DIR}/archive" typeset -r ARCHIVE_DIR="${LOG_DIR}/archive"
@ -91,6 +92,7 @@ typeset LINUX_RELEASE=""
typeset ARCHIVE_RC=0 typeset ARCHIVE_RC=0
typeset DISABLE_RC=0 typeset DISABLE_RC=0
typeset ENABLE_RC=0 typeset ENABLE_RC=0
typeset FIX_RC=0        # return code of fix_logs() (--fix-logs)
typeset RUN_RC=0 typeset RUN_RC=0
typeset RUN_CONFIG_FILE="" typeset RUN_CONFIG_FILE=""
typeset RUN_TIME_OUT=0 typeset RUN_TIME_OUT=0
@ -384,6 +386,12 @@ then
ARG_VERBOSE=0 ARG_VERBOSE=0
ARG_LOG=0 ARG_LOG=0
fi fi
# --fix-logs
if (( ARG_ACTION == 12 )) && [[ -n "${ARG_HC}" ]]
then
print -u2 "ERROR: you can only use '--fix-logs' in combination with '--with-history'"
exit 1
fi
# --timeout # --timeout
if (( ARG_TIME_OUT > 0 )) if (( ARG_TIME_OUT > 0 ))
then then
@ -504,8 +512,8 @@ cat << EOT
Execute/report simple health checks (HC) on UNIX hosts. Execute/report simple health checks (HC) on UNIX hosts.
Syntax: ${SCRIPT_DIR}/${SCRIPT_NAME} [--help] | [--help-terse] | [--version] | Syntax: ${SCRIPT_DIR}/${SCRIPT_NAME} [--help] | [--help-terse] | [--version] |
[--list=<needle>] | [--list-core] | [--fix-symlinks] | [--show-stats] | (--disable-all | enable-all) | [--list=<needle>] | [--list-core] | [--fix-symlinks] | [--show-stats] | (--disable-all | enable-all) | [--fix-logs [--with-history]] |
(--check-host | ((--archive | --check | --enable | --disable | --run [--timeout=<secs>] | --show) --hc=<list_of_checks> [--config-file=<configuration_file>] [hc-args="<arg1,arg2=val,arg3">])) (--check-host | ((--archive | --check | --enable | --disable | --run [--timeout=<secs>] | --show) --hc=<list_of_checks> [--config-file=<configuration_file>] [hc-args="<arg1,arg2=val,arg3">]))
[--display=<method>] ([--debug] [--debug-level=<level>]) [--no-monitor] [--no-log] [--no-lock] [--flip-rc] [--display=<method>] ([--debug] [--debug-level=<level>]) [--no-monitor] [--no-log] [--no-lock] [--flip-rc]
[--notify=<method_list>] [--mail-to=<address_list>] [--sms-to=<sms_rcpt> --sms-provider=<name>] [--notify=<method_list>] [--mail-to=<address_list>] [--sms-to=<sms_rcpt> --sms-provider=<name>]
[--report=<method> ( ([--last] | [--today]) | ([--reverse] [--id=<fail_id> [--detail]] [--with-history]) ) ] [--report=<method> ( ([--last] | [--today]) | ([--reverse] [--id=<fail_id> [--detail]] [--with-history]) ) ]
@ -529,6 +537,7 @@ Parameters:
--display : display HC results in a formatted way. Default is STDOUT (see --list-core for available formats) --display : display HC results in a formatted way. Default is STDOUT (see --list-core for available formats)
--enable : enable HC(s). --enable : enable HC(s).
--enable-all : enable all HCs. --enable-all : enable all HCs.
--fix-logs : fix rogue log entries (can be used with --with-history)
--fix-symlinks : update symbolic links for the KSH autoloader. --fix-symlinks : update symbolic links for the KSH autoloader.
--flip-rc : exit the health checker with the RC (return code) of the HC plugin instead of its own RC (will be discarded) --flip-rc : exit the health checker with the RC (return code) of the HC plugin instead of its own RC (will be discarded)
This option may only be specified when executing a single HC plugin This option may only be specified when executing a single HC plugin
@ -680,16 +689,34 @@ CMD_LINE="$*"
[[ -z "${CMD_LINE}" ]] && display_usage && exit 0 [[ -z "${CMD_LINE}" ]] && display_usage && exit 0
for CMD_PARAMETER in ${CMD_LINE} for CMD_PARAMETER in ${CMD_LINE}
do do
# ARG_ACTION is a toggle, do not allow double toggles
case ${CMD_PARAMETER} in case ${CMD_PARAMETER} in
-archive|--archive) -archive|--archive)
ARG_ACTION=10 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else"
exit 1
else
ARG_ACTION=10
fi
ARG_LOCK=1
;; ;;
-check|--check) -check|--check)
ARG_ACTION=1 ARG_ACTION=1
;; ;;
-c|-check-host|--check-host) -c|-check-host|--check-host)
if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=4
fi
ARG_CHECK_HOST=1 ARG_CHECK_HOST=1
ARG_ACTION=4
;; ;;
-config-file=*) -config-file=*)
ARG_CONFIG_FILE="${CMD_PARAMETER#-config-file=}" ARG_CONFIG_FILE="${CMD_PARAMETER#-config-file=}"
@ -712,10 +739,22 @@ do
ARG_DETAIL=1 ARG_DETAIL=1
;; ;;
-d|-disable|--disable) -d|-disable|--disable)
ARG_ACTION=2 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=2
fi
;; ;;
-disable-all|--disable-all) -disable-all|--disable-all)
ARG_ACTION=6 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=6
fi
;; ;;
-display|--display) -display|--display)
# STDOUT as default # STDOUT as default
@ -728,10 +767,22 @@ do
ARG_DISPLAY="${CMD_PARAMETER#--display=}" ARG_DISPLAY="${CMD_PARAMETER#--display=}"
;; ;;
-e|-enable|--enable) -e|-enable|--enable)
ARG_ACTION=3 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=3
fi
;; ;;
-enable-all|--enable-all) -enable-all|--enable-all)
ARG_ACTION=7 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=7
fi
;; ;;
-f|-fix-symlinks|--fix-symlinks) -f|-fix-symlinks|--fix-symlinks)
read_config read_config
@ -742,6 +793,16 @@ do
fix_symlinks fix_symlinks
exit 0 exit 0
;; ;;
-fix-logs|--fix-logs)
if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=12
fi
ARG_LOCK=1
;;
-flip-rc|--flip-rc) -flip-rc|--flip-rc)
ARG_FLIP_RC=1 ARG_FLIP_RC=1
;; ;;
@ -770,15 +831,33 @@ do
ARG_LAST=1 ARG_LAST=1
;; ;;
-list|--list) -list|--list)
ARG_ACTION=9 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=9
fi
;; ;;
-list=*) -list=*)
ARG_LIST="${CMD_PARAMETER#-list=}" ARG_LIST="${CMD_PARAMETER#-list=}"
ARG_ACTION=9 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=9
fi
;; ;;
--list=*) --list=*)
ARG_LIST="${CMD_PARAMETER#--list=}" ARG_LIST="${CMD_PARAMETER#--list=}"
ARG_ACTION=9 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=9
fi
;; ;;
-list-hc|--list-hc|-list-all|--list-all) -list-hc|--list-hc|-list-all|--list-all)
print -u2 "WARN: deprecated option. Use --list | --list=<needle>" print -u2 "WARN: deprecated option. Use --list | --list=<needle>"
@ -816,34 +895,69 @@ do
ARG_MONITOR=0 ARG_MONITOR=0
;; ;;
-report|--report) # compatability support <2017-12-15 -report|--report) # compatability support <2017-12-15
if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=8
fi
# STDOUT as default # STDOUT as default
ARG_REPORT="std" ARG_REPORT="std"
ARG_LOG=0; ARG_VERBOSE=0 ARG_LOG=0; ARG_VERBOSE=0
ARG_ACTION=8
;; ;;
-report=*) -report=*)
if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=8
fi
ARG_REPORT="${CMD_PARAMETER#-report=}" ARG_REPORT="${CMD_PARAMETER#-report=}"
ARG_LOG=0; ARG_VERBOSE=0 ARG_LOG=0; ARG_VERBOSE=0
ARG_ACTION=8
;; ;;
--report=*) --report=*)
if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=8
fi
ARG_REPORT="${CMD_PARAMETER#--report=}" ARG_REPORT="${CMD_PARAMETER#--report=}"
ARG_LOG=0; ARG_VERBOSE=0 ARG_LOG=0; ARG_VERBOSE=0
ARG_ACTION=8
;; ;;
-reverse|--reverse) -reverse|--reverse)
ARG_REVERSE=1 ARG_REVERSE=1
;; ;;
-r|-run|--run) -r|-run|--run)
ARG_ACTION=4 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=4
fi
;; ;;
-s|-show|--show) -s|-show|--show)
ARG_ACTION=5 if (( ARG_ACTION > 0 ))
ARG_LOG=0 then
ARG_VERBOSE=0 print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=5
fi
ARG_LOG=0; ARG_VERBOSE=0
;; ;;
-show-stats|--show-stats) -show-stats|--show-stats)
ARG_ACTION=11 if (( ARG_ACTION > 0 ))
then
print -u2 "ERROR: you cannot request two actions at the same time"
exit 1
else
ARG_ACTION=11
fi
;; ;;
-sms-provider=*) -sms-provider=*)
ARG_SMS_PROVIDER="${CMD_PARAMETER#-sms-provider=}" ARG_SMS_PROVIDER="${CMD_PARAMETER#-sms-provider=}"
@ -921,8 +1035,8 @@ fi
log "*** start of ${SCRIPT_NAME} [${CMD_LINE}] ***" log "*** start of ${SCRIPT_NAME} [${CMD_LINE}] ***"
(( ARG_LOG != 0 )) && log "logging takes places in ${LOG_FILE}" (( ARG_LOG != 0 )) && log "logging takes places in ${LOG_FILE}"
# check/create lock file & write PID file (only for --run) # check/create lock file & write PID file (only for --run/--archive/--fix-logs)
(( ARG_ACTION == 4 )) && check_lock_dir (( ARG_ACTION == 4 || ARG_ACTION == 11 || ARG_ACTION == 12 )) && check_lock_dir
# general HC log # general HC log
HC_LOG="${LOG_DIR}/hc.log" HC_LOG="${LOG_DIR}/hc.log"
@ -1198,6 +1312,23 @@ case ${ARG_ACTION} in
11) # show HC event statistics 11) # show HC event statistics
show_statistics show_statistics
;; ;;
12)
# fix rogue log entries
fix_logs
FIX_RC=$?
case ${FIX_RC} in
0)
: # feedback via fix_logs()
;;
1)
log "successfully fixed log entries"
;;
2)
log "failed to fix log entries [RC=${FIX_RC}]"
EXIT_CODE=1
;;
esac
;;
esac esac
# finish up work # finish up work

View File

@ -28,12 +28,13 @@
# DOES: archive log entries for a given HC # DOES: archive log entries for a given HC
# EXPECTS: HC name [string] # EXPECTS: HC name [string]
# RETURNS: 0=no archiving needed; 1=archiving OK; 2=archiving NOK # RETURNS: 0=no archiving needed; 1=archiving OK; 2=archiving NOK
# REQUIRES: n/a # REQUIRES: ${HC_LOG}
function archive_hc function archive_hc
{ {
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}" (( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
typeset HC_NAME="${1}" typeset HC_NAME="${1}"
typeset ARCHIVE_FILE="" typeset ARCHIVE_FILE=""
typeset ARCHIVE_RC=0
typeset YEAR_MONTH="" typeset YEAR_MONTH=""
typeset LOG_COUNT=0 typeset LOG_COUNT=0
typeset ARCHIVE_RC=0 typeset ARCHIVE_RC=0
@ -46,22 +47,23 @@ trap "rm -f ${TMP1_FILE} ${TMP2_FILE} ${SAVE_LOG_FILE} >/dev/null 2>&1; return 1
# isolate messages from HC, find unique %Y-%m combinations # isolate messages from HC, find unique %Y-%m combinations
grep ".*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} 2>/dev/null |\ grep ".*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} 2>/dev/null |\
cut -f1 -d"${LOG_SEP}" | cut -f1 -d' ' | cut -f1-2 -d'-' | sort -u |\ cut -f1 -d"${LOG_SEP}" 2>/dev/null | cut -f1 -d' ' 2>/dev/null |\
cut -f1-2 -d'-' 2>/dev/null | sort -u 2>/dev/null |\
while read YEAR_MONTH while read YEAR_MONTH
do do
# find all messages for that YEAR-MONTH combination # find all messages for that YEAR-MONTH combination
grep "${YEAR_MONTH}.*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} >${TMP1_FILE} grep "${YEAR_MONTH}.*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} >${TMP1_FILE}
LOG_COUNT=$(wc -l ${TMP1_FILE} | cut -f1 -d' ') LOG_COUNT=$(wc -l ${TMP1_FILE} 2>/dev/null | cut -f1 -d' ' 2>/dev/null)
log "# of entries in ${YEAR_MONTH} to archive: ${LOG_COUNT}" log "# of entries in ${YEAR_MONTH} to archive: ${LOG_COUNT}"
# combine existing archived messages and resort # combine existing archived messages and resort
ARCHIVE_FILE="${ARCHIVE_DIR}/hc.${YEAR_MONTH}.log" ARCHIVE_FILE="${ARCHIVE_DIR}/hc.${YEAR_MONTH}.log"
cat ${ARCHIVE_FILE} ${TMP1_FILE} 2>/dev/null | sort -u >${TMP2_FILE} cat ${ARCHIVE_FILE} ${TMP1_FILE} 2>/dev/null | sort -u >${TMP2_FILE} 2>/dev/null
mv ${TMP2_FILE} ${ARCHIVE_FILE} 2>/dev/null || { mv ${TMP2_FILE} ${ARCHIVE_FILE} 2>/dev/null || {
warn "failed to move archive file, aborting" warn "failed to move archive file, aborting"
return 2 return 2
} }
LOG_COUNT=$(wc -l ${ARCHIVE_FILE} | cut -f1 -d' ') LOG_COUNT=$(wc -l ${ARCHIVE_FILE} 2>/dev/null | cut -f1 -d' ' 2>/dev/null)
log "# entries in ${ARCHIVE_FILE} now: ${LOG_COUNT}" log "# entries in ${ARCHIVE_FILE} now: ${LOG_COUNT}"
# remove archived messages from the $HC_LOG (but create a backup first!) # remove archived messages from the $HC_LOG (but create a backup first!)
@ -76,7 +78,7 @@ do
warn "failed to move HC log file, aborting" warn "failed to move HC log file, aborting"
return 2 return 2
} }
LOG_COUNT=$(wc -l ${HC_LOG} | cut -f1 -d' ') LOG_COUNT=$(wc -l ${HC_LOG} 2>/dev/null | cut -f1 -d' ' 2>/dev/null )
log "# entries in ${HC_LOG} now: ${LOG_COUNT}" log "# entries in ${HC_LOG} now: ${LOG_COUNT}"
ARCHIVE_RC=1 ARCHIVE_RC=1
else else
@ -92,6 +94,29 @@ rm -f ${TMP1_FILE} ${TMP2_FILE} ${SAVE_HC_LOG} >/dev/null 2>&1
return ${ARCHIVE_RC} return ${ARCHIVE_RC}
} }
# -----------------------------------------------------------------------------
# @(#) FUNCTION: count_log_errors()
# DOES: count the rogue entries in a HC log file. Log entries may get scrambled
#       if the append operation in handle_hc() does not happen fully atomically:
#       two log entries then end up on the same line (no line separator).
#       A line is counted as rogue when it holds more than ${NUM_LOG_FIELDS}
#       fields (fields being separated by ${LOG_SEP}).
#       There is no proper way to avoid this without an extra file locking utility
# EXPECTS: path to log file to check [string]
# OUTPUTS: number of rogue lines [number] (0 when the file cannot be read)
# RETURNS: 0
# REQUIRES: ${LOG_SEP}, ${NUM_LOG_FIELDS}
function count_log_errors
{
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
typeset LOG_STASH="${1}"
typeset ERROR_COUNT=0

# the file is fed through cat on purpose: an unreadable or missing file then
# results in empty input (count 0) instead of an awk failure.
# the path is quoted so that names containing whitespace are not split.
ERROR_COUNT=$(cat "${LOG_STASH}" 2>/dev/null |
    awk -F"${LOG_SEP}" -v max_log_fields="${NUM_LOG_FIELDS}" '
        NF > max_log_fields { num++ }
        END { print num + 0 }' 2>/dev/null)

# always hand a number back to the caller, even if awk produced no output
print "${ERROR_COUNT:-0}"

return 0
}
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# @(#) FUNCTION: debug() # @(#) FUNCTION: debug()
# DOES: handle debug messages # DOES: handle debug messages
@ -110,7 +135,6 @@ done
return 0 return 0
} }
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# @(#) FUNCTION: die() # @(#) FUNCTION: die()
# DOES: handle fatal errors and exit script # DOES: handle fatal errors and exit script
@ -528,10 +552,6 @@ if (( DO_REPORT_STD == 0 )) && (( ARG_DETAIL != 0 ))
then then
die "you cannot specify '--detail' without '--report'" die "you cannot specify '--detail' without '--report'"
fi fi
if (( DO_REPORT_STD == 0 )) && (( ARG_HISTORY != 0 ))
then
die "you cannot specify '--with-history' without '--report'"
fi
if (( DO_REPORT_STD == 0 )) && [[ -n "${ARG_FAIL_ID}" ]] if (( DO_REPORT_STD == 0 )) && [[ -n "${ARG_FAIL_ID}" ]]
then then
die "you cannot specify '--id' without '--report'" die "you cannot specify '--id' without '--report'"
@ -604,6 +624,167 @@ done
return 0 return 0
} }
# -----------------------------------------------------------------------------
# @(#) FUNCTION: fix_logs()
# DOES: fix hc log file(s) with rogue entries. A rogue entry is a line on which
#       two log entries were written back-to-back without a line separator
#       (detected via count_log_errors()). Each rogue line is split again into
#       two separate lines. With '--with-history' (ARG_HISTORY != 0) the
#       archived log files in ${ARCHIVE_DIR} are processed as well.
# EXPECTS: n/a
# REQUIRES: count_log_errors(), log(), warn(), ${ARCHIVE_DIR}, ${ARG_HISTORY},
#           ${HC_LOG}, ${LOG_SEP}, ${NUM_LOG_FIELDS}, ${TMP_DIR}
# RETURNS: 0=no fix needed; 1=fix OK; 2=fix NOK
# NOTE: this routine rewrites the HC log(s). Since we cannot use file locking,
#       some log entries may be lost if the HC is accessing the HC log during
#       the rewrite operation!!
function fix_logs
{
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
typeset FIX_FILE=""
typeset FIX_RC=0
typeset LOG_STASH=""
typeset ERROR_COUNT=0
typeset STASH_COUNT=0
typeset TMP_COUNT=0
# only use the base name of $0 so that the temporary files always end up
# directly in ${TMP_DIR}, even when $0 would carry a path
typeset SAVE_TMP_FILE="${TMP_DIR}/.${0##*/}.save.log.$$"
typeset TMP_FILE="${TMP_DIR}/.${0##*/}.tmp.log.$$"

if (( ARG_HISTORY != 0 ))
then
    set +f  # file globbing must be on
    LOG_STASH="${HC_LOG} ${ARCHIVE_DIR}/hc.*.log"
else
    LOG_STASH="${HC_LOG}"
fi

# set local trap for clean-up
trap "[[ -f ${TMP_FILE} ]] && rm -f ${TMP_FILE} >/dev/null 2>&1; return 1" 1 2 3 15

# check and rewrite log file(s)
# (LOG_STASH is deliberately unquoted: it must be split and globbed)
find ${LOG_STASH} -type f -print 2>/dev/null | while read FIX_FILE
do
    log "fixing log file ${FIX_FILE} ..."

    # count before rewrite
    STASH_COUNT=$(wc -l ${FIX_FILE} 2>/dev/null | cut -f1 -d' ' 2>/dev/null)

    # does it have errors?
    ERROR_COUNT=$(count_log_errors ${FIX_FILE})

    # rewrite if needed
    if (( ERROR_COUNT > 0 ))
    then
        >${TMP_FILE} 2>/dev/null
        cat ${FIX_FILE} 2>/dev/null | awk -F"${LOG_SEP}" -v OFS="${LOG_SEP}" \
            -v max_log_fields="${NUM_LOG_FIELDS}" '

            BEGIN {
                max_fields = (max_log_fields - 1) * 2
                glue_field = max_log_fields - 1
            }

            # Fix log lines that were smashed together because of unatomic appends
            # This can lead to 4 distinct cases that we need to rewrite based on
            # whether a FAIL_ID is present in each part of the log line.
            # Following examples are based on a log file with 5 standard fields:
            # case 1: NO (FAIL_ID)  + NO (FAIL_ID)   ->  9 fields
            # case 2: NO (FAIL_ID)  + YES (FAIL_ID)  -> 10 fields
            # case 3: YES (FAIL_ID) + NO (FAIL_ID)   -> 10 fields
            # case 4: YES (FAIL_ID) + YES (FAIL_ID)  -> 11 fields
            {
                if (NF > max_log_fields) {
                    # rogue line that needs rewriting
                    if (NF < max_fields) {
                        # case 1
                        for (i=1;i<max_log_fields-1;i++) {
                            printf ("%s%s", $i, OFS)
                        }
                        printf ("\n")
                        # an empty last field means that the line ended on a
                        # separator: do not print it or we add a bogus separator.
                        # a non-empty last field is real data and must be kept.
                        if ($NF == "") {
                            for (i=max_log_fields-1;i<NF;i++) {
                                printf ("%s%s", $i, OFS)
                            }
                        } else {
                            for (i=max_log_fields-1;i<=NF;i++) {
                                printf ("%s%s", $i, OFS)
                            }
                        }
                    } else {
                        if ($max_fields == "") {
                            # case 2+3
                            # is the glue field a DATE or FAIL_ID?
                            if ($glue_field ~ /[:-]/) {
                                # it is a DATE (belongs to next line)
                                for (i=1;i<max_log_fields-1;i++) {
                                    printf ("%s%s", $i, OFS)
                                }
                                printf ("\n")
                                for (i=max_log_fields-1;i<NF;i++) {
                                    printf ("%s%s", $i, OFS)
                                }
                            } else {
                                # it is a FAIL_ID (belongs to this line)
                                for (i=1;i<max_log_fields;i++) {
                                    printf ("%s%s", $i, OFS)
                                }
                                printf ("\n")
                                for (i=max_log_fields;i<NF;i++) {
                                    printf ("%s%s", $i, OFS)
                                }
                            }
                        } else {
                            # case 4
                            for (i=1;i<max_log_fields;i++) {
                                printf ("%s%s", $i, OFS)
                            }
                            printf ("\n")
                            for (i=max_log_fields;i<NF;i++) {
                                printf ("%s%s", $i, OFS)
                            }
                        }
                    }
                    printf ("\n")
                } else {
                    # correct log line, no rewrite needed
                    print $0
                }
            }' >${TMP_FILE} 2>/dev/null

        # count after rewrite
        TMP_COUNT=$(wc -l ${TMP_FILE} 2>/dev/null | cut -f1 -d' ' 2>/dev/null)

        # bail out when we do not have enough records: every rogue line is
        # split in two, so the rewritten file must have grown
        if (( TMP_COUNT <= STASH_COUNT ))
        then
            warn "found inconsistent record count (${TMP_COUNT}<${STASH_COUNT}), aborting"
            # do not leave the (useless) rewritten copy behind
            rm -f ${TMP_FILE} >/dev/null 2>&1
            return 2
        fi

        # swap log file (but create a backup first!)
        cp -p ${FIX_FILE} ${SAVE_TMP_FILE} 2>/dev/null
        if (( $? == 0 ))
        then
            mv ${TMP_FILE} ${FIX_FILE} 2>/dev/null
            if (( $? > 0 ))
            then
                warn "failed to move/update log file, rolling back"
                mv ${SAVE_TMP_FILE} ${FIX_FILE} 2>/dev/null
                return 2
            fi
            FIX_RC=1
        else
            warn "failed to create a backup of original log file, aborting"
            return 2
        fi

        # clean up temporary file(s)
        rm -f ${SAVE_TMP_FILE} ${TMP_FILE} >/dev/null 2>&1
    else
        log "no fixing needed for ${FIX_FILE}"
    fi
    ERROR_COUNT=0
done

return ${FIX_RC}
}
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# @(#) FUNCTION: handle_hc() # @(#) FUNCTION: handle_hc()
# DOES: handle HC results # DOES: handle HC results

View File

@ -20,7 +20,7 @@
# DOES: report HC events on STDOUT # DOES: report HC events on STDOUT
# EXPECTS: n/a # EXPECTS: n/a
# RETURNS: 0 # RETURNS: 0
# REQUIRES: init_hc(), list_hc(), $EVENTS_DIR, $HC_LOG # REQUIRES: count_log_errors(), init_hc(), list_hc(), $EVENTS_DIR, $HC_LOG
# #
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING! # DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING!
@ -30,7 +30,7 @@
function report_std function report_std
{ {
# ------------------------- CONFIGURATION starts here ------------------------- # ------------------------- CONFIGURATION starts here -------------------------
typeset _VERSION="2018-04-29" # YYYY-MM-DD typeset _VERSION="2018-05-27" # YYYY-MM-DD
typeset _SUPPORTED_PLATFORMS="AIX,HP-UX,Linux" # uname -s match typeset _SUPPORTED_PLATFORMS="AIX,HP-UX,Linux" # uname -s match
# ------------------------- CONFIGURATION ends here --------------------------- # ------------------------- CONFIGURATION ends here ---------------------------
@ -40,11 +40,14 @@ init_hc "$0" "${_SUPPORTED_PLATFORMS}" "${_VERSION}"
typeset _DIR_PREFIX="" typeset _DIR_PREFIX=""
typeset _FAIL_COUNT=0 typeset _FAIL_COUNT=0
typeset _ERROR_COUNT=0
typeset _ERROR_TOTAL_COUNT=0
typeset _HC_LAST="" typeset _HC_LAST=""
typeset _HC_LAST_TIME="" typeset _HC_LAST_TIME=""
typeset _HC_LAST_STC=0 typeset _HC_LAST_STC=0
typeset _HC_LAST_FAIL_ID="-" typeset _HC_LAST_FAIL_ID="-"
typeset _ID_NEEDLE="" typeset _ID_NEEDLE=""
typeset _CHECK_FILE=""
typeset _LOG_STASH="" typeset _LOG_STASH=""
typeset _REPORT_LINE="" typeset _REPORT_LINE=""
typeset _SORT_CMD="" typeset _SORT_CMD=""
@ -86,7 +89,7 @@ then
last_fail_id = "-" last_fail_id = "-"
} }
{ {
if ($1 ~ needle_time && $2 ~ needle_hc) { if (($1 ~ needle_time && $2 ~ needle_hc) && NF <= '"${NUM_LOG_FIELDS}"') {
last_event_stc = $3 last_event_stc = $3
last_stc = last_stc + last_event_stc last_stc = last_stc + last_event_stc
last_event_fail_id = $5 last_event_fail_id = $5
@ -103,7 +106,7 @@ then
"${_HC_LAST}" "${_HC_LAST_TIME}" "${_HC_LAST_FAIL_ID}" "${_HC_LAST_STC}" "${_HC_LAST}" "${_HC_LAST_TIME}" "${_HC_LAST_FAIL_ID}" "${_HC_LAST_STC}"
done done
# disclaimer # disclaimer
print "Note: this report only shows the overall combined status of all events of each HC within exactly" print "NOTE: this report only shows the overall combined status of all events of each HC within exactly"
print " the *same* time stamp (seconds precise). It may therefore fail to report certain FAIL IDs." print " the *same* time stamp (seconds precise). It may therefore fail to report certain FAIL IDs."
print " Use '--report' to get the exact list of failure events." print " Use '--report' to get the exact list of failure events."
# other reports # other reports
@ -141,7 +144,7 @@ else
cat ${_LOG_STASH} 2>/dev/null | ${_SORT_CMD} 2>/dev/null | awk -F"${LOG_SEP}" -v id_needle="${_ID_NEEDLE}" \ cat ${_LOG_STASH} 2>/dev/null | ${_SORT_CMD} 2>/dev/null | awk -F"${LOG_SEP}" -v id_needle="${_ID_NEEDLE}" \
' '
{ {
if ($5 ~ id_needle) { if ($5 ~ id_needle && NF <= '"${NUM_LOG_FIELDS}"') {
printf ("| %-20s | %-14s | %-30s | %-s\n", $1, $5, $2, $4) printf ("| %-20s | %-14s | %-30s | %-s\n", $1, $5, $2, $4)
} }
} }
@ -157,7 +160,7 @@ else
dashes = sprintf("%36s",""); gsub (/ /, "-", dashes); dashes = sprintf("%36s",""); gsub (/ /, "-", dashes);
} }
{ {
if ($5 ~ id_needle) { if ($5 ~ id_needle && NF <= '"${NUM_LOG_FIELDS}"') {
printf ("%36sMSG #%03d%36s", dashes, event_count, dashes) printf ("%36sMSG #%03d%36s", dashes, event_count, dashes)
printf ("\nTime : %-s\nHC : %-s\nDetail : %-s\n", $1, $2, $4) printf ("\nTime : %-s\nHC : %-s\nDetail : %-s\n", $1, $2, $4)
event_count++ event_count++
@ -191,6 +194,19 @@ else
fi fi
fi fi
# check consistency of log(s)
find ${_LOG_STASH} -type f -print 2>/dev/null | while read _CHECK_FILE
do
_ERROR_COUNT=$(count_log_errors ${_CHECK_FILE})
if (( _ERROR_COUNT > 0 ))
then
print "NOTE: found ${_ERROR_COUNT} rogue entr(y|ies) in log file ${_CHECK_FILE}"
_ERROR_TOTAL_COUNT=$(( _ERROR_TOTAL_COUNT + _ERROR_COUNT ))
fi
_ERROR_COUNT=0
done
(( _ERROR_TOTAL_COUNT > 0 )) && print "NOTE: fix log errors with ${SCRIPT_NAME} --fix-logs [--with-history]"
return 0 return 0
} }