* Fix for rogue log entries (via fix_logs()). When not using run locks (--use-lock) it is possible that log entries are not always written in a completely atomic manner.
* Other fixes
This commit is contained in:
parent
dad23525e3
commit
4554d25073
@ -37,7 +37,7 @@
|
||||
|
||||
# ------------------------- CONFIGURATION starts here -------------------------
|
||||
# define the version (YYYY-MM-DD)
|
||||
typeset -r SCRIPT_VERSION="2018-05-20"
|
||||
typeset -r SCRIPT_VERSION="2018-05-29"
|
||||
# location of parent directory containing KSH functions/HC plugins
|
||||
typeset -r FPATH_PARENT="/opt/hc/lib"
|
||||
# location of custom HC configuration files
|
||||
@ -60,6 +60,7 @@ typeset -r LOCK_DIR="${TMP_DIR}/.${SCRIPT_NAME}.lock"
|
||||
typeset -r HC_MSG_FILE="${TMP_DIR}/.${SCRIPT_NAME}.hc.msg.$$" # plugin messages files
|
||||
typeset -r LOG_SEP="|" # single character only
|
||||
typeset -r MSG_SEP="%" # single character only
|
||||
typeset -t NUM_LOG_FIELDS=6 # current number of fields in $HC_LOG + 1
|
||||
typeset -r MAGIC_QUOTE="!_!" # magic quote
|
||||
typeset -r LOG_DIR="/var/opt/hc"
|
||||
typeset -r LOG_FILE="${LOG_DIR}/${SCRIPT_NAME}.log"
|
||||
@ -91,6 +92,7 @@ typeset LINUX_RELEASE=""
|
||||
typeset ARCHIVE_RC=0
|
||||
typeset DISABLE_RC=0
|
||||
typeset ENABLE_RC=0
|
||||
# return code of fix_logs() (--fix-logs); named like the other *_RC globals
typeset FIX_RC=0
|
||||
typeset RUN_RC=0
|
||||
typeset RUN_CONFIG_FILE=""
|
||||
typeset RUN_TIME_OUT=0
|
||||
@ -384,6 +386,12 @@ then
|
||||
ARG_VERBOSE=0
|
||||
ARG_LOG=0
|
||||
fi
|
||||
# --fix-logs
|
||||
if (( ARG_ACTION == 12 )) && [[ -n "${ARG_HC}" ]]
|
||||
then
|
||||
print -u2 "ERROR: you can only use '--fix-logs' in combination with '--with-history'"
|
||||
exit 1
|
||||
fi
|
||||
# --timeout
|
||||
if (( ARG_TIME_OUT > 0 ))
|
||||
then
|
||||
@ -504,7 +512,7 @@ cat << EOT
|
||||
Execute/report simple health checks (HC) on UNIX hosts.
|
||||
|
||||
Syntax: ${SCRIPT_DIR}/${SCRIPT_NAME} [--help] | [--help-terse] | [--version] |
|
||||
[--list=<needle>] | [--list-core] | [--fix-symlinks] | [--show-stats] | (--disable-all | enable-all) |
|
||||
[--list=<needle>] | [--list-core] | [--fix-symlinks] | [--show-stats] | (--disable-all | enable-all) | [--fix-logs [--with-history]] |
|
||||
(--check-host | ((--archive | --check | --enable | --disable | --run [--timeout=<secs>] | --show) --hc=<list_of_checks> [--config-file=<configuration_file>] [hc-args="<arg1,arg2=val,arg3">]))
|
||||
[--display=<method>] ([--debug] [--debug-level=<level>]) [--no-monitor] [--no-log] [--no-lock] [--flip-rc]
|
||||
[--notify=<method_list>] [--mail-to=<address_list>] [--sms-to=<sms_rcpt> --sms-provider=<name>]
|
||||
@ -529,6 +537,7 @@ Parameters:
|
||||
--display : display HC results in a formatted way. Default is STDOUT (see --list-core for available formats)
|
||||
--enable : enable HC(s).
|
||||
--enable-all : enable all HCs.
|
||||
--fix-logs : fix rogue log entries (can be used with --with-history)
|
||||
--fix-symlinks : update symbolic links for the KSH autoloader.
|
||||
--flip-rc : exit the health checker with the RC (return code) of the HC plugin instead of its own RC (will be discarded)
|
||||
This option may only be specified when executing a single HC plugin
|
||||
@ -680,16 +689,34 @@ CMD_LINE="$*"
|
||||
[[ -z "${CMD_LINE}" ]] && display_usage && exit 0
|
||||
for CMD_PARAMETER in ${CMD_LINE}
|
||||
do
|
||||
# ARG_ACTION is a toggle, do not allow double toggles
|
||||
case ${CMD_PARAMETER} in
|
||||
-archive|--archive)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=10
|
||||
fi
|
||||
ARG_LOCK=1
|
||||
;;
|
||||
-check|--check)
|
||||
ARG_ACTION=1
|
||||
;;
|
||||
-c|-check-host|--check-host)
|
||||
ARG_CHECK_HOST=1
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=4
|
||||
fi
|
||||
ARG_CHECK_HOST=1
|
||||
;;
|
||||
-config-file=*)
|
||||
ARG_CONFIG_FILE="${CMD_PARAMETER#-config-file=}"
|
||||
@ -712,10 +739,22 @@ do
|
||||
ARG_DETAIL=1
|
||||
;;
|
||||
-d|-disable|--disable)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=2
|
||||
fi
|
||||
;;
|
||||
-disable-all|--disable-all)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=6
|
||||
fi
|
||||
;;
|
||||
-display|--display)
|
||||
# STDOUT as default
|
||||
@ -728,10 +767,22 @@ do
|
||||
ARG_DISPLAY="${CMD_PARAMETER#--display=}"
|
||||
;;
|
||||
-e|-enable|--enable)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=3
|
||||
fi
|
||||
;;
|
||||
-enable-all|--enable-all)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=7
|
||||
fi
|
||||
;;
|
||||
-f|-fix-symlinks|--fix-symlinks)
|
||||
read_config
|
||||
@ -742,6 +793,16 @@ do
|
||||
fix_symlinks
|
||||
exit 0
|
||||
;;
|
||||
-fix-logs|--fix-logs)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=12
|
||||
fi
|
||||
ARG_LOCK=1
|
||||
;;
|
||||
-flip-rc|--flip-rc)
|
||||
ARG_FLIP_RC=1
|
||||
;;
|
||||
@ -770,15 +831,33 @@ do
|
||||
ARG_LAST=1
|
||||
;;
|
||||
-list|--list)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=9
|
||||
fi
|
||||
;;
|
||||
-list=*)
|
||||
ARG_LIST="${CMD_PARAMETER#-list=}"
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=9
|
||||
fi
|
||||
;;
|
||||
--list=*)
|
||||
ARG_LIST="${CMD_PARAMETER#--list=}"
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=9
|
||||
fi
|
||||
;;
|
||||
-list-hc|--list-hc|-list-all|--list-all)
|
||||
print -u2 "WARN: deprecated option. Use --list | --list=<needle>"
|
||||
@ -816,34 +895,69 @@ do
|
||||
ARG_MONITOR=0
|
||||
;;
|
||||
-report|--report) # compatability support <2017-12-15
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=8
|
||||
fi
|
||||
# STDOUT as default
|
||||
ARG_REPORT="std"
|
||||
ARG_LOG=0; ARG_VERBOSE=0
|
||||
ARG_ACTION=8
|
||||
;;
|
||||
-report=*)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=8
|
||||
fi
|
||||
ARG_REPORT="${CMD_PARAMETER#-report=}"
|
||||
ARG_LOG=0; ARG_VERBOSE=0
|
||||
ARG_ACTION=8
|
||||
;;
|
||||
--report=*)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=8
|
||||
fi
|
||||
ARG_REPORT="${CMD_PARAMETER#--report=}"
|
||||
ARG_LOG=0; ARG_VERBOSE=0
|
||||
ARG_ACTION=8
|
||||
;;
|
||||
-reverse|--reverse)
|
||||
ARG_REVERSE=1
|
||||
;;
|
||||
-r|-run|--run)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=4
|
||||
fi
|
||||
;;
|
||||
-s|-show|--show)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=5
|
||||
ARG_LOG=0
|
||||
ARG_VERBOSE=0
|
||||
fi
|
||||
ARG_LOG=0; ARG_VERBOSE=0
|
||||
;;
|
||||
-show-stats|--show-stats)
|
||||
if (( ARG_ACTION > 0 ))
|
||||
then
|
||||
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||
exit 1
|
||||
else
|
||||
ARG_ACTION=11
|
||||
fi
|
||||
;;
|
||||
-sms-provider=*)
|
||||
ARG_SMS_PROVIDER="${CMD_PARAMETER#-sms-provider=}"
|
||||
@ -921,8 +1035,8 @@ fi
|
||||
log "*** start of ${SCRIPT_NAME} [${CMD_LINE}] ***"
|
||||
(( ARG_LOG != 0 )) && log "logging takes places in ${LOG_FILE}"
|
||||
|
||||
# check/create lock file & write PID file (only for --run)
|
||||
(( ARG_ACTION == 4 )) && check_lock_dir
|
||||
# check/create lock file & write PID file (only for --run/--archive/--fix-logs)
# action codes: 4=--run, 10=--archive, 12=--fix-logs (11 is --show-stats, which
# only reads and must not take the lock)
(( ARG_ACTION == 4 || ARG_ACTION == 10 || ARG_ACTION == 12 )) && check_lock_dir
|
||||
|
||||
# general HC log
|
||||
HC_LOG="${LOG_DIR}/hc.log"
|
||||
@ -1198,6 +1312,23 @@ case ${ARG_ACTION} in
|
||||
11) # show HC event statistics
|
||||
show_statistics
|
||||
;;
|
||||
12)
|
||||
# fix rogue log entries
|
||||
fix_logs
|
||||
FIX_RC=$?
|
||||
case ${FIX_RC} in
|
||||
0)
|
||||
: # feedback via fix_logs()
|
||||
;;
|
||||
1)
|
||||
log "successfully fixed log entries"
|
||||
;;
|
||||
2)
|
||||
log "failed to fix log entries [RC=${FIX_RC}]"
|
||||
EXIT_CODE=1
|
||||
;;
|
||||
esac
|
||||
;;
|
||||
esac
|
||||
|
||||
# finish up work
|
||||
|
@ -28,12 +28,13 @@
|
||||
# DOES: archive log entries for a given HC
|
||||
# EXPECTS: HC name [string]
|
||||
# RETURNS: 0=no archiving needed; 1=archiving OK; 2=archiving NOK
|
||||
# REQUIRES: n/a
|
||||
# REQUIRES: ${HC_LOG}
|
||||
function archive_hc
|
||||
{
|
||||
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
|
||||
typeset HC_NAME="${1}"
|
||||
typeset ARCHIVE_FILE=""
|
||||
typeset ARCHIVE_RC=0
|
||||
typeset YEAR_MONTH=""
|
||||
typeset LOG_COUNT=0
|
||||
typeset ARCHIVE_RC=0
|
||||
@ -46,22 +47,23 @@ trap "rm -f ${TMP1_FILE} ${TMP2_FILE} ${SAVE_LOG_FILE} >/dev/null 2>&1; return 1
|
||||
|
||||
# isolate messages from HC, find unique %Y-%m combinations
|
||||
grep ".*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} 2>/dev/null |\
|
||||
cut -f1 -d"${LOG_SEP}" | cut -f1 -d' ' | cut -f1-2 -d'-' | sort -u |\
|
||||
cut -f1 -d"${LOG_SEP}" 2>/dev/null | cut -f1 -d' ' 2>/dev/null |\
|
||||
cut -f1-2 -d'-' 2>/dev/null | sort -u 2>/dev/null |\
|
||||
while read YEAR_MONTH
|
||||
do
|
||||
# find all messages for that YEAR-MONTH combination
|
||||
grep "${YEAR_MONTH}.*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} >${TMP1_FILE}
|
||||
LOG_COUNT=$(wc -l ${TMP1_FILE} | cut -f1 -d' ')
|
||||
LOG_COUNT=$(wc -l ${TMP1_FILE} 2>/dev/null | cut -f1 -d' ' 2>/dev/null)
|
||||
log "# of entries in ${YEAR_MONTH} to archive: ${LOG_COUNT}"
|
||||
|
||||
# combine existing archived messages and resort
|
||||
ARCHIVE_FILE="${ARCHIVE_DIR}/hc.${YEAR_MONTH}.log"
|
||||
cat ${ARCHIVE_FILE} ${TMP1_FILE} 2>/dev/null | sort -u >${TMP2_FILE}
|
||||
cat ${ARCHIVE_FILE} ${TMP1_FILE} 2>/dev/null | sort -u >${TMP2_FILE} 2>/dev/null
|
||||
mv ${TMP2_FILE} ${ARCHIVE_FILE} 2>/dev/null || {
|
||||
warn "failed to move archive file, aborting"
|
||||
return 2
|
||||
}
|
||||
LOG_COUNT=$(wc -l ${ARCHIVE_FILE} | cut -f1 -d' ')
|
||||
LOG_COUNT=$(wc -l ${ARCHIVE_FILE} 2>/dev/null | cut -f1 -d' ' 2>/dev/null)
|
||||
log "# entries in ${ARCHIVE_FILE} now: ${LOG_COUNT}"
|
||||
|
||||
# remove archived messages from the $HC_LOG (but create a backup first!)
|
||||
@ -76,7 +78,7 @@ do
|
||||
warn "failed to move HC log file, aborting"
|
||||
return 2
|
||||
}
|
||||
LOG_COUNT=$(wc -l ${HC_LOG} | cut -f1 -d' ')
|
||||
LOG_COUNT=$(wc -l ${HC_LOG} 2>/dev/null | cut -f1 -d' ' 2>/dev/null )
|
||||
log "# entries in ${HC_LOG} now: ${LOG_COUNT}"
|
||||
ARCHIVE_RC=1
|
||||
else
|
||||
@ -92,6 +94,29 @@ rm -f ${TMP1_FILE} ${TMP2_FILE} ${SAVE_HC_LOG} >/dev/null 2>&1
|
||||
return ${ARCHIVE_RC}
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
# @(#) FUNCTION: count_log_errors()
# DOES: check a hc log file for rogue entries. Log entries may get scrambled
#       if the append operation in handle_hc() does not happen fully atomically.
#       This means that log entries are written without line separator (same
#       line). There is no proper way to avoid this without an extra file
#       locking utility.
#       A line counts as rogue when it has more than ${NUM_LOG_FIELDS} fields
#       (fields being separated by ${LOG_SEP}).
# EXPECTS: path to log file to check
# OUTPUTS: number of errors [number]; 0 when the file is missing, unreadable
#          or when no path was given
# RETURNS: 0
# REQUIRES: ${LOG_SEP}, ${NUM_LOG_FIELDS}
function count_log_errors
{
    (( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
    typeset LOG_STASH="${1}"
    typeset ERROR_COUNT=0

    # only start awk on a readable file: with an empty/missing path the reader
    # would otherwise fall back to STDIN and block
    if [[ -n "${LOG_STASH}" && -r "${LOG_STASH}" ]]
    then
        # the field limit is handed over with -v instead of being spliced into
        # the awk program text
        ERROR_COUNT=$(awk -F"${LOG_SEP}" -v max_fields="${NUM_LOG_FIELDS}" '
            NF > (max_fields + 0) { num++ }
            END { print num + 0 }' "${LOG_STASH}" 2>/dev/null)
    fi

    # always hand back a number, even if awk produced no output
    print "${ERROR_COUNT:-0}"

    return 0
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# @(#) FUNCTION: debug()
|
||||
# DOES: handle debug messages
|
||||
@ -110,7 +135,6 @@ done
|
||||
return 0
|
||||
}
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# @(#) FUNCTION: die()
|
||||
# DOES: handle fatal errors and exit script
|
||||
@ -528,10 +552,6 @@ if (( DO_REPORT_STD == 0 )) && (( ARG_DETAIL != 0 ))
|
||||
then
|
||||
die "you cannot specify '--detail' without '--report'"
|
||||
fi
|
||||
if (( DO_REPORT_STD == 0 )) && (( ARG_HISTORY != 0 ))
|
||||
then
|
||||
die "you cannot specify '--with-history' without '--report'"
|
||||
fi
|
||||
if (( DO_REPORT_STD == 0 )) && [[ -n "${ARG_FAIL_ID}" ]]
|
||||
then
|
||||
die "you cannot specify '--id' without '--report'"
|
||||
@ -604,6 +624,167 @@ done
|
||||
return 0
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
# @(#) FUNCTION: fix_logs()
# DOES: fix hc log file(s) with rogue entries: a line holding two log records
#       that were glued together by a non-atomic append is split back into
#       two lines. Handles ${HC_LOG} and, when ARG_HISTORY is set, also the
#       archived logs ${ARCHIVE_DIR}/hc.*.log
# EXPECTS: n/a
# REQUIRES: count_log_errors(), log(), warn(), ${HC_LOG}, ${ARCHIVE_DIR},
#           ${TMP_DIR}, ${LOG_SEP}, ${NUM_LOG_FIELDS}
# RETURNS: 0=no fix needed; 1=fix OK; 2=fix NOK (also when interrupted)
# NOTE: this routine rewrites the HC log(s). Since we cannot use file locking,
#       some log entries may be lost if the HC is accessing the HC log during
#       the rewrite operation!!
# NOTE: only lines made of exactly two glued records are repaired completely;
#       a line made of three or more records is split once and stays rogue.
function fix_logs
{
    (( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
    typeset FIX_FILE=""
    typeset FIX_RC=0
    typeset LOG_STASH=""
    typeset ERROR_COUNT=0
    typeset STASH_COUNT=0
    typeset TMP_COUNT=0
    # ${0##*/}: never let a path component of $0 end up in the file names
    typeset SAVE_TMP_FILE="${TMP_DIR}/.${0##*/}.save.log.$$"
    typeset TMP_FILE="${TMP_DIR}/.${0##*/}.tmp.log.$$"

    if (( ARG_HISTORY != 0 ))
    then
        set +f  # file globbing must be on
        LOG_STASH="${HC_LOG} ${ARCHIVE_DIR}/hc.*.log"
    else
        LOG_STASH="${HC_LOG}"
    fi

    # set local trap for clean-up. The backup copy is kept on purpose: if we
    # are interrupted during the swap it may be the only intact copy of the
    # log. An interrupted run reports 2 (NOK), never 1 (fix OK).
    trap "rm -f '${TMP_FILE}' >/dev/null 2>&1; return 2" 1 2 3 15

    # check and rewrite log file(s)
    # LOG_STASH is expanded unquoted on purpose (word splitting + globbing).
    # The loop runs in the current shell so that FIX_RC and 'return' behave
    # the same in every shell, which a 'find | while read' pipeline does not
    # guarantee.
    for FIX_FILE in ${LOG_STASH}
    do
        # regular files only, symbolic links are skipped (as 'find -type f')
        [[ -f "${FIX_FILE}" && ! -L "${FIX_FILE}" ]] || continue

        log "fixing log file ${FIX_FILE} ..."

        # count before rewrite (awk prints a bare number, unlike 'wc -l'
        # whose output padding differs between platforms)
        STASH_COUNT=$(awk 'END { print NR }' "${FIX_FILE}" 2>/dev/null)

        # does it have errors?
        ERROR_COUNT=$(count_log_errors "${FIX_FILE}")

        # rewrite if needed
        if (( ERROR_COUNT > 0 ))
        then
            awk -F"${LOG_SEP}" -v OFS="${LOG_SEP}" \
                -v max_log_fields="${NUM_LOG_FIELDS}" '

                BEGIN {
                    max_log_fields += 0
                    max_fields = (max_log_fields - 1) * 2
                    glue_field = max_log_fields - 1
                }

                # Fix log lines that were smashed together because of unatomic
                # appends. This can lead to 4 distinct cases that we need to
                # rewrite based on whether a FAIL_ID is present in each part
                # of the log line. Following examples are based on a log file
                # with 5 standard fields:
                # case 1: NO  (FAIL_ID) + NO  (FAIL_ID) ->  9 fields
                # case 2: NO  (FAIL_ID) + YES (FAIL_ID) -> 10 fields
                # case 3: YES (FAIL_ID) + NO  (FAIL_ID) -> 10 fields
                # case 4: YES (FAIL_ID) + YES (FAIL_ID) -> 11 fields

                {
                    if (NF <= max_log_fields) {
                        # correct log line, no rewrite needed
                        print $0
                        next
                    }

                    # rogue line: find the last field of the first record
                    if (NF < max_fields) {
                        # case 1: first record has no FAIL_ID
                        split_at = glue_field - 1
                    } else if ($max_fields == "" && $glue_field ~ /[:-]/) {
                        # case 2: glue field is a DATE (belongs to next line)
                        split_at = glue_field - 1
                    } else {
                        # case 3 + 4: glue field is a FAIL_ID (belongs to
                        # this line)
                        split_at = glue_field
                    }

                    # the field after the trailing separator is normally
                    # empty and is skipped; a non-empty one is real data and
                    # is kept
                    last = ($NF == "") ? NF - 1 : NF

                    for (i = 1; i <= split_at; i++) {
                        printf ("%s%s", $i, OFS)
                    }
                    printf ("\n")
                    for (i = split_at + 1; i <= last; i++) {
                        printf ("%s%s", $i, OFS)
                    }
                    printf ("\n")
                }' "${FIX_FILE}" >"${TMP_FILE}" 2>/dev/null

            # count after rewrite
            TMP_COUNT=$(awk 'END { print NR }' "${TMP_FILE}" 2>/dev/null)

            # bail out when we do not have enough records (every repaired
            # line adds one record, so the count must have grown)
            if (( ${TMP_COUNT:-0} <= ${STASH_COUNT:-0} ))
            then
                warn "found inconsistent record count (${TMP_COUNT}<${STASH_COUNT}), aborting"
                rm -f "${TMP_FILE}" >/dev/null 2>&1
                return 2
            fi

            # swap log file (but create a backup first!)
            if cp -p "${FIX_FILE}" "${SAVE_TMP_FILE}" 2>/dev/null
            then
                if mv "${TMP_FILE}" "${FIX_FILE}" 2>/dev/null
                then
                    FIX_RC=1
                else
                    warn "failed to move/update log file, rolling back"
                    # if the roll-back fails too, the backup stays in place
                    mv "${SAVE_TMP_FILE}" "${FIX_FILE}" 2>/dev/null
                    rm -f "${TMP_FILE}" >/dev/null 2>&1
                    return 2
                fi
            else
                warn "failed to create a backup of original log file, aborting"
                rm -f "${SAVE_TMP_FILE}" "${TMP_FILE}" >/dev/null 2>&1
                return 2
            fi

            # clean up temporary file(s)
            rm -f "${SAVE_TMP_FILE}" "${TMP_FILE}" >/dev/null 2>&1
        else
            log "no fixing needed for ${FIX_FILE}"
        fi

        ERROR_COUNT=0
    done

    return ${FIX_RC}
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# @(#) FUNCTION: handle_hc()
|
||||
# DOES: handle HC results
|
||||
|
@ -20,7 +20,7 @@
|
||||
# DOES: report HC events on STDOUT
|
||||
# EXPECTS: n/a
|
||||
# RETURNS: 0
|
||||
# REQUIRES: init_hc(), list_hc(), $EVENTS_DIR, $HC_LOG
|
||||
# REQUIRES: count_log_errors(), init_hc(), list_hc(), $EVENTS_DIR, $HC_LOG
|
||||
#
|
||||
# -----------------------------------------------------------------------------
|
||||
# DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING!
|
||||
@ -30,7 +30,7 @@
|
||||
function report_std
|
||||
{
|
||||
# ------------------------- CONFIGURATION starts here -------------------------
|
||||
typeset _VERSION="2018-04-29" # YYYY-MM-DD
|
||||
typeset _VERSION="2018-05-27" # YYYY-MM-DD
|
||||
typeset _SUPPORTED_PLATFORMS="AIX,HP-UX,Linux" # uname -s match
|
||||
# ------------------------- CONFIGURATION ends here ---------------------------
|
||||
|
||||
@ -40,11 +40,14 @@ init_hc "$0" "${_SUPPORTED_PLATFORMS}" "${_VERSION}"
|
||||
|
||||
typeset _DIR_PREFIX=""
|
||||
typeset _FAIL_COUNT=0
|
||||
typeset _ERROR_COUNT=0
|
||||
typeset _ERROR_TOTAL_COUNT=0
|
||||
typeset _HC_LAST=""
|
||||
typeset _HC_LAST_TIME=""
|
||||
typeset _HC_LAST_STC=0
|
||||
typeset _HC_LAST_FAIL_ID="-"
|
||||
typeset _ID_NEEDLE=""
|
||||
typeset _CHECK_FILE=""
|
||||
typeset _LOG_STASH=""
|
||||
typeset _REPORT_LINE=""
|
||||
typeset _SORT_CMD=""
|
||||
@ -86,7 +89,7 @@ then
|
||||
last_fail_id = "-"
|
||||
}
|
||||
{
|
||||
if ($1 ~ needle_time && $2 ~ needle_hc) {
|
||||
if (($1 ~ needle_time && $2 ~ needle_hc) && NF <= '"${NUM_LOG_FIELDS}"') {
|
||||
last_event_stc = $3
|
||||
last_stc = last_stc + last_event_stc
|
||||
last_event_fail_id = $5
|
||||
@ -103,7 +106,7 @@ then
|
||||
"${_HC_LAST}" "${_HC_LAST_TIME}" "${_HC_LAST_FAIL_ID}" "${_HC_LAST_STC}"
|
||||
done
|
||||
# disclaimer
|
||||
print "Note: this report only shows the overall combined status of all events of each HC within exactly"
|
||||
print "NOTE: this report only shows the overall combined status of all events of each HC within exactly"
|
||||
print " the *same* time stamp (seconds precise). It may therefore fail to report certain FAIL IDs."
|
||||
print " Use '--report' to get the exact list of failure events."
|
||||
# other reports
|
||||
@ -141,7 +144,7 @@ else
|
||||
cat ${_LOG_STASH} 2>/dev/null | ${_SORT_CMD} 2>/dev/null | awk -F"${LOG_SEP}" -v id_needle="${_ID_NEEDLE}" \
|
||||
'
|
||||
{
|
||||
if ($5 ~ id_needle) {
|
||||
if ($5 ~ id_needle && NF <= '"${NUM_LOG_FIELDS}"') {
|
||||
printf ("| %-20s | %-14s | %-30s | %-s\n", $1, $5, $2, $4)
|
||||
}
|
||||
}
|
||||
@ -157,7 +160,7 @@ else
|
||||
dashes = sprintf("%36s",""); gsub (/ /, "-", dashes);
|
||||
}
|
||||
{
|
||||
if ($5 ~ id_needle) {
|
||||
if ($5 ~ id_needle && NF <= '"${NUM_LOG_FIELDS}"') {
|
||||
printf ("%36sMSG #%03d%36s", dashes, event_count, dashes)
|
||||
printf ("\nTime : %-s\nHC : %-s\nDetail : %-s\n", $1, $2, $4)
|
||||
event_count++
|
||||
@ -191,6 +194,19 @@ else
|
||||
fi
|
||||
fi
|
||||
|
||||
# check consistency of log(s)
|
||||
find ${_LOG_STASH} -type f -print 2>/dev/null | while read _CHECK_FILE
|
||||
do
|
||||
_ERROR_COUNT=$(count_log_errors ${_CHECK_FILE})
|
||||
if (( _ERROR_COUNT > 0 ))
|
||||
then
|
||||
print "NOTE: found ${_ERROR_COUNT} rogue entr(y|ies) in log file ${_CHECK_FILE}"
|
||||
_ERROR_TOTAL_COUNT=$(( _ERROR_TOTAL_COUNT + _ERROR_COUNT ))
|
||||
fi
|
||||
_ERROR_COUNT=0
|
||||
done
|
||||
(( _ERROR_TOTAL_COUNT > 0 )) && print "NOTE: fix log errors with ${SCRIPT_NAME} --fix-logs [--with-history]"
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user