* Fix for rogue log entries (via fix_logs()). When not using run locks (--use-lock), it is possible that log entries are not always written in a completely atomic manner.
* Other fixes
This commit is contained in:
parent
dad23525e3
commit
4554d25073
@ -37,7 +37,7 @@
|
|||||||
|
|
||||||
# ------------------------- CONFIGURATION starts here -------------------------
|
# ------------------------- CONFIGURATION starts here -------------------------
|
||||||
# define the version (YYYY-MM-DD)
|
# define the version (YYYY-MM-DD)
|
||||||
typeset -r SCRIPT_VERSION="2018-05-20"
|
typeset -r SCRIPT_VERSION="2018-05-29"
|
||||||
# location of parent directory containing KSH functions/HC plugins
|
# location of parent directory containing KSH functions/HC plugins
|
||||||
typeset -r FPATH_PARENT="/opt/hc/lib"
|
typeset -r FPATH_PARENT="/opt/hc/lib"
|
||||||
# location of custom HC configuration files
|
# location of custom HC configuration files
|
||||||
@ -58,9 +58,10 @@ typeset -r HOST_NAME="$(hostname)"
|
|||||||
typeset -r OS_NAME="$(uname -s)"
|
typeset -r OS_NAME="$(uname -s)"
|
||||||
typeset -r LOCK_DIR="${TMP_DIR}/.${SCRIPT_NAME}.lock"
|
typeset -r LOCK_DIR="${TMP_DIR}/.${SCRIPT_NAME}.lock"
|
||||||
typeset -r HC_MSG_FILE="${TMP_DIR}/.${SCRIPT_NAME}.hc.msg.$$" # plugin messages files
|
typeset -r HC_MSG_FILE="${TMP_DIR}/.${SCRIPT_NAME}.hc.msg.$$" # plugin messages files
|
||||||
typeset -r LOG_SEP="|" # single character only
|
typeset -r LOG_SEP="|" # single character only
|
||||||
typeset -r MSG_SEP="%" # single character only
|
typeset -r MSG_SEP="%" # single character only
|
||||||
typeset -r MAGIC_QUOTE="!_!" # magic quote
|
# read-only like the other constants in this block (was 'typeset -t', which does not protect the value)
typeset -r NUM_LOG_FIELDS=6     # current number of fields in $HC_LOG + 1
|
||||||
|
typeset -r MAGIC_QUOTE="!_!" # magic quote
|
||||||
typeset -r LOG_DIR="/var/opt/hc"
|
typeset -r LOG_DIR="/var/opt/hc"
|
||||||
typeset -r LOG_FILE="${LOG_DIR}/${SCRIPT_NAME}.log"
|
typeset -r LOG_FILE="${LOG_DIR}/${SCRIPT_NAME}.log"
|
||||||
typeset -r ARCHIVE_DIR="${LOG_DIR}/archive"
|
typeset -r ARCHIVE_DIR="${LOG_DIR}/archive"
|
||||||
@ -91,6 +92,7 @@ typeset LINUX_RELEASE=""
|
|||||||
typeset ARCHIVE_RC=0
|
typeset ARCHIVE_RC=0
|
||||||
typeset DISABLE_RC=0
|
typeset DISABLE_RC=0
|
||||||
typeset ENABLE_RC=0
|
typeset ENABLE_RC=0
|
||||||
|
# return code of fix_logs(); named like the other *_RC globals and matching the FIX_RC=$? assignment in the --fix-logs action
typeset FIX_RC=0
|
||||||
typeset RUN_RC=0
|
typeset RUN_RC=0
|
||||||
typeset RUN_CONFIG_FILE=""
|
typeset RUN_CONFIG_FILE=""
|
||||||
typeset RUN_TIME_OUT=0
|
typeset RUN_TIME_OUT=0
|
||||||
@ -384,6 +386,12 @@ then
|
|||||||
ARG_VERBOSE=0
|
ARG_VERBOSE=0
|
||||||
ARG_LOG=0
|
ARG_LOG=0
|
||||||
fi
|
fi
|
||||||
|
# --fix-logs
|
||||||
|
if (( ARG_ACTION == 12 )) && [[ -n "${ARG_HC}" ]]
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you can only use '--fix-logs' in combination with '--with-history'"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
# --timeout
|
# --timeout
|
||||||
if (( ARG_TIME_OUT > 0 ))
|
if (( ARG_TIME_OUT > 0 ))
|
||||||
then
|
then
|
||||||
@ -504,8 +512,8 @@ cat << EOT
|
|||||||
Execute/report simple health checks (HC) on UNIX hosts.
|
Execute/report simple health checks (HC) on UNIX hosts.
|
||||||
|
|
||||||
Syntax: ${SCRIPT_DIR}/${SCRIPT_NAME} [--help] | [--help-terse] | [--version] |
|
Syntax: ${SCRIPT_DIR}/${SCRIPT_NAME} [--help] | [--help-terse] | [--version] |
|
||||||
[--list=<needle>] | [--list-core] | [--fix-symlinks] | [--show-stats] | (--disable-all | enable-all) |
|
[--list=<needle>] | [--list-core] | [--fix-symlinks] | [--show-stats] | (--disable-all | enable-all) | [--fix-logs [--with-history]] |
|
||||||
(--check-host | ((--archive | --check | --enable | --disable | --run [--timeout=<secs>] | --show) --hc=<list_of_checks> [--config-file=<configuration_file>] [hc-args="<arg1,arg2=val,arg3">]))
|
(--check-host | ((--archive | --check | --enable | --disable | --run [--timeout=<secs>] | --show) --hc=<list_of_checks> [--config-file=<configuration_file>] [hc-args="<arg1,arg2=val,arg3">]))
|
||||||
[--display=<method>] ([--debug] [--debug-level=<level>]) [--no-monitor] [--no-log] [--no-lock] [--flip-rc]
|
[--display=<method>] ([--debug] [--debug-level=<level>]) [--no-monitor] [--no-log] [--no-lock] [--flip-rc]
|
||||||
[--notify=<method_list>] [--mail-to=<address_list>] [--sms-to=<sms_rcpt> --sms-provider=<name>]
|
[--notify=<method_list>] [--mail-to=<address_list>] [--sms-to=<sms_rcpt> --sms-provider=<name>]
|
||||||
[--report=<method> ( ([--last] | [--today]) | ([--reverse] [--id=<fail_id> [--detail]] [--with-history]) ) ]
|
[--report=<method> ( ([--last] | [--today]) | ([--reverse] [--id=<fail_id> [--detail]] [--with-history]) ) ]
|
||||||
@ -529,6 +537,7 @@ Parameters:
|
|||||||
--display : display HC results in a formatted way. Default is STDOUT (see --list-core for available formats)
|
--display : display HC results in a formatted way. Default is STDOUT (see --list-core for available formats)
|
||||||
--enable : enable HC(s).
|
--enable : enable HC(s).
|
||||||
--enable-all : enable all HCs.
|
--enable-all : enable all HCs.
|
||||||
|
--fix-logs : fix rogue log entries (can be used with --with-history)
|
||||||
--fix-symlinks : update symbolic links for the KSH autoloader.
|
--fix-symlinks : update symbolic links for the KSH autoloader.
|
||||||
--flip-rc : exit the health checker with the RC (return code) of the HC plugin instead of its own RC (will be discarded)
|
--flip-rc : exit the health checker with the RC (return code) of the HC plugin instead of its own RC (will be discarded)
|
||||||
This option may only be specified when executing a single HC plugin
|
This option may only be specified when executing a single HC plugin
|
||||||
@ -680,16 +689,34 @@ CMD_LINE="$*"
|
|||||||
[[ -z "${CMD_LINE}" ]] && display_usage && exit 0
|
[[ -z "${CMD_LINE}" ]] && display_usage && exit 0
|
||||||
for CMD_PARAMETER in ${CMD_LINE}
|
for CMD_PARAMETER in ${CMD_LINE}
|
||||||
do
|
do
|
||||||
|
# ARG_ACTION is a toggle, do not allow double toggles
|
||||||
case ${CMD_PARAMETER} in
|
case ${CMD_PARAMETER} in
|
||||||
-archive|--archive)
|
-archive|--archive)
|
||||||
ARG_ACTION=10
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=10
|
||||||
|
fi
|
||||||
|
ARG_LOCK=1
|
||||||
;;
|
;;
|
||||||
-check|--check)
|
-check|--check)
|
||||||
ARG_ACTION=1
|
ARG_ACTION=1
|
||||||
;;
|
;;
|
||||||
-c|-check-host|--check-host)
|
-c|-check-host|--check-host)
|
||||||
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=4
|
||||||
|
fi
|
||||||
ARG_CHECK_HOST=1
|
ARG_CHECK_HOST=1
|
||||||
ARG_ACTION=4
|
|
||||||
;;
|
;;
|
||||||
-config-file=*)
|
-config-file=*)
|
||||||
ARG_CONFIG_FILE="${CMD_PARAMETER#-config-file=}"
|
ARG_CONFIG_FILE="${CMD_PARAMETER#-config-file=}"
|
||||||
@ -712,10 +739,22 @@ do
|
|||||||
ARG_DETAIL=1
|
ARG_DETAIL=1
|
||||||
;;
|
;;
|
||||||
-d|-disable|--disable)
|
-d|-disable|--disable)
|
||||||
ARG_ACTION=2
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=2
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
-disable-all|--disable-all)
|
-disable-all|--disable-all)
|
||||||
ARG_ACTION=6
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=6
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
-display|--display)
|
-display|--display)
|
||||||
# STDOUT as default
|
# STDOUT as default
|
||||||
@ -728,10 +767,22 @@ do
|
|||||||
ARG_DISPLAY="${CMD_PARAMETER#--display=}"
|
ARG_DISPLAY="${CMD_PARAMETER#--display=}"
|
||||||
;;
|
;;
|
||||||
-e|-enable|--enable)
|
-e|-enable|--enable)
|
||||||
ARG_ACTION=3
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=3
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
-enable-all|--enable-all)
|
-enable-all|--enable-all)
|
||||||
ARG_ACTION=7
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=7
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
-f|-fix-symlinks|--fix-symlinks)
|
-f|-fix-symlinks|--fix-symlinks)
|
||||||
read_config
|
read_config
|
||||||
@ -742,6 +793,16 @@ do
|
|||||||
fix_symlinks
|
fix_symlinks
|
||||||
exit 0
|
exit 0
|
||||||
;;
|
;;
|
||||||
|
-fix-logs|--fix-logs)
|
||||||
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=12
|
||||||
|
fi
|
||||||
|
ARG_LOCK=1
|
||||||
|
;;
|
||||||
-flip-rc|--flip-rc)
|
-flip-rc|--flip-rc)
|
||||||
ARG_FLIP_RC=1
|
ARG_FLIP_RC=1
|
||||||
;;
|
;;
|
||||||
@ -770,15 +831,33 @@ do
|
|||||||
ARG_LAST=1
|
ARG_LAST=1
|
||||||
;;
|
;;
|
||||||
-list|--list)
|
-list|--list)
|
||||||
ARG_ACTION=9
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=9
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
-list=*)
|
-list=*)
|
||||||
ARG_LIST="${CMD_PARAMETER#-list=}"
|
ARG_LIST="${CMD_PARAMETER#-list=}"
|
||||||
ARG_ACTION=9
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=9
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
--list=*)
|
--list=*)
|
||||||
ARG_LIST="${CMD_PARAMETER#--list=}"
|
ARG_LIST="${CMD_PARAMETER#--list=}"
|
||||||
ARG_ACTION=9
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=9
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
-list-hc|--list-hc|-list-all|--list-all)
|
-list-hc|--list-hc|-list-all|--list-all)
|
||||||
print -u2 "WARN: deprecated option. Use --list | --list=<needle>"
|
print -u2 "WARN: deprecated option. Use --list | --list=<needle>"
|
||||||
@ -816,34 +895,69 @@ do
|
|||||||
ARG_MONITOR=0
|
ARG_MONITOR=0
|
||||||
;;
|
;;
|
||||||
-report|--report) # compatability support <2017-12-15
|
-report|--report) # compatability support <2017-12-15
|
||||||
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=8
|
||||||
|
fi
|
||||||
# STDOUT as default
|
# STDOUT as default
|
||||||
ARG_REPORT="std"
|
ARG_REPORT="std"
|
||||||
ARG_LOG=0; ARG_VERBOSE=0
|
ARG_LOG=0; ARG_VERBOSE=0
|
||||||
ARG_ACTION=8
|
|
||||||
;;
|
;;
|
||||||
-report=*)
|
-report=*)
|
||||||
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=8
|
||||||
|
fi
|
||||||
ARG_REPORT="${CMD_PARAMETER#-report=}"
|
ARG_REPORT="${CMD_PARAMETER#-report=}"
|
||||||
ARG_LOG=0; ARG_VERBOSE=0
|
ARG_LOG=0; ARG_VERBOSE=0
|
||||||
ARG_ACTION=8
|
|
||||||
;;
|
;;
|
||||||
--report=*)
|
--report=*)
|
||||||
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=8
|
||||||
|
fi
|
||||||
ARG_REPORT="${CMD_PARAMETER#--report=}"
|
ARG_REPORT="${CMD_PARAMETER#--report=}"
|
||||||
ARG_LOG=0; ARG_VERBOSE=0
|
ARG_LOG=0; ARG_VERBOSE=0
|
||||||
ARG_ACTION=8
|
|
||||||
;;
|
;;
|
||||||
-reverse|--reverse)
|
-reverse|--reverse)
|
||||||
ARG_REVERSE=1
|
ARG_REVERSE=1
|
||||||
;;
|
;;
|
||||||
-r|-run|--run)
|
-r|-run|--run)
|
||||||
ARG_ACTION=4
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=4
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
-s|-show|--show)
|
-s|-show|--show)
|
||||||
ARG_ACTION=5
|
if (( ARG_ACTION > 0 ))
|
||||||
ARG_LOG=0
|
then
|
||||||
ARG_VERBOSE=0
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=5
|
||||||
|
fi
|
||||||
|
ARG_LOG=0; ARG_VERBOSE=0
|
||||||
;;
|
;;
|
||||||
-show-stats|--show-stats)
|
-show-stats|--show-stats)
|
||||||
ARG_ACTION=11
|
if (( ARG_ACTION > 0 ))
|
||||||
|
then
|
||||||
|
print -u2 "ERROR: you cannot request two actions at the same time"
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
ARG_ACTION=11
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
-sms-provider=*)
|
-sms-provider=*)
|
||||||
ARG_SMS_PROVIDER="${CMD_PARAMETER#-sms-provider=}"
|
ARG_SMS_PROVIDER="${CMD_PARAMETER#-sms-provider=}"
|
||||||
@ -921,8 +1035,8 @@ fi
|
|||||||
log "*** start of ${SCRIPT_NAME} [${CMD_LINE}] ***"
|
log "*** start of ${SCRIPT_NAME} [${CMD_LINE}] ***"
|
||||||
(( ARG_LOG != 0 )) && log "logging takes places in ${LOG_FILE}"
|
(( ARG_LOG != 0 )) && log "logging takes places in ${LOG_FILE}"
|
||||||
|
|
||||||
# check/create lock file & write PID file (only for --run)
|
# check/create lock file & write PID file (only for --run/--archive/--fix-logs)
|
||||||
(( ARG_ACTION == 4 )) && check_lock_dir
|
# ARG_ACTION values: 4=--run, 10=--archive, 12=--fix-logs (11 is --show-stats)
(( ARG_ACTION == 4 || ARG_ACTION == 10 || ARG_ACTION == 12 )) && check_lock_dir
|
||||||
|
|
||||||
# general HC log
|
# general HC log
|
||||||
HC_LOG="${LOG_DIR}/hc.log"
|
HC_LOG="${LOG_DIR}/hc.log"
|
||||||
@ -1198,6 +1312,23 @@ case ${ARG_ACTION} in
|
|||||||
11) # show HC event statistics
|
11) # show HC event statistics
|
||||||
show_statistics
|
show_statistics
|
||||||
;;
|
;;
|
||||||
|
12)
|
||||||
|
# fix rogue log entries
|
||||||
|
fix_logs
|
||||||
|
FIX_RC=$?
|
||||||
|
case ${FIX_RC} in
|
||||||
|
0)
|
||||||
|
: # feedback via fix_logs()
|
||||||
|
;;
|
||||||
|
1)
|
||||||
|
log "successfully fixed log entries"
|
||||||
|
;;
|
||||||
|
2)
|
||||||
|
log "failed to fix log entries [RC=${FIX_RC}]"
|
||||||
|
EXIT_CODE=1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
# finish up work
|
# finish up work
|
||||||
|
@ -28,12 +28,13 @@
|
|||||||
# DOES: archive log entries for a given HC
|
# DOES: archive log entries for a given HC
|
||||||
# EXPECTS: HC name [string]
|
# EXPECTS: HC name [string]
|
||||||
# RETURNS: 0=no archiving needed; 1=archiving OK; 2=archiving NOK
|
# RETURNS: 0=no archiving needed; 1=archiving OK; 2=archiving NOK
|
||||||
# REQUIRES: n/a
|
# REQUIRES: ${HC_LOG}
|
||||||
function archive_hc
|
function archive_hc
|
||||||
{
|
{
|
||||||
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
|
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
|
||||||
typeset HC_NAME="${1}"
|
typeset HC_NAME="${1}"
|
||||||
typeset ARCHIVE_FILE=""
|
typeset ARCHIVE_FILE=""
|
||||||
|
typeset ARCHIVE_RC=0
|
||||||
typeset YEAR_MONTH=""
|
typeset YEAR_MONTH=""
|
||||||
typeset LOG_COUNT=0
|
typeset LOG_COUNT=0
|
||||||
typeset ARCHIVE_RC=0
|
typeset ARCHIVE_RC=0
|
||||||
@ -46,22 +47,23 @@ trap "rm -f ${TMP1_FILE} ${TMP2_FILE} ${SAVE_LOG_FILE} >/dev/null 2>&1; return 1
|
|||||||
|
|
||||||
# isolate messages from HC, find unique %Y-%m combinations
|
# isolate messages from HC, find unique %Y-%m combinations
|
||||||
grep ".*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} 2>/dev/null |\
|
grep ".*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} 2>/dev/null |\
|
||||||
cut -f1 -d"${LOG_SEP}" | cut -f1 -d' ' | cut -f1-2 -d'-' | sort -u |\
|
cut -f1 -d"${LOG_SEP}" 2>/dev/null | cut -f1 -d' ' 2>/dev/null |\
|
||||||
|
cut -f1-2 -d'-' 2>/dev/null | sort -u 2>/dev/null |\
|
||||||
while read YEAR_MONTH
|
while read YEAR_MONTH
|
||||||
do
|
do
|
||||||
# find all messages for that YEAR-MONTH combination
|
# find all messages for that YEAR-MONTH combination
|
||||||
grep "${YEAR_MONTH}.*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} >${TMP1_FILE}
|
grep "${YEAR_MONTH}.*${LOG_SEP}${HC_NAME}${LOG_SEP}" ${HC_LOG} >${TMP1_FILE}
|
||||||
LOG_COUNT=$(wc -l ${TMP1_FILE} | cut -f1 -d' ')
|
LOG_COUNT=$(wc -l ${TMP1_FILE} 2>/dev/null | cut -f1 -d' ' 2>/dev/null)
|
||||||
log "# of entries in ${YEAR_MONTH} to archive: ${LOG_COUNT}"
|
log "# of entries in ${YEAR_MONTH} to archive: ${LOG_COUNT}"
|
||||||
|
|
||||||
# combine existing archived messages and resort
|
# combine existing archived messages and resort
|
||||||
ARCHIVE_FILE="${ARCHIVE_DIR}/hc.${YEAR_MONTH}.log"
|
ARCHIVE_FILE="${ARCHIVE_DIR}/hc.${YEAR_MONTH}.log"
|
||||||
cat ${ARCHIVE_FILE} ${TMP1_FILE} 2>/dev/null | sort -u >${TMP2_FILE}
|
cat ${ARCHIVE_FILE} ${TMP1_FILE} 2>/dev/null | sort -u >${TMP2_FILE} 2>/dev/null
|
||||||
mv ${TMP2_FILE} ${ARCHIVE_FILE} 2>/dev/null || {
|
mv ${TMP2_FILE} ${ARCHIVE_FILE} 2>/dev/null || {
|
||||||
warn "failed to move archive file, aborting"
|
warn "failed to move archive file, aborting"
|
||||||
return 2
|
return 2
|
||||||
}
|
}
|
||||||
LOG_COUNT=$(wc -l ${ARCHIVE_FILE} | cut -f1 -d' ')
|
LOG_COUNT=$(wc -l ${ARCHIVE_FILE} 2>/dev/null | cut -f1 -d' ' 2>/dev/null)
|
||||||
log "# entries in ${ARCHIVE_FILE} now: ${LOG_COUNT}"
|
log "# entries in ${ARCHIVE_FILE} now: ${LOG_COUNT}"
|
||||||
|
|
||||||
# remove archived messages from the $HC_LOG (but create a backup first!)
|
# remove archived messages from the $HC_LOG (but create a backup first!)
|
||||||
@ -76,7 +78,7 @@ do
|
|||||||
warn "failed to move HC log file, aborting"
|
warn "failed to move HC log file, aborting"
|
||||||
return 2
|
return 2
|
||||||
}
|
}
|
||||||
LOG_COUNT=$(wc -l ${HC_LOG} | cut -f1 -d' ')
|
LOG_COUNT=$(wc -l ${HC_LOG} 2>/dev/null | cut -f1 -d' ' 2>/dev/null )
|
||||||
log "# entries in ${HC_LOG} now: ${LOG_COUNT}"
|
log "# entries in ${HC_LOG} now: ${LOG_COUNT}"
|
||||||
ARCHIVE_RC=1
|
ARCHIVE_RC=1
|
||||||
else
|
else
|
||||||
@ -92,6 +94,29 @@ rm -f ${TMP1_FILE} ${TMP2_FILE} ${SAVE_HC_LOG} >/dev/null 2>&1
|
|||||||
return ${ARCHIVE_RC}
|
return ${ARCHIVE_RC}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
# @(#) FUNCTION: count_log_errors()
# DOES: check a hc log file for rogue entries. Log entries may get scrambled
#       if the append operation in handle_hc() does not happen fully atomically.
#       This means that log entries are written without line separator (same line)
#       There is no proper way to avoid this without an extra file locking utility
# EXPECTS: path to log file to check
# OUTPUTS: number of errors [number]; always a number, 0 when the file cannot
#          be read or when ${NUM_LOG_FIELDS} is not a plain number
# RETURNS: 0
# REQUIRES: ${LOG_SEP}, ${NUM_LOG_FIELDS}
function count_log_errors
{
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
typeset LOG_STASH="${1}"
typeset ERROR_COUNT=0

# without a numeric field limit there is nothing to compare against
case "${NUM_LOG_FIELDS}" in
    ''|*[!0-9]*)
        print 0
        return 0
        ;;
esac

# a line is rogue when it carries more fields than ${NUM_LOG_FIELDS};
# the limit is handed over with -v instead of being spliced into the program
ERROR_COUNT=$({
    awk -F"${LOG_SEP}" -v max_fields="${NUM_LOG_FIELDS}" '
        BEGIN { num = 0 }
        NF > max_fields + 0 { num++ }
        END { print num }' <"${LOG_STASH}"
} 2>/dev/null)

# never hand an empty string to the caller's arithmetic test
print "${ERROR_COUNT:-0}"

return 0
}
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# @(#) FUNCTION: debug()
|
# @(#) FUNCTION: debug()
|
||||||
# DOES: handle debug messages
|
# DOES: handle debug messages
|
||||||
@ -110,7 +135,6 @@ done
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# @(#) FUNCTION: die()
|
# @(#) FUNCTION: die()
|
||||||
# DOES: handle fatal errors and exit script
|
# DOES: handle fatal errors and exit script
|
||||||
@ -528,10 +552,6 @@ if (( DO_REPORT_STD == 0 )) && (( ARG_DETAIL != 0 ))
|
|||||||
then
|
then
|
||||||
die "you cannot specify '--detail' without '--report'"
|
die "you cannot specify '--detail' without '--report'"
|
||||||
fi
|
fi
|
||||||
if (( DO_REPORT_STD == 0 )) && (( ARG_HISTORY != 0 ))
|
|
||||||
then
|
|
||||||
die "you cannot specify '--with-history' without '--report'"
|
|
||||||
fi
|
|
||||||
if (( DO_REPORT_STD == 0 )) && [[ -n "${ARG_FAIL_ID}" ]]
|
if (( DO_REPORT_STD == 0 )) && [[ -n "${ARG_FAIL_ID}" ]]
|
||||||
then
|
then
|
||||||
die "you cannot specify '--id' without '--report'"
|
die "you cannot specify '--id' without '--report'"
|
||||||
@ -604,6 +624,167 @@ done
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
# @(#) FUNCTION: fix_logs()
# DOES: fix hc log file(s) with rogue entries
# EXPECTS: n/a
# REQUIRES: count_log_errors(), log(), warn(), ${ARG_HISTORY}, ${ARCHIVE_DIR},
#           ${HC_LOG}, ${LOG_SEP}, ${NUM_LOG_FIELDS}, ${TMP_DIR}
# RETURNS: 0=no fix needed; 1=fix OK; 2=fix NOK
# NOTE: this routine rewrites the HC log(s). Since we cannot use file locking,
#       some log entries may be lost if the HC is accessing the HC log during
#       the rewrite operation!!
function fix_logs
{
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
typeset FIX_FILE=""
typeset FIX_RC=0
typeset LOG_STASH=""
typeset ERROR_COUNT=0
typeset STASH_COUNT=0
typeset TMP_COUNT=0
typeset SAVE_TMP_FILE="${TMP_DIR}/.$0.save.log.$$"
typeset TMP_FILE="${TMP_DIR}/.$0.tmp.log.$$"

if (( ARG_HISTORY != 0 ))
then
    set +f  # file globbing must be on
    LOG_STASH="${HC_LOG} ${ARCHIVE_DIR}/hc.*.log"
else
    LOG_STASH="${HC_LOG}"
fi

# set local trap for clean-up (removes the rewrite file and the backup copy)
trap "rm -f ${TMP_FILE} ${SAVE_TMP_FILE} >/dev/null 2>&1; return 1" 1 2 3 15

# check and rewrite log file(s)
# ${LOG_STASH} stays unquoted on purpose: it holds a list that may contain a glob
find ${LOG_STASH} -type f -print 2>/dev/null | while read -r FIX_FILE
do
    log "fixing log file ${FIX_FILE} ..."

    # count before rewrite (read via stdin so that only the number is printed;
    # the arithmetic expansion strips padding and turns an empty result into 0)
    STASH_COUNT=$(( $({ wc -l <"${FIX_FILE}"; } 2>/dev/null) + 0 ))

    # does it have errors?
    ERROR_COUNT=$(count_log_errors "${FIX_FILE}")

    # rewrite if needed
    if (( ERROR_COUNT > 0 ))
    then
        awk -F"${LOG_SEP}" -v OFS="${LOG_SEP}" -v num_log_fields="${NUM_LOG_FIELDS}" '

            BEGIN { max_log_fields = num_log_fields + 0
                    max_fields = (max_log_fields - 1) * 2
                    glue_field = max_log_fields - 1
            }

            # Fix log lines that were smashed together because of unatomic appends
            # This can lead to 4 distinct cases that we need to rewrite based on
            # whether a FAIL_ID is present in each part of the log line.
            # Following examples are based on a log file with 5 standard fields:
            #   case 1: NO (FAIL_ID)  + NO (FAIL_ID)   -> 9 fields
            #   case 2: NO (FAIL_ID)  + YES (FAIL_ID)  -> 10 fields
            #   case 3: YES (FAIL_ID) + NO (FAIL_ID)   -> 10 fields
            #   case 4: YES (FAIL_ID) + YES (FAIL_ID)  -> 11 fields

            {
                if (NF > max_log_fields) {
                    # rogue line that needs rewriting
                    if (NF < max_fields) {
                        # case 1
                        for (i=1;i<max_log_fields-1;i++) {
                            printf ("%s%s", $i, OFS)
                        }
                        printf ("\n")
                        # NOTE(review): an empty regex matches any value, so the
                        # else branch below looks unreachable - confirm intent
                        if ($NF ~ //) {
                            for (i=max_log_fields-1;i<NF;i++) {
                                printf ("%s%s", $i, OFS)
                            }
                        } else {
                            for (i=max_log_fields-1;i<=NF;i++) {
                                printf ("%s%s", $i, OFS)
                            }
                        }
                    } else {
                        if ($max_fields == "") {
                            # case 2+3
                            # is the glue field a DATE or FAIL_ID?
                            if ($glue_field ~ /[:-]/) {
                                # it is a DATE (belongs to next line)
                                for (i=1;i<max_log_fields-1;i++) {
                                    printf ("%s%s", $i, OFS)
                                }
                                printf ("\n")
                                for (i=max_log_fields-1;i<NF;i++) {
                                    printf ("%s%s", $i, OFS)
                                }
                            } else {
                                # it is a FAIL_ID (belongs to this line)
                                for (i=1;i<max_log_fields;i++) {
                                    printf ("%s%s", $i, OFS)
                                }
                                printf ("\n")
                                for (i=max_log_fields;i<NF;i++) {
                                    printf ("%s%s", $i, OFS)
                                }
                            }
                        } else {
                            # case 4
                            for (i=1;i<max_log_fields;i++) {
                                printf ("%s%s", $i, OFS)
                            }
                            printf ("\n")
                            for (i=max_log_fields;i<NF;i++) {
                                printf ("%s%s", $i, OFS)
                            }
                        }
                    }
                    printf ("\n")
                } else {
                    # correct log line, no rewrite needed
                    print $0
                }
            }' "${FIX_FILE}" >"${TMP_FILE}" 2>/dev/null

        # count after rewrite
        TMP_COUNT=$(( $({ wc -l <"${TMP_FILE}"; } 2>/dev/null) + 0 ))

        # bail out when we do not have enough records (splitting rogue lines
        # must yield more lines than the original file had)
        if (( TMP_COUNT <= STASH_COUNT ))
        then
            warn "found inconsistent record count (${TMP_COUNT}<${STASH_COUNT}), aborting"
            rm -f "${TMP_FILE}" >/dev/null 2>&1
            return 2
        fi

        # swap log file (but create a backup first!)
        if cp -p "${FIX_FILE}" "${SAVE_TMP_FILE}" 2>/dev/null
        then
            if ! mv "${TMP_FILE}" "${FIX_FILE}" 2>/dev/null
            then
                warn "failed to move/update log file, rolling back"
                mv "${SAVE_TMP_FILE}" "${FIX_FILE}" 2>/dev/null
                rm -f "${TMP_FILE}" >/dev/null 2>&1
                return 2
            fi
            FIX_RC=1
        else
            warn "failed to create a backup of original log file, aborting"
            rm -f "${SAVE_TMP_FILE}" "${TMP_FILE}" >/dev/null 2>&1
            return 2
        fi

        # clean up temporary file(s)
        rm -f "${SAVE_TMP_FILE}" "${TMP_FILE}" >/dev/null 2>&1
    else
        log "no fixing needed for ${FIX_FILE}"
    fi

    ERROR_COUNT=0
done

return ${FIX_RC}
}
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# @(#) FUNCTION: handle_hc()
|
# @(#) FUNCTION: handle_hc()
|
||||||
# DOES: handle HC results
|
# DOES: handle HC results
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
# DOES: report HC events on STDOUT
|
# DOES: report HC events on STDOUT
|
||||||
# EXPECTS: n/a
|
# EXPECTS: n/a
|
||||||
# RETURNS: 0
|
# RETURNS: 0
|
||||||
# REQUIRES: init_hc(), list_hc(), $EVENTS_DIR, $HC_LOG
|
# REQUIRES: count_log_errors(), init_hc(), list_hc(), $EVENTS_DIR, $HC_LOG
|
||||||
#
|
#
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING!
|
# DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING!
|
||||||
@ -30,7 +30,7 @@
|
|||||||
function report_std
|
function report_std
|
||||||
{
|
{
|
||||||
# ------------------------- CONFIGURATION starts here -------------------------
|
# ------------------------- CONFIGURATION starts here -------------------------
|
||||||
typeset _VERSION="2018-04-29" # YYYY-MM-DD
|
typeset _VERSION="2018-05-27" # YYYY-MM-DD
|
||||||
typeset _SUPPORTED_PLATFORMS="AIX,HP-UX,Linux" # uname -s match
|
typeset _SUPPORTED_PLATFORMS="AIX,HP-UX,Linux" # uname -s match
|
||||||
# ------------------------- CONFIGURATION ends here ---------------------------
|
# ------------------------- CONFIGURATION ends here ---------------------------
|
||||||
|
|
||||||
@ -40,11 +40,14 @@ init_hc "$0" "${_SUPPORTED_PLATFORMS}" "${_VERSION}"
|
|||||||
|
|
||||||
typeset _DIR_PREFIX=""
|
typeset _DIR_PREFIX=""
|
||||||
typeset _FAIL_COUNT=0
|
typeset _FAIL_COUNT=0
|
||||||
|
typeset _ERROR_COUNT=0
|
||||||
|
typeset _ERROR_TOTAL_COUNT=0
|
||||||
typeset _HC_LAST=""
|
typeset _HC_LAST=""
|
||||||
typeset _HC_LAST_TIME=""
|
typeset _HC_LAST_TIME=""
|
||||||
typeset _HC_LAST_STC=0
|
typeset _HC_LAST_STC=0
|
||||||
typeset _HC_LAST_FAIL_ID="-"
|
typeset _HC_LAST_FAIL_ID="-"
|
||||||
typeset _ID_NEEDLE=""
|
typeset _ID_NEEDLE=""
|
||||||
|
typeset _CHECK_FILE=""
|
||||||
typeset _LOG_STASH=""
|
typeset _LOG_STASH=""
|
||||||
typeset _REPORT_LINE=""
|
typeset _REPORT_LINE=""
|
||||||
typeset _SORT_CMD=""
|
typeset _SORT_CMD=""
|
||||||
@ -86,7 +89,7 @@ then
|
|||||||
last_fail_id = "-"
|
last_fail_id = "-"
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
if ($1 ~ needle_time && $2 ~ needle_hc) {
|
if (($1 ~ needle_time && $2 ~ needle_hc) && NF <= '"${NUM_LOG_FIELDS}"') {
|
||||||
last_event_stc = $3
|
last_event_stc = $3
|
||||||
last_stc = last_stc + last_event_stc
|
last_stc = last_stc + last_event_stc
|
||||||
last_event_fail_id = $5
|
last_event_fail_id = $5
|
||||||
@ -103,7 +106,7 @@ then
|
|||||||
"${_HC_LAST}" "${_HC_LAST_TIME}" "${_HC_LAST_FAIL_ID}" "${_HC_LAST_STC}"
|
"${_HC_LAST}" "${_HC_LAST_TIME}" "${_HC_LAST_FAIL_ID}" "${_HC_LAST_STC}"
|
||||||
done
|
done
|
||||||
# disclaimer
|
# disclaimer
|
||||||
print "Note: this report only shows the overall combined status of all events of each HC within exactly"
|
print "NOTE: this report only shows the overall combined status of all events of each HC within exactly"
|
||||||
print " the *same* time stamp (seconds precise). It may therefore fail to report certain FAIL IDs."
|
print " the *same* time stamp (seconds precise). It may therefore fail to report certain FAIL IDs."
|
||||||
print " Use '--report' to get the exact list of failure events."
|
print " Use '--report' to get the exact list of failure events."
|
||||||
# other reports
|
# other reports
|
||||||
@ -141,7 +144,7 @@ else
|
|||||||
cat ${_LOG_STASH} 2>/dev/null | ${_SORT_CMD} 2>/dev/null | awk -F"${LOG_SEP}" -v id_needle="${_ID_NEEDLE}" \
|
cat ${_LOG_STASH} 2>/dev/null | ${_SORT_CMD} 2>/dev/null | awk -F"${LOG_SEP}" -v id_needle="${_ID_NEEDLE}" \
|
||||||
'
|
'
|
||||||
{
|
{
|
||||||
if ($5 ~ id_needle) {
|
if ($5 ~ id_needle && NF <= '"${NUM_LOG_FIELDS}"') {
|
||||||
printf ("| %-20s | %-14s | %-30s | %-s\n", $1, $5, $2, $4)
|
printf ("| %-20s | %-14s | %-30s | %-s\n", $1, $5, $2, $4)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -157,7 +160,7 @@ else
|
|||||||
dashes = sprintf("%36s",""); gsub (/ /, "-", dashes);
|
dashes = sprintf("%36s",""); gsub (/ /, "-", dashes);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
if ($5 ~ id_needle) {
|
if ($5 ~ id_needle && NF <= '"${NUM_LOG_FIELDS}"') {
|
||||||
printf ("%36sMSG #%03d%36s", dashes, event_count, dashes)
|
printf ("%36sMSG #%03d%36s", dashes, event_count, dashes)
|
||||||
printf ("\nTime : %-s\nHC : %-s\nDetail : %-s\n", $1, $2, $4)
|
printf ("\nTime : %-s\nHC : %-s\nDetail : %-s\n", $1, $2, $4)
|
||||||
event_count++
|
event_count++
|
||||||
@ -191,6 +194,19 @@ else
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# check consistency of log(s)
|
||||||
|
find ${_LOG_STASH} -type f -print 2>/dev/null | while read _CHECK_FILE
|
||||||
|
do
|
||||||
|
_ERROR_COUNT=$(count_log_errors ${_CHECK_FILE})
|
||||||
|
if (( _ERROR_COUNT > 0 ))
|
||||||
|
then
|
||||||
|
print "NOTE: found ${_ERROR_COUNT} rogue entr(y|ies) in log file ${_CHECK_FILE}"
|
||||||
|
_ERROR_TOTAL_COUNT=$(( _ERROR_TOTAL_COUNT + _ERROR_COUNT ))
|
||||||
|
fi
|
||||||
|
_ERROR_COUNT=0
|
||||||
|
done
|
||||||
|
(( _ERROR_TOTAL_COUNT > 0 )) && print "NOTE: fix log errors with ${SCRIPT_NAME} --fix-logs [--with-history]"
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user