Added possibility to do check based on an hourly schedule

This commit is contained in:
Patrick Van der Veken 2020-03-04 10:15:48 +01:00
parent 7d575cab86
commit daf50c64cb
3 changed files with 133 additions and 51 deletions

View File

@ -35,14 +35,18 @@ max_replication_lag=300
# entry must be present: either a wildcard or custom entry. # entry must be present: either a wildcard or custom entry.
# Caveat: any share must finally resolve to one entry only. # Caveat: any share must finally resolve to one entry only.
# Format: # Format:
# zfs:<host_name>:<replication_name|*>:<true|false|*>:<success|failed|*>:[<max_replication_lag>]:[day1,day2,..|*>] # zfs:<host_name>:<replication_name|*>:<true|false|*>:<success|failed|*>:[<max_replication_lag>]:[<day1,day2,..|*>]:[<hour|start_hour-end_hour|*>]
#
# <day>: 3 letter day name (case insensitive)
# <hour>: 24 hours notation (start and end hours are inclusive)
#
# Examples: # Examples:
# check rep_share1 on myzfs1 with a custom threshold of 300 seconds on every day of the week # check rep_share1 on myzfs1 with a custom threshold of 300 seconds on every day of the week
# zfs:myzfs1:rep_share1:*:*:600:* # zfs:myzfs1:rep_share1:*:*:600:*
# check all shares of myzfs2 with a custom threshold of 1200 seconds on Sunday and Monday # check all shares of myzfs2 with a custom threshold of 1200 seconds on Sunday and Monday
# zfs:myzfs2:*:*:*:1200:Sun,Mon # zfs:myzfs2:*:*:*:1200:Sun,Mon
# check all shares of myzfs3 with the general threshold but only on Friday # check all shares of myzfs3 with the general threshold but only on Friday between 7am-10am
# zfs:myzfs3:*:*:*:Fri # zfs:myzfs3:*:*:*::Fri:07-10
# disable all shares of myzfs4 from checking # disable all shares of myzfs4 from checking
# zfs:myzfs4:*:*:*:0:* # zfs:myzfs4:*:*:*:0:*
# disable check of rep_share7 on myzfs5 # disable check of rep_share7 on myzfs5

View File

@ -30,7 +30,7 @@
# RETURNS: 0 # RETURNS: 0
function version_include_data function version_include_data
{ {
typeset _VERSION="2020-01-27" # YYYY-MM-DD typeset _VERSION="2020-03-04" # YYYY-MM-DD
print "INFO: $0: ${_VERSION#version_*}" print "INFO: $0: ${_VERSION#version_*}"
@ -613,6 +613,64 @@ esac
return 0 return 0
} }
# -----------------------------------------------------------------------------
# @(#) FUNCTION: data_expand_numerical_range()
# DOES: expand numerical range (X-Y) to comma-separated list of numbers. Both
#       X and Y are part of the list (07-10 becomes 7,8,9,10). Leading zeros
#       are dropped from expanded values. A value without a range operator
#       (-) is returned as-is.
# EXPECTS: $1=range (X-Y) or single value [string]
#          X and Y must be non-negative integers and X must be smaller than Y
# OUTPUTS: expanded list or the unchanged value on STDOUT [string]
# RETURNS: 0=no error occurred; <>0=some error occurred (nothing is output)
# REQUIRES: n/a
function data_expand_numerical_range
{
(( ARG_DEBUG > 0 && ARG_DEBUG_LEVEL > 0 )) && set "${DEBUG_OPTS}"
typeset _NUM_LIST=""
typeset _RANGE_START=""
typeset _RANGE_END=""

case "${1}" in
    *-*-*)
        # only 2 operands (fields) are allowed
        (( ARG_DEBUG > 0 )) && debug "in range $1 found more than one range (-) operator"
        return 1
        ;;
    *-*)
        # range operator, expand
        _RANGE_START="${1%%-*}"
        _RANGE_END="${1#*-}"

        # operands must be plain integers: awk would otherwise compare them
        # as strings and the expansion below might never terminate
        case "${_RANGE_START}" in
            ""|*[!0-9]*)
                (( ARG_DEBUG > 0 )) && debug "in range $1 operator X is not a number"
                return 1
                ;;
        esac
        case "${_RANGE_END}" in
            ""|*[!0-9]*)
                (( ARG_DEBUG > 0 )) && debug "in range $1 operator Y is not a number"
                return 1
                ;;
        esac

        # check if X < Y and expand the list (Y inclusive) in one go;
        # adding 0 forces a decimal numeric value (07 -> 7)
        _NUM_LIST=$(awk -v range_start="${_RANGE_START}" -v range_end="${_RANGE_END}" '
            BEGIN {
                first = range_start + 0;
                last  = range_end + 0;
                if (first >= last) {
                    exit 1;
                }
                num_list = sprintf ("%d", first);
                for (num = first + 1; num <= last; num++) {
                    num_list = sprintf ("%s,%d", num_list, num);
                }
                print num_list;
            }' 2>/dev/null)
        # shellcheck disable=SC2181
        if (( $? > 0 ))
        then
            (( ARG_DEBUG > 0 )) && debug "in range $1 operator Y is smaller or equal to operator X"
            return 1
        fi
        if [[ -z "${_NUM_LIST}" ]]
        then
            (( ARG_DEBUG > 0 )) && debug "range conversion returned empty list"
            return 1
        fi
        print "${_NUM_LIST}"
        ;;
    *)
        # no range, return as-is
        print "${1}"
        ;;
esac

return 0
}
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# @(#) FUNCTION: data_encode_url # @(#) FUNCTION: data_encode_url
# DOES: encode URL data # DOES: encode URL data

View File

@ -19,9 +19,9 @@
# @(#) MAIN: check_exadata_zfs_share_replication # @(#) MAIN: check_exadata_zfs_share_replication
# DOES: see _show_usage() # DOES: see _show_usage()
# EXPECTS: see _show_usage() # EXPECTS: see _show_usage()
# REQUIRES: data_comma2space(), data_contains_string(), data_get_lvalue_from_config(), # REQUIRES: data_comma2space(), data_contains_string(), data_expand_numerical_range(),
# data_has_newline(), dump_logs(), init_hc(), linux_exec_ssh(), # data_get_lvalue_from_config(), data_has_newline(), data_is_numeric(),
# log_hc(), warn() # dump_logs(), init_hc(), linux_exec_ssh(), log_hc(), warn()
# #
# @(#) HISTORY: # @(#) HISTORY:
# @(#) 2019-02-18: initial version [Patrick Van der Veken] # @(#) 2019-02-18: initial version [Patrick Van der Veken]
@ -31,6 +31,7 @@
# @(#) 2019-05-14: small fixes [Patrick Van der Veken] # @(#) 2019-05-14: small fixes [Patrick Van der Veken]
# @(#) 2020-01-27: addition of day check option + # @(#) 2020-01-27: addition of day check option +
# @(#) newline config value check [Patrick Van der Veken] # @(#) newline config value check [Patrick Van der Veken]
# @(#) 2020-03-04: addition of hour check option [Patrick Van der Veken]
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING! # DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING!
#****************************************************************************** #******************************************************************************
@ -40,7 +41,7 @@ function check_exadata_zfs_share_replication
{ {
# ------------------------- CONFIGURATION starts here ------------------------- # ------------------------- CONFIGURATION starts here -------------------------
typeset _CONFIG_FILE="${CONFIG_DIR}/$0.conf" typeset _CONFIG_FILE="${CONFIG_DIR}/$0.conf"
typeset _VERSION="2020-01-27" # YYYY-MM-DD typeset _VERSION="2020-03-04" # YYYY-MM-DD
typeset _SUPPORTED_PLATFORMS="Linux" # uname -s match typeset _SUPPORTED_PLATFORMS="Linux" # uname -s match
# replication query script -- DO NOT CHANGE -- # replication query script -- DO NOT CHANGE --
# prj1/share1:true:idle:success:111 # prj1/share1:true:idle:success:111
@ -78,13 +79,16 @@ typeset _CFG_ZFS_HOSTS=""
typeset _CFG_ZFS_HOST="" typeset _CFG_ZFS_HOST=""
typeset _CFG_ZFS_LINE="" typeset _CFG_ZFS_LINE=""
typeset _CFG_REPLICATION_DAYS="" typeset _CFG_REPLICATION_DAYS=""
typeset _CFG_REPLICATION_HOURS=""
typeset _REPLICATION_ENABLED="" typeset _REPLICATION_ENABLED=""
typeset _REPLICATION_HOURS=""
typeset _REPLICATION_LAG="" typeset _REPLICATION_LAG=""
typeset _REPLICATION_RESULT="" typeset _REPLICATION_RESULT=""
typeset _SSH_BIN="" typeset _SSH_BIN=""
typeset _SSH_OUTPUT="" typeset _SSH_OUTPUT=""
typeset _ZFS_DATA="" typeset _ZFS_DATA=""
typeset _WEEKDAY=$(data_lc "$(date '+%a' 2>/dev/null)") # Sun typeset _WEEKDAY=$(data_lc "$(date '+%a' 2>/dev/null)") # Sun
typeset _HOUR=$(data_strip_space "$(date '+%k' 2>/dev/null)") # 7,23 etc
# handle arguments (originally comma-separated) # handle arguments (originally comma-separated)
for _ARG in ${_ARGS} for _ARG in ${_ARGS}
@ -215,6 +219,7 @@ do
_CFG_REPLICATION_RESULT="" _CFG_REPLICATION_RESULT=""
_CFG_REPLICATION_LAG="" _CFG_REPLICATION_LAG=""
_CFG_REPLICATION_DAYS="" _CFG_REPLICATION_DAYS=""
_CFG_REPLICATION_HOURS=""
# which values to use (general or custom?), keep in mind wildcards (custom will overrule wildcard entry) # which values to use (general or custom?), keep in mind wildcards (custom will overrule wildcard entry)
_CFG_ZFS_LINE=$(grep -E -e "^zfs:${_ZFS_HOST}:[*]:" ${_CONFIG_FILE} 2>/dev/null) _CFG_ZFS_LINE=$(grep -E -e "^zfs:${_ZFS_HOST}:[*]:" ${_CONFIG_FILE} 2>/dev/null)
@ -225,6 +230,7 @@ do
_CFG_REPLICATION_RESULT=$(print "${_CFG_ZFS_LINE}" | cut -f5 -d':' 2>/dev/null) _CFG_REPLICATION_RESULT=$(print "${_CFG_ZFS_LINE}" | cut -f5 -d':' 2>/dev/null)
_CFG_REPLICATION_LAG=$(print "${_CFG_ZFS_LINE}" | cut -f6 -d':' 2>/dev/null) _CFG_REPLICATION_LAG=$(print "${_CFG_ZFS_LINE}" | cut -f6 -d':' 2>/dev/null)
_CFG_REPLICATION_DAYS=$(print "${_CFG_ZFS_LINE}" | cut -f7 -d':' 2>/dev/null) _CFG_REPLICATION_DAYS=$(print "${_CFG_ZFS_LINE}" | cut -f7 -d':' 2>/dev/null)
_CFG_REPLICATION_HOURS=$(print "${_CFG_ZFS_LINE}" | cut -f8 -d':' 2>/dev/null)
# null value means general threshold # null value means general threshold
if [[ -z "${_CFG_REPLICATION_LAG}" ]] if [[ -z "${_CFG_REPLICATION_LAG}" ]]
then then
@ -247,6 +253,7 @@ do
_CFG_REPLICATION_RESULT=$(print "${_CFG_ZFS_LINE}" | cut -f5 -d':' 2>/dev/null) _CFG_REPLICATION_RESULT=$(print "${_CFG_ZFS_LINE}" | cut -f5 -d':' 2>/dev/null)
_CFG_REPLICATION_LAG=$(print "${_CFG_ZFS_LINE}" | cut -f6 -d':' 2>/dev/null) _CFG_REPLICATION_LAG=$(print "${_CFG_ZFS_LINE}" | cut -f6 -d':' 2>/dev/null)
_CFG_REPLICATION_DAYS=$(print "${_CFG_ZFS_LINE}" | cut -f7 -d':' 2>/dev/null) _CFG_REPLICATION_DAYS=$(print "${_CFG_ZFS_LINE}" | cut -f7 -d':' 2>/dev/null)
_CFG_REPLICATION_HOURS=$(print "${_CFG_ZFS_LINE}" | cut -f8 -d':' 2>/dev/null)
# null value means general threshold # null value means general threshold
if [[ -z "${_CFG_REPLICATION_LAG}" ]] if [[ -z "${_CFG_REPLICATION_LAG}" ]]
then then
@ -278,63 +285,76 @@ do
[[ -z "${_CFG_REPLICATION_RESULT}" || "${_CFG_REPLICATION_RESULT}" = '*' ]] && _CFG_REPLICATION_RESULT="success" [[ -z "${_CFG_REPLICATION_RESULT}" || "${_CFG_REPLICATION_RESULT}" = '*' ]] && _CFG_REPLICATION_RESULT="success"
_CFG_REPLICATION_DAYS=$(data_lc "${_CFG_REPLICATION_DAYS}") _CFG_REPLICATION_DAYS=$(data_lc "${_CFG_REPLICATION_DAYS}")
[[ -z "${_CFG_REPLICATION_DAYS}" || "${_CFG_REPLICATION_DAYS}" = '*' ]] && _CFG_REPLICATION_DAYS="${_WEEKDAY}" [[ -z "${_CFG_REPLICATION_DAYS}" || "${_CFG_REPLICATION_DAYS}" = '*' ]] && _CFG_REPLICATION_DAYS="${_WEEKDAY}"
if [[ -z "${_CFG_REPLICATION_HOURS}" || "${_CFG_REPLICATION_HOURS}" = '*' ]]
then
_REPLICATION_HOURS="${_HOUR}"
else
_REPLICATION_HOURS=$(data_expand_numerical_range "${_CFG_REPLICATION_HOURS}")
fi
# perform checks # perform checks
# do we need to perform the check today? # do we need to perform the check today?
data_contains_string "${_CFG_REPLICATION_DAYS}" "${_WEEKDAY}" data_contains_string "${_CFG_REPLICATION_DAYS}" "${_WEEKDAY}"
if (( $? > 0 )) if (( $? > 0 ))
then then
# check replication enabled state (active or not?) # do we need to perform the check this hour?
if [[ $(data_lc "${_REPLICATION_ENABLED}") != $(data_lc "${_CFG_REPLICATION_ENABLED}") ]] data_contains_string "${_REPLICATION_HOURS}" "${_HOUR}"
then
_MSG="state for ${_ZFS_HOST}:${_REPLICATION_NAME} is NOK [${_REPLICATION_ENABLED}!=${_CFG_REPLICATION_ENABLED}]"
_STC=1
else
_MSG="state for ${_ZFS_HOST}:${_REPLICATION_NAME} is OK [${_REPLICATION_ENABLED}==${_CFG_REPLICATION_ENABLED}]"
_STC=0
fi
if (( _LOG_HEALTHY > 0 || _STC > 0 ))
then
log_hc "$0" ${_STC} "${_MSG}" "${_REPLICATION_ENABLED}" "${_CFG_REPLICATION_ENABLED}"
fi
# check replication last result (success or not?)
if [[ $(data_lc "${_REPLICATION_RESULT}") != $(data_lc "${_CFG_REPLICATION_RESULT}") ]]
then
_MSG="result for ${_ZFS_HOST}:${_REPLICATION_NAME} is NOK [${_REPLICATION_RESULT}!=${_CFG_REPLICATION_RESULT}]"
_STC=1
else
_MSG="result for ${_ZFS_HOST}:${_REPLICATION_NAME} is OK [${_REPLICATION_RESULT}==${_CFG_REPLICATION_RESULT}]"
_STC=0
fi
if (( _LOG_HEALTHY > 0 || _STC > 0 ))
then
log_hc "$0" ${_STC} "${_MSG}" "${_REPLICATION_RESULT}" "${_CFG_REPLICATION_RESULT}"
fi
# check replication lag
# caveat: replication lag is <unknown> at initial replication
data_contains_string "${_REPLICATION_LAG}" "unknown"
# shellcheck disable=SC2181
if (( $? > 0 )) if (( $? > 0 ))
then then
_MSG="lag for ${_ZFS_HOST}:${_REPLICATION_NAME} is unknown" # check replication enabled state (active or not?)
_REPLICATION_LAG=-1 if [[ $(data_lc "${_REPLICATION_ENABLED}") != $(data_lc "${_CFG_REPLICATION_ENABLED}") ]]
_STC=1
else
if (( _REPLICATION_LAG > _CFG_REPLICATION_LAG ))
then then
_MSG="lag for ${_ZFS_HOST}:${_REPLICATION_NAME} is too big [${_REPLICATION_LAG}>${_CFG_REPLICATION_LAG}]" _MSG="state for ${_ZFS_HOST}:${_REPLICATION_NAME} is NOK [${_REPLICATION_ENABLED}!=${_CFG_REPLICATION_ENABLED}]"
_STC=1 _STC=1
else else
_MSG="lag for ${_ZFS_HOST}:${_REPLICATION_NAME} is OK [${_REPLICATION_LAG}<=${_CFG_REPLICATION_LAG}]" _MSG="state for ${_ZFS_HOST}:${_REPLICATION_NAME} is OK [${_REPLICATION_ENABLED}==${_CFG_REPLICATION_ENABLED}]"
_STC=0 _STC=0
fi fi
fi if (( _LOG_HEALTHY > 0 || _STC > 0 ))
if (( _LOG_HEALTHY > 0 || _STC > 0 )) then
then log_hc "$0" ${_STC} "${_MSG}" "${_REPLICATION_ENABLED}" "${_CFG_REPLICATION_ENABLED}"
log_hc "$0" ${_STC} "${_MSG}" "${_REPLICATION_LAG}" "${_CFG_REPLICATION_LAG}" fi
# check replication last result (success or not?)
if [[ $(data_lc "${_REPLICATION_RESULT}") != $(data_lc "${_CFG_REPLICATION_RESULT}") ]]
then
_MSG="result for ${_ZFS_HOST}:${_REPLICATION_NAME} is NOK [${_REPLICATION_RESULT}!=${_CFG_REPLICATION_RESULT}]"
_STC=1
else
_MSG="result for ${_ZFS_HOST}:${_REPLICATION_NAME} is OK [${_REPLICATION_RESULT}==${_CFG_REPLICATION_RESULT}]"
_STC=0
fi
if (( _LOG_HEALTHY > 0 || _STC > 0 ))
then
log_hc "$0" ${_STC} "${_MSG}" "${_REPLICATION_RESULT}" "${_CFG_REPLICATION_RESULT}"
fi
# check replication lag
# caveat: replication lag is <unknown> at initial replication
data_contains_string "${_REPLICATION_LAG}" "unknown"
# shellcheck disable=SC2181
if (( $? > 0 ))
then
_MSG="lag for ${_ZFS_HOST}:${_REPLICATION_NAME} is unknown"
_REPLICATION_LAG=-1
_STC=1
else
if (( _REPLICATION_LAG > _CFG_REPLICATION_LAG ))
then
_MSG="lag for ${_ZFS_HOST}:${_REPLICATION_NAME} is too big [${_REPLICATION_LAG}>${_CFG_REPLICATION_LAG}]"
_STC=1
else
_MSG="lag for ${_ZFS_HOST}:${_REPLICATION_NAME} is OK [${_REPLICATION_LAG}<=${_CFG_REPLICATION_LAG}]"
_STC=0
fi
fi
if (( _LOG_HEALTHY > 0 || _STC > 0 ))
then
log_hc "$0" ${_STC} "${_MSG}" "${_REPLICATION_LAG}" "${_CFG_REPLICATION_LAG}"
fi
else
warn "check of ${_ZFS_HOST}:${_REPLICATION_NAME} is not configured for this hour/these hours: ${_REPLICATION_HOURS}"
fi fi
else else
warn "check of ${_ZFS_HOST}:${_REPLICATION_NAME} was not configured for today" warn "check of ${_ZFS_HOST}:${_REPLICATION_NAME} is not configured for today"
fi fi
done done
@ -353,7 +373,7 @@ CONFIG : $3 with parameters:
ssh_key_file=<ssh_private_key_file> ssh_key_file=<ssh_private_key_file>
max_replication_lag=<general_max_replication> max_replication_lag=<general_max_replication>
and formatted stanzas of: and formatted stanzas of:
zfs:<host_name>:<replication_name>:<replication_enabled>:<replication_result>:<max_replication_lag>:<day1,day2> zfs:<host_name>:<replication_name>:<replication_enabled>:<replication_result>:<max_replication_lag>:<day1,day2>:<start_hour>-<end_hour>
PURPOSE : Checks the replication state, sync status and maximum lag of the configured ZFS hosts/shares on certain days PURPOSE : Checks the replication state, sync status and maximum lag of the configured ZFS hosts/shares on certain days
CLI: zfs > shares > replications > packages > select (action) > show CLI: zfs > shares > replications > packages > select (action) > show
LOG HEALTHY : Supported LOG HEALTHY : Supported