diff --git a/README.md b/README.md index dcc526a..ebea4cd 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Health checker for UNIX/Linux is a small framework of monitoring scripts. It is ## Build -Use the build templates & scripts in the `build/ directory to roll your own packages. +Use the build templates & scripts in the `build/` directory to roll your own packages. ## Installation diff --git a/build/linux/SPECS/hc-linux-platform.spec b/build/linux/SPECS/hc-linux-platform.spec index 46cac65..82eb185 100644 --- a/build/linux/SPECS/hc-linux-platform.spec +++ b/build/linux/SPECS/hc-linux-platform.spec @@ -44,6 +44,7 @@ cp ../SOURCES/lib/platform/linux/check_linux_hpasmcli.sh $RPM_BUILD_ROOT/opt/hc/ cp ../SOURCES/lib/platform/linux/check_linux_hpacucli.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_hpacucli.sh cp ../SOURCES/lib/platform/linux/check_linux_hplog.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_hplog.sh cp ../SOURCES/lib/platform/linux/check_linux_hpssacli.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_hpssacli.sh +cp ../SOURCES/lib/platform/linux/check_linux_process_limits.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_process_limits.sh cp ../SOURCES/lib/platform/linux/check_linux_root_crontab.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_root_crontab.sh cp ../SOURCES/lib/platform/linux/check_linux_sg_cluster_config.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_sg_cluster_config.sh cp ../SOURCES/lib/platform/linux/check_linux_sg_cluster_status.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_sg_cluster_status.sh @@ -60,6 +61,7 @@ cp ../SOURCES/etc/check_linux_hpacucli.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/chec cp ../SOURCES/etc/check_linux_hplog.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_hplog.conf.dist cp ../SOURCES/etc/check_linux_hpssacli.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_hpssacli.conf.dist cp ../SOURCES/etc/check_linux_ntp_status.conf.dist 
$RPM_BUILD_ROOT/etc/opt/hc/check_linux_ntp_status.conf.dist +cp ../SOURCES/etc/check_linux_process_limits.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_process_limits.conf.dist cp ../SOURCES/etc/check_linux_root_crontab.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_root_crontab.conf.dist cp ../SOURCES/etc/check_linux_sg_cluster_config.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_sg_cluster_config.conf.dist cp ../SOURCES/etc/check_linux_sg_cluster_status.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_sg_cluster_status.conf.dist @@ -123,6 +125,7 @@ echo "INFO: finished post-uninstall script" %attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_hpacucli.sh %attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_hplog.sh %attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_hpssacli.sh +%attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_process_limits.sh %attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_root_crontab.sh %attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_sg_cluster_config.sh %attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_sg_cluster_status.sh @@ -139,6 +142,7 @@ echo "INFO: finished post-uninstall script" %attr(644, root, root) /etc/opt/hc/check_linux_hplog.conf.dist %attr(644, root, root) /etc/opt/hc/check_linux_hpssacli.conf.dist %attr(644, root, root) /etc/opt/hc/check_linux_ntp_status.conf.dist +%attr(644, root, root) /etc/opt/hc/check_linux_process_limits.conf.dist %attr(644, root, root) /etc/opt/hc/check_linux_root_crontab.conf.dist %attr(644, root, root) /etc/opt/hc/check_linux_sg_cluster_config.conf.dist %attr(644, root, root) /etc/opt/hc/check_linux_sg_cluster_status.conf.dist @@ -151,6 +155,8 @@ echo "INFO: finished post-uninstall script" %attr(644, root, root) /etc/opt/hc/core/templates/mail_body.tpl-check_linux_root_crontab %changelog +* Tue Jul 10 2018 - 0.1.1 +- Added check_linux_process_limits * Sat Apr 21 2018 - 0.1.0 - Added check_linux_ntp_status * Thu 
May 18 2017 - 0.0.9 diff --git a/build/linux/SPECS/hc-linux.spec b/build/linux/SPECS/hc-linux.spec index 0dd9e91..d14f2ec 100644 --- a/build/linux/SPECS/hc-linux.spec +++ b/build/linux/SPECS/hc-linux.spec @@ -5,7 +5,7 @@ Version: %{build_timestamp} Release: 1 Summary: The KUDOS Health Checker (HC) for UNIX -Group: Tools/MonitoringGroup: Tools/Monitoring +Group: Tools/Monitoring License: GNU General Public License either version 2 of the License, or (at your option) any later version URL: http://www.kudos.be diff --git a/configs/etc/check_linux_process_limits.conf.dist b/configs/etc/check_linux_process_limits.conf.dist new file mode 100644 index 0000000..754d5df --- /dev/null +++ b/configs/etc/check_linux_process_limits.conf.dist @@ -0,0 +1,31 @@ +#****************************************************************************** +# @(#) check_linux_process_limits.conf +#****************************************************************************** +# This is a configuration file for the check_linux_process_limits HC plugin. +# All lines starting with a '#' are comment lines. 
+# [default: indicates hardcoded script values if no value is defined here]
+#******************************************************************************
+
+# specify whether to also log passed health checks
+# (warning: this may rapidly grow the HC log)
+# [default: no]
+log_healthy="no"
+
+# -- user stanzas --
+# format:
+#  user;<user_name>;<limit_name>;<soft_threshold_%>;<hard_threshold_%>
+# note: <limit_name> must exactly match the descriptor in /proc/<PID>/limits
+#       thresholds are optional (will not be checked when not specified)
+user;postfix;Max open files;;75
+
+# -- process stanzas --
+# format:
+#  process;<process_name>;<limit_name>;<soft_threshold_%>;<hard_threshold_%>
+# note: <limit_name> must exactly match the descriptor in /proc/<PID>/limits
+#       thresholds are optional (will not be checked when not specified)
+process;nfsd;Max open files;100;80
+
+
+#******************************************************************************
+# End of FILE
+#******************************************************************************
diff --git a/sources/lib/platform/linux/check_linux_process_limits.sh b/sources/lib/platform/linux/check_linux_process_limits.sh
new file mode 100644
index 0000000..abbd250
--- /dev/null
+++ b/sources/lib/platform/linux/check_linux_process_limits.sh
@@ -0,0 +1,390 @@
+#!/usr/bin/env ksh
+#******************************************************************************
+# @(#) check_linux_process_limits.sh
+#******************************************************************************
+# @(#) Copyright (C) 2016 by KUDOS BVBA (info@kudos.be).  All rights reserved.
+#
+# This program is a free software; you can redistribute it and/or modify
+# it under the same terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
+#******************************************************************************
+#
+# DOCUMENTATION (MAIN)
+# -----------------------------------------------------------------------------
+# @(#) MAIN: check_linux_process_limits
+# DOES: see _show_usage()
+# EXPECTS: see _show_usage()
+# REQUIRES: data_get_lvalue_from_config(), data_is_numeric(),
+#           data_space2comma(), debug(), init_hc(), log(), log_hc(), warn()
+#
+# @(#) HISTORY:
+# @(#) 2018-07-10: original version [Patrick Van der Veken]
+# -----------------------------------------------------------------------------
+# DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING!
+#******************************************************************************
+
+# -----------------------------------------------------------------------------
+# Plugin entry point: checks, for every 'process' and 'user' stanza in the
+# configuration file, the current usage of each matching process against a
+# percentage of its soft/hard limit in /proc/<PID>/limits.
+# EXPECTS: optional plugin arguments ('help' shows the usage)
+# RETURNS: 0 when all stanzas were processed; 1 when the configuration file
+#          is unreadable or holds an invalid stanza
+function check_linux_process_limits
+{
+# ------------------------- CONFIGURATION starts here -------------------------
+typeset _CONFIG_FILE="${CONFIG_DIR}/$0.conf"
+typeset _VERSION="2018-07-10"                               # YYYY-MM-DD
+typeset _SUPPORTED_PLATFORMS="Linux"                        # uname -s match
+# ------------------------- CONFIGURATION ends here ---------------------------
+
+# set defaults
+(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set ${DEBUG_OPTS}
+init_hc "$0" "${_SUPPORTED_PLATFORMS}" "${_VERSION}"
+typeset _ARGS=$(data_space2comma "$*")
+typeset _ARG=""
+typeset _MSG=""
+typeset _STC=0
+typeset _DUMMY=""
+typeset _LINE_COUNT=0
+typeset _LOG_HEALTHY=0
+typeset _OPEN_FILES=0
+typeset _PROCESS=""
+typeset _PROCESS_LIMIT=""
+typeset _PROCESS_SOFT_THRESHOLD=0
+typeset _PROCESS_HARD_THRESHOLD=0
+typeset _PROCESS_PS=""
+typeset _PROCESS_PS_PID=""
+typeset _PROCESS_PS_USER=""
+typeset _USER=""
+typeset _USER_LIMIT=""
+typeset _USER_SOFT_THRESHOLD=0
+typeset _USER_HARD_THRESHOLD=0
+typeset _USER_PS=""
+typeset _USER_PS_PID=""
+typeset _USER_PS_COMM=""
+
+# set local trap for cleanup
+trap "rm -f ${_INSTANCE_RUN_FILE}.* >/dev/null 2>&1; return 1" 1 2 3 15
+
+# handle arguments (originally comma-separated)
+for _ARG in ${_ARGS}
+do
+    case "${_ARG}" in
+        help)
+            _show_usage "$0" "${_VERSION}" "${_CONFIG_FILE}" && return 0
+            ;;
+    esac
+done
+
+# handle configuration file
+[[ -n "${ARG_CONFIG_FILE}" ]] && _CONFIG_FILE="${ARG_CONFIG_FILE}"
+if [[ ! -r "${_CONFIG_FILE}" ]]
+then
+    warn "unable to read configuration file at ${_CONFIG_FILE}"
+    return 1
+fi
+# read required configuration values
+_LOG_HEALTHY=$(_CONFIG_FILE="${_CONFIG_FILE}" data_get_lvalue_from_config 'log_healthy')
+case "${_LOG_HEALTHY}" in
+    no|NO|No)
+        _LOG_HEALTHY=0
+        log "not logging/showing passed health checks"
+        ;;
+    *)
+        _LOG_HEALTHY=1
+        if (( ARG_LOG > 0 ))
+        then
+            log "logging/showing passed health checks"
+        else
+            log "showing passed health checks (but not logging)"
+        fi
+        ;;
+esac
+
+# check PROCESS stanzas
+grep -i '^process' "${_CONFIG_FILE}" 2>/dev/null |\
+    while IFS=';' read -r _DUMMY _PROCESS _PROCESS_LIMIT _PROCESS_SOFT_THRESHOLD _PROCESS_HARD_THRESHOLD
+do
+    # count the stanza first so that a 'continue' below cannot skip it
+    _LINE_COUNT=$(( _LINE_COUNT + 1 ))
+
+    # check for empties
+    if [[ -z "${_PROCESS}" || -z "${_PROCESS_LIMIT}" ]]
+    then
+        warn "missing parameter in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
+        return 1
+    fi
+    if [[ -n "${_PROCESS_SOFT_THRESHOLD}" ]]
+    then
+        $(data_is_numeric "${_PROCESS_SOFT_THRESHOLD}")
+        if (( $? > 0 ))
+        then
+            warn "parameter is not numeric in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
+            return 1
+        fi
+    fi
+    if [[ -n "${_PROCESS_HARD_THRESHOLD}" ]]
+    then
+        $(data_is_numeric "${_PROCESS_HARD_THRESHOLD}")
+        if (( $? > 0 ))
+        then
+            warn "parameter is not numeric in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
+            return 1
+        fi
+    fi
+
+    # collect ps info
+    (( ARG_DEBUG != 0 )) && debug "collecting information for process class ${_PROCESS}"
+    _PROCESS_PS=$(_get_psinfo_by_process "${_PROCESS}")
+    if [[ -z "${_PROCESS_PS}" ]]
+    then
+        warn "could not find any matching processes for process ${_PROCESS}"
+        continue
+    fi
+    print -r -- "${_PROCESS_PS}" | while read -r _PROCESS_PS_PID _PROCESS_PS_USER
+    do
+        (( ARG_DEBUG != 0 )) && debug "checking process ${_PROCESS_PS_PID}"
+        # get current values and check thresholds
+        # (all arguments are quoted: an empty threshold must not shift the others)
+        case "${_PROCESS_LIMIT}" in
+            "Max open files"|"MAX OPEN FILES"|"max open files")
+                _OPEN_FILES=$(_get_open_files "${_PROCESS_PS_PID}")
+                # SOFT limit
+                _check_limit "${_PROCESS_LIMIT}" soft "${_PROCESS_PS_PID}" "${_PROCESS_PS_USER}" "${_PROCESS}" "${_PROCESS_SOFT_THRESHOLD}" "${_OPEN_FILES}" "${_LOG_HEALTHY}"
+                # HARD limit
+                _check_limit "${_PROCESS_LIMIT}" hard "${_PROCESS_PS_PID}" "${_PROCESS_PS_USER}" "${_PROCESS}" "${_PROCESS_HARD_THRESHOLD}" "${_OPEN_FILES}" "${_LOG_HEALTHY}"
+                ;;
+            *)
+                # no other limits are supported yet ;-)
+                warn "'${_PROCESS_LIMIT}' is an unsupported limit check"
+                continue
+                ;;
+        esac
+    done
+done
+
+# check USER stanzas
+_LINE_COUNT=0
+grep -i '^user' "${_CONFIG_FILE}" 2>/dev/null |\
+    while IFS=';' read -r _DUMMY _USER _USER_LIMIT _USER_SOFT_THRESHOLD _USER_HARD_THRESHOLD
+do
+    # count the stanza first so that a 'continue' below cannot skip it
+    _LINE_COUNT=$(( _LINE_COUNT + 1 ))
+
+    # check for empties
+    if [[ -z "${_USER}" || -z "${_USER_LIMIT}" ]]
+    then
+        warn "missing parameter in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
+        return 1
+    fi
+    if [[ -n "${_USER_SOFT_THRESHOLD}" ]]
+    then
+        $(data_is_numeric "${_USER_SOFT_THRESHOLD}")
+        if (( $? > 0 ))
+        then
+            warn "parameter is not numeric in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
+            return 1
+        fi
+    fi
+    if [[ -n "${_USER_HARD_THRESHOLD}" ]]
+    then
+        $(data_is_numeric "${_USER_HARD_THRESHOLD}")
+        if (( $? > 0 ))
+        then
+            warn "parameter is not numeric in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
+            return 1
+        fi
+    fi
+
+    # collect ps info
+    (( ARG_DEBUG != 0 )) && debug "collecting information for user ${_USER}"
+    _USER_PS=$(_get_psinfo_by_user "${_USER}")
+    if [[ -z "${_USER_PS}" ]]
+    then
+        warn "could not find any matching processes for user ${_USER}"
+        continue
+    fi
+    print -r -- "${_USER_PS}" | while read -r _USER_PS_PID _USER_PS_COMM
+    do
+        (( ARG_DEBUG != 0 )) && debug "checking process ${_USER_PS_PID}"
+        # get current values and check thresholds
+        # (all arguments are quoted: an empty threshold must not shift the others)
+        case "${_USER_LIMIT}" in
+            "Max open files"|"MAX OPEN FILES"|"max open files")
+                _OPEN_FILES=$(_get_open_files "${_USER_PS_PID}")
+                # SOFT limit
+                _check_limit "${_USER_LIMIT}" soft "${_USER_PS_PID}" "${_USER}" "${_USER_PS_COMM}" "${_USER_SOFT_THRESHOLD}" "${_OPEN_FILES}" "${_LOG_HEALTHY}"
+                # HARD limit
+                _check_limit "${_USER_LIMIT}" hard "${_USER_PS_PID}" "${_USER}" "${_USER_PS_COMM}" "${_USER_HARD_THRESHOLD}" "${_OPEN_FILES}" "${_LOG_HEALTHY}"
+                ;;
+            *)
+                # no other limits are supported yet ;-)
+                warn "'${_USER_LIMIT}' is an unsupported limit check"
+                continue
+                ;;
+        esac
+    done
+done
+
+return 0
+}
+
+# -----------------------------------------------------------------------------
+# Prints 'PID user' for every process with the given command name.
+# EXPECTS: $1=command name
+# example:
+#1991 root
+#1992 root
+function _get_psinfo_by_process
+{
+(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set ${DEBUG_OPTS}
+
+ps -C "${1}" -o pid:1,user:1 --no-headers 2>/dev/null
+
+return 0
+}
+
+# -----------------------------------------------------------------------------
+# Prints 'PID command' for every process of the given user.
+# EXPECTS: $1=user name
+# example:
+#7270 qmgr
+#8539 pickup
+function _get_psinfo_by_user
+{
+(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set ${DEBUG_OPTS}
+
+ps -U "${1}" -o pid:1,comm:1 --no-headers 2>/dev/null
+
+return 0
+}
+
+# -----------------------------------------------------------------------------
+# Checks the current value of one limit of one process against a percentage
+# of the soft or hard limit in /proc/<PID>/limits and reports via log_hc().
+# EXPECTS: $1=limit name, $2=limit type (soft|hard), $3=PID, $4=user name,
+#          $5=process name, $6=threshold in % (empty: check is skipped),
+#          $7=current value, $8=log healthy flag (0|1)
+# RETURNS: 0 when checked or skipped; 1 when the limit cannot be determined
+function _check_limit
+{
+typeset _LIMIT_NAME="${1}"
+typeset _LIMIT_TYPE="${2}"
+typeset _LIMIT_PID="${3}"
+typeset _LIMIT_USER="${4}"
+typeset _LIMIT_PROCESS="${5}"
+typeset _LIMIT_THRESHOLD="${6}"
+typeset _CURR_VALUE="${7}"
+typeset _LOG_HEALTHY="${8}"
+typeset _LIMIT_ENTRY=""
+typeset _LIMIT_VALUE=""
+typeset _LIMIT_COLUMN=0
+typeset _MSG=""
+(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set ${DEBUG_OPTS}
+
+# thresholds are optional: nothing to check without one
+if [[ -z "${_LIMIT_THRESHOLD}" ]]
+then
+    warn "limit (${_LIMIT_TYPE}) on '${_LIMIT_NAME}' was not checked (PID=${_LIMIT_PID})"
+    return 0
+fi
+
+# the soft limit is the 1st, the hard limit the 2nd column after the name
+case "${_LIMIT_TYPE}" in
+    soft)
+        _LIMIT_COLUMN=1
+        ;;
+    hard)
+        _LIMIT_COLUMN=2
+        ;;
+    *)
+        warn "'${_LIMIT_TYPE}' is an unsupported limit type"
+        return 1
+        ;;
+esac
+
+_LIMIT_ENTRY=$(grep -i -F -- "${_LIMIT_NAME}" "/proc/${_LIMIT_PID}/limits" 2>/dev/null)
+if [[ -z "${_LIMIT_ENTRY}" ]]
+then
+    warn "unable to gather limits information (${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS})"
+    return 1
+fi
+
+# strip the limit name case-insensitively (the entry was matched with
+# 'grep -i') and pick the requested column from what follows it
+_LIMIT_VALUE=$(print -r -- "${_LIMIT_ENTRY}" |\
+    awk -v name="${_LIMIT_NAME}" -v column="${_LIMIT_COLUMN}" '
+        NR == 1 {
+            start = index(tolower($0), tolower(name))
+            total = (start > 0) ? split(substr($0, start + length(name)), fields, " ") : 0
+            if (column <= total) print fields[column]
+        }' 2>/dev/null)
+
+case "${_LIMIT_VALUE}" in
+    unlimited)
+        log "limit (${_LIMIT_TYPE}) on '${_LIMIT_NAME}' is unlimited (${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS})"
+        return 0
+        ;;
+    ""|*[!0-9]*)
+        warn "unable to determine the ${_LIMIT_TYPE} limit on '${_LIMIT_NAME}' (${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS})"
+        return 1
+        ;;
+esac
+
+# NOTE(review): inside this function $0 may expand to '_check_limit' (ksh
+# sets $0 to the function name); confirm that log_hc() expects that name
+if (( _CURR_VALUE > (_LIMIT_VALUE * _LIMIT_THRESHOLD / 100) ))
+then
+    _MSG="(${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS}) limit (${_LIMIT_TYPE}) on '${_LIMIT_NAME}' has been surpassed (${_CURR_VALUE} > ${_LIMIT_VALUE} @${_LIMIT_THRESHOLD}%)"
+    log_hc "$0" 1 "${_MSG}"
+elif (( _LOG_HEALTHY > 0 ))
+then
+    _MSG="(${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS}) limit (${_LIMIT_TYPE}) on '${_LIMIT_NAME}' is safe (${_CURR_VALUE} <= ${_LIMIT_VALUE} @${_LIMIT_THRESHOLD}%)"
+    log_hc "$0" 0 "${_MSG}"
+fi
+
+return 0
+}
+
+# -----------------------------------------------------------------------------
+# Prints the number of open file descriptors of a process.
+# EXPECTS: $1=PID
+function _get_open_files
+{
+(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set ${DEBUG_OPTS}
+
+# -A instead of -f: -f implies -a and would also count '.' and '..'
+ls -A "/proc/${1}/fd/" 2>/dev/null | wc -l 2>/dev/null
+
+return 0
+}
+
+# -----------------------------------------------------------------------------
+# Prints the usage/help text of the plugin.
+# EXPECTS: $1=plugin name, $2=plugin version, $3=configuration file
+function _show_usage
+{
+cat <<- EOT
+NAME    : $1
+VERSION : $2
+CONFIG  : $3 with:
+            log_healthy=<yes|no>
+          and formatted stanzas:
+            user;<user_name>;<limit_name>;<soft_threshold_%>;<hard_threshold_%>
+            process;<process_name>;<limit_name>;<soft_threshold_%>;<hard_threshold_%>
+PURPOSE : Checks the value(s) of the process limits from /proc/*/limits
+
+EOT
+
+return 0
+}
+
+#******************************************************************************
+# END of script
+#******************************************************************************
diff --git a/sources/lib/platform/linux/check_linux_sg_qs_status.sh b/sources/lib/platform/linux/check_linux_sg_qs_status.sh
index 183b6bd..8f05b9d 100644
--- a/sources/lib/platform/linux/check_linux_sg_qs_status.sh
+++ b/sources/lib/platform/linux/check_linux_sg_qs_status.sh
@@ -32,7 +32,7 @@
 function check_linux_sg_qs_status
 {
 # ------------------------- CONFIGURATION starts here -------------------------
-typeset _VERSION="2018-05-1"                                 # YYYY-MM-DD
+typeset _VERSION="2018-05-21"                               # YYYY-MM-DD
 typeset _SUPPORTED_PLATFORMS="Linux"                        # uname -s match
 typeset _QS_BIN="/opt/qs/bin/qsc"
 typeset _QS_AUTH_FILE="/opt/qs/conf/qs_authfile"