* Added check_linux_process_limits plugin

* Small fixes
This commit is contained in:
Patrick Van der Veken 2018-07-05 15:59:52 +02:00
parent 3eac307e32
commit eb4ac9d1df
6 changed files with 377 additions and 3 deletions

View File

@ -12,7 +12,7 @@ Health checker for UNIX/Linux is a small framework of monitoring scripts. It is
## Build
Use the build templates & scripts in the `build/ directory to roll your own packages.
Use the build templates & scripts in the `build/` directory to roll your own packages.
## Installation

View File

@ -44,6 +44,7 @@ cp ../SOURCES/lib/platform/linux/check_linux_hpasmcli.sh $RPM_BUILD_ROOT/opt/hc/
cp ../SOURCES/lib/platform/linux/check_linux_hpacucli.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_hpacucli.sh
cp ../SOURCES/lib/platform/linux/check_linux_hplog.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_hplog.sh
cp ../SOURCES/lib/platform/linux/check_linux_hpssacli.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_hpssacli.sh
cp ../SOURCES/lib/platform/linux/check_linux_process_limits.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_process_limits.sh
cp ../SOURCES/lib/platform/linux/check_linux_root_crontab.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_root_crontab.sh
cp ../SOURCES/lib/platform/linux/check_linux_sg_cluster_config.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_sg_cluster_config.sh
cp ../SOURCES/lib/platform/linux/check_linux_sg_cluster_status.sh $RPM_BUILD_ROOT/opt/hc/lib/platform/linux/check_linux_sg_cluster_status.sh
@ -60,6 +61,7 @@ cp ../SOURCES/etc/check_linux_hpacucli.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/chec
cp ../SOURCES/etc/check_linux_hplog.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_hplog.conf.dist
cp ../SOURCES/etc/check_linux_hpssacli.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_hpssacli.conf.dist
cp ../SOURCES/etc/check_linux_ntp_status.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_ntp_status.conf.dist
cp ../SOURCES/etc/check_linux_process_limits.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_process_limits.conf.dist
cp ../SOURCES/etc/check_linux_root_crontab.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_root_crontab.conf.dist
cp ../SOURCES/etc/check_linux_sg_cluster_config.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_sg_cluster_config.conf.dist
cp ../SOURCES/etc/check_linux_sg_cluster_status.conf.dist $RPM_BUILD_ROOT/etc/opt/hc/check_linux_sg_cluster_status.conf.dist
@ -123,6 +125,7 @@ echo "INFO: finished post-uninstall script"
%attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_hpacucli.sh
%attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_hplog.sh
%attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_hpssacli.sh
%attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_process_limits.sh
%attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_root_crontab.sh
%attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_sg_cluster_config.sh
%attr(755, root, root) /opt/hc/lib/platform/linux/check_linux_sg_cluster_status.sh
@ -139,6 +142,7 @@ echo "INFO: finished post-uninstall script"
%attr(644, root, root) /etc/opt/hc/check_linux_hplog.conf.dist
%attr(644, root, root) /etc/opt/hc/check_linux_hpssacli.conf.dist
%attr(644, root, root) /etc/opt/hc/check_linux_ntp_status.conf.dist
%attr(644, root, root) /etc/opt/hc/check_linux_process_limits.conf.dist
%attr(644, root, root) /etc/opt/hc/check_linux_root_crontab.conf.dist
%attr(644, root, root) /etc/opt/hc/check_linux_sg_cluster_config.conf.dist
%attr(644, root, root) /etc/opt/hc/check_linux_sg_cluster_status.conf.dist
@ -151,6 +155,8 @@ echo "INFO: finished post-uninstall script"
%attr(644, root, root) /etc/opt/hc/core/templates/mail_body.tpl-check_linux_root_crontab
%changelog
* Tue Jul 10 2018 <patrick@kudos.be> - 0.1.1
- Added check_linux_process_limits
* Sat Apr 21 2018 <patrick@kudos.be> - 0.1.0
- Added check_linux_ntp_status
* Thu May 18 2017 <patrick@kudos.be> - 0.0.9

View File

@ -5,7 +5,7 @@ Version: %{build_timestamp}
Release: 1
Summary: The KUDOS Health Checker (HC) for UNIX
Group: Tools/MonitoringGroup: Tools/Monitoring
Group: Tools/Monitoring
License: GNU General Public License either version 2 of the License, or (at your option) any later version
URL: http://www.kudos.be

View File

@ -0,0 +1,31 @@
#******************************************************************************
# @(#) check_linux_process_limits.conf
#******************************************************************************
# This is a configuration file for the check_linux_process_limits HC plugin.
# All lines starting with a '#' are comment lines.
# [default: indicates hardcoded script values if no value is defined here]
#******************************************************************************
# specify whether to also log passed health checks
# (warning: this may rapidly grow the HC log)
# [default: no]
log_healthy="no"
# -- user stanzas --
# format:
# user;<user name>;<limit name>;<soft threshold in %>;<hard threshold in %>
# note: <limit name> must exactly match the descriptor in /proc/<PID>/limits
# thresholds are optional (will not be checked when not specified)
user;postfix;Max open files;;75
# -- process stanzas --
# format:
# process;<process name>;<limit name>;<soft threshold in %>;<hard threshold in %>
# note: <limit name> must exactly match the descriptor in /proc/<PID>/limits
# thresholds are optional (will not be checked when not specified)
process;nfsd;Max open files;100;80
#******************************************************************************
# End of FILE
#******************************************************************************

View File

@ -0,0 +1,337 @@
#!/usr/bin/env ksh
#******************************************************************************
# @(#) check_linux_process_limits.sh
#******************************************************************************
# @(#) Copyright (C) 2016 by KUDOS BVBA (info@kudos.be). All rights reserved.
#
# This program is a free software; you can redistribute it and/or modify
# it under the same terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
#******************************************************************************
#
# DOCUMENTATION (MAIN)
# -----------------------------------------------------------------------------
# @(#) MAIN: check_linux_process_limits
# DOES: see _show_usage()
# EXPECTS: see _show_usage()
# REQUIRES: data_space2comma(), init_hc(), log_hc()
#
# @(#) HISTORY:
# @(#) 2018-07-10: original version [Patrick Van der Veken]
# -----------------------------------------------------------------------------
# DO NOT CHANGE THIS FILE UNLESS YOU KNOW WHAT YOU ARE DOING!
#******************************************************************************
# -----------------------------------------------------------------------------
function check_linux_process_limits
{
# Health check entry point: reads 'process;...' and 'user;...' stanzas from the
# configuration file, looks up the matching processes with ps and hands every
# PID to _check_limit() for its soft and hard threshold.
# Arguments: optional plugin arguments; only 'help' is handled (shows usage).
# Returns:   0 on completion (or after showing help), 1 when the configuration
#            file is unreadable or contains an invalid stanza.

# ------------------------- CONFIGURATION starts here -------------------------
typeset _CONFIG_FILE="${CONFIG_DIR}/$0.conf"
typeset _VERSION="2018-07-10"                           # YYYY-MM-DD
typeset _SUPPORTED_PLATFORMS="Linux"                    # uname -s match
# ------------------------- CONFIGURATION ends here ---------------------------

# set defaults
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set ${DEBUG_OPTS}
init_hc "$0" "${_SUPPORTED_PLATFORMS}" "${_VERSION}"
typeset _ARGS=$(data_space2comma "$*")
typeset _ARG=""
typeset _MSG=""
typeset _STC=0
typeset _DUMMY=""
typeset _LINE_COUNT=0
typeset _LOG_HEALTHY=0
typeset _OPEN_FILES=0
typeset _PROCESS=""
typeset _PROCESS_LIMIT=""
typeset _PROCESS_SOFT_THRESHOLD=0
typeset _PROCESS_HARD_THRESHOLD=0
typeset _PROCESS_PS=""
typeset _PROCESS_PS_PID=""
typeset _PROCESS_PS_USER=""
typeset _USER=""
typeset _USER_LIMIT=""
typeset _USER_SOFT_THRESHOLD=0
typeset _USER_HARD_THRESHOLD=0
typeset _USER_PS=""
typeset _USER_PS_PID=""
typeset _USER_PS_COMM=""

# set local trap for cleanup
# the run file variable is not set by this function: expand it only when the
# trap fires and skip the removal when it is empty, otherwise the command
# would degrade into 'rm -f .*' in the current directory
trap '[[ -n "${_INSTANCE_RUN_FILE}" ]] && rm -f "${_INSTANCE_RUN_FILE}".* >/dev/null 2>&1; return 1' 1 2 3 15

# handle arguments (originally comma-separated)
# (deliberately unquoted: the argument string must be split into words)
for _ARG in ${_ARGS}
do
    case "${_ARG}" in
        help)
            _show_usage "$0" "${_VERSION}" "${_CONFIG_FILE}" && return 0
            ;;
    esac
done

# handle configuration file
[[ -n "${ARG_CONFIG_FILE}" ]] && _CONFIG_FILE="${ARG_CONFIG_FILE}"
if [[ ! -r "${_CONFIG_FILE}" ]]
then
    warn "unable to read configuration file at ${_CONFIG_FILE}"
    return 1
fi

# read required configuration values
_LOG_HEALTHY=$(_CONFIG_FILE="${_CONFIG_FILE}" data_get_lvalue_from_config 'log_healthy')
case "${_LOG_HEALTHY}" in
    no|NO|No)
        _LOG_HEALTHY=0
        log "not logging/showing passed health checks"
        ;;
    *)
        _LOG_HEALTHY=1
        if (( ARG_LOG > 0 ))
        then
            log "logging/showing passed health checks"
        else
            log "showing passed health checks (but not logging)"
        fi
        ;;
esac

# check PROCESS stanzas
# (the counter is bumped at the top of the loop so that the reported data line
# is 1-based and stays correct when a stanza is skipped with 'continue')
_LINE_COUNT=0
grep -i '^process' "${_CONFIG_FILE}" 2>/dev/null |\
    while IFS=';' read -r _DUMMY _PROCESS _PROCESS_LIMIT _PROCESS_SOFT_THRESHOLD _PROCESS_HARD_THRESHOLD
    do
        _LINE_COUNT=$(( _LINE_COUNT + 1 ))
        # check for empties
        if [[ -z "${_PROCESS}" || -z "${_PROCESS_LIMIT}" ]]
        then
            warn "missing parameter in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
            return 1
        fi
        # thresholds are optional but must be numeric when given
        if [[ -n "${_PROCESS_SOFT_THRESHOLD}" ]]
        then
            $(data_is_numeric "${_PROCESS_SOFT_THRESHOLD}")
            if (( $? > 0 ))
            then
                warn "parameter is not numeric in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
                return 1
            fi
        fi
        if [[ -n "${_PROCESS_HARD_THRESHOLD}" ]]
        then
            $(data_is_numeric "${_PROCESS_HARD_THRESHOLD}")
            if (( $? > 0 ))
            then
                warn "parameter is not numeric in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
                return 1
            fi
        fi

        # collect ps info
        (( ARG_DEBUG != 0 )) && debug "collecting information for process class ${_PROCESS}"
        _PROCESS_PS=$(_get_psinfo_by_process "${_PROCESS}")
        if [[ -z "${_PROCESS_PS}" ]]
        then
            warn "could not find any matching processes for process ${_PROCESS}"
            continue
        fi

        print "${_PROCESS_PS}" | while read -r _PROCESS_PS_PID _PROCESS_PS_USER
        do
            (( ARG_DEBUG != 0 )) && debug "checking process ${_PROCESS_PS_PID}"
            # get current values and check thresholds
            case "${_PROCESS_LIMIT}" in
                "Max open files"|"MAX OPEN FILES"|"max open files")
                    _OPEN_FILES=$(_get_open_files "${_PROCESS_PS_PID}")
                    # every argument is quoted: an empty (optional) threshold
                    # must still occupy its own positional parameter
                    # SOFT limit
                    _check_limit "${_PROCESS_LIMIT}" soft "${_PROCESS_PS_PID}" "${_PROCESS_PS_USER}" \
                        "${_PROCESS}" "${_PROCESS_SOFT_THRESHOLD}" "${_OPEN_FILES}" "${_LOG_HEALTHY}"
                    # HARD limit
                    _check_limit "${_PROCESS_LIMIT}" hard "${_PROCESS_PS_PID}" "${_PROCESS_PS_USER}" \
                        "${_PROCESS}" "${_PROCESS_HARD_THRESHOLD}" "${_OPEN_FILES}" "${_LOG_HEALTHY}"
                    ;;
                *)
                    # no other limits are supported yet ;-)
                    warn "'${_PROCESS_LIMIT}' is an unsupported limit check"
                    continue
                    ;;
            esac
        done
    done

# check USER stanzas
_LINE_COUNT=0
grep -i '^user' "${_CONFIG_FILE}" 2>/dev/null |\
    while IFS=';' read -r _DUMMY _USER _USER_LIMIT _USER_SOFT_THRESHOLD _USER_HARD_THRESHOLD
    do
        _LINE_COUNT=$(( _LINE_COUNT + 1 ))
        # check for empties
        if [[ -z "${_USER}" || -z "${_USER_LIMIT}" ]]
        then
            warn "missing parameter in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
            return 1
        fi
        # thresholds are optional but must be numeric when given
        if [[ -n "${_USER_SOFT_THRESHOLD}" ]]
        then
            $(data_is_numeric "${_USER_SOFT_THRESHOLD}")
            if (( $? > 0 ))
            then
                warn "parameter is not numeric in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
                return 1
            fi
        fi
        if [[ -n "${_USER_HARD_THRESHOLD}" ]]
        then
            $(data_is_numeric "${_USER_HARD_THRESHOLD}")
            if (( $? > 0 ))
            then
                warn "parameter is not numeric in configuration file ${_CONFIG_FILE} at data line ${_LINE_COUNT}"
                return 1
            fi
        fi

        # collect ps info
        (( ARG_DEBUG != 0 )) && debug "collecting information for user ${_USER}"
        _USER_PS=$(_get_psinfo_by_user "${_USER}")
        if [[ -z "${_USER_PS}" ]]
        then
            warn "could not find any matching processes for user ${_USER}"
            continue
        fi

        # the command name is the last 'read' variable, so it receives the
        # whole remainder of the ps line (including any embedded blanks)
        print "${_USER_PS}" | while read -r _USER_PS_PID _USER_PS_COMM
        do
            (( ARG_DEBUG != 0 )) && debug "checking process ${_USER_PS_PID}"
            # get current values and check thresholds
            case "${_USER_LIMIT}" in
                "Max open files"|"MAX OPEN FILES"|"max open files")
                    _OPEN_FILES=$(_get_open_files "${_USER_PS_PID}")
                    # every argument is quoted: an empty (optional) threshold
                    # must still occupy its own positional parameter
                    # SOFT limit
                    _check_limit "${_USER_LIMIT}" soft "${_USER_PS_PID}" "${_USER}" \
                        "${_USER_PS_COMM}" "${_USER_SOFT_THRESHOLD}" "${_OPEN_FILES}" "${_LOG_HEALTHY}"
                    # HARD limit
                    _check_limit "${_USER_LIMIT}" hard "${_USER_PS_PID}" "${_USER}" \
                        "${_USER_PS_COMM}" "${_USER_HARD_THRESHOLD}" "${_OPEN_FILES}" "${_LOG_HEALTHY}"
                    ;;
                *)
                    # no other limits are supported yet ;-)
                    warn "'${_USER_LIMIT}' is an unsupported limit check"
                    continue
                    ;;
            esac
        done
    done

return 0
}
# -----------------------------------------------------------------------------
# example:
#1991 root
#1992 root
function _get_psinfo_by_process
{
# Lists all processes whose command name equals $1, one per line, as
# "<PID> <user>" without a header line. Errors from ps are discarded and the
# function always returns 0; callers detect "no match" by the empty output.
if (( ARG_DEBUG != 0 )) && (( ARG_DEBUG_LEVEL > 0 ))
then
    set ${DEBUG_OPTS}
fi

typeset _PS_COMMAND_NAME="${1}"

ps -C "${_PS_COMMAND_NAME}" -o pid:1,user:1 --no-headers 2>/dev/null

return 0
}
# -----------------------------------------------------------------------------
# example:
#7270 qmgr
#8539 pickup
function _get_psinfo_by_user
{
# Lists all processes selected by 'ps -U' for the user given in $1, one per
# line, as "<PID> <command name>" without a header line. Errors from ps are
# discarded and the function always returns 0; callers detect "no match" by
# the empty output.
if (( ARG_DEBUG != 0 )) && (( ARG_DEBUG_LEVEL > 0 ))
then
    set ${DEBUG_OPTS}
fi

typeset _PS_USER_NAME="${1}"

ps -U "${_PS_USER_NAME}" -o pid:1,comm:1 --no-headers 2>/dev/null

return 0
}
# -----------------------------------------------------------------------------
function _check_limit
{
# Compares the current usage of one resource against a percentage of the
# soft or hard limit listed in /proc/<PID>/limits and reports via log_hc().
# Arguments:
#   $1  limit name as shown in /proc/<PID>/limits (matched case-insensitively
#       at the start of the line)
#   $2  limit type: 'soft' or 'hard'
#   $3  PID of the process to check
#   $4  user name   (only used in messages)
#   $5  process name (only used in messages)
#   $6  threshold in % of the limit; empty means "do not check"
#   $7  current value of the resource
#   $8  log passed checks as well when > 0
# Returns: 1 when the limit cannot be determined, 0 otherwise.
typeset _LIMIT_NAME="${1}"
typeset _LIMIT_TYPE="${2}"
typeset _LIMIT_PID="${3}"
typeset _LIMIT_USER="${4}"
typeset _LIMIT_PROCESS="${5}"
typeset _LIMIT_THRESHOLD="${6}"
typeset _CURR_VALUE="${7}"
typeset _LOG_HEALTHY="${8}"
typeset _LIMIT_COLUMN=0
typeset _LIMIT_VALUE=""
typeset _MSG=""

(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set ${DEBUG_OPTS}

# thresholds are optional
if [[ -z "${_LIMIT_THRESHOLD}" ]]
then
    warn "limit (${_LIMIT_TYPE}) on '${_LIMIT_NAME}' was not checked (PID=${_LIMIT_PID})"
    return 0
fi

# the soft value is the 1st and the hard value the 2nd field after the name
case "${_LIMIT_TYPE}" in
    soft)
        _LIMIT_COLUMN=1
        ;;
    hard)
        _LIMIT_COLUMN=2
        ;;
    *)
        warn "unknown limit type '${_LIMIT_TYPE}' for '${_LIMIT_NAME}' (PID=${_LIMIT_PID})"
        return 1
        ;;
esac

# locate the limit line without regard to case and skip as many fields as
# the name has words; stripping the name with a case-sensitive substitution
# would leave it in place for e.g. 'max open files' and yield a word of the
# name instead of the value
_LIMIT_VALUE=$(awk -v name="${_LIMIT_NAME}" -v col="${_LIMIT_COLUMN}" '
    BEGIN { lname = tolower(name); skip = split(name, words, " ") }
    tolower(substr($0, 1, length(name))) == lname { print $(skip + col); exit }
    ' "/proc/${_LIMIT_PID}/limits" 2>/dev/null)
if [[ -z "${_LIMIT_VALUE}" ]]
then
    warn "unable to gather limits information (${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS})"
    return 1
fi

if [[ "${_LIMIT_VALUE}" = "unlimited" ]]
then
    log "limit (${_LIMIT_TYPE}) on '${_LIMIT_NAME}' is unlimited (${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS})"
    return 0
fi

# anything else must be a plain number, otherwise the arithmetic below would
# silently treat it as 0 and flag every process
case "${_LIMIT_VALUE}" in
    *[!0-9]*)
        warn "unable to gather limits information (${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS})"
        return 1
        ;;
esac

# NOTE(review): in ksh93 'function'-style functions $0 expands to the name of
# the function itself, so log_hc presumably receives '_check_limit' here
# rather than the plugin name -- confirm what log_hc expects
if (( _CURR_VALUE > (_LIMIT_VALUE * _LIMIT_THRESHOLD / 100) ))
then
    _MSG="(${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS}) limit (${_LIMIT_TYPE}) on '${_LIMIT_NAME}' has been surpassed (${_CURR_VALUE} > ${_LIMIT_VALUE} @${_LIMIT_THRESHOLD}%)"
    log_hc "$0" 1 "${_MSG}"
else
    if (( _LOG_HEALTHY > 0 ))
    then
        _MSG="(${_LIMIT_PID}/${_LIMIT_USER}/${_LIMIT_PROCESS}) limit (${_LIMIT_TYPE}) on '${_LIMIT_NAME}' is safe (${_CURR_VALUE} <= ${_LIMIT_VALUE} @${_LIMIT_THRESHOLD}%)"
        log_hc "$0" 0 "${_MSG}"
    fi
fi

return 0
}
# -----------------------------------------------------------------------------
function _get_open_files
{
# Prints the number of open file descriptors of the process with PID $1, i.e.
# the number of entries in /proc/<PID>/fd. Prints 0 when the directory cannot
# be listed (errors are discarded). Always returns 0.
(( ARG_DEBUG != 0 && ARG_DEBUG_LEVEL > 0 )) && set ${DEBUG_OPTS}

# 'ls -f' lists unsorted but also implies -a: drop the '.' and '..' entries,
# otherwise every count is 2 too high
ls -f "/proc/${1}/fd/" 2>/dev/null | grep -c -v -e '^\.$' -e '^\.\.$' 2>/dev/null

return 0
}
# -----------------------------------------------------------------------------
function _show_usage
{
# Prints the help text of the plugin to stdout.
# Arguments: $1 plugin name, $2 plugin version, $3 configuration file path
# Returns:   0
# The stanza formats are shown with ';' as field separator because that is
# the separator the plugin splits the configuration lines on.
cat <<- EOT
NAME : $1
VERSION : $2
CONFIG : $3 with:
log_healthy=<yes|no>
and formatted stanzas:
user;<user_name>;<limit_name>;<soft_limit_threshold_%>;<hard_limit_threshold_%>
process;<process_name>;<limit_name>;<soft_limit_threshold_%>;<hard_limit_threshold_%>
PURPOSE : Checks the value(s) of the process limits from /proc/*/limits
EOT

return 0
}
#******************************************************************************
# END of script
#******************************************************************************

View File

@ -32,7 +32,7 @@
function check_linux_sg_qs_status
{
# ------------------------- CONFIGURATION starts here -------------------------
typeset _VERSION="2018-05-1" # YYYY-MM-DD
typeset _VERSION="2018-05-21" # YYYY-MM-DD
typeset _SUPPORTED_PLATFORMS="Linux" # uname -s match
typeset _QS_BIN="/opt/qs/bin/qsc"
typeset _QS_AUTH_FILE="/opt/qs/conf/qs_authfile"