mirror of
https://github.com/samba-team/samba.git
synced 2025-01-11 05:18:09 +03:00
ctdb-scripts: New consistent system memory and swap monitoring
New variables CTDB_MONITOR_MEMORY_USAGE and CTDB_MONITOR_SWAP_USAGE. Both take a pair of <warn_threshold>:<unhealthy_threshold> where each threshold is specified as a percentage. This adds a callout to check_thresholds() that is run when the unhealthy threshold is reached. Add some combination tests. Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
parent
02fa6c3d10
commit
b6a0e4b856
@ -22,6 +22,7 @@ check_thresholds ()
|
||||
_thing="$1"
|
||||
_thresholds="$2"
|
||||
_usage="$3"
|
||||
_unhealthy_callout="$4"
|
||||
|
||||
case "$_thresholds" in
|
||||
*:*)
|
||||
@ -35,7 +36,9 @@ check_thresholds ()
|
||||
|
||||
if validate_percentage "$_unhealthy_threshold" "$_thing" ; then
|
||||
if [ "$_usage" -ge "$_unhealthy_threshold" ] ; then
|
||||
die "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%"
|
||||
echo "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%"
|
||||
eval "$_unhealthy_callout"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
@ -73,11 +76,21 @@ monitor_filesystem_usage ()
|
||||
done
|
||||
}
|
||||
|
||||
# dump_memory_info: log memory diagnostics, then take this node out of
# service.  monitor_memory_usage() passes this function's name to
# check_thresholds() as the "unhealthy callout" for both the system
# memory check and the system swap check, so it is what runs when an
# unhealthy threshold is reached.
# Takes no arguments.  Output goes to stdout.
dump_memory_info ()
|
||||
{
|
||||
# Announce what is about to happen before the diagnostics are dumped.
echo "CRITICAL: Shutting down CTDB!!!"
|
||||
# Print memory statistics (get_proc is a helper defined elsewhere;
# presumably it reads /proc/meminfo - confirm).
get_proc "meminfo"
|
||||
# Full process listing, to help identify what is using the memory.
ps auxfww
|
||||
# NOTE(review): presumably writes "m" to /proc/sysrq-trigger, which on
# Linux asks the kernel to dump memory info to its log - confirm
# against set_proc.
set_proc "sysrq-trigger" "m"
|
||||
# Disable the node first, pause, then shut CTDB down.
ctdb disable
|
||||
sleep 3
|
||||
ctdb shutdown
|
||||
}
|
||||
|
||||
monitor_memory_usage ()
|
||||
{
|
||||
if [ -z "$CTDB_MONITOR_FREE_MEMORY_WARN" -a \
|
||||
-z "$CTDB_MONITOR_FREE_MEMORY" -a \
|
||||
"$CTDB_CHECK_SWAP_IS_NOT_USED" != "yes" ] ; then
|
||||
if [ -z "$CTDB_MONITOR_MEMORY_USAGE" -a \
|
||||
-z "$CTDB_MONITOR_SWAP_USAGE" ] ; then
|
||||
return
|
||||
fi
|
||||
|
||||
@ -98,35 +111,15 @@ END {
|
||||
_mem_usage="$1"
|
||||
_swap_usage="$2"
|
||||
|
||||
# Shutdown CTDB when memory is below the configured limit
|
||||
if [ -n "$CTDB_MONITOR_FREE_MEMORY" ] ; then
|
||||
if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY ] ; then
|
||||
echo "CRITICAL: OOM - ${_mem_usage}% usage >= ${CTDB_MONITOR_FREE_MEMORY}% (CTDB threshold)"
|
||||
echo "CRITICAL: Shutting down CTDB!!!"
|
||||
echo "$_meminfo"
|
||||
ps auxfww
|
||||
set_proc "sysrq-trigger" "m"
|
||||
ctdb disable
|
||||
sleep 3
|
||||
ctdb shutdown
|
||||
fi
|
||||
fi
|
||||
check_thresholds "System memory" \
|
||||
"$CTDB_MONITOR_MEMORY_USAGE" \
|
||||
"$_mem_usage" \
|
||||
dump_memory_info
|
||||
|
||||
# Warn when low on memory
|
||||
if [ -n "$CTDB_MONITOR_FREE_MEMORY_WARN" ] ; then
|
||||
if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY_WARN ] ; then
|
||||
echo "WARNING: memory usage is excessive - ${_mem_usage}% >= ${CTDB_MONITOR_FREE_MEMORY_WARN}% (CTDB threshold)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# We should never enter swap, so SwapTotal == SwapFree.
|
||||
if [ "$CTDB_CHECK_SWAP_IS_NOT_USED" = "yes" ] ; then
|
||||
if [ $_swap_usage -gt 0 ] ; then
|
||||
echo We are swapping:
|
||||
echo "$_meminfo"
|
||||
ps auxfww
|
||||
fi
|
||||
fi
|
||||
check_thresholds "System swap" \
|
||||
"$CTDB_MONITOR_SWAP_USAGE" \
|
||||
"$_swap_usage" \
|
||||
dump_memory_info
|
||||
}
|
||||
|
||||
|
||||
|
@ -1321,26 +1321,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term>CTDB_CHECK_SWAP_IS_NOT_USED=yes|no</term>
|
||||
<term>CTDB_MONITOR_MEMORY_USAGE=<parameter>MEM-LIMITS</parameter></term>
|
||||
<listitem>
|
||||
<para>
|
||||
Should a warning be logged if swap space is in use.
|
||||
</para>
|
||||
<para>
|
||||
Default is no.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term>CTDB_MONITOR_FREE_MEMORY=<parameter>NUM</parameter></term>
|
||||
<listitem>
|
||||
<para>
|
||||
NUM is threshold of acceptable memory usage, expressed
|
||||
as a percentage. If this is set and memory usage
|
||||
reaches this limit then some debug information will be
|
||||
logged, the node will be disabled and then CTDB will be
|
||||
shut down.
|
||||
MEM-LIMITS takes the form
|
||||
<parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional>
|
||||
indicating that warnings should be logged if memory
|
||||
usage reaches WARN_LIMIT%. If usage reaches
|
||||
UNHEALTHY_LIMIT then the node should be flagged
|
||||
unhealthy. Either WARN_LIMIT or UNHEALTHY_LIMIT may be
|
||||
left blank, meaning that check will be omitted.
|
||||
</para>
|
||||
<para>
|
||||
No default.
|
||||
@ -1349,12 +1339,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term>CTDB_MONITOR_FREE_MEMORY_WARN=<parameter>NUM</parameter></term>
|
||||
<term>CTDB_MONITOR_SWAP_USAGE=<parameter>SWAP-LIMITS</parameter></term>
|
||||
<listitem>
|
||||
<para>
|
||||
NUM is threshold of acceptable memory usage, expressed
|
||||
as a percentage. If this is set and memory usage
|
||||
reaches this limit then a warning will be logged.
|
||||
SWAP-LIMITS takes the form
|
||||
<parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional>
|
||||
indicating that warnings should be logged if
|
||||
swap usage reaches WARN_LIMIT%. If usage reaches
|
||||
UNHEALTHY_LIMIT then the node should be flagged
|
||||
unhealthy. Either WARN_LIMIT or UNHEALTHY_LIMIT may be
|
||||
left blank, meaning that check will be omitted.
|
||||
</para>
|
||||
<para>
|
||||
No default.
|
||||
|
@ -6,9 +6,8 @@ define_test "Memory check, bad situation, no checks enabled"
|
||||
|
||||
setup_memcheck 100 100
|
||||
|
||||
CTDB_MONITOR_FREE_MEMORY=""
|
||||
CTDB_MONITOR_FREE_MEMORY_WARN=""
|
||||
CTDB_CHECK_SWAP_IS_NOT_USED="no"
|
||||
CTDB_MONITOR_MEMORY_USAGE=""
|
||||
CTDB_MONITOR_SWAP_USAGE=""
|
||||
|
||||
ok_null
|
||||
|
||||
|
@ -6,9 +6,8 @@ define_test "Memory check, good situation, all enabled"
|
||||
|
||||
setup_memcheck
|
||||
|
||||
CTDB_MONITOR_FREE_MEMORY="90"
|
||||
CTDB_MONITOR_FREE_MEMORY_WARN="80"
|
||||
CTDB_CHECK_SWAP_IS_NOT_USED="yes"
|
||||
CTDB_MONITOR_MEMORY_USAGE="80:90"
|
||||
CTDB_MONITOR_SWAP_USAGE="1:50"
|
||||
|
||||
ok_null
|
||||
|
||||
|
@ -4,16 +4,17 @@
|
||||
|
||||
define_test "Memory check, bad situation, only swap check"
|
||||
|
||||
setup_memcheck 100 10
|
||||
setup_memcheck 100 90
|
||||
|
||||
CTDB_MONITOR_FREE_MEMORY=""
|
||||
CTDB_MONITOR_FREE_MEMORY_WARN=""
|
||||
CTDB_CHECK_SWAP_IS_NOT_USED="yes"
|
||||
CTDB_MONITOR_MEMORY_USAGE=""
|
||||
CTDB_MONITOR_SWAP_USAGE=":50"
|
||||
|
||||
ok <<EOF
|
||||
We are swapping:
|
||||
required_result 1 <<EOF
|
||||
ERROR: System swap utilization 90% >= threshold 50%
|
||||
CRITICAL: Shutting down CTDB!!!
|
||||
$FAKE_PROC_MEMINFO
|
||||
$(ps foobar)
|
||||
CTDB says BYE!
|
||||
EOF
|
||||
|
||||
simple_test
|
||||
|
@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory warning"
|
||||
|
||||
setup_memcheck 90 10
|
||||
|
||||
CTDB_MONITOR_FREE_MEMORY=""
|
||||
CTDB_MONITOR_FREE_MEMORY_WARN="85"
|
||||
CTDB_CHECK_SWAP_IS_NOT_USED="no"
|
||||
CTDB_MONITOR_MEMORY_USAGE="85:"
|
||||
CTDB_MONITOR_SWAP_USAGE=""
|
||||
|
||||
ok <<EOF
|
||||
WARNING: memory usage is excessive - 90% >= 85% (CTDB threshold)
|
||||
WARNING: System memory utilization 90% >= threshold 85%
|
||||
EOF
|
||||
|
||||
simple_test
|
||||
|
@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory critical"
|
||||
|
||||
setup_memcheck 90 0
|
||||
|
||||
CTDB_MONITOR_FREE_MEMORY="85"
|
||||
CTDB_MONITOR_FREE_MEMORY_WARN=""
|
||||
CTDB_CHECK_SWAP_IS_NOT_USED="no"
|
||||
CTDB_MONITOR_MEMORY_USAGE=":85"
|
||||
CTDB_MONITOR_SWAP_USAGE=""
|
||||
|
||||
ok <<EOF
|
||||
CRITICAL: OOM - 90% usage >= 85% (CTDB threshold)
|
||||
required_result 1 <<EOF
|
||||
ERROR: System memory utilization 90% >= threshold 85%
|
||||
CRITICAL: Shutting down CTDB!!!
|
||||
$FAKE_PROC_MEMINFO
|
||||
$(ps foobar)
|
||||
|
16
ctdb/tests/eventscripts/05.system.monitor.016.sh
Executable file
16
ctdb/tests/eventscripts/05.system.monitor.016.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Pull in the event-script unit test helpers (define_test,
# setup_memcheck, ok, simple_test are not defined in this file).
. "${TEST_SCRIPTS_DIR}/unit.sh"
|
||||
|
||||
define_test "Memory check, bad situation, both memory checks, causes warning"
|
||||
|
||||
# Fake memory state: presumably 87% memory usage and 0% swap usage
# (the expected output of this test reports 87% memory utilization).
setup_memcheck 87 0
|
||||
|
||||
# Memory thresholds are <warn>:<unhealthy>.  87 is at or above the
# warning threshold (80) but below the unhealthy threshold (90), so
# only a WARNING is expected.
CTDB_MONITOR_MEMORY_USAGE="80:90"
|
||||
# Swap monitoring is left unconfigured.
CTDB_MONITOR_SWAP_USAGE=""
|
||||
|
||||
ok <<EOF
|
||||
WARNING: System memory utilization 87% >= threshold 80%
|
||||
EOF
|
||||
|
||||
simple_test
|
42
ctdb/tests/eventscripts/05.system.monitor.017.sh
Executable file
42
ctdb/tests/eventscripts/05.system.monitor.017.sh
Executable file
@ -0,0 +1,42 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Pull in the event-script unit test helpers (define_test,
# setup_memcheck, required_result, simple_test are not defined in
# this file).
. "${TEST_SCRIPTS_DIR}/unit.sh"
|
||||
|
||||
define_test "Memory check, bad situation, both memory checks, causes unhealthy"
|
||||
|
||||
# Fake memory state: presumably 87% memory usage and 0% swap usage
# (the expected output of this test reports 87% memory utilization).
setup_memcheck 87 0
|
||||
|
||||
# Memory thresholds are <warn>:<unhealthy>.  87 is at or above the
# unhealthy threshold (80), so the test expects the ERROR message,
# the memory/process dump and a result of 1.
CTDB_MONITOR_MEMORY_USAGE="70:80"
|
||||
# Swap monitoring is left unconfigured.
CTDB_MONITOR_SWAP_USAGE=""
|
||||
|
||||
required_result 1 <<EOF
|
||||
ERROR: System memory utilization 87% >= threshold 80%
|
||||
CRITICAL: Shutting down CTDB!!!
|
||||
MemTotal: 3940712 kB
|
||||
MemFree: 225268 kB
|
||||
Buffers: 146120 kB
|
||||
Cached: 140904 kB
|
||||
SwapCached: 56016 kB
|
||||
Active: 2422104 kB
|
||||
Inactive: 1019928 kB
|
||||
Active(anon): 1917580 kB
|
||||
Inactive(anon): 523080 kB
|
||||
Active(file): 504524 kB
|
||||
Inactive(file): 496848 kB
|
||||
Unevictable: 4844 kB
|
||||
Mlocked: 4844 kB
|
||||
SwapTotal: 5857276 kB
|
||||
SwapFree: 5857276 kB
|
||||
...
|
||||
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
|
||||
root 2 0.0 0.0 0 0 ? S Aug28 0:00 [kthreadd]
|
||||
root 3 0.0 0.0 0 0 ? S Aug28 0:43 \_ [ksoftirqd/0]
|
||||
...
|
||||
root 1 0.0 0.0 2976 624 ? Ss Aug28 0:07 init [2]
|
||||
root 495 0.0 0.0 3888 1640 ? Ss Aug28 0:00 udevd --daemon
|
||||
...
|
||||
[MORE FAKE ps OUTPUT]
|
||||
CTDB says BYE!
|
||||
EOF
|
||||
|
||||
simple_test
|
@ -369,9 +369,8 @@ SwapTotal: ${_swap_total} kB
|
||||
SwapFree: ${_swap_free} kB
|
||||
..."
|
||||
|
||||
export CTDB_MONITOR_FREE_MEMORY
|
||||
export CTDB_MONITOR_FREE_MEMORY_WARN
|
||||
export CTDB_CHECK_SWAP_IS_NOT_USED
|
||||
export CTDB_MONITOR_MEMORY_USAGE
|
||||
export CTDB_MONITOR_SWAP_USAGE
|
||||
}
|
||||
|
||||
setup_fscheck ()
|
||||
|
Loading…
Reference in New Issue
Block a user