1
0
mirror of https://github.com/samba-team/samba.git synced 2024-12-23 17:34:34 +03:00

ctdb-scripts: New consistent system memory and swap monitoring

New variables CTDB_MONITOR_MEMORY_USAGE and CTDB_MONITOR_SWAP_USAGE.
Both take a pair of <warn_threshold>:<unhealthy_threshold> where each
threshold is specified as a percentage.

This adds a callout to check_thresholds() that is run when the
unhealthy threshold is reached.

Add some combination tests.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
Martin Schwenke 2015-08-03 17:22:08 +10:00 committed by Amitay Isaacs
parent 02fa6c3d10
commit b6a0e4b856
10 changed files with 119 additions and 78 deletions

View File

@ -22,6 +22,7 @@ check_thresholds ()
_thing="$1"
_thresholds="$2"
_usage="$3"
_unhealthy_callout="$4"
case "$_thresholds" in
*:*)
@ -35,7 +36,9 @@ check_thresholds ()
if validate_percentage "$_unhealthy_threshold" "$_thing" ; then
if [ "$_usage" -ge "$_unhealthy_threshold" ] ; then
die "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%"
echo "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%"
eval "$_unhealthy_callout"
exit 1
fi
fi
@ -73,11 +76,21 @@ monitor_filesystem_usage ()
done
}
# Log diagnostic memory/process information and then take CTDB down
# on this node.  monitor_memory_usage passes this function name to
# check_thresholds as the callout to run when the unhealthy threshold
# for system memory or swap is reached.
dump_memory_info ()
{
echo "CRITICAL: Shutting down CTDB!!!"
# Presumably prints the current contents of /proc/meminfo - confirm
# against the definition of get_proc.
get_proc "meminfo"
# Process listing (all users, user-oriented format, forest view,
# unlimited width) so the memory consumers are visible in the log.
ps auxfww
# Presumably writes "m" to /proc/sysrq-trigger so the kernel logs its
# own memory information - confirm against the definition of set_proc.
set_proc "sysrq-trigger" "m"
# Disable the node first, pause briefly, then shut CTDB down.
ctdb disable
sleep 3
ctdb shutdown
}
monitor_memory_usage ()
{
if [ -z "$CTDB_MONITOR_FREE_MEMORY_WARN" -a \
-z "$CTDB_MONITOR_FREE_MEMORY" -a \
"$CTDB_CHECK_SWAP_IS_NOT_USED" != "yes" ] ; then
if [ -z "$CTDB_MONITOR_MEMORY_USAGE" -a \
-z "$CTDB_MONITOR_SWAP_USAGE" ] ; then
return
fi
@ -98,35 +111,15 @@ END {
_mem_usage="$1"
_swap_usage="$2"
# Shutdown CTDB when memory is below the configured limit
if [ -n "$CTDB_MONITOR_FREE_MEMORY" ] ; then
if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY ] ; then
echo "CRITICAL: OOM - ${_mem_usage}% usage >= ${CTDB_MONITOR_FREE_MEMORY}% (CTDB threshold)"
echo "CRITICAL: Shutting down CTDB!!!"
echo "$_meminfo"
ps auxfww
set_proc "sysrq-trigger" "m"
ctdb disable
sleep 3
ctdb shutdown
fi
fi
check_thresholds "System memory" \
"$CTDB_MONITOR_MEMORY_USAGE" \
"$_mem_usage" \
dump_memory_info
# Warn when low on memory
if [ -n "$CTDB_MONITOR_FREE_MEMORY_WARN" ] ; then
if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY_WARN ] ; then
echo "WARNING: memory usage is excessive - ${_mem_usage}% >= ${CTDB_MONITOR_FREE_MEMORY_WARN}% (CTDB threshold)"
fi
fi
# We should never enter swap, so SwapTotal == SwapFree.
if [ "$CTDB_CHECK_SWAP_IS_NOT_USED" = "yes" ] ; then
if [ $_swap_usage -gt 0 ] ; then
echo We are swapping:
echo "$_meminfo"
ps auxfww
fi
fi
check_thresholds "System swap" \
"$CTDB_MONITOR_SWAP_USAGE" \
"$_swap_usage" \
dump_memory_info
}

View File

@ -1321,26 +1321,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000
</varlistentry>
<varlistentry>
<term>CTDB_CHECK_SWAP_IS_NOT_USED=yes|no</term>
<term>CTDB_MONITOR_MEMORY_USAGE=<parameter>MEM-LIMITS</parameter></term>
<listitem>
<para>
Should a warning be logged if swap space is in use.
</para>
<para>
Default is no.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term>CTDB_MONITOR_FREE_MEMORY=<parameter>NUM</parameter></term>
<listitem>
<para>
NUM is threshold of acceptable memory usage, expressed
as a percentage. If this is set and memory usage
reaches this limit then some debug information will be
logged, the node will be disabled and then CTDB will be
shut down.
MEM-LIMITS takes the form
<parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional>
indicating that warnings should be logged if memory
usage reaches WARN_LIMIT%. If usage reaches
UNHEALTHY_LIMIT% then the node should be flagged
unhealthy. Either WARN_LIMIT or UNHEALTHY_LIMIT may be
left blank, meaning that check will be omitted.
</para>
<para>
No default.
@ -1349,12 +1339,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000
</varlistentry>
<varlistentry>
<term>CTDB_MONITOR_FREE_MEMORY_WARN=<parameter>NUM</parameter></term>
<term>CTDB_MONITOR_SWAP_USAGE=<parameter>SWAP-LIMITS</parameter></term>
<listitem>
<para>
NUM is threshold of acceptable memory usage, expressed
as a percentage. If this is set and memory usage
reaches this limit then a warning will be logged.
SWAP-LIMITS takes the form
<parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional>
indicating that warnings should be logged if
swap usage reaches WARN_LIMIT%. If usage reaches
UNHEALTHY_LIMIT% then the node should be flagged
unhealthy. Either WARN_LIMIT or UNHEALTHY_LIMIT may be
left blank, meaning that check will be omitted.
</para>
<para>
No default.

View File

@ -6,9 +6,8 @@ define_test "Memory check, bad situation, no checks enabled"
setup_memcheck 100 100
CTDB_MONITOR_FREE_MEMORY=""
CTDB_MONITOR_FREE_MEMORY_WARN=""
CTDB_CHECK_SWAP_IS_NOT_USED="no"
CTDB_MONITOR_MEMORY_USAGE=""
CTDB_MONITOR_SWAP_USAGE=""
ok_null

View File

@ -6,9 +6,8 @@ define_test "Memory check, good situation, all enabled"
setup_memcheck
CTDB_MONITOR_FREE_MEMORY="90"
CTDB_MONITOR_FREE_MEMORY_WARN="80"
CTDB_CHECK_SWAP_IS_NOT_USED="yes"
CTDB_MONITOR_MEMORY_USAGE="80:90"
CTDB_MONITOR_SWAP_USAGE="1:50"
ok_null

View File

@ -4,16 +4,17 @@
define_test "Memory check, bad situation, only swap check"
setup_memcheck 100 10
setup_memcheck 100 90
CTDB_MONITOR_FREE_MEMORY=""
CTDB_MONITOR_FREE_MEMORY_WARN=""
CTDB_CHECK_SWAP_IS_NOT_USED="yes"
CTDB_MONITOR_MEMORY_USAGE=""
CTDB_MONITOR_SWAP_USAGE=":50"
ok <<EOF
We are swapping:
required_result 1 <<EOF
ERROR: System swap utilization 90% >= threshold 50%
CRITICAL: Shutting down CTDB!!!
$FAKE_PROC_MEMINFO
$(ps foobar)
CTDB says BYE!
EOF
simple_test

View File

@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory warning"
setup_memcheck 90 10
CTDB_MONITOR_FREE_MEMORY=""
CTDB_MONITOR_FREE_MEMORY_WARN="85"
CTDB_CHECK_SWAP_IS_NOT_USED="no"
CTDB_MONITOR_MEMORY_USAGE="85:"
CTDB_MONITOR_SWAP_USAGE=""
ok <<EOF
WARNING: memory usage is excessive - 90% >= 85% (CTDB threshold)
WARNING: System memory utilization 90% >= threshold 85%
EOF
simple_test

View File

@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory critical"
setup_memcheck 90 0
CTDB_MONITOR_FREE_MEMORY="85"
CTDB_MONITOR_FREE_MEMORY_WARN=""
CTDB_CHECK_SWAP_IS_NOT_USED="no"
CTDB_MONITOR_MEMORY_USAGE=":85"
CTDB_MONITOR_SWAP_USAGE=""
ok <<EOF
CRITICAL: OOM - 90% usage >= 85% (CTDB threshold)
required_result 1 <<EOF
ERROR: System memory utilization 90% >= threshold 85%
CRITICAL: Shutting down CTDB!!!
$FAKE_PROC_MEMINFO
$(ps foobar)

View File

@ -0,0 +1,16 @@
#!/bin/sh
# Unit test: both memory thresholds are configured and usage falls
# between them, so only a warning is expected.
. "${TEST_SCRIPTS_DIR}/unit.sh"
define_test "Memory check, bad situation, both memory checks, causes warning"
# Fake memory state; arguments appear to be memory and swap usage
# percentages (87% memory, 0% swap) - confirm against setup_memcheck.
setup_memcheck 87 0
# <warn_threshold>:<unhealthy_threshold> - 87% is >= 80 but < 90.
CTDB_MONITOR_MEMORY_USAGE="80:90"
# Empty value: no swap thresholds configured for this test.
CTDB_MONITOR_SWAP_USAGE=""
# Expected output: the warning line only.  Presumably "ok" also
# expects a successful exit status - confirm against unit.sh.
ok <<EOF
WARNING: System memory utilization 87% >= threshold 80%
EOF
simple_test

View File

@ -0,0 +1,42 @@
#!/bin/sh
# Unit test: both memory thresholds are configured and usage is at or
# above the unhealthy threshold, so the error path and the
# dump_memory_info callout output are expected.
. "${TEST_SCRIPTS_DIR}/unit.sh"
define_test "Memory check, bad situation, both memory checks, causes unhealthy"
# Fake memory state; arguments appear to be memory and swap usage
# percentages (87% memory, 0% swap) - confirm against setup_memcheck.
setup_memcheck 87 0
# <warn_threshold>:<unhealthy_threshold> - 87% is >= the unhealthy
# threshold of 80.
CTDB_MONITOR_MEMORY_USAGE="70:80"
# Empty value: no swap thresholds configured for this test.
CTDB_MONITOR_SWAP_USAGE=""
# Expect exit status 1 (check_thresholds exits 1 after running the
# unhealthy callout).  The expected output is the ERROR line followed
# by what dump_memory_info prints: the fake meminfo, the fake ps
# listing and, presumably, the stub ctdb's shutdown message.
required_result 1 <<EOF
ERROR: System memory utilization 87% >= threshold 80%
CRITICAL: Shutting down CTDB!!!
MemTotal: 3940712 kB
MemFree: 225268 kB
Buffers: 146120 kB
Cached: 140904 kB
SwapCached: 56016 kB
Active: 2422104 kB
Inactive: 1019928 kB
Active(anon): 1917580 kB
Inactive(anon): 523080 kB
Active(file): 504524 kB
Inactive(file): 496848 kB
Unevictable: 4844 kB
Mlocked: 4844 kB
SwapTotal: 5857276 kB
SwapFree: 5857276 kB
...
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 2 0.0 0.0 0 0 ? S Aug28 0:00 [kthreadd]
root 3 0.0 0.0 0 0 ? S Aug28 0:43 \_ [ksoftirqd/0]
...
root 1 0.0 0.0 2976 624 ? Ss Aug28 0:07 init [2]
root 495 0.0 0.0 3888 1640 ? Ss Aug28 0:00 udevd --daemon
...
[MORE FAKE ps OUTPUT]
CTDB says BYE!
EOF
simple_test

View File

@ -369,9 +369,8 @@ SwapTotal: ${_swap_total} kB
SwapFree: ${_swap_free} kB
..."
export CTDB_MONITOR_FREE_MEMORY
export CTDB_MONITOR_FREE_MEMORY_WARN
export CTDB_CHECK_SWAP_IS_NOT_USED
export CTDB_MONITOR_MEMORY_USAGE
export CTDB_MONITOR_SWAP_USAGE
}
setup_fscheck ()