
ctdb-scripts: Avoid flapping NFS services at startup

If an NFS service check is set to, say, unhealthy_after=2 then it will
always switch from the (default startup) unhealthy state to healthy,
even if there is a fatal problem.  If all services/scripts appear OK
then the node will become healthy.  When the counter hits the limit it
will return to unhealthy.  This is misleading.

Instead, never use the counter at startup, until the service becomes
healthy.  This stops services flapping unhealthy-healthy-unhealthy.
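
To illustrate, here is a stand-alone toy model of the new behaviour.  It
is not ctdb code (the real change to nfs_check_service() is in the diff
below); the counter path, the simulated check and the messages are made
up:

    #!/bin/sh
    # The counter file stands in for the per-service fail counter.  It is
    # only created once the service has been seen healthy, so a failure
    # before that point is reported immediately instead of being counted
    # up towards unhealthy_after.

    counter="${TMPDIR:-/tmp}/nfs.failcount"
    unhealthy_after=2

    monitor()
    {
            # $1 simulates the service check result: "ok" or "fail"
            if [ "$1" = "ok" ]; then
                    echo 0 >"$counter"      # healthy: create/reset the counter
                    echo "healthy"
                    return
            fi

            if [ ! -e "$counter" ]; then
                    echo "UNHEALTHY (failed at startup, not counted)"
                    return
            fi

            failcount=$(($(cat "$counter") + 1))
            echo "$failcount" >"$counter"
            if [ "$failcount" -ge "$unhealthy_after" ]; then
                    echo "UNHEALTHY (reached unhealthy_after=$unhealthy_after)"
            else
                    echo "healthy (failure $failcount counted)"
            fi
    }

    rm -f "$counter"
    monitor fail    # old code: counted, node stayed healthy; now: unhealthy at once
    monitor ok      # service recovers: the counter now exists
    monitor fail    # a failure under load is counted, node stays healthy
    monitor fail    # second failure reaches unhealthy_after=2: unhealthy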

A side-effect is that a service that starts in a broken state will
never be restarted to try to fix the problem.  This makes sense.  The
counting and restarting really exist to deal with problems that might
occur under load.  The first monitor events occur before public IPs
are hosted, so there can be no load.  If a service doesn't start
reliably the first time then the admin probably wants to know about
it.

nfs_iterate_test() is updated to run an initial monitor event to mark
the services as healthy.  This initialises the counter so it can be
used for the important part of the test.  Passing the -i option avoids
running the extra monitor event, so the first iteration will be the
initial monitor event.
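
Concretely, the two call forms now behave like this (a sketch; the
service name and iteration count are just examples):

    # Default: an extra, successful monitor event runs first, so the
    # services are marked healthy and the fail counters exist before
    # the failing iterations start.
    nfs_iterate_test 10 "nfs"

    # With -i no extra event is run: the first of the 10 iterations is
    # the initial monitor event, i.e. the service has never been healthy.
    nfs_iterate_test -i 10 "nfs"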

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Author: Martin Schwenke, 2024-06-29 12:25:59 +10:00 (committed by Martin Schwenke)
parent 18a29ed367
commit 578dfa5765
5 changed files with 93 additions and 7 deletions

@@ -193,6 +193,12 @@ nfs_check_service()
                         exit 0
                 fi
 
+                # Don't count immediately after startup
+                if ! ctdb_counter_exists "$_progname"; then
+                        echo "ERROR: $_err"
+                        exit 1
+                fi
+
                 ctdb_counter_incr "$_progname"
                 _failcount=$(ctdb_counter_get "$_progname")

@@ -866,6 +866,11 @@ ctdb_counter_get()
         # shellcheck disable=SC2086
         echo $_val
 }
 
+ctdb_counter_exists()
+{
+        _ctdb_counter_common "$1"
+        [ -e "$_counter_file" ]
+}
+
 #
 # Fail counter/threshold combination to control warnings and node unhealthy
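
The new predicate just tests for the on-disk counter file that
_ctdb_counter_common() evidently sets up in $_counter_file, so a counter
"exists" exactly when one of the other helpers has already created it.
A rough caller-side sketch (the service name and the flow are
illustrative; ctdb_counter_init is the existing create/reset helper in
this file, not shown in this excerpt):

    if ! ctdb_counter_exists "nfs"; then
            echo "never been healthy since startup, nothing counted yet"
    fi

    ctdb_counter_init "nfs"        # create/reset the counter file
    ctdb_counter_incr "nfs"        # record one failure
    echo "failures so far: $(ctdb_counter_get "nfs")"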

@@ -0,0 +1,9 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "nfs down, 1 iteration, not previously healthy"
+
+setup
+
+nfs_iterate_test -i 1 "nfs"

@@ -0,0 +1,9 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "nfs down, 10 iterations, not previously healthy"
+
+setup
+
+nfs_iterate_test -i 10 "nfs"

@@ -22,6 +22,8 @@ EOF
         export RPCNFSDCOUNT
 
+        TEST_RPC_ALL_SERVICES="portmapper nfs mountd rquotad nlockmgr status"
+
         if [ "$1" != "down" ]; then
                 debug <<EOF
 Setting up NFS environment: all RPC services up, NFS managed by CTDB
@@ -40,9 +42,9 @@ EOF
                 ;;
         esac
 
-        _rpc_services_up \
-                "portmapper" "nfs" "mountd" "rquotad" \
-                "nlockmgr" "status"
+        # Intentional word splitting
+        # shellcheck disable=SC2086
+        _rpc_services_up $TEST_RPC_ALL_SERVICES
 
         nfs_setup_fake_threads "nfsd"
         nfs_setup_fake_threads "rpc.foobar" # Set the variable to empty
@@ -283,6 +285,35 @@ program ${_rpc_service}${_ver:+ version }${_ver} is not available
 EOF
 }
 
+_rpc_was_healthy_common()
+{
+        _rpc_service="$1"
+
+        _f="rpc.${_rpc_service}.was_healthy"
+        _rpc_was_healthy_file="${CTDB_TEST_TMP_DIR}/${_f}"
+}
+
+_rpc_set_was_healthy()
+{
+        if [ $# -eq 0 ]; then
+                # Intentional word splitting
+                # shellcheck disable=SC2086
+                set -- $TEST_RPC_ALL_SERVICES
+        fi
+
+        for _rpc_service; do
+                _rpc_was_healthy_common "$_rpc_service"
+                touch "$_rpc_was_healthy_file"
+        done
+}
+
+_rpc_check_was_healthy()
+{
+        _rpc_was_healthy_common "$1"
+
+        [ -e "$_rpc_was_healthy_file" ]
+}
+
 # Set the required result for a particular RPC program having failed
 # for a certain number of iterations. This is probably still a work
 # in progress. Note that we could hook aggressively
@@ -299,6 +330,7 @@ rpc_set_service_failure_response()
         ok_null
 
         if [ -z "$_rpc_service" ]; then
+                _rpc_set_was_healthy
                 return
         fi
@@ -376,6 +408,7 @@ rpc_set_service_failure_response()
                 # shellcheck disable=SC2181
                 if [ $? -eq 0 ]; then
                         echo 0 >"$_failcount_file"
+                        _rpc_set_was_healthy "$_rpc_service"
                         exit # from subshell
                 elif rpcinfo_timed_out "$_ri_out"; then
                         _why="Timed out"
@@ -390,6 +423,10 @@ rpc_set_service_failure_response()
                                 echo 0 >"$_failcount_file"
                                 exit # from subshell
                         fi
+                elif ! _rpc_check_was_healthy "$_rpc_service"; then
+                        echo 1 >"$_rc_file"
+                        rpc_failure "ERROR:" "$_rpc_service" "$_ver" >"$_out"
+                        exit # from subshell
                 fi
 
                 _numfails=$((_numfails + 1))
@@ -407,6 +444,7 @@ rpc_set_service_failure_response()
                                 >"$_out"
                 else
                         _unhealthy=false
+                        _rpc_set_was_healthy "$_rpc_service"
                 fi
 
                 if [ $restart_every -gt 0 ] &&
@@ -480,6 +518,12 @@ program_stack_traces()
 #
 nfs_iterate_test()
 {
+        _initial_monitor_event=false
+        if [ "$1" = "-i" ]; then
+                shift
+                _initial_monitor_event=true
+        fi
+
         _repeats="$1"
         _rpc_service="$2"
         _up_iteration="${3:--1}"
@@ -490,10 +534,6 @@ nfs_iterate_test()
         fi
 
         if [ -n "$_rpc_service" ]; then
-                debug <<EOF
---------------------------------------------------
-EOF
-
                 _action="${_rpc_service#*:}"
                 if [ "$_action" != "$_rpc_service" ]; then
                         _rpc_service="${_rpc_service%:*}"
@@ -501,6 +541,23 @@ EOF
                         _action=""
                 fi
 
+                if ! $_initial_monitor_event; then
+                        cat <<EOF
+--------------------------------------------------
+Running initial monitor event
+EOF
+
+                        # Remember a successful test result...
+                        rpc_set_service_failure_response "$_rpc_service"
+                        # ... and a successful monitor result
+                        simple_test
+                fi
+
+                cat <<EOF
+--------------------------------------------------
+EOF
+
                 if [ -n "$_action" ]; then
                         case "$_action" in
                         TIMEOUT)