1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-03 01:18:10 +03:00

ctdb-scripts: Add service_stats_command variable to NFS checks

When monitoring an RPC service, the rpcinfo command might time out
even though the service is making progress.  In this case, it is just
slow, so counting the timeout as a failure and potentially restarting
the service will not help.  The problem is determining if a service is
making progress.

Add a new NFS checks service_stats_command.  This command is intended
to run a statistics command.  The output is naively compared using
cmp(1).  If the output changes then rpcinfo failures are converted to
successes.

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
Martin Schwenke 2024-02-19 21:42:11 +11:00 committed by Martin Schwenke
parent d86f9ff7fd
commit 5b7d17d44d
4 changed files with 126 additions and 3 deletions

View File

@ -80,6 +80,13 @@ nfs_check_services()
# traces of threads that have not exited, since # traces of threads that have not exited, since
# they may be stuck doing I/O; # they may be stuck doing I/O;
# no default, see also function program_stack_traces() # no default, see also function program_stack_traces()
# * service_stats_cmd - command to retrieve statistics for given service;
# if this is set and RPC checks fail (or
# $service_check_cmd fails), then statistics are
# compared (using cmp) to see if the service is
# making progress or is truly hung;
# no default, failed service does not double-check
# failure using statistics
# #
# Quoting in values is not preserved # Quoting in values is not preserved
# #
@ -103,6 +110,7 @@ nfs_check_service()
service_start_cmd="" service_start_cmd=""
service_check_cmd="" service_check_cmd=""
service_debug_cmd="" service_debug_cmd=""
service_stats_cmd=""
# Eval line-by-line. Expands variable references in values. # Eval line-by-line. Expands variable references in values.
# Also allows variable name checking, which seems useful. # Also allows variable name checking, which seems useful.
@ -113,7 +121,8 @@ nfs_check_service()
family=* | version=* | \ family=* | version=* | \
unhealthy_after=* | restart_every=* | \ unhealthy_after=* | restart_every=* | \
service_stop_cmd=* | service_start_cmd=* | \ service_stop_cmd=* | service_start_cmd=* | \
service_check_cmd=* | service_debug_cmd=*) service_check_cmd=* | service_debug_cmd=* | \
service_stats_cmd=*)
eval "$_line" eval "$_line"
;; ;;
@ -144,6 +153,30 @@ nfs_check_service()
fi fi
fi fi
if [ -n "$service_stats_cmd" ]; then
# If configured, always update stats,
# regardless of RPC status...
# shellcheck disable=SC2154
# script_state_dir set by ctdb_setup_state_dir
_curr="${script_state_dir}/stats_${_progname}.out"
_prev="${_curr}.prev"
if [ -f "$_curr" ]; then
mv -f "$_curr" "$_prev"
fi
eval "$service_stats_cmd" >"$_curr" 2>&1
if ! $_ok &&
! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
# Stats always implicitly change on
# the first monitor event, since
# previous stats don't exists...
echo "WARNING: statistics changed but ${_err}"
_ok=true
fi
fi
if $_ok; then if $_ok; then
if [ $unhealthy_after -ne 1 ] || if [ $unhealthy_after -ne 1 ] ||
[ $restart_every -ne 0 ]; then [ $restart_every -ne 0 ]; then

View File

@ -0,0 +1,26 @@
#!/bin/sh
. "${TEST_SCRIPTS_DIR}/unit.sh"
define_test "NFS RPC service down, stats change, 10 iterations"
setup
cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
# nfs
version="3"
restart_every=10
unhealthy_after=2
service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
service_debug_cmd="program_stack_traces nfsd 5"
# Dummy pipeline confirms that pipelines work in this context
service_stats_cmd="date --rfc-3339=ns | grep ."
EOF
# Test flag to indicate that stats are expected to change
nfs_stats_set_changed "nfs" "status"
rpc_services_down "nfs"
nfs_iterate_test 10 "nfs"

View File

@ -0,0 +1,26 @@
#!/bin/sh
. "${TEST_SCRIPTS_DIR}/unit.sh"
define_test "NFS RPC service down, stats don't change, 10 iterations"
setup
cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
# nfs
version="3"
restart_every=10
unhealthy_after=2
service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
service_debug_cmd="program_stack_traces nfsd 5"
# Dummy pipeline confirms that pipelines work in this context
service_stats_cmd="echo 'hello world' | grep ."
EOF
# Test flag to indicate that stats are expected to change
nfs_stats_set_changed "status"
rpc_services_down "nfs"
nfs_iterate_test 10 "nfs"

View File

@ -128,6 +128,31 @@ nfs_setup_fake_threads()
esac esac
} }
nfs_stats_set_changed()
{
FAKE_NFS_STATS_CHANGED=" $* "
}
nfs_stats_check_changed()
{
_rpc_service="$1"
_iteration="$2"
_t="$FAKE_NFS_STATS_CHANGED"
if [ -z "$_t" ]; then
return 1
fi
if [ "${_t#* "${_rpc_service}"}" != "$_t" ]; then
return 0
fi
# Statistics always change on the first iteration
if [ "$_iteration" -eq 1 ]; then
return 0
fi
return 1
}
guess_output() guess_output()
{ {
case "$1" in case "$1" in
@ -304,7 +329,12 @@ $_rpc_service failed RPC check:
rpcinfo: RPC: Program not registered rpcinfo: RPC: Program not registered
program $_rpc_service${_ver:+ version }${_ver} is not available" program $_rpc_service${_ver:+ version }${_ver} is not available"
if [ $unhealthy_after -gt 0 ] && if [ "$_numfails" -eq -1 ]; then
_unhealthy=false
echo 0 >"$_rc_file"
printf 'WARNING: statistics changed but %s\n' \
"$_rpc_check_out" >>"$_out"
elif [ $unhealthy_after -gt 0 ] &&
[ "$_numfails" -ge $unhealthy_after ]; then [ "$_numfails" -ge $unhealthy_after ]; then
_unhealthy=true _unhealthy=true
echo 1 >"$_rc_file" echo 1 >"$_rc_file"
@ -411,9 +441,17 @@ EOF
fi fi
if [ -n "$_rpc_service" ]; then if [ -n "$_rpc_service" ]; then
if rpcinfo -T tcp localhost "$_rpc_service" \ if rpcinfo -T tcp localhost "$_rpc_service" \
>/dev/null 2>&1 ; then >/dev/null 2>&1; then
_iterate_failcount=0 _iterate_failcount=0
elif nfs_stats_check_changed \
"$_rpc_service" "$_iteration"; then
_iterate_failcount=-1
else else
# -1 above is a special case of 0:
# hack, unhack ;-)
if [ $_iterate_failcount -eq -1 ]; then
_iterate_failcount=0
fi
_iterate_failcount=$((_iterate_failcount + 1)) _iterate_failcount=$((_iterate_failcount + 1))
fi fi
rpc_set_service_failure_response \ rpc_set_service_failure_response \