From 5b7d17d44d9abdf2f3e15b225bd53b6d21ac41dd Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 19 Feb 2024 21:42:11 +1100 Subject: [PATCH] ctdb-scripts: Add service_stats_command variable to NFS checks When monitoring an RPC service, the rpcinfo command might time out even though the service is making progress. In this case, it is just slow, so counting the timeout as a failure and potentially restarting the service will not help. The problem is determining if a service is making progress. Add a new NFS checks service_stats_command. This command is intended to run a statistics command. The output is naively compared using cmp(1). If the output changes then rpcinfo failures are converted to successes. Signed-off-by: Martin Schwenke Reviewed-by: Amitay Isaacs --- ctdb/config/events/legacy/60.nfs.script | 35 +++++++++++++++- .../UNIT/eventscripts/60.nfs.monitor.115.sh | 26 ++++++++++++ .../UNIT/eventscripts/60.nfs.monitor.116.sh | 26 ++++++++++++ .../tests/UNIT/eventscripts/scripts/60.nfs.sh | 42 ++++++++++++++++++- 4 files changed, 126 insertions(+), 3 deletions(-) create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh diff --git a/ctdb/config/events/legacy/60.nfs.script b/ctdb/config/events/legacy/60.nfs.script index 6935ad9fadc..867cc72d50c 100755 --- a/ctdb/config/events/legacy/60.nfs.script +++ b/ctdb/config/events/legacy/60.nfs.script @@ -80,6 +80,13 @@ nfs_check_services() # traces of threads that have not exited, since # they may be stuck doing I/O; # no default, see also function program_stack_traces() +# * service_stats_cmd - command to retrieve statistics for given service; +# if this is set and RPC checks fail (or +# $service_check_cmd fails), then statistics are +# compared (using cmp) to see if the service is +# making progress or is truly hung; +# no default, failed service does not double-check +# failure using statistics # # Quoting in values is not preserved # @@ -103,6 +110,7 @@ nfs_check_service() service_start_cmd="" service_check_cmd="" service_debug_cmd="" + service_stats_cmd="" # Eval line-by-line. Expands variable references in values. # Also allows variable name checking, which seems useful. @@ -113,7 +121,8 @@ nfs_check_service() family=* | version=* | \ unhealthy_after=* | restart_every=* | \ service_stop_cmd=* | service_start_cmd=* | \ - service_check_cmd=* | service_debug_cmd=*) + service_check_cmd=* | service_debug_cmd=* | \ + service_stats_cmd=*) eval "$_line" ;; @@ -144,6 +153,30 @@ nfs_check_service() fi fi + if [ -n "$service_stats_cmd" ]; then + # If configured, always update stats, + # regardless of RPC status... + + # shellcheck disable=SC2154 + # script_state_dir set by ctdb_setup_state_dir + _curr="${script_state_dir}/stats_${_progname}.out" + _prev="${_curr}.prev" + + if [ -f "$_curr" ]; then + mv -f "$_curr" "$_prev" + fi + eval "$service_stats_cmd" >"$_curr" 2>&1 + + if ! $_ok && + ! cmp "$_prev" "$_curr" >/dev/null 2>&1; then + # Stats always implicitly change on + # the first monitor event, since + # previous stats don't exists... + echo "WARNING: statistics changed but ${_err}" + _ok=true + fi + fi + if $_ok; then if [ $unhealthy_after -ne 1 ] || [ $restart_every -ne 0 ]; then diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh new file mode 100755 index 00000000000..860436328d4 --- /dev/null +++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh @@ -0,0 +1,26 @@ +#!/bin/sh + +. "${TEST_SCRIPTS_DIR}/unit.sh" + +define_test "NFS RPC service down, stats change, 10 iterations" + +setup + +cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <"$_rc_file" + printf 'WARNING: statistics changed but %s\n' \ + "$_rpc_check_out" >>"$_out" + elif [ $unhealthy_after -gt 0 ] && [ "$_numfails" -ge $unhealthy_after ]; then _unhealthy=true echo 1 >"$_rc_file" @@ -411,9 +441,17 @@ EOF fi if [ -n "$_rpc_service" ]; then if rpcinfo -T tcp localhost "$_rpc_service" \ - >/dev/null 2>&1 ; then + >/dev/null 2>&1; then _iterate_failcount=0 + elif nfs_stats_check_changed \ + "$_rpc_service" "$_iteration"; then + _iterate_failcount=-1 else + # -1 above is a special case of 0: + # hack, unhack ;-) + if [ $_iterate_failcount -eq -1 ]; then + _iterate_failcount=0 + fi _iterate_failcount=$((_iterate_failcount + 1)) fi rpc_set_service_failure_response \