From 5b7d17d44d9abdf2f3e15b225bd53b6d21ac41dd Mon Sep 17 00:00:00 2001
From: Martin Schwenke <mschwenke@ddn.com>
Date: Mon, 19 Feb 2024 21:42:11 +1100
Subject: [PATCH] ctdb-scripts: Add service_stats_command variable to NFS
 checks

When monitoring an RPC service, the rpcinfo command might time out
even though the service is making progress.  In this case, it is just
slow, so counting the timeout as a failure and potentially restarting
the service will not help.  The problem is determining if a service is
making progress.

Add a new NFS checks service_stats_command.  This command is intended
to run a statistics command.  The output is naively compared using
cmp(1).  If the output changes then rpcinfo failures are converted to
successes.

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
---
 ctdb/config/events/legacy/60.nfs.script       | 35 +++++++++++++++-
 .../UNIT/eventscripts/60.nfs.monitor.115.sh   | 26 ++++++++++++
 .../UNIT/eventscripts/60.nfs.monitor.116.sh   | 26 ++++++++++++
 .../tests/UNIT/eventscripts/scripts/60.nfs.sh | 42 ++++++++++++++++++-
 4 files changed, 126 insertions(+), 3 deletions(-)
 create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
 create mode 100755 ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh

diff --git a/ctdb/config/events/legacy/60.nfs.script b/ctdb/config/events/legacy/60.nfs.script
index 6935ad9fadc..867cc72d50c 100755
--- a/ctdb/config/events/legacy/60.nfs.script
+++ b/ctdb/config/events/legacy/60.nfs.script
@@ -80,6 +80,13 @@ nfs_check_services()
 #                        traces of threads that have not exited, since
 #                        they may be stuck doing I/O;
 #                        no default, see also function program_stack_traces()
+# * service_stats_cmd  - command to retrieve statistics for  given service;
+#                        if this is set and RPC checks fail (or
+#                        $service_check_cmd fails), then statistics are
+#                        compared (using cmp) to see if the service is
+#                        making progress or is truly hung;
+#                        no default, failed service does not double-check
+#                        failure using statistics
 #
 # Quoting in values is not preserved
 #
@@ -103,6 +110,7 @@ nfs_check_service()
 		service_start_cmd=""
 		service_check_cmd=""
 		service_debug_cmd=""
+		service_stats_cmd=""
 
 		# Eval line-by-line.  Expands variable references in values.
 		# Also allows variable name checking, which seems useful.
@@ -113,7 +121,8 @@ nfs_check_service()
 			family=* | version=* | \
 				unhealthy_after=* | restart_every=* | \
 				service_stop_cmd=* | service_start_cmd=* | \
-				service_check_cmd=* | service_debug_cmd=*)
+				service_check_cmd=* | service_debug_cmd=* | \
+				service_stats_cmd=*)
 
 				eval "$_line"
 				;;
@@ -144,6 +153,30 @@ nfs_check_service()
 			fi
 		fi
 
+		if [ -n "$service_stats_cmd" ]; then
+			# If configured, always update stats,
+			# regardless of RPC status...
+
+			# shellcheck disable=SC2154
+			# script_state_dir set by ctdb_setup_state_dir
+			_curr="${script_state_dir}/stats_${_progname}.out"
+			_prev="${_curr}.prev"
+
+			if [ -f "$_curr" ]; then
+				mv -f "$_curr" "$_prev"
+			fi
+			eval "$service_stats_cmd" >"$_curr" 2>&1
+
+			if ! $_ok &&
+				! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
+				# Stats always implicitly change on
+				# the first monitor event, since
+				# previous stats don't exists...
+				echo "WARNING: statistics changed but ${_err}"
+				_ok=true
+			fi
+		fi
+
 		if $_ok; then
 			if [ $unhealthy_after -ne 1 ] ||
 				[ $restart_every -ne 0 ]; then
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
new file mode 100755
index 00000000000..860436328d4
--- /dev/null
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service down, stats change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="date --rfc-3339=ns | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "nfs" "status"
+
+rpc_services_down "nfs"
+
+nfs_iterate_test 10 "nfs"
diff --git a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
new file mode 100755
index 00000000000..1bdd29e73ec
--- /dev/null
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service down, stats don't change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="echo 'hello world' | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "status"
+
+rpc_services_down "nfs"
+
+nfs_iterate_test 10 "nfs"
diff --git a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
index 9c614c76bb7..7c3435c8ce0 100644
--- a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
+++ b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
@@ -128,6 +128,31 @@ nfs_setup_fake_threads()
 	esac
 }
 
+nfs_stats_set_changed()
+{
+	FAKE_NFS_STATS_CHANGED=" $* "
+}
+
+nfs_stats_check_changed()
+{
+	_rpc_service="$1"
+	_iteration="$2"
+
+	_t="$FAKE_NFS_STATS_CHANGED"
+	if [ -z "$_t" ]; then
+		return 1
+	fi
+	if [ "${_t#* "${_rpc_service}"}" != "$_t" ]; then
+		return 0
+	fi
+	# Statistics always change on the first iteration
+	if [ "$_iteration" -eq 1 ]; then
+		return 0
+	fi
+
+	return 1
+}
+
 guess_output()
 {
 	case "$1" in
@@ -304,7 +329,12 @@ $_rpc_service failed RPC check:
 rpcinfo: RPC: Program not registered
 program $_rpc_service${_ver:+ version }${_ver} is not available"
 
-		if [ $unhealthy_after -gt 0 ] &&
+		if [ "$_numfails" -eq -1 ]; then
+			_unhealthy=false
+			echo 0 >"$_rc_file"
+			printf 'WARNING: statistics changed but %s\n' \
+				"$_rpc_check_out" >>"$_out"
+		elif [ $unhealthy_after -gt 0 ] &&
 			[ "$_numfails" -ge $unhealthy_after ]; then
 			_unhealthy=true
 			echo 1 >"$_rc_file"
@@ -411,9 +441,17 @@ EOF
 		fi
 		if [ -n "$_rpc_service" ]; then
 			if rpcinfo -T tcp localhost "$_rpc_service" \
-				   >/dev/null 2>&1 ; then
+				>/dev/null 2>&1; then
 				_iterate_failcount=0
+			elif nfs_stats_check_changed \
+				"$_rpc_service" "$_iteration"; then
+				_iterate_failcount=-1
 			else
+				# -1 above is a special case of 0:
+				# hack, unhack ;-)
+				if [ $_iterate_failcount -eq -1 ]; then
+					_iterate_failcount=0
+				fi
 				_iterate_failcount=$((_iterate_failcount + 1))
 			fi
 			rpc_set_service_failure_response \