2025-01-03 01:18:10 +03:00 · 2024-02-19 21:42:11 +11:00 · 2024-02-19 21:42:11 +11:00 · 5b7d17d44d
commit 5b7d17d44d
parent d86f9ff7fd
4 changed files with 126 additions and 3 deletions
--- a/ctdb/config/events/legacy/60.nfs.script
+++ b/ctdb/config/events/legacy/60.nfs.script
@ -80,6 +80,13 @@ nfs_check_services()
 #                        traces of threads that have not exited, since
 #                        they may be stuck doing I/O;
 #                        no default, see also function program_stack_traces()
+# * service_stats_cmd  - command to retrieve statistics for  given service;
+#                        if this is set and RPC checks fail (or
+#                        $service_check_cmd fails), then statistics are
+#                        compared (using cmp) to see if the service is
+#                        making progress or is truly hung;
+#                        no default, failed service does not double-check
+#                        failure using statistics
 #
 # Quoting in values is not preserved
 #
@ -103,6 +110,7 @@ nfs_check_service()
 		service_start_cmd=""
 		service_check_cmd=""
 		service_debug_cmd=""
+		service_stats_cmd=""

 		# Eval line-by-line.  Expands variable references in values.
 		# Also allows variable name checking, which seems useful.
@ -113,7 +121,8 @@ nfs_check_service()
 			family=* | version=* | \
 				unhealthy_after=* | restart_every=* | \
 				service_stop_cmd=* | service_start_cmd=* | \
-				service_check_cmd=* | service_debug_cmd=*)
+				service_check_cmd=* | service_debug_cmd=* | \
+				service_stats_cmd=*)

 				eval "$_line"
 				;;
@ -144,6 +153,30 @@ nfs_check_service()
 			fi
 		fi

+		if [ -n "$service_stats_cmd" ]; then
+			# If configured, always update stats,
+			# regardless of RPC status...
+
+			# shellcheck disable=SC2154
+			# script_state_dir set by ctdb_setup_state_dir
+			_curr="${script_state_dir}/stats_${_progname}.out"
+			_prev="${_curr}.prev"
+
+			if [ -f "$_curr" ]; then
+				mv -f "$_curr" "$_prev"
+			fi
+			eval "$service_stats_cmd" >"$_curr" 2>&1
+
+			if ! $_ok &&
+				! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
+				# Stats always implicitly change on
+				# the first monitor event, since
+				# previous stats don't exists...
+				echo "WARNING: statistics changed but ${_err}"
+				_ok=true
+			fi
+		fi
+
 		if $_ok; then
 			if [ $unhealthy_after -ne 1 ] ||
 				[ $restart_every -ne 0 ]; then
--- a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
@ -0,0 +1,26 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service down, stats change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="date --rfc-3339=ns | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "nfs" "status"
+
+rpc_services_down "nfs"
+
+nfs_iterate_test 10 "nfs"
--- a/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
+++ b/ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
@ -0,0 +1,26 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "NFS RPC service down, stats don't change, 10 iterations"
+
+setup
+
+cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
+# nfs
+version="3"
+restart_every=10
+unhealthy_after=2
+service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
+service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
+service_debug_cmd="program_stack_traces nfsd 5"
+# Dummy pipeline confirms that pipelines work in this context
+service_stats_cmd="echo 'hello world' | grep ."
+EOF
+
+# Test flag to indicate that stats are expected to change
+nfs_stats_set_changed "status"
+
+rpc_services_down "nfs"
+
+nfs_iterate_test 10 "nfs"
--- a/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
+++ b/ctdb/tests/UNIT/eventscripts/scripts/60.nfs.sh
@ -128,6 +128,31 @@ nfs_setup_fake_threads()
 	esac
 }

+nfs_stats_set_changed()
+{
+	FAKE_NFS_STATS_CHANGED=" $* "
+}
+
+nfs_stats_check_changed()
+{
+	_rpc_service="$1"
+	_iteration="$2"
+
+	_t="$FAKE_NFS_STATS_CHANGED"
+	if [ -z "$_t" ]; then
+		return 1
+	fi
+	if [ "${_t#* "${_rpc_service}"}" != "$_t" ]; then
+		return 0
+	fi
+	# Statistics always change on the first iteration
+	if [ "$_iteration" -eq 1 ]; then
+		return 0
+	fi
+
+	return 1
+}
+
 guess_output()
 {
 	case "$1" in
@ -304,7 +329,12 @@ $_rpc_service failed RPC check:
 rpcinfo: RPC: Program not registered
 program $_rpc_service${_ver:+ version }${_ver} is not available"

-		if [ $unhealthy_after -gt 0 ] &&
+		if [ "$_numfails" -eq -1 ]; then
+			_unhealthy=false
+			echo 0 >"$_rc_file"
+			printf 'WARNING: statistics changed but %s\n' \
+				"$_rpc_check_out" >>"$_out"
+		elif [ $unhealthy_after -gt 0 ] &&
 			[ "$_numfails" -ge $unhealthy_after ]; then
 			_unhealthy=true
 			echo 1 >"$_rc_file"
@ -411,9 +441,17 @@ EOF
 		fi
 		if [ -n "$_rpc_service" ]; then
 			if rpcinfo -T tcp localhost "$_rpc_service" \
-				   >/dev/null 2>&1 ; then
+				>/dev/null 2>&1; then
 				_iterate_failcount=0
+			elif nfs_stats_check_changed \
+				"$_rpc_service" "$_iteration"; then
+				_iterate_failcount=-1
 			else
+				# -1 above is a special case of 0:
+				# hack, unhack ;-)
+				if [ $_iterate_failcount -eq -1 ]; then
+					_iterate_failcount=0
+				fi
 				_iterate_failcount=$((_iterate_failcount + 1))
 			fi
 			rpc_set_service_failure_response \