
ctdb-scripts: Avoid flapping NFS services at startup

If an NFS service check is set to, say, unhealthy_after=2 then it will
always switch from the (default startup) unhealthy state to healthy,
even if there is a fatal problem.  If all services/scripts appear OK
then the node will become healthy.  When the counter hits the limit it
will return to unhealthy.  This is misleading.

Instead, never use the counter at startup, until the service becomes
healthy.  This stops services flapping unhealthy-healthy-unhealthy.
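
To illustrate, here is a stand-alone toy model of the new behaviour.  It
is not ctdb code (the real change to nfs_check_service() is in the diff
below); the counter path, the simulated check and the messages are made
up:

    #!/bin/sh
    # The counter file stands in for the per-service fail counter.  It is
    # only created once the service has been seen healthy, so a failure
    # before that point is reported immediately instead of being counted
    # up towards unhealthy_after.

    counter="${TMPDIR:-/tmp}/nfs.failcount"
    unhealthy_after=2

    monitor()
    {
            # $1 simulates the service check result: "ok" or "fail"
            if [ "$1" = "ok" ]; then
                    echo 0 >"$counter"      # healthy: create/reset the counter
                    echo "healthy"
                    return
            fi

            if [ ! -e "$counter" ]; then
                    echo "UNHEALTHY (failed at startup, not counted)"
                    return
            fi

            failcount=$(($(cat "$counter") + 1))
            echo "$failcount" >"$counter"
            if [ "$failcount" -ge "$unhealthy_after" ]; then
                    echo "UNHEALTHY (reached unhealthy_after=$unhealthy_after)"
            else
                    echo "healthy (failure $failcount counted)"
            fi
    }

    rm -f "$counter"
    monitor fail    # old code: counted, node stayed healthy; now: unhealthy at once
    monitor ok      # service recovers: the counter now exists
    monitor fail    # a failure under load is counted, node stays healthy
    monitor fail    # second failure reaches unhealthy_after=2: unhealthy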

A side-effect is that a service that starts in a broken state will
never be restarted to try to fix the problem.  This makes sense.  The
counting and restarting really exist to deal with problems that might
occur under load.  The first monitor events occur before public IPs
are hosted, so there can be no load.  If a service doesn't start
reliably the first time then the admin probably wants to know about
it.

nfs_iterate_test() is updated to run an initial monitor event to mark
the services as healthy.  This initialises the counter so it can be
used for the important part of the test.  Passing the -i option avoids
running the extra monitor event, so the first iteration will be the
initial monitor event.
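
Concretely, the two call forms now behave like this (a sketch; the
service name and iteration count are just examples):

    # Default: an extra, successful monitor event runs first, so the
    # services are marked healthy and the fail counters exist before
    # the failing iterations start.
    nfs_iterate_test 10 "nfs"

    # With -i no extra event is run: the first of the 10 iterations is
    # the initial monitor event, i.e. the service has never been healthy.
    nfs_iterate_test -i 10 "nfs"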

Signed-off-by: Martin Schwenke <mschwenke@ddn.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
Author: Martin Schwenke, 2024-06-29 12:25:59 +10:00 (committed by Martin Schwenke)
parent 18a29ed367
commit 578dfa5765
5 changed files with 93 additions and 7 deletions

@@ -193,6 +193,12 @@ nfs_check_service()
                         exit 0
                 fi
 
+                # Don't count immediately after startup
+                if ! ctdb_counter_exists "$_progname"; then
+                        echo "ERROR: $_err"
+                        exit 1
+                fi
+
                 ctdb_counter_incr "$_progname"
                 _failcount=$(ctdb_counter_get "$_progname")

@@ -866,6 +866,11 @@ ctdb_counter_get()
         # shellcheck disable=SC2086
         echo $_val
 }
 
+ctdb_counter_exists()
+{
+        _ctdb_counter_common "$1"
+        [ -e "$_counter_file" ]
+}
+
 #
 # Fail counter/threshold combination to control warnings and node unhealthy
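
The new predicate just tests for the on-disk counter file that
_ctdb_counter_common() evidently sets up in $_counter_file, so a counter
"exists" exactly when one of the other helpers has already created it.
A rough caller-side sketch (the service name and the flow are
illustrative; ctdb_counter_init is the existing create/reset helper in
this file, not shown in this excerpt):

    if ! ctdb_counter_exists "nfs"; then
            echo "never been healthy since startup, nothing counted yet"
    fi

    ctdb_counter_init "nfs"        # create/reset the counter file
    ctdb_counter_incr "nfs"        # record one failure
    echo "failures so far: $(ctdb_counter_get "nfs")"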

@@ -0,0 +1,9 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "nfs down, 1 iteration, not previously healthy"
+
+setup
+
+nfs_iterate_test -i 1 "nfs"

@@ -0,0 +1,9 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "nfs down, 10 iterations, not previously healthy"
+
+setup
+
+nfs_iterate_test -i 10 "nfs"

@@ -22,6 +22,8 @@ EOF
         export RPCNFSDCOUNT
 
+        TEST_RPC_ALL_SERVICES="portmapper nfs mountd rquotad nlockmgr status"
+
         if [ "$1" != "down" ]; then
                 debug <<EOF
 Setting up NFS environment: all RPC services up, NFS managed by CTDB
@@ -40,9 +42,9 @@ EOF
                 ;;
         esac
 
-        _rpc_services_up \
-                "portmapper" "nfs" "mountd" "rquotad" \
-                "nlockmgr" "status"
+        # Intentional word splitting
+        # shellcheck disable=SC2086
+        _rpc_services_up $TEST_RPC_ALL_SERVICES
 
         nfs_setup_fake_threads "nfsd"
         nfs_setup_fake_threads "rpc.foobar" # Set the variable to empty
@@ -283,6 +285,35 @@ program ${_rpc_service}${_ver:+ version }${_ver} is not available
 EOF
 }
 
+_rpc_was_healthy_common()
+{
+        _rpc_service="$1"
+
+        _f="rpc.${_rpc_service}.was_healthy"
+        _rpc_was_healthy_file="${CTDB_TEST_TMP_DIR}/${_f}"
+}
+
+_rpc_set_was_healthy()
+{
+        if [ $# -eq 0 ]; then
+                # Intentional word splitting
+                # shellcheck disable=SC2086
+                set -- $TEST_RPC_ALL_SERVICES
+        fi
+
+        for _rpc_service; do
+                _rpc_was_healthy_common "$_rpc_service"
+                touch "$_rpc_was_healthy_file"
+        done
+}
+
+_rpc_check_was_healthy()
+{
+        _rpc_was_healthy_common "$1"
+
+        [ -e "$_rpc_was_healthy_file" ]
+}
+
 # Set the required result for a particular RPC program having failed
 # for a certain number of iterations. This is probably still a work
 # in progress. Note that we could hook aggressively
@@ -299,6 +330,7 @@ rpc_set_service_failure_response()
         ok_null
 
         if [ -z "$_rpc_service" ]; then
+                _rpc_set_was_healthy
                 return
         fi
@@ -376,6 +408,7 @@ rpc_set_service_failure_response()
                 # shellcheck disable=SC2181
                 if [ $? -eq 0 ]; then
                         echo 0 >"$_failcount_file"
+                        _rpc_set_was_healthy "$_rpc_service"
                         exit # from subshell
                 elif rpcinfo_timed_out "$_ri_out"; then
                         _why="Timed out"
@@ -390,6 +423,10 @@ rpc_set_service_failure_response()
                                 echo 0 >"$_failcount_file"
                                 exit # from subshell
                         fi
+                elif ! _rpc_check_was_healthy "$_rpc_service"; then
+                        echo 1 >"$_rc_file"
+                        rpc_failure "ERROR:" "$_rpc_service" "$_ver" >"$_out"
+                        exit # from subshell
                 fi
 
                 _numfails=$((_numfails + 1))
@@ -407,6 +444,7 @@ rpc_set_service_failure_response()
                                 >"$_out"
                 else
                         _unhealthy=false
+                        _rpc_set_was_healthy "$_rpc_service"
                 fi
 
                 if [ $restart_every -gt 0 ] &&
@@ -480,6 +518,12 @@ program_stack_traces()
 #
 nfs_iterate_test()
 {
+        _initial_monitor_event=false
+        if [ "$1" = "-i" ]; then
+                shift
+                _initial_monitor_event=true
+        fi
+
         _repeats="$1"
         _rpc_service="$2"
         _up_iteration="${3:--1}"
@@ -490,10 +534,6 @@ nfs_iterate_test()
         fi
 
         if [ -n "$_rpc_service" ]; then
-                debug <<EOF
---------------------------------------------------
-EOF
-
                 _action="${_rpc_service#*:}"
                 if [ "$_action" != "$_rpc_service" ]; then
                         _rpc_service="${_rpc_service%:*}"
@@ -501,6 +541,23 @@ EOF
                         _action=""
                 fi
 
+                if ! $_initial_monitor_event; then
+                        cat <<EOF
+--------------------------------------------------
+Running initial monitor event
+EOF
+
+                        # Remember a successful test result...
+                        rpc_set_service_failure_response "$_rpc_service"
+                        # ... and a successful monitor result
+                        simple_test
+                fi
+
+                cat <<EOF
+--------------------------------------------------
+EOF
+
                 if [ -n "$_action" ]; then
                         case "$_action" in
                         TIMEOUT)