From 398116ff295856dd5eccaa37acf4c7727501e8e8 Mon Sep 17 00:00:00 2001
From: Martin Schwenke <martin@meltin.net>
Date: Fri, 17 Dec 2010 16:25:04 +1100
Subject: [PATCH 1/4] Eventscripts: clean up 60.nfs monitor event.

This adds a helper function called nfs_check_rpc_service() and uses it
to make the monitor event much more readable.  An example of usage is
as follows:

  nfs_check_rpc_service "mountd" \
    -ge 10 "verbose restart:b unhealthy" \
    -eq 5 "restart:b"

The first argument to nfs_check_rpc_service() is the name of the RPC
service to be checked.  The RPC service corresponding to this name
is checked for availability using the rpcinfo command.  If the service
is available then the function succeeds and subsequent arguments are
ignored.

If the rpcinfo check fails then a failure counter for that particular
RPC service is incremented and subsequent arguments are processed in
groups of 3:

1. An integer comparison operator supported by test.
2. An integer failure limit.
3. An action string.

The value of the failure counter is checked using (1) and (2) above.
The first check that succeeds has its action string processed - note
that this explains the somewhat curious reverse ordering of checks.

In the example above:

* If the counter is >= 10 then a verbose message is printed
  describing the failure, the service is restarted in the background
  and the node is marked as unhealthy (via an "exit 1" from the
  function).

* If the counter is == 5 then the service is restarted in the
  background.

For more action options please see the code.

This also changes the ctdb_check_rpc() function so that it no longer
takes a program number to check.  It now just takes a real RPC program
name that rpcinfo can resolve via /etc/rpc.

Signed-off-by: Martin Schwenke <martin@meltin.net>

(This used to be ctdb commit 9b66057964756a6245bafb436eb6106fb6a2866e)
---
 ctdb/config/events.d/60.nfs | 128 ++++++----------------------------
 ctdb/config/functions       | 135 ++++++++++++++++++++++++++++++++++--
 2 files changed, 150 insertions(+), 113 deletions(-)

diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs
index e77804836ba..43d5f665dbb 100755
--- a/ctdb/config/events.d/60.nfs
+++ b/ctdb/config/events.d/60.nfs
@@ -59,7 +59,7 @@ case "$1" in
 	;;
 
       monitor)
-	# and that its directories are available
+	# Check that directories for shares actually exist.
 	[ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
 	    exportfs | grep -v '^#' | grep '^/' |
 	    sed -e 's/[[:space:]]\+[^[:space:]]*$//' |
@@ -73,118 +73,32 @@ case "$1" in
 	# we only do this IF we have a rpc.statd command.
 	# For platforms where rpc.statd does not exist, we skip
 	# the check completely
-	p="rpc.statd"
-	which $p >/dev/null 2>/dev/null && {
-		if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
-			(service_name="nfs_statd"; ctdb_counter_init)
-		else
-			cmd="$p"
-			cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
-			cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
-			cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
-			(
-				service_name="nfs_statd"
-				ctdb_counter_incr
-				ctdb_check_counter_limit 10 quiet >/dev/null
-			) || {
-				echo "$ctdb_check_rpc_out"
-				echo "Trying to restart STATD [$cmd]"
-				$cmd
-			}
-		fi
-	}
+        p="rpc.statd"
+        which $p >/dev/null 2>/dev/null && \
+	    nfs_check_rpc_service "statd" \
+	    -ge 10 "verbose restart"
 
 	# check that NFS responds to rpc requests
-	[ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
-	    if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
-		(service_name="nfs_knfsd"; ctdb_counter_init)
-	    else
-		(
-			service_name="nfs_knfsd"
-			ctdb_counter_incr
-
-			ctdb_check_counter_equal 2 || {
-				echo "Trying to restart NFS service"
-				startstop_nfs restart >/dev/null 2>&1 &
-				exit 0
-			}
-
-			ctdb_check_counter_limit 5 quiet >/dev/null
-		) || {
-			echo "$ctdb_check_rpc_out"
-			echo "Trying to restart NFS service"
-			startstop_nfs restart
-			exit 1
-		}
-	    fi
-	}
+	if [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then
+	    nfs_check_rpc_service "knfsd" \
+		-ge 15 "verbose restart unhealthy" \
+		-eq 10 "restart:bs"
+	fi
 
 	# check that lockd responds to rpc requests
-	if ctdb_check_rpc "LOCKD" 100021 1 >/dev/null ; then
-		(service_name="lockd"; ctdb_counter_init)
-	else
-		(
-			service_name="lockd"
-			ctdb_counter_incr
+	nfs_check_rpc_service "lockd" \
+	    -ge 15 "verbose restart unhealthy" \
+	    -eq 10 "restart:bs"
 
-			ctdb_check_counter_equal 10 || {
-				echo "Trying to restart NFS lock service"
-				startstop_nfs restart >/dev/null 2>&1 &
-				startstop_nfslock restart  >/dev/null 2>&1 &
-				exit 0
-			}
+	# mountd is sometimes not started correctly on RHEL5
+	nfs_check_rpc_service "mountd" \
+	    -ge 10 "verbose restart:b unhealthy" \
+	    -eq 5 "restart:b"
 
-			ctdb_check_counter_limit 15 quiet >/dev/null
-	) || {
-			echo "$ctdb_check_rpc_out"
-			echo "Trying to restart NFS lock service"
-			startstop_nfs restart
-			startstop_nfslock restart
-			exit 1
-		}
-	fi
-
-	# mount needs special handling since it is sometimes not started
-	# correctly on RHEL5
-	if ctdb_check_rpc "MOUNTD" 100005 1 >/dev/null ; then
-		(service_name="nfs_mountd"; ctdb_counter_init)
-	else
-	(
-		service_name="nfs_mountd"
-		ctdb_counter_incr
-
-		ctdb_check_counter_equal 5 || {
-			p="rpc.mountd"
-			cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
-			echo "Trying to restart MOUNTD [${cmd}]"
-			killall -q -9 $p
-			$cmd &
-			exit 0
-		}
-
-		ctdb_check_counter_limit 10 quiet >/dev/null
-	) || {
-		echo "$ctdb_check_rpc_out"
-		p="rpc.mountd"
-		cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
-		echo "Trying to restart MOUNTD [${cmd}]"
-		killall -q -9 $p
-		$cmd &
-		exit 1
-	}
-	fi
-
-
-	# rquotad needs special handling since it is sometimes not started
-	# correctly on RHEL5
-	# this is not a critical service so we dont flag the node as unhealthy
-	ctdb_check_rpc "RQUOTAD" 100011 1 || {
-		p="rpc.rquotad"
-		cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
-		echo "Trying to restart RQUOTAD [${cmd}]"
-		killall -q -9 $p
-		$cmd &
-	}
+	# rquotad is sometimes not started correctly on RHEL5
+	# not a critical service so we dont flag the node as unhealthy
+	nfs_check_rpc_service "rquotad" \
+	    -gt 0 "verbose restart:b"
 
 	# once every 600 seconds, update the statd state database for which
 	# clients need notifications
diff --git a/ctdb/config/functions b/ctdb/config/functions
index 2668531ca83..329eceb2f7f 100755
--- a/ctdb/config/functions
+++ b/ctdb/config/functions
@@ -105,18 +105,141 @@ get_proc ()
     cat "/proc/$1"
 }
 
+######################################################
+# Check that an RPC service is healthy -
+# this includes allowing a certain number of failures
+# before marking the NFS service unhealthy.
+#
+# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
+#
+# each triple is a set of 3 arguments: an operator, a 
+# fail count limit and an action string.
+#
+# For example:
+#
+# 	nfs_check_rpc_service "lockd" \
+#	    -ge 15 "verbose restart unhealthy" \
+#	    -eq 10 "restart:bs"
+#
+# says that if lockd is down for 15 iterations then do
+# a verbose restart of lockd and mark the node unhealthy.
+# Before this, after 10 iterations of failure, the
+# service is restarted silently in the background.
+# Order is important: the number of failures need to be
+# specified in reverse order because processing stops
+# after the first condition that is true.
+######################################################
+nfs_check_rpc_service ()
+{
+    _prog_name="$1" ; shift
+
+    _version=1
+    _rpc_prog="$_prog_name"
+    _restart=""
+    _opts=""
+    case "$_prog_name" in
+	knfsd)
+	    _rpc_prog=nfs
+	    _version=3
+	    _restart="echo 'Trying to restart NFS service'"
+	    _restart="${_restart}; startstop_nfs restart"
+	    ;;
+	mountd)
+	    _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
+	    ;;
+	rquotad)
+	    _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
+	    ;;
+	lockd)
+	    _rpc_prog=nlockmgr
+	    _version=4
+	    _restart="echo 'Trying to restart lock manager service'"
+	    _restart="${_restart}; startstop_nfs restart"
+	    _restart="${_restart}; startstop_nfslock restart"
+	    ;;
+	statd)
+	    _rpc_prog=status
+	    _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
+	    _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
+	    _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
+	    ;;
+	*)
+	    echo "Internal error: unknown RPC program \"$_prog_name\"."
+	    exit 1
+    esac
+
+    _service_name="nfs_${_prog_name}"
+
+    if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
+	ctdb_counter_init "$_service_name"
+	return 0
+    fi
+
+    ctdb_counter_incr "$_service_name"
+
+    while [ -n "$3" ] ; do
+	ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || {
+	    for _action in $3 ; do
+		case "$_action" in
+		    verbose)
+			echo "$ctdb_check_rpc_out"
+			;;
+		    restart|restart:*)
+			# No explicit command specified, construct rpc command.
+			if [ -z "$_restart" ] ; then
+			    _p="rpc.${_prog_name}"
+			    _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'"
+			    _restart="${_restart}; killall -q -9 $_p"
+			    _restart="${_restart}; $_p $_opts"
+			fi
+
+			# Process restart flags...
+			_flags="${_action#restart:}"
+			# There may not have been a colon...
+			[ "$_flags" != "$_action" ] || _flags=""
+			# q=quiet - everything to /dev/null
+			if [ "${_flags#*q}" != "$_flags" ] ; then
+			    _restart="{ ${_restart} ; } >/dev/null 2>&1"
+			fi
+			# s=stealthy - last command to /dev/null
+			if [ "${_flags#*s}" != "$_flags" ] ; then
+			    _restart="${_restart} >/dev/null 2>&1"
+			fi
+			# b=background - the whole thing, easy and reliable
+			if [ "${_flags#*b}" != "$_flags" ] ; then
+			    _restart="{ ${_restart} ; } &"
+			fi
+
+			# Do it!
+			eval "${_restart}"
+			;;
+		    unhealthy)
+			exit 1
+			;;
+		    *)
+			echo "Internal error: unknown action \"$_action\"."
+			exit 1
+		esac
+	    done
+
+	    # Only process the first action group.
+	    break
+	}
+	shift 3
+    done
+}
+
 ######################################################
 # check that a rpc server is registered with portmap
 # and responding to requests
-# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION
+# usage: ctdb_check_rpc SERVICE_NAME VERSION
 ######################################################
-ctdb_check_rpc() {
+ctdb_check_rpc ()
+{
     progname="$1"
-    prognum="$2"
-    version="$3"
+    version="$2"
 
-    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
-    if [ $? -ne 0 ] ; then
+    if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then
 	ctdb_check_rpc_out="ERROR: $progname failed RPC check:
 $ctdb_check_rpc_out"
 	echo "$ctdb_check_rpc_out"

From 1d71dd08e36414cdd4aac34ac21c247879447f28 Mon Sep 17 00:00:00 2001
From: Martin Schwenke <martin@meltin.net>
Date: Fri, 14 Jan 2011 09:40:11 +1100
Subject: [PATCH 2/4] Eventscripts: change failure counts and behaviour for
 statd and nfsd.

We reduce the number of failures before attempting a restart.
However, after 6 failures we mark the node unhealthy and no longer
try to restart.  If the previous 2 attempts didn't work then there
isn't any use in bogging the system down with an attempted restart on
every monitor event.

Signed-off-by: Martin Schwenke <martin@meltin.net>

(This used to be ctdb commit f654739080b40b7ac1b7f998cacc689d3d4e3193)
---
 ctdb/config/events.d/60.nfs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs
index 43d5f665dbb..2b4c158d683 100755
--- a/ctdb/config/events.d/60.nfs
+++ b/ctdb/config/events.d/60.nfs
@@ -76,13 +76,16 @@ case "$1" in
         p="rpc.statd"
         which $p >/dev/null 2>/dev/null && \
 	    nfs_check_rpc_service "statd" \
-	    -ge 10 "verbose restart"
+	        -ge 6 "verbose unhealthy" \
+	        -eq 4 "verbose restart" \
+		-eq 2 "restart:bs"
 
 	# check that NFS responds to rpc requests
 	if [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then
 	    nfs_check_rpc_service "knfsd" \
-		-ge 15 "verbose restart unhealthy" \
-		-eq 10 "restart:bs"
+		-ge 6 "verbose unhealthy" \
+		-eq 4 "verbose restart" \
+		-eq 2 "restart:bs"
 	fi
 
 	# check that lockd responds to rpc requests

From 32fe247e37fe97c98c12f196d955d9be682eaa74 Mon Sep 17 00:00:00 2001
From: Martin Schwenke <martin@meltin.net>
Date: Fri, 12 Aug 2011 16:28:09 +1000
Subject: [PATCH 3/4] Eventscripts: In 60.nfs don't restart NFS when restarting
 rpc.lockd.

This effectively reverts 953dbfbddad656a64e30a6aca115cb1479d11573 and
is a policy decision.

Signed-off-by: Martin Schwenke <martin@meltin.net>

(This used to be ctdb commit 380c9263eb37db5a250264316e250c2160908263)
---
 ctdb/config/functions | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ctdb/config/functions b/ctdb/config/functions
index 329eceb2f7f..b04965281db 100755
--- a/ctdb/config/functions
+++ b/ctdb/config/functions
@@ -154,7 +154,6 @@ nfs_check_rpc_service ()
 	    _rpc_prog=nlockmgr
 	    _version=4
 	    _restart="echo 'Trying to restart lock manager service'"
-	    _restart="${_restart}; startstop_nfs restart"
 	    _restart="${_restart}; startstop_nfslock restart"
 	    ;;
 	statd)

From 3b43805a318c2c51f783db83e963e69f6751ad3d Mon Sep 17 00:00:00 2001
From: Martin Schwenke <martin@meltin.net>
Date: Fri, 12 Aug 2011 16:30:54 +1000
Subject: [PATCH 4/4] Tests: re-enable the NFS eventscript tests - they work
 again.

Signed-off-by: Martin Schwenke <martin@meltin.net>

(This used to be ctdb commit 3e145ab1bb61ed2087ec5ce6183ee24802686ed3)
---
 ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh | 0
 ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh | 0
 14 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh
 mode change 100644 => 100755 ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh

diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh
old mode 100644
new mode 100755
diff --git a/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh b/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh
old mode 100644
new mode 100755