2025-01-12 09:18:10 +03:00 · 2011-08-15 15:20:18 +10:00 · 2011-08-15 15:20:18 +10:00 · 2c5f1d7ccc
commit 2c5f1d7ccc
parent 775e188cb7 3b43805a31
16 changed files with 152 additions and 113 deletions
--- a/ctdb/config/events.d/60.nfs
+++ b/ctdb/config/events.d/60.nfs
@ -59,7 +59,7 @@ case "$1" in
 	;;
      monitor)
-	# and that its directories are available
+	# Check that directories for shares actually exist.
 	[ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
 	    exportfs | grep -v '^#' | grep '^/' |
 	    sed -e 's/[[:space:]]\+[^[:space:]]*$//' |
@ -73,118 +73,35 @@ case "$1" in
 	# we only do this IF we have a rpc.statd command.
 	# For platforms where rpc.statd does not exist, we skip
 	# the check completely
-	p="rpc.statd"
+        p="rpc.statd"
-	which $p >/dev/null 2>/dev/null && {
+        which $p >/dev/null 2>/dev/null && \
-		if ctdb_check_rpc "STATD" 100024 1 >/dev/null ; then
+	    nfs_check_rpc_service "statd" \
-			(service_name="nfs_statd"; ctdb_counter_init)
+	        -ge 6 "verbose unhealthy" \
-		else
+	        -eq 4 "verbose restart" \
-			cmd="$p"
+		-eq 2 "restart:bs"
 			cmd="${cmd}${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
 			cmd="${cmd}${STATD_PORT:+ -p }${STATD_PORT}"
 			cmd="${cmd}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
 			(
 				service_name="nfs_statd"
 				ctdb_counter_incr
 				ctdb_check_counter_limit 10 quiet >/dev/null
 			) || {
 				echo "$ctdb_check_rpc_out"
 				echo "Trying to restart STATD [$cmd]"
 				$cmd
 			}
 		fi
 	}
 	# check that NFS responds to rpc requests
-	[ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" = "yes" ] || {
+	if [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then
-	    if ctdb_check_rpc "NFS" 100003 3 >/dev/null ; then
+	    nfs_check_rpc_service "knfsd" \
-		(service_name="nfs_knfsd"; ctdb_counter_init)
+		-ge 6 "verbose unhealthy" \
-	    else
+		-eq 4 "verbose restart" \
-		(
+		-eq 2 "restart:bs"
-			service_name="nfs_knfsd"
+	fi
 			ctdb_counter_incr
 			ctdb_check_counter_equal 2 || {
 				echo "Trying to restart NFS service"
 				startstop_nfs restart >/dev/null 2>&1 &
 				exit 0
 			}
 			ctdb_check_counter_limit 5 quiet >/dev/null
 		) || {
 			echo "$ctdb_check_rpc_out"
 			echo "Trying to restart NFS service"
 			startstop_nfs restart
 			exit 1
 		}
 	    fi
 	}
 	# check that lockd responds to rpc requests
-	if ctdb_check_rpc "LOCKD" 100021 1 >/dev/null ; then
+	nfs_check_rpc_service "lockd" \
-		(service_name="lockd"; ctdb_counter_init)
+	    -ge 15 "verbose restart unhealthy" \
-	else
+	    -eq 10 "restart:bs"
 		(
 			service_name="lockd"
 			ctdb_counter_incr
-			ctdb_check_counter_equal 10 || {
+	# mountd is sometimes not started correctly on RHEL5
-				echo "Trying to restart NFS lock service"
+	nfs_check_rpc_service "mountd" \
-				startstop_nfs restart >/dev/null 2>&1 &
+	    -ge 10 "verbose restart:b unhealthy" \
-				startstop_nfslock restart  >/dev/null 2>&1 &
+	    -eq 5 "restart:b"
 				exit 0
 			}
-			ctdb_check_counter_limit 15 quiet >/dev/null
+	# rquotad is sometimes not started correctly on RHEL5
-	) || {
+	# not a critical service so we dont flag the node as unhealthy
-			echo "$ctdb_check_rpc_out"
+	nfs_check_rpc_service "rquotad" \
-			echo "Trying to restart NFS lock service"
+	    -gt 0 "verbose restart:b"
 			startstop_nfs restart
 			startstop_nfslock restart
 			exit 1
 		}
 	fi
 	# mount needs special handling since it is sometimes not started
 	# correctly on RHEL5
 	if ctdb_check_rpc "MOUNTD" 100005 1 >/dev/null ; then
 		(service_name="nfs_mountd"; ctdb_counter_init)
 	else
 	(
 		service_name="nfs_mountd"
 		ctdb_counter_incr
 		ctdb_check_counter_equal 5 || {
 			p="rpc.mountd"
 			cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
 			echo "Trying to restart MOUNTD [${cmd}]"
 			killall -q -9 $p
 			$cmd &
 			exit 0
 		}
 		ctdb_check_counter_limit 10 quiet >/dev/null
 	) || {
 		echo "$ctdb_check_rpc_out"
 		p="rpc.mountd"
 		cmd="${p}${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
 		echo "Trying to restart MOUNTD [${cmd}]"
 		killall -q -9 $p
 		$cmd &
 		exit 1
 	}
 	fi
 	# rquotad needs special handling since it is sometimes not started
 	# correctly on RHEL5
 	# this is not a critical service so we dont flag the node as unhealthy
 	ctdb_check_rpc "RQUOTAD" 100011 1 || {
 		p="rpc.rquotad"
 		cmd="${p}${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
 		echo "Trying to restart RQUOTAD [${cmd}]"
 		killall -q -9 $p
 		$cmd &
 	}
 	# once every 600 seconds, update the statd state database for which
 	# clients need notifications
--- a/ctdb/config/functions
+++ b/ctdb/config/functions
@ -105,18 +105,140 @@ get_proc ()
    cat "/proc/$1"
 }
 ######################################################
 # nfs_check_rpc_service - probe one NFS-related RPC
 # service and react to repeated failures, so that a
 # number of failures can be tolerated before the NFS
 # service is marked unhealthy.
 #
 # usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
 #
 # SERVICE_NAME is one of knfsd, mountd, rquotad, lockd
 # or statd.  Any other name prints an internal error and
 # exits with status 1.
 #
 # If the RPC check succeeds, ctdb_counter_init is called
 # for the service and the function returns 0.  Otherwise
 # ctdb_counter_incr is called and the triples are tried
 # in the order given.
 #
 # Each triple is 3 arguments: an operator, a fail count
 # limit and an action string.  The action string is a
 # whitespace separated list of:
 #
 #   verbose      print the output of the failed RPC check
 #   restart[:F]  restart the service, where F is any mix of
 #                  q - all restart output to /dev/null
 #                  s - last restart command to /dev/null
 #                  b - run the restart in the background
 #   unhealthy    exit with status 1
 #
 # Any other action prints an internal error and exits
 # with status 1.
 #
 # For example:
 #
 # 	nfs_check_rpc_service "lockd" \
 #	    -ge 15 "verbose restart unhealthy" \
 #	    -eq 10 "restart:bs"
 #
 # means: once lockd has been down for 15 iterations, do
 # a verbose restart of lockd and mark the node unhealthy.
 # Earlier, after 10 iterations of failure, the service
 # is restarted silently in the background.
 # Only the first triple whose condition is true is
 # acted upon, so the failure counts have to be listed
 # from highest to lowest.
 ######################################################
 nfs_check_rpc_service ()
 {
     _prog_name="$1"
     shift
     # Defaults; the per-service section below overrides some of them.
     _rpc_prog="$_prog_name"
     _version=1
     _opts=""
     _restart=""
     case "$_prog_name" in
         statd)
             _rpc_prog=status
             _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
             _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
             _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
             ;;
         lockd)
             _rpc_prog=nlockmgr
             _version=4
             _restart="echo 'Trying to restart lock manager service'; startstop_nfslock restart"
             ;;
         knfsd)
             _rpc_prog=nfs
             _version=3
             _restart="echo 'Trying to restart NFS service'; startstop_nfs restart"
             ;;
         rquotad)
             _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
             ;;
         mountd)
             _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
             ;;
         *)
             echo "Internal error: unknown RPC program \"$_prog_name\"."
             exit 1
             ;;
     esac
     _service_name="nfs_${_prog_name}"
     # Healthy service: reinitialise its counter and finish.
     ctdb_check_rpc "$_rpc_prog" $_version >/dev/null && {
         ctdb_counter_init "$_service_name"
         return 0
     }
     ctdb_counter_incr "$_service_name"
     # Walk the (operator, limit, actions) triples in order.
     until [ -z "$3" ] ; do
         if ctdb_check_counter "quiet" "$1" "$2" "$_service_name" ; then
             # Condition not met, move on to the next triple.
             shift 3
             continue
         fi
         for _action in $3 ; do
             case "$_action" in
                 unhealthy)
                     exit 1
                     ;;
                 verbose)
                     echo "$ctdb_check_rpc_out"
                     ;;
                 restart|restart:*)
                     # Services without a dedicated restart command
                     # get a kill-and-respawn of their rpc.* daemon.
                     if [ -z "$_restart" ] ; then
                         _p="rpc.${_prog_name}"
                         _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'; killall -q -9 $_p; $_p $_opts"
                     fi
                     # Flags are whatever follows the colon, if any.
                     case "$_action" in
                         restart:*) _flags="${_action#restart:}" ;;
                         *)         _flags="" ;;
                     esac
                     # q: silence every command of the restart.
                     case "$_flags" in
                         *q*) _restart="{ ${_restart} ; } >/dev/null 2>&1" ;;
                     esac
                     # s: silence only the final command.
                     case "$_flags" in
                         *s*) _restart="${_restart} >/dev/null 2>&1" ;;
                     esac
                     # b: push the whole restart into the background.
                     case "$_flags" in
                         *b*) _restart="{ ${_restart} ; } &" ;;
                     esac
                     eval "${_restart}"
                     ;;
                 *)
                     echo "Internal error: unknown action \"$_action\"."
                     exit 1
                     ;;
             esac
         done
         # Later triples are ignored once one has been acted upon.
         break
     done
 }
 ######################################################
 # check that a rpc server is registered with portmap
 # and responding to requests
-# usage: ctdb_check_rpc SERVICE_NAME PROGNUM VERSION
+# usage: ctdb_check_rpc SERVICE_NAME VERSION
 ######################################################
-ctdb_check_rpc() {
+ctdb_check_rpc ()
 {
    progname="$1"
-    prognum="$2"
+    version="$2"
    version="$3"
-    ctdb_check_rpc_out=$(rpcinfo -u localhost $prognum $version 2>&1)
+    if ! ctdb_check_rpc_out=$(rpcinfo -u localhost $progname $version 2>&1) ; then
    if [ $? -ne 0 ] ; then
 	ctdb_check_rpc_out="ERROR: $progname failed RPC check:
 $ctdb_check_rpc_out"
 	echo "$ctdb_check_rpc_out"
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.001.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.100.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.101.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.111.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.112.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.121.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.122.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.131.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.132.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.141.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.142.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.151.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.152.sh
--- a/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh
+++ b/ctdb/tests/eventscripts/simple/60.nfs.monitor.153.sh