From fec69034ee939c98b64030aa6a502556d2b4704b Mon Sep 17 00:00:00 2001 From: Martin Schwenke Date: Mon, 12 Aug 2013 11:36:25 +1000 Subject: [PATCH] eventscripts: Become unhealthy faster on nfsd failure Anecdotal evidence suggests that most nfsd RPC check failures are due to cluster filesystem or storage problems. Apparently these are rarely helped by attempting to restart the NFS service because the restart tends to hang. Fail after 2 nfsd RPC check failures, instead of waiting for 6 failures. Restart on every 10th failure to try to bring the node back to good health. Update unit tests to match. Signed-off-by: Martin Schwenke (This used to be ctdb commit e9ef93f7b6dad59eabaa32124df81f3e74c651ef) --- ctdb/config/nfs-rpc-checks.d/20.nfsd.check | 5 ++--- ctdb/tests/eventscripts/60.nfs.monitor.112.sh | 5 +---- ctdb/tests/eventscripts/60.nfs.monitor.113.sh | 5 +---- ctdb/tests/eventscripts/60.nfs.monitor.114.sh | 5 +---- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/ctdb/config/nfs-rpc-checks.d/20.nfsd.check b/ctdb/config/nfs-rpc-checks.d/20.nfsd.check index d738a3245e5..aa4a2e709ca 100644 --- a/ctdb/config/nfs-rpc-checks.d/20.nfsd.check +++ b/ctdb/config/nfs-rpc-checks.d/20.nfsd.check @@ -1,3 +1,2 @@ --ge 6 verbose unhealthy --eq 4 verbose restart --eq 2 restart:b +% 10 verbose restart:b unhealthy +-ge 2 verbose unhealthy diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.112.sh b/ctdb/tests/eventscripts/60.nfs.monitor.112.sh index c5c39b26e67..49ee3357498 100755 --- a/ctdb/tests/eventscripts/60.nfs.monitor.112.sh +++ b/ctdb/tests/eventscripts/60.nfs.monitor.112.sh @@ -9,7 +9,4 @@ define_test "knfsd down, 6 iterations" setup_nfs rpc_services_down "nfs" -iterate_test 6 'ok_null' \ - 2 'rpc_set_service_failure_response "nfsd"' \ - 4 'rpc_set_service_failure_response "nfsd"' \ - 6 'rpc_set_service_failure_response "nfsd"' +iterate_test 10 'rpc_set_service_failure_response "nfsd"' diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.113.sh 
b/ctdb/tests/eventscripts/60.nfs.monitor.113.sh index caa49892a03..505df1b5275 100755 --- a/ctdb/tests/eventscripts/60.nfs.monitor.113.sh +++ b/ctdb/tests/eventscripts/60.nfs.monitor.113.sh @@ -12,7 +12,4 @@ rpc_services_down "nfs" CTDB_NFS_DUMP_STUCK_THREADS=5 FAKE_NFSD_THREAD_PIDS="" -iterate_test 6 'ok_null' \ - 2 'rpc_set_service_failure_response "nfsd"' \ - 4 'rpc_set_service_failure_response "nfsd"' \ - 6 'rpc_set_service_failure_response "nfsd"' +iterate_test 10 'rpc_set_service_failure_response "nfsd"' diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.114.sh b/ctdb/tests/eventscripts/60.nfs.monitor.114.sh index 8279395cde8..496f5e7dee2 100755 --- a/ctdb/tests/eventscripts/60.nfs.monitor.114.sh +++ b/ctdb/tests/eventscripts/60.nfs.monitor.114.sh @@ -12,7 +12,4 @@ rpc_services_down "nfs" CTDB_NFS_DUMP_STUCK_THREADS=5 FAKE_NFSD_THREAD_PIDS="1001 1002 1003" -iterate_test 6 'ok_null' \ - 2 'rpc_set_service_failure_response "nfsd"' \ - 4 'rpc_set_service_failure_response "nfsd"' \ - 6 'rpc_set_service_failure_response "nfsd"' +iterate_test 10 'rpc_set_service_failure_response "nfsd"'