mirror of
https://github.com/samba-team/samba.git
synced 2025-01-03 01:18:10 +03:00
ctdb-scripts: Add service_stats_command variable to NFS checks
When monitoring an RPC service, the rpcinfo command might time out even though the service is making progress. In this case, it is just slow, so counting the timeout as a failure and potentially restarting the service will not help. The problem is determining if a service is making progress. Add a new NFS checks service_stats_command. This command is intended to run a statistics command. The output is naively compared using cmp(1). If the output changes then rpcinfo failures are converted to successes. Signed-off-by: Martin Schwenke <mschwenke@ddn.com> Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
parent
d86f9ff7fd
commit
5b7d17d44d
@ -80,6 +80,13 @@ nfs_check_services()
|
||||
# traces of threads that have not exited, since
|
||||
# they may be stuck doing I/O;
|
||||
# no default, see also function program_stack_traces()
|
||||
# * service_stats_cmd - command to retrieve statistics for given service;
|
||||
# if this is set and RPC checks fail (or
|
||||
# $service_check_cmd fails), then statistics are
|
||||
# compared (using cmp) to see if the service is
|
||||
# making progress or is truly hung;
|
||||
# no default, failed service does not double-check
|
||||
# failure using statistics
|
||||
#
|
||||
# Quoting in values is not preserved
|
||||
#
|
||||
@ -103,6 +110,7 @@ nfs_check_service()
|
||||
service_start_cmd=""
|
||||
service_check_cmd=""
|
||||
service_debug_cmd=""
|
||||
service_stats_cmd=""
|
||||
|
||||
# Eval line-by-line. Expands variable references in values.
|
||||
# Also allows variable name checking, which seems useful.
|
||||
@ -113,7 +121,8 @@ nfs_check_service()
|
||||
family=* | version=* | \
|
||||
unhealthy_after=* | restart_every=* | \
|
||||
service_stop_cmd=* | service_start_cmd=* | \
|
||||
service_check_cmd=* | service_debug_cmd=*)
|
||||
service_check_cmd=* | service_debug_cmd=* | \
|
||||
service_stats_cmd=*)
|
||||
|
||||
eval "$_line"
|
||||
;;
|
||||
@ -144,6 +153,30 @@ nfs_check_service()
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -n "$service_stats_cmd" ]; then
|
||||
# If configured, always update stats,
|
||||
# regardless of RPC status...
|
||||
|
||||
# shellcheck disable=SC2154
|
||||
# script_state_dir set by ctdb_setup_state_dir
|
||||
_curr="${script_state_dir}/stats_${_progname}.out"
|
||||
_prev="${_curr}.prev"
|
||||
|
||||
if [ -f "$_curr" ]; then
|
||||
mv -f "$_curr" "$_prev"
|
||||
fi
|
||||
eval "$service_stats_cmd" >"$_curr" 2>&1
|
||||
|
||||
if ! $_ok &&
|
||||
! cmp "$_prev" "$_curr" >/dev/null 2>&1; then
|
||||
# Stats always implicitly change on
|
||||
# the first monitor event, since
|
||||
# previous stats don't exists...
|
||||
echo "WARNING: statistics changed but ${_err}"
|
||||
_ok=true
|
||||
fi
|
||||
fi
|
||||
|
||||
if $_ok; then
|
||||
if [ $unhealthy_after -ne 1 ] ||
|
||||
[ $restart_every -ne 0 ]; then
|
||||
|
26
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
Executable file
26
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.115.sh
Executable file
@ -0,0 +1,26 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${TEST_SCRIPTS_DIR}/unit.sh"
|
||||
|
||||
define_test "NFS RPC service down, stats change, 10 iterations"
|
||||
|
||||
setup
|
||||
|
||||
cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
|
||||
# nfs
|
||||
version="3"
|
||||
restart_every=10
|
||||
unhealthy_after=2
|
||||
service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
|
||||
service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
|
||||
service_debug_cmd="program_stack_traces nfsd 5"
|
||||
# Dummy pipeline confirms that pipelines work in this context
|
||||
service_stats_cmd="date --rfc-3339=ns | grep ."
|
||||
EOF
|
||||
|
||||
# Test flag to indicate that stats are expected to change
|
||||
nfs_stats_set_changed "nfs" "status"
|
||||
|
||||
rpc_services_down "nfs"
|
||||
|
||||
nfs_iterate_test 10 "nfs"
|
26
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
Executable file
26
ctdb/tests/UNIT/eventscripts/60.nfs.monitor.116.sh
Executable file
@ -0,0 +1,26 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${TEST_SCRIPTS_DIR}/unit.sh"
|
||||
|
||||
define_test "NFS RPC service down, stats don't change, 10 iterations"
|
||||
|
||||
setup
|
||||
|
||||
cat >"${CTDB_BASE}/nfs-checks.d/20.nfs.check" <<EOF
|
||||
# nfs
|
||||
version="3"
|
||||
restart_every=10
|
||||
unhealthy_after=2
|
||||
service_stop_cmd="\$CTDB_NFS_CALLOUT stop nfs"
|
||||
service_start_cmd="\$CTDB_NFS_CALLOUT start nfs"
|
||||
service_debug_cmd="program_stack_traces nfsd 5"
|
||||
# Dummy pipeline confirms that pipelines work in this context
|
||||
service_stats_cmd="echo 'hello world' | grep ."
|
||||
EOF
|
||||
|
||||
# Test flag to indicate that stats are expected to change
|
||||
nfs_stats_set_changed "status"
|
||||
|
||||
rpc_services_down "nfs"
|
||||
|
||||
nfs_iterate_test 10 "nfs"
|
@ -128,6 +128,31 @@ nfs_setup_fake_threads()
|
||||
esac
|
||||
}
|
||||
|
||||
nfs_stats_set_changed()
|
||||
{
|
||||
FAKE_NFS_STATS_CHANGED=" $* "
|
||||
}
|
||||
|
||||
nfs_stats_check_changed()
|
||||
{
|
||||
_rpc_service="$1"
|
||||
_iteration="$2"
|
||||
|
||||
_t="$FAKE_NFS_STATS_CHANGED"
|
||||
if [ -z "$_t" ]; then
|
||||
return 1
|
||||
fi
|
||||
if [ "${_t#* "${_rpc_service}"}" != "$_t" ]; then
|
||||
return 0
|
||||
fi
|
||||
# Statistics always change on the first iteration
|
||||
if [ "$_iteration" -eq 1 ]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
guess_output()
|
||||
{
|
||||
case "$1" in
|
||||
@ -304,7 +329,12 @@ $_rpc_service failed RPC check:
|
||||
rpcinfo: RPC: Program not registered
|
||||
program $_rpc_service${_ver:+ version }${_ver} is not available"
|
||||
|
||||
if [ $unhealthy_after -gt 0 ] &&
|
||||
if [ "$_numfails" -eq -1 ]; then
|
||||
_unhealthy=false
|
||||
echo 0 >"$_rc_file"
|
||||
printf 'WARNING: statistics changed but %s\n' \
|
||||
"$_rpc_check_out" >>"$_out"
|
||||
elif [ $unhealthy_after -gt 0 ] &&
|
||||
[ "$_numfails" -ge $unhealthy_after ]; then
|
||||
_unhealthy=true
|
||||
echo 1 >"$_rc_file"
|
||||
@ -411,9 +441,17 @@ EOF
|
||||
fi
|
||||
if [ -n "$_rpc_service" ]; then
|
||||
if rpcinfo -T tcp localhost "$_rpc_service" \
|
||||
>/dev/null 2>&1 ; then
|
||||
>/dev/null 2>&1; then
|
||||
_iterate_failcount=0
|
||||
elif nfs_stats_check_changed \
|
||||
"$_rpc_service" "$_iteration"; then
|
||||
_iterate_failcount=-1
|
||||
else
|
||||
# -1 above is a special case of 0:
|
||||
# hack, unhack ;-)
|
||||
if [ $_iterate_failcount -eq -1 ]; then
|
||||
_iterate_failcount=0
|
||||
fi
|
||||
_iterate_failcount=$((_iterate_failcount + 1))
|
||||
fi
|
||||
rpc_set_service_failure_response \
|
||||
|
Loading…
Reference in New Issue
Block a user