diff --git a/ctdb/config/debug-hung-script.sh b/ctdb/config/debug-hung-script.sh index 19842424188..63d695f01b3 100755 --- a/ctdb/config/debug-hung-script.sh +++ b/ctdb/config/debug-hung-script.sh @@ -1,18 +1,48 @@ #!/bin/sh +[ -n "$CTDB_BASE" ] || \ + export CTDB_BASE=$(cd -P $(dirname "$0") ; echo "$PWD") + +. "$CTDB_BASE/functions" + +loadconfig ctdb + +# Testing hook +if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then + exec >>"$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" 2>&1 +fi + ( flock --wait 2 9 || exit 1 echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" =====" echo "pstree -p -a ${1}:" - pstree -p -a $1 + out=$(pstree -p -a $1) + echo "$out" + + # Check for processes matching a regular expression and print + # stack traces. This could help confirm that certain processes + # are stuck in certain places such as the cluster filesystem. The + # regexp should separate items with "\|" and should not contain + # parentheses. The default pattern can be replaced for testing. + default_pat='exportfs\|rpcinfo' + pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}" + echo "$out" | + sed -n "s@.*-\(.*${pat}.*\),\([0-9]*\).*@\2 \1@p" | + while read pid name ; do + trace=$(cat "/proc/${pid}/stack" 2>/dev/null) + if [ $? -eq 0 ] ; then + echo "---- Stack trace of interesting process ${pid}[${name}] ----" + echo "$trace" + fi + done if [ "$2" = "init" ] ; then exit 0 fi - echo "ctdb scriptstatus ${2}:" + echo "---- ctdb scriptstatus ${2}: ----" # No use running several of these in parallel if, say, "releaseip" # event hangs for multiple IPs. In that case the output would be # interleaved in the log and would just be confusing. 
diff --git a/ctdb/doc/ctdbd.conf.5.xml b/ctdb/doc/ctdbd.conf.5.xml index a1f6db5ef61..37b1cf94cbb 100644 --- a/ctdb/doc/ctdbd.conf.5.xml +++ b/ctdb/doc/ctdbd.conf.5.xml @@ -1374,6 +1374,36 @@ CTDB_SET_MonitorInterval=20 + + CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=FILENAME + + + FILENAME specifies where log messages should go when + debugging hung eventscripts. This is a testing option. + See also CTDB_DEBUG_HUNG_SCRIPT. + + + No default. Messages go to stdout/stderr and are logged + to the same place as other CTDB log messages. + + + + + + CTDB_DEBUG_HUNG_SCRIPT_STACKPAT=REGEXP + + + REGEXP specifies interesting processes for which stack + traces should be logged when debugging hung eventscripts + and those processes are matched in pstree output. See + also CTDB_DEBUG_HUNG_SCRIPT. + + + Default is "exportfs\|rpcinfo". + + + + CTDB_DEBUG_LOCKS=FILENAME diff --git a/ctdb/tests/complex/90_debug_hung_script.sh b/ctdb/tests/complex/90_debug_hung_script.sh new file mode 100755 index 00000000000..ef6216cf94c --- /dev/null +++ b/ctdb/tests/complex/90_debug_hung_script.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +test_info() +{ + cat </dev/null ; then + echo "GOOD: output contains \"$pattern\"" + else + echo "BAD: output does not contain \"$pattern\"" + exit 1 + fi +done <<'EOF' +===== Start of hung script debug for PID=".*", event="monitor" ===== +===== End of hung script debug for PID=".*", event="monitor" ===== +pstree -p -a .*: + *\`-99\\.timeout,.* /etc/ctdb/events.d/99.timeout monitor + *\`-sleep,.* +---- Stack trace of interesting process [0-9]*\\[sleep\\] ---- +[<[0-9a-f]*>] .*sleep+.* +---- ctdb scriptstatus monitor: ---- +[0-9]* scripts were executed last monitor cycle +99\\.timeout *Status:TIMEDOUT.* + *OUTPUT:sleeping for [0-9]* seconds\\.\\.\\. +EOF