diff --git a/ctdb/config/debug-hung-script.sh b/ctdb/config/debug-hung-script.sh
index 19842424188..63d695f01b3 100755
--- a/ctdb/config/debug-hung-script.sh
+++ b/ctdb/config/debug-hung-script.sh
@@ -1,18 +1,48 @@
#!/bin/sh
+[ -n "$CTDB_BASE" ] || \
+ export CTDB_BASE=$(cd -P $(dirname "$0") ; echo "$PWD")
+
+. "$CTDB_BASE/functions"
+
+loadconfig ctdb
+
+# Testing hook
+if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then
+ exec >>"$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" 2>&1
+fi
+
(
flock --wait 2 9 || exit 1
echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" ====="
echo "pstree -p -a ${1}:"
- pstree -p -a $1
+ out=$(pstree -p -a $1)
+ echo "$out"
+
+ # Check for processes matching a regular expression and print
+ # stack traces. This could help confirm that certain processes
+ # are stuck in certain places such as the cluster filesystem. The
+ # regexp should separate items with "\|" and should not contain
+ # parentheses. The default pattern can be replaced for testing.
+ default_pat='exportfs\|rpcinfo'
+ pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}"
+ echo "$out" |
+ sed -n "s@.*-\(.*${pat}.*\),\([0-9]*\).*@\2 \1@p" |
+ while read pid name ; do
+ trace=$(cat "/proc/${pid}/stack" 2>/dev/null)
+ if [ $? -eq 0 ] ; then
+ echo "---- Stack trace of interesting process ${pid}[${name}] ----"
+ echo "$trace"
+ fi
+ done
if [ "$2" = "init" ] ; then
exit 0
fi
- echo "ctdb scriptstatus ${2}:"
+ echo "---- ctdb scriptstatus ${2}: ----"
# No use running several of these in parallel if, say, "releaseip"
# event hangs for multiple IPs. In that case the output would be
# interleaved in the log and would just be confusing.
diff --git a/ctdb/doc/ctdbd.conf.5.xml b/ctdb/doc/ctdbd.conf.5.xml
index a1f6db5ef61..37b1cf94cbb 100644
--- a/ctdb/doc/ctdbd.conf.5.xml
+++ b/ctdb/doc/ctdbd.conf.5.xml
@@ -1374,6 +1374,36 @@ CTDB_SET_MonitorInterval=20
+
+ CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=FILENAME
+
+
+ FILENAME specifies where log messages should go when
+ debugging hung eventscripts. This is a testing option.
+ See also CTDB_DEBUG_HUNG_SCRIPT.
+
+
+ No default. Messages go to stdout/stderr and are logged
+ to the same place as other CTDB log messages.
+
+
+
+
+
+ CTDB_DEBUG_HUNG_SCRIPT_STACKPAT=REGEXP
+
+
+ REGEXP specifies interesting processes for which stack
+ traces should be logged when debugging hung eventscripts,
+ provided those processes are matched in pstree output. See
+ also CTDB_DEBUG_HUNG_SCRIPT.
+
+
+ Default is "exportfs\|rpcinfo".
+
+
+
+
CTDB_DEBUG_LOCKS=FILENAME
diff --git a/ctdb/tests/complex/90_debug_hung_script.sh b/ctdb/tests/complex/90_debug_hung_script.sh
new file mode 100755
index 00000000000..ef6216cf94c
--- /dev/null
+++ b/ctdb/tests/complex/90_debug_hung_script.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+test_info()
+{
+ cat </dev/null ; then
+ echo "GOOD: output contains \"$pattern\""
+ else
+ echo "BAD: output does not contain \"$pattern\""
+ exit 1
+ fi
+done <<'EOF'
+===== Start of hung script debug for PID=".*", event="monitor" =====
+===== End of hung script debug for PID=".*", event="monitor" =====
+pstree -p -a .*:
+ *\`-99\\.timeout,.* /etc/ctdb/events.d/99.timeout monitor
+ *\`-sleep,.*
+---- Stack trace of interesting process [0-9]*\\[sleep\\] ----
+[<[0-9a-f]*>] .*sleep+.*
+---- ctdb scriptstatus monitor: ----
+[0-9]* scripts were executed last monitor cycle
+99\\.timeout *Status:TIMEDOUT.*
+ *OUTPUT:sleeping for [0-9]* seconds\\.\\.\\.
+EOF