mirror of
https://github.com/samba-team/samba.git
synced 2025-01-11 05:18:09 +03:00
ctdb-scripts: Enhancements to hung script debugging
* Add stack dumps for "interesting" processes. Some processes sometimes get stuck, so try to print stack traces for them if they appear in the pstree output. * Add new configuration variables CTDB_DEBUG_HUNG_SCRIPT_LOGFILE and CTDB_DEBUG_HUNG_SCRIPT_STACKPAT. These are primarily for testing but the latter may be useful for live debugging. * Load CTDB configuration so that the above configuration variables can be set/changed without restarting ctdbd. Add a test that tries to ensure that all of this is working. Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
parent
79e2725f33
commit
2532149f8f
@ -1,18 +1,48 @@
|
||||
#!/bin/sh

# Preamble of the hung-script debug helper: locate the CTDB installation,
# load the shared helpers and configuration, and optionally redirect all
# output to a log file.

# Default CTDB_BASE to the physical directory containing this script when
# the caller has not provided it.  The inner command substitution is
# quoted and the assignment is kept separate from "export" so that a path
# containing whitespace is not word-split, either by "cd" or by shells
# that field-split the arguments of "export".
if [ -z "$CTDB_BASE" ] ; then
    CTDB_BASE=$(cd -P "$(dirname "$0")" ; echo "$PWD")
    export CTDB_BASE
fi

# Pull in the shared CTDB shell helpers.
. "$CTDB_BASE/functions"

# Load the CTDB configuration on every run so that the
# CTDB_DEBUG_HUNG_SCRIPT_* variables can be set or changed without
# restarting ctdbd.
loadconfig ctdb

# Testing hook: when a log file is configured, append everything this
# script subsequently writes to stdout and stderr to that file.
if [ -n "$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" ] ; then
    exec >>"$CTDB_DEBUG_HUNG_SCRIPT_LOGFILE" 2>&1
fi
|
||||
|
||||
(
|
||||
flock --wait 2 9 || exit 1
|
||||
|
||||
echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" ====="
|
||||
|
||||
echo "pstree -p -a ${1}:"
|
||||
pstree -p -a $1
|
||||
out=$(pstree -p -a $1)
|
||||
echo "$out"
|
||||
|
||||
# Check for processes matching a regular expression and print
|
||||
# stack traces. This could help confirm that certain processes
|
||||
# are stuck in certain places such as the cluster filesystem. The
|
||||
# regexp should separate items with "\|" and should not contain
|
||||
# parentheses. The default pattern can be replaced for testing.
|
||||
default_pat='exportfs\|rpcinfo'
|
||||
pat="${CTDB_DEBUG_HUNG_SCRIPT_STACKPAT:-${default_pat}}"
|
||||
echo "$out" |
|
||||
sed -n "s@.*-\(.*${pat}.*\),\([0-9]*\).*@\2 \1@p" |
|
||||
while read pid name ; do
|
||||
trace=$(cat "/proc/${pid}/stack" 2>/dev/null)
|
||||
if [ $? -eq 0 ] ; then
|
||||
echo "---- Stack trace of interesting process ${pid}[${name}] ----"
|
||||
echo "$trace"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ "$2" = "init" ] ; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "ctdb scriptstatus ${2}:"
|
||||
echo "---- ctdb scriptstatus ${2}: ----"
|
||||
# No use running several of these in parallel if, say, "releaseip"
|
||||
# event hangs for multiple IPs. In that case the output would be
|
||||
# interleaved in the log and would just be confusing.
|
||||
|
@ -1374,6 +1374,36 @@ CTDB_SET_MonitorInterval=20
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term>CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=<parameter>FILENAME</parameter></term>
|
||||
<listitem>
|
||||
<para>
|
||||
FILENAME specifies where log messages should go when
|
||||
debugging hung eventscripts. This is a testing option.
|
||||
See also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>.
|
||||
</para>
|
||||
<para>
|
||||
No default. When unset, messages go to stdout/stderr and are logged
|
||||
to the same place as other CTDB log messages.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term>CTDB_DEBUG_HUNG_SCRIPT_STACKPAT=<parameter>REGEXP</parameter></term>
|
||||
<listitem>
|
||||
<para>
|
||||
REGEXP specifies interesting processes for which stack
|
||||
traces should be logged when debugging hung eventscripts
|
||||
and those processes are matched in pstree output. See
|
||||
also <citetitle>CTDB_DEBUG_HUNG_SCRIPT</citetitle>.
|
||||
</para>
|
||||
<para>
|
||||
Default is "exportfs\|rpcinfo".
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term>CTDB_DEBUG_LOCKS=<parameter>FILENAME</parameter></term>
|
||||
<listitem>
|
||||
|
91
ctdb/tests/complex/90_debug_hung_script.sh
Executable file
91
ctdb/tests/complex/90_debug_hung_script.sh
Executable file
@ -0,0 +1,91 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Print the human-readable description of this test (purpose,
# prerequisites and expected results) on stdout.
# NOTE(review): presumably invoked by the test harness sourced from
# integration.bash - confirm against the harness.
test_info()
|
||||
{
|
||||
cat <<EOF
|
||||
Verify CTDB's debugging of timed out eventscripts
|
||||
|
||||
Prerequisites:
|
||||
|
||||
* An active CTDB cluster with monitoring enabled
|
||||
|
||||
Expected results:
|
||||
|
||||
* When an eventscript times out the correct debugging is executed.
|
||||
EOF
|
||||
}
|
||||
|
||||
# Test setup: load the integration helper library, initialise the test,
# require a real and healthy cluster, and pick a node to test on.
# NOTE(review): the helper functions called below (ctdb_test_init,
# ctdb_test_check_real_cluster, cluster_is_healthy,
# select_test_node_and_ips) are presumably defined by integration.bash -
# confirm.
. "${TEST_SCRIPTS_DIR}/integration.bash"
|
||||
|
||||
# From here on, abort the test as soon as any command fails.
set -e
|
||||
|
||||
ctdb_test_init "$@"
|
||||
|
||||
ctdb_test_check_real_cluster
|
||||
|
||||
cluster_is_healthy
|
||||
|
||||
# No need for restart when done
|
||||
|
||||
# This is overkill but it at least provides a valid test node
|
||||
select_test_node_and_ips
|
||||
|
||||
####################
|
||||
|
||||
# Set this if CTDB is installed in a non-standard location on cluster
|
||||
# nodes
|
||||
# Falls back to /etc/ctdb when CTDB_BASE is unset or empty.
[ -n "$CTDB_BASE" ] || CTDB_BASE="/etc/ctdb"
|
||||
|
||||
####################
|
||||
|
||||
# Enable the 99.timeout eventscript on the test node.  The matching
# "disablescript" command is registered as an exit hook before the
# script is enabled.
# NOTE(review): presumably exit hooks run when the test finishes, so the
# node is restored even if the test fails - confirm in integration.bash.
echo "Enable eventscript for testing timeouts..."
|
||||
ctdb_test_exit_hook_add "onnode -q $test_node $CTDB disablescript 99.timeout"
|
||||
try_command_on_node $test_node $CTDB enablescript "99.timeout"
|
||||
|
||||
####################
|
||||
|
||||
# Install a temporary configuration snippet on the test node under
# ${CTDB_BASE}/rc.local.d and register exit hooks that remove both the
# snippet and the temporary debug log.
echo "Setting monitor events to time out..."
|
||||
rc_local_d="${CTDB_BASE}/rc.local.d"
|
||||
try_command_on_node $test_node mkdir -p "$rc_local_d"
|
||||
|
||||
# The snippet name carries this shell's PID ($$).
rc_local_f="${rc_local_d}/timeout_config.$$"
|
||||
ctdb_test_exit_hook_add "onnode $test_node rm -f $rc_local_f"
|
||||
|
||||
# Create a temporary file on the test node to receive the debug output.
# NOTE(review): $out is presumably set by try_command_on_node to the
# output of the remote command (here, the path printed by mktemp) -
# confirm.
try_command_on_node $test_node mktemp
|
||||
debug_output="$out"
|
||||
ctdb_test_exit_hook_add "onnode $test_node rm -f $debug_output"
|
||||
|
||||
# Write the snippet via tee on the test node.  It points
# CTDB_DEBUG_HUNG_SCRIPT_LOGFILE at the temporary file and sets
# CTDB_DEBUG_HUNG_SCRIPT_STACKPAT to the documented default
# ("exportfs\|rpcinfo") plus "sleep".
# NOTE(review): CTDB_RUN_TIMEOUT_MONITOR=yes presumably makes the
# 99.timeout script hang in its monitor event - confirm in that script.
try_command_on_node -i $test_node tee "$rc_local_f" <<<"\
|
||||
CTDB_RUN_TIMEOUT_MONITOR=yes
|
||||
CTDB_DEBUG_HUNG_SCRIPT_LOGFILE=\"$debug_output\"
|
||||
CTDB_DEBUG_HUNG_SCRIPT_STACKPAT='exportfs\|rpcinfo\|sleep'"
|
||||
|
||||
try_command_on_node $test_node chmod +x "$rc_local_f"
|
||||
|
||||
####################
|
||||
|
||||
# Wait for a monitor event on the test node, then fetch the debug log
# and check it against the expected patterns.
wait_for_monitor_event $test_node
|
||||
|
||||
echo "Checking output of hung script debugging..."
|
||||
try_command_on_node -v $test_node cat "$debug_output"
|
||||
|
||||
# Each line of the here-document below is a grep regular expression that
# must match one whole line of $out (it is anchored with ^ and $).  The
# test exits with status 1 at the first pattern that does not match.
# "read" is deliberately used without -r: the here-document delimiter is
# quoted, so the doubled backslashes are literal there and "read"
# collapses each pair into the single backslash that grep needs.
# NOTE(review): the 99.timeout pattern hardcodes /etc/ctdb even though
# CTDB_BASE may be set to a non-standard location earlier in this
# script; it looks like the check would fail in that case - confirm.
while IFS="" read pattern ; do
|
||||
if grep -- "^${pattern}\$" <<<"$out" >/dev/null ; then
|
||||
echo "GOOD: output contains \"$pattern\""
|
||||
else
|
||||
echo "BAD: output does not contain \"$pattern\""
|
||||
exit 1
|
||||
fi
|
||||
done <<'EOF'
|
||||
===== Start of hung script debug for PID=".*", event="monitor" =====
|
||||
===== End of hung script debug for PID=".*", event="monitor" =====
|
||||
pstree -p -a .*:
|
||||
*\`-99\\.timeout,.* /etc/ctdb/events.d/99.timeout monitor
|
||||
*\`-sleep,.*
|
||||
---- Stack trace of interesting process [0-9]*\\[sleep\\] ----
|
||||
[<[0-9a-f]*>] .*sleep+.*
|
||||
---- ctdb scriptstatus monitor: ----
|
||||
[0-9]* scripts were executed last monitor cycle
|
||||
99\\.timeout *Status:TIMEDOUT.*
|
||||
*OUTPUT:sleeping for [0-9]* seconds\\.\\.\\.
|
||||
EOF
|
Loading…
Reference in New Issue
Block a user