#!/bin/sh
# a script to test the basic setup of a CTDB/Samba install
# tridge@samba.org September 2007
# martin@meltin.net August 2010
# Print the command-line synopsis on stderr and terminate the script
# with exit status 1.  This function never returns to its caller.
usage() {
    cat 1>&2 <<'USAGE_TEXT'
Usage: ctdb_diagnostics [OPTION] ...
options:
-n <nodes> Comma separated list of nodes to operate on
-c Ignore comment lines (starting with '#') in file comparisons
-w Ignore whitespace in file comparisons
--no-ads Do not use commands that assume an Active Directory Server
USAGE_TEXT
    exit 1
}
# Default settings.  parse_options may override nodes, diff_opts and
# no_ads from the command line.

# Nodes to diagnose: field 2 of each '|'-separated line printed by
# "ctdb listnodes -X" (presumably the node numbers).
nodes=$(ctdb listnodes -X | cut -d'|' -f2)

# Comma-separated list of nodes that could not be contacted
bad_nodes=""

# Extra options handed to diff when files are compared between nodes
diff_opts=

# When true, the "net ads ..." commands are skipped
no_ads=false
# Parse the command line and update the globals nodes, diff_opts and
# no_ads accordingly.
#   $@ - the script's command-line arguments
# Calls usage (which exits with status 1) for -h/--help, for anything
# getopt rejects and for leftover non-option arguments.
parse_options ()
{
    temp=$(getopt -n "ctdb_diagnostics" -o "n:cwh" -l no-ads,help -- "$@")
    # No! Checking the exit code afterwards is actually clearer...
    # Nothing may run between the assignment above and this check,
    # otherwise $? is no longer getopt's exit status.
    # shellcheck disable=SC2181
    [ $? -eq 0 ] || usage

    # Reload the positional parameters from getopt's normalised output
    eval set -- "$temp"

    while true ; do
        case "$1" in
        -n) nodes=$(echo "$2" | sed -e 's@,@ @g') ; shift 2 ;;
        -c) diff_opts="${diff_opts} -I ^#.*" ; shift ;;
        -w) diff_opts="${diff_opts} -w" ; shift ;;
        --no-ads) no_ads=true ; shift ;;
        --) shift ; break ;;
        -h|--help|*) usage ;;
        esac
    done

    # "if" instead of "&&" so that the function returns 0 when there
    # are no leftover arguments
    if [ $# -ne 0 ] ; then
        usage
    fi
}
parse_options "$@"

# Use 5s ssh timeout if EXTRA_SSH_OPTS doesn't set a timeout.
case "$EXTRA_SSH_OPTS" in
*ConnectTimeout=*) : ;;
*)
    export EXTRA_SSH_OPTS="${EXTRA_SSH_OPTS} -o ConnectTimeout=5"
esac

# Filter nodes.  Remove any nodes we can't contact from $nodes and add
# them to $bad_nodes.
_nodes=""
for _i in $nodes ; do
    # A node counts as contactable when "true" can be run on it
    if onnode "$_i" true >/dev/null 2>&1 ; then
        _nodes="${_nodes}${_nodes:+ }${_i}"
    else
        bad_nodes="${bad_nodes}${bad_nodes:+,}${_i}"
    fi
done
nodes="$_nodes"

# Same list with the separating whitespace turned into commas, the
# form in which it is handed to onnode and printed in messages
nodes_comma=$(echo "$nodes" | sed -e 's@[[:space:]]@,@g')
PATH="$PATH:/sbin:/usr/sbin:/usr/lpp/mmfs/bin"

# Directory holding the ctdb and nfs settings files: /etc/sysconfig
# where that directory exists, /etc/default otherwise
if [ -d /etc/sysconfig ] ; then
    _sysconfig_dir="/etc/sysconfig"
else
    _sysconfig_dir="/etc/default"
fi

# list of config files that must exist and that we check are the same
# on the nodes
CONFIG_FILES_MUST="/etc/krb5.conf /etc/hosts /usr/local/etc/ctdb/nodes ${_sysconfig_dir}/ctdb /etc/resolv.conf /etc/nsswitch.conf /etc/sysctl.conf /etc/samba/smb.conf /etc/fstab /etc/multipath.conf /etc/pam.d/system-auth ${_sysconfig_dir}/nfs /etc/exports /etc/vsftpd/vsftpd.conf"

# list of config files that may exist and should be checked that they
# are the same on the nodes
CONFIG_FILES_MAY="/usr/local/etc/ctdb/public_addresses /usr/local/etc/ctdb/static-routes"
# From here on stderr goes wherever stdout goes, so error output ends
# up in the same report as everything else.
exec 2>&1

cat <<EOF
--------------------------------------------------------------------
ctdb_diagnostics starting. This script will gather information about
your ctdb cluster. You should send the output of this script along
with any ctdb or clustered Samba bug reports.
--------------------------------------------------------------------
EOF
date
# Record a problem found by the diagnostics.
#   $1 - message text
# Prints "ERROR: <message>" on stdout, increments the global
# NUM_ERRORS and appends a numbered copy of the message to the file
# named by $ERRORS.
error() {
    msg="$1"
    echo "ERROR: $msg"
    NUM_ERRORS=$((NUM_ERRORS + 1))
    echo " ERROR[$NUM_ERRORS]: $msg" >> "$ERRORS"
}
# Print a local file framed by separator lines.
#   $1 - path of the file
# Output: a separator, the file name, the "ls -l" details, the file
# contents with each line prefixed by a space, and a closing
# separator.  Error messages from ls and sed are part of the output.
show_file() {
    fname="$1"
    _fdetails=$(ls -l "$fname" 2>&1)
    echo " ================================"
    echo " File: $fname"
    echo " $_fdetails"
    sed 's/^/ /' "$fname" 2>&1
    echo " ================================"
}
# Hand a shell command to onnode for the nodes listed in $nodes_comma.
#   $@ - the command; all arguments are joined with single spaces into
#        one command line, so show_all "ls -l /tmp" and
#        show_all ls -l /tmp are equivalent
# The command line sent runs hostname and date first, then the command
# itself with its stderr merged and every output line prefixed by a
# space.
show_all() {
    _cmd="$*"
    echo "running $_cmd on nodes $nodes_comma"
    onnode "$nodes_comma" "hostname; date; $_cmd 2>&1 | sed 's/^/ /'" 2>&1
}
# Show each file as found on the first node in $nodes and compare the
# copies on the remaining nodes against it.
#   $1  - printf format used when a file is not readable on the first
#         node; it receives the file name and the node as arguments
#   $2+ - files to check
# Uses the globals nodes, tmpdir and diff_opts.  Problems are reported
# through error; a unified diff is printed for every differing copy.
show_and_compare_files () {
    fmt="$1" ; shift

    for f ; do
        _bf=$(basename "$f")
        first=true

        for n in $nodes ; do
            if $first ; then
                onnode "$n" [ -r "$f" ] || {
                    # This function takes a format string
                    # shellcheck disable=SC2059
                    msg=$(printf "$fmt" "$f" "$n")
                    error "$msg"
                    # No reference copy, so move on to the next file
                    continue 2;
                }

                # Reference copy that the other nodes are compared with
                fstf="${tmpdir}/${_bf}.node${n}"
                onnode "$n" cat "$f" >"$fstf" 2>&1

                _fdetails=$(onnode "$n" ls -l "$f" 2>&1)
                echo " ================================"
                echo " File (on node $n): $f"
                echo " $_fdetails"
                sed 's/^/ /' "$fstf"
                echo " ================================"
                first=false
            else
                echo "Testing for same config file $f on node $n"
                tmpf="${tmpdir}/${_bf}.node${n}"
                onnode "$n" cat "$f" >"$tmpf" 2>&1
                # Intentional multi-word splitting on diff_opts
                # shellcheck disable=SC2086
                diff $diff_opts "$fstf" "$tmpf" >/dev/null 2>&1 || {
                    error "File $f is different on node $n"
                    # shellcheck disable=SC2086
                    diff -u $diff_opts "$fstf" "$tmpf"
                }
                rm -f "$tmpf"
            fi
        done

        rm -f "$fstf"
    done
}
# Scratch directory for the per-node file copies and the error log
if ! tmpdir=$(mktemp -d) ; then
    echo "Unable to create a temporary directory"
    exit 1
fi
# Remove the scratch directory when the script exits, including on
# early exits that never reach the cleanup at the end
trap 'rm -rf "$tmpdir"' EXIT

# File in which error collects its numbered messages
ERRORS="${tmpdir}/diag_err"
# Number of errors reported so far
NUM_ERRORS=0
# Report which nodes are examined and which had to be skipped
cat <<EOF
Diagnosis started on these nodes:
$nodes_comma
EOF

if [ -n "$bad_nodes" ] ; then
    cat <<EOF
NOT RUNNING DIAGNOSTICS on these uncontactable nodes:
$bad_nodes
EOF
fi

cat <<EOF
For reference, here is the nodes file on the current node...
EOF

show_file /usr/local/etc/ctdb/nodes

cat <<EOF
--------------------------------------------------------------------
Comparing critical config files on nodes $nodes_comma
EOF
2016-07-06 17:31:51 +10:00
# Intentional multi-word splitting on CONFIG_FILES_MUST
# shellcheck disable=SC2086
2010-08-06 11:10:56 +10:00
show_and_compare_files \
"%s is missing on node %d" \
$CONFIG_FILES_MUST
2007-03-10 13:45:38 +11:00
2016-07-06 17:31:51 +10:00
# Intentional multi-word splitting on CONFIG_FILES_MAY
# shellcheck disable=SC2086
2010-08-06 11:10:56 +10:00
show_and_compare_files \
"Optional file %s is not present on node %d" \
$CONFIG_FILES_MAY
cat <<EOF
--------------------------------------------------------------------
Checking for clock drift
EOF
for i in $nodes; do
    # Sample the local clock for each node, so that time spent
    # contacting earlier nodes is not counted as drift
    t=$(date +%s)
    t2=$(onnode "$i" date +%s)
    # Without a plain number from the node there is nothing to compare
    case "$t2" in
    ''|*[!0-9]*)
        error "unable to read the time on node $i"
        continue
        ;;
    esac
    d=$((t2 - t))
    # More than 30 seconds either way counts as drift
    if [ "$d" -gt 30 ] || [ "$d" -lt -30 ]; then
        error "time on node $i differs by $d seconds"
    fi
done
cat <<EOF
--------------------------------------------------------------------
Showing software versions
EOF
show_all "uname -a"
# Package queries are chosen by which package tool exists on this node
[ -x /bin/rpm ] && {
    # grep -E replaces egrep, which GNU grep has deprecated
    show_all "rpm -qa | grep -E 'samba|ctdb|gpfs'"
}
[ -x /usr/bin/dpkg-query ] && {
    show_all "/usr/bin/dpkg-query --show 'ctdb'"
    show_all "/usr/bin/dpkg-query --show 'samba'"
    #show_all "/usr/bin/dpkg-query --show 'gpfs'"
}
cat <<EOF
--------------------------------------------------------------------
Showing ctdb status and recent log entries
EOF
show_all "ctdb status; ctdb ip"
show_all "ctdb statistics"
show_all "ctdb uptime"
show_all "ctdb listvars"
show_all "ctdb getdbmap"
# Run "ctdb dbstatistics" once for every value in field 3 of the
# machine-readable database map, skipping its first line
show_all "ctdb -X getdbmap | awk -F'|' 'NR > 1 {print \$3}' | sort | xargs -n 1 ctdb dbstatistics"

echo "Showing log.ctdb"
show_all "test -f /usr/local/var/log/log.ctdb && tail -100 /usr/local/var/log/log.ctdb"

show_all "tail -200 /var/log/messages"

show_all "ls -lRs /usr/local/var/lib/ctdb"
show_all "ls -lRs /usr/local/etc/ctdb"
cat <<EOF
--------------------------------------------------------------------
Showing system and process status
EOF
show_all "df"
show_all "df -i"
show_all "mount"
show_all "w"
show_all "ps axfwu"
show_all "dmesg"
show_all "/sbin/lspci"
show_all "dmidecode"
show_all "cat /proc/partitions"
show_all "cat /proc/cpuinfo"
show_all "cat /proc/scsi/scsi"
show_all "/sbin/ifconfig -a"
show_all "cat /proc/net/dev"
show_all "/sbin/ip addr list"
show_all "/sbin/route -n"
show_all "ss -s"
show_all "free"
show_all "crontab -l"
show_all "sysctl -a"
show_all "iptables -L -n"
show_all "iptables -L -n -t nat"
show_all "/usr/sbin/rpcinfo -p"
show_all "/usr/sbin/showmount -a"
show_all "/usr/sbin/showmount -e"
show_all "/usr/sbin/nfsstat -v"
# The remaining commands run only if the tool or directory exists on
# this node
[ -x /sbin/multipath ] && {
    show_all "/sbin/multipath -ll"
}
[ -x /sbin/chkconfig ] && {
    show_all "/sbin/chkconfig --list"
}
[ -x /usr/sbin/getenforce ] && {
    show_all "/usr/sbin/getenforce"
}
[ -d /proc/net/bonding ] && {
    for f in /proc/net/bonding/*; do
        # An empty directory leaves the pattern unexpanded; skip it
        [ -e "$f" ] || continue
        show_all "cat $f"
    done
}
cat <<EOF
--------------------------------------------------------------------
Showing Samba status
EOF
show_all "smbstatus -n -B"
if $no_ads ; then
    echo
    echo "Skipping \"net ads testjoin\" as requested"
    echo
else
    show_all "net ads testjoin"
fi
show_all "net conf list"
show_all "lsof -n | grep smbd"
show_all "lsof -n | grep ctdbd"
show_all "netstat -tan"
if $no_ads ; then
    echo
    echo "Skipping \"net ads info\" as requested"
    echo
else
    show_all "net ads info"
fi
show_all "date"
show_all "smbclient -U% -L 127.0.0.1"
WORKGROUP=$(testparm -s --parameter-name=WORKGROUP 2> /dev/null)
# show_all takes the command as one string; the account name is quoted
# for the shell that finally runs it
show_all "id '$WORKGROUP/Administrator'"
show_all "wbinfo -p"
show_all "wbinfo --online-status"
show_all "smbd -b"
date
echo "Diagnostics finished with $NUM_ERRORS errors"
# Repeat the numbered error messages collected during the run
[ -r "$ERRORS" ] && {
    cat "$ERRORS"
    rm -f "$ERRORS"
}
rm -rf "$tmpdir"

# The exit status is the error count.  Only the low 8 bits of an exit
# status reach the parent, so cap it: 256 errors must not look like
# success.
if [ "${NUM_ERRORS:-0}" -gt 255 ] ; then
    exit 255
fi
exit "${NUM_ERRORS:-0}"