#!/bin/bash

# Run commands on CTDB nodes.

# See http://ctdb.samba.org/ for more information about CTDB.

# Copyright (C) Martin Schwenke 2008

# Based on an earlier script by Andrew Tridgell and Ronnie Sahlberg.
# Copyright (C) Andrew Tridgell 2007

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
# Name this script was invoked as; used as a prefix in diagnostics.
# "$0" is quoted so that an install path containing whitespace still works.
prog=$(basename -- "$0")
# Print the help text on stderr and terminate the script.
# Always exits with status 1, so it never returns to the caller.
usage ()
{
    cat >&2 <<EOF
Usage: onnode [OPTION] ... <NODES> <COMMAND> ...
  options:
    -c          Run in current working directory on specified nodes.
    -o <prefix> Save standard output from each node to file <prefix>.<ip>
    -p          Run command in parallel on specified nodes.
    -q          Do not print node addresses (overrides -v).
    -n          Allow nodes to be specified by name.
    -f          Specify nodes file, overrides CTDB_NODES_FILE.
    -v          Print node address even for a single node.
    -P          Push given files to nodes instead of running commands.
  <NODES>       "all", "any", "ok" (or "healthy"), "con" (or "connected"),
                "rm" (or "recmaster"), "lvs" (or "lvsmaster"),
                "natgw" (or "natgwlist"); or
                a node number (0 base); or
                a hostname (if -n is specified); or
                list (comma separated) of <NODES>; or
                range (hyphen separated) of node numbers.
EOF
    exit 1
}
# Report a malformed node specification on stderr (message followed by
# a blank line) and then hand over to usage(), which prints the help
# text and exits.
invalid_nodespec ()
{
    printf '%s\n\n' "Invalid <nodespec>" >&2
    usage
}
# Defaults.  The boolean settings hold the command names "true"/"false"
# so that they can be executed directly, e.g. "if $parallel ; then".
current=false    # -c: run the command in the current working directory
parallel=false   # -p: run on all nodes in parallel
verbose=false    # -v: print node addresses
quiet=false      # -q: never print node addresses
prefix=""        # -o: prefix of per-node stdout files
names_ok=false   # -n: allow nodes to be given by name
push=false       # -P: push files instead of running a command

# Directory holding the CTDB configuration (nodes file, onnode.conf).
ctdb_base="${CTDB_BASE:-/etc/ctdb}"

# Parse onnode's own command line.
#
# Arguments: the script's positional parameters.
# Globals written: current, CTDB_NODES_FILE, names_ok, prefix, parallel,
#   quiet, verbose, push, nodespec (first non-option argument) and
#   command (all remaining arguments joined by spaces).
# Calls usage (which exits) on a bad option, on -h/--help, or when
# fewer than two non-option arguments are given.
parse_options ()
{
    # $POSIXLY_CORRECT means that the command passed to onnode can
    # take options and getopt won't reorder things to make them
    # options to onnode.
    local temp
    # Not on the previous line - local returns 0!
    temp=$(POSIXLY_CORRECT=1 getopt -n "$prog" -o "cf:hno:pqvP" -l help -- "$@") || usage

    # getopt emits a shell-quoted argument list; re-load it as "$@".
    eval set -- "$temp"

    while true ; do
        case "$1" in
            -c) current=true ; shift ;;
            -f) CTDB_NODES_FILE="$2" ; shift 2 ;;
            -n) names_ok=true ; shift ;;
            -o) prefix="$2" ; shift 2 ;;
            -p) parallel=true ; shift ;;
            -q) quiet=true ; shift ;;
            -v) verbose=true ; shift ;;
            -P) push=true ; shift ;;
            --) shift ; break ;;
            -h|--help|*) usage ;; # Shouldn't happen, so this is reasonable.
        esac
    done

    [ $# -ge 2 ] || usage

    nodespec="$1" ; shift
    # "$*" (not "$@"): the command is deliberately flattened to one string.
    command="$*"
}
# Print the Nth (0-based) word of a list.
#
# Arguments: $1 - index N; remaining arguments - the list (node addresses).
# Output:    the selected word on stdout.
# Exits with status 1 and a message on stderr if N is not a valid index
# or if the selected entry is empty or the "#DEAD" placeholder.
echo_nth ()
{
    local n="$1" ; shift

    local node=""
    case "$n" in
        ''|*[!0-9]*)
            : # Not a non-negative integer, so there is no such node.
            ;;
        *)
            # Only shift when N is in range.  When the count exceeds $#
            # bash's shift fails and leaves the arguments untouched,
            # which would make $1 the *first* node rather than nothing.
            if [ "$n" -lt $# ] 2>/dev/null ; then
                shift "$n"
                node="$1"
            fi
            ;;
    esac

    if [ -n "$node" ] && [ "$node" != "#DEAD" ] ; then
        printf '%s\n' "$node"
    else
        echo "${prog}: \"node ${n}\" does not exist" >&2
        exit 1
    fi
}
# Expand a comma-separated node specification into one item per line.
#
# Arguments: $1 - the nodespec.
# Output:    ranges "A-B" are expanded with seq; keywords and node
#            numbers are echoed unchanged; anything else is echoed only
#            when $names_ok is true.
# Calls invalid_nodespec (which exits the enclosing subshell, giving a
# non-zero return status) for a bad range or a disallowed name.
parse_nodespec ()
{
    # Subshell avoids hacks to restore $IFS.
    (
        IFS=","
        set -f   # items are split on commas only, never glob-expanded
        for i in $1 ; do
            case "$i" in
                *-*) seq "${i%-*}" "${i#*-}" 2>/dev/null || invalid_nodespec ;;
                # Separate lines for readability.
                all|any|ok|healthy|con|connected) echo "$i" ;;
                rm|recmaster|lvs|lvsmaster|natgw|natgwlist) echo "$i" ;;
                *)
                    # A non-numeric item makes the test fail, so it is
                    # accepted only when names are allowed.
                    [ "$i" -gt -1 ] 2>/dev/null || $names_ok || invalid_nodespec
                    echo "$i"
                    ;;
            esac
        done
    )
}
ctdb_status_output="" # cache

# Print the address of every node whose "ctdb -Y status" flags match.
#
# Arguments: $1 - whitespace-separated list of all nodes, indexed by PNN
#            $2 - "healthy" (no flag field is 1) or "connected" (first
#                 flag field is 0)
# Globals:   ctdb_status_output caches the status output, minus its
#            first (header) line, between calls.
# Exits with status 1 if ctdb fails; any other $2 goes to
# invalid_nodespec.
get_nodes_with_status ()
{
    local all_nodes="$1"
    local status="$2"

    if [ -z "$ctdb_status_output" ] ; then
        if ! ctdb_status_output=$(ctdb -Y status 2>&1) ; then
            echo "${prog}: unable to get status of CTDB nodes" >&2
            echo "$ctdb_status_output" >&2
            exit 1
        fi
        # Drop the first line of the output.
        local nl=$'\n'
        ctdb_status_output="${ctdb_status_output#*${nl}}"
    fi

    # Subshell so that the $IFS and globbing changes stay local.
    (
        local line
        IFS="${IFS}:"
        set -f
        while IFS="" read -r line ; do

            set -- $line # split line on colons
            shift        # line starts with : so 1st field is empty
            local pnn="$1" ; shift
            shift        # skip the IP address field
            # The remaining positional parameters are the flag fields.

            case "$status" in
                healthy)
                    # If any bit is 1, don't match this address.
                    local s
                    for s ; do
                        [ "$s" != "1" ] || continue 2
                    done
                    ;;
                connected)
                    # If disconnected bit is not 0, don't match this address.
                    [ "$1" = "0" ] || continue
                    ;;
                *)
                    invalid_nodespec
                    ;;
            esac

            echo_nth "$pnn" $all_nodes
        done <<<"$ctdb_status_output"
    )
}
ctdb_props="" # cache

# Print the address of the node that currently holds a cluster role.
#
# Arguments: $1 - whitespace-separated list of all nodes, indexed by PNN
#            $2 - ctdb sub-command naming the role: "recmaster",
#                 "lvsmaster" or "natgwlist"
# Globals:   ctdb_props caches results as space-separated
#            ":<prop>:<pnn>" entries.
# Exits with status 1 if no node has the property.
get_node_with_property ()
{
    local all_nodes="$1"
    local prop="$2"

    local prop_node=""
    # Strip everything up to and including ":<prop>:".  The leading "*"
    # lets this find an entry anywhere in the cache, not only the first.
    # If nothing was stripped the property is not cached.
    local cached="${ctdb_props##*:${prop}:}"
    if [ "$cached" = "$ctdb_props" ] ; then
        # Not in cache.
        if prop_node=$(ctdb "$prop" -Y 2>/dev/null) ; then
            if [ "$prop" = "natgwlist" ] ; then
                prop_node="${prop_node%% *}" # 1st word
                if [ "$prop_node" = "-1" ] ; then
                    # This works around natgwlist returning 0 even
                    # when there's no natgw.
                    prop_node=""
                fi
            else
                # We only want the first line.
                local nl=$'\n'
                prop_node="${prop_node%%${nl}*}"
            fi
        else
            prop_node=""
        fi

        if [ -n "$prop_node" ] ; then
            # Add to cache.
            ctdb_props="${ctdb_props}${ctdb_props:+ }:${prop}:${prop_node}"
        fi
    else
        # Get from cache: the value runs up to the next space.
        prop_node="${cached%% *}"
    fi

    if [ -n "$prop_node" ] ; then
        echo_nth "$prop_node" $all_nodes
    else
        echo "${prog}: No ${prop} available" >&2
        exit 1
    fi
}
# Print the address of the first node that answers "ctdb pnn".
#
# Arguments: $1 - whitespace-separated list of all nodes, indexed by PNN.
# Returns:   0 after printing a node, 1 if no node answered.
get_any_available_node ()
{
    local all_nodes="$1"

    # We do a recursive onnode to find which nodes are up and running.
    # $0 is quoted in case the script path contains whitespace, and the
    # nodes file chosen with -f is passed on via the environment so the
    # probe uses the same node list that $all_nodes was built from.
    local out
    out=$(CTDB_NODES_FILE="$CTDB_NODES_FILE" "$0" -pq all ctdb pnn 2>&1)
    local line
    while read -r line ; do
        local pnn="${line#PNN:}"
        if [ "$pnn" != "$line" ] ; then
            echo_nth "$pnn" $all_nodes
            return 0
        fi
        # Else must be an error message from a down node.
    done <<<"$out"
    return 1
}
# Translate a node specification into node addresses, one per line.
#
# Arguments: $1 - the nodespec (see usage).
# Globals read: CTDB_NODES_SOCKETS (if set, used as the node list),
#   CTDB_NODES_FILE, ctdb_base, names_ok, prog.
# Exits with status 1 if the nodes file is unreadable, the nodespec is
# invalid, or a requested node cannot be found.
get_nodes ()
{
    local all_nodes

    if [ -n "$CTDB_NODES_SOCKETS" ] ; then
        all_nodes="$CTDB_NODES_SOCKETS"
    else
        local f="${ctdb_base}/nodes"
        if [ -n "$CTDB_NODES_FILE" ] ; then
            f="$CTDB_NODES_FILE"
            if [ ! -e "$f" ] && [ "${f#/}" = "$f" ] ; then
                # $f is relative, try in $ctdb_base
                f="${ctdb_base}/${f}"
            fi
        fi

        if [ ! -r "$f" ] ; then
            echo "${prog}: unable to open nodes file \"${f}\"" >&2
            exit 1
        fi

        # Strip comments and spaces.  Lines left empty become "#DEAD"
        # placeholders so that the remaining nodes keep their index.
        all_nodes=$(sed -e 's@#.*@@g' -e 's@ *@@g' -e 's@^$@#DEAD@' "$f")
    fi

    # Expand the nodespec first so that a failure in parse_nodespec's
    # subshell is caught reliably, whatever output it produced.
    local specs
    specs=$(parse_nodespec "$1") || exit 1

    local n
    for n in $specs ; do
        case "$n" in
            all)
                echo "${all_nodes//#DEAD/}"
                ;;
            any)
                get_any_available_node "$all_nodes" || exit 1
                ;;
            ok|healthy)
                get_nodes_with_status "$all_nodes" "healthy" || exit 1
                ;;
            con|connected)
                get_nodes_with_status "$all_nodes" "connected" || exit 1
                ;;
            rm|recmaster)
                get_node_with_property "$all_nodes" "recmaster" || exit 1
                ;;
            lvs|lvsmaster)
                get_node_with_property "$all_nodes" "lvsmaster" || exit 1
                ;;
            natgw|natgwlist)
                get_node_with_property "$all_nodes" "natgwlist" || exit 1
                ;;
            [0-9]|[0-9][0-9]|[0-9][0-9][0-9])
                echo_nth "$n" $all_nodes
                ;;
            *)
                # Anything else is a host name, allowed only with -n.
                $names_ok || invalid_nodespec
                echo "$n"
                ;;
        esac
    done
}
# Copy files to a node with rsync; used in place of ssh when -P is given.
#
# Arguments: $1 - destination host
#            $2 - whitespace-separated list of files; relative names are
#                 taken relative to $PWD on both the local and remote side
# Returns:   0 if every rsync succeeded, otherwise the status of the
#            most recent failing rsync, so an early failure is not
#            masked by a later success.
push()
{
    local host="$1"
    local files="$2"

    local f
    local rc=0
    # $files is split on whitespace on purpose: it is a flattened list.
    for f in $files ; do
        $verbose && echo "Pushing $f"
        case "$f" in
            /*) rsync "$f" "${host}:${f}" || rc=$? ;;
            *)  rsync "${PWD}/${f}" "${host}:${PWD}/${f}" || rc=$? ;;
        esac
    done
    return $rc
}
# Stand-in for ssh, used when $CTDB_NODES_SOCKETS is set (local daemons).
#
# Arguments: $1 - the "node", i.e. the daemon's socket, exported to the
#                 command as $CTDB_SOCKET
#            $2 - command string, run with "sh -c"
# Returns:   the exit status of the command.
#
# File descriptor 3 is used by the caller to filter stdout and stderr
# separately.  It is pointed at /dev/null here so that it does not leak
# into the local shell: a process backgrounded by the command would keep
# it open, and a caller doing foo=$(onnode ...) would then block until
# that process exits.  With real ssh the descriptor never reaches the
# remote shell, so only this function needs the redirect.
fakessh ()
{
    CTDB_SOCKET="$1" sh -c "$2" 3>/dev/null
}
# Filter a node's standard output (read from stdin).
#
# Globals read: prefix, n (current node), verbose, parallel.
#   - With -o <prefix>: save the output to "<prefix>.<node>".
#   - Verbose and parallel: tag every line with "[<node>] ".
#   - Otherwise: pass the output through unchanged.
stdout_filter ()
{
    if [ -n "$prefix" ] ; then
        # The node name is usually an IP address, but with
        # $CTDB_NODES_SOCKETS it is a socket path.  Replace every "/" by
        # "_" so the redirection does not need extra directories.
        # (Using the basename instead could make sockets in different
        # directories collide.)
        cat >"${prefix}.${n//\//_}"
    elif $verbose && $parallel ; then
        sed -e "s@^@[$n] @"
    else
        cat
    fi
}
# Filter a node's standard error (read from stdin): lines are tagged
# with "[<node>] " only when output is both verbose and parallel;
# otherwise they pass through untouched.  Reads globals verbose,
# parallel and n.
stderr_filter ()
{
    if ! $verbose || ! $parallel ; then
        cat
        return
    fi

    sed -e "s@^@[$n] @"
}
######################################################################

parse_options "$@"

# Choose the transport: $SSH is invoked as
#   $SSH $ssh_opts $EXTRA_SSH_OPTS <node> <command>
ssh_opts=
if $push ; then
    # -P: "command" is a list of files, handed to the push function.
    SSH=push
    EXTRA_SSH_OPTS=""
else
    # -c: change to the current directory on the node first.  The path
    # is shell-quoted so that directories containing whitespace or
    # other special characters survive the remote shell.
    $current && command="cd $(printf '%q' "$PWD") && $command"

    if [ -n "$CTDB_NODES_SOCKETS" ] ; then
        # Local daemons: run the command in a local shell instead.
        SSH=fakessh
        EXTRA_SSH_OPTS=""
    else
        # Could "2>/dev/null || true" but want to see errors from typos in file.
        [ -r "${ctdb_base}/onnode.conf" ] && . "${ctdb_base}/onnode.conf"
        [ -n "$SSH" ] || SSH=ssh
        if [ "$SSH" = "ssh" ] ; then
            # -n stops ssh from reading standard input.
            ssh_opts="-n"
        else
            : # rsh? All bets are off!
        fi
    fi
fi
######################################################################

# get_nodes runs in a subshell, so its "exit 1" must be checked here.
nodes=$(get_nodes "$nodespec") || exit 1

if $quiet ; then
    verbose=false
else
    # If $nodes contains a space or a newline then assume multiple nodes.
    nl=$'\n'
    [ "$nodes" != "${nodes%[ ${nl}]*}" ] && verbose=true
fi

pids=""
trap 'kill -TERM $pids 2>/dev/null' INT TERM
# There's a small race here where the kill can fail if no processes
# have been added to $pids and the script is interrupted.  However,
# the part of the window where it matters is very small.

# Make a pipeline fail if any stage fails, so that the remote command's
# status is not hidden by the filters.  Errors are ignored for shells
# that lack the option.
set -o pipefail 2>/dev/null

retcode=0
for n in $nodes ; do
    # fd 3 carries the node's (filtered) stdout around the stderr
    # filter, so that the two streams can be filtered separately.
    if $parallel ; then
        { exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS "$n" "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; } &
        pids="${pids} $!"
    else
        if $verbose ; then
            echo >&2 ; echo ">> NODE: $n <<" >&2
        fi

        # "|| retcode=$?" records the pipeline's real status.  Testing
        # $? with "[" first would overwrite it with the test's own
        # status, which is always 1.
        { exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS "$n" "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; } || retcode=$?
    fi
done

if $parallel ; then
    for p in $pids ; do
        wait "$p" || retcode=$?
    done
fi

exit $retcode