# Hey Emacs, this is a -*- shell-script -*- !!!
# utility functions for ctdb event scripts
[ -z "$CTDB_VARDIR" ] && {
if [ -d "/var/lib/ctdb" ] ; then
export CTDB_VARDIR="/var/lib/ctdb"
else
export CTDB_VARDIR="/var/ctdb"
fi
}
[ -z "$CTDB_ETCDIR" ] && {
export CTDB_ETCDIR="/etc"
}
#######################################
# pull in a system config file, if any
_loadconfig() {
if [ -z "$1" ] ; then
foo="${service_config:-${service_name}}"
if [ -n "$foo" ] ; then
loadconfig "$foo"
return
fi
fi
if [ "$1" != "ctdb" ] ; then
loadconfig "ctdb"
fi
if [ -z "$1" ] ; then
return
fi
if [ -f $CTDB_ETCDIR/sysconfig/$1 ]; then
. $CTDB_ETCDIR/sysconfig/$1
elif [ -f $CTDB_ETCDIR/default/$1 ]; then
. $CTDB_ETCDIR/default/$1
elif [ -f $CTDB_BASE/sysconfig/$1 ]; then
. $CTDB_BASE/sysconfig/$1
fi
if [ "$1" = "ctdb" ] ; then
_config="${CTDB_BASE}/ctdbd.conf"
if [ -r "$_config" ] ; then
. "$_config"
fi
fi
}
loadconfig () {
_loadconfig "$@"
}
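# Usage sketch (not called here; this is how an eventscript would use
# it): an NFS eventscript would typically start with
#   loadconfig "nfs"
# which sources the "ctdb" configuration first and then the first
# existing file of $CTDB_ETCDIR/sysconfig/nfs, $CTDB_ETCDIR/default/nfs
# or $CTDB_BASE/sysconfig/nfs.  With no argument, $service_config or
# $service_name selects the file instead.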
##############################################################
# CTDB_SCRIPT_DEBUGLEVEL can be overwritten by setting it in a
# configuration file.
debug ()
{
if [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] ; then
# If there are arguments then echo them. Otherwise expect to
# use stdin, which allows us to pass lots of debug using a
# here document.
if [ -n "$1" ] ; then
echo "DEBUG: $*"
else
sed -e 's@^@DEBUG: @'
fi
else
if [ -z "$1" ] ; then
cat >/dev/null
fi
fi
}
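# Usage sketch - output only appears when CTDB_SCRIPT_DEBUGLEVEL >= 4:
#   debug "monitor event starting"
#   debug <<EOF
#   ...lots of multi-line diagnostics via stdin...
#   EOF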
die ()
{
_msg="$1"
_rc="${2:-1}"
echo "$_msg"
exit $_rc
}
# Log given message or stdin to either syslog or a CTDB log file
# $1 is the tag passed to logger if syslog is in use.
script_log ()
{
_tag="$1" ; shift
case "$CTDB_LOGGING" in
file:*|"")
if [ -n "$CTDB_LOGGING" ] ; then
_file="${CTDB_LOGGING#file:}"
else
_file="/var/log/log.ctdb"
fi
{
if [ -n "$*" ] ; then
echo "$*"
else
cat
fi
} >>"$_file"
;;
*)
# Handle all syslog:* variants here too. There's no tool to do
# the lossy things, so just use logger.
logger -t "ctdbd: ${_tag}" $*
;;
esac
}
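# Usage sketch ("60.nfs" is just an example tag): with CTDB_LOGGING
# unset this appends to /var/log/log.ctdb; with a syslog:* setting the
# message goes via logger instead:
#   script_log "60.nfs" "statd is not responding"
#   rpcinfo -T tcp 127.0.0.1 status 2>&1 | script_log "60.nfs"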
# When things are run in the background in an eventscript then logging
# output might get lost. This is the "solution". :-)
background_with_logging ()
{
(
"$@" 2>&1 </dev/null |
script_log "${script_name}&"
)&
return 0
}
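# Usage sketch: restart a service in the background without losing its
# output (logged under this script's name with "&" appended):
#   background_with_logging startstop_nfs restart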
##############################################################
# check number of args for different events
ctdb_check_args ()
{
case "$1" in
takeip|releaseip)
if [ $# != 4 ]; then
echo "ERROR: must supply interface, IP and maskbits"
exit 1
fi
;;
updateip)
if [ $# != 5 ]; then
echo "ERROR: must supply old interface, new interface, IP and maskbits"
exit 1
fi
;;
esac
}
##############################################################
# determine on what type of system (init style) we are running
detect_init_style()
{
# only do detection if not already set:
[ -z "$CTDB_INIT_STYLE" ] || return
if [ -x /sbin/startproc ]; then
CTDB_INIT_STYLE="suse"
elif [ -x /sbin/start-stop-daemon ]; then
CTDB_INIT_STYLE="debian"
else
CTDB_INIT_STYLE="redhat"
fi
}
######################################################
# simulate /sbin/service on platforms that don't have it
# _service() makes it easier to hook the service() function for
# testing.
_service ()
{
_service_name="$1"
_op="$2"
# do nothing when no service was specified
[ -z "$_service_name" ] && return
if [ -x /sbin/service ]; then
$_nice /sbin/service "$_service_name" "$_op"
elif [ -x /usr/sbin/service ]; then
$_nice /usr/sbin/service "$_service_name" "$_op"
elif [ -x $CTDB_ETCDIR/init.d/$_service_name ]; then
$_nice $CTDB_ETCDIR/init.d/$_service_name "$_op"
elif [ -x $CTDB_ETCDIR/rc.d/init.d/$_service_name ]; then
$_nice $CTDB_ETCDIR/rc.d/init.d/$_service_name "$_op"
fi
}
service()
{
_nice=""
_service "$@"
}
######################################################
# simulate /sbin/service (niced) on platforms that don't have it
nice_service()
{
_nice="nice"
_service "$@"
}
######################################################
# Cached retrieval of PNN from local node. This never changes so why
# open a client connection to the server each time this is needed?
# This sets $pnn - this avoids an unnecessary subprocess.
ctdb_get_pnn ()
{
_pnn_file="$CTDB_VARDIR/state/my-pnn"
if [ ! -f "$_pnn_file" ] ; then
ctdb pnn | sed -e 's@.*:@@' >"$_pnn_file"
fi
read pnn <"$_pnn_file"
}
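# Usage sketch - the result is left in $pnn rather than printed:
#   ctdb_get_pnn
#   echo "This node has PNN $pnn"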
######################################################
# wrapper around /proc/ settings to allow them to be hooked
# for testing
# 1st arg is relative path under /proc/, 2nd arg is value to set
set_proc ()
{
echo "$2" >"/proc/$1"
}
######################################################
# wrapper around getting file contents from /proc/ to allow
# this to be hooked for testing
# 1st arg is relative path under /proc/
get_proc ()
{
cat "/proc/$1"
}
######################################################
# Print up to $_max kernel stack traces for processes named $_program
program_stack_traces ()
{
_prog="$1"
_max="${2:-1}"
_count=1
for _pid in $(pidof "$_prog") ; do
[ $_count -le $_max ] || break
# Do this first to avoid racing with process exit
_stack=$(get_proc "${_pid}/stack" 2>/dev/null)
if [ -n "$_stack" ] ; then
echo "Stack trace for ${_prog}[${_pid}]:"
echo "$_stack"
_count=$(($_count + 1))
fi
done
}
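# Usage sketch: dump kernel stacks for up to 5 nfsd threads:
#   program_stack_traces "nfsd" 5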
######################################################
# Check that an RPC service is healthy -
# this includes allowing a certain number of failures
# before marking the NFS service unhealthy.
#
# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
#
# each triple is a set of 3 arguments: an operator, a
# fail count limit and an action string.
#
# For example:
#
# nfs_check_rpc_service "lockd" \
# -ge 15 "verbose restart unhealthy" \
# -eq 10 "restart:bs"
#
# says that if lockd is down for 15 iterations then do
# a verbose restart of lockd and mark the node unhealthy.
# Before this, after 10 iterations of failure, the
# service is restarted silently in the background.
# Order is important: the failure limits need to be
# specified in reverse order because processing stops
# after the first condition that is true.
######################################################
nfs_check_rpc_service ()
{
_prog_name="$1" ; shift
if _nfs_check_rpc_common "$_prog_name" ; then
return
fi
while [ -n "$3" ] ; do
if _nfs_check_rpc_action "$1" "$2" "$3" ; then
break
fi
shift 3
done
}
# The new way of doing things...
nfs_check_rpc_services ()
{
# Files must end with .check - avoids editor backups, RPM fu, ...
for _f in "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9].*.check ; do
_t="${_f%.check}"
_prog_name="${_t##*/[0-9][0-9].}"
# If $_prog_name contains '@' then the bit after it is the
# address family.
_family="${_prog_name#*@}"
if [ "$_family" = "$_prog_name" ] ; then
_family=""
else
_prog_name="${_prog_name%@*}"
fi
if _nfs_check_rpc_common "$_prog_name" "$_family" ; then
# This RPC service is up, check next service...
continue
fi
# Check each line in the file in turn until one of the limit
# checks is hit...
while read _cmp _lim _rest ; do
# Skip comments
case "$_cmp" in
\#*) continue ;;
esac
if _nfs_check_rpc_action "$_cmp" "$_lim" "$_rest" ; then
# Limit was hit on this line, no further checking...
break
fi
done <"$_f"
done
}
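# A hypothetical check file, e.g. ${CTDB_BASE}/nfs-rpc-checks.d/20.mountd.check
# (the "20." prefix orders the checks; naming it "mountd@tcp6" instead
# would force the address family).  Each non-comment line is
# "OPERATOR LIMIT ACTIONS", with limits in descending order since the
# first matching line wins:
#   -ge 10 verbose restart:b unhealthy
#   -eq 5 restart:b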
_nfs_check_rpc_common ()
{
_prog_name="$1"
_family="$2"
# Some platforms don't have separate programs for all services.
case "$_prog_name" in
statd)
type "rpc.${_prog_name}" >/dev/null 2>&1 || return 0
esac
case "$_prog_name" in
nfsd)
_rpc_prog=nfs
_version=3
;;
mountd)
_rpc_prog=mountd
_version=1
;;
rquotad)
_rpc_prog=rquotad
_version=1
;;
lockd)
_rpc_prog=nlockmgr
_version=4
;;
statd)
_rpc_prog=status
_version=1
;;
*)
echo "Internal error: unknown RPC program \"$_prog_name\"."
exit 1
esac
_service_name="nfs_${_prog_name}${_family:+_}${_family}"
if ctdb_check_rpc "$_rpc_prog" "$_version" "$_family" >/dev/null ; then
ctdb_counter_init "$_service_name"
return 0
fi
ctdb_counter_incr "$_service_name"
return 1
}
_nfs_check_rpc_action ()
{
_cmp="$1"
_limit="$2"
_actions="$3"
if ctdb_check_counter "quiet" "$_cmp" "$_limit" "$_service_name" ; then
return 1
fi
for _action in $_actions ; do
case "$_action" in
verbose)
echo "$ctdb_check_rpc_out"
;;
restart)
_nfs_restart_rpc_service "$_prog_name"
;;
restart:b)
_nfs_restart_rpc_service "$_prog_name" true
;;
unhealthy)
exit 1
;;
*)
echo "Internal error: unknown action \"$_action\"."
exit 1
esac
done
return 0
}
_nfs_restart_rpc_service ()
{
_prog_name="$1"
_background="${2:-false}"
if $_background ; then
_maybe_background="background_with_logging"
else
_maybe_background=""
fi
_p="rpc.${_prog_name}"
case "$_prog_name" in
nfsd)
echo "Trying to restart NFS service"
$_maybe_background startstop_nfs restart
;;
mountd)
echo "Trying to restart $_prog_name [${_p}]"
killall -q -9 "$_p"
nfs_dump_some_threads "$_p"
$_maybe_background $_p $RPCMOUNTDOPTS \
${MOUNTD_PORT:+-p} $MOUNTD_PORT
;;
rquotad)
echo "Trying to restart $_prog_name [${_p}]"
killall -q -9 "$_p"
nfs_dump_some_threads "$_p"
$_maybe_background $_p ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT
;;
lockd)
echo "Trying to restart lock manager service"
$_maybe_background startstop_nfslock restart
;;
statd)
echo "Trying to restart $_prog_name [${_p}]"
killall -q -9 "$_p"
nfs_dump_some_threads "$_p"
$_maybe_background $_p \
${STATD_HOSTNAME:+-n} $STATD_HOSTNAME \
${STATD_PORT:+-p} $STATD_PORT \
${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT
;;
*)
echo "Internal error: unknown RPC program \"$_prog_name\"."
exit 1
esac
}
######################################################
# check that a rpc server is registered with portmap
# and responding to requests
# usage: ctdb_check_rpc PROGNAME VERSION [FAMILY]
######################################################
ctdb_check_rpc ()
{
progname="$1"
version="$2"
_family="${3:-tcp}"
_localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"
if ! ctdb_check_rpc_out=$(rpcinfo -T $_family $_localhost $progname $version 2>&1) ; then
ctdb_check_rpc_out="ERROR: $progname failed RPC check:
$ctdb_check_rpc_out"
echo "$ctdb_check_rpc_out"
return 1
fi
}
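# Usage sketch: check that the lock manager answers NLM version 4 on
# loopback via TCP (the default family); on failure the error text is
# echoed and also left in $ctdb_check_rpc_out:
#   ctdb_check_rpc "nlockmgr" 4 || echo "lock manager is down"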
######################################################
# Ensure $service_name is set
assert_service_name ()
{
[ -n "$service_name" ] || die "INTERNAL ERROR: \$service_name not set"
}
######################################################
# check a set of directories is available
# return 1 on a missing directory
# directories are read from stdin
######################################################
ctdb_check_directories_probe()
{
while IFS="" read d ; do
case "$d" in
*%*)
continue
;;
*)
[ -d "${d}/." ] || return 1
esac
done
}
######################################################
# check a set of directories is available
# directories are read from stdin
######################################################
ctdb_check_directories()
{
ctdb_check_directories_probe || {
echo "ERROR: $service_name directory \"$d\" not available"
exit 1
}
}
######################################################
# check a set of tcp ports
# usage: ctdb_check_tcp_ports <ports...>
######################################################
# This flag file is created when a service is initially started. It
# is deleted the first time TCP port checks for that service succeed.
# Until then ctdb_check_tcp_ports() prints a more subtle "error"
# message if a port check fails.
_ctdb_check_tcp_common ()
{
assert_service_name
_ctdb_service_started_file="$ctdb_fail_dir/$service_name.started"
}
ctdb_check_tcp_init ()
{
_ctdb_check_tcp_common
mkdir -p "${_ctdb_service_started_file%/*}" # dirname
touch "$_ctdb_service_started_file"
}
# Check whether something is listening on all of the given TCP ports
# using the "ctdb checktcpport" command.
ctdb_check_tcp_ports()
{
if [ -z "$1" ] ; then
echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
exit 1
fi
for _p ; do # process each function argument (port)
_cmd="ctdb checktcpport $_p"
_out=$($_cmd 2>&1)
_ret=$?
case "$_ret" in
0)
_ctdb_check_tcp_common
if [ ! -f "$_ctdb_service_started_file" ] ; then
echo "ERROR: $service_name tcp port $_p is not responding"
debug "\"ctdb checktcpport $_p\" was able to bind to port"
else
echo "INFO: $service_name tcp port $_p is not responding"
fi
return 1
;;
98)
# Couldn't bind, something already listening, next port...
continue
;;
*)
echo "ERROR: unexpected error running \"ctdb checktcpport\""
debug <<EOF
ctdb checktcpport (exited with $_ret) with output:
$_out"
EOF
return $_ret
esac
done
# All ports listening
_ctdb_check_tcp_common
rm -f "$_ctdb_service_started_file"
return 0
}
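# Usage sketch (hypothetical monitor snippet): fail the event if
# nothing is listening on the SMB ports:
#   ctdb_check_tcp_ports 445 139 || exit $?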
######################################################
# check a unix socket
# usage: ctdb_check_unix_socket <socket_path>
######################################################
ctdb_check_unix_socket() {
socket_path="$1"
[ -z "$socket_path" ] && return
if ! netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$"; then
echo "ERROR: $service_name socket $socket_path not found"
return 1
fi
}
######################################################
# check a command returns zero status
# usage: ctdb_check_command <command>
######################################################
ctdb_check_command ()
{
_out=$("$@" 2>&1) || {
echo "ERROR: $* returned error"
echo "$_out" | debug
exit 1
}
}
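# Usage sketch (hypothetical check; any command and arguments work):
#   ctdb_check_command rpcinfo -T tcp 127.0.0.1 nfs 3
# On non-zero exit this logs an error, sends the command output to
# debug and exits 1, failing the calling event.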
################################################
# kill off any TCP connections with the given IP
################################################
kill_tcp_connections ()
{
_ip="$1"
_oneway=false
if [ "$2" = "oneway" ] ; then
_oneway=true
fi
get_tcp_connections_for_ip "$_ip" | {
_killcount=0
_connections=""
_nl="
"
while read _dst _src; do
_destport="${_dst##*:}"
__oneway=$_oneway
case $_destport in
# we only do one-way killtcp for CIFS
139|445) __oneway=true ;;
esac
echo "Killing TCP connection $_src $_dst"
_connections="${_connections}${_nl}${_src} ${_dst}"
if ! $__oneway ; then
_connections="${_connections}${_nl}${_dst} ${_src}"
fi
_killcount=$(($_killcount + 1))
done
if [ $_killcount -eq 0 ] ; then
return
fi
echo "$_connections" | ctdb killtcp || {
echo "Failed to send killtcp control"
return
}
_count=0
while : ; do
_remaining=$(get_tcp_connections_for_ip $_ip | wc -l)
if [ $_remaining -eq 0 ] ; then
echo "Killed $_killcount TCP connections to released IP $_ip"
return
fi
_count=$(($_count + 1))
if [ $_count -gt 3 ] ; then
echo "Timed out killing tcp connections for IP $_ip ($_remaining remaining)"
return
fi
echo "Waiting for $_remaining connections to be killed for IP $_ip"
sleep 1
done
}
}
##################################################################
# kill off the local end for any TCP connections with the given IP
##################################################################
kill_tcp_connections_local_only ()
{
kill_tcp_connections "$1" "oneway"
}
##################################################################
# tickle any TCP connections with the given IP
##################################################################
tickle_tcp_connections ()
{
_ip="$1"
get_tcp_connections_for_ip "$_ip" |
{
_failed=false
while read dest src; do
echo "Tickle TCP connection $src $dest"
ctdb tickle $src $dest >/dev/null 2>&1 || _failed=true
echo "Tickle TCP connection $dest $src"
ctdb tickle $dest $src >/dev/null 2>&1 || _failed=true
done
if $_failed ; then
echo "Failed to send tickle control"
fi
}
}
get_tcp_connections_for_ip ()
{
_ip="$1"
netstat -tn | awk -v ip=$_ip \
'index($1, "tcp") == 1 && \
(index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1) \
&& $6 == "ESTABLISHED" \
{print $4" "$5}'
}
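# Output sketch: one "LOCAL REMOTE" pair per established connection,
# e.g. (hypothetical addresses) for get_tcp_connections_for_ip 10.0.0.1:
#   10.0.0.1:2049 10.0.0.99:51234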
##################################################################
# use statd-callout to update NFS lock info
##################################################################
nfs_update_lock_info ()
{
if [ -x "$CTDB_BASE/statd-callout" ] ; then
"$CTDB_BASE/statd-callout" update
fi
}
########################################################
# start/stop the Ganesha nfs service
########################################################
startstop_ganesha()
{
_service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
case "$1" in
start)
service "$_service_name" start
;;
stop)
service "$_service_name" stop
;;
restart)
service "$_service_name" stop
nfs_dump_some_threads "rpc.statd"
service "$_service_name" start
;;
esac
}
########################################################
# start/stop the nfs service on different platforms
########################################################
startstop_nfs() {
PLATFORM="unknown"
[ -x $CTDB_ETCDIR/init.d/nfsserver ] && {
PLATFORM="sles"
}
[ -x $CTDB_ETCDIR/init.d/nfslock -o \
-r /usr/lib/systemd/system/nfs-lock.service ] && {
PLATFORM="rhel"
}
case $PLATFORM in
sles)
case $1 in
start)
service nfsserver start
;;
stop)
service nfsserver stop > /dev/null 2>&1
;;
restart)
set_proc "fs/nfsd/threads" 0
service nfsserver stop > /dev/null 2>&1
pkill -9 nfsd
nfs_dump_some_threads
service nfsserver start
;;
esac
;;
rhel)
case $1 in
start)
service nfslock start
service nfs start
;;
stop)
service nfs stop
service nfslock stop
;;
restart)
set_proc "fs/nfsd/threads" 0
service nfs stop > /dev/null 2>&1
service nfslock stop > /dev/null 2>&1
pkill -9 nfsd
nfs_dump_some_threads
service nfslock start
service nfs start
;;
esac
;;
*)
echo "Unknown platform. NFS is not supported with ctdb"
exit 1
;;
esac
}
# Dump up to the configured number of nfsd thread backtraces.
nfs_dump_some_threads ()
{
_prog="${1:-nfsd}"
_num="${CTDB_NFS_DUMP_STUCK_THREADS:-5}"
[ $_num -gt 0 ] || return 0
program_stack_traces "$_prog" $_num
}
########################################################
# start/stop the nfs lockmanager service on different platforms
########################################################
startstop_nfslock() {
PLATFORM="unknown"
[ -x $CTDB_ETCDIR/init.d/nfsserver ] && {
PLATFORM="sles"
}
[ -x $CTDB_ETCDIR/init.d/nfslock -o \
-r /usr/lib/systemd/system/nfs-lock.service ] && {
PLATFORM="rhel"
}
case $PLATFORM in
sles)
# for sles there is no service for lockmanager
# so we instead just shutdown/restart nfs
case $1 in
start)
service nfsserver start
;;
stop)
service nfsserver stop > /dev/null 2>&1
;;
restart)
service nfsserver stop > /dev/null 2>&1
service nfsserver start
;;
esac
;;
rhel)
case $1 in
start)
service nfslock start
;;
stop)
service nfslock stop > /dev/null 2>&1
;;
restart)
service nfslock stop > /dev/null 2>&1
service nfslock start
;;
esac
;;
*)
echo "Unknown platform. NFS locking is not supported with ctdb"
exit 1
;;
esac
}
########################################################
add_ip_to_iface ()
{
_iface=$1
_ip=$2
_maskbits=$3
# Ensure interface is up
ip link set "$_iface" up || \
die "Failed to bringup interface $_iface"
# Only need to define broadcast for IPv4
case "$ip" in
*:*) _bcast="" ;;
*) _bcast="brd +" ;;
esac
ip addr add "$_ip/$_maskbits" $_bcast dev "$_iface" || {
echo "Failed to add $_ip/$_maskbits on dev $_iface"
return 1
}
# Wait 5 seconds for IPv6 addresses to stop being tentative...
if [ -z "$_bcast" ] ; then
for _x in $(seq 1 10) ; do
ip addr show to "${_ip}/128" | grep -q "tentative" || break
sleep 0.5
done
# If the address was a duplicate then it won't be on the
# interface so flag an error.
_t=$(ip addr show to "${_ip}/128")
case "$_t" in
"")
echo "Failed to add $_ip/$_maskbits on dev $_iface"
return 1
;;
*tentative*|*dadfailed*)
echo "Failed to add $_ip/$_maskbits on dev $_iface"
ip addr del "$_ip/$_maskbits" dev "$_iface"
return 1
;;
esac
fi
}
delete_ip_from_iface()
{
_iface=$1
_ip=$2
_maskbits=$3
# This could be set globally for all interfaces but it is probably
# better to avoid surprises, so limit it the interfaces where CTDB
# has public IP addresses. There isn't anywhere else convenient
# to do this so just set it each time. This is much cheaper than
# remembering and re-adding secondaries.
set_proc "sys/net/ipv4/conf/${_iface}/promote_secondaries" 1
ip addr del "$_ip/$_maskbits" dev "$_iface" || {
echo "Failed to del $_ip on dev $_iface"
return 1
}
}
# If the given IP is hosted then print 2 items: maskbits and iface
ip_maskbits_iface ()
{
_addr="$1"
case "$_addr" in
*:*) _family="inet6" ; _bits=128 ;;
*) _family="inet" ; _bits=32 ;;
esac
ip addr show to "${_addr}/${_bits}" 2>/dev/null | \
awk -v family="${_family}" \
'NR == 1 { iface = $2; sub(":$", "", iface) } \
$1 ~ /inet/ { mask = $2; sub(".*/", "", mask); \
print mask, iface, family }'
}
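# Output sketch for a hosted address (hypothetical values):
#   ip_maskbits_iface 10.0.0.1   ->   24 eth1 inet
# Nothing is printed if the address is not hosted on this node.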
drop_ip ()
{
_addr="${1%/*}" # Remove optional maskbits
set -- $(ip_maskbits_iface $_addr)
if [ -n "$1" ] ; then
_maskbits="$1"
_iface="$2"
echo "Removing public address $_addr/$_maskbits from device $_iface"
delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
fi
}
drop_all_public_ips ()
{
while read _ip _x ; do
drop_ip "$_ip"
done <"${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
}
flush_route_cache ()
{
set_proc sys/net/ipv4/route/flush 1
set_proc sys/net/ipv6/route/flush 1
}
########################################################
# Simple counters
_ctdb_counter_common () {
_service_name="${1:-${service_name:-${script_name}}}"
_counter_file="$ctdb_fail_dir/$_service_name"
mkdir -p "${_counter_file%/*}" # dirname
}
ctdb_counter_init () {
_ctdb_counter_common "$1"
>"$_counter_file"
}
ctdb_counter_incr () {
_ctdb_counter_common "$1"
# unary counting!
echo -n 1 >> "$_counter_file"
}
ctdb_check_counter () {
_msg="${1:-error}" # "error" - anything else is silent on fail
_op="${2:--ge}" # an integer operator supported by test
_limit="${3:-${service_fail_limit}}"
shift 3
_ctdb_counter_common "$1"
# unary counting!
_size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
_hit=false
if [ "$_op" != "%" ] ; then
if [ $_size $_op $_limit ] ; then
_hit=true
fi
else
if [ $(($_size $_op $_limit)) -eq 0 ] ; then
_hit=true
fi
fi
if $_hit ; then
if [ "$_msg" = "error" ] ; then
echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
exit 1
else
return 1
fi
fi
}
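# Usage sketch (hypothetical monitor snippet; "some_check" stands for
# any test): reset the counter on success, otherwise count the failure
# and go unhealthy after 3 consecutive failures:
#   if some_check ; then
#       ctdb_counter_init
#   else
#       ctdb_counter_incr
#       ctdb_check_counter "error" -ge 3
#   fi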
########################################################
ctdb_status_dir="$CTDB_VARDIR/state/service_status"
ctdb_fail_dir="$CTDB_VARDIR/state/failcount"
ctdb_setup_service_state_dir ()
{
service_state_dir="$CTDB_VARDIR/state/service_state/${1:-${service_name}}"
mkdir -p "$service_state_dir" || {
echo "Error creating state dir \"$service_state_dir\""
exit 1
}
}
########################################################
# Managed status history, for auto-start/stop
ctdb_managed_dir="$CTDB_VARDIR/state/managed_history"
_ctdb_managed_common ()
{
_ctdb_managed_file="$ctdb_managed_dir/$service_name"
}
ctdb_service_managed ()
{
_ctdb_managed_common
mkdir -p "$ctdb_managed_dir"
touch "$_ctdb_managed_file"
}
ctdb_service_unmanaged ()
{
_ctdb_managed_common
rm -f "$_ctdb_managed_file"
}
is_ctdb_previously_managed_service ()
{
_ctdb_managed_common
[ -f "$_ctdb_managed_file" ]
}
########################################################
# Check and set status
log_status_cat ()
{
echo "node is \"$1\", \"${script_name}\" reports problem: $(cat $2)"
}
ctdb_checkstatus ()
{
if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
return 1
elif [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
return 2
else
return 0
fi
}
ctdb_setstatus ()
{
d="$ctdb_status_dir/$script_name"
case "$1" in
unhealthy|banned)
mkdir -p "$d"
cat "$2" >"$d/$1"
;;
*)
for i in "banned" "unhealthy" ; do
rm -f "$d/$i"
done
;;
esac
}
##################################################################
# Reconfigure a service on demand
_ctdb_service_reconfigure_common ()
{
_d="$ctdb_status_dir/${service_name}"
mkdir -p "$_d"
_ctdb_service_reconfigure_flag="$_d/reconfigure"
}
ctdb_service_needs_reconfigure ()
{
_ctdb_service_reconfigure_common
[ -e "$_ctdb_service_reconfigure_flag" ]
}
ctdb_service_set_reconfigure ()
{
_ctdb_service_reconfigure_common
>"$_ctdb_service_reconfigure_flag"
}
ctdb_service_unset_reconfigure ()
{
_ctdb_service_reconfigure_common
rm -f "$_ctdb_service_reconfigure_flag"
}
ctdb_service_reconfigure ()
{
echo "Reconfiguring service \"${service_name}\"..."
ctdb_service_unset_reconfigure
service_reconfigure || return $?
ctdb_counter_init
}
# Default service_reconfigure() function does nothing.
service_reconfigure ()
{
:
}
ctdb_reconfigure_take_lock ()
{
_ctdb_service_reconfigure_common
_lock="${_d}/reconfigure_lock"
mkdir -p "${_lock%/*}" # dirname
touch "$_lock"
(
flock 0
# This is overkill but will work if we need to extend this to
# allow certain events to run multiple times in parallel
# (e.g. takeip) and write multiple PIDs to the file.
read _locker_event
if [ -n "$_locker_event" ] ; then
while read _pid ; do
if [ -n "$_pid" -a "$_pid" != $$ ] && \
kill -0 "$_pid" 2>/dev/null ; then
exit 1
fi
done
fi
printf "%s\n%s\n" "$event_name" $$ >"$_lock"
exit 0
) <"$_lock"
}
ctdb_reconfigure_release_lock ()
{
_ctdb_service_reconfigure_common
_lock="${_d}/reconfigure_lock"
rm -f "$_lock"
}
ctdb_replay_monitor_status ()
{
echo "Replaying previous status for this script due to reconfigure..."
# Leading separator ('|') is missing in some versions...
_out=$(ctdb scriptstatus -X | grep -E "^\|?monitor\|${script_name}\|")
# Output looks like this:
# |monitor|60.nfs|1|ERROR|1314764004.030861|1314764004.035514|foo bar|
# This is the cheapest way of getting fields in the middle.
set -- $(IFS="|" ; echo $_out)
_code="$3"
_status="$4"
# The error output field can include colons so we'll try to
# preserve them. The weak checking at the beginning tries to make
# this work for both broken (no leading '|') and fixed output.
_out="${_out%|}"
_err_out="${_out#*monitor|${script_name}|*|*|*|*|}"
case "$_status" in
OK) : ;; # Do nothing special.
TIMEDOUT)
# Recast this as an error, since we can't exit with the
# correct negative number.
_code=1
_err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
;;
DISABLED)
# Recast this as an OK, since we can't exit with the
# correct negative number.
_code=0
_err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
;;
*) : ;; # Must be ERROR, do nothing special.
esac
if [ -n "$_err_out" ] ; then
echo "$_err_out"
fi
exit $_code
}
ctdb_service_check_reconfigure ()
{
assert_service_name
# We only care about some events in this function. For others we
# return now.
case "$event_name" in
monitor|ipreallocated|reconfigure) : ;;
*) return 0 ;;
esac
if ctdb_reconfigure_take_lock ; then
# No events covered by this function are running, so proceed
# with gay abandon.
case "$event_name" in
reconfigure)
(ctdb_service_reconfigure)
exit $?
;;
ipreallocated)
if ctdb_service_needs_reconfigure ; then
ctdb_service_reconfigure
fi
;;
esac
ctdb_reconfigure_release_lock
else
# Somebody else is running an event we don't want to collide
# with. We proceed with caution.
case "$event_name" in
reconfigure)
# Tell whoever called us to retry.
exit 2
;;
ipreallocated)
# Defer any scheduled reconfigure and just run the
# rest of the ipreallocated event, as per the
# eventscript. There's an assumption here that the
# event doesn't depend on any scheduled reconfigure.
# This is true in the current code.
return 0
;;
monitor)
# There is most likely a reconfigure in progress so
# the service is possibly unstable. As above, we
# defer any scheduled reconfigured. We also replay
# the previous monitor status since that's the best
# information we have.
ctdb_replay_monitor_status
;;
esac
fi
}
##################################################################
# Does CTDB manage this service? - and associated auto-start/stop
ctdb_compat_managed_service ()
{
if [ "$1" = "yes" -a "$2" = "$service_name" ] ; then
CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
fi
}
is_ctdb_managed_service ()
{
assert_service_name
# $t is used just for readability and to allow more accurate
# matching via leading/trailing spaces
t=" $CTDB_MANAGED_SERVICES "
# Return 0 if "<space>$service_name<space>" appears in $t
if [ "${t#* ${service_name} }" != "${t}" ] ; then
return 0
fi
# If above didn't match then update $CTDB_MANAGED_SERVICES for
# backward compatibility and try again.
ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD" "vsftpd"
ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA" "samba"
ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND" "winbind"
ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD" "apache2"
ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD" "httpd"
ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI" "iscsi"
ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD" "clamd"
ctdb_compat_managed_service "$CTDB_MANAGES_NFS" "nfs"
ctdb_compat_managed_service "$CTDB_MANAGES_NFS" "nfs-ganesha-gpfs"
t=" $CTDB_MANAGED_SERVICES "
# Return 0 if "<space>$service_name<space>" appears in $t
[ "${t#* ${service_name} }" != "${t}" ]
}
ctdb_start_stop_service ()
{
assert_service_name
# Allow service-start/service-stop pseudo-events to start/stop
# services when we're not auto-starting/stopping and we're not
# monitoring.
case "$event_name" in
service-start)
if is_ctdb_managed_service ; then
die 'service-start event not permitted when service is managed'
fi
if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
die 'service-start event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
fi
ctdb_service_start
exit $?
;;
service-stop)
if is_ctdb_managed_service ; then
die 'service-stop event not permitted when service is managed'
fi
if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
die 'service-stop event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
fi
ctdb_service_stop
exit $?
;;
esac
# Do nothing unless configured to...
[ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] || return 0
[ "$event_name" = "monitor" ] || return 0
if is_ctdb_managed_service ; then
if ! is_ctdb_previously_managed_service ; then
echo "Starting service \"$service_name\" - now managed"
background_with_logging ctdb_service_start
exit $?
fi
else
if is_ctdb_previously_managed_service ; then
echo "Stopping service \"$service_name\" - no longer managed"
background_with_logging ctdb_service_stop
exit $?
fi
fi
}
ctdb_service_start ()
{
# The service is marked managed if we've ever tried to start it.
ctdb_service_managed
service_start || return $?
ctdb_counter_init
ctdb_check_tcp_init
}
ctdb_service_stop ()
{
ctdb_service_unmanaged
service_stop
}
# Default service_start() and service_stop() functions.
# These may be overridden in an eventscript.
service_start ()
{
service "$service_name" start
}
service_stop ()
{
service "$service_name" stop
}
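# An eventscript can override them, e.g. a hypothetical NFS sketch:
#   service_start ()
#   {
#       startstop_nfs stop
#       startstop_nfs start
#   }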
##################################################################
ctdb_standard_event_handler ()
{
case "$1" in
status)
ctdb_checkstatus
exit
;;
setstatus)
shift
ctdb_setstatus "$@"
exit
;;
esac
}
iptables_wrapper ()
{
_family="$1" ; shift
if [ "$_family" = "inet6" ] ; then
_iptables_cmd="ip6tables"
else
_iptables_cmd="iptables"
fi
# iptables doesn't like being re-entered, so flock-wrap it.
flock -w 30 "${CTDB_VARDIR}/iptables-ctdb.flock" "$_iptables_cmd" "$@"
}
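# Usage sketch: takes the family plus ordinary iptables/ip6tables
# arguments, e.g. (hypothetical rule):
#   iptables_wrapper inet -D INPUT -p tcp --dport 445 -j DROP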
# AIX (and perhaps others?) doesn't have mktemp
if ! type mktemp >/dev/null 2>&1 ; then
mktemp ()
{
_dir=false
if [ "$1" = "-d" ] ; then
_dir=true
shift
fi
_d="${TMPDIR:-/tmp}"
_hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
md5sum | \
sed -e 's@\(..........\).*@\1@')
_t="${_d}/tmp.${_hex10}"
(
umask 077
if $_dir ; then
mkdir "$_t"
else
>"$_t"
fi
)
echo "$_t"
}
fi
########################################################
# tickle handling
########################################################
update_tickles ()
{
_port="$1"
tickledir="$CTDB_VARDIR/state/tickles"
mkdir -p "$tickledir"
ctdb_get_pnn
# What public IPs do I hold?
_ips=$(ctdb -X ip | awk -F'|' -v pnn=$pnn '$3 == pnn {print $2}')
# IPs as a regexp choice
_ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"
# Record connections to our public IPs in a temporary file
_my_connections="${tickledir}/${_port}.connections"
rm -f "$_my_connections"
netstat -tn |
awk -v destpat="^${_ipschoice}:${_port}\$" \
'$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
sort >"$_my_connections"
# Record our current tickles in a temporary file
_my_tickles="${tickledir}/${_port}.tickles"
rm -f "$_my_tickles"
for _i in $_ips ; do
ctdb -X gettickles $_i $_port |
awk -F'|' 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
done |
sort >"$_my_tickles"
# Add tickles for connections that we haven't already got tickles for
comm -23 "$_my_connections" "$_my_tickles" |
while read _src _dst ; do
ctdb addtickle $_src $_dst
done
# Remove tickles for connections that are no longer there
comm -13 "$_my_connections" "$_my_tickles" |
while read _src _dst ; do
ctdb deltickle $_src $_dst
done
rm -f "$_my_connections" "$_my_tickles"
}
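# Usage sketch (hypothetical monitor snippet): maintain tickles for
# NFS connections to port 2049:
#   update_tickles 2049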
########################################################
# load a site local config file
########################################################
[ -n "$CTDB_RC_LOCAL" -a -x "$CTDB_RC_LOCAL" ] && {
. "$CTDB_RC_LOCAL"
}
[ -x $CTDB_BASE/rc.local ] && {
. $CTDB_BASE/rc.local
}
[ -d $CTDB_BASE/rc.local.d ] && {
for i in $CTDB_BASE/rc.local.d/* ; do
[ -x "$i" ] && . "$i"
done
}
script_name="${0##*/}" # basename
service_fail_limit=1
event_name="$1"