# Hey Emacs, this is a -*- shell-script -*- !!!

# utility functions for ctdb event scripts

PATH=/bin:/usr/bin:/usr/sbin:/sbin:$PATH

# Pick a default state directory and config root unless the caller has
# already provided them.
[ -z "$CTDB_VARDIR" ] && {
    if [ -d "/var/lib/ctdb" ] ; then
        export CTDB_VARDIR="/var/lib/ctdb"
    else
        export CTDB_VARDIR="/var/ctdb"
    fi
}
[ -z "$CTDB_ETCDIR" ] && {
    export CTDB_ETCDIR="/etc"
}

#######################################
# pull in a system config file, if any
#
# With no argument the config named by $service_config (or, failing
# that, $service_name) is loaded.  For any name other than "ctdb" the
# "ctdb" config is loaded first.  The file is looked for in
# $CTDB_ETCDIR/sysconfig, $CTDB_ETCDIR/default and $CTDB_BASE/sysconfig,
# in that order; the first one found is sourced.
_loadconfig() {
    if [ -z "$1" ] ; then
        foo="${service_config:-${service_name}}"
        if [ -n "$foo" ] ; then
            loadconfig "$foo"
        fi
    elif [ "$1" != "ctdb" ] ; then
        loadconfig "ctdb"
    fi

    # Paths are quoted so that directories containing whitespace work.
    if [ -f "$CTDB_ETCDIR/sysconfig/$1" ] ; then
        . "$CTDB_ETCDIR/sysconfig/$1"
    elif [ -f "$CTDB_ETCDIR/default/$1" ] ; then
        . "$CTDB_ETCDIR/default/$1"
    elif [ -f "$CTDB_BASE/sysconfig/$1" ] ; then
        . "$CTDB_BASE/sysconfig/$1"
    fi
}

# Thin wrapper so that loadconfig() can be hooked/overridden.
loadconfig () {
    _loadconfig "$@"
}

##############################################################
# make sure CTDB_CURRENT_DEBUGLEVEL is set to the desired debug level
# (integer)
#
# If it is already set then do nothing, since it might have been set
# via a file in rc.local.d/.  If it is not set then set it by sourcing
# /var/ctdb/eventscript_debuglevel.  If this file does not exist then
# create it using output from "ctdb getdebug".  If the optional 1st arg
# is "create" then don't source an existing file but create a new one
# instead - this is useful for creating the file just once in each
# event run in 00.ctdb.  If there's a problem getting the debug level
# from ctdb then it is silently set to 0 - no use spamming logs if our
# debug code is broken...
ctdb_set_current_debuglevel ()
{
    [ -z "$CTDB_CURRENT_DEBUGLEVEL" ] || return 0

    _f="$CTDB_VARDIR/eventscript_debuglevel"

    if [ "$1" = "create" ] || [ ! -r "$_f" ] ; then
        _t=$(ctdb getdebug -Y 2>/dev/null)
        # get last field of output
        _t="${_t%:}"
        _t="${_t##*:}"
        # Defaults to 0
        echo "export CTDB_CURRENT_DEBUGLEVEL=\"${_t:-0}\"" >"$_f"
    fi

    . "$_f"
}

# Emit debug output, prefixed with "DEBUG: ", when the current debug
# level is at least 4.  An unset level is treated as 0.
debug ()
{
    if [ "${CTDB_CURRENT_DEBUGLEVEL:-0}" -ge 4 ] ; then
        # If there are arguments then echo them.  Otherwise expect to
        # use stdin, which allows us to pass lots of debug using a
        # here document.
        if [ -n "$1" ] ; then
            echo "DEBUG: $*"
        elif ! tty -s ; then
            sed -e 's@^@DEBUG: @'
        fi
    fi
}

# Print message $1 and exit with status $2 (default 1).
die ()
{
    _msg="$1"
    _rc="${2:-1}"

    echo "$_msg"
    exit $_rc
}

# When things are run in the background in an eventscript then logging
# output might get lost.  This is the "solution".  :-)
#
# NOTE(review): the pipeline inside the subshell was truncated in the
# reviewed copy of this file (only '"$@" 2>&1', '>"$CTDB_LOGFILE"' and
# a dangling 'fi' were left).  It has been reconstructed from the
# visible $_using_syslog logic - confirm against the upstream source.
background_with_logging ()
{
    _using_syslog=false
    if [ "$CTDB_SYSLOG" = "yes" ] || [ -z "$CTDB_LOGFILE" ] ; then
        _using_syslog=true
    fi
    case "$CTDB_OPTIONS" in
        *--syslog*) _using_syslog=true ;;
    esac

    (
        "$@" 2>&1 </dev/null |
        if $_using_syslog ; then
            logger -t "ctdbd: ${script_name}&"
        else
            while read _l ; do
                echo "ctdbd: ${script_name}&: $_l"
            done >>"$CTDB_LOGFILE"
        fi
    )&

    return 0
}

##############################################################
# check number of args for different events
ctdb_check_args ()
{
    case "$1" in
        takeip|releaseip)
            if [ $# != 4 ] ; then
                echo "ERROR: must supply interface, IP and maskbits"
                exit 1
            fi
            ;;
        updateip)
            if [ $# != 5 ] ; then
                echo "ERROR: must supply old interface, new interface, IP and maskbits"
                exit 1
            fi
            ;;
    esac
}

##############################################################
# determine on what type of system (init style) we are running
detect_init_style()
{
    # only do detection if not already set:
    test "x$CTDB_INIT_STYLE" != "x" && return

    if [ -x /sbin/startproc ] ; then
        CTDB_INIT_STYLE="suse"
    elif [ -x /sbin/start-stop-daemon ] ; then
        CTDB_INIT_STYLE="debian"
    else
        CTDB_INIT_STYLE="redhat"
    fi
}

######################################################
# simulate /sbin/service on platforms that don't have it
# _service() makes it easier to hook the service() function for
# testing.
# Run init operation $2 (e.g. start/stop/restart) for service $1, using
# /sbin/service when available and otherwise the init script under
# $CTDB_ETCDIR/init.d or $CTDB_ETCDIR/rc.d/init.d.  The command is
# prefixed with the contents of $_nice, which the callers below set.
_service ()
{
    _service_name="$1"
    _op="$2"

    # do nothing, when no service was specified
    [ -z "$_service_name" ] && return

    # $_nice is deliberately unquoted: when empty it must vanish rather
    # than become an empty command word.
    if [ -x /sbin/service ] ; then
        $_nice /sbin/service "$_service_name" "$_op"
    elif [ -x "$CTDB_ETCDIR/init.d/$_service_name" ] ; then
        $_nice "$CTDB_ETCDIR/init.d/$_service_name" "$_op"
    elif [ -x "$CTDB_ETCDIR/rc.d/init.d/$_service_name" ] ; then
        $_nice "$CTDB_ETCDIR/rc.d/init.d/$_service_name" "$_op"
    fi
}

service()
{
    _nice=""
    _service "$@"
}

######################################################
# simulate /sbin/service (niced) on platforms that don't have it
nice_service()
{
    _nice="nice"
    _service "$@"
}

######################################################
# wrapper around /proc/ settings to allow them to be hooked
# for testing
# 1st arg is relative path under /proc/, 2nd arg is value to set
set_proc ()
{
    echo "$2" >"/proc/$1"
}

######################################################
# wrapper around getting file contents from /proc/ to allow
# this to be hooked for testing
# 1st arg is relative path under /proc/
get_proc ()
{
    cat "/proc/$1"
}

######################################################
# Check that an RPC service is healthy -
# this includes allowing a certain number of failures
# before marking the NFS service unhealthy.
#
# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
#
# each triple is a set of 3 arguments: an operator, a
# fail count limit and an action string.
#
# For example:
#
#   nfs_check_rpc_service "lockd" \
#       -ge 15 "verbose restart unhealthy" \
#       -eq 10 "restart:bs"
#
# says that if lockd is down for 15 iterations then do
# a verbose restart of lockd and mark the node unhealthy.
# Before this, after 10 iterations of failure, the
# service is restarted silently in the background.
# Order is important: the number of failures need to be
# specified in reverse order because processing stops
# after the first condition that is true.
######################################################
# Check RPC program $1 (knfsd, ganesha, mountd, rquotad, lockd or
# statd), optionally followed by an RPC version and then by
# operator/limit/action triples.  On success the failure counter for
# "nfs_$1" is reset; on failure it is incremented and the actions of
# the first triple whose condition holds are run.
nfs_check_rpc_service ()
{
    _prog_name="$1" ; shift

    # An optional version may precede the triples; triples always start
    # with an operator such as -ge.
    _v=""
    case "$1" in
        -*) : ;;
        *) _v="$1" ; shift ;;
    esac
    _version=${_v:-1}

    _rpc_prog="$_prog_name"
    _restart=""
    _opts=""
    case "$_prog_name" in
        knfsd)
            _rpc_prog=nfs
            _version=${_v:-3}
            _restart="echo 'Trying to restart NFS service'"
            _restart="${_restart}; startstop_nfs restart"
            ;;
        ganesha)
            _rpc_prog=nfs
            _version=${_v:-3}
            _restart="echo 'Trying to restart Ganesha NFS service'"
            _restart="${_restart}; startstop_ganesha restart"
            ;;
        mountd)
            _opts="${MOUNTD_PORT:+ -p }${MOUNTD_PORT}"
            ;;
        rquotad)
            _opts="${RQUOTAD_PORT:+ -p }${RQUOTAD_PORT}"
            ;;
        lockd)
            _rpc_prog=nlockmgr
            _version=${_v:-4}
            _restart="echo 'Trying to restart lock manager service'"
            _restart="${_restart}; startstop_nfslock restart"
            ;;
        statd)
            _rpc_prog=status
            _opts="${STATD_HOSTNAME:+ -n }${STATD_HOSTNAME}"
            _opts="${_opts}${STATD_PORT:+ -p }${STATD_PORT}"
            _opts="${_opts}${STATD_OUTGOING_PORT:+ -o }${STATD_OUTGOING_PORT}"
            ;;
        *)
            echo "Internal error: unknown RPC program \"$_prog_name\"."
            exit 1
            ;;
    esac

    _service_name="nfs_${_prog_name}"

    if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
        ctdb_counter_init "$_service_name"
        return 0
    fi

    ctdb_counter_incr "$_service_name"

    while [ -n "$3" ] ; do
        ctdb_check_counter "quiet" "$1" "$2" "$_service_name" || {
            for _action in $3 ; do
                case "$_action" in
                    verbose)
                        echo "$ctdb_check_rpc_out"
                        ;;
                    restart|restart:*)
                        # No explicit command specified, construct rpc command.
                        if [ -z "$_restart" ] ; then
                            _p="rpc.${_prog_name}"
                            _restart="echo 'Trying to restart $_prog_name [${_p}${_opts}]'"
                            _restart="${_restart}; killall -q -9 $_p"
                            _restart="${_restart}; $_p $_opts"
                        fi

                        # Process restart flags...
                        _flags="${_action#restart:}"
                        # There may not have been a colon...
                        [ "$_flags" != "$_action" ] || _flags=""
                        # q=quiet - everything to /dev/null
                        if [ "${_flags#*q}" != "$_flags" ] ; then
                            _restart="{ ${_restart} ; } >/dev/null 2>&1"
                        fi
                        # s=stealthy - last command to /dev/null
                        if [ "${_flags#*s}" != "$_flags" ] ; then
                            _restart="${_restart} >/dev/null 2>&1"
                        fi
                        # b=background - the whole thing, easy and reliable
                        if [ "${_flags#*b}" != "$_flags" ] ; then
                            _restart="{ ${_restart} ; } &"
                        fi

                        # Do it!  (the command string is built only from
                        # the fixed fragments and config variables above)
                        eval "${_restart}"
                        ;;
                    unhealthy)
                        exit 1
                        ;;
                    *)
                        echo "Internal error: unknown action \"$_action\"."
                        exit 1
                        ;;
                esac
            done

            # Only process the first action group.
            break
        }
        shift 3
    done
}

######################################################
# check that a rpc server is registered with portmap
# and responding to requests
# usage: ctdb_check_rpc SERVICE_NAME VERSION
#
# On failure the error text is left in $ctdb_check_rpc_out, echoed,
# and 1 is returned.
######################################################
ctdb_check_rpc ()
{
    progname="$1"
    version="$2"

    if ! ctdb_check_rpc_out=$(rpcinfo -u localhost "$progname" "$version" 2>&1) ; then
        ctdb_check_rpc_out="ERROR: $progname failed RPC check:
$ctdb_check_rpc_out"
        echo "$ctdb_check_rpc_out"
        return 1
    fi
}

######################################################
# check a set of directories is available
# return 1 on a missing directory
# usage: ctdb_check_directories_probe SERVICE_NAME
#
# Directory names are read from stdin, one per line.  Names containing
# '%' are skipped.  The name being checked is left in $d so that the
# caller can report it.
######################################################
ctdb_check_directories_probe()
{
    while IFS="" read -r d ; do
        case "$d" in
            *%*)
                continue
                ;;
            *)
                [ -d "${d}/." ] || return 1
                ;;
        esac
    done
}

######################################################
# check a set of directories is available
# usage: ctdb_check_directories SERVICE_NAME
######################################################
ctdb_check_directories()
{
    n="${1:-${service_name}}"
    ctdb_check_directories_probe || {
        echo "ERROR: $n directory \"$d\" not available"
        exit 1
    }
}

######################################################
# check a set of tcp ports
# usage: ctdb_check_tcp_ports <ports>
######################################################

# This flag file is created when a service is initially started.  It
# is deleted the first time TCP port checks for that service succeed.
# Until then ctdb_check_tcp_ports() prints a more subtle "error"
# message if a port check fails.
_ctdb_check_tcp_common ()
{
    _ctdb_service_started_file="$ctdb_fail_dir/$service_name.started"
}

ctdb_check_tcp_init ()
{
    _ctdb_check_tcp_common
    mkdir -p "${_ctdb_service_started_file%/*}" # dirname
    touch "$_ctdb_service_started_file"
}

# Try each checker named in $CTDB_TCP_PORT_CHECKERS in turn.  A checker
# returns 0 (all ports listening), 1 (port $_p is not listening) or 127
# (checker unavailable, try the next one).
#
# NOTE(review): in the reviewed copy of this file everything from the
# first "debug <<" here-document to the start of
# ctdb_check_tcp_ports_netstat() was missing.  The INFO branch, the 127
# branch, the trailing INTERNAL ERROR and the netstat command line have
# been reconstructed from the surrounding comments and checker return
# codes - confirm against the upstream source.
ctdb_check_tcp_ports()
{
    if [ -z "$1" ] ; then
        echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
        exit 1
    fi

    # Set default value for CTDB_TCP_PORT_CHECKS if unset.
    # If any of these defaults are unsupported then this variable can
    # be overridden in /etc/sysconfig/ctdb or via a file in
    # /etc/ctdb/rc.local.d/.
    : "${CTDB_TCP_PORT_CHECKERS:=ctdb nmap netstat}"

    for _c in $CTDB_TCP_PORT_CHECKERS ; do
        ctdb_check_tcp_ports_$_c "$@"
        case "$?" in
            0)
                _ctdb_check_tcp_common
                rm -f "$_ctdb_service_started_file"
                return 0
                ;;
            1)
                _ctdb_check_tcp_common
                if [ ! -f "$_ctdb_service_started_file" ] ; then
                    echo "ERROR: $service_name tcp port $_p is not responding"
                    debug <<EOF
$ctdb_check_tcp_ports_debug
EOF
                else
                    echo "INFO: $service_name tcp port $_p is not responding"
                fi

                return 1
                ;;
            127)
                debug <<EOF
ctdb_check_ports - checker $_c not implemented
output from checker was:
$ctdb_check_tcp_ports_debug
EOF
                ;;
            *)
                :
                ;;
        esac
    done

    echo "INTERNAL ERROR: ctdb_check_ports - no working checkers in CTDB_TCP_PORT_CHECKERS=\"$CTDB_TCP_PORT_CHECKERS\""

    return 127
}

# Checker using netstat: every port argument must appear as a LISTEN
# socket bound to 0.0.0.0 or ::.
ctdb_check_tcp_ports_netstat ()
{
    _cmd='netstat -l -t -n'
    _ns=$($_cmd 2>&1)
    if [ $? -eq 127 ] ; then
        # netstat probably not installed - unlikely?
        ctdb_check_tcp_ports_debug="$_ns"
        return 127
    fi

    for _p ; do  # process each function argument (port)
        for _a in '0\.0\.0\.0' '::' ; do
            _pat="[[:space:]]${_a}:${_p}[[:space:]]+[^[:space:]]+[[:space:]]+LISTEN"
            if echo "$_ns" | grep -E -q "$_pat" ; then
                # We matched the port, so process next port
                continue 2
            fi
        done

        # We didn't match the port, so flag an error.
        ctdb_check_tcp_ports_debug="$_cmd shows this output: $_ns"
        return 1
    done

    return 0
}

# Checker using nmap against 127.0.0.1.
ctdb_check_tcp_ports_nmap ()
{
    # nmap wants a comma-separated list of ports
    _ports=""
    for _p ; do
        _ports="${_ports}${_ports:+,}${_p}"
    done

    _cmd="nmap -n -oG - -PS 127.0.0.1 -p $_ports"

    _nmap_out=$($_cmd 2>&1)
    if [ $? -eq 127 ] ; then
        # nmap probably not installed
        ctdb_check_tcp_ports_debug="$_nmap_out"
        return 127
    fi

    # get the port-related output
    _port_info=$(echo "$_nmap_out" |
        sed -n -r -e 's@^.*Ports:[[:space:]]@@p')

    for _p ; do
        # looking for something like this:
        #  445/open/tcp//microsoft-ds///
        # possibly followed by a comma
        _t="$_p/open/tcp//"
        case "$_port_info" in
            # The info we're after must be either at the beginning of
            # the string or it must follow a space.
            $_t*|*\ $_t*) : ;;
            *)
                # Nope, flag an error...
                ctdb_check_tcp_ports_debug="$_cmd shows this output: $_nmap_out"
                return 1
                ;;
        esac
    done

    return 0
}

# Use the new "ctdb checktcpport" command to check the port.
# This is very cheap.
ctdb_check_tcp_ports_ctdb ()
{
    for _p ; do  # process each function argument (port)
        _cmd="ctdb checktcpport $_p"
        _out=$($_cmd 2>&1)
        _ret=$?
        case "$_ret" in
            0)
                ctdb_check_tcp_ports_debug="\"$_cmd\" was able to bind to port"
                return 1
                ;;
            98)
                # Couldn't bind, something already listening, next port...
                continue
                ;;
            *)
                ctdb_check_tcp_ports_debug="$_cmd (exited with $_ret) with output: $_out"
                # assume not implemented
                return 127
                ;;
        esac
    done

    return 0
}

######################################################
# check a unix socket
# usage: ctdb_check_unix_socket SERVICE_NAME <socket_path>
######################################################
ctdb_check_unix_socket() {
    socket_path="$1"
    [ -z "$socket_path" ] && return

    if ! netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$" ; then
        echo "ERROR: $service_name socket $socket_path not found"
        return 1
    fi
}

######################################################
# check a command returns zero status
# usage: ctdb_check_command SERVICE_NAME <command>
######################################################
ctdb_check_command() {
    service_name="$1"
    wait_cmd="$2"
    [ -z "$wait_cmd" ] && return

    # $wait_cmd is deliberately unquoted: it may carry arguments.
    $wait_cmd >/dev/null 2>&1 || {
        echo "ERROR: $service_name - $wait_cmd returned error"
        exit 1
    }
}

# Write "local-address remote-address" pairs for every ESTABLISHED TCP
# connection whose local address is IP $1 (plain or ::ffff: mapped) to
# file $2.
_ctdb_record_tcp_connections ()
{
    netstat -tn | grep -E "^tcp.*[[:space:]]+$1:.*ESTABLISHED" |
        awk '{print $4" "$5}' >"$2"
    netstat -tn | grep -E "^tcp.*[[:space:]]+::ffff:$1:.*ESTABLISHED" |
        awk '{print $4" "$5}' >>"$2"
}

# Shared implementation of kill_tcp_connections() ($2 = "yes") and
# kill_tcp_connections_local_only() ($2 = "no").  When $2 is "yes" the
# reverse direction is also killed, except for CIFS ports.
_ctdb_kill_tcp_connections ()
{
    _IP="$1"
    _two_way="$2"
    _failed=0
    _killcount=0
    connfile="$CTDB_VARDIR/state/connections.$_IP"

    _ctdb_record_tcp_connections "$_IP" "$connfile"

    while read -r dest src ; do
        srcip="${src%:*}"
        srcport="${src##*:}"
        destip="${dest%:*}"
        destport="${dest##*:}"
        echo "Killing TCP connection $srcip:$srcport $destip:$destport"
        ctdb killtcp "$srcip:$srcport" "$destip:$destport" >/dev/null 2>&1 || _failed=1
        if [ "$_two_way" = "yes" ] ; then
            case "$destport" in
                # we only do one-way killtcp for CIFS
                139|445) : ;;
                # for all others we do 2-way
                *)
                    ctdb killtcp "$destip:$destport" "$srcip:$srcport" >/dev/null 2>&1 || _failed=1
                    ;;
            esac
        fi
        _killcount=$((_killcount + 1))
    done <"$connfile"
    rm -f "$connfile"

    [ $_failed = 0 ] || {
        echo "Failed to send killtcp control"
        return
    }
    [ $_killcount -gt 0 ] || {
        return
    }

    # Give the connections a few seconds to disappear.
    _count=0
    while netstat -tn | grep -E "^tcp.*[[:space:]]+$_IP:.*ESTABLISHED" >/dev/null ; do
        sleep 1
        _count=$((_count + 1))
        [ $_count -gt 3 ] && {
            echo "Timed out killing tcp connections for IP $_IP"
            return
        }
    done
    echo "killed $_killcount TCP connections to released IP $_IP"
}

################################################
# kill off any TCP connections with the given IP
################################################
kill_tcp_connections() {
    _ctdb_kill_tcp_connections "$1" "yes"
}

##################################################################
# kill off the local end for any TCP connections with the given IP
##################################################################
kill_tcp_connections_local_only() {
    _ctdb_kill_tcp_connections "$1" "no"
}

##################################################################
# tickle any TCP connections with the given IP
##################################################################
tickle_tcp_connections() {
    _IP="$1"
    _failed=0
    _killcount=0
    connfile="$CTDB_VARDIR/state/connections.$_IP"

    _ctdb_record_tcp_connections "$_IP" "$connfile"

    while read -r dest src ; do
        srcip="${src%:*}"
        srcport="${src##*:}"
        destip="${dest%:*}"
        destport="${dest##*:}"
        echo "Tickle TCP connection $srcip:$srcport $destip:$destport"
        ctdb tickle "$srcip:$srcport" "$destip:$destport" >/dev/null 2>&1 || _failed=1
        echo "Tickle TCP connection $destip:$destport $srcip:$srcport"
        ctdb tickle "$destip:$destport" "$srcip:$srcport" >/dev/null 2>&1 || _failed=1
    done <"$connfile"
    rm -f "$connfile"

    [ $_failed = 0 ] || {
        echo "Failed to send tickle control"
        return
    }
}

########################################################
# start/stop the Ganesha nfs service
########################################################
startstop_ganesha()
{
    # The first FSAL with an executable init script wins; if none is
    # found the last name in the list is used.
    _ganesha_fsal_list="gpfs"
    for _fsal in $_ganesha_fsal_list ; do
        _service_name="nfs-ganesha-${_fsal}"
        if [ -x "/etc/init.d/$_service_name" ] ; then
            break
        fi
    done

    case "$1" in
        start|stop|restart)
            service "$_service_name" "$1"
            ;;
    esac
}

########################################################
# start/stop the nfs service on different platforms
########################################################
startstop_nfs() {
    PLATFORM="unknown"
    [ -x "$CTDB_ETCDIR/init.d/nfsserver" ] && {
        PLATFORM="sles"
    }
    [ -x "$CTDB_ETCDIR/init.d/nfslock" ] && {
        PLATFORM="rhel"
    }

    case $PLATFORM in
        sles)
            case $1 in
                start)
                    service nfsserver start
                    ;;
                stop)
                    service nfsserver stop >/dev/null 2>&1
                    ;;
                restart)
                    set_proc "fs/nfsd/threads" 0
                    service nfsserver stop >/dev/null 2>&1
                    pkill -9 nfsd
                    service nfsserver start
                    ;;
            esac
            ;;
        rhel)
            case $1 in
                start)
                    service nfslock start
                    service nfs start
                    ;;
                stop)
                    service nfs stop
                    service nfslock stop
                    ;;
                restart)
                    set_proc "fs/nfsd/threads" 0
                    service nfs stop >/dev/null 2>&1
                    service nfslock stop >/dev/null 2>&1
                    pkill -9 nfsd
                    service nfslock start
                    service nfs start
                    ;;
            esac
            ;;
        *)
            echo "Unknown platform. NFS is not supported with ctdb"
            exit 1
            ;;
    esac
}

########################################################
# start/stop the nfs lockmanager service on different platforms
########################################################
startstop_nfslock() {
    PLATFORM="unknown"
    [ -x "$CTDB_ETCDIR/init.d/nfsserver" ] && {
        PLATFORM="sles"
    }
    [ -x "$CTDB_ETCDIR/init.d/nfslock" ] && {
        PLATFORM="rhel"
    }

    case $PLATFORM in
        sles)
            # for sles there is no service for lockmanager
            # so we instead just shutdown/restart nfs
            case $1 in
                start)
                    service nfsserver start
                    ;;
                stop)
                    service nfsserver stop >/dev/null 2>&1
                    ;;
                restart)
                    service nfsserver stop
                    service nfsserver start
                    ;;
            esac
            ;;
        rhel)
            case $1 in
                start)
                    service nfslock start
                    ;;
                stop)
                    service nfslock stop >/dev/null 2>&1
                    ;;
                restart)
                    service nfslock stop
                    service nfslock start
                    ;;
            esac
            ;;
        *)
            echo "Unknown platform. NFS locking is not supported with ctdb"
            exit 1
            ;;
    esac
}

# Add address $2/$3 to interface $1 while holding a per-interface
# flock.  Returns non-zero on failure.
add_ip_to_iface()
{
    _iface=$1
    _ip=$2
    _maskbits=$3

    _lockfile="${CTDB_VARDIR}/state/interface_modify_${_iface}.flock"
    [ -f "$_lockfile" ] || touch "$_lockfile"

    (
        # Note: use of return/exit/die() below only gets us out of the
        # sub-shell, which is actually what we want.  That is, the
        # function should just return non-zero.

        flock --timeout 30 0 || \
            die "add_ip_to_iface: unable to get lock for ${_iface}"

        # Ensure interface is up
        ip link set "$_iface" up || \
            die "Failed to bringup interface $_iface"

        ip addr add "$_ip/$_maskbits" brd + dev "$_iface" || \
            die "Failed to add $_ip/$_maskbits on dev $_iface"
    ) <"$_lockfile"

    # Do nothing here - return above only gets us out of the subshell
    # and doing anything here will affect the return code.
}

# Remove address $2/$3 from interface $1 while holding a per-interface
# flock, re-adding any secondaries that the kernel dropped with it.
delete_ip_from_iface()
{
    _iface=$1
    _ip=$2
    _maskbits=$3

    _lockfile="${CTDB_VARDIR}/state/interface_modify_${_iface}.flock"
    [ -f "$_lockfile" ] || touch "$_lockfile"

    (
        # Note: use of return/exit/die() below only gets us out of the
        # sub-shell, which is actually what we want.  That is, the
        # function should just return non-zero.

        flock --timeout 30 0 || \
            die "delete_ip_from_iface: unable to get lock for ${_iface}"

        _im="$_ip/$_maskbits"  # shorthand for readability

        # "ip addr del" will delete all secondary IPs if this is the
        # primary.  To work around this _very_ annoying behaviour we
        # have to keep a record of the secondaries and re-add them
        # afterwards.  Yuck!

        _secondaries=""
        if ip addr list dev "$_iface" primary | grep -Fq "inet $_im " ; then
            _secondaries=$(ip addr list dev "$_iface" secondary | \
                awk '$1 == "inet" { print $2 }')
        fi

        local _rc=0
        ip addr del "$_im" dev "$_iface" || {
            echo "Failed to del $_ip on dev $_iface"
            _rc=1
        }

        if [ -n "$_secondaries" ] ; then
            for _i in $_secondaries ; do
                if ip addr list dev "$_iface" | grep -Fq "inet $_i" ; then
                    echo "Kept secondary $_i on dev $_iface"
                else
                    echo "Re-adding secondary address $_i to dev $_iface"
                    ip addr add "$_i" brd + dev "$_iface" || {
                        echo "Failed to re-add address $_i to dev $_iface"
                        _rc=1
                    }
                fi
            done
        fi

        return $_rc
    ) <"$_lockfile"

    # Do nothing here - return above only gets us out of the subshell
    # and doing anything here will affect the return code.
}

########################################################
# some simple logic for counting events - per eventscript
# usage: ctdb_counter_init [service_name]
#        ctdb_counter_incr [service_name]
#        ctdb_check_counter_limit [limit] [quiet]
# ctdb_check_counter_limit exits with status 1 when count >= limit
#
# The count is stored in unary: it is the size in bytes of the file
# $ctdb_fail_dir/<service_name>.
########################################################
_ctdb_counter_common () {
    _service_name="${1:-${service_name}}"
    _counter_file="$ctdb_fail_dir/$_service_name"
    mkdir -p "${_counter_file%/*}" # dirname
}
ctdb_counter_init () {
    _ctdb_counter_common "$1"

    >"$_counter_file"
}
ctdb_counter_incr () {
    _ctdb_counter_common "$1"

    # unary counting!  printf is used because "echo -n" is not portable
    # and any stray output would corrupt the count.
    printf '1' >>"$_counter_file"
}
ctdb_check_counter_limit () {
    _ctdb_counter_common

    _limit="${1:-${service_fail_limit}}"
    _quiet="$2"

    # unary counting!
    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
    if [ $_size -ge $_limit ] ; then
        echo "ERROR: more than $_limit consecutive failures for $service_name, marking cluster unhealthy"
        exit 1
    elif [ $_size -gt 0 ] && [ -z "$_quiet" ] ; then
        echo "WARNING: less than $_limit consecutive failures ($_size) for $service_name, not unhealthy yet"
    fi
}
# Returns 1 when the counter for $service_name equals $1, else 0.
ctdb_check_counter_equal () {
    _ctdb_counter_common

    _limit=$1

    # unary counting!
    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
    if [ $_size -eq $_limit ] ; then
        return 1
    fi
    return 0
}
# usage: ctdb_check_counter [msg [op [limit [service_name]]]]
# Fails when "count op limit" is true: exits 1 with an error message if
# msg is "error", otherwise silently returns 1.
ctdb_check_counter () {
    _msg="${1:-error}"  # "error"  - anything else is silent on fail
    _op="${2:--ge}"  # an integer operator supported by test
    _limit="${3:-${service_fail_limit}}"
    # All arguments are optional, so never shift more than we were
    # given: that is fatal in some shells and a silent no-op in others.
    if [ $# -ge 3 ] ; then
        shift 3
    else
        shift $#
    fi
    _ctdb_counter_common "$1"

    # unary counting!
    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
    if [ $_size $_op $_limit ] ; then
        if [ "$_msg" = "error" ] ; then
            echo "ERROR: $_limit consecutive failures for $_service_name, marking node unhealthy"
            exit 1
        else
            return 1
        fi
    fi
}

########################################################

ctdb_status_dir="$CTDB_VARDIR/status"
ctdb_fail_dir="$CTDB_VARDIR/failcount"

# Create (and record in $service_state_dir) the state directory for
# service $1, defaulting to $service_name.
ctdb_setup_service_state_dir ()
{
    service_state_dir="$CTDB_VARDIR/state/${1:-${service_name}}"
    mkdir -p "$service_state_dir" || {
        echo "Error creating state dir \"$service_state_dir\""
        exit 1
    }
}

########################################################
# Managed status history, for auto-start/stop

ctdb_managed_dir="$CTDB_VARDIR/managed_history"

_ctdb_managed_common ()
{
    _service_name="${1:-${service_name}}"
    _ctdb_managed_file="$ctdb_managed_dir/$_service_name"
}

ctdb_service_managed ()
{
    _ctdb_managed_common "$@"
    mkdir -p "$ctdb_managed_dir"
    touch "$_ctdb_managed_file"
}

ctdb_service_unmanaged ()
{
    _ctdb_managed_common "$@"
    rm -f "$_ctdb_managed_file"
}

is_ctdb_previously_managed_service ()
{
    _ctdb_managed_common "$@"
    [ -f "$_ctdb_managed_file" ]
}

########################################################
# Check and set status
# Print a one-line report that the node is in state $1, followed by the
# contents of status file $2.
log_status_cat ()
{
    echo "node is \"$1\", \"${script_name}\" reports problem: $(cat "$2")"
}

# Report any recorded status for $script_name: returns 1 if marked
# unhealthy, 2 if marked banned, 0 otherwise.
ctdb_checkstatus ()
{
    if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
        log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
        return 1
    elif [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
        log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
        return 2
    else
        return 0
    fi
}

# Record status $1 for $script_name.  For "unhealthy" or "banned" the
# contents of file $2 are stored as the reason; any other status clears
# both flags.
ctdb_setstatus ()
{
    d="$ctdb_status_dir/$script_name"
    case "$1" in
        unhealthy|banned)
            mkdir -p "$d"
            cat "$2" >"$d/$1"
            ;;
        *)
            for i in "banned" "unhealthy" ; do
                rm -f "$d/$i"
            done
            ;;
    esac
}

##################################################################
# Reconfigure a service on demand

_ctdb_service_reconfigure_common ()
{
    _d="$ctdb_status_dir/${1:-${service_name}}"
    mkdir -p "$_d"
    _ctdb_service_reconfigure_flag="$_d/reconfigure"
}

ctdb_service_needs_reconfigure ()
{
    _ctdb_service_reconfigure_common "$@"
    [ -e "$_ctdb_service_reconfigure_flag" ]
}

ctdb_service_set_reconfigure ()
{
    _ctdb_service_reconfigure_common "$@"
    >"$_ctdb_service_reconfigure_flag"
}

ctdb_service_unset_reconfigure ()
{
    _ctdb_service_reconfigure_common "$@"
    rm -f "$_ctdb_service_reconfigure_flag"
}

# Clear the reconfigure flag, run service_reconfigure() and, if that
# succeeds, reset the failure counter.
ctdb_service_reconfigure ()
{
    echo "Reconfiguring service \"$*\"..."
    ctdb_service_unset_reconfigure "$@"
    service_reconfigure "$@" || return $?
    ctdb_counter_init "$@"
}

# Default service_reconfigure() function.
service_reconfigure ()
{
    service "${1:-$service_name}" restart
}

# Try to become the owner of the per-service reconfigure lock file.
# Fails (status 1) if the file names a PID other than ours that is
# still alive; otherwise records $event_name and our PID in it.
ctdb_reconfigure_try_lock ()
{
    _ctdb_service_reconfigure_common "$@"
    _lock="${_d}/reconfigure_lock"
    touch "$_lock"

    (
        flock 0
        # This is overkill but will work if we need to extend this to
        # allow certain events to run multiple times in parallel
        # (e.g. takeip) and write multiple PIDs to the file.
        read _locker_event
        if [ -n "$_locker_event" ] ; then
            while read _pid ; do
                if [ -n "$_pid" ] && [ "$_pid" != $$ ] && \
                    kill -0 "$_pid" 2>/dev/null ; then
                    exit 1
                fi
            done
        fi

        printf "%s\n%s\n" "$event_name" $$ >"$_lock"
        exit 0
    ) <"$_lock"
}

# Re-emit the output and exit code that "ctdb scriptstatus" last
# recorded for this script's monitor event, then exit.
ctdb_replay_monitor_status ()
{
    echo "Replaying previous status for this script due to reconfigure..."
    # Leading colon (':') is missing in some versions...
    _out=$(ctdb scriptstatus -Y | grep -E "^:?monitor:${script_name}:")
    # Output looks like this:
    # :monitor:60.nfs:1:ERROR:1314764004.030861:1314764004.035514:foo bar:
    # This is the cheapest way of getting fields in the middle.
    set -- $(IFS=":" ; echo $_out)
    _code="$3"
    _status="$4"
    # The error output field can include colons so we'll try to
    # preserve them.  The weak checking at the beginning tries to make
    # this work for both broken (no leading ':') and fixed output.
    _out="${_out%:}"
    _err_out="${_out#*monitor:${script_name}:*:*:*:*:}"
    case "$_status" in
        OK) : ;;  # Do nothing special.
        TIMEDOUT)
            # Recast this as an error, since we can't exit with the
            # correct negative number.
            _code=1
            _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        DISABLED)
            # Recast this as an OK, since we can't exit with the
            # correct negative number.
            _code=0
            _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
            ;;
        *) : ;;  # Must be ERROR, do nothing special.
    esac
    echo "$_err_out"
    exit $_code
}

# For the monitor, ipreallocated and reconfigure events, perform any
# pending reconfigure of service $1 (default $service_name) unless
# another such event currently holds the reconfigure lock.
ctdb_service_check_reconfigure ()
{
    [ -n "$1" ] || set -- "$service_name"

    # We only care about some events in this function.  For others we
    # return now.
    case "$event_name" in
        monitor|ipreallocated|reconfigure) : ;;
        *) return 0 ;;
    esac

    if ctdb_reconfigure_try_lock "$@" ; then
        # No events covered by this function are running, so proceed
        # with gay abandon.
        case "$event_name" in
            reconfigure)
                (ctdb_service_reconfigure "$@")
                exit $?
                ;;
            ipreallocated)
                if ctdb_service_needs_reconfigure "$@" ; then
                    ctdb_service_reconfigure "$@"
                fi
                ;;
            monitor)
                if ctdb_service_needs_reconfigure "$@" ; then
                    ctdb_service_reconfigure "$@"
                    # Given that the reconfigure might not have
                    # resulted in the service being stable yet, we
                    # replay the previous status since that's the best
                    # information we have.
                    ctdb_replay_monitor_status
                fi
                ;;
        esac
    else
        # Somebody else is running an event we don't want to collide
        # with.  We proceed with caution.
        case "$event_name" in
            reconfigure)
                # Tell whoever called us to retry.
                exit 2
                ;;
            ipreallocated)
                # Defer any scheduled reconfigure and just run the
                # rest of the ipreallocated event, as per the
                # eventscript.  There's an assumption here that the
                # event doesn't depend on any scheduled reconfigure.
                # This is true in the current code.
                return 0
                ;;
            monitor)
                # There is most likely a reconfigure in progress so
                # the service is possibly unstable.  As above, we
                # defer any scheduled reconfigured.  We also replay
                # the previous monitor status since that's the best
                # information we have.
                ctdb_replay_monitor_status
                ;;
        esac
    fi
}

##################################################################
# Does CTDB manage this service? - and associated auto-start/stop

# If legacy flag $1 is "yes" and $2 is the service being queried
# ($_service_name), append $2 to $CTDB_MANAGED_SERVICES.
ctdb_compat_managed_service ()
{
    if [ "$1" = "yes" ] && [ "$2" = "$_service_name" ] ; then
        CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
    fi
}

is_ctdb_managed_service ()
{
    _service_name="${1:-${service_name}}"

    # $t is used just for readability and to allow better accurate
    # matching via leading/trailing spaces
    t=" $CTDB_MANAGED_SERVICES "

    # Return 0 if "$_service_name" appears in $t
    if [ "${t#* ${_service_name} }" != "${t}" ] ; then
        return 0
    fi

    # If above didn't match then update $CTDB_MANAGED_SERVICES for
    # backward compatibility and try again.
    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
    ctdb_compat_managed_service "$CTDB_MANAGES_SCP"      "scp"
    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"

    t=" $CTDB_MANAGED_SERVICES "

    # Return 0 if "$_service_name" appears in $t
    [ "${t#* ${_service_name} }" != "${t}" ]
}

# During the monitor event, start a service that has become managed or
# stop one that is no longer managed, then exit.
ctdb_start_stop_service ()
{
    # Do nothing unless configured to...
    [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] || return 0

    _service_name="${1:-${service_name}}"

    [ "$event_name" = "monitor" ] || return 0

    if is_ctdb_managed_service "$_service_name" ; then
        if ! is_ctdb_previously_managed_service "$_service_name" ; then
            echo "Starting service \"$_service_name\" - now managed"
            background_with_logging ctdb_service_start "$_service_name"
            exit $?
        fi
    else
        if is_ctdb_previously_managed_service "$_service_name" ; then
            echo "Stopping service \"$_service_name\" - no longer managed"
            background_with_logging ctdb_service_stop "$_service_name"
            exit $?
        fi
    fi
}

ctdb_service_start ()
{
    # The service is marked managed if we've ever tried to start it.
    ctdb_service_managed "$@"

    # Here we only want $1.  If no argument is passed then
    # service_start needs to know.
    service_start "$@" || return $?

    ctdb_counter_init "$@"
    ctdb_check_tcp_init
}

ctdb_service_stop ()
{
    ctdb_service_unmanaged "$@"
    service_stop "$@"
}

# Default service_start() and service_stop() functions.

# These may be overridden in an eventscript.  When overriding, the
# following convention must be followed.
# If these functions are called with no arguments then they may use
# internal logic to determine whether the service is managed and,
# therefore, whether they should take any action.  However, if the
# service name is specified as an argument then an attempt must be
# made to start or stop the service.  This is because the
# auto-start/stop code calls them with the service name as an argument.
service_start ()
{
    service "${1:-${service_name}}" start
}
service_stop ()
{
    service "${1:-${service_name}}" stop
}

##################################################################

# Handle the "status" and "setstatus" events, which are common to all
# event scripts; both exit the script.  Other events fall through.
ctdb_standard_event_handler ()
{
    case "$1" in
        status)
            ctdb_checkstatus
            exit
            ;;
        setstatus)
            shift
            ctdb_setstatus "$@"
            exit
            ;;
    esac
}

# iptables doesn't like being re-entered, so flock-wrap it.
iptables()
{
    flock -w 30 "$CTDB_VARDIR/iptables-ctdb.flock" /sbin/iptables "$@"
}

########################################################
# tickle handling
########################################################

# Temporary directory for tickles.
tickledir="$CTDB_VARDIR/state/tickles"
mkdir -p "$tickledir"

# Bring ctdb's tickle list for port $1 into line with the ESTABLISHED
# connections to the public IPs this node currently holds.
update_tickles ()
{
    _port="$1"

    mkdir -p "$tickledir" # Just in case

    # Who am I?
    _pnn=$(ctdb pnn) ; _pnn=${_pnn#PNN:}

    # What public IPs do I hold?
    _ips=$(ctdb -Y ip | awk -F: -v pnn="$_pnn" '$3 == pnn {print $2}')

    # IPs as a regexp choice.  The doubled backslashes survive awk's
    # own escape processing of -v values as a single backslash.
    _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"

    # Record connections to our public IPs in a temporary file
    _my_connections="${tickledir}/${_port}.connections"
    rm -f "$_my_connections"
    netstat -tn |
        awk -v destpat="^${_ipschoice}:${_port}\$" \
            '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
        sort >"$_my_connections"

    # Record our current tickles in a temporary file
    _my_tickles="${tickledir}/${_port}.tickles"
    rm -f "$_my_tickles"
    for _i in $_ips ; do
        ctdb -Y gettickles "$_i" "$_port" |
            awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
    done |
        sort >"$_my_tickles"

    # Add tickles for connections that we haven't already got tickles for
    comm -23 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
            ctdb addtickle "$_src" "$_dst"
        done

    # Remove tickles for connections that are no longer there
    comm -13 "$_my_connections" "$_my_tickles" |
        while read _src _dst ; do
            ctdb deltickle "$_src" "$_dst"
        done

    rm -f "$_my_connections" "$_my_tickles"
}

########################################################
# load a site local config file
########################################################

[ -n "$CTDB_RC_LOCAL" ] && [ -x "$CTDB_RC_LOCAL" ] && {
    . "$CTDB_RC_LOCAL"
}

[ -x "$CTDB_BASE/rc.local" ] && {
    . "$CTDB_BASE/rc.local"
}

[ -d "$CTDB_BASE/rc.local.d" ] && {
    for i in "$CTDB_BASE"/rc.local.d/* ; do
        [ -x "$i" ] && . "$i"
    done
}

# We'll call this here to ensure $CTDB_CURRENT_DEBUGLEVEL is set.
# This gives us a chance to override the debug level using a file in
# $CTDB_BASE/rc.local.d/.
ctdb_set_current_debuglevel

script_name="${0##*/}"       # basename
service_name="$script_name"  # default is just the script name
service_fail_limit=1
event_name="$1"