1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-26 10:04:02 +03:00
Martin Schwenke 500c6e194b ctdb-scripts: Change statd-callout to be more scalable
Updating ctdb.tdb on each add-client, del-client and each delete
during notify was too ambitious.  Persistent transactions do not
perform well enough to do this.

Revert to having add-client and del-client create touch files.  Each
monitor event calls "statd-callout update" to convert touch files into
ctdb.tdb records.

Update testcases to do the "update" and add an extra test.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Pair-programmed-with: Amitay Isaacs <amitay@gmail.com>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
2015-03-04 10:42:27 +01:00

255 lines
6.0 KiB
Bash
Executable File

#!/bin/sh
# script to manage nfs in a clustered environment
#
# CTDB event script for an NFS-Ganesha server.  The event name is taken
# from "$1"; see the case statement at the bottom of the file.

# Work out the CTDB base directory unless the caller already supplied one.
# It is the parent of the (physical) directory that contains this script.
# Everything is quoted so that an installation path containing whitespace
# works, and the assignment is kept separate from "export" because some
# /bin/sh implementations word-split the value in "export VAR=$(...)".
if [ -z "$CTDB_BASE" ] ; then
    CTDB_BASE=$(_d=$(dirname "$0") && cd -P "$_d" && dirname "$PWD")
    export CTDB_BASE
fi

# Shared CTDB helper library; provides the helper functions used below
# that are not defined in this file.
. "${CTDB_BASE}/functions"

# Recovery directory used by Ganesha (managed as a symlink by
# create_ganesha_recdirs) and the event-marker directory below it.
GANRECDIR="/var/lib/nfs/ganesha"
GANRECDIR2="/var/lib/nfs/ganesha/recevents"
# GPFS command used to query the state of the cluster filesystem daemon.
GPFS_STATE="/usr/lpp/mmfs/bin/mmgetstate"
# Node-local directory inspected by monitor_ganesha_nfsd.
GANRECDIR3="/var/lib/nfs/ganesha_local"
# Start the Ganesha service.
#
# The daemon is stopped first and then started, so this always results in
# a fresh start.  Afterwards the tcp_tw_recycle sysctl is set to 1 through
# the set_proc helper; the function's exit status is that of set_proc.
#
# NOTE(review): tcp_tw_recycle no longer exists on newer Linux kernels --
# confirm how set_proc behaves when the proc entry is missing.
service_start ()
{
    startstop_ganesha "stop"
    startstop_ganesha "start"

    set_proc "sys/net/ipv4/tcp_tw_recycle" "1"
}
# Stop the Ganesha service.
#
# Simply asks the startstop_ganesha helper to stop the daemon and passes
# its exit status back to the caller.
service_stop ()
{
    startstop_ganesha "stop"
}
# Reconfigure hook, run after the public IP layout has changed.
#
# If an executable statd-callout script exists in $CTDB_BASE it is run in
# the background with the single argument "notify", with all of its output
# discarded.  This function does not restart ganesha itself.
#
# Returns: 0 when the callout was launched (its own result is not waited
#          for), non-zero when no executable callout is present.
service_reconfigure ()
{
    # Quoted so that a CTDB_BASE containing whitespace still works.
    _statd_callout="${CTDB_BASE}/statd-callout"

    [ -x "$_statd_callout" ] && {
        "$_statd_callout" notify &
    } >/dev/null 2>&1
}
# Load the "nfs" configuration via the loadconfig helper.
loadconfig "nfs"

# The cluster filesystem type falls back to "gpfs" when it is unset or
# empty; the service name is derived from it.
: "${CTDB_CLUSTER_FILESYSTEM_TYPE:=gpfs}"
service_name="nfs-ganesha-${CTDB_CLUSTER_FILESYSTEM_TYPE}"

# This script only applies when the NFS server mode is "ganesha".
# CTDB_NFS_SERVER_MODE takes precedence over NFS_SERVER_MODE.
case "${CTDB_NFS_SERVER_MODE:-${NFS_SERVER_MODE}}" in
    ganesha)
        :
        ;;
    *)
        exit 0
        ;;
esac

ctdb_setup_service_state_dir

ctdb_start_stop_service

# Nothing more to do when the service is not managed by CTDB.
if ! is_ctdb_managed_service ; then
    exit 0
fi

ctdb_service_check_reconfigure

# File in which the GPFS node number is cached (see get_nodenum).
# service_state_dir is not set in this file -- presumably it comes from
# ctdb_setup_service_state_dir; confirm.
nodenum_file="${service_state_dir}/gpfs_nodenum"
# Print the state of the cluster filesystem on stdout.
#
# Only "gpfs" is supported: the command named by $GPFS_STATE is run, its
# first three output lines are skipped and the third column of what
# remains is printed.  Any other filesystem type is reported through die.
get_cluster_fs_state ()
{
    if [ "$CTDB_CLUSTER_FILESYSTEM_TYPE" = "gpfs" ] ; then
        STATE=$($GPFS_STATE | awk 'NR <= 3 {next} {printf "%-6s", $3}')
        # Deliberately unquoted: word splitting removes the padding that
        # the awk "%-6s" format adds.
        echo $STATE
    else
        die "File system $CTDB_CLUSTER_FILESYSTEM_TYPE not supported"
    fi
}
# (Re)create the node number cache file.
#
# Runs "mmlsconfig myNodeConfigNumber", takes the second field of its
# output and writes it to $nodenum_file, overwriting any previous content.
# Globals: nodenum_file (read), NNUM (written)
create_nodenum_file()
{
    NNUM=$(/usr/lpp/mmfs/bin/mmlsconfig myNodeConfigNumber | awk '{print $2}')
    # $NNUM is left unquoted so the value lands on a single line; the
    # redirection target is quoted so that a state directory containing
    # whitespace does not cause an "ambiguous redirect" failure.
    echo $NNUM > "$nodenum_file"
}
# Print the cached node number on stdout.
#
# The cache file $nodenum_file is created on demand through
# create_nodenum_file when it does not exist yet, then its content is
# printed.  The exit status is that of cat.
get_nodenum()
{
    # Quoted throughout: an unquoted path with whitespace makes the test
    # fail with a syntax error and cat look for the wrong files.
    if [ ! -f "$nodenum_file" ]; then
        create_nodenum_file
    fi
    cat "$nodenum_file"
}
# Make sure the Ganesha recovery directories exist.
#
# $GANRECDIR is maintained as a symlink to a sub-directory
# ($CTDB_GANESHA_REC_SUBDIR, default ".ganesha") of the first mounted
# filesystem of type $CTDB_CLUSTER_FILESYSTEM_TYPE (first after sorting
# the mount output).  $GANRECDIR2 and $GANRECDIR3 are then created as
# directories.
#
# If no filesystem of that type is mounted a message is printed and the
# whole script exits with status 0.
create_ganesha_recdirs ()
{
    [ -n "$CTDB_GANESHA_REC_SUBDIR" ] || CTDB_GANESHA_REC_SUBDIR=".ganesha"

    _mounts=$(mount -t "$CTDB_CLUSTER_FILESYSTEM_TYPE")
    if [ -z "$_mounts" ]; then
        echo "startup $CTDB_CLUSTER_FILESYSTEM_TYPE not ready"
        exit 0
    fi

    # Third field of a mount output line is the mount point.
    _mntpt=$(echo "$_mounts" | sort | awk 'NR == 1 {print $3}')
    _link_dst="${_mntpt}/${CTDB_GANESHA_REC_SUBDIR}"
    mkdir -vp "$_link_dst"

    # Remove whatever is in the way of the link.  The symlink case is
    # tested first with -L because -e follows links: a dangling link to
    # an old location would otherwise look like "nothing there" and the
    # ln below would fail with "File exists".
    if [ -L "$GANRECDIR" ] ; then
        _t=$(readlink "$GANRECDIR")
        if [ "$_t" != "$_link_dst" ] ; then
            rm -v "$GANRECDIR"
        fi
    elif [ -e "$GANRECDIR" ] ; then
        rm -vrf "$GANRECDIR"
    fi

    # This is not an "else".  It also re-creates the link if it was
    # removed above!
    if [ ! -e "$GANRECDIR" ] && [ ! -L "$GANRECDIR" ] ; then
        ln -sv "$_link_dst" "$GANRECDIR"
    fi

    mkdir -p "$GANRECDIR2"
    mkdir -p "$GANRECDIR3"
}
# Health check for the Ganesha NFS daemon, used by the "monitor" event.
#
# Two checks are made, each with its own failure counter:
#
#  1. Process check: the PID in /var/run/ganesha.pid must belong to a
#     process whose command line mentions the Ganesha binary.  If it does
#     not, and the cluster filesystem state is "active", a restart is
#     attempted and the "<service_name>_process" counter is incremented.
#
#  2. Progress check (only when the filesystem is "active" and
#     CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK is not "yes"): the files in
#     $GANRECDIR3 are inspected.  A restart is triggered when at least
#     MAXREDS file names contain "red", or when the leading "_"-separated
#     field of the newest file name (presumably an epoch timestamp written
#     by Ganesha -- confirm) is at least MAXSTALL seconds old.
#
# Globals written: service_name (left set to the "..._service" name on
#   return), PIDFILE, CUR_STATE, GANESHA, PID, MAXREDS, MAXSTALL, RESTART,
#   NUMREDS, LASTONE, TNOW, TSTALL.
monitor_ganesha_nfsd ()
{
    create_ganesha_recdirs

    service_name=${service_name}_process

    PIDFILE="/var/run/ganesha.pid"
    CUR_STATE=$(get_cluster_fs_state)
    GANESHA="/usr/bin/$CTDB_CLUSTER_FILESYSTEM_TYPE.ganesha.nfsd"

    if { read PID < "$PIDFILE" && \
             grep "$GANESHA" "/proc/$PID/cmdline" ; } >/dev/null 2>&1 ; then
        ctdb_counter_init "$service_name"
    else
        # $CUR_STATE is quoted: it may be empty or contain several words
        # when the state query fails, which would otherwise be a test
        # syntax error.
        if [ "$CUR_STATE" = "active" ]; then
            echo "Trying fast restart of NFS service"
            startstop_ganesha restart
            ctdb_counter_incr "$service_name"
            ctdb_check_counter "error" "-ge" "6" "$service_name"
        fi
    fi

    service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"_service
    # check that NFS is posting forward progress
    if [ "$CUR_STATE" = "active" ] && \
           [ "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then
        MAXREDS=2
        MAXSTALL=120
        RESTART=0

        NUMREDS=$(ls "$GANRECDIR3" | grep -c "red")
        # Newest file first; keep only what precedes the first "_".
        LASTONE=$(ls -t "$GANRECDIR3" | sed 's/_/ /' | awk 'NR == 1 {print $1}')
        # Beware of startup
        if [ -z "$LASTONE" ] ; then
            LASTONE=$(date +"%s")
        fi
        TNOW=$(date +"%s")
        TSTALL=$(($TNOW - $LASTONE))

        if [ "$NUMREDS" -ge "$MAXREDS" ] ; then
            echo restarting because of $NUMREDS red conditions
            RESTART=1
            ctdb_counter_incr "$service_name"
            ctdb_check_counter "error" "-ge" "6" "$service_name"
        fi
        if [ "$TSTALL" -ge "$MAXSTALL" ] ; then
            echo restarting because of $TSTALL second stall
            RESTART=1
            ctdb_counter_incr "$service_name"
            ctdb_check_counter "error" "-ge" "6" "$service_name"
        fi

        if [ "$RESTART" -gt 0 ] ; then
            startstop_ganesha restart
        else
            ctdb_counter_init "$service_name"
        fi
    fi
}
############################################################
# Event dispatch.
#
# "$1" is the event name.  For takeip/releaseip, "$2", "$3" and "$4" are
# folded into the marker file name; presumably they are the interface,
# the IP address and the netmask bits as passed by ctdbd -- confirm
# against the CTDB event script documentation.
case "$1" in
init)
# Nothing is done for "init".
# NOTE(review): an earlier comment here said statd state is read from the
# persistent database, but no code in this branch does that.
;;
startup)
# Prepare the recovery directories, start the service and (re)create
# the node number cache file.
create_ganesha_recdirs
ctdb_service_start
create_nodenum_file
;;
shutdown)
ctdb_service_stop
;;
takeip)
# When "$2" is non-empty on a gpfs cluster, drop an empty marker file
# in $GANRECDIR2 named
#   <event>_<epoch seconds>_<node number>_<$3>_<$4>_<$2>
# The reader of these markers is not visible in this file.
if [ -n "$2" ] ; then
case $CTDB_CLUSTER_FILESYSTEM_TYPE in
gpfs)
NNUM=$(get_nodenum)
TDATE=`date +"%s"`
TOUCHTGT=$1"_"$TDATE"_"$NNUM"_"$3"_"$4"_"$2
touch $GANRECDIR2/$TOUCHTGT
;;
esac
fi
# Flag the service for reconfiguration in every case.
ctdb_service_set_reconfigure
;;
releaseip)
# Same marker as for takeip, plus a second marker with the same name
# prefixed by "my".
if [ -n "$2" ] ; then
case $CTDB_CLUSTER_FILESYSTEM_TYPE in
gpfs)
NNUM=$(get_nodenum)
TDATE=`date +"%s"`
TOUCHTGT=$1"_"$TDATE"_"$NNUM"_"$3"_"$4"_"$2
touch $GANRECDIR2/$TOUCHTGT
TOUCHTGT="my"$TOUCHTGT
touch $GANRECDIR2/$TOUCHTGT
;;
esac
fi
# Flag the service for reconfiguration in every case.
ctdb_service_set_reconfigure
;;
monitor)
# Check that directories for shares actually exist.
# Unless CTDB_NFS_SKIP_SHARE_CHECK is "yes", the second double-quote
# delimited field of every "Path" line in the Ganesha exports file is
# fed to ctdb_check_directories; if that fails, the script exits with
# its status.
[ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
grep Path /etc/ganesha/$CTDB_CLUSTER_FILESYSTEM_TYPE.ganesha.exports.conf |
cut -f2 -d\" | sort -u | ctdb_check_directories
} || exit $?
# 2049 is passed as the port to update_tickles.
update_tickles 2049
nfs_update_lock_info
# Check that statd responds to RPC requests.
# If statd is not running we try to restart it.
# We only do this IF we have an rpc.statd command.
# For platforms where rpc.statd does not exist, we skip
# the check completely.
# The arguments after "statd" are interpreted by nfs_check_rpc_service;
# presumably they are (failure count test, action) pairs -- confirm in
# the functions file.
p="rpc.statd"
which $p >/dev/null 2>/dev/null && \
nfs_check_rpc_service "statd" \
% 10 "verbose restart:b unhealthy" \
-ge 6 "verbose unhealthy" \
-eq 4 "verbose restart" \
-eq 2 "restart:b"
# The Ganesha daemon check can be disabled with
# CTDB_SKIP_GANESHA_NFSD_CHECK=yes.
if [ "$CTDB_SKIP_GANESHA_NFSD_CHECK" != "yes" ] ; then
monitor_ganesha_nfsd
fi
# rquotad is sometimes not started correctly on RHEL5.
# It is not a critical service, so we don't flag the node as unhealthy.
nfs_check_rpc_service "rquotad" \
-gt 0 "verbose restart:b"
;;
*)
# Every other event is handed to the generic handler.
ctdb_standard_event_handler "$@"
;;
esac
# Reaching this point always reports success.
exit 0