#!/usr/bin/env bash
#
# Kill all the processes/services except glusterd
#
# Usage: ./extras/stop-all-gluster-processes.sh [-g] [-h]
# options:
# -g Terminate in graceful mode
# -h Show this message, then exit
#
# eg:
# 1. ./extras/stop-all-gluster-processes.sh
# 2. ./extras/stop-all-gluster-processes.sh -g
#
# By default, this script executes in force mode, i.e. all of the brick, gsyncd
# and other glustershd services/processes are killed without checking for
# ongoing tasks such as geo-rep, self-heal and rebalance, which may lead
# to inconsistency after the node is brought back.
#
# On specifying the '-g' option this script works in graceful mode: to maintain
# data consistency, the script fails with one of the exit codes listed below
# if any of the gluster processes are busy doing their jobs.
#
# The author of page [1] proposes restricting user-defined exit codes to the
# range 64 - 113. See the link for the reasoning behind that choice.
#
# The exit code returned by stop-all-gluster-processes.sh:
# 0 No errors/Success
# 64 Rebalance is in progress
# 65 Self-Heal is in progress
# 66 Tier daemon running on this node
# 127 option not found
#
# [1] http://www.tldp.org/LDP/abs/html/exitcodes.html
# global: number of failed kill attempts; main exits with this value
errors=0
# find the mounts and return their pids
#
# Scans /proc/mounts for fuse.glusterfs entries and, for each of them, looks
# up the process whose command line carries the same volfile-server,
# volfile-id and mount point.
#
# Outputs: the matching pids on stdout as one whitespace-separated list
#          (an empty line when no gluster mount is found)
get_mount_pids()
{
    local opts
    local pid=""
    local -a volinfo

    # every entry has the form "<source>:/<mountpoint>"; the split below
    # assumes the mount source itself looks like <server>:/<volume>
    for opts in $(grep -w fuse.glusterfs /proc/mounts | awk '{print $1":/"$2}')
    do
        # turn each ":/" into a blank so that read fills
        # volinfo[0]=server, volinfo[1]=volume, volinfo[2]=mount point
        IFS=' ' read -r -a volinfo <<< "${opts//:\// }"

        # -F: host names and paths are matched literally, not as regexes
        pid+="$(ps -Ao pid,args |
                grep -Fw -- "volfile-server=${volinfo[0]}" |
                grep -Fw -- "volfile-id=/${volinfo[1]}" |
                grep -Fw -- "${volinfo[2]}" |
                awk '{print $1}') "
    done

    echo "${pid}"
}
# handle mount processes i.e. 'glusterfs'
#
# Arguments: $1 - signal name without the SIG prefix (e.g. TERM, KILL)
# Outputs:   one progress line on stdout per pid that is signalled
kill_mounts()
{
    local signal=${1}
    local pid

    # get_mount_pids prints a whitespace-separated list, so the unquoted
    # command substitution is split into one pid per iteration on purpose
    for pid in $(get_mount_pids)
    do
        echo "sending SIG${signal} to mount process with pid: ${pid}"
        kill "-${signal}" "${pid}"
    done
}
# handle brick processes and node services
#
# Signals every process whose pid is recorded in a '*.pid' file found
# under /var/run/gluster/.
#
# Arguments: $1 - signal name without the SIG prefix (e.g. TERM, KILL)
# Outputs:   one progress line on stdout per pid that is signalled
kill_bricks_and_services()
{
    local signal=${1}
    local pidfile
    local pid

    # NUL-delimited so that unusual file names cannot be split apart
    while IFS= read -r -d '' pidfile
    do
        # a pid file may be empty or may vanish while we iterate; skip it
        # rather than calling kill without a target
        pid=$(cat -- "${pidfile}" 2> /dev/null)
        if [[ -z "${pid}" ]]
        then
            continue
        fi

        echo "sending SIG${signal} to pid: ${pid}"
        kill "-${signal}" "${pid}"
    done < <(find /var/run/gluster/ -name '*.pid' -print0)
}
# for geo-replication, only 'monitor' has a pid file written; the other
# processes have no pid file, so find them through 'ps' and handle them here
#
# Globals:   errors (incremented by one when kill fails)
# Arguments: $1 - signal name without the SIG prefix (e.g. TERM, KILL)
# Outputs:   one progress line on stdout when at least one process matched
kill_georep_gsync()
{
    local signal=${1}
    local gsyncpid

    # FIXME: add strict/better check
    gsyncpid=$(ps -Ao pid,args | grep gluster | grep gsync | awk '{print $1}')

    if [[ -n "${gsyncpid}" ]]
    then
        echo "sending SIG${signal} to geo-rep gsync process ${gsyncpid}"
        # gsyncpid may hold several pids; it is left unquoted so that each
        # one reaches kill as a separate argument
        # shellcheck disable=SC2086
        kill "-${signal}" ${gsyncpid} || errors=$((errors + 1))
    fi
}
# check if all processes are ready to die
#
# Walks every volume reported by 'gluster vol list' and stops at the first
# background task that is still running.
#
# Outputs: a single status code on stdout
#            0  - nothing is in progress
#            66 - tier status reports "in progress" for localhost
#            64 - rebalance status reports "in progress"
#            65 - heal info reports a non-zero number of entries
check_background_tasks()
{
    local volumes
    local volname
    local pending
    local quit=0

    volumes=$(gluster vol list)

    # the volume list is split on whitespace on purpose
    for volname in ${volumes}
    do
        # tiering
        if gluster volume tier "${volname}" status 2> /dev/null |
               grep "localhost" | grep -q "in progress"
        then
            quit=66
            break
        fi

        # rebalance
        if gluster volume rebalance "${volname}" status 2> /dev/null |
               grep -q "in progress"
        then
            quit=64
            break
        fi

        # self heal: add up the fourth field of every "Number of entries"
        # line; "+ 0" makes awk print 0 instead of nothing when no line matched
        pending=$(gluster volume heal "${volname}" info |
                      grep "Number of entries" |
                      awk '{ sum += $4 } END { print sum + 0 }')
        if [[ "${pending}" -gt 0 ]]
        then
            quit=65
            break
        fi

        # geo-rep, snapshot and quota don't need grace checks,
        # as they ensure consistency on force kills
    done

    echo "${quit}"
}
# print the command line help on stdout; $0 is expanded to the name the
# script was invoked with
usage()
{
    cat <<EOM
Usage: $0 [-g] [-h]
options:
-g Terminate in graceful mode
-h Show this message, then exit
eg:
1. $0
2. $0 -g
EOM
}
# entry point: parse the options, then stop every gluster process
#
# Options:
#   -g  graceful mode: when check_background_tasks prints a non-zero status,
#       exit with that status before anything is signalled
#   -h  print the usage text and exit 0
#   any other option prints the usage text on stderr and exits 127
#
# Mount, geo-rep gsync, brick and service processes are first sent SIGTERM
# and, after a five second pause, SIGKILL.
#
# Globals: errors (read) - used as the final exit status
main()
{
    local opt
    local quit

    while getopts "gh" opt
    do
        case "${opt}" in
            g)
                # graceful mode
                quit=$(check_background_tasks)
                if [[ "${quit}" -ne 0 ]]
                then
                    exit "${quit}"
                fi
                # else safe to kill
                ;;
            h)
                usage
                exit 0
                ;;
            *)
                usage >&2
                exit 127
                ;;
        esac
    done
    # remove all the options that have been parsed by getopts
    shift $((OPTIND - 1))

    kill_mounts TERM
    kill_georep_gsync TERM
    kill_bricks_and_services TERM

    sleep 5
    echo ""

    # still not Terminated? let's pass SIGKILL
    kill_mounts KILL
    kill_georep_gsync KILL
    kill_bricks_and_services KILL

    # fall back to 0 so that an unset counter cannot break 'exit'
    exit "${errors:-0}"
}
# hand the command line arguments to main unchanged
main "$@"