Add OCF-compliant resource agents for glusterd and volumes

These resource agents plug glusterd into Open Cluster Framework
(OCF) compliant cluster resource managers, like Pacemaker.

The glusterd RA is fairly trivial; it simply manages the glusterd daemon
like any upstart or systemd job would, except that Pacemaker can
do it in a cluster-aware fashion.

The volume RA is a bit more involved: it starts a volume and monitors the
daemons of its individual bricks in a cluster-aware fashion, recovering
bricks when their processes fail.
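
To make that concrete, the per-brick check essentially reads the brick PID
files glusterd keeps for the volume and probes each process. The snippet
below is only an illustrative, standalone sketch of what the volume RA
(included further down) does on every monitor operation; the /etc/glusterd
path and the "demo" volume name are assumptions that depend on the local
build and configuration:

VOLNAME="demo"
VOLDIR="/etc/glusterd/vols/${VOLNAME}"   # really @sysconfdir@/glusterd/vols/<volname>
HOST=`hostname -s`
# Bricks of this volume that live on the local host
BRICKS=`sed -n -e "s/^brick-.\+=${HOST}://p" < "${VOLDIR}/info"`
for brick in ${BRICKS}; do
	pidfile="${VOLDIR}/run/${HOST}${brick}.pid"
	# A brick is considered healthy if its PID file exists
	# and the recorded process still responds to signal 0
	if [ -e "$pidfile" ] && kill -s 0 `cat "$pidfile"` 2>/dev/null; then
		echo "brick ${HOST}:${brick} is running"
	else
		echo "brick ${HOST}:${brick} is down"
	fi
done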

Note that this does NOT imply people would deploy GlusterFS servers
in pairs, or anything of that nature. Pacemaker has the ability to
deploy cluster resources as clones, meaning glusterd and volumes would be
configured as follows in a Pacemaker cluster:

primitive p_glusterd ocf:glusterfs:glusterd \
	op monitor interval="30"
primitive p_volume_demo ocf:glusterfs:volume \
	params volname="demo" \
	op monitor interval="10"
clone cl_glusterd p_glusterd \
	meta interleave="true"
clone cl_volume_demo p_volume_demo \
	meta interleave="true" ordered="true"
colocation c_volume_on_glusterd inf: cl_volume_demo cl_glusterd
order o_glusterd_before_volume 0: cl_glusterd cl_volume_demo
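
One way to apply this configuration, assuming the crm shell (crmsh) is in
use and the snippet has been saved to a file (the file name here is
arbitrary), is:

# load the resource definitions into the CIB
crm configure load update gluster.crm
# and double-check the result
crm configure show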

The cluster status then looks as follows (in a 4-node cluster; note that
the configuration above can be applied, unchanged, to a cluster of any
number of nodes):

============
Last updated: Fri Mar 30 10:54:50 2012
Last change: Thu Mar 29 17:20:17 2012 via crmd on gluster02.h
Stack: openais
Current DC: gluster03.h	- partition with quorum
Version: 1.1.6-3.el6-a02c0f19a00c1eb2527ad38f146ebc0834814558
4 Nodes configured, 4 expected votes
8 Resources configured.
============

Online: [ gluster02.h gluster03.h gluster04.h gluster01.h ]

 Clone Set: cl_glusterd [p_glusterd]
     Started: [ gluster02.h gluster03.h gluster04.h gluster01.h ]
 Clone Set: cl_volume_demo [p_volume_demo]
     Started: [ gluster01.h gluster02.h gluster03.h gluster04.h ]

This is also a way of providing automatic glusterd and brick recovery
on systems where neither upstart nor systemd is available.
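
As a purely illustrative smoke test of that recovery, one can kill a brick
daemon on one node and watch the next monitor operation bring it back (the
process pattern below assumes the "demo" volume from the example above):

# on any one node, kill a local brick process for the volume
pkill -f 'glusterfsd.*demo'
# within the 10-second monitor interval, the volume RA detects the
# dead brick and recovers it via "gluster volume start demo force"
crm_mon -1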

Change-Id: Ied46657bdfd2dd72dc97cf41b0eb7adcecacd18f
BUG: 869559
Signed-off-by: Florian Haas <florian@hastexo.com>
Reviewed-on: http://review.gluster.org/3043
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Anand Avati <avati@redhat.com>
Florian Haas 2012-02-20 16:25:43 +01:00 committed by Anand Avati
parent 0d868525d2
commit 61c1d77f22
5 changed files with 481 additions and 1 deletions

configure.ac

@@ -125,6 +125,9 @@ AC_CONFIG_FILES([Makefile
extras/init.d/glusterd-SuSE
extras/benchmarking/Makefile
extras/hook-scripts/Makefile
extras/ocf/Makefile
extras/ocf/glusterd
extras/ocf/volume
contrib/fuse-util/Makefile
contrib/uuid/uuid_types.h
xlators/nfs/Makefile
@@ -168,6 +171,12 @@ AC_ARG_WITH(launchddir,
[launchddir='/Library/LaunchDaemons'])
AC_SUBST(launchddir)
AC_ARG_WITH([ocf],
[AS_HELP_STRING([--with-ocf], [build OCF-compliant cluster resource agents])],
,
[with_ocf=no])
AM_CONDITIONAL(WITH_OCF, [ test "$with_ocf" = "yes" ])
# LEX needs a check
AC_PROG_LEX
if test "x${LEX}" != "xflex" -a "x${FLEX}" != "xlex"; then
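
The resulting switch is off by default ([with_ocf=no]); building the agents
therefore requires something along these lines (an illustrative sketch, all
other configure options omitted):

./configure --with-ocf
make && make install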

extras/Makefile.am

@@ -3,7 +3,7 @@ docdir = $(datadir)/doc/glusterfs/
 EditorModedir = $(docdir)/
 EditorMode_DATA = glusterfs-mode.el glusterfs.vim
-SUBDIRS = init.d benchmarking hook-scripts
+SUBDIRS = init.d benchmarking hook-scripts ocf
 confdir = $(sysconfdir)/glusterfs
 conf_DATA = glusterfs-logrotate

extras/ocf/Makefile.am Normal file

@@ -0,0 +1,13 @@
EXTRA_DIST = glusterd.in volume.in
if WITH_OCF
# The root of the OCF resource agent hierarchy
# Per the OCF standard, it's always "lib",
# not "lib64" (even on 64-bit platforms).
ocfdir = $(prefix)/lib/ocf
# The glusterfs provider directory
radir = $(ocfdir)/resource.d/$(PACKAGE_NAME)
ra_SCRIPTS = glusterd volume
endif
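
With that in place the agents install under the OCF root, typically
/usr/lib/ocf/resource.d/glusterfs/ for a /usr prefix (an assumption; the
directory follows $(prefix) and $(PACKAGE_NAME)), which is where Pacemaker
looks them up as ocf:glusterfs:glusterd and ocf:glusterfs:volume:

ls /usr/lib/ocf/resource.d/glusterfs/
# glusterd  volume
crm ra list ocf glusterfs
crm ra meta ocf:glusterfs:volume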

extras/ocf/glusterd.in Executable file

@@ -0,0 +1,212 @@
#!/bin/sh
#
# glusterd
#
# Description: Manages a glusterd server as a (typically cloned)
# HA resource
#
# Authors: Florian Haas (hastexo Professional Services GmbH)
#
# License: GNU General Public License (GPL)
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# Convenience variables
# When sysconfdir and localstatedir aren't passed in as
# configure flags, they're defined in terms of prefix
prefix=@prefix@
#######################################################################
OCF_RESKEY_binary_default="glusterd"
OCF_RESKEY_pid_default="@localstatedir@/run/glusterd.pid"
OCF_RESKEY_socket_default=""
OCF_RESKEY_additional_parameters_default=""
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
glusterd_meta_data() {
cat <<EOF
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="glusterd" version="0.1">
<version>0.1</version>
<longdesc lang="en">
Manages the glusterd management daemon as a (typically cloned) HA resource.
</longdesc>
<shortdesc lang="en">Manages a Gluster server</shortdesc>
<parameters>
<parameter name="binary">
<longdesc lang="en">
Name of the glusterd executable. Specify a full absolute
path if the binary is not in your \$PATH.
</longdesc>
<shortdesc lang="en">glusterd executable</shortdesc>
<content type="string" default="$OCF_RESKEY_binary_default"/>
</parameter>
<parameter name="pid">
<longdesc lang="en">
Path to the glusterd PID file.
</longdesc>
<shortdesc lang="en">PID file</shortdesc>
<content type="string" default="$OCF_RESKEY_pid_default"/>
</parameter>
<parameter name="socket">
<longdesc lang="en">
Path to the glusterd UNIX socket file. If unspecified,
glusterd will not listen on any socket.
</longdesc>
<shortdesc lang="en">Socket file</shortdesc>
<content type="string"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="monitor" timeout="20" interval="10" />
<action name="reload" timeout="20" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="20" />
</actions>
</resource-agent>
EOF
}
glusterd_start() {
local glusterd_options
# exit immediately if configuration is not valid
glusterd_validate_all || exit $?
# if resource is already running, bail out early
if glusterd_monitor; then
ocf_log info "Resource is already running"
return $OCF_SUCCESS
fi
# actually start up the resource here (make sure to immediately
# exit with an $OCF_ERR_ error code if anything goes seriously
# wrong)
glusterd_options="-p $OCF_RESKEY_pid"
if [ -n "$OCF_RESKEY_socket" ]; then
glusterd_options="$glusterd_options -S $OCF_RESKEY_socket"
fi
if [ -n "$OCF_RESKEY_additional_parameters" ]; then
glusterd_options="$glusterd_options $OCF_RESKEY_additional_parameters"
fi
ocf_run $OCF_RESKEY_binary $glusterd_options || exit $OCF_ERR_GENERIC
# After the resource has been started, check whether it started up
# correctly. If the resource starts asynchronously, the agent may
# spin on the monitor function here -- if the resource does not
# start up within the defined timeout, the cluster manager will
# consider the start action failed
while ! glusterd_monitor; do
ocf_log debug "Resource has not started yet, waiting"
sleep 1
done
# only return $OCF_SUCCESS if _everything_ succeeded as expected
return $OCF_SUCCESS
}
glusterd_stop() {
local rc
local pid
# exit immediately if configuration is not valid
glusterd_validate_all || exit $?
glusterd_monitor
rc=$?
case "$rc" in
"$OCF_SUCCESS")
# Currently running. Normal, expected behavior.
ocf_log debug "Resource is currently running"
;;
"$OCF_NOT_RUNNING")
# Currently not running. Nothing to do.
ocf_log info "Resource is already stopped"
return $OCF_SUCCESS
;;
esac
# actually shut down the resource here (make sure to immediately
# exit with an $OCF_ERR_ error code if anything goes seriously
# wrong)
pid=`cat $OCF_RESKEY_pid`
ocf_run kill -s TERM $pid || exit $OCF_ERR_GENERIC
# After the resource has been stopped, check whether it shut down
# correctly. If the resource stops asynchronously, the agent may
# spin on the monitor function here -- if the resource does not
# shut down within the defined timeout, the cluster manager will
# consider the stop action failed
while glusterd_monitor; do
ocf_log debug "Resource has not stopped yet, waiting"
sleep 1
done
# only return $OCF_SUCCESS if _everything_ succeeded as expected
return $OCF_SUCCESS
}
glusterd_monitor() {
local pid
[ -e $OCF_RESKEY_pid ] || return $OCF_NOT_RUNNING
pid=`cat $OCF_RESKEY_pid`
ocf_run kill -s 0 $pid || return $OCF_NOT_RUNNING
ocf_log debug "$OCF_RESKEY_binary running with PID $pid"
return $OCF_SUCCESS
}
glusterd_validate_all() {
# Test for required binaries
check_binary $OCF_RESKEY_binary
return $OCF_SUCCESS
}
# Make sure meta-data and usage always succeed
case $__OCF_ACTION in
meta-data) glusterd_meta_data
exit $OCF_SUCCESS
;;
usage|help) glusterd_usage
exit $OCF_SUCCESS
;;
esac
# Anything other than meta-data and usage must pass validation
glusterd_validate_all || exit $?
# Translate each action into the appropriate function call
case $__OCF_ACTION in
start) glusterd_start;;
stop) glusterd_stop;;
status|monitor) glusterd_monitor;;
reload) ocf_log info "Reloading..."
glusterd_start
;;
validate-all) ;;
notify) exit $OCF_SUCCESS;;
*) glusterd_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
rc=$?
# The resource agent may optionally log a debug message
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc"
exit $rc

extras/ocf/volume.in Executable file

@@ -0,0 +1,246 @@
#!/bin/sh
#
# volume
#
# Description: Manages a GlusterFS volume as a (typically cloned)
# HA resource
#
# Authors: Florian Haas (hastexo Professional Services GmbH)
#
# License: GNU General Public License (GPL)
#######################################################################
# Initialization:
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# Convenience variables
# When sysconfdir and localstatedir aren't passed in as
# configure flags, they're defined in terms of prefix
prefix=@prefix@
SHORTHOSTNAME=`hostname -s`
#######################################################################
OCF_RESKEY_binary_default="gluster"
: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
volume_meta_data() {
cat <<EOF
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="volume" version="0.1">
<version>0.1</version>
<longdesc lang="en">
Manages a GlusterFS volume and monitors its bricks. When a resource of
this type is configured as a clone (as is commonly the case), then it
must have clone ordering enabled.
</longdesc>
<shortdesc lang="en">Manages a GlusterFS volume</shortdesc>
<parameters>
<parameter name="volname" required="1">
<longdesc lang="en">
The name of the volume to manage.
</longdesc>
<shortdesc lang="en">volume name</shortdesc>
<content type="string"/>
</parameter>
<parameter name="binary">
<longdesc lang="en">
Name of the gluster executable. Specify a full absolute
path if the binary is not in your \$PATH.
</longdesc>
<shortdesc lang="en">gluster executable</shortdesc>
<content type="string" default="$OCF_RESKEY_binary_default"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="20" />
<action name="stop" timeout="20" />
<action name="monitor" timeout="20" interval="10" />
<action name="reload" timeout="20" />
<action name="meta-data" timeout="5" />
<action name="validate-all" timeout="20" />
</actions>
</resource-agent>
EOF
}
volume_getdir() {
local voldir
voldir="@sysconfdir@/glusterd/vols/${OCF_RESKEY_volname}"
[ -d ${voldir} ] || return 1
echo "${voldir}"
return 0
}
volume_getbricks() {
local infofile
local voldir
voldir=`volume_getdir`
infofile="${voldir}/info"
[ -e ${infofile} ] || return 1
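# Print the paths of this host's bricks, as recorded in the
# brick-<n>=<host>:<path> lines of the volume info file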
echo "`sed -n -e "s/^brick-.\+=${SHORTHOSTNAME}://p" < ${infofile}`"
return 0
}
volume_getpids() {
local bricks
local piddir
local pidfile
local infofile
local voldir
voldir=`volume_getdir`
bricks=`volume_getbricks`
piddir="${voldir}/run"
for brick in ${bricks}; do
pidfile="${piddir}/${SHORTHOSTNAME}${brick}.pid"
[ -e $pidfile ] || return 1
cat $pidfile
done
return 0
}
volume_start() {
local volume_options
# exit immediately if configuration is not valid
volume_validate_all || exit $?
# if resource is already running, bail out early
if volume_monitor; then
ocf_log info "Resource is already running"
return $OCF_SUCCESS
fi
# actually start up the resource here
ocf_run "$OCF_RESKEY_binary" \
volume start "$OCF_RESKEY_volname" force || exit $OCF_ERR_GENERIC
# After the resource has been started, check whether it started up
# correctly. If the resource starts asynchronously, the agent may
# spin on the monitor function here -- if the resource does not
# start up within the defined timeout, the cluster manager will
# consider the start action failed
while ! volume_monitor; do
ocf_log debug "Resource has not started yet, waiting"
sleep 1
done
# only return $OCF_SUCCESS if _everything_ succeeded as expected
return $OCF_SUCCESS
}
volume_stop() {
local rc
local pid
# exit immediately if configuration is not valid
volume_validate_all || exit $?
volume_monitor
rc=$?
case "$rc" in
"$OCF_SUCCESS")
# Currently running. Normal, expected behavior.
ocf_log debug "Resource is currently running"
;;
"$OCF_NOT_RUNNING")
# Currently not running. Nothing to do.
ocf_log info "Resource is already stopped"
return $OCF_SUCCESS
;;
esac
# actually shut down the resource here (make sure to immediately
# exit with an $OCF_ERR_ error code if anything goes seriously
# wrong)
pids=`volume_getpids`
for pid in $pids; do
ocf_run kill -s TERM $pid
done
# After the resource has been stopped, check whether it shut down
# correctly. If the resource stops asynchronously, the agent may
# spin on the monitor function here -- if the resource does not
# shut down within the defined timeout, the cluster manager will
# consider the stop action failed
while volume_monitor; do
ocf_log debug "Resource has not stopped yet, waiting"
sleep 1
done
# only return $OCF_SUCCESS if _everything_ succeeded as expected
return $OCF_SUCCESS
}
volume_monitor() {
local pid
pids=`volume_getpids` || return $OCF_NOT_RUNNING
for pid in $pids; do
ocf_run kill -s 0 $pid || return $OCF_NOT_RUNNING
done
ocf_log debug "Local bricks for volume ${OCF_RESKEY_volname} running with PIDs $pids"
return $OCF_SUCCESS
}
volume_validate_all() {
# Test for configuration errors first
if [ -z "${OCF_RESKEY_volname}" ]; then
ocf_log err 'Missing required parameter "volname"'
return $OCF_ERR_CONFIGURED
fi
# Test for required binaries
check_binary $OCF_RESKEY_binary
return $OCF_SUCCESS
}
# Make sure meta-data and usage always succeed
case $__OCF_ACTION in
meta-data) volume_meta_data
exit $OCF_SUCCESS
;;
usage|help) volume_usage
exit $OCF_SUCCESS
;;
esac
# Anything other than meta-data and usage must pass validation
volume_validate_all || exit $?
# Translate each action into the appropriate function call
case $__OCF_ACTION in
start) volume_start;;
stop) volume_stop;;
status|monitor) volume_monitor;;
reload) ocf_log info "Reloading..."
volume_start
;;
validate-all) ;;
notify) exit $OCF_SUCCESS;;
*) volume_usage
exit $OCF_ERR_UNIMPLEMENTED
;;
esac
rc=$?
# The resource agent may optionally log a debug message
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc"
exit $rc