From 858a2b1b88b28e95880350a5622053a02b9e472d Mon Sep 17 00:00:00 2001 From: Alasdair Kergon Date: Thu, 24 Jun 2004 08:02:38 +0000 Subject: [PATCH] Add cluster support. --- Makefile.in | 12 +- VERSION | 2 +- WHATS_NEW | 4 + configure | 51 +- configure.in | 34 +- daemons/Makefile.in | 23 + daemons/clvmd/Makefile.in | 47 + daemons/clvmd/clvm.h | 65 ++ daemons/clvmd/clvmd-cman.c | 499 ++++++++++ daemons/clvmd/clvmd-command.c | 219 +++++ daemons/clvmd/clvmd-comms.h | 55 ++ daemons/clvmd/clvmd-gulm.c | 880 +++++++++++++++++ daemons/clvmd/clvmd-gulm.h | 9 + daemons/clvmd/clvmd.c | 1693 +++++++++++++++++++++++++++++++++ daemons/clvmd/clvmd.h | 119 +++ daemons/clvmd/cnxman-socket.h | 226 +++++ daemons/clvmd/libclvm.c | 446 +++++++++ daemons/clvmd/libclvm.h | 36 + daemons/clvmd/lvm-functions.c | 446 +++++++++ daemons/clvmd/lvm-functions.h | 35 + daemons/clvmd/system-lv.c | 369 +++++++ daemons/clvmd/system-lv.h | 30 + daemons/clvmd/tcp-comms.c | 480 ++++++++++ daemons/clvmd/tcp-comms.h | 7 + include/.symlinks | 1 + lib/Makefile.in | 8 + lib/locking/Makefile.in | 32 + lib/locking/cluster_locking.c | 462 +++++++++ lib/locking/locking.c | 8 + lib/locking/locking_types.h | 1 + scripts/clvmd_fix_conf.sh | 154 +++ scripts/clvmd_init | 90 ++ 32 files changed, 6534 insertions(+), 9 deletions(-) create mode 100644 daemons/Makefile.in create mode 100644 daemons/clvmd/Makefile.in create mode 100644 daemons/clvmd/clvm.h create mode 100644 daemons/clvmd/clvmd-cman.c create mode 100644 daemons/clvmd/clvmd-command.c create mode 100644 daemons/clvmd/clvmd-comms.h create mode 100644 daemons/clvmd/clvmd-gulm.c create mode 100644 daemons/clvmd/clvmd-gulm.h create mode 100644 daemons/clvmd/clvmd.c create mode 100644 daemons/clvmd/clvmd.h create mode 100644 daemons/clvmd/cnxman-socket.h create mode 100644 daemons/clvmd/libclvm.c create mode 100644 daemons/clvmd/libclvm.h create mode 100644 daemons/clvmd/lvm-functions.c create mode 100644 daemons/clvmd/lvm-functions.h create mode 100644 
daemons/clvmd/system-lv.c create mode 100644 daemons/clvmd/system-lv.h create mode 100644 daemons/clvmd/tcp-comms.c create mode 100644 daemons/clvmd/tcp-comms.h create mode 100644 lib/locking/Makefile.in create mode 100644 lib/locking/cluster_locking.c create mode 100644 scripts/clvmd_fix_conf.sh create mode 100755 scripts/clvmd_init diff --git a/Makefile.in b/Makefile.in index f7b4d5238..af63fc5d9 100644 --- a/Makefile.in +++ b/Makefile.in @@ -22,11 +22,13 @@ ifeq ("@INTL@", "yes") SUBDIRS += po endif -SUBDIRS += lib tools +SUBDIRS += lib tools daemons ifeq ($(MAKECMDGOALS),distclean) - SUBDIRS += lib/format1 \ + SUBDIRS += daemons/clvmd \ + lib/format1 \ lib/format_pool \ + lib/locking \ lib/mirror \ lib/snapshot \ po \ @@ -35,14 +37,16 @@ endif include make.tmpl +daemons: lib lib: include tools: lib -po: lib tools +po: tools daemons ifeq ("@INTL@", "yes") lib.pofile: include.pofile tools.pofile: lib.pofile -po.pofile: lib.pofile tools.pofile +daemons.pofile: lib.pofile +po.pofile: tools.pofile daemons.pofile pofile: po.pofile endif diff --git a/VERSION b/VERSION index f2ad88206..a9f466bfb 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.00.17-cvs (2004-06-20) +2.00.18-cvs (2004-06-24) diff --git a/WHATS_NEW b/WHATS_NEW index b582f61bf..94aadf13c 100644 --- a/WHATS_NEW +++ b/WHATS_NEW @@ -1,3 +1,7 @@ +Version 2.00.18 - 24 June 2004 +============================== + Add cluster support. + Version 2.00.17 - 20 June 2004 ============================== configure --enable-fsadm to try out fsadm. fsadm is not tested yet. 
diff --git a/configure b/configure index 783dbddad..d5efca06d 100755 --- a/configure +++ b/configure @@ -309,7 +309,7 @@ ac_includes_default="\ #endif" ac_default_prefix=/usr -ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS AWK CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA LN_S SET_MAKE RANLIB ac_ct_RANLIB CPP EGREP build build_cpu build_vendor build_os host host_cpu host_vendor host_os target target_cpu target_vendor target_os MSGFMT JOBS STATIC_LINK LVM1 POOL SNAPSHOTS MIRRORS OWNER GROUP CLDFLAGS CLDWHOLEARCHIVE CLDNOWHOLEARCHIVE LD_DEPS LD_FLAGS SOFLAG LVM_VERSION LVM1_FALLBACK DEBUG DEVMAPPER HAVE_LIBDL HAVE_SELINUX CMDLIB LOCALEDIR CONFDIR STATICDIR INTL_PACKAGE INTL FSADM LIBOBJS LTLIBOBJS' +ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS AWK CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC EXEEXT OBJEXT INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA LN_S SET_MAKE RANLIB ac_ct_RANLIB CPP EGREP build build_cpu build_vendor build_os host host_cpu host_vendor host_os target target_cpu target_vendor target_os MSGFMT JOBS STATIC_LINK LVM1 POOL SNAPSHOTS MIRRORS OWNER GROUP CLDFLAGS CLDWHOLEARCHIVE CLDNOWHOLEARCHIVE LD_DEPS LD_FLAGS SOFLAG LVM_VERSION LVM1_FALLBACK DEBUG DEVMAPPER HAVE_LIBDL HAVE_SELINUX CMDLIB LOCALEDIR CONFDIR STATICDIR INTL_PACKAGE INTL CLVMD CLUSTER FSADM LIBOBJS LTLIBOBJS' ac_subst_files='' # Initialize some variables set by 
options. @@ -867,10 +867,13 @@ Optional Packages: TYPE=internal --with-pool=TYPE GFS pool read-only support: internal/shared/none TYPE=internal + --with-cluster=TYPE Cluster LVM locking support: internal/shared/none + TYPE=internal --with-snapshots=TYPE Snapshot support: internal/shared/none TYPE=internal --with-mirrors=TYPE Mirror support: internal/shared/none TYPE=internal + --with-clvmd Build cluster LVM Daemon --with-localedir=DIR Translation files in DIR PREFIX/share/locale --with-confdir=DIR Configuration files in DIR /etc --with-staticdir=DIR Static binary in DIR EXEC_PREFIX/sbin @@ -3900,6 +3903,7 @@ case "$host_os" in SOFLAG="-shared" DEVMAPPER=yes ODIRECT=yes + CLUSTER=internal FSADM=no ;; darwin*) CFLAGS="-no-cpp-precomp -fno-common" @@ -3911,6 +3915,7 @@ case "$host_os" in SOFLAG="-dynamiclib" DEVMAPPER=no ODIRECT=no + CLUSTER=none FSADM=no ;; esac @@ -3998,6 +4003,25 @@ if test x$POOL = xinternal; then fi +# Check whether --with-cluster or --without-cluster was given. +if test "${with_cluster+set}" = set; then + withval="$with_cluster" + CLUSTER="$withval" +fi; + +if [ "x$CLUSTER" != xnone -a "x$CLUSTER" != xinternal -a "x$CLUSTER" != xshared ]; + then { { echo "$as_me:$LINENO: error: --with-cluster parameter invalid +" >&5 +echo "$as_me: error: --with-cluster parameter invalid +" >&2;} + { (exit 1); exit 1; }; } + exit +fi; + +if test x$CLUSTER = xinternal; then + CFLAGS="$CFLAGS -DCLUSTER_LOCKING_INTERNAL" +fi + # Check whether --enable-jobs or --disable-jobs was given. if test "${enable_jobs+set}" = set; then enableval="$enable_jobs" @@ -4071,6 +4095,20 @@ if test x$READLINE = xyes; then CFLAGS="$CFLAGS -DREADLINE_SUPPORT" fi + +# Check whether --with-clvmd or --without-clvmd was given. 
+if test "${with_clvmd+set}" = set; then + withval="$with_clvmd" + \ +CLVMD=$withval +else + CLVMD=no +fi; +if test x$CLVMD = xyes && test x$CLUSTER = xnone; then + CLUSTER=internal +fi +echo "$ac_t""$CLVMD" 1>&6 + echo $ac_n "checking whether to enable debugging""... $ac_c" 1>&6 # Check whether --enable-debug or --disable-debug was given. if test "${enable_debug+set}" = set; then @@ -4698,7 +4736,7 @@ else HAVE_LIBDL=no fi -if [ \( "x$LVM1" = xshared -o "x$POOL" = xshared -o \ +if [ \( "x$LVM1" = xshared -o "x$POOL" = xshared -o "x$CLUSTER" = xshared -o \ "x$SNAPSHOTS" = xshared -o "x$MIRRORS" = xshared \ \) -a "x$STATIC_LINK" = xyes ]; then { { echo "$as_me:$LINENO: error: Features cannot be 'shared' when building statically @@ -5207,7 +5245,9 @@ fi - ac_config_files="$ac_config_files Makefile make.tmpl doc/Makefile include/Makefile lib/Makefile lib/format1/Makefile lib/format_pool/Makefile lib/mirror/Makefile lib/snapshot/Makefile man/Makefile po/Makefile tools/Makefile tools/version.h tools/fsadm/Makefile test/mm/Makefile test/device/Makefile test/format1/Makefile test/regex/Makefile test/filters/Makefile" + + + ac_config_files="$ac_config_files Makefile make.tmpl daemons/Makefile daemons/clvmd/Makefile doc/Makefile include/Makefile lib/Makefile lib/format1/Makefile lib/format_pool/Makefile lib/locking/Makefile lib/mirror/Makefile lib/snapshot/Makefile man/Makefile po/Makefile tools/Makefile tools/version.h tools/fsadm/Makefile test/mm/Makefile test/device/Makefile test/format1/Makefile test/regex/Makefile test/filters/Makefile" cat >confcache <<\_ACEOF # This file is a shell script that caches the results of configure # tests run on this system so they can be shared between configure @@ -5760,11 +5800,14 @@ do # Handling of arguments. 
"Makefile" ) CONFIG_FILES="$CONFIG_FILES Makefile" ;; "make.tmpl" ) CONFIG_FILES="$CONFIG_FILES make.tmpl" ;; + "daemons/Makefile" ) CONFIG_FILES="$CONFIG_FILES daemons/Makefile" ;; + "daemons/clvmd/Makefile" ) CONFIG_FILES="$CONFIG_FILES daemons/clvmd/Makefile" ;; "doc/Makefile" ) CONFIG_FILES="$CONFIG_FILES doc/Makefile" ;; "include/Makefile" ) CONFIG_FILES="$CONFIG_FILES include/Makefile" ;; "lib/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/Makefile" ;; "lib/format1/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/format1/Makefile" ;; "lib/format_pool/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/format_pool/Makefile" ;; + "lib/locking/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/locking/Makefile" ;; "lib/mirror/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/mirror/Makefile" ;; "lib/snapshot/Makefile" ) CONFIG_FILES="$CONFIG_FILES lib/snapshot/Makefile" ;; "man/Makefile" ) CONFIG_FILES="$CONFIG_FILES man/Makefile" ;; @@ -5916,6 +5959,8 @@ s,@CONFDIR@,$CONFDIR,;t t s,@STATICDIR@,$STATICDIR,;t t s,@INTL_PACKAGE@,$INTL_PACKAGE,;t t s,@INTL@,$INTL,;t t +s,@CLVMD@,$CLVMD,;t t +s,@CLUSTER@,$CLUSTER,;t t s,@FSADM@,$FSADM,;t t s,@LIBOBJS@,$LIBOBJS,;t t s,@LTLIBOBJS@,$LTLIBOBJS,;t t diff --git a/configure.in b/configure.in index c07b020a8..95d3a9726 100644 --- a/configure.in +++ b/configure.in @@ -59,6 +59,7 @@ case "$host_os" in SOFLAG="-shared" DEVMAPPER=yes ODIRECT=yes + CLUSTER=internal FSADM=no ;; darwin*) CFLAGS="-no-cpp-precomp -fno-common" @@ -70,6 +71,7 @@ case "$host_os" in SOFLAG="-dynamiclib" DEVMAPPER=no ODIRECT=no + CLUSTER=none FSADM=no ;; esac @@ -141,6 +143,22 @@ if test x$POOL = xinternal; then CFLAGS="$CFLAGS -DPOOL_INTERNAL" fi +dnl -- cluster_locking inclusion type +AC_ARG_WITH(cluster, + [ --with-cluster=TYPE Cluster LVM locking support: internal/shared/none + [TYPE=internal] ], + [ CLUSTER="$withval" ]) + +if [[ "x$CLUSTER" != xnone -a "x$CLUSTER" != xinternal -a "x$CLUSTER" != xshared ]]; + then AC_MSG_ERROR( +--with-cluster parameter invalid +) + exit +fi; + 
+if test x$CLUSTER = xinternal; then + CFLAGS="$CFLAGS -DCLUSTER_LOCKING_INTERNAL" +fi AC_ARG_ENABLE(jobs, [ --enable-jobs=NUM Number of jobs to run simultaneously], JOBS=-j$enableval, JOBS=-j2) @@ -192,6 +210,15 @@ if test x$READLINE = xyes; then CFLAGS="$CFLAGS -DREADLINE_SUPPORT" fi +dnl Build cluster LVM daemon +AC_ARG_WITH(clvmd, [ --with-clvmd Build cluster LVM Daemon], \ +CLVMD=$withval, CLVMD=no) +dnl If clvmd enabled and not cluster locking, automgically include the locking. +if test x$CLVMD = xyes && test x$CLUSTER = xnone; then + CLUSTER=internal +fi +echo "$ac_t""$CLVMD" 1>&6 + echo $ac_n "checking whether to enable debugging""... $ac_c" 1>&6 dnl Enable Debugging AC_ARG_ENABLE(debug, [ --enable-debug Enable debugging], \ @@ -272,7 +299,7 @@ else fi dnl Check for shared/static conflicts -if [[ \( "x$LVM1" = xshared -o "x$POOL" = xshared -o \ +if [[ \( "x$LVM1" = xshared -o "x$POOL" = xshared -o "x$CLUSTER" = xshared -o \ "x$SNAPSHOTS" = xshared -o "x$MIRRORS" = xshared \ \) -a "x$STATIC_LINK" = xyes ]]; then AC_MSG_ERROR( @@ -377,6 +404,8 @@ AC_SUBST(CONFDIR) AC_SUBST(STATICDIR) AC_SUBST(INTL_PACKAGE) AC_SUBST(INTL) +AC_SUBST(CLVMD) +AC_SUBST(CLUSTER) AC_SUBST(FSADM) dnl First and last lines should not contain files to generate in order to @@ -384,11 +413,14 @@ dnl keep utility scripts running properly AC_OUTPUT( \ Makefile \ make.tmpl \ +daemons/Makefile \ +daemons/clvmd/Makefile \ doc/Makefile \ include/Makefile \ lib/Makefile \ lib/format1/Makefile \ lib/format_pool/Makefile \ +lib/locking/Makefile \ lib/mirror/Makefile \ lib/snapshot/Makefile \ man/Makefile \ diff --git a/daemons/Makefile.in b/daemons/Makefile.in new file mode 100644 index 000000000..a951632ee --- /dev/null +++ b/daemons/Makefile.in @@ -0,0 +1,23 @@ +# +# Copyright (C) 2004 Red Hat, Inc. All rights reserved. +# +# This file is part of the LVM2. 
+# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU General Public License v.2. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ + +ifeq ("@CLVMD@", "yes") + SUBDIRS = clvmd +endif + +include $(top_srcdir)/make.tmpl + diff --git a/daemons/clvmd/Makefile.in b/daemons/clvmd/Makefile.in new file mode 100644 index 000000000..54563e7eb --- /dev/null +++ b/daemons/clvmd/Makefile.in @@ -0,0 +1,47 @@ +# +# Copyright (C) 2004 Red Hat, Inc. All rights reserved. +# +# This file is part of the LVM2. +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU General Public License v.2. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ + +SOURCES = \ + clvmd-cman.c \ + clvmd-command.c \ + clvmd.c \ + libclvm.c \ + lvm-functions.c \ + system-lv.c + +TARGETS = \ + clvmd + +include $(top_srcdir)/make.tmpl + +CFLAGS += -D_REENTRANT -fno-strict-aliasing +LIBS += -ldevmapper -ldlm -llvm -lpthread + +INSTALL_TARGETS = \ + install_clvmd + +clvmd: $(OBJECTS) $(top_srcdir)/lib/liblvm.a + $(CC) -o clvmd $(OBJECTS) $(LD_FLAGS) $(LVMLIBS) $(LIBS) + +.PHONY: install_clvmd + +install_clvmd: $(TARGETS) + $(INSTALL) -D $(OWNER) $(GROUP) -m 555 $(STRIP) clvmd \ + $(sbindir)/clvmd + +install: $(INSTALL_TARGETS) + diff --git a/daemons/clvmd/clvm.h b/daemons/clvmd/clvm.h new file mode 100644 index 000000000..dd20bfd33 --- /dev/null +++ b/daemons/clvmd/clvm.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Definitions for CLVMD server and clients */ + +/* + * The protocol spoken over the cluster and across the local socket. 
+ */ + +#ifndef _CLVM_H +#define _CLVM_H + +struct clvm_header { + uint8_t cmd; /* See below */ + uint8_t flags; /* See below */ + uint16_t xid; /* Transaction ID */ + uint32_t clientid; /* Only used in Daemon->Daemon comms */ + int32_t status; /* For replies, whether request succeeded */ + uint32_t arglen; /* Length of argument below. + If >1500 then it will be passed + around the cluster in the system LV */ + char node[1]; /* Actually a NUL-terminated string, node name. + If this is empty then the command is + forwarded to all cluster nodes unless + FLAG_LOCAL is also set. */ + char args[1]; /* Arguments for the command follow the + node name, This member is only + valid if the node name is empty */ +} __attribute__ ((packed)); + +/* Flags */ +#define CLVMD_FLAG_LOCAL 1 /* Only do this on the local node */ +#define CLVMD_FLAG_SYSTEMLV 2 /* Data in system LV under my node name */ + +/* Name of the local socket to communicate between libclvm and clvmd */ +//static const char CLVMD_SOCKNAME[]="/var/run/clvmd"; +static const char CLVMD_SOCKNAME[] = "\0clvmd"; + +/* Internal commands & replies */ +#define CLVMD_CMD_REPLY 1 +#define CLVMD_CMD_VERSION 2 /* Send version around cluster when we start */ +#define CLVMD_CMD_GOAWAY 3 /* Die if received this - we are running + an incompatible version */ +#define CLVMD_CMD_TEST 4 /* Just for mucking about */ + +#define CLVMD_CMD_LOCK 30 +#define CLVMD_CMD_UNLOCK 31 + +/* Lock/Unlock commands */ +#define CLVMD_CMD_LOCK_LV 50 +#define CLVMD_CMD_LOCK_VG 51 + +#endif diff --git a/daemons/clvmd/clvmd-cman.c b/daemons/clvmd/clvmd-cman.c new file mode 100644 index 000000000..751f4ddf4 --- /dev/null +++ b/daemons/clvmd/clvmd-cman.c @@ -0,0 +1,499 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * CMAN communication layer for clvmd. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "clvmd-comms.h" +#include "clvm.h" +#include "libdlm.h" +#include "log.h" +#include "clvmd.h" +#include "lvm-functions.h" + +#define LOCKSPACE_NAME "clvmd" + +static int cluster_sock; +static int num_nodes; +static struct cl_cluster_node *nodes = NULL; +static int count_nodes; /* size of allocated nodes array */ +static int max_updown_nodes = 50; /* Current size of the allocated array */ +/* Node up/down status, indexed by nodeid */ +static int *node_updown = NULL; +static dlm_lshandle_t *lockspace; + +static void sigusr1_handler(int sig); +static void count_clvmds_running(void); +static void get_members(void); +static int nodeid_from_csid(char *csid); +static int name_from_nodeid(int nodeid, char *name); + +struct lock_wait { + pthread_cond_t cond; + pthread_mutex_t mutex; + struct dlm_lksb lksb; +}; + +int init_cluster() +{ + struct sockaddr_cl saddr; + int port = CLUSTER_PORT_CLVMD; + + /* Open the cluster communication socket */ + cluster_sock = socket(AF_CLUSTER, SOCK_DGRAM, CLPROTO_CLIENT); + if (cluster_sock == -1) { + perror("Can't open cluster socket"); + return -1; + } + + /* Bind to our port number on the cluster. 
+ Writes to this will block if the cluster loses quorum */ + saddr.scl_family = AF_CLUSTER; + saddr.scl_port = port; + + if (bind + (cluster_sock, (struct sockaddr *) &saddr, + sizeof(struct sockaddr_cl))) { + log_error("Can't bind cluster socket: %m"); + return -1; + } + + /* Get the cluster members list */ + get_members(); + count_clvmds_running(); + + /* Create a lockspace for LV & VG locks to live in */ + lockspace = dlm_create_lockspace(LOCKSPACE_NAME, 0600); + if (!lockspace) { + log_error("Unable to create lockspace for CLVM\n"); + return -1; + } + dlm_ls_pthread_init(lockspace); + return 0; +} + +int get_main_cluster_fd() +{ + return cluster_sock; +} + +int get_num_nodes() +{ + return num_nodes; +} + +/* send_message with the fd check removed */ +int cluster_send_message(void *buf, int msglen, char *csid, const char *errtext) +{ + struct iovec iov[2]; + struct msghdr msg; + struct sockaddr_cl saddr; + int len = 0; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = iov; + msg.msg_flags = 0; + iov[0].iov_len = msglen; + iov[0].iov_base = buf; + + saddr.scl_family = AF_CLUSTER; + saddr.scl_port = CLUSTER_PORT_CLVMD; + if (csid) { + msg.msg_name = &saddr; + msg.msg_namelen = sizeof(saddr); + memcpy(&saddr.scl_nodeid, csid, MAX_CSID_LEN); + } else { /* Cluster broadcast */ + + msg.msg_name = NULL; + msg.msg_namelen = 0; + } + + do { + len = sendmsg(cluster_sock, &msg, 0); + if (len < 0 && errno != EAGAIN) + log_error(errtext); + + } while (len == -1 && errno == EAGAIN); + return len; +} + +void get_our_csid(char *csid) +{ + int i; + memset(csid, 0, MAX_CSID_LEN); + + for (i = 0; i < num_nodes; i++) { + if (nodes[i].us) + memcpy(csid, &nodes[i].node_id, MAX_CSID_LEN); + } +} + +/* Call a callback routine for each node that known (down mean not running a clvmd) */ +int cluster_do_node_callback(struct local_client *client, + void (*callback) (struct local_client *, char *, + int)) +{ + int i; + int somedown = 0; + + for (i = 
0; i < get_num_nodes(); i++) { + callback(client, (char *)&nodes[i].node_id, node_updown[nodes[i].node_id]); + if (!node_updown[nodes[i].node_id]) + somedown = -1; + } + return somedown; +} + +/* Process OOB message from the cluster socket, + this currently just means that a node has stopped listening on our port */ +static void process_oob_msg(char *buf, int len, int nodeid) +{ + char namebuf[256]; + switch (buf[0]) { + case CLUSTER_OOB_MSG_PORTCLOSED: + name_from_nodeid(nodeid, namebuf); + log_notice("clvmd on node %s has died\n", namebuf); + DEBUGLOG("Got OOB message, removing node %s\n", namebuf); + + node_updown[nodeid] = 0; + break; + + case CLUSTER_OOB_MSG_STATECHANGE: + DEBUGLOG("Got OOB message, Cluster state change\n"); + get_members(); + break; + default: + /* ERROR */ + DEBUGLOG("Got unknown OOB message: %d\n", buf[0]); + } +} + +int cluster_fd_callback(struct local_client *fd, char *buf, int len, char *csid, + struct local_client **new_client) +{ + struct iovec iov[2]; + struct msghdr msg; + struct sockaddr_cl saddr; + + /* We never return a new client */ + *new_client = NULL; + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = iov; + msg.msg_name = &saddr; + msg.msg_flags = 0; + msg.msg_namelen = sizeof(saddr); + iov[0].iov_len = len; + iov[0].iov_base = buf; + + len = recvmsg(cluster_sock, &msg, MSG_OOB | O_NONBLOCK); + if (len < 0 && errno == EAGAIN) + return len; + + DEBUGLOG("Read on cluster socket, len = %d\n", len); + + /* A real error */ + if (len < 0) { + log_error("read error on cluster socket: %m"); + return 0; + } + + /* EOF - we have left the cluster */ + if (len == 0) + return 0; + + /* Is it OOB? 
probably a node gone down */ + if (msg.msg_flags & MSG_OOB) { + process_oob_msg(iov[0].iov_base, len, saddr.scl_nodeid); + + /* Tell the upper layer to ignore this message */ + len = -1; + errno = EAGAIN; + } + memcpy(csid, &saddr.scl_nodeid, sizeof(saddr.scl_nodeid)); + return len; +} + +void add_up_node(char *csid) +{ + /* It's up ! */ + int nodeid = nodeid_from_csid(csid); + + if (nodeid >= max_updown_nodes) { + int *new_updown = realloc(node_updown, max_updown_nodes + 10); + + if (new_updown) { + node_updown = new_updown; + max_updown_nodes += 10; + DEBUGLOG("realloced more space for nodes. now %d\n", + max_updown_nodes); + } else { + log_error + ("Realloc failed. Node status for clvmd will be wrong\n"); + return; + } + } + node_updown[nodeid] = 1; + DEBUGLOG("Added new node %d to updown list\n", nodeid); +} + +void cluster_closedown() +{ + unlock_all(); + dlm_release_lockspace(LOCKSPACE_NAME, lockspace, 1); + close(cluster_sock); +} + +static int is_listening(int nodeid) +{ + struct cl_listen_request rq; + int status; + + rq.port = CLUSTER_PORT_CLVMD; + rq.nodeid = nodeid; + + do { + status = ioctl(cluster_sock, SIOCCLUSTER_ISLISTENING, &rq); + if (status < 0 && errno == EBUSY) { /* Don't busywait */ + sleep(1); + errno = EBUSY; /* In case sleep trashes it */ + } + } + while (status < 0 && errno == EBUSY); + + return status; +} + +/* Populate the list of CLVMDs running. + called only at startup time */ +void count_clvmds_running(void) +{ + int i; + + for (i = 0; i < num_nodes; i++) { + node_updown[nodes[i].node_id] = is_listening(nodes[i].node_id); + } +} + +/* Get a list of active cluster members */ +static void get_members() +{ + struct cl_cluster_nodelist nodelist; + + num_nodes = ioctl(cluster_sock, SIOCCLUSTER_GETMEMBERS, 0); + if (num_nodes == -1) { + perror("get nodes"); + } else { + /* Not enough room for new nodes list ? 
*/ + if (num_nodes > count_nodes && nodes) { + free(nodes); + nodes = NULL; + } + + if (nodes == NULL) { + count_nodes = num_nodes + 10; /* Overallocate a little */ + nodes = malloc(count_nodes * sizeof(struct cl_cluster_node)); + if (!nodes) { + perror("Unable to allocate nodes array\n"); + exit(5); + } + } + nodelist.max_members = count_nodes; + nodelist.nodes = nodes; + + num_nodes = ioctl(cluster_sock, SIOCCLUSTER_GETMEMBERS, &nodelist); + if (num_nodes <= 0) { + perror("get node details"); + exit(6); + } + + /* Sanity check struct */ + if (nodes[0].size != sizeof(struct cl_cluster_node)) { + log_error + ("sizeof(cl_cluster_node) does not match size returned from the kernel: aborting\n"); + exit(10); + } + + if (node_updown == NULL) { + node_updown = + (int *) malloc(sizeof(int) * + max(num_nodes, max_updown_nodes)); + memset(node_updown, 0, + sizeof(int) * max(num_nodes, max_updown_nodes)); + } + } +} + +/* Convert a node name to a CSID */ +int csid_from_name(char *csid, char *name) +{ + int i; + + for (i = 0; i < num_nodes; i++) { + if (strcmp(name, nodes[i].name) == 0) { + memcpy(csid, &nodes[i].node_id, MAX_CSID_LEN); + return 0; + } + } + return -1; +} + +/* Convert a CSID to a node name */ +int name_from_csid(char *csid, char *name) +{ + int i; + + for (i = 0; i < num_nodes; i++) { + if (memcmp(csid, &nodes[i].node_id, MAX_CSID_LEN) == 0) { + strcpy(name, nodes[i].name); + return 0; + } + } + /* Who?? */ + strcpy(name, "Unknown"); + return -1; +} + +/* Convert a node ID to a node name */ +int name_from_nodeid(int nodeid, char *name) +{ + int i; + + for (i = 0; i < num_nodes; i++) { + if (nodeid == nodes[i].node_id) { + strcpy(name, nodes[i].name); + return 0; + } + } + /* Who?? 
*/ + strcpy(name, "Unknown"); + return -1; +} + +/* Convert a CSID to a node ID */ +static int nodeid_from_csid(char *csid) +{ + int nodeid; + + memcpy(&nodeid, csid, MAX_CSID_LEN); + + return nodeid; +} + +int is_quorate() +{ + return ioctl(cluster_sock, SIOCCLUSTER_ISQUORATE, 0); +} + +static void sync_ast_routine(void *arg) +{ + struct lock_wait *lwait = arg; + + pthread_mutex_lock(&lwait->mutex); + pthread_cond_signal(&lwait->cond); + pthread_mutex_unlock(&lwait->mutex); +} + +int sync_lock(const char *resource, int mode, int flags, int *lockid) +{ + int status; + struct lock_wait lwait; + + if (!lockid) { + errno = EINVAL; + return -1; + } + + /* Conversions need the lockid in the LKSB */ + if (flags & LKF_CONVERT) + lwait.lksb.sb_lkid = *lockid; + + pthread_cond_init(&lwait.cond, NULL); + pthread_mutex_init(&lwait.mutex, NULL); + pthread_mutex_lock(&lwait.mutex); + + status = dlm_ls_lock(lockspace, + mode, + &lwait.lksb, + flags, + resource, + strlen(resource), + 0, sync_ast_routine, &lwait, NULL, NULL); + if (status) + return status; + + /* Wait for it to complete */ + pthread_cond_wait(&lwait.cond, &lwait.mutex); + pthread_mutex_unlock(&lwait.mutex); + + *lockid = lwait.lksb.sb_lkid; + + errno = lwait.lksb.sb_status; + if (lwait.lksb.sb_status) + return -1; + else + return 0; +} + +int sync_unlock(const char *resource /* UNUSED */, int lockid) +{ + int status; + struct lock_wait lwait; + + pthread_cond_init(&lwait.cond, NULL); + pthread_mutex_init(&lwait.mutex, NULL); + pthread_mutex_lock(&lwait.mutex); + + status = dlm_ls_unlock(lockspace, lockid, 0, &lwait.lksb, &lwait); + + if (status) + return status; + + /* Wait for it to complete */ + pthread_cond_wait(&lwait.cond, &lwait.mutex); + pthread_mutex_unlock(&lwait.mutex); + + errno = lwait.lksb.sb_status; + if (lwait.lksb.sb_status != EUNLOCK) + return -1; + else + return 0; + +} diff --git a/daemons/clvmd/clvmd-command.c b/daemons/clvmd/clvmd-command.c new file mode 100644 index 000000000..517c1346f --- 
/dev/null +++ b/daemons/clvmd/clvmd-command.c @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + + CLVMD Cluster LVM daemon command processor. + + To add commands to the daemon simply add a processor in do_command and return + and messages back in buf and the length in *retlen. The initial value of + buflen is the maximum size of the buffer. if buf is not large enough then it + may be reallocated by the functions in here to a suitable size bearing in + mind that anything larger than the passed-in size will have to be returned + using the system LV and so performance will suffer. + + The status return will be negated and passed back to the originating node. + + pre- and post- command routines are called only on the local node. The + purpose is primarily to get and release locks, though the pre- routine should + also do any other local setups required by the command (if any) and can + return a failure code that prevents the command from being distributed around + the cluster + + The pre- and post- routines are run in their own thread so can block as long + they like, do_command is run in the main clvmd thread so should not block for + too long. 
If the pre-command returns an error code (!=0) then the command + will not be propogated around the cluster but the post-command WILL be called + + Also note that the pre and post routine are *always* called on the local + node, even if the command to be executed was only requested to run on a + remote node. It may peek inside the client structure to check the status of + the command. + + The clients of the daemon must, naturally, understand the return messages and + codes. + + Routines in here may only READ the values in the client structure passed in + apart from client->private which they are free to do what they like with. + +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "list.h" +#include "locking.h" +#include "log.h" +#include "lvm-functions.h" +#include "clvmd-comms.h" +#include "clvm.h" +#include "clvmd.h" +#include "libdlm.h" + +/* This is where all the real work happens: + NOTE: client will be NULL when this is executed on a remote node */ +int do_command(struct local_client *client, struct clvm_header *msg, int msglen, + char **buf, int buflen, int *retlen) +{ + char *args = msg->node + strlen(msg->node) + 1; + int arglen = msglen - sizeof(struct clvm_header) - strlen(msg->node); + int status = 0; + char *lockname; + struct utsname nodeinfo; + unsigned char lock_cmd; + unsigned char lock_flags; + + /* Do the command */ + switch (msg->cmd) { + /* Just a test message */ + case CLVMD_CMD_TEST: + if (arglen > buflen) { + buflen = arglen + 200; + *buf = realloc(*buf, buflen); + } + uname(&nodeinfo); + *retlen = 1 + snprintf(*buf, buflen, "TEST from %s: %s v%s", + nodeinfo.nodename, args, + nodeinfo.release); + break; + + case CLVMD_CMD_LOCK_VG: + /* Check to see if the VG is in use by LVM1 */ + status = do_check_lvm1(&args[2]); + break; + + case CLVMD_CMD_LOCK_LV: + /* This is the biggie */ + lock_cmd = args[0]; + lock_flags = args[1]; + lockname = 
&args[2]; + status = do_lock_lv(lock_cmd, lock_flags, lockname); + /* Replace EIO with something less scary */ + if (status == EIO) { + *retlen = + 1 + snprintf(*buf, buflen, + "Internal lvm error, check syslog"); + return EIO; + } + break; + + default: + /* Won't get here because command is validated in pre_command */ + break; + } + + /* Check the status of the command and return the error text */ + if (status) { + *retlen = 1 + snprintf(*buf, buflen, strerror(status)); + } + + return status; + +} + +/* Pre-command is a good place to get locks that are needed only for the duration + of the commands around the cluster (don't forget to free them in post-command), + and to sanity check the command arguments */ +int do_pre_command(struct local_client *client) +{ + struct clvm_header *header = + (struct clvm_header *) client->bits.localsock.cmd; + unsigned char lock_cmd; + unsigned char lock_flags; + char *args = header->node + strlen(header->node) + 1; + int lockid; + int status = 0; + char *lockname; + + switch (header->cmd) { + case CLVMD_CMD_TEST: + status = sync_lock("CLVMD_TEST", LKM_EXMODE, 0, &lockid); + client->bits.localsock.private = (void *) lockid; + break; + + case CLVMD_CMD_LOCK_VG: + lock_cmd = args[0]; + lock_flags = args[1]; + lockname = &args[2]; + DEBUGLOG("doing PRE command LOCK_VG %s at %x\n", lockname, + lock_cmd); + if (lock_cmd == LCK_UNLOCK) { + hold_unlock(lockname); + } else { + status = + hold_lock(lockname, (int) lock_cmd, + (int) lock_flags); + if (status) + status = errno; + } + break; + + case CLVMD_CMD_LOCK_LV: + lock_cmd = args[0]; + lock_flags = args[1]; + lockname = &args[2]; + status = pre_lock_lv(lock_cmd, lock_flags, lockname); + break; + + default: + log_error("Unknown command %d received\n", header->cmd); + status = EINVAL; + } + return status; +} + +/* Note that the post-command routine is called even if the pre-command or the real command + failed */ +int do_post_command(struct local_client *client) +{ + struct clvm_header 
*header = + (struct clvm_header *) client->bits.localsock.cmd; + int status = 0; + unsigned char lock_cmd; + unsigned char lock_flags; + char *args = header->node + strlen(header->node) + 1; + char *lockname; + + switch (header->cmd) { + case CLVMD_CMD_TEST: + status = + sync_unlock("CLVMD_TEST", (int) (long) client->bits.localsock.private); + break; + + case CLVMD_CMD_LOCK_VG: + /* Nothing to do here */ + break; + + case CLVMD_CMD_LOCK_LV: + lock_cmd = args[0]; + lock_flags = args[1]; + lockname = &args[2]; + status = post_lock_lv(lock_cmd, lock_flags, lockname); + break; + } + return status; +} diff --git a/daemons/clvmd/clvmd-comms.h b/daemons/clvmd/clvmd-comms.h new file mode 100644 index 000000000..54017b33f --- /dev/null +++ b/daemons/clvmd/clvmd-comms.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Abstraction layer for clvmd cluster communications + */ + +#ifndef _CLVMD_COMMS_H +#define _CLVMD_COMMS_H + +struct local_client; + +extern int cluster_send_message(void *buf, int msglen, char *csid, + const char *errtext); +extern int name_from_csid(char *csid, char *name); +extern int csid_from_name(char *csid, char *name); +extern int get_num_nodes(void); +extern int cluster_fd_callback(struct local_client *fd, char *buf, int len, + char *csid, struct local_client **new_client); +extern int init_cluster(void); +extern int get_main_cluster_fd(void); /* gets accept FD or cman cluster socket */ +extern int cluster_do_node_callback(struct local_client *client, + void (*callback) (struct local_client *, + char *csid, int node_up)); +extern int is_quorate(void); + +extern void get_our_csid(char *csid); +extern void add_up_node(char *csid); +extern void cluster_closedown(void); + +extern int sync_lock(const char *resource, int mode, int flags, int *lockid); +extern int sync_unlock(const char *resource, int lockid); + +#ifdef USE_GULM +#include "tcp-comms.h" +#else +/* cman */ +#include "cnxman-socket.h" +#define MAX_CSID_LEN 4 +#endif + + +#endif diff --git a/daemons/clvmd/clvmd-gulm.c b/daemons/clvmd/clvmd-gulm.c new file mode 100644 index 000000000..bef4cbe78 --- /dev/null +++ b/daemons/clvmd/clvmd-gulm.c @@ -0,0 +1,880 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 2002-2003 All rights reserved. 
+** +******************************************************************************* +******************************************************************************/ + +/* This provides the interface between clvmd and gulm as the cluster + * and lock manager. + * + * It also provides the "liblm" functions too as it's hard (and pointless) + * to seperate them out when using gulm. + * + * What it does /not/ provide is the communications between clvmd daemons + * on the cluster nodes. That is done in tcp-comms.c + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ccs.h" +#include "list.h" +#include "locking.h" +#include "log.h" +#include "clvm.h" +#include "clvmd-comms.h" +#include "clvmd.h" +#include "hash.h" +#include "clvmd-gulm.h" +#include "libgulm.h" +#include "hash.h" + +/* Hash list of nodes in the cluster */ +static struct hash_table *node_hash; + +/* hash list of outstanding lock requests */ +static struct hash_table *lock_hash; + +/* Copy of the current core state */ +static uint8_t current_corestate; + +/* Number of active nodes */ +static int num_nodes; + +static char *cluster_name; + +static pthread_mutex_t lock_start_mutex; +static volatile int lock_start_flag; + +struct node_info +{ + enum {NODE_UNKNOWN, NODE_DOWN, NODE_UP, NODE_CLVMD} state; + char name[MAX_CLUSTER_MEMBER_NAME_LEN]; +}; + +struct lock_wait +{ + pthread_cond_t cond; + pthread_mutex_t mutex; + int status; +}; + +/* Forward */ +static int read_from_core_sock(struct local_client *client, char *buf, int len, char *csid, + struct local_client **new_client); +static int read_from_lock_sock(struct local_client *client, char *buf, int len, char *csid, + struct local_client **new_client); +static int get_all_cluster_nodes(void); + +/* In tcp-comms.c */ +extern struct hash_table *sock_hash; + +static int 
add_internal_client(int fd, fd_callback_t callback) +{ + struct local_client *client; + + DEBUGLOG("Add_internal_client, fd = %d\n", fd); + + /* Add a GULM file descriptor it to the main loop */ + client = malloc(sizeof(struct local_client)); + if (!client) + { + DEBUGLOG("malloc failed\n"); + return -1; + } + + memset(client, 0, sizeof(struct local_client)); + client->fd = fd; + client->type = CLUSTER_INTERNAL; + client->callback = callback; + add_client(client); + + return 0; +} + +/* Gulm library handle */ +static gulm_interface_p gulm_if; +static lg_core_callbacks_t core_callbacks; +static lg_lockspace_callbacks_t lock_callbacks; + +static void badsig_handler(int sig) +{ + DEBUGLOG("got sig %d\n", sig); + cluster_closedown(); + exit(0); +} + +static void sighup_handler(int sig) +{ + DEBUGLOG("got SIGHUP\n"); + + /* Re-read CCS node list */ + get_all_cluster_nodes(); +} + +int init_cluster() +{ + int status; + int ccs_h; + + /* Get cluster name from CCS */ + /* TODO: is this right? */ + ccs_h = ccs_connect(); + ccs_get(ccs_h, "//cluster/@name", &cluster_name); + ccs_disconnect(ccs_h); + + /* Block locking until we are logged in */ + pthread_mutex_init(&lock_start_mutex, NULL); + pthread_mutex_lock(&lock_start_mutex); + lock_start_flag = 1; + + node_hash = hash_create(100); + lock_hash = hash_create(10); + + /* Get all nodes from CCS */ + get_all_cluster_nodes(); + + /* Initialise GULM library */ + status = lg_initialize(&gulm_if, cluster_name, "clvmd"); + if (status) + { + DEBUGLOG("lg_initialize failed: %d\n", status); + return status; + } + + /* Connect to core - we are not "important" :-) */ + status = lg_core_login(gulm_if, 0); + if (status) + { + DEBUGLOG("lg_core_login failed: %d\n", status); + return status; + } + + /* Initialise the inter-node comms */ + status = init_comms(); + if (status) + return status; + + /* Add core FD to the list */ + status = add_internal_client(lg_core_selector(gulm_if), read_from_core_sock); + if (status) + { + DEBUGLOG("can't 
allocate client space\n"); + return status; + } + + /* Connect to the lock server */ + if (lg_lock_login(gulm_if, "CLVM")) + { + syslog(LOG_ERR, "Cannot login in to LOCK server\n"); + DEBUGLOG("Cannot login in to LOCK server\n"); + exit(88); + } + + /* Add lockspace FD to the list */ + status = add_internal_client(lg_lock_selector(gulm_if), read_from_lock_sock); + if (status) + { + DEBUGLOG("can't allocate client space\n"); + exit(status); + } + + /* Request a list of nodes, we can't really do anything until + this comes back */ + status = lg_core_nodelist(gulm_if); + if (status) + { + DEBUGLOG("lg_core_nodelist failed: %d\n", status); + return status; + } + + /* So I can kill it without taking GULM down too */ + signal(SIGINT, badsig_handler); + signal(SIGTERM, badsig_handler); + + /* Re-read the node list on SIGHUP */ + signal(SIGHUP, sighup_handler); + + return 0; +} + +void cluster_closedown() +{ + DEBUGLOG("cluster_closedown\n"); + lg_lock_logout(gulm_if); + lg_core_logout(gulm_if); + lg_core_shutdown(gulm_if); + lg_release(gulm_if); +} + +/* Expire locks for a named node, or us */ +#define GIO_KEY_SIZE 46 +static void drop_expired_locks(char *nodename) +{ + struct utsname nodeinfo; + uint8_t mask[GIO_KEY_SIZE]; + + memset(mask, 0xff, GIO_KEY_SIZE); + + if (!nodename) + { + uname(&nodeinfo); + nodename = nodeinfo.nodename; + } + + if (lg_lock_drop_exp(gulm_if, nodename, mask, GIO_KEY_SIZE)) + { + DEBUGLOG("Error calling lg_lock_drop_exp()\n"); + } +} + + +static int read_from_core_sock(struct local_client *client, char *buf, int len, char *csid, + struct local_client **new_client) +{ + int status; + + *new_client = NULL; + status = lg_core_handle_messages(gulm_if, &core_callbacks, NULL); + return status<0 ? 
status : 1; +} + +static int read_from_lock_sock(struct local_client *client, char *buf, int len, char *csid, + struct local_client **new_client) +{ + int status; + + *new_client = NULL; + status = lg_lock_handle_messages(gulm_if, &lock_callbacks, NULL); + return status<0 ? status : 1; +} + + +/* CORE callback routines */ +static int core_login_reply(void *misc, uint64_t gen, uint32_t error, uint32_t rank, uint8_t corestate) +{ + DEBUGLOG("CORE Got a Login reply. gen:%lld err:%d rank:%d corestate:%d\n", + gen, error, rank, corestate); + + if (error) + exit(error); + + current_corestate = corestate; + return 0; +} + +static void set_node_state(struct node_info *ninfo, char *csid, uint8_t nodestate) +{ + if (nodestate == lg_core_Logged_in) + { + /* Don't clobber NODE_CLVMD state */ + if (ninfo->state != NODE_CLVMD) + { + if (ninfo->state == NODE_UNKNOWN || + ninfo->state == NODE_DOWN) + num_nodes++; + + ninfo->state = NODE_UP; + } + } + else + { + if (nodestate == lg_core_Expired || + nodestate == lg_core_Fenced || + nodestate == lg_core_Logged_out) + { + if (ninfo->state != NODE_DOWN) + num_nodes--; + ninfo->state = NODE_DOWN; + tcp_remove_client(csid); + } + } + DEBUGLOG("set_node_state, '%s' state = %d, num_nodes=%d\n", + ninfo->name, ninfo->state, num_nodes); +} + +static struct node_info *add_or_set_node(char *name, uint32_t ip, uint8_t state) +{ + struct node_info *ninfo; + + ninfo = hash_lookup_binary(node_hash, (char *)&ip, MAX_CSID_LEN); + if (!ninfo) + { + /* If we can't find that node then re-read the config file in case it + was added after we were started */ + DEBUGLOG("Node %s not found, re-reading config file\n", name); + get_all_cluster_nodes(); + + /* Now try again */ + ninfo = hash_lookup_binary(node_hash, (char *)&ip, MAX_CSID_LEN); + if (!ninfo) + { + DEBUGLOG("Ignoring node %s, not part of the SAN cluster\n", name); + return NULL; + } + } + + set_node_state(ninfo, (char *)&ip, state); + + return ninfo; +} + +static int core_nodelist(void *misc, 
lglcb_t type, char *name, uint32_t ip, uint8_t state) +{ + DEBUGLOG("CORE nodelist\n"); + + if (type == lglcb_start) + { + DEBUGLOG("Got Nodelist, start\n"); + } + else + { + if (type == lglcb_item) + { + DEBUGLOG("Got nodelist, item: %s, %#x, %#x\n", name, ip, state); + + add_or_set_node(name, ip, state); + } + else + { + if (type == lglcb_stop) + { + char ourcsid[MAX_CSID_LEN]; + + DEBUGLOG("Got Nodelist, stop\n"); + clvmd_cluster_init_completed(); + + /* Mark ourself as up */ + get_our_csid(ourcsid); + add_up_node(ourcsid); + } + else + { + DEBUGLOG("Unknown lglcb_t %#x\n", type); + } + } + } + + return 0; +} + +static int core_statechange(void *misc, uint8_t corestate, uint32_t masterip, char *mastername) +{ + DEBUGLOG("CORE Got statechange corestate:%#x masterip:%#x mastername:%s\n", + corestate, masterip, mastername); + + current_corestate = corestate; + return 0; +} + +static int core_nodechange(void *misc, char *nodename, uint32_t nodeip, uint8_t nodestate) +{ + struct node_info *ninfo; + + DEBUGLOG("CORE node change, name=%s, ip=%x, state = %d\n", nodename, nodeip, nodestate); + + /* If we don't get nodeip here, try a lookup by name */ + if (!nodeip) + csid_from_name((char *)&nodeip, nodename); + if (!nodeip) + return 0; + + ninfo = add_or_set_node(nodename, nodeip, nodestate); + if (!ninfo) + return 0; + + /* Check if we need to drop any expired locks */ + if (ninfo->state == NODE_DOWN) + { + drop_expired_locks(nodename); + } + + return 0; +} +static int core_error(void *misc, uint32_t err) +{ + DEBUGLOG("CORE error: %d\n", err); + // Not sure what happens here + return 0; +} + +/* LOCK callback routines */ +static int lock_login_reply(void *misc, uint32_t error, uint8_t which) +{ + DEBUGLOG("LOCK Got a Login reply. 
err:%d which:%d\n", + error, which); + + if (error) + exit(error); + + /* Drop any expired locks for us that might be hanging around */ + drop_expired_locks(NULL); + + /* Enable locking operations in other threads */ + if (lock_start_flag) + { + lock_start_flag = 0; + pthread_mutex_unlock(&lock_start_mutex); + } + + return 0; +} + +static int lock_lock_state(void *misc, uint8_t *key, uint16_t keylen, uint8_t state, uint32_t flags, uint32_t error, + uint8_t *LVB, uint16_t LVBlen) +{ + struct lock_wait *lwait; + + DEBUGLOG("LOCK lock state: %s, error = %d\n", key, error); + + lwait = hash_lookup(lock_hash, key); + if (!lwait) + { + DEBUGLOG("Can't find hash entry for resource %s\n", key); + return 0; + } + lwait->status = error; + pthread_mutex_lock(&lwait->mutex); + pthread_cond_signal(&lwait->cond); + pthread_mutex_unlock(&lwait->mutex); + + return 0; +} +static int lock_error(void *misc, uint32_t err) +{ + DEBUGLOG("LOCK error: %d\n", err); + // Not sure what happens here + return 0; +} + + +/* CORE callbacks */ +static lg_core_callbacks_t core_callbacks = { + .login_reply = core_login_reply, + .nodelist = core_nodelist, + .statechange = core_statechange, + .nodechange = core_nodechange, + .error = core_error, +}; + +/* LOCK callbacks */ +static lg_lockspace_callbacks_t lock_callbacks = { + .login_reply = lock_login_reply, + .lock_state = lock_lock_state, + .error = lock_error, +}; + +/* Allow tcp-comms to loop round the list of active nodes */ +int get_next_node_csid(void **context, char *csid) +{ + struct node_info *ninfo = NULL; + + /* First node */ + if (!*context) + { + *context = hash_get_first(node_hash); + } + else + { + *context = hash_get_next(node_hash, *context); + } + if (*context) + ninfo = hash_get_data(node_hash, *context); + + /* Find a node that is UP */ + while (*context && ninfo->state == NODE_DOWN) + { + *context = hash_get_next(node_hash, *context); + if (*context) + { + ninfo = hash_get_data(node_hash, *context); + } + } + + if (!*context || 
ninfo->state == NODE_DOWN) + { + return 0; + } + + memcpy(csid, hash_get_key(node_hash, *context), MAX_CSID_LEN); + return 1; +} + +int name_from_csid(char *csid, char *name) +{ + struct node_info *ninfo; + + ninfo = hash_lookup_binary(node_hash, csid, MAX_CSID_LEN); + if (!ninfo) + { + sprintf(name, "UNKNOWN [%d.%d.%d.%d]", + csid[0], csid[1], csid[2], csid[3]); + return -1; + } + + strcpy(name, ninfo->name); + return 0; +} + + +int csid_from_name(char *csid, char *name) +{ + struct hash_node *hn; + struct node_info *ninfo; + + hash_iterate(hn, node_hash) + { + ninfo = hash_get_data(node_hash, hn); + if (strcmp(ninfo->name, name) == 0) + { + memcpy(csid, hash_get_key(node_hash, hn), MAX_CSID_LEN); + return 0; + } + } + return -1; +} + +int get_num_nodes() +{ + DEBUGLOG("num_nodes = %d\n", num_nodes); + return num_nodes; +} + +/* Node is now known to be running a clvmd */ +void add_up_node(char *csid) +{ + struct node_info *ninfo; + + ninfo = hash_lookup_binary(node_hash, csid, MAX_CSID_LEN); + if (!ninfo) + return; + + ninfo->state = NODE_CLVMD; + return; + +} +/* Node is now known to be NOT running a clvmd */ +void add_down_node(char *csid) +{ + struct node_info *ninfo; + + ninfo = hash_lookup_binary(node_hash, csid, MAX_CSID_LEN); + if (!ninfo) + return; + + /* Only set it to UP if it was previously known to be + running clvmd - gulm may set it DOWN quite soon */ + if (ninfo->state == NODE_CLVMD) + ninfo->state = NODE_UP; + return; + +} + +/* Call a callback for each node, so the caller knows whether it's up or down */ +int cluster_do_node_callback(struct local_client *master_client, + void (*callback)(struct local_client *, char *csid, int node_up)) +{ + struct hash_node *hn; + struct node_info *ninfo; + + hash_iterate(hn, node_hash) + { + char csid[MAX_CSID_LEN]; + struct local_client *client; + + ninfo = hash_get_data(node_hash, hn); + memcpy(csid, hash_get_key(node_hash, hn), MAX_CSID_LEN); + + DEBUGLOG("down_callback. 
node %s, state = %d\n", ninfo->name, ninfo->state); + + client = hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN); + if (client) + callback(master_client, csid, ninfo->state == NODE_CLVMD); + } + return 0; +} + +/* Convert gulm error codes to unix errno numbers */ +static int gulm_to_errno(int gulm_ret) +{ + switch (gulm_ret) + { + case lg_err_TryFailed: + errno = EAGAIN; + break; + + case lg_err_AlreadyPend: + errno = EBUSY; + + /* More?? */ + default: + errno = EINVAL; + } + + return gulm_ret ? -1 : 0; +} + +/* Real locking */ +static int _lock_resource(char *resource, int mode, int flags, int *lockid) +{ + int status; + struct lock_wait lwait; + + /* Wait until the lock module is ready */ + if (lock_start_flag) + { + pthread_mutex_lock(&lock_start_mutex); + pthread_mutex_unlock(&lock_start_mutex); + } + + pthread_cond_init(&lwait.cond, NULL); + pthread_mutex_init(&lwait.mutex, NULL); + pthread_mutex_lock(&lwait.mutex); + + /* This needs to be converted from DLM/LVM2 value for GULM */ + if (flags == LCK_NONBLOCK) flags = lg_lock_flag_Try; + + hash_insert(lock_hash, resource, &lwait); + DEBUGLOG("lock_resource '%s', flags=%d, mode=%d\n", resource, flags, mode); + + status = lg_lock_state_req(gulm_if, resource, strlen(resource)+1, + mode, flags, NULL, 0); + if (status) + { + DEBUGLOG("lg_lock_state returned %d\n", status); + return status; + } + + /* Wait for it to complete */ + pthread_cond_wait(&lwait.cond, &lwait.mutex); + pthread_mutex_unlock(&lwait.mutex); + + hash_remove(lock_hash, resource); + DEBUGLOG("lock-resource returning %d\n", lwait.status); + + return gulm_to_errno(lwait.status); +} + + +static int _unlock_resource(char *resource, int lockid) +{ + int status; + struct lock_wait lwait; + + pthread_cond_init(&lwait.cond, NULL); + pthread_mutex_init(&lwait.mutex, NULL); + pthread_mutex_lock(&lwait.mutex); + + hash_insert(lock_hash, resource, &lwait); + + DEBUGLOG("unlock_resource %s\n", resource); + status = lg_lock_state_req(gulm_if, resource, 
strlen(resource)+1, + lg_lock_state_Unlock, 0, NULL, 0); + + if (status) + { + DEBUGLOG("lg_lock_state(unlock) returned %d\n", status); + return status; + } + + /* Wait for it to complete */ + + pthread_cond_wait(&lwait.cond, &lwait.mutex); + pthread_mutex_unlock(&lwait.mutex); + + hash_remove(lock_hash, resource); + + return gulm_to_errno(lwait.status); +} + + +/* These two locking functions MUST be called in a separate thread from + the clvmd main loop because they expect to be woken up by it. + + These are abstractions around the real locking functions (above) + as we need to emulate the DLM's EX/PW/CW interaction with GULM using + two locks. + To aid unlocking, we store the lock mode in the lockid (as GULM + doesn't use this). +*/ +int sync_lock(const char *resource, int mode, int flags, int *lockid) +{ + int status; + char lock1[strlen(resource)+3]; + char lock2[strlen(resource)+3]; + + snprintf(lock1, sizeof(lock1), "%s-1", resource); + snprintf(lock2, sizeof(lock2), "%s-2", resource); + + switch (mode) + { + case LCK_EXCL: + status = _lock_resource(lock1, lg_lock_state_Exclusive, flags, lockid); + if (status) + goto out; + + /* If we can't get this lock then bail out */ + status = _lock_resource(lock2, lg_lock_state_Exclusive, LCK_NONBLOCK, lockid); + if (status == lg_err_TryFailed) + { + _unlock_resource(lock1, *lockid); + status = -1; + errno = EAGAIN; + } + break; + + case LCK_READ: + status = _lock_resource(lock1, lg_lock_state_Shared, flags, lockid); + break; + + case LCK_WRITE: + status = _lock_resource(lock2, lg_lock_state_Exclusive, flags, lockid); + break; + + default: + status = -1; + errno = EINVAL; + break; + } + out: + *lockid = mode; + return status; +} + +int sync_unlock(const char *resource, int lockid) +{ + int status = 0; + char lock1[strlen(resource)+3]; + char lock2[strlen(resource)+3]; + + snprintf(lock1, sizeof(lock1), "%s-1", resource); + snprintf(lock2, sizeof(lock2), "%s-2", resource); + + /* The held lock mode is in the lock id */ + 
assert(lockid == LCK_EXCL || + lockid == LCK_READ || + lockid == LCK_WRITE); + + switch (lockid) + { + case LCK_EXCL: + status = _unlock_resource(lock1, lockid); + if (status) + goto out; + status = _unlock_resource(lock2, lockid); + break; + + case LCK_READ: + status = _unlock_resource(lock1, lockid); + break; + + case LCK_WRITE: + status = _unlock_resource(lock2, lockid); + break; + } + + out: + return status; +} + +int is_quorate() +{ + if (current_corestate == lg_core_Slave || + current_corestate == lg_core_Master || + current_corestate == lg_core_Client) + return 1; + else + return 0; +} + +/* Get all the cluster node names & IPs from CCS and + add them to our node list so we know who to talk to. + Called when we start up and if we get sent SIGHUP. +*/ +static int get_all_cluster_nodes() +{ + int ctree; + char *nodename; + int error; + + /* Open the config file */ + ctree = ccs_connect(); + if (ctree <= 0) + { + log_error("Error connecting to CCS"); + return -1; + } + + error = ccs_get(ctree, "//nodes/node/@name", &nodename); + while (nodename) + { + char nodeip[MAX_CSID_LEN]; + char *clvmflag; + char key[256]; + + sprintf(key, "//nodes/node[@name=\"%s\"]/clvm", nodename); + ccs_get(ctree, key, &clvmflag); + + if ((get_ip_address(nodename, nodeip) == 0) && atoi(clvmflag)) + { + struct node_info *ninfo; + + /* If it's not in the list, then add it */ + ninfo = hash_lookup_binary(node_hash, nodeip, MAX_CSID_LEN); + if (!ninfo) + { + ninfo = malloc(sizeof(struct node_info)); + if (!ninfo) + { + syslog(LOG_ERR, "Cannot alloc memory for node info\n"); + ccs_disconnect(ctree); + return -1; + } + strcpy(ninfo->name, nodename); + + ninfo->state = NODE_DOWN; + hash_insert_binary(node_hash, nodeip, MAX_CSID_LEN, ninfo); + } + } + else + { + DEBUGLOG("node %s has clvm disabled\n", nodename); + } + if (clvmflag) free(clvmflag); + free(nodename); + error = ccs_get(ctree, "//nodes/node/@name", &nodename); + } + + /* Finished with config file */ + ccs_disconnect(ctree); + + 
return 0; +} + +int gulm_fd(void) +{ + return lg_core_selector(gulm_if); +} diff --git a/daemons/clvmd/clvmd-gulm.h b/daemons/clvmd/clvmd-gulm.h new file mode 100644 index 000000000..07726faa3 --- /dev/null +++ b/daemons/clvmd/clvmd-gulm.h @@ -0,0 +1,9 @@ + + + +extern int get_next_node_csid(void **context, char *csid); +extern void add_down_node(char *csid); +extern int gulm_fd(void); +extern int get_ip_address(char *node, char *addr); +extern void tcp_remove_client(char *csid); +extern int alloc_client(int fd, char *csid, struct local_client **new_client); diff --git a/daemons/clvmd/clvmd.c b/daemons/clvmd/clvmd.c new file mode 100644 index 000000000..216eb1ea1 --- /dev/null +++ b/daemons/clvmd/clvmd.c @@ -0,0 +1,1693 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * CLVMD: Cluster LVM daemon + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "clvmd-comms.h" +#include "lvm-functions.h" +#include "clvm.h" +#include "clvmd.h" +#include "libdlm.h" +#include "system-lv.h" +#include "list.h" +#include "log.h" + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +/* The maximum size of a message that will fit into a packet. 
Anything bigger + than this is sent via the system LV */ +#define MAX_INLINE_MESSAGE (MAX_CLUSTER_MESSAGE-sizeof(struct clvm_header)) + +#define ISLOCAL_CSID(c) (memcmp(c, our_csid, MAX_CSID_LEN) == 0) + +/* Head of the fd list. Also contains + the cluster_socket details */ +static struct local_client local_client_head; + +static unsigned short global_xid = 0; /* Last transaction ID issued */ + +static char our_csid[MAX_CSID_LEN]; + +/* Structure of items on the LVM thread list */ +struct lvm_thread_cmd { + struct list list; + + struct local_client *client; + struct clvm_header *msg; + char csid[MAX_CSID_LEN]; + int remote; /* Flag */ + int msglen; + unsigned short xid; +}; +static pthread_t lvm_thread; +static pthread_mutex_t lvm_thread_mutex; +static pthread_cond_t lvm_thread_cond; +static struct list lvm_cmd_head; +static int quit = 0; + +/* Prototypes for code further down */ +static void sigusr2_handler(int sig); +static void sigterm_handler(int sig); +static void send_local_reply(struct local_client *client, int status, + int clientid); +static void free_reply(struct local_client *client); +static void send_version_message(void); +static void *pre_and_post_thread(void *arg); +static int send_message(void *buf, int msglen, char *csid, int fd, + const char *errtext); +static int read_from_local_sock(struct local_client *thisfd); +static int process_local_command(struct clvm_header *msg, int msglen, + struct local_client *client, + unsigned short xid); +static void process_remote_command(struct clvm_header *msg, int msglen, int fd, + char *csid); +static int process_reply(struct clvm_header *msg, int msglen, char *csid); +static int open_local_sock(void); +static struct local_client *find_client(int clientid); +static void main_loop(int local_sock, int cmd_timeout); +static void be_daemon(void); +static int check_all_clvmds_running(struct local_client *client); +static int local_rendezvous_callback(struct local_client *thisfd, char *buf, + int len, char *csid, + 
struct local_client **new_client); +static void *lvm_thread_fn(void *); +static int add_to_lvmqueue(struct local_client *client, struct clvm_header *msg, + int msglen, char *csid); +static int distribute_command(struct local_client *thisfd); +static void hton_clvm(struct clvm_header *hdr); +static void ntoh_clvm(struct clvm_header *hdr); +static void add_reply_to_list(struct local_client *client, int status, + char *csid, const char *buf, int len); + +static void usage(char *prog, FILE *file) +{ + fprintf(file, "Usage:\n"); + fprintf(file, "%s [Vhd]\n", prog); + fprintf(file, "\n"); + fprintf(file, " -V Show version of clvmd\n"); + fprintf(file, " -h Show this help information\n"); + fprintf(file, " -d Don't fork, run in the foreground\n"); + fprintf(file, " -t Command timeout (default 60 seconds)\n"); + fprintf(file, "\n"); +} + +int main(int argc, char *argv[]) +{ + int local_sock; + struct local_client *newfd; + struct utsname nodeinfo; + signed char opt; + int debug = 0; + int cmd_timeout = DEFAULT_CMD_TIMEOUT; + sigset_t ss; + + /* Deal with command-line arguments */ + opterr = 0; + optind = 0; + while ((opt = getopt(argc, argv, "?vVhdt:")) != EOF) { + switch (opt) { + case 'h': + usage(argv[0], stdout); + exit(0); + + case '?': + usage(argv[0], stderr); + exit(0); + + case 'd': + debug++; + break; + + case 't': + cmd_timeout = atoi(optarg); + if (!cmd_timeout) { + fprintf(stderr, "command timeout is invalid\n"); + usage(argv[0], stderr); + exit(1); + } + break; + + case 'V': + printf("\nCluster LVM Daemon version %d.%d.%d\n\n", + CLVMD_MAJOR_VERSION, CLVMD_MINOR_VERSION, + CLVMD_PATCH_VERSION); + exit(1); + break; + + } + } + + /* Fork into the background (unless requested not to) */ + if (!debug) { + be_daemon(); + } + + DEBUGLOG("CLVMD started\n"); + + /* Open the Unix socket we listen for commands on. 
+ We do this before opening the cluster socket so that + potential clients will block rather than error if we are running + but the cluster is not ready yet */ + local_sock = open_local_sock(); + if (local_sock < 0) + exit(2); + + /* Set up signal handlers, USR1 is for cluster change notifications (in cman) + USR2 causes child threads to exit. + PIPE should be ignored */ + signal(SIGUSR2, sigusr2_handler); + signal(SIGTERM, sigterm_handler); + signal(SIGINT, sigterm_handler); + signal(SIGPIPE, SIG_IGN); + + /* Block SIGUSR2 in the main process */ + sigemptyset(&ss); + sigaddset(&ss, SIGUSR2); + sigprocmask(SIG_BLOCK, &ss, NULL); + + /* Initialise the LVM thread variables */ + list_init(&lvm_cmd_head); + pthread_mutex_init(&lvm_thread_mutex, NULL); + pthread_cond_init(&lvm_thread_cond, NULL); + init_lvhash(); + + /* Start the cluster interface */ + if (init_cluster()) { + DEBUGLOG("Can't initialise cluster interface\n"); + log_error("Can't initialise cluster interface\n"); + exit(5); + } + DEBUGLOG("Cluster ready, doing some more initialisation\n"); + + /* Save our CSID */ + uname(&nodeinfo); + get_our_csid(our_csid); + + /* Initialise the FD list head */ + local_client_head.fd = get_main_cluster_fd(); + local_client_head.type = CLUSTER_MAIN_SOCK; + local_client_head.callback = cluster_fd_callback; + + /* Add the local socket to the list */ + newfd = malloc(sizeof(struct local_client)); + if (!newfd) + exit(2); + + newfd->fd = local_sock; + newfd->type = LOCAL_RENDEZVOUS; + newfd->callback = local_rendezvous_callback; + newfd->next = local_client_head.next; + local_client_head.next = newfd; + + /* This needs to be started after cluster initialisation + as it may need to take out locks */ + DEBUGLOG("starting LVM thread\n"); + pthread_create(&lvm_thread, NULL, lvm_thread_fn, nodeinfo.nodename); + +#ifndef USE_GULM + /* Tell the rest of the cluster our version number */ + /* CMAN can do this immediately, gulm needs to wait until + the core initialisation has finished 
and the node list + has been gathered */ + send_version_message(); +#endif + + DEBUGLOG("clvmd ready for work\n"); + + /* Do some work */ + main_loop(local_sock, cmd_timeout); + + return 0; +} + +/* Called when the GuLM cluster layer has completed initialisation. + We send the version message */ +void clvmd_cluster_init_completed() +{ + send_version_message(); +} + +/* Data on a connected socket */ +static int local_sock_callback(struct local_client *thisfd, char *buf, int len, + char *csid, struct local_client **new_client) +{ + *new_client = NULL; + return read_from_local_sock(thisfd); +} + +/* Data on a connected socket */ +static int local_rendezvous_callback(struct local_client *thisfd, char *buf, + int len, char *csid, + struct local_client **new_client) +{ + /* Someone connected to our local socket, accept it. */ + + struct sockaddr_un socka; + struct local_client *newfd; + socklen_t sl = sizeof(socka); + int client_fd = accept(thisfd->fd, (struct sockaddr *) &socka, &sl); + + if (client_fd >= 0) { + newfd = malloc(sizeof(struct local_client)); + if (!newfd) { + close(client_fd); + return 1; + } + newfd->fd = client_fd; + newfd->type = LOCAL_SOCK; + newfd->xid = 0; + newfd->callback = local_sock_callback; + newfd->bits.localsock.replies = NULL; + newfd->bits.localsock.expected_replies = 0; + newfd->bits.localsock.cmd = NULL; + newfd->bits.localsock.in_progress = FALSE; + newfd->bits.localsock.sent_out = FALSE; + newfd->bits.localsock.threadid = 0; + newfd->bits.localsock.finished = 0; + newfd->bits.localsock.pipe_client = NULL; + newfd->bits.localsock.all_success = 1; + DEBUGLOG("Got new connection on fd %d\n", newfd->fd); + *new_client = newfd; + } + return 1; +} + +static int local_pipe_callback(struct local_client *thisfd, char *buf, + int maxlen, char *csid, + struct local_client **new_client) +{ + int len; + char buffer[PIPE_BUF]; + struct local_client *sock_client = thisfd->bits.pipe.client; + int status = -1; /* in error by default */ + + len = 
read(thisfd->fd, buffer, sizeof(int)); + + DEBUGLOG("read on PIPE %d: %d bytes: status: %d\n", + thisfd->fd, len, *(int *) buffer); + + if (len == sizeof(int)) { + status = *(int *) buffer; + } + + /* EOF on pipe or an error, close it */ + if (len <= 0) { + int jstat; + close(thisfd->fd); + + /* Clear out the cross-link */ + if (thisfd->bits.pipe.client != NULL) + thisfd->bits.pipe.client->bits.localsock.pipe_client = + NULL; + + /* Reap child thread */ + if (thisfd->bits.pipe.threadid) { + jstat = + pthread_join(thisfd->bits.pipe.threadid, + (void **) &status); + thisfd->bits.pipe.threadid = 0; + if (thisfd->bits.pipe.client != NULL) + thisfd->bits.pipe.client->bits.localsock. + threadid = 0; + } + return -1; + } else { + DEBUGLOG("background routine status was %d, sock_client=%p\n", + status, sock_client); + /* But has the client gone away ?? */ + if (sock_client == NULL) { + DEBUGLOG + ("Got PIPE response for dead client, ignoring it\n"); + } else { + /* If error then just return that code */ + if (status) + send_local_reply(sock_client, status, + sock_client->fd); + else { + if (sock_client->bits.localsock.state == + POST_COMMAND) { + send_local_reply(sock_client, 0, + sock_client->fd); + } else // PRE_COMMAND finished. + { + if ( + (status = + distribute_command(sock_client)) != + 0) send_local_reply(sock_client, + EFBIG, + sock_client-> + fd); + } + } + } + } + return len; +} + +/* If a noed is up, look for it in the reply array, if it's not there then + add one with "ETIMEDOUT". + NOTE: This won't race with real replies because they happen in the same thread. 
+*/ +static void timedout_callback(struct local_client *client, char *csid, + int node_up) +{ + if (node_up) { + struct node_reply *reply; + char nodename[MAX_CLUSTER_MEMBER_NAME_LEN]; + + name_from_csid(csid, nodename); + DEBUGLOG("PJC: checking for a reply from %s\n", nodename); + pthread_mutex_lock(&client->bits.localsock.reply_mutex); + + reply = client->bits.localsock.replies; + while (reply && strcmp(reply->node, nodename) != 0) { + reply = reply->next; + } + + pthread_mutex_unlock(&client->bits.localsock.reply_mutex); + + if (!reply) { + DEBUGLOG("PJC: node %s timed-out\n", nodename); + add_reply_to_list(client, ETIMEDOUT, csid, + "Command timed out", 18); + } + } +} + +/* Called when the request has timed out on at least one node. We fill in + the remaining node entries with ETIMEDOUT and return. + + By the time we get here the node that caused + the timeout could have gone down, in which case we will never get the expected + number of replies that triggers the post command so we need to do it here +*/ +static void request_timed_out(struct local_client *client) +{ + DEBUGLOG("Request timed-out. 
padding\n"); + cluster_do_node_callback(client, timedout_callback); + + if (client->bits.localsock.num_replies != + client->bits.localsock.expected_replies) { + /* Post-process the command */ + if (client->bits.localsock.threadid) { + pthread_mutex_lock(&client->bits.localsock.mutex); + client->bits.localsock.state = POST_COMMAND; + pthread_cond_signal(&client->bits.localsock.cond); + pthread_mutex_unlock(&client->bits.localsock.mutex); + } + } +} + +/* This is where the real work happens */ +static void main_loop(int local_sock, int cmd_timeout) +{ + DEBUGLOG("Using timeout of %d seconds\n", cmd_timeout); + + /* Main loop */ + while (!quit) { + fd_set in; + int select_status; + struct local_client *thisfd; + struct timeval tv = { cmd_timeout, 0 }; + int quorate = is_quorate(); + + /* Wait on the cluster FD and all local sockets/pipes */ + FD_ZERO(&in); + for (thisfd = &local_client_head; thisfd != NULL; + thisfd = thisfd->next) { + /* if the cluster is not quorate then don't listen for new requests */ + if ((thisfd->type != LOCAL_RENDEZVOUS && + thisfd->type != LOCAL_SOCK) || quorate) + FD_SET(thisfd->fd, &in); + } + + if ((select_status = select(FD_SETSIZE, &in, NULL, NULL, &tv)) > 0) { + struct local_client *lastfd = NULL; + struct clvm_header *inheader; + char csid[MAX_CSID_LEN]; + char buf[MAX_CLUSTER_MESSAGE]; + + for (thisfd = &local_client_head; thisfd != NULL; + thisfd = thisfd->next) { + if (FD_ISSET(thisfd->fd, &in)) { + struct local_client *newfd; + int ret; + + /* Do callback */ + ret = + thisfd->callback(thisfd, buf, + sizeof(buf), csid, + &newfd); + /* Ignore EAGAIN */ + if (ret < 0 && (errno == EAGAIN || + errno == EINTR)) continue; + + /* Got error or EOF: Remove it from the list safely */ + if (ret <= 0) { + struct local_client *free_fd; + int type = thisfd->type; + + /* If the cluster socket shuts down, so do we */ + if (type == CLUSTER_MAIN_SOCK || + type == CLUSTER_INTERNAL) + goto closedown; + + DEBUGLOG + ("ret == %d, errno = %d. 
removing client\n", + ret, errno); + lastfd->next = thisfd->next; + free_fd = thisfd; + thisfd = lastfd; + free(free_fd); + break; + } + + /* New client...simply add it to the list */ + if (newfd) { + newfd->next = thisfd->next; + thisfd->next = newfd; + break; + } + + switch (thisfd->type) { + case CLUSTER_MAIN_SOCK: + case CLUSTER_DATA_SOCK: + inheader = + (struct clvm_header *) buf; + ntoh_clvm(inheader); /* Byteswap fields */ + if (inheader->cmd == + CLVMD_CMD_REPLY) + process_reply + (inheader, ret, + csid); + else + add_to_lvmqueue(thisfd, + inheader, + ret, + csid); + break; + + /* All the work for these is done in the callback + rightly or wrongly... */ + case LOCAL_RENDEZVOUS: + case LOCAL_SOCK: + case THREAD_PIPE: + case CLUSTER_INTERNAL: + break; + } + } + lastfd = thisfd; + } + } + + /* Select timed out. Check for clients that have been waiting too long for a response */ + if (select_status == 0) { + time_t the_time = time(NULL); + + for (thisfd = &local_client_head; thisfd != NULL; + thisfd = thisfd->next) { + if (thisfd->type == LOCAL_SOCK + && thisfd->bits.localsock.sent_out + && thisfd->bits.localsock.sent_time + + cmd_timeout < the_time + && thisfd->bits.localsock. 
+ expected_replies != + thisfd->bits.localsock.num_replies) { + /* Send timed out message + replies we already have */ + DEBUGLOG + ("Request timed-out (send: %ld, now: %ld)\n", + thisfd->bits.localsock.sent_time, + the_time); + + thisfd->bits.localsock.all_success = 0; + + request_timed_out(thisfd); + } + } + } + if (select_status < 0) { + if (errno == EINTR) + continue; + +#ifdef DEBUG + perror("select error"); + exit(-1); +#endif + } + } + + closedown: + cluster_closedown(); + close(local_sock); +} + +/* Fork into the background and detach from our parent process */ +static void be_daemon() +{ + pid_t pid; + int devnull = open("/dev/null", O_RDWR); + if (devnull == -1) { + perror("Can't open /dev/null"); + exit(3); + } + + switch (pid = fork()) { + case -1: + perror("clvmd: can't fork"); + exit(2); + + case 0: /* child */ + break; + + default: /* Parent */ + exit(0); + } + + /* Detach ourself from the calling environment */ + if (close(0) || close(1) || close(2)) { + perror("Error closing terminal FDs"); + exit(4); + } + setsid(); + + if (dup2(devnull, 0) < 0 || dup2(devnull, 1) < 0 + || dup2(devnull, 2) < 0) { + perror("Error setting terminal FDs to /dev/null"); + log_error("Error setting terminal FDs to /dev/null: %m"); + exit(5); + } + if (chdir("/")) { + log_error("Error setting current directory to /: %m"); + exit(6); + } + +} + +/* Called when we have a read from the local socket. 
+ was in the main loop but it's grown up and is a big girl now */ +static int read_from_local_sock(struct local_client *thisfd) +{ + int len; + int argslen; + int missing_len; + char buffer[PIPE_BUF]; + + len = read(thisfd->fd, buffer, sizeof(buffer)); + + DEBUGLOG("Read on local socket %d, len = %d\n", thisfd->fd, len); + + /* EOF or error on socket */ + if (len <= 0) { + int *status; + int jstat; + + DEBUGLOG("EOF on local socket: inprogress=%d\n", + thisfd->bits.localsock.in_progress); + + thisfd->bits.localsock.finished = 1; + + /* If the client went away in mid command then tidy up */ + if (thisfd->bits.localsock.in_progress) { + pthread_mutex_lock(&thisfd->bits.localsock.mutex); + thisfd->bits.localsock.state = POST_COMMAND; + pthread_cond_signal(&thisfd->bits.localsock.cond); + pthread_mutex_unlock(&thisfd->bits.localsock.mutex); + + /* Free any unsent buffers */ + free_reply(thisfd); + } + + /* Kill the subthread & free resources */ + if (thisfd->bits.localsock.threadid) { + DEBUGLOG("Waiting for child thread\n"); + pthread_mutex_lock(&thisfd->bits.localsock.mutex); + thisfd->bits.localsock.state = POST_COMMAND; + pthread_cond_signal(&thisfd->bits.localsock.cond); + pthread_mutex_unlock(&thisfd->bits.localsock.mutex); + pthread_kill(thisfd->bits.localsock.threadid, SIGUSR2); + + jstat = + pthread_join(thisfd->bits.localsock.threadid, + (void **) &status); + DEBUGLOG("Joined child thread\n"); + + thisfd->bits.localsock.threadid = 0; + pthread_cond_destroy(&thisfd->bits.localsock.cond); + pthread_mutex_destroy(&thisfd->bits.localsock.mutex); + + /* Remove the pipe client */ + if (thisfd->bits.localsock.pipe_client != NULL) { + struct local_client *newfd; + struct local_client *lastfd = NULL; + struct local_client *free_fd = NULL; + + close(thisfd->bits.localsock.pipe_client->fd); /* Close pipe */ + close(thisfd->bits.localsock.pipe); + + /* Remove pipe client */ + for (newfd = &local_client_head; newfd != NULL; + newfd = newfd->next) { + if 
(thisfd->bits.localsock. + pipe_client == newfd) { + thisfd->bits.localsock. + pipe_client = NULL; + + lastfd->next = newfd->next; + free_fd = newfd; + newfd->next = lastfd; + free(free_fd); + break; + } + lastfd = newfd; + } + } + } + + /* Free the command buffer */ + if (thisfd->bits.localsock.cmd) + free(thisfd->bits.localsock.cmd); + + /* Clear out the cross-link */ + if (thisfd->bits.localsock.pipe_client != NULL) + thisfd->bits.localsock.pipe_client->bits.pipe.client = + NULL; + + close(thisfd->fd); + return 0; + } else { + int comms_pipe[2]; + struct local_client *newfd; + char csid[MAX_CSID_LEN]; + struct clvm_header *inheader; + + inheader = (struct clvm_header *) buffer; + + /* Fill in the client ID */ + inheader->clientid = htonl(thisfd->fd); + + /* If we are already busy then return an error */ + if (thisfd->bits.localsock.in_progress) { + struct clvm_header reply; + reply.cmd = CLVMD_CMD_REPLY; + reply.status = -EBUSY; + reply.arglen = 0; + reply.flags = 0; + send_message(&reply, sizeof(reply), our_csid, + thisfd->fd, + "Error sending EBUSY reply to local user"); + return len; + } + + /* Free any old buffer space */ + if (thisfd->bits.localsock.cmd) + free(thisfd->bits.localsock.cmd); + + /* See if we have the whole message */ + argslen = + len - strlen(inheader->node) - sizeof(struct clvm_header); + missing_len = inheader->arglen - argslen; + + /* Save the message */ + thisfd->bits.localsock.cmd = malloc(len + missing_len); + if (!thisfd->bits.localsock.cmd) { + struct clvm_header reply; + reply.cmd = CLVMD_CMD_REPLY; + reply.status = -ENOMEM; + reply.arglen = 0; + reply.flags = 0; + send_message(&reply, sizeof(reply), our_csid, + thisfd->fd, + "Error sending ENOMEM reply to local user"); + return 0; + } + memcpy(thisfd->bits.localsock.cmd, buffer, len); + thisfd->bits.localsock.cmd_len = len + missing_len; + inheader = (struct clvm_header *) thisfd->bits.localsock.cmd; + + /* If we don't have the full message then read the rest now */ + if 
(missing_len) { + char *argptr = + inheader->node + strlen(inheader->node) + 1; + + while (missing_len > 0 && len >= 0) { + DEBUGLOG + ("got %d bytes, need another %d (total %d)\n", + argslen, missing_len, inheader->arglen); + len = + read(thisfd->fd, argptr + argslen, + missing_len); + if (len >= 0) { + missing_len -= len; + argslen += len; + } + } + } + + /* Only run the command if all the cluster nodes are running CLVMD */ + if (((inheader->flags & CLVMD_FLAG_LOCAL) == 0) && + (check_all_clvmds_running(thisfd) == -1)) { + thisfd->bits.localsock.expected_replies = 0; + thisfd->bits.localsock.num_replies = 0; + send_local_reply(thisfd, EHOSTDOWN, thisfd->fd); + return len; + } + + /* Check the node name for validity */ + if (inheader->node[0] && csid_from_name(csid, inheader->node)) { + /* Error, node is not in the cluster */ + struct clvm_header reply; + DEBUGLOG("Unknown node: '%s'\n", inheader->node); + + reply.cmd = CLVMD_CMD_REPLY; + reply.status = -ENOENT; + reply.flags = 0; + reply.arglen = 0; + send_message(&reply, sizeof(reply), our_csid, + thisfd->fd, + "Error sending ENOENT reply to local user"); + thisfd->bits.localsock.expected_replies = 0; + thisfd->bits.localsock.num_replies = 0; + thisfd->bits.localsock.in_progress = FALSE; + thisfd->bits.localsock.sent_out = FALSE; + return len; + } + + /* If we already have a subthread then just signal it to start */ + if (thisfd->bits.localsock.threadid) { + pthread_mutex_lock(&thisfd->bits.localsock.mutex); + thisfd->bits.localsock.state = PRE_COMMAND; + pthread_cond_signal(&thisfd->bits.localsock.cond); + pthread_mutex_unlock(&thisfd->bits.localsock.mutex); + return len; + } + + /* Create a pipe and add the reading end to our FD list */ + pipe(comms_pipe); + newfd = malloc(sizeof(struct local_client)); + if (!newfd) { + struct clvm_header reply; + close(comms_pipe[0]); + close(comms_pipe[1]); + + reply.cmd = CLVMD_CMD_REPLY; + reply.status = -ENOMEM; + reply.arglen = 0; + reply.flags = 0; + 
send_message(&reply, sizeof(reply), our_csid, + thisfd->fd, + "Error sending ENOMEM reply to local user"); + return len; + } + DEBUGLOG("creating pipe, [%d, %d]\n", comms_pipe[0], + comms_pipe[1]); + newfd->fd = comms_pipe[0]; + newfd->type = THREAD_PIPE; + newfd->callback = local_pipe_callback; + newfd->next = thisfd->next; + newfd->bits.pipe.client = thisfd; + newfd->bits.pipe.threadid = 0; + thisfd->next = newfd; + + /* Store a cross link to the pipe */ + thisfd->bits.localsock.pipe_client = newfd; + + thisfd->bits.localsock.pipe = comms_pipe[1]; + + /* Initialise and lock the mutex so the subthread will wait after + finishing the PRE routine */ + pthread_mutex_init(&thisfd->bits.localsock.mutex, NULL); + pthread_cond_init(&thisfd->bits.localsock.cond, NULL); + pthread_mutex_init(&thisfd->bits.localsock.reply_mutex, NULL); + + /* Make sure the thread has a copy of it's own ID */ + newfd->bits.pipe.threadid = thisfd->bits.localsock.threadid; + + /* Run the pre routine */ + thisfd->bits.localsock.in_progress = TRUE; + thisfd->bits.localsock.state = PRE_COMMAND; + pthread_create(&thisfd->bits.localsock.threadid, NULL, + pre_and_post_thread, thisfd); + } + return len; +} + +/* Add a file descriptor from the cluster or comms interface to + our list of FDs for select +*/ +int add_client(struct local_client *new_client) +{ + new_client->next = local_client_head.next; + local_client_head.next = new_client; + + return 0; +} + + +/* + * Send a long message using the System LV + */ +static int send_long_message(struct local_client *thisfd, struct clvm_header *inheader, int len) +{ + struct clvm_header new_header; + int status; + + DEBUGLOG("Long message: being sent via system LV:\n"); + + /* Use System LV */ + status = system_lv_write_data((char *)inheader, len); + if (status < 0) + return errno; + + /* Send message indicating System-LV is being used */ + memcpy(&new_header, inheader, sizeof(new_header)); + new_header.flags |= CLVMD_FLAG_SYSTEMLV; + new_header.xid = 
thisfd->xid; + + return send_message(&new_header, sizeof(new_header), NULL, -1, + "Error forwarding long message to cluster"); +} + +/* Called when the pre-command has completed successfully - we + now execute the real command on all the requested nodes */ +static int distribute_command(struct local_client *thisfd) +{ + struct clvm_header *inheader = + (struct clvm_header *) thisfd->bits.localsock.cmd; + int len = thisfd->bits.localsock.cmd_len; + + thisfd->xid = global_xid++; + DEBUGLOG("distribute command: XID = %d\n", thisfd->xid); + + /* Forward it to other nodes in the cluster if needed */ + if (!(inheader->flags & CLVMD_FLAG_LOCAL)) { + /* if node is empty then do it on the whole cluster */ + if (inheader->node[0] == '\0') { + thisfd->bits.localsock.expected_replies = + get_num_nodes(); + thisfd->bits.localsock.num_replies = 0; + thisfd->bits.localsock.sent_time = time(NULL); + thisfd->bits.localsock.in_progress = TRUE; + thisfd->bits.localsock.sent_out = TRUE; + + /* Do it here first */ + add_to_lvmqueue(thisfd, inheader, len, NULL); + + DEBUGLOG("Sending message to all cluster nodes\n"); + if (len > MAX_INLINE_MESSAGE) { + send_long_message(thisfd, inheader, len ); + } else { + inheader->xid = thisfd->xid; + send_message(inheader, len, NULL, -1, + "Error forwarding message to cluster"); + } + } else { + /* Do it on a single node */ + char csid[MAX_CSID_LEN]; + + if (csid_from_name(csid, inheader->node)) { + /* This has already been checked so should not happen */ + return 0; + } else { + /* OK, found a node... */ + thisfd->bits.localsock.expected_replies = 1; + thisfd->bits.localsock.num_replies = 0; + thisfd->bits.localsock.in_progress = TRUE; + + /* Are we the requested node ?? 
*/ + if (memcmp(csid, our_csid, MAX_CSID_LEN) == 0) { + DEBUGLOG("Doing command on local node only\n"); + add_to_lvmqueue(thisfd, inheader, len, NULL); + } else { + DEBUGLOG("Sending message to single node: %s\n", + inheader->node); + if (len > MAX_INLINE_MESSAGE) { + send_long_message(thisfd, inheader, len ); + } else { + inheader->xid = thisfd->xid; + send_message(inheader, len, + csid, -1, + "Error forwarding message to cluster node"); + } + } + } + } + } else { + /* Local explicitly requested, ignore nodes */ + thisfd->bits.localsock.in_progress = TRUE; + thisfd->bits.localsock.expected_replies = 1; + thisfd->bits.localsock.num_replies = 0; + add_to_lvmqueue(thisfd, inheader, len, NULL); + } + return 0; +} + +/* Process a command from a remote node and return the result */ +void process_remote_command(struct clvm_header *msg, int msglen, int fd, + char *csid) +{ + char *replyargs; + char nodename[MAX_CLUSTER_MEMBER_NAME_LEN]; + int replylen = 0; + int buflen = MAX_CLUSTER_MESSAGE - sizeof(struct clvm_header) - 1; + int status; + int msg_malloced = 0; + + /* Get the node name as we /may/ need it later */ + name_from_csid(csid, nodename); + + DEBUGLOG("process_remote_command %d for clientid 0x%x on node %s\n", + msg->cmd, msg->clientid, nodename); + + /* Is the data to be found in the system LV ? 
*/ + if (msg->flags & CLVMD_FLAG_SYSTEMLV) { + struct clvm_header *newmsg; + + DEBUGLOG("Reading message from system LV\n"); + newmsg = + (struct clvm_header *) malloc(msg->arglen + + sizeof(struct clvm_header)); + if (newmsg) { + if (system_lv_read_data + (nodename, (char *) newmsg, + (size_t *) &msglen) == 0) { + msg = newmsg; + msg_malloced = 1; + } else { + struct clvm_header head; + DEBUGLOG("System LV read failed\n"); + + /* Return a failure response */ + head.cmd = CLVMD_CMD_REPLY; + head.status = -EFBIG; + head.flags = 0; + head.clientid = msg->clientid; + head.arglen = 0; + head.node[0] = '\0'; + send_message(&head, sizeof(struct clvm_header), + csid, fd, + "Error sending ENOMEM command reply"); + return; + } + } else { + struct clvm_header head; + DEBUGLOG + ("Error attempting to malloc %d bytes for system LV read\n", + msg->arglen); + /* Return a failure response */ + head.cmd = CLVMD_CMD_REPLY; + head.status = -ENOMEM; + head.flags = 0; + head.clientid = msg->clientid; + head.arglen = 0; + head.node[0] = '\0'; + send_message(&head, sizeof(struct clvm_header), csid, + fd, "Error sending ENOMEM command reply"); + return; + } + } + + /* Check for GOAWAY and sulk */ + if (msg->cmd == CLVMD_CMD_GOAWAY) { + + DEBUGLOG("Told to go away by %s\n", nodename); + log_error("Told to go away by %s\n", nodename); + exit(99); + } + + /* Version check is internal - don't bother exposing it in + clvmd-command.c */ + if (msg->cmd == CLVMD_CMD_VERSION) { + int *version_nums = (int *) msg->args; + char node[256]; + name_from_csid(csid, node); + DEBUGLOG("Remote node %s is version %d.%d.%d\n", + node, + ntohl(version_nums[0]), + ntohl(version_nums[1]), ntohl(version_nums[2])); + + if (ntohl(version_nums[0]) != CLVMD_MAJOR_VERSION) { + struct clvm_header byebyemsg; + DEBUGLOG + ("Telling node %s to go away because of incompatible version number\n", + node); + log_notice + ("Telling node %s to go away because of incompatible version number %d.%d.%d\n", + node, 
ntohl(version_nums[0]), + ntohl(version_nums[1]), ntohl(version_nums[2])); + + byebyemsg.cmd = CLVMD_CMD_GOAWAY; + byebyemsg.status = 0; + byebyemsg.flags = 0; + byebyemsg.arglen = 0; + byebyemsg.clientid = 0; + cluster_send_message(&byebyemsg, sizeof(byebyemsg), + our_csid, + "Error Sending GOAWAY message"); + } else { + add_up_node(csid); + } + return; + } + + /* Allocate a default reply buffer */ + replyargs = malloc(MAX_CLUSTER_MESSAGE - sizeof(struct clvm_header)); + + if (replyargs != NULL) { + /* Run the command */ + status = + do_command(NULL, msg, msglen, &replyargs, buflen, + &replylen); + } else { + status = -ENOMEM; + } + + /* If it wasn't a reply, then reply */ + if (msg->cmd != CLVMD_CMD_REPLY) { + char *aggreply; + + aggreply = + realloc(replyargs, replylen + sizeof(struct clvm_header)); + if (aggreply) { + struct clvm_header *agghead = + (struct clvm_header *) aggreply; + + replyargs = aggreply; + /* Move it up so there's room for a header in front of the data */ + memmove(aggreply + offsetof(struct clvm_header, args), + replyargs, replylen); + + agghead->xid = msg->xid; + + /* Use the system LV ? */ + if (replylen > MAX_INLINE_MESSAGE) { + agghead->cmd = CLVMD_CMD_REPLY; + agghead->status = status; + agghead->flags = CLVMD_FLAG_SYSTEMLV; + agghead->clientid = msg->clientid; + agghead->arglen = replylen; + agghead->node[0] = '\0'; + + /* If System LV operation failed then report it as EFBIG but only do it + if the data buffer has something in it. 
*/ + if (system_lv_write_data + (aggreply, + replylen + sizeof(struct clvm_header)) < 0 + && replylen > 0) + agghead->status = -EFBIG; + + send_message(agghead, + sizeof(struct clvm_header), csid, + fd, + "Error sending long command reply"); + + } else { + agghead->cmd = CLVMD_CMD_REPLY; + agghead->status = status; + agghead->flags = 0; + agghead->clientid = msg->clientid; + agghead->arglen = replylen; + agghead->node[0] = '\0'; + send_message(aggreply, + sizeof(struct clvm_header) + + replylen + 2, csid, fd, + "Error sending command reply"); + } + } else { + struct clvm_header head; + + DEBUGLOG("Error attempting to realloc return buffer\n"); + /* Return a failure response */ + head.cmd = CLVMD_CMD_REPLY; + head.status = -ENOMEM; + head.flags = 0; + head.clientid = msg->clientid; + head.arglen = 0; + head.node[0] = '\0'; + send_message(&head, sizeof(struct clvm_header), csid, + fd, "Error sending ENOMEM command reply"); + return; + } + } + + /* Free buffer if it was malloced */ + if (msg_malloced) { + free(msg); + } + free(replyargs); +} + +/* Add a reply to a command to the list of replies for this client. 
+ If we have got a full set then send them to the waiting client down the local + socket */ +static void add_reply_to_list(struct local_client *client, int status, + char *csid, const char *buf, int len) +{ + struct node_reply *reply; + + pthread_mutex_lock(&client->bits.localsock.reply_mutex); + + /* Add it to the list of replies */ + reply = malloc(sizeof(struct node_reply)); + if (reply) { + reply->status = status; + name_from_csid(csid, reply->node); + DEBUGLOG("Reply from node %s: %d bytes\n", reply->node, len); + + if (len > 0) { + reply->replymsg = malloc(len); + if (!reply->replymsg) { + reply->status = -ENOMEM; + } else { + memcpy(reply->replymsg, buf, len); + } + } else { + reply->replymsg = NULL; + } + /* Hook it onto the reply chain */ + reply->next = client->bits.localsock.replies; + client->bits.localsock.replies = reply; + } else { + /* It's all gone horribly wrong... */ + pthread_mutex_unlock(&client->bits.localsock.reply_mutex); + send_local_reply(client, ENOMEM, client->fd); + return; + } + DEBUGLOG("Got %d replies, expecting: %d\n", + client->bits.localsock.num_replies + 1, + client->bits.localsock.expected_replies); + + /* If we have the whole lot then do the post-process */ + if (++client->bits.localsock.num_replies == + client->bits.localsock.expected_replies) { + /* Post-process the command */ + if (client->bits.localsock.threadid) { + pthread_mutex_lock(&client->bits.localsock.mutex); + client->bits.localsock.state = POST_COMMAND; + pthread_cond_signal(&client->bits.localsock.cond); + pthread_mutex_unlock(&client->bits.localsock.mutex); + } + } + pthread_mutex_unlock(&client->bits.localsock.reply_mutex); +} + +/* This is the thread that runs the PRE and post commands for a particular connection */ +static void *pre_and_post_thread(void *arg) +{ + struct local_client *client = (struct local_client *) arg; + int status; + sigset_t ss; + int pipe_fd = client->bits.localsock.pipe; + + DEBUGLOG("in sub thread: client = %p\n", client); + + /* 
Ignore SIGUSR1 (handled by master process) but enable + SIGUSR2 (kills subthreads) */ + sigemptyset(&ss); + sigaddset(&ss, SIGUSR1); + pthread_sigmask(SIG_BLOCK, &ss, NULL); + + sigdelset(&ss, SIGUSR1); + sigaddset(&ss, SIGUSR2); + pthread_sigmask(SIG_UNBLOCK, &ss, NULL); + + /* Loop around doing PRE and POST functions until the client goes away */ + while (!client->bits.localsock.finished) { + /* Execute the code */ + status = do_pre_command(client); + + if (status) + client->bits.localsock.all_success = 0; + + DEBUGLOG("Writing status %d down pipe %d\n", status, pipe_fd); + /* Tell the parent process we have finished this bit */ + write(pipe_fd, &status, sizeof(int)); + + /* We may need to wait for the condition variable before running the post command */ + pthread_mutex_lock(&client->bits.localsock.mutex); + DEBUGLOG("Waiting to do post command - state = %d\n", + client->bits.localsock.state); + + if (client->bits.localsock.state != POST_COMMAND) { + pthread_cond_wait(&client->bits.localsock.cond, + &client->bits.localsock.mutex); + } + pthread_mutex_unlock(&client->bits.localsock.mutex); + + DEBUGLOG("Got post command condition...\n"); + + do_post_command(client); + + write(pipe_fd, &status, sizeof(int)); + + if (client->bits.localsock.finished) + break; + + DEBUGLOG("Waiting for next pre command\n"); + + pthread_mutex_lock(&client->bits.localsock.mutex); + if (client->bits.localsock.state != PRE_COMMAND) { + pthread_cond_wait(&client->bits.localsock.cond, + &client->bits.localsock.mutex); + } + pthread_mutex_unlock(&client->bits.localsock.mutex); + + DEBUGLOG("Got pre command condition...\n"); + } + DEBUGLOG("Subthread finished\n"); + return (void *) 0; +} + +/* Process a command on the local node and store the result */ +static int process_local_command(struct clvm_header *msg, int msglen, + struct local_client *client, + unsigned short xid) +{ + char *replybuf = malloc(MAX_CLUSTER_MESSAGE); + int buflen = MAX_CLUSTER_MESSAGE - sizeof(struct clvm_header) - 1; 
+ int replylen = 0; + int status; + + DEBUGLOG("process_local_command: msg=%p, msglen =%d, client=%p\n", msg, + msglen, client); + if (replybuf == NULL) + return -1; + + status = do_command(client, msg, msglen, &replybuf, buflen, &replylen); + + if (status) + client->bits.localsock.all_success = 0; + + /* If we took too long then discard the reply */ + if (xid == client->xid) { + add_reply_to_list(client, status, our_csid, replybuf, replylen); + } else { + DEBUGLOG + ("Local command took too long, discarding xid %d, current is %d\n", + xid, client->xid); + } + + free(replybuf); + return status; +} + +static int process_reply(struct clvm_header *msg, int msglen, char *csid) +{ + struct local_client *client = NULL; + + client = find_client(msg->clientid); + if (!client) { + DEBUGLOG("Got message for unknown client 0x%x\n", + msg->clientid); + log_error("Got message for unknown client 0x%x\n", + msg->clientid); + return -1; + } + + if (msg->status) + client->bits.localsock.all_success = 0; + + /* Gather replies together for this client id */ + if (msg->xid == client->xid) { + add_reply_to_list(client, msg->status, csid, msg->args, + msg->arglen); + } else { + DEBUGLOG("Discarding reply with old XID %d, current = %d\n", + msg->xid, client->xid); + } + return 0; +} + +/* Send an aggregated reply back to the client */ +static void send_local_reply(struct local_client *client, int status, int fd) +{ + struct clvm_header *clientreply; + struct node_reply *thisreply = client->bits.localsock.replies; + char *replybuf; + char *ptr; + int message_len = 0; + + DEBUGLOG("Send local reply\n"); + + /* Work out the total size of the reply */ + while (thisreply) { + if (thisreply->replymsg) + message_len += strlen(thisreply->replymsg) + 1; + else + message_len++; + + message_len += strlen(thisreply->node) + 1 + sizeof(int); + + thisreply = thisreply->next; + } + + /* Add in the size of our header */ + message_len = message_len + sizeof(struct clvm_header) + 1; + replybuf = 
malloc(message_len); + + clientreply = (struct clvm_header *) replybuf; + clientreply->status = -status; + clientreply->cmd = CLVMD_CMD_REPLY; + clientreply->node[0] = '\0'; + + ptr = clientreply->args; + + /* Add in all the replies, and free them as we go */ + thisreply = client->bits.localsock.replies; + while (thisreply) { + struct node_reply *tempreply = thisreply; + + strcpy(ptr, thisreply->node); + ptr += strlen(thisreply->node) + 1; + + *(int *) ptr = thisreply->status; + ptr += sizeof(int); + + if (thisreply->replymsg) { + strcpy(ptr, thisreply->replymsg); + ptr += strlen(thisreply->replymsg) + 1; + } else { + ptr[0] = '\0'; + ptr++; + } + thisreply = thisreply->next; + + if (tempreply->replymsg) + free(tempreply->replymsg); + free(tempreply); + } + + /* Terminate with an empty node name */ + *ptr = '\0'; + + clientreply->arglen = ptr - clientreply->args + 1; + + /* And send it */ + send_message(replybuf, message_len, our_csid, fd, + "Error sending REPLY to client"); + free(replybuf); + + /* Reset comms variables */ + client->bits.localsock.replies = NULL; + client->bits.localsock.expected_replies = 0; + client->bits.localsock.in_progress = FALSE; + client->bits.localsock.sent_out = FALSE; +} + +/* Just free a reply chain baceuse it wasn't used. 
*/ +static void free_reply(struct local_client *client) +{ + /* Add in all the replies, and free them as we go */ + struct node_reply *thisreply = client->bits.localsock.replies; + while (thisreply) { + struct node_reply *tempreply = thisreply; + + thisreply = thisreply->next; + + if (tempreply->replymsg) + free(tempreply->replymsg); + free(tempreply); + } + client->bits.localsock.replies = NULL; +} + +/* Send our version number to the cluster */ +static void send_version_message() +{ + char message[sizeof(struct clvm_header) + sizeof(int) * 3]; + struct clvm_header *msg = (struct clvm_header *) message; + int *version_nums = (int *) msg->args; + + msg->cmd = CLVMD_CMD_VERSION; + msg->status = 0; + msg->flags = 0; + msg->clientid = 0; + msg->arglen = sizeof(int) * 3; + + version_nums[0] = htonl(CLVMD_MAJOR_VERSION); + version_nums[1] = htonl(CLVMD_MINOR_VERSION); + version_nums[2] = htonl(CLVMD_PATCH_VERSION); + + cluster_send_message(message, sizeof(message), NULL, + "Error Sending version number"); +} + +/* Send a message to either a local client or another server */ +static int send_message(void *buf, int msglen, char *csid, int fd, + const char *errtext) +{ + int len; + + /* Send remote messages down the cluster socket */ + if (csid == NULL || !ISLOCAL_CSID(csid)) { + hton_clvm((struct clvm_header *) buf); /* Byte swap if necessary */ + return cluster_send_message(buf, msglen, csid, errtext); + } else { + int ptr = 0; + + /* Make sure it all goes */ + do { + len = write(fd, buf + ptr, msglen - ptr); + + if (len <= 0) { + log_error(errtext); + break; + } + ptr += len; + } while (len < msglen); + } + return len; +} + +static int process_work_item(struct lvm_thread_cmd *cmd) +{ + if (!cmd->remote) { + DEBUGLOG("process_work_item: local\n"); + process_local_command(cmd->msg, cmd->msglen, cmd->client, + cmd->xid); + } else { + DEBUGLOG("process_work_item: remote\n"); + process_remote_command(cmd->msg, cmd->msglen, cmd->client->fd, + cmd->csid); + } + return 0; +} + 
+/* + * Routine that runs in the "LVM thread". + */ +static void *lvm_thread_fn(void *arg) +{ + struct list *cmdl, *tmp; + sigset_t ss; + + DEBUGLOG("LVM thread function started\n"); + pthread_mutex_lock(&lvm_thread_mutex); + + /* Ignore SIGUSR1 & 2 */ + sigemptyset(&ss); + sigaddset(&ss, SIGUSR1); + sigaddset(&ss, SIGUSR2); + pthread_sigmask(SIG_BLOCK, &ss, NULL); + + /* Initialise the interface to liblvm */ + init_lvm(); + pthread_mutex_unlock(&lvm_thread_mutex); + + /* Now wait for some actual work */ + for (;;) { + DEBUGLOG("LVM thread waiting for work\n"); + + pthread_mutex_lock(&lvm_thread_mutex); + if (list_empty(&lvm_cmd_head)) + pthread_cond_wait(&lvm_thread_cond, &lvm_thread_mutex); + + list_iterate_safe(cmdl, tmp, &lvm_cmd_head) { + struct lvm_thread_cmd *cmd; + + cmd = + list_struct_base(cmdl, struct lvm_thread_cmd, list); + list_del(&cmd->list); + pthread_mutex_unlock(&lvm_thread_mutex); + + process_work_item(cmd); + free(cmd->msg); + free(cmd); + + pthread_mutex_lock(&lvm_thread_mutex); + } + pthread_mutex_unlock(&lvm_thread_mutex); + } +} + +/* Pass down some work to the LVM thread */ +static int add_to_lvmqueue(struct local_client *client, struct clvm_header *msg, + int msglen, char *csid) +{ + struct lvm_thread_cmd *cmd; + + cmd = malloc(sizeof(struct lvm_thread_cmd)); + if (!cmd) + return -ENOMEM; + + cmd->msg = malloc(msglen); + if (!cmd->msg) { + log_error("Unable to allocate buffer space\n"); + free(cmd); + return -1; + } + + cmd->client = client; + cmd->msglen = msglen; + cmd->xid = client->xid; + memcpy(cmd->msg, msg, msglen); + if (csid) { + memcpy(cmd->csid, csid, MAX_CSID_LEN); + cmd->remote = 1; + } else { + cmd->remote = 0; + } + + DEBUGLOG + ("add_to_lvmqueue: cmd=%p. 
client=%p, msg=%p, len=%d, csid=%p, xid=%d\n", + cmd, client, msg, msglen, csid, cmd->xid); + pthread_mutex_lock(&lvm_thread_mutex); + list_add(&lvm_cmd_head, &cmd->list); + pthread_cond_signal(&lvm_thread_cond); + pthread_mutex_unlock(&lvm_thread_mutex); + + return 0; +} + +/* Open the local socket, that's the one we talk to libclvm down */ +static int open_local_sock() +{ + int local_socket; + struct sockaddr_un sockaddr; + + /* Open local socket */ + if (CLVMD_SOCKNAME[0] != '\0') + unlink(CLVMD_SOCKNAME); + local_socket = socket(PF_UNIX, SOCK_STREAM, 0); + if (local_socket < 0) { + log_error("Can't create local socket: %m"); + return -1; + } + + memset(&sockaddr, 0, sizeof(sockaddr)); + memcpy(sockaddr.sun_path, CLVMD_SOCKNAME, sizeof(CLVMD_SOCKNAME)); + sockaddr.sun_family = AF_UNIX; + if (bind(local_socket, (struct sockaddr *) &sockaddr, sizeof(sockaddr))) { + log_error("can't bind local socket: %m"); + close(local_socket); + return -1; + } + if (listen(local_socket, 1) != 0) { + log_error("listen local: %m"); + close(local_socket); + return -1; + } + if (CLVMD_SOCKNAME[0] != '\0') + chmod(CLVMD_SOCKNAME, 0600); + + return local_socket; +} + +static void check_all_callback(struct local_client *client, char *csid, + int node_up) +{ + if (!node_up) + add_reply_to_list(client, -EHOSTDOWN, csid, "CLVMD not running", + 18); +} + +/* Check to see if all CLVMDs are running (ie one on + every node in the cluster). + If not, returns -1 and prints out a list of errant nodes */ +static int check_all_clvmds_running(struct local_client *client) +{ + DEBUGLOG("check_all_clvmds_running\n"); + return cluster_do_node_callback(client, check_all_callback); +} + +/* Return a local_client struct given a client ID. 
+ client IDs are in network byte order */ +static struct local_client *find_client(int clientid) +{ + struct local_client *thisfd; + for (thisfd = &local_client_head; thisfd != NULL; thisfd = thisfd->next) { + if (thisfd->fd == ntohl(clientid)) + return thisfd; + } + return NULL; +} + +/* Byte-swapping routines for the header so we + work in a heterogeneous environment */ +static void hton_clvm(struct clvm_header *hdr) +{ + hdr->status = htonl(hdr->status); + hdr->arglen = htonl(hdr->arglen); + hdr->xid = htons(hdr->xid); + /* Don't swap clientid as it's only a token as far as + remote nodes are concerned */ +} + +static void ntoh_clvm(struct clvm_header *hdr) +{ + hdr->status = ntohl(hdr->status); + hdr->arglen = ntohl(hdr->arglen); + hdr->xid = ntohs(hdr->xid); +} + +/* Handler for SIGUSR2 - sent to kill subthreads */ +static void sigusr2_handler(int sig) +{ + DEBUGLOG("SIGUSR2 received\n"); + pthread_exit((void *) -1); + return; +} + +static void sigterm_handler(int sig) +{ + DEBUGLOG("SIGTERM received\n"); + quit = 1; + return; +} diff --git a/daemons/clvmd/clvmd.h b/daemons/clvmd/clvmd.h new file mode 100644 index 000000000..46e53c4b0 --- /dev/null +++ b/daemons/clvmd/clvmd.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CLVMD_H +#define _CLVMD_H + +#define CLVMD_MAJOR_VERSION 0 +#define CLVMD_MINOR_VERSION 2 +#define CLVMD_PATCH_VERSION 1 + +/* Name of the cluster LVM admin lock */ +#define ADMIN_LOCK_NAME "CLVMD_ADMIN" + +/* Default time (in seconds) we will wait for all remote commands to execute + before declaring them dead */ +#define DEFAULT_CMD_TIMEOUT 60 + +/* One of these for each reply we get from command execution on a node */ +struct node_reply { + char node[MAX_CLUSTER_MEMBER_NAME_LEN]; + char *replymsg; + int status; + struct node_reply *next; +}; + +/* + * These exist for the use of local sockets only when we are + * collecting responses from all cluster nodes + */ +struct localsock_bits { + struct node_reply *replies; + int num_replies; + int expected_replies; + time_t sent_time; /* So we can check for timeouts */ + int in_progress; /* Only execute one cmd at a time per client */ + int sent_out; /* Flag to indicate that a command was sent + to remote nodes */ + void *private; /* Private area for command processor use */ + void *cmd; /* Whole command as passed down local socket */ + int cmd_len; /* Length of above */ + int pipe; /* Pipe to send PRE completion status down */ + int finished; /* Flag to tell subthread to exit */ + int all_success; /* Set to 0 if any node (or the pre_command) + failed */ + struct local_client *pipe_client; + pthread_t threadid; + enum { PRE_COMMAND, POST_COMMAND, QUIT } state; + pthread_mutex_t mutex; /* Main thread and worker synchronisation */ + pthread_cond_t cond; + + pthread_mutex_t reply_mutex; /* Protect reply structure */ +}; + +/* Entries for PIPE clients */ +struct pipe_bits { + struct local_client *client; /* Actual (localsock) client */ + pthread_t threadid; /* Our own copy of the thread id */ +}; + 
+/* Entries for Network socket clients */ +struct netsock_bits { + void *private; + int flags; +}; + +typedef int (*fd_callback_t) (struct local_client * fd, char *buf, int len, + char *csid, struct local_client ** new_client); + +/* One of these for each fd we are listening on */ +struct local_client { + int fd; + enum { CLUSTER_MAIN_SOCK, CLUSTER_DATA_SOCK, LOCAL_RENDEZVOUS, + LOCAL_SOCK, THREAD_PIPE, CLUSTER_INTERNAL } type; + struct local_client *next; + unsigned short xid; + fd_callback_t callback; + + union { + struct localsock_bits localsock; + struct pipe_bits pipe; + struct netsock_bits net; + } bits; +}; + +#ifdef DEBUG +#define DEBUGLOG(fmt, args...) fprintf(stderr, "CLVMD[%d]: %ld ", getpid(), time(NULL) ); fprintf(stderr, fmt, ## args) +#else +#define DEBUGLOG(fmt, args...) +#endif + +#ifndef max +#define max(a,b) ((a)>(b)?(a):(b)) +#endif + +/* The real command processor is in clvmd-command.c */ +extern int do_command(struct local_client *client, struct clvm_header *msg, + int msglen, char **buf, int buflen, int *retlen); + +/* Pre and post command routines are called only on the local node */ +extern int do_pre_command(struct local_client *client); +extern int do_post_command(struct local_client *client); + +extern int add_client(struct local_client *new_client); + +extern void clvmd_cluster_init_completed(void); + +#endif diff --git a/daemons/clvmd/cnxman-socket.h b/daemons/clvmd/cnxman-socket.h new file mode 100644 index 000000000..8ae44d85c --- /dev/null +++ b/daemons/clvmd/cnxman-socket.h @@ -0,0 +1,226 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +** Copyright (C) 2004 Red Hat, Inc. All rights reserved. 
+** +** This copyrighted material is made available to anyone wishing to use, +** modify, copy, or redistribute it subject to the terms and conditions +** of the GNU General Public License v.2. +** +******************************************************************************* +******************************************************************************/ + +/* CMAN socket interface header, + may be include by user or kernel code */ + +#ifndef __CNXMAN_SOCKET_H +#define __CNXMAN_SOCKET_H + +/* Just made these up but the address family must be less than 32 (NPROTO) */ +#define AF_CLUSTER 31 +#define PF_CLUSTER AF_CLUSTER + +/* Protocol(socket) types */ +#define CLPROTO_MASTER 2 +#define CLPROTO_CLIENT 3 + +/* Setsockopt -- maybe should be ioctls?? */ +#define CLU_SET_MULTICAST 100 +#define CLU_JOIN_CLUSTER 101 +#define CLU_LEAVE_CLUSTER 102 +#define CLU_SET_RCVONLY 103 +#define CLU_SET_UNICAST 104 +#define KCL_SET_MULTICAST 105 +#define KCL_SET_RCVONLY 106 +#define KCL_SET_UNICAST 107 +#define KCL_SET_NODENAME 108 +#define CLU_SET_NODENAME 109 + +/* ioctls -- should register these properly */ +#define SIOCCLUSTER_NOTIFY _IOW('x', 0x01, int) +#define SIOCCLUSTER_REMOVENOTIFY _IO( 'x', 0x02) +#define SIOCCLUSTER_GETMEMBERS _IOR('x', 0x03, struct cl_cluster_nodelist) +#define SIOCCLUSTER_SETEXPECTED_VOTES _IOW('x', 0x04, int) +#define SIOCCLUSTER_ISQUORATE _IO( 'x', 0x05) +#define SIOCCLUSTER_ISLISTENING _IOW('x', 0x06, struct cl_listen_request) +#define SIOCCLUSTER_GETALLMEMBERS _IOR('x', 0x07, struct cl_cluster_nodelist) +#define SIOCCLUSTER_SET_VOTES _IOW('x', 0x08, int) +#define SIOCCLUSTER_GET_VERSION _IOR('x', 0x09, struct cl_version) +#define SIOCCLUSTER_SET_VERSION _IOW('x', 0x0a, struct cl_version) +#define SIOCCLUSTER_ISACTIVE _IO( 'x', 0x0b) +#define SIOCCLUSTER_KILLNODE _IOW('x', 0x0c, int) +#define SIOCCLUSTER_GET_JOINCOUNT _IO( 'x', 0x0d) +#define SIOCCLUSTER_SERVICE_REGISTER _IOW('x', 0x0e, char) +#define SIOCCLUSTER_SERVICE_UNREGISTER _IO('x', 0x0f) 
+#define SIOCCLUSTER_SERVICE_JOIN _IO( 'x', 0x10) +#define SIOCCLUSTER_SERVICE_LEAVE _IO( 'x', 0x20) +#define SIOCCLUSTER_SERVICE_SETSIGNAL _IOW('x', 0x30, int) +#define SIOCCLUSTER_SERVICE_STARTDONE _IOW('x', 0x40, unsigned int) +#define SIOCCLUSTER_SERVICE_GETEVENT _IOR('x', 0x50, struct cl_service_event) +#define SIOCCLUSTER_SERVICE_GETMEMBERS _IOR('x', 0x60, struct cl_cluster_node) +#define SIOCCLUSTER_SERVICE_GLOBALID _IOR('x', 0x70, uint32_t) +#define SIOCCLUSTER_SERVICE_SETLEVEL _IOR('x', 0x80, int) +#define SIOCCLUSTER_GETNODE _IOWR('x', 0x90, struct cl_cluster_node) +#define SIOCCLUSTER_BARRIER _IOW('x', 0x0a0, struct cl_barrier_info) + +/* Maximum size of a cluster message */ +#define MAX_CLUSTER_MESSAGE 1500 +#define MAX_CLUSTER_MEMBER_NAME_LEN 255 +#define MAX_BARRIER_NAME_LEN 33 +#define MAX_SA_ADDR_LEN 12 +#define MAX_CLUSTER_NAME_LEN 16 + +/* Well-known cluster port numbers */ +#define CLUSTER_PORT_MEMBERSHIP 1 /* Mustn't block during cluster + * transitions! */ +#define CLUSTER_PORT_SERVICES 2 +#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */ +#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */ +#define CLUSTER_PORT_SLM 12 /* LVM SLM (simple lock manager) */ + +/* Port numbers above this will be blocked when the cluster is inquorate or in + * transition */ +#define HIGH_PROTECTED_PORT 9 + +/* Reasons for leaving the cluster */ +#define CLUSTER_LEAVEFLAG_DOWN 0 /* Normal shutdown */ +#define CLUSTER_LEAVEFLAG_KILLED 1 +#define CLUSTER_LEAVEFLAG_PANIC 2 +#define CLUSTER_LEAVEFLAG_REMOVED 3 /* This one can reduce quorum */ +#define CLUSTER_LEAVEFLAG_REJECTED 4 /* Not allowed into the cluster in the + * first place */ +#define CLUSTER_LEAVEFLAG_INCONSISTENT 5 /* Our view of the cluster is + * in a minority */ +#define CLUSTER_LEAVEFLAG_DEAD 6 /* Discovered to be dead */ +#define CLUSTER_LEAVEFLAG_FORCE 0x10 /* Forced by command-line */ + +/* OOB messages sent to a local socket */ +#define CLUSTER_OOB_MSG_PORTCLOSED 1 +#define 
CLUSTER_OOB_MSG_STATECHANGE 2 +#define CLUSTER_OOB_MSG_SERVICEEVENT 3 + +/* Sendmsg flags, these are above the normal sendmsg flags so they don't + * interfere */ +#define MSG_NOACK 0x010000 /* Don't need an ACK for this message */ +#define MSG_QUEUE 0x020000 /* Queue the message for sending later */ +#define MSG_MULTICAST 0x080000 /* Message was sent to all nodes in the cluster + */ +#define MSG_ALLINT 0x100000 /* Send out of all interfaces */ + +typedef enum { NODESTATE_REMOTEMEMBER, NODESTATE_JOINING, NODESTATE_MEMBER, + NODESTATE_DEAD } nodestate_t; + + +struct sockaddr_cl { + unsigned short scl_family; + unsigned char scl_flags; + unsigned char scl_port; + int scl_nodeid; +}; + +/* This is how we pass the multicast socket into kernel space. addr is the + * multicast address to use in the address family of the socket (eg for UDP it + * might be 255.255.255.0) */ +struct cl_multicast_sock { + int fd; /* FD of master socket to do multicast on */ + int number; /* Socket number, to match up recvonly & bcast + * sockets */ +}; + +/* Cluster configuration info passed when we join the cluster */ +struct cl_join_cluster_info { + unsigned char votes; + unsigned int expected_votes; + unsigned int two_node; + unsigned int config_version; + + char cluster_name[17]; +}; + + +/* This is the structure, per node, returned from the membership ioctl */ +struct cl_cluster_node { + unsigned int size; + unsigned int node_id; + unsigned int us; + unsigned int leave_reason; + unsigned int incarnation; + nodestate_t state; + char name[MAX_CLUSTER_MEMBER_NAME_LEN]; + unsigned char votes; +}; + +/* The struct passed to the membership ioctls */ +struct cl_cluster_nodelist { + uint32_t max_members; + struct cl_cluster_node *nodes; +}; + +/* Structure passed to SIOCCLUSTER_ISLISTENING */ +struct cl_listen_request { + unsigned char port; + int nodeid; +}; + +/* A Cluster PORTCLOSED message - received by a local user as an OOB message */ +struct cl_portclosed_oob { + unsigned char cmd; /* 
CLUSTER_OOB_MSG_PORTCLOSED */ + unsigned char port; +}; + +/* Get all version numbers or set the config version */ +struct cl_version { + unsigned int major; + unsigned int minor; + unsigned int patch; + unsigned int config; +}; + +/* structure passed to barrier ioctls */ +struct cl_barrier_info { + char cmd; + char name[MAX_BARRIER_NAME_LEN]; + unsigned int flags; + unsigned long arg; +}; + +typedef enum { SERVICE_EVENT_STOP, SERVICE_EVENT_START, SERVICE_EVENT_FINISH, + SERVICE_EVENT_LEAVEDONE } service_event_t; + +typedef enum { SERVICE_START_FAILED, SERVICE_START_JOIN, SERVICE_START_LEAVE } + service_start_t; + +struct cl_service_event { + service_event_t type; + service_start_t start_type; + unsigned int event_id; + unsigned int last_stop; + unsigned int last_start; + unsigned int last_finish; + unsigned int node_count; +}; + + +/* Commands to the barrier ioctl */ +#define BARRIER_IOCTL_REGISTER 1 +#define BARRIER_IOCTL_CHANGE 2 +#define BARRIER_IOCTL_DELETE 3 +#define BARRIER_IOCTL_WAIT 4 + +/* Attributes of a barrier - bitmask */ +#define BARRIER_ATTR_AUTODELETE 1 +#define BARRIER_ATTR_MULTISTEP 2 +#define BARRIER_ATTR_MANUAL 4 +#define BARRIER_ATTR_ENABLED 8 +#define BARRIER_ATTR_CALLBACK 16 + +/* Attribute setting commands */ +#define BARRIER_SETATTR_AUTODELETE 1 +#define BARRIER_SETATTR_MULTISTEP 2 +#define BARRIER_SETATTR_ENABLED 3 +#define BARRIER_SETATTR_NODES 4 +#define BARRIER_SETATTR_CALLBACK 5 +#define BARRIER_SETATTR_TIMEOUT 6 + +#endif diff --git a/daemons/clvmd/libclvm.c b/daemons/clvmd/libclvm.c new file mode 100644 index 000000000..085e57ee3 --- /dev/null +++ b/daemons/clvmd/libclvm.c @@ -0,0 +1,446 @@ +/* + * Copyright (C) 1997-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* library functions for Cluster LVM Daemon */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "clvm.h" +#include "libclvm.h" + +/* CLVM in hex! */ +#define LVM_SIGNATURE 0x434C564D + +#define MAX_CLUSTER_MEMBER_NAME_LEN 255 + +/* NOTE: the LVMD uses the socket FD as the client ID, this means + that any client that calls fork() will inherit the context of + it's parent. */ +static int clvmd_sock = -1; + +static int open_local_sock(void) +{ + int local_socket; + struct sockaddr_un sockaddr; + + /* Open local socket */ + local_socket = socket(PF_UNIX, SOCK_STREAM, 0); + if (local_socket < 0) { + perror("Can't create local socket"); + return -1; + } + + fcntl(local_socket, F_SETFD, !FD_CLOEXEC); + + strcpy(sockaddr.sun_path, CLVMD_SOCKNAME); + sockaddr.sun_family = AF_UNIX; + if (connect + (local_socket, (struct sockaddr *) &sockaddr, sizeof(sockaddr))) { + int saved_errno = errno; + + close(local_socket); + + errno = saved_errno; + return -1; + } + return local_socket; +} + +/* Send a request and return the status */ +static int send_request(char *inbuf, int inlen, char **retbuf) +{ + char outbuf[PIPE_BUF]; + struct clvm_header *outheader = (struct clvm_header *) outbuf; + int len; + int off; + fd_set fds; + + FD_ZERO(&fds); + FD_SET(clvmd_sock, &fds); + + /* Send it to CLVMD */ + if (write(clvmd_sock, inbuf, inlen) != inlen) { + perror("Error writing to CLVMD"); + return -1; + } + + /* Get the response */ + if 
((len = read(clvmd_sock, outbuf, sizeof(struct clvm_header))) < 0) { + perror("Error reading CLVMD"); + return -1; + } + if (len == 0) { + fprintf(stderr, "EOF reading CLVMD"); + errno = ENOTCONN; + return -1; + } + + /* Allocate buffer */ + *retbuf = malloc(len + outheader->arglen); + if (!*retbuf) { + errno = ENOMEM; + return -1; + } + + /* Copy the header */ + memcpy(*retbuf, outbuf, len); + outheader = (struct clvm_header *) *retbuf; + + /* Read the returned values */ + off = 1; /* we've already read the first byte */ + + while (off < outheader->arglen && len > 0) { + len = read(clvmd_sock, outheader->args + off, PIPE_BUF); + if (len > 0) + off += len; + } + + /* Was it an error ? */ + if (outheader->status < 0) { + errno = -outheader->status; + return -2; + } + return 0; +} + +/* Build the structure header and parse-out wildcard node names */ +static void build_header(struct clvm_header *head, int cmd, const char *node, + void *data, int len) +{ + head->cmd = cmd; + head->status = 0; + head->flags = 0; + head->clientid = 0; + head->arglen = len; + if (node) { + /* Allow a couple of special node names: + "*" for all nodes, + "." 
for the local node only + */ + if (strcmp(node, "*") == 0) { + head->node[0] = '\0'; + } else if (strcmp(node, ".") == 0) { + head->node[0] = '\0'; + head->flags = CLVMD_FLAG_LOCAL; + } else { + strcpy(head->node, node); + } + } else { + head->node[0] = '\0'; + } +} + +/* Send a message to a(or all) node(s) in the cluster */ +int lvm_cluster_write(char cmd, char *node, void *data, int len) +{ + char outbuf[sizeof(struct clvm_header) + len + strlen(node) + 1]; + char *retbuf = NULL; + int status; + struct clvm_header *head = (struct clvm_header *) outbuf; + + if (clvmd_sock == -1) + clvmd_sock = open_local_sock(); + if (clvmd_sock == -1) + return -1; + + build_header(head, cmd, node, data, len); + memcpy(head->node + strlen(head->node) + 1, data, len); + + status = + send_request(outbuf, + sizeof(struct clvm_header) + strlen(head->node) + len, + &retbuf); + if (retbuf) + free(retbuf); + + return status; +} + +/* API: Send a message to a(or all) node(s) in the cluster + and wait for replies */ +int lvm_cluster_request(char cmd, const char *node, void *data, int len, + lvm_response_t ** response, int *num) +{ + char outbuf[sizeof(struct clvm_header) + len + strlen(node) + 1]; + int *outptr; + char *inptr; + char *retbuf = NULL; + int status; + int i; + int num_responses = 0; + struct clvm_header *head = (struct clvm_header *) outbuf; + lvm_response_t *rarray; + + *num = 0; + + if (clvmd_sock == -1) + clvmd_sock = open_local_sock(); + if (clvmd_sock == -1) + return -1; + + build_header(head, cmd, node, data, len); + memcpy(head->node + strlen(head->node) + 1, data, len); + + status = + send_request(outbuf, + sizeof(struct clvm_header) + strlen(head->node) + len, + &retbuf); + if (status == 0 || status == -2) { + /* Count the number of responses we got */ + head = (struct clvm_header *) retbuf; + inptr = head->args; + while (inptr[0]) { + num_responses++; + inptr += strlen(inptr) + 1; + inptr += sizeof(int); + inptr += strlen(inptr) + 1; + } + + /* Allocate response 
array. With an extra pair of INTs on the front to sanity + check the pointer when we are given it back to free */ + outptr = + malloc(sizeof(lvm_response_t) * num_responses + + sizeof(int) * 2); + if (!outptr) { + if (retbuf) + free(retbuf); + errno = ENOMEM; + return -1; + } + + *response = (lvm_response_t *) (outptr + 2); + outptr[0] = LVM_SIGNATURE; + outptr[1] = num_responses; + rarray = *response; + + /* Unpack the response into an lvm_response_t array */ + inptr = head->args; + i = 0; + while (inptr[0]) { + strcpy(rarray[i].node, inptr); + inptr += strlen(inptr) + 1; + + rarray[i].status = *(int *) inptr; + inptr += sizeof(int); + + rarray[i].response = malloc(strlen(inptr) + 1); + if (rarray[i].response == NULL) { + /* Free up everything else and return error */ + int j; + for (j = 0; j < i; j++) + free(rarray[i].response); + free(outptr); + errno = ENOMEM; + return -1; + } + + strcpy(rarray[i].response, inptr); + rarray[i].len = strlen(inptr); + inptr += strlen(inptr) + 1; + i++; + } + *num = num_responses; + *response = rarray; + } + + if (retbuf) + free(retbuf); + return status; +} + +/* API: Free reply array */ +int lvm_cluster_free_request(lvm_response_t * response) +{ + int *ptr = (int *) response - 2; + int i; + int num; + + /* Check it's ours to free */ + if (response == NULL || *ptr != LVM_SIGNATURE) { + errno = EINVAL; + return -1; + } + + num = ptr[1]; + for (i = 0; i < num; i++) { + free(response[i].response); + } + free(ptr); + + return 0; +} + +/* These are a "higher-level" API providing black-box lock/unlock + functions for cluster LVM...maybe */ + +/* Set by lock(), used by unlock() */ +static int num_responses; +static lvm_response_t *response; + +int lvm_lock_for_cluster(char scope, char *name, int verbosity) +{ + int status; + int i; + char *args; + int len; + + if (name) { + len = strlen(name) + 2; + args = alloca(len); + strcpy(args + 1, name); + } else { + len = 2; + args = alloca(len); + args[1] = '\0'; + } + args[0] = scope; + + 
status = lvm_cluster_request(CLVMD_CMD_LOCK, + "", args, len, &response, &num_responses); + + /* If any nodes were down then display them and return an error */ + for (i = 0; i < num_responses; i++) { + if (response[i].status == -EHOSTDOWN) { + if (verbosity) + fprintf(stderr, + "clvmd not running on node %s\n", + response[i].node); + status = -1; + } + } + + /* If there was an error then free the memory now as the caller won't + want to do the unlock */ + if (status) { + int saved_errno = errno; + lvm_cluster_free_request(response); + num_responses = 0; + errno = saved_errno; + } + return status; +} + +int lvm_unlock_for_cluster(char scope, char *name, int verbosity) +{ + int status; + int i; + int len; + int failed; + int num_unlock_responses; + char *args; + lvm_response_t *unlock_response; + + /* We failed - this should not have been called */ + if (num_responses == 0) + return 0; + + if (name) { + len = strlen(name) + 2; + args = alloca(len); + strcpy(args + 1, name); + } else { + len = 2; + args = alloca(len); + args[1] = '\0'; + } + args[0] = scope; + + /* See if it failed anywhere */ + failed = 0; + for (i = 0; i < num_responses; i++) { + if (response[i].status != 0) + failed++; + } + + /* If it failed on any nodes then we only unlock on + the nodes that succeeded */ + if (failed) { + for (i = 0; i < num_responses; i++) { + /* Unlock the ones that succeeded */ + if (response[i].status == 0) { + status = lvm_cluster_request(CLVMD_CMD_UNLOCK, + response[i].node, + args, len, + &unlock_response, + &num_unlock_responses); + if (status) { + if (verbosity) + fprintf(stderr, + "cluster command to node %s failed: %s\n", + response[i].node, + strerror(errno)); + } else if (unlock_response[0].status != 0) { + if (verbosity > 1) + fprintf(stderr, + "unlock on node %s failed: %s\n", + response[i].node, + strerror(unlock_response + [0].status)); + } + lvm_cluster_free_request(unlock_response); + } else { + if (verbosity) + fprintf(stderr, + "command on node %s failed: 
'%s' - will be left locked\n", + response[i].node, + strerror(response[i].status)); + } + } + } else { + /* All OK, we can do a full cluster unlock */ + status = lvm_cluster_request(CLVMD_CMD_UNLOCK, + "", + args, len, + &unlock_response, + &num_unlock_responses); + if (status) { + if (verbosity > 1) + fprintf(stderr, "cluster command failed: %s\n", + strerror(errno)); + } else { + for (i = 0; i < num_unlock_responses; i++) { + if (unlock_response[i].status != 0) { + if (verbosity > 1) + fprintf(stderr, + "unlock on node %s failed: %s\n", + response[i].node, + strerror(unlock_response + [0].status)); + } + } + } + lvm_cluster_free_request(unlock_response); + } + lvm_cluster_free_request(response); + + return 0; +} diff --git a/daemons/clvmd/libclvm.h b/daemons/clvmd/libclvm.h new file mode 100644 index 000000000..bd735ce46 --- /dev/null +++ b/daemons/clvmd/libclvm.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 1997-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LIBCLVM_H +#define _LIBCLVM_H + +typedef struct lvm_response { + char node[255]; + char *response; + int status; + int len; + +} lvm_response_t; + +extern int lvm_cluster_request(char cmd, const char *node, void *data, int len, + lvm_response_t ** response, int *num); +extern int lvm_cluster_write(char cmd, char *node, void *data, int len); +extern int lvm_cluster_free_request(lvm_response_t * response); + +/* The "high-level" API */ +extern int lvm_lock_for_cluster(char scope, char *name, int verbosity); +extern int lvm_unlock_for_cluster(char scope, char *name, int verbosity); + +#endif diff --git a/daemons/clvmd/lvm-functions.c b/daemons/clvmd/lvm-functions.c new file mode 100644 index 000000000..400d33ff5 --- /dev/null +++ b/daemons/clvmd/lvm-functions.c @@ -0,0 +1,446 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libdlm.h" +#include "clvm.h" +#include "clvmd-comms.h" +#include "clvmd.h" +#include "lvm-functions.h" + +/* LVM2 headers */ +#include "toolcontext.h" +#include "log.h" +#include "activate.h" +#include "hash.h" +#include "locking.h" + +static struct cmd_context *cmd = NULL; +static struct hash_table *lv_hash = NULL; + +struct lv_info { + int lock_id; + int lock_mode; +}; + +/* Return the mode a lock is currently held at (or -1 if not held) */ +static int get_current_lock(char *resource) +{ + struct lv_info *lvi; + + lvi = hash_lookup(lv_hash, resource); + if (lvi) { + return lvi->lock_mode; + } else { + return -1; + } +} + +/* Called at shutdown to tidy the lockspace */ +void unlock_all() +{ + struct hash_node *v; + hash_iterate(v, lv_hash) { + struct lv_info *lvi = hash_get_data(lv_hash, v); + + sync_unlock(hash_get_key(lv_hash, v), lvi->lock_id); + } +} + +/* Gets a real lock and keeps the info in the hash table */ +int hold_lock(char *resource, int mode, int flags) +{ + int status; + int saved_errno; + struct lv_info *lvi; + + flags &= LKF_NOQUEUE; /* Only LKF_NOQUEUE is valid here */ + + lvi = hash_lookup(lv_hash, resource); + if (lvi) { + /* Already exists - convert it */ + status = + sync_lock(resource, mode, LKF_CONVERT | flags, + &lvi->lock_id); + saved_errno = errno; + if (!status) + lvi->lock_mode = mode; + + if (status) { + DEBUGLOG("hold_lock. 
convert to %d failed: %s\n", mode, + strerror(errno)); + } + errno = saved_errno; + } else { + lvi = malloc(sizeof(struct lv_info)); + if (!lvi) + return -1; + + lvi->lock_mode = mode; + status = sync_lock(resource, mode, flags, &lvi->lock_id); + saved_errno = errno; + if (status) { + free(lvi); + DEBUGLOG("hold_lock. lock at %d failed: %s\n", mode, + strerror(errno)); + } else { + hash_insert(lv_hash, resource, lvi); + } + errno = saved_errno; + } + return status; +} + +/* Unlock and remove it from the hash table */ +int hold_unlock(char *resource) +{ + struct lv_info *lvi; + int status; + int saved_errno; + + lvi = hash_lookup(lv_hash, resource); + + if (!lvi) { + DEBUGLOG("hold_unlock, lock not already held\n"); + return 0; + } + + status = sync_unlock(resource, lvi->lock_id); + saved_errno = errno; + if (!status) { + hash_remove(lv_hash, resource); + free(lvi); + } else { + DEBUGLOG("hold_unlock. unlock failed(%d): %s\n", status, + strerror(errno)); + } + + errno = saved_errno; + return status; +} + +/* Watch the return codes here. + liblvm API functions return 1(true) for success, 0(false) for failure and don't set errno. + libdlm API functions return 0 for success, -1 for failure and do set errno. + These functions here return 0 for success or >0 for failure (where the retcode is errno) +*/ + +/* Activate LV exclusive or non-exclusive */ +static int do_activate_lv(char *resource, int mode) +{ + int oldmode; + int status; + int activate_lv; + struct lvinfo lvi; + + /* Is it already open ? */ + oldmode = get_current_lock(resource); + if (oldmode == mode) { + return 0; /* Nothing to do */ + } + + /* Does the config file want us to activate this LV ? */ + if (!lv_activation_filter(cmd, resource, &activate_lv)) + return EIO; + + if (!activate_lv) + return 0; /* Success, we did nothing! */ + + /* Do we need to activate exclusively? 
*/ + if (activate_lv == 2) + mode = LKM_EXMODE; + + /* OK, try to get the lock */ + status = hold_lock(resource, mode, LKF_NOQUEUE); + if (status) + return errno; + + /* If it's suspended then resume it */ + if (!lv_info_by_lvid(cmd, resource, &lvi)) + return EIO; + + if (lvi.suspended) + if (!lv_resume(cmd, resource)) + return EIO; + + /* Now activate it */ + if (!lv_activate(cmd, resource)) + return EIO; + + return 0; +} + +/* Resume the LV if it was active */ +static int do_resume_lv(char *resource) +{ + int oldmode; + + /* Is it open ? */ + oldmode = get_current_lock(resource); + if (oldmode == -1) { + DEBUGLOG("do_deactivate_lock, lock not already held\n"); + return 0; /* We don't need to do anything */ + } + + if (!lv_resume_if_active(cmd, resource)) + return EIO; + + return 0; +} + +/* Suspend the device if active */ +static int do_suspend_lv(char *resource) +{ + int oldmode; + struct lvinfo lvi; + + /* Is it open ? */ + oldmode = get_current_lock(resource); + if (oldmode == -1) { + DEBUGLOG("do_suspend_lv, lock held at %d\n", oldmode); + return 0; /* Not active, so it's OK */ + } + + /* Only suspend it if it exists */ + if (!lv_info_by_lvid(cmd, resource, &lvi)) + return EIO; + + if (lvi.exists) { + if (!lv_suspend_if_active(cmd, resource)) { + return EIO; + } + } + return 0; +} + +static int do_deactivate_lv(char *resource) +{ + int oldmode; + int status; + + /* Is it open ? 
*/ + oldmode = get_current_lock(resource); + if (oldmode == -1) { + DEBUGLOG("do_deactivate_lock, lock not already held\n"); + return 0; /* We don't need to do anything */ + } + + if (!lv_deactivate(cmd, resource)) + return EIO; + + status = hold_unlock(resource); + if (status) + return errno; + + return 0; +} + +/* This is the LOCK_LV part that happens on all nodes in the cluster - + it is responsible for the interaction with device-mapper and LVM */ +int do_lock_lv(unsigned char command, unsigned char lock_flags, char *resource) +{ + int status = 0; + + DEBUGLOG("do_lock_lv: resource '%s', cmd = 0x%x, flags = %d\n", + resource, command, lock_flags); + + if (!cmd->config_valid || config_files_changed(cmd)) { + /* Reinitialise various settings inc. logging, filters */ + if (!refresh_toolcontext(cmd)) { + log_error("Updated config file invalid. Aborting."); + return EINVAL; + } + } + + switch (command) { + case LCK_LV_EXCLUSIVE: + status = do_activate_lv(resource, LKM_EXMODE); + break; + + case LCK_LV_SUSPEND: + status = do_suspend_lv(resource); + break; + + case LCK_UNLOCK: + case LCK_LV_RESUME: /* if active */ + status = do_resume_lv(resource); + break; + + case LCK_LV_ACTIVATE: + status = do_activate_lv(resource, LKM_CRMODE); + break; + + case LCK_LV_DEACTIVATE: + status = do_deactivate_lv(resource); + break; + + default: + DEBUGLOG("Invalid LV command 0x%x\n", command); + status = EINVAL; + break; + } + + /* clean the pool for another command */ + pool_empty(cmd->mem); + + DEBUGLOG("Command return is %d\n", status); + return status; +} + +/* Functions to do on the local node only BEFORE the cluster-wide stuff above happens */ +int pre_lock_lv(unsigned char command, unsigned char lock_flags, char *resource) +{ + /* Nearly all the stuff happens cluster-wide. Apart from SUSPEND. Here we get the + lock out on this node (because we are the node modifying the metadata) + before suspending cluster-wide. 
+ */ + if (command == LCK_LV_SUSPEND) { + DEBUGLOG("pre_lock_lv: resource '%s', cmd = 0x%x, flags = %d\n", + resource, command, lock_flags); + + if (hold_lock(resource, LKM_PWMODE, LKF_NOQUEUE)) + return errno; + } + return 0; +} + +/* Functions to do on the local node only AFTER the cluster-wide stuff above happens */ +int post_lock_lv(unsigned char command, unsigned char lock_flags, + char *resource) +{ + /* Opposite of above, done on resume after a metadata update */ + if (command == LCK_LV_RESUME) { + int oldmode; + + DEBUGLOG + ("post_lock_lv: resource '%s', cmd = 0x%x, flags = %d\n", + resource, command, lock_flags); + + /* If the lock state is PW then restore it to what it was */ + oldmode = get_current_lock(resource); + if (oldmode == LKM_PWMODE) { + struct lvinfo lvi; + + if (!lv_info_by_lvid(cmd, resource, &lvi)) + return EIO; + + if (lvi.exists) { + if (hold_lock(resource, LKM_CRMODE, 0)) + return errno; + } else { + if (hold_unlock(resource)) + return errno; + } + } + } + return 0; +} + +/* Check if a VG is un use by LVM1 so we don't stomp on it */ +int do_check_lvm1(char *vgname) +{ + int status; + + status = check_lvm1_vg_inactive(cmd, vgname); + + return status == 1 ? 0 : EBUSY; +} + +/* + * Ideally, clvmd should be started before any LVs are active + * but this may not be the case... + * I suppose this also comes in handy if clvmd crashes, not that it would! + */ +static void *get_initial_state() +{ + char lv[64], vg[64], flags[25]; + char uuid[65]; + char line[255]; + FILE *lvs = + popen + ("/sbin/lvm lvs --nolocking --noheadings -o vg_uuid,lv_uuid,lv_attr", + "r"); + + if (!lvs) + return NULL; + + while (fgets(line, sizeof(line), lvs)) { + if (sscanf(line, "%s %s %s\n", vg, lv, flags) == 3) { + /* States: s:suspended a:active S:dropped snapshot I:invalid snapshot */ + if (flags[4] == 'a' || flags[4] == 's') { /* is it active or suspended? 
*/ + /* Convert hyphen-separated UUIDs into one */ + memcpy(&uuid[0], &vg[0], 6); + memcpy(&uuid[6], &vg[7], 4); + memcpy(&uuid[10], &vg[12], 4); + memcpy(&uuid[14], &vg[17], 4); + memcpy(&uuid[18], &vg[22], 4); + memcpy(&uuid[22], &vg[27], 4); + memcpy(&uuid[26], &vg[32], 6); + memcpy(&uuid[32], &lv[0], 6); + memcpy(&uuid[38], &lv[7], 4); + memcpy(&uuid[42], &lv[12], 4); + memcpy(&uuid[46], &lv[17], 4); + memcpy(&uuid[50], &lv[22], 4); + memcpy(&uuid[54], &lv[27], 4); + memcpy(&uuid[58], &lv[32], 6); + uuid[64] = '\0'; + + DEBUGLOG("getting initial lock for %s\n", uuid); + hold_lock(uuid, LKM_CRMODE, LKF_NOQUEUE); + } + } + } + fclose(lvs); + return NULL; +} + +void init_lvhash() +{ + /* Create hash table for keeping LV locks & status */ + lv_hash = hash_create(100); +} + +/* Called to initialise the LVM context of the daemon */ +int init_lvm(void) +{ + if (!(cmd = create_toolcontext(NULL))) { + log_error("Failed to allocate command context"); + return 0; + } + + /* Use LOG_DAEMON for syslog messages instead of LOG_USER */ + init_syslog(LOG_DAEMON); + + get_initial_state(); + + return 1; +} diff --git a/daemons/clvmd/lvm-functions.h b/daemons/clvmd/lvm-functions.h new file mode 100644 index 000000000..750eba999 --- /dev/null +++ b/daemons/clvmd/lvm-functions.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Functions in lvm-functions.c */ + +#ifndef _LVM_FUNCTIONS_H +#define _LVM_FUNCTIONS_H + +extern int pre_lock_lv(unsigned char lock_cmd, unsigned char lock_flags, + char *resource); +extern int do_lock_lv(unsigned char lock_cmd, unsigned char lock_flags, + char *resource); +extern int post_lock_lv(unsigned char lock_cmd, unsigned char lock_flags, + char *resource); +extern int do_check_lvm1(char *vgname); +extern int init_lvm(void); +extern void init_lvhash(void); + +extern int hold_unlock(char *resource); +extern int hold_lock(char *resource, int mode, int flags); +extern void unlock_all(void); + +#endif diff --git a/daemons/clvmd/system-lv.c b/daemons/clvmd/system-lv.c new file mode 100644 index 000000000..5b359cdf1 --- /dev/null +++ b/daemons/clvmd/system-lv.c @@ -0,0 +1,369 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Routines dealing with the System LV */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libdlm.h" +#include "log.h" +#include "list.h" +#include "locking.h" +#include "system-lv.h" +#include "clvmd-comms.h" +#ifdef HAVE_CCS +#include "ccs.h" +#endif + +#define SYSTEM_LV_FILESYSTEM "ext2" +#define SYSTEM_LV_MOUNTPOINT "/tmp/.clvmd-XXXXXX" + +extern char *config_filename(void); + +static char system_lv_name[PATH_MAX] = { '\0' }; +static char mount_point[PATH_MAX] = { '\0' }; +static int mounted = 0; +static int mounted_rw = 0; +static int lockid; +static const char *lock_name = "CLVM_SYSTEM_LV"; + +/* Look in /proc/mounts or (as a last resort) /etc/mtab to + see if the system-lv is mounted. If it is mounted and we + think it's not then abort because we don't have the right + lock status and we don't know what other processes are doing with it. + + Returns 1 for mounted, 0 for not mounted so it matches the condition + of the "mounted" static variable above. 
+*/ +static int is_really_mounted(void) +{ + FILE *mountfile; + struct mntent *ment; + + mountfile = setmntent("/proc/mounts", "r"); + if (!mountfile) { + mountfile = setmntent("/etc/mtab", "r"); + if (!mountfile) { + log_error("Unable to open /proc/mounts or /etc/mtab"); + return -1; + } + } + + /* Look for system LV name in the file */ + do { + ment = getmntent(mountfile); + if (ment) { + if (strcmp(ment->mnt_fsname, system_lv_name) == 0) { + endmntent(mountfile); + return 1; + } + } + } + while (ment); + + endmntent(mountfile); + return 0; +} + +/* Get the system LV name from the config file */ +static int find_system_lv(void) +{ + if (system_lv_name[0] == '\0') { +#ifdef HAVE_CCS + int error; + ccs_node_t *ctree; + + /* Read the cluster config file */ + /* Open the config file */ + error = open_ccs_file(&ctree, "clvm.ccs"); + if (error) { + perror("reading config file"); + return -1; + } + + strcpy(system_lv_name, find_ccs_str(ctree, + "cluster/systemlv", '/', + "/dev/vg/system_lv")); + + /* Finished with config file */ + close_ccs_file(ctree); +#else + if (getenv("CLVMD_SYSTEM_LV")) + strcpy(system_lv_name, getenv("CLVMD_SYSTEM_LV")); + else + return -1; +#endif + } + + /* See if it has been mounted outside our control */ + if (is_really_mounted() != mounted) { + log_error + ("The system LV state has been mounted/umounted outside the control of clvmd\n" + "it cannot not be used for cluster communications until this is fixed.\n"); + return -1; + } + return 0; +} + +/* No prizes */ +int system_lv_umount(void) +{ + if (!mounted) + return 0; + + if (umount(mount_point) < 0) { + log_error("umount of system LV (%s) failed: %m\n", + system_lv_name); + return -1; + } + + sync_unlock(lock_name, lockid); + mounted = 0; + + /* Remove the mount point */ + rmdir(mount_point); + + return 0; +} + +int system_lv_mount(int readwrite) +{ + int status; + int saved_errno; + int fd; + + if (find_system_lv()) { + errno = EBUSY; + return -1; + } + + /* Is it already mounted 
suitably? */ + if (mounted) { + if (!readwrite || (readwrite && mounted_rw)) { + return 0; + } else { + /* Mounted RO and we need RW */ + if (system_lv_umount() < 0) + return -1; + } + } + + /* Randomize the mount point */ + strcpy(mount_point, SYSTEM_LV_MOUNTPOINT); + fd = mkstemp(mount_point); + if (fd < 0) { + log_error("mkstemp for system LV mount point failed: %m\n"); + return -1; + } + + /* Race condition here but there's no mkstemp for directories */ + close(fd); + unlink(mount_point); + mkdir(mount_point, 0600); + + /* Make sure we have a system-lv lock */ + status = + sync_lock(lock_name, (readwrite) ? LKM_EXMODE : LKM_CRMODE, 0, + &lockid); + if (status < 0) + return -1; + + /* Mount it */ + if (mount(system_lv_name, mount_point, SYSTEM_LV_FILESYSTEM, + MS_MGC_VAL | MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_SYNCHRONOUS + | (readwrite ? 0 : MS_RDONLY), NULL) < 0) { + /* mount(2) returns EINVAL if the volume has no FS on it. So, if we want to + write to it we try to make a filesystem in it and retry the mount */ + if (errno == EINVAL && readwrite) { + char cmd[256]; + + log_error("Attempting mkfs on system LV device %s\n", + system_lv_name); + snprintf(cmd, sizeof(cmd), "/sbin/mkfs -t %s %s", + SYSTEM_LV_FILESYSTEM, system_lv_name); + system(cmd); + + if (mount + (system_lv_name, mount_point, SYSTEM_LV_FILESYSTEM, + MS_MGC_VAL | MS_NOSUID | MS_NODEV | MS_NOEXEC | + MS_SYNCHRONOUS | (readwrite ? 0 : MS_RDONLY), + NULL) == 0) + goto mounted; + } + + saved_errno = errno; + log_error("mount of system LV (%s, %s, %s) failed: %m\n", + system_lv_name, mount_point, SYSTEM_LV_FILESYSTEM); + sync_unlock(lock_name, lockid); + errno = saved_errno; + return -1; + } + + mounted: +/* Set the internal flags */ + mounted = 1; + mounted_rw = readwrite; + + return 0; +} + +/* Erase *all* files in the root directory of the system LV. + This *MUST* be called with an appropriate lock held! 
+ The LV is left mounted RW because it is assumed that the + caller wants to write something here after clearing some space */ +int system_lv_eraseall(void) +{ + DIR *dir; + struct dirent *ent; + char fname[PATH_MAX]; + + /* Must be mounted R/W */ + system_lv_mount(1); + + dir = opendir(mount_point); + if (!dir) + return -1; + + while ((ent = readdir(dir))) { + struct stat st; + snprintf(fname, sizeof(fname), "%s/%s", mount_point, + ent->d_name); + + if (stat(fname, &st)) { + if (S_ISREG(st.st_mode)) + unlink(fname); + } + } + closedir(dir); + return 0; +} + +/* This is a "high-level" routine - it mounts the system LV, writes + the data into a file named after this node and then umounts the LV + again */ +int system_lv_write_data(char *data, ssize_t len) +{ + struct utsname nodeinfo; + char fname[PATH_MAX]; + int outfile; + ssize_t thiswrite; + ssize_t written; + + if (system_lv_mount(1)) + return -1; + + /* Build the file name we are goingto use. */ + uname(&nodeinfo); + snprintf(fname, sizeof(fname), "%s/%s", mount_point, nodeinfo.nodename); + + /* Open the file for output */ + outfile = open(fname, O_RDWR | O_CREAT | O_TRUNC, 0600); + if (outfile < 0) { + int saved_errno = errno; + system_lv_umount(); + errno = saved_errno; + return -1; + } + + written = 0; + do { + thiswrite = write(outfile, data + written, len - written); + if (thiswrite > 0) + written += thiswrite; + + } while (written < len && thiswrite > 0); + + close(outfile); + + system_lv_umount(); + return (thiswrite < 0) ? -1 : 0; +} + +/* This is a "high-level" routine - it mounts the system LV, reads + the data from a named file and then umounts the LV + again */ +int system_lv_read_data(char *fname_base, char *data, ssize_t *len) +{ + char fname[PATH_MAX]; + int outfile; + struct stat st; + ssize_t filesize; + ssize_t thisread; + ssize_t readbytes; + + if (system_lv_mount(0)) + return -1; + + /* Build the file name we are going to use. 
*/ + snprintf(fname, sizeof(fname), "%s/%s", mount_point, fname_base); + + /* Get the file size and stuff. Actually we only need the file size but + this will also check that the file exists */ + if (stat(fname, &st) < 0) { + int saved_errno = errno; + + log_error("stat of file %s on system LV failed: %m\n", fname); + system_lv_umount(); + errno = saved_errno; + return -1; + } + filesize = st.st_size; + + outfile = open(fname, O_RDONLY); + if (outfile < 0) { + int saved_errno = errno; + + log_error("open of file %s on system LV failed: %m\n", fname); + system_lv_umount(); + errno = saved_errno; + return -1; + } + + readbytes = 0; + do { + thisread = + read(outfile, data + readbytes, filesize - readbytes); + if (thisread > 0) + readbytes += thisread; + + } while (readbytes < filesize && thisread > 0); + + close(outfile); + + system_lv_umount(); + + *len = readbytes; + return (thisread < 0) ? -1 : 0; +} diff --git a/daemons/clvmd/system-lv.h b/daemons/clvmd/system-lv.h new file mode 100644 index 000000000..b90ca4423 --- /dev/null +++ b/daemons/clvmd/system-lv.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _CLVM_SYSTEM_LV_H +#define _CLVM_SYSTEM_LV_H + +/* Prototypes for System-LV functions */ + +/* "low-level" functions */ +extern int system_lv_umount(void); +extern int system_lv_mount(int readwrite); +extern int system_lv_eraseall(void); + +/* "high-level" functions */ +extern int system_lv_write_data(char *data, ssize_t len); +extern int system_lv_read_data(char *fname_base, char *data, ssize_t *len); + +#endif diff --git a/daemons/clvmd/tcp-comms.c b/daemons/clvmd/tcp-comms.c new file mode 100644 index 000000000..2e0406b3a --- /dev/null +++ b/daemons/clvmd/tcp-comms.c @@ -0,0 +1,480 @@ +/****************************************************************************** +******************************************************************************* +** +** Copyright (C) Sistina Software, Inc. 2002-2003 All rights reserved. +** +******************************************************************************* +******************************************************************************/ + +/* This provides the inter-clvmd communications for a system without CMAN. + There is a listening TCP socket which accepts new connections in the + normal way. + It can also make outgoing connnections to the other clvmd nodes. 
+*/ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ccs.h" +#include "clvm.h" +#include "clvmd-comms.h" +#include "clvmd.h" +#include "clvmd-gulm.h" +#include "hash.h" + +#define DEFAULT_TCP_PORT 21064 + +static int listen_fd = -1; +static int tcp_port; +struct hash_table *sock_hash; + +static int get_tcp_port(int default_port); +static int get_our_ip_address(char *addr, int *family); +static int read_from_tcpsock(struct local_client *fd, char *buf, int len, char *csid, + struct local_client **new_client); + +/* Called by init_cluster() to open up the listening socket */ +// TODO: IPv6 compat. +int init_comms() +{ + struct sockaddr *addr = NULL; + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + int addr_len; + int family; + char address[MAX_CSID_LEN]; + + sock_hash = hash_create(100); + tcp_port = get_tcp_port(DEFAULT_TCP_PORT); + + /* Get IP address and IP type */ + get_our_ip_address(address, &family); + if (family == AF_INET) + { + memcpy(&addr4.sin_addr, addr, sizeof(struct in_addr)); + addr = (struct sockaddr *)&addr4; + addr4.sin_port = htons(tcp_port); + addr_len = sizeof(addr4); + } + else + { + memcpy(&addr6.sin6_addr, addr, sizeof(struct in6_addr)); + addr = (struct sockaddr *)&addr6; + addr6.sin6_port = htons(tcp_port); + addr_len = sizeof(addr6); + } + + listen_fd = socket(family, SOCK_STREAM, 0); + + if (listen_fd < 0) + { + return -1; + } + else + { + int one = 1; + setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(int)); + } + + addr->sa_family = family; + + if (bind(listen_fd, addr, addr_len) < 0) + { + DEBUGLOG("Can't bind to port\n"); + syslog(LOG_ERR, "Can't bind to port %d, is clvmd already running ?", tcp_port); + close(listen_fd); + return -1; + } + + listen(listen_fd, 5); + + return 0; +} + +void tcp_remove_client(char *csid) + { + struct local_client 
*client; + DEBUGLOG("tcp_remove_client\n"); + + /* Don't actually close the socket here - that's the + job of clvmd.c whch will do the job when it notices the + other end has gone. We just need to remove the client(s) from + the hash table so we don't try to use it for sending any more */ + client = hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN); + if (client) + { + hash_remove_binary(sock_hash, csid, MAX_CSID_LEN); + } + + /* Look for a mangled one too */ + csid[0] ^= 0x80; + + client = hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN); + if (client) + { + hash_remove_binary(sock_hash, csid, MAX_CSID_LEN); + } + + /* Put it back as we found it */ + csid[0] ^= 0x80; +} + +int alloc_client(int fd, char *csid, struct local_client **new_client) +{ + struct local_client *client; + + DEBUGLOG("alloc_client %d csid = [%d.%d.%d.%d]\n", fd,csid[0],csid[1],csid[2],csid[3]); + + /* Create a local_client and return it */ + client = malloc(sizeof(struct local_client)); + if (!client) + { + DEBUGLOG("malloc failed\n"); + return -1; + } + + memset(client, 0, sizeof(struct local_client)); + client->fd = fd; + client->type = CLUSTER_DATA_SOCK; + client->callback = read_from_tcpsock; + if (new_client) + *new_client = client; + + /* Add to our list of node sockets */ + if (hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN)) + { + DEBUGLOG("alloc_client mangling CSID for second connection\n"); + /* This is a duplicate connection but we can't close it because + the other end may already have started sending. 
+ So, we mangle the IP address and keep it, all sending will + go out of the main FD + */ + csid[0] ^= 0x80; + client->bits.net.flags = 1; /* indicate mangled CSID */ + + /* If it still exists then kill the connection as we should only + ever have one incoming connection from each node */ + if (hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN)) + { + DEBUGLOG("Multiple incoming connections from node\n"); + syslog(LOG_ERR, " Bogus incoming connection from %d.%d.%d.%d\n", csid[0],csid[1],csid[2],csid[3]); + + free(client); + errno = ECONNREFUSED; + return -1; + } + } + hash_insert_binary(sock_hash, csid, MAX_CSID_LEN, client); + + return 0; +} + +int get_main_cluster_fd() +{ + return listen_fd; +} + + +/* Read on main comms (listen) socket, accept it */ +int cluster_fd_callback(struct local_client *fd, char *buf, int len, char *csid, + struct local_client **new_client) +{ + int newfd; + struct sockaddr_in addr; + socklen_t addrlen = sizeof(addr); + int status; + char name[MAX_CLUSTER_MEMBER_NAME_LEN]; + + DEBUGLOG("cluster_fd_callback\n"); + *new_client = NULL; + newfd = accept(listen_fd, (struct sockaddr *)&addr, &addrlen); + + DEBUGLOG("cluster_fd_callback, newfd=%d (errno=%d)\n", newfd, errno); + if (!newfd) + { + syslog(LOG_ERR, "error in accept: %m"); + errno = EAGAIN; + return -1; /* Don't return an error or clvmd will close the listening FD */ + } + + /* Check that the client is a member of the cluster + and reject if not. 
+ // FIXME: IPv4 specific + */ + if (name_from_csid((char *)&addr.sin_addr.s_addr, name) < 0) + { + char *ip = (char *)&addr.sin_addr.s_addr; + syslog(LOG_ERR, "Got connect from non-cluster node %d.%d.%d.%d\n", + ip[0], ip[1], ip[2], ip[3]); + DEBUGLOG("Got connect from non-cluster node %d.%d.%d.%d\n", + ip[0], ip[1], ip[2], ip[3]); + close(newfd); + + errno = EAGAIN; + return -1; + } + + status = alloc_client(newfd, (char *)&addr.sin_addr.s_addr, new_client); + if (status) + { + DEBUGLOG("cluster_fd_callback, alloc_client failed, status = %d\n", status); + close(newfd); + /* See above... */ + errno = EAGAIN; + return -1; + } + DEBUGLOG("cluster_fd_callback, returning %d, %p\n", newfd, *new_client); + return newfd; +} + + +static int read_from_tcpsock(struct local_client *client, char *buf, int len, char *csid, + struct local_client **new_client) +{ + struct sockaddr_in addr; + socklen_t slen = sizeof(addr); + int status; + + DEBUGLOG("read_from_tcpsock fd %d\n", client->fd); + *new_client = NULL; + + /* Get "csid" */ + getpeername(client->fd, (struct sockaddr *)&addr, &slen); + memcpy(csid, &addr.sin_addr.s_addr, MAX_CSID_LEN); + + status = read(client->fd, buf, len); + + DEBUGLOG("read_from_tcpsock, status = %d(errno = %d)\n", status, errno); + + /* Remove it from the hash table if there's an error, clvmd will + remove the socket from its lists and free the client struct */ + if (status == 0 || + (status < 0 && errno != EAGAIN && errno != EINTR)) + { + char remcsid[MAX_CSID_LEN]; + + memcpy(remcsid, csid, MAX_CSID_LEN); + close(client->fd); + + /* If the csid was mangled, then make sure we remove the right entry */ + if (client->bits.net.flags) + remcsid[0] ^= 0x80; + hash_remove_binary(sock_hash, remcsid, MAX_CSID_LEN); + + /* Tell cluster manager layer */ + add_down_node(remcsid); + } + return status; +} + +static int connect_csid(char *csid, struct local_client **newclient) +{ + int fd; + struct sockaddr_in addr; + int status; + + DEBUGLOG("Connecting 
socket\n"); + fd = socket(PF_INET, SOCK_STREAM, 0); + + if (fd < 0) + { + syslog(LOG_ERR, "Unable to create new socket: %m"); + return -1; + } + + addr.sin_family = AF_INET; + memcpy(&addr.sin_addr.s_addr, csid, MAX_CSID_LEN); + addr.sin_port = htons(tcp_port); + + DEBUGLOG("Connecting socket %d\n", fd); + if (connect(fd, (struct sockaddr *)&addr, sizeof(struct sockaddr_in)) < 0) + { + syslog(LOG_ERR, "Unable to connect to remote node: %m"); + DEBUGLOG("Unable to connect to remote node: %s\n", strerror(errno)); + close(fd); + return -1; + } + + status = alloc_client(fd, csid, newclient); + if (status) + close(fd); + else + add_client(*newclient); + + /* If we can connect to it, it must be running a clvmd */ + add_up_node(csid); + return status; +} + +/* Send a message to a known CSID */ +static int tcp_send_message(void *buf, int msglen, unsigned char *csid, const char *errtext) +{ + int status; + struct local_client *client; + char ourcsid[MAX_CSID_LEN]; + + assert(csid); + + DEBUGLOG("tcp_send_message, csid = [%d.%d.%d.%d], msglen = %d\n", csid[0],csid[1],csid[2],csid[3], msglen); + + /* Don't connect to ourself */ + get_our_csid(ourcsid); + if (memcmp(csid, ourcsid, MAX_CSID_LEN) == 0) + return msglen; + + client = hash_lookup_binary(sock_hash, csid, MAX_CSID_LEN); + if (!client) + { + status = connect_csid(csid, &client); + if (status) + return -1; + } + DEBUGLOG("tcp_send_message, fd = %d\n", client->fd); + + return write(client->fd, buf, msglen); +} + + +int cluster_send_message(void *buf, int msglen, char *csid, const char *errtext) +{ + int status=0; + + DEBUGLOG("cluster send message, csid = %p, msglen = %d\n", csid, msglen); + + /* If csid is NULL then send to all known (not just connected) nodes */ + if (!csid) + { + void *context = NULL; + char loop_csid[MAX_CSID_LEN]; + + /* Loop round all gulm-known nodes */ + while (get_next_node_csid(&context, loop_csid)) + { + status = tcp_send_message(buf, msglen, loop_csid, errtext); + if (status == 0 || + 
(status < 0 && (errno == EAGAIN || errno == EINTR))) + break; + } + } + else + { + + status = tcp_send_message(buf, msglen, csid, errtext); + } + return status; +} + +static int get_tcp_port(int default_port) +{ + int ccs_handle; + int port = default_port; + char *portstr; + + ccs_handle = ccs_connect(); + if (ccs_handle) + { + return port; + } + + if (!ccs_get(ccs_handle, "//clvm/@port", &portstr)) + { + port = atoi(portstr); + free(portstr); + + if (port <= 0 && port >= 65536) + port = default_port; + } + ccs_disconnect(ccs_handle); + + DEBUGLOG("Using port %d for communications\n", port); + return port; +} + +/* To get our own IP address we get the locally bound address of the + socket that's talking to GULM in the assumption(eek) that it will + be on the "right" network in a multi-homed system */ +static int get_our_ip_address(char *addr, int *family) +{ + /* Use a sockaddr_in6 to make sure it's big enough */ + struct sockaddr_in6 saddr; + int socklen = sizeof(saddr); + + if (!getsockname(gulm_fd(), (struct sockaddr *)&saddr, &socklen)) + { + if (saddr.sin6_family == AF_INET6) + { + memcpy(addr, &saddr.sin6_addr, sizeof(saddr.sin6_addr)); + } + else + { + struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr; + memcpy(addr, &sin4->sin_addr, sizeof(sin4->sin_addr)); + } + return 0; + } + return -1; +} + +/* Public version of above for those that don't care what protocol + we're using */ +void get_our_csid(char *csid) +{ + static char our_csid[MAX_CSID_LEN]; + static int got_csid = 0; + + if (!got_csid) + { + int family; + + memset(our_csid, 0, sizeof(our_csid)); + if (get_our_ip_address(our_csid, &family)) + { + got_csid = 1; + } + } + memcpy(csid, our_csid, MAX_CSID_LEN); +} + +/* Get someone else's IP address from DNS */ +int get_ip_address(char *node, char *addr) +{ + struct hostent *he; + + memset(addr, 0, MAX_CSID_LEN); + + // TODO: what do we do about multi-homed hosts ??? + // CCSs ip_interfaces solved this but some bugger removed it. 
+ + /* Try IPv6 first. The man page for gethostbyname implies that + it will lookup ip6 & ip4 names, but it seems not to */ + he = gethostbyname2(node, AF_INET6); + if (!he) + he = gethostbyname2(node, AF_INET); + if (!he) + return -1; + + /* For IPv4 address just use the lower 4 bytes */ + memcpy(&addr, he->h_addr_list[0], + he->h_length); + + return 0; +} diff --git a/daemons/clvmd/tcp-comms.h b/daemons/clvmd/tcp-comms.h new file mode 100644 index 000000000..8dafd441c --- /dev/null +++ b/daemons/clvmd/tcp-comms.h @@ -0,0 +1,7 @@ +#include + +#define MAX_CLUSTER_MESSAGE 1600 +#define MAX_CSID_LEN sizeof(struct in6_addr) +#define MAX_CLUSTER_MEMBER_NAME_LEN 128 + +extern int init_comms(void); diff --git a/include/.symlinks b/include/.symlinks index 74987de7c..54d27bc6a 100644 --- a/include/.symlinks +++ b/include/.symlinks @@ -1,3 +1,4 @@ +../daemons/clvmd/clvm.h ../lib/activate/activate.h ../lib/activate/targets.h ../lib/cache/lvmcache.h diff --git a/lib/Makefile.in b/lib/Makefile.in index f0da66e7a..75ff735b9 100644 --- a/lib/Makefile.in +++ b/lib/Makefile.in @@ -104,6 +104,14 @@ ifeq ("@POOL@", "internal") format_pool/pool_label.c endif +ifeq ("@CLUSTER@", "internal") + SOURCES += locking/cluster_locking.c +endif + +ifeq ("@CLUSTER@", "shared") + SUBDIRS += locking +endif + ifeq ("@SNAPSHOTS@", "internal") SOURCES += snapshot/snapshot.c endif diff --git a/lib/locking/Makefile.in b/lib/locking/Makefile.in new file mode 100644 index 000000000..d28cfbfe5 --- /dev/null +++ b/lib/locking/Makefile.in @@ -0,0 +1,32 @@ +# +# Copyright (C) 2003-2004 Sistina Software, Inc. All rights reserved. +# Copyright (C) 2004 Red Hat, Inc. All rights reserved. +# +# This file is part of the LVM2. +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU General Public License v.2. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ + +SOURCES = cluster_locking.c + +LIB_SHARED = liblvm2clusterlock.so + +include $(top_srcdir)/make.tmpl + +.PHONY: install + +install: liblvm2clusterlock.so + $(INSTALL) -D $(OWNER) $(GROUP) -m 555 $(STRIP) $< \ + $(libdir)/liblvm2clusterlock.so.$(LIB_VERSION) + $(LN_S) -f liblvm2clusterlock.so.$(LIB_VERSION) \ + $(libdir)/liblvm2clusterlock.so + diff --git a/lib/locking/cluster_locking.c b/lib/locking/cluster_locking.c new file mode 100644 index 000000000..d9cab2d7e --- /dev/null +++ b/lib/locking/cluster_locking.c @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2002-2004 Sistina Software, Inc. All rights reserved. + * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Locking functions for LVM. + * The main purpose of this part of the library is to serialise LVM + * management operations across a cluster. 
+ */ + +#include "lib.h" +#include "clvm.h" +#include "lvm-string.h" +#include "locking.h" +#include "locking_types.h" + +#include +#include +#include +#include + +#ifndef CLUSTER_LOCKING_INTERNAL +int lock_resource(struct cmd_context *cmd, const char *resource, int flags); +void locking_end(void); +int locking_init(int type, struct config_tree *cf, uint32_t *flags); +#endif + +typedef struct lvm_response { + char node[255]; + char *response; + int status; + int len; +} lvm_response_t; + +/* + * This gets stuck at the start of memory we allocate so we + * can sanity-check it at deallocation time + */ +#define LVM_SIGNATURE 0x434C564D + +/* + * NOTE: the LVMD uses the socket FD as the client ID, this means + * that any client that calls fork() will inherit the context of + * it's parent. + */ +static int _clvmd_sock = -1; + +/* FIXME Install SIGPIPE handler? */ + +/* Open connection to the Cluster Manager daemon */ +static int _open_local_sock(void) +{ + int local_socket; + struct sockaddr_un sockaddr; + + /* Open local socket */ + if ((local_socket = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) { + log_error("Local socket creation failed: %s", strerror(errno)); + return -1; + } + + memset(&sockaddr, 0, sizeof(sockaddr)); + memcpy(sockaddr.sun_path, CLVMD_SOCKNAME, sizeof(CLVMD_SOCKNAME)); + + sockaddr.sun_family = AF_UNIX; + + if (connect(local_socket,(struct sockaddr *) &sockaddr, + sizeof(sockaddr))) { + int saved_errno = errno; + + log_error("connect() failed on local socket: %s", + strerror(errno)); + if (close(local_socket)) + stack; + + errno = saved_errno; + return -1; + } + + return local_socket; +} + +/* Send a request and return the status */ +static int _send_request(char *inbuf, int inlen, char **retbuf) +{ + char outbuf[PIPE_BUF]; + struct clvm_header *outheader = (struct clvm_header *) outbuf; + int len; + int off; + int buflen; + int err; + + /* Send it to CLVMD */ + rewrite: + if ( (err = write(_clvmd_sock, inbuf, inlen)) != inlen) { + if (err == -1 && 
errno == EINTR) + goto rewrite; + log_error("Error writing data to clvmd: %s", strerror(errno)); + return 0; + } + + /* Get the response */ + reread: + if ((len = read(_clvmd_sock, outbuf, sizeof(struct clvm_header))) < 0) { + if (errno == EINTR) + goto reread; + log_error("Error reading data from clvmd: %s", strerror(errno)); + return 0; + } + + if (len == 0) { + log_error("EOF reading CLVMD"); + errno = ENOTCONN; + return 0; + } + + /* Allocate buffer */ + buflen = len + outheader->arglen; + *retbuf = dbg_malloc(buflen); + if (!*retbuf) { + errno = ENOMEM; + return 0; + } + + /* Copy the header */ + memcpy(*retbuf, outbuf, len); + outheader = (struct clvm_header *) *retbuf; + + /* Read the returned values */ + off = 1; /* we've already read the first byte */ + + while (off < outheader->arglen && len > 0) { + len = read(_clvmd_sock, outheader->args + off, + buflen - off - offsetof(struct clvm_header, args)); + if (len > 0) + off += len; + } + + /* Was it an error ? */ + if (outheader->status < 0) { + errno = -outheader->status; + log_error("cluster send request failed: %s", strerror(errno)); + return 0; + } + + return 1; +} + +/* Build the structure header and parse-out wildcard node names */ +static void _build_header(struct clvm_header *head, int cmd, const char *node, + int len) +{ + head->cmd = cmd; + head->status = 0; + head->flags = 0; + head->clientid = 0; + head->arglen = len; + + if (node) { + /* + * Allow a couple of special node names: + * "*" for all nodes, + * "." 
for the local node only + */ + if (strcmp(node, "*") == 0) { + head->node[0] = '\0'; + } else if (strcmp(node, ".") == 0) { + head->node[0] = '\0'; + head->flags = CLVMD_FLAG_LOCAL; + } else + strcpy(head->node, node); + } else + head->node[0] = '\0'; +} + +/* + * Send a message to a(or all) node(s) in the cluster and wait for replies + */ +static int _cluster_request(char cmd, const char *node, void *data, int len, + lvm_response_t ** response, int *num) +{ + char outbuf[sizeof(struct clvm_header) + len + strlen(node) + 1]; + int *outptr; + char *inptr; + char *retbuf = NULL; + int status; + int i; + int num_responses = 0; + struct clvm_header *head = (struct clvm_header *) outbuf; + lvm_response_t *rarray; + + *num = 0; + + if (_clvmd_sock == -1) + _clvmd_sock = _open_local_sock(); + + if (_clvmd_sock == -1) + return 0; + + _build_header(head, cmd, node, len); + memcpy(head->node + strlen(head->node) + 1, data, len); + + status = _send_request(outbuf, sizeof(struct clvm_header) + + strlen(head->node) + len, &retbuf); + if (!status) + goto out; + + /* Count the number of responses we got */ + head = (struct clvm_header *) retbuf; + inptr = head->args; + while (inptr[0]) { + num_responses++; + inptr += strlen(inptr) + 1; + inptr += sizeof(int); + inptr += strlen(inptr) + 1; + } + + /* + * Allocate response array. 
+ * With an extra pair of INTs on the front to sanity + * check the pointer when we are given it back to free + */ + outptr = dbg_malloc(sizeof(lvm_response_t) * num_responses + + sizeof(int) * 2); + if (!outptr) { + errno = ENOMEM; + status = 0; + goto out; + } + + *response = (lvm_response_t *) (outptr + 2); + outptr[0] = LVM_SIGNATURE; + outptr[1] = num_responses; + rarray = *response; + + /* Unpack the response into an lvm_response_t array */ + inptr = head->args; + i = 0; + while (inptr[0]) { + strcpy(rarray[i].node, inptr); + inptr += strlen(inptr) + 1; + + rarray[i].status = *(int *) inptr; + inptr += sizeof(int); + + rarray[i].response = dbg_malloc(strlen(inptr) + 1); + if (rarray[i].response == NULL) { + /* Free up everything else and return error */ + int j; + for (j = 0; j < i; j++) + dbg_free(rarray[i].response); + free(outptr); + errno = ENOMEM; + status = -1; + goto out; + } + + strcpy(rarray[i].response, inptr); + rarray[i].len = strlen(inptr); + inptr += strlen(inptr) + 1; + i++; + } + *num = num_responses; + *response = rarray; + + out: + if (retbuf) + dbg_free(retbuf); + + return status; +} + +/* Free reply array */ +static int _cluster_free_request(lvm_response_t * response) +{ + int *ptr = (int *) response - 2; + int i; + int num; + + /* Check it's ours to free */ + if (response == NULL || *ptr != LVM_SIGNATURE) { + errno = EINVAL; + return 0; + } + + num = ptr[1]; + + for (i = 0; i < num; i++) { + dbg_free(response[i].response); + } + + dbg_free(ptr); + + return 1; +} + +static int _lock_for_cluster(unsigned char cmd, unsigned int flags, char *name) +{ + int status; + int i; + char *args; + const char *node = ""; + int len; + int saved_errno = errno; + lvm_response_t *response = NULL; + int num_responses; + + assert(name); + + len = strlen(name) + 3; + args = alloca(len); + strcpy(args + 2, name); + + args[0] = flags & 0xBF; /* Maskoff LOCAL flag */ + args[1] = 0; /* Not used now */ + + /* + * VG locks are just that: locks, and have no side 
effects + * so we only need to do them on the local node because all + * locks are cluster-wide. + * Also, if the lock is exclusive it makes no sense to try to + * acquire it on all nodes, so just do that on the local node too. + */ + if (cmd == CLVMD_CMD_LOCK_VG || + (flags & LCK_TYPE_MASK) == LCK_EXCL || + (flags & LCK_LOCAL)) + node = "."; + + status = _cluster_request(cmd, node, args, len, + &response, &num_responses); + + /* If any nodes were down then display them and return an error */ + for (i = 0; i < num_responses; i++) { + if (response[i].status == -EHOSTDOWN) { + log_error("clvmd not running on node %s", + response[i].node); + status = 0; + } else if (response[i].status) { + log_error("Error locking on node %s: %s", + response[i].node, + response[i].response[0] ? + response[i].response : + strerror(response[i].status)); + status = 0; + } + } + + saved_errno = errno; + _cluster_free_request(response); + errno = saved_errno; + + return status; +} + +/* API entry point for LVM */ +#ifdef CLUSTER_LOCKING_INTERNAL +static int _lock_resource(struct cmd_context *cmd, const char *resource, + int flags) +#else +int lock_resource(struct cmd_context *cmd, const char *resource, int flags) +#endif +{ + char lockname[PATH_MAX]; + int cluster_cmd = 0; + + assert(strlen(resource) < sizeof(lockname)); + + switch (flags & LCK_SCOPE_MASK) { + case LCK_VG: + /* If the VG name is empty then lock the unused PVs */ + if (!resource || !*resource) + lvm_snprintf(lockname, sizeof(lockname), "P_orphans"); + else + lvm_snprintf(lockname, sizeof(lockname), "V_%s", + resource); + + cluster_cmd = CLVMD_CMD_LOCK_VG; + flags &= LCK_TYPE_MASK; + break; + + case LCK_LV: + cluster_cmd = CLVMD_CMD_LOCK_LV; + strcpy(lockname, resource); + flags &= 0xffdf; /* Mask off HOLD flag */ + break; + + default: + log_error("Unrecognised lock scope: %d", + flags & LCK_SCOPE_MASK); + return 0; + } + + /* Send a message to the cluster manager */ + log_very_verbose("Locking %s at 0x%x", lockname, flags); 
+ + return _lock_for_cluster(cluster_cmd, flags, lockname); +} + +#ifdef CLUSTER_LOCKING_INTERNAL +static void _locking_end(void) +#else +void locking_end(void) +#endif +{ + if (_clvmd_sock != -1 && close(_clvmd_sock)) + stack; + + _clvmd_sock = -1; +} + +#ifdef CLUSTER_LOCKING_INTERNAL +static void _reset_locking(void) +#else +void reset_locking(void) +#endif +{ + if (close(_clvmd_sock)) + stack; + + _clvmd_sock = _open_local_sock(); + if (_clvmd_sock == -1) + stack; +} + +#ifdef CLUSTER_LOCKING_INTERNAL +int init_cluster_locking(struct locking_type *locking, struct config_tree *cft) +{ + locking->lock_resource = _lock_resource; + locking->fin_locking = _locking_end; + locking->reset_locking = _reset_locking; + locking->flags = LCK_PRE_MEMLOCK; + + _clvmd_sock = _open_local_sock(); + if (_clvmd_sock == -1) + return 0; + + return 1; +} +#else +int locking_init(int type, struct config_tree *cf, uint32_t *flags) +{ + _clvmd_sock = _open_local_sock(); + if (_clvmd_sock == -1) + return 0; + + /* Ask LVM to lock memory before calling us */ + *flags |= LCK_PRE_MEMLOCK; + + return 1; +} +#endif diff --git a/lib/locking/locking.c b/lib/locking/locking.c index f4fa45eb3..2c5ab1b9e 100644 --- a/lib/locking/locking.c +++ b/lib/locking/locking.c @@ -145,6 +145,14 @@ int init_locking(int type, struct config_tree *cft) return 1; #endif +#ifdef CLUSTER_LOCKING_INTERNAL + case 3: + if (!init_cluster_locking(&_locking, cft)) + break; + log_very_verbose("Cluster locking enabled."); + return 1; +#endif + default: log_error("Unknown locking type requested."); return 0; diff --git a/lib/locking/locking_types.h b/lib/locking/locking_types.h index de8d94490..441e2c309 100644 --- a/lib/locking/locking_types.h +++ b/lib/locking/locking_types.h @@ -40,3 +40,4 @@ int init_no_locking(struct locking_type *locking, struct config_tree *cf); int init_file_locking(struct locking_type *locking, struct config_tree *cf); int init_external_locking(struct locking_type *locking, struct config_tree *cf); 
+int init_cluster_locking(struct locking_type *locking, struct config_tree *cf); diff --git a/scripts/clvmd_fix_conf.sh b/scripts/clvmd_fix_conf.sh new file mode 100644 index 000000000..9e363d521 --- /dev/null +++ b/scripts/clvmd_fix_conf.sh @@ -0,0 +1,154 @@ +#!/bin/sh +# +# Edit an lvm.conf file to enable cluster locking. +# +# $1 is the directory where the locking library is installed. +# $2 (optional) is the config file +# $3 (optional) is the locking library name +# +# +PREFIX=$1 +LVMCONF=$2 +LIB=$3 + +if [ -z "$PREFIX" ] +then + echo "usage: $0 [] []" + echo "" + echo " location of the cluster locking shared library. (no default)" + echo " name of the LVM config file (default: /etc/lvm/lvm.conf)" + echo " name of the shared library (default: liblvm2clusterlock.so)" + echo "" + exit 0 +fi + +[ -z "$LVMCONF" ] && LVMCONF="/etc/lvm/lvm.conf" +[ -z "$LIB" ] && LIB="liblvm2clusterlock.so" + +if [ "${PREFIX:0:1}" != "/" ] +then + echo "Prefix must be an absolute path name (starting with a /)" + exit 12 +fi + +if [ ! -f "$LVMCONF" ] +then + echo "$LVMCONF does not exist" + exit 10 +fi + +if [ ! -f "$PREFIX/$LIB" ] +then + echo "$PREFIX/$LIB does not exist, did you do a \"make install\" ?" + exit 11 +fi + + +SCRIPTFILE=`mktemp -t lvmscript.XXXXXXXXXX` +TMPFILE=`mktemp -t lvmtmp.XXXXXXXXXX` + + +# Flags so we know which parts of the file we can replace and which need +# adding. These are return codes from grep, so zero means it IS present! +have_type=1 +have_dir=1 +have_library=1 +have_global=1 + +grep -q '^[[:blank:]]*locking_type[[:blank:]]*=' $LVMCONF +have_type=$? + +grep -q '^[[:blank:]]*library_dir[[:blank:]]*=' $LVMCONF +have_dir=$? + +grep -q '^[[:blank:]]*locking_library[[:blank:]]*=' $LVMCONF +have_library=$? + +# Those options are in section "global {" so we must have one if any are present. +if [ "$have_type" = "0" -o "$have_dir" = "0" -o "$have_library" = "0" ] +then + + # See if we can find it... 
+ grep -q '^[[:blank:]]*global[[:blank:]]*{' $LVMCONF + have_global=$? + + if [ "$have_global" = "1" ] + then + echo "global keys but no 'global {' found, can't edit file" + exit 12 + fi +fi + +# So if we don't have "global {" we need to create one and +# populate it + +if [ "$have_global" = "1" ] +then + cat $LVMCONF - < $TMPFILE +global { + # Enable locking for cluster LVM + locking_type = 2 + library_dir = "$PREFIX" + locking_library = "$LIB" +} +EOF + if [ $? != 0 ] + then + echo "failed to create temporary config file, $LVMCONF not updated" + exit 1 + fi +else + # + # We have a "global {" section, so add or replace the + # locking entries as appropriate + # + + if [ "$have_type" = "0" ] + then + SEDCMD=" s/^[[:blank:]]*locking_type[[:blank:]]*=.*/\ \ \ \ locking_type = 2/g" + else + SEDCMD=" /global[[:blank:]]*{/a\ \ \ \ locking_type = 2" + fi + + if [ "$have_dir" = "0" ] + then + SEDCMD="${SEDCMD}\ns'^[[:blank:]]*library_dir[[:blank:]]*=.*'\ \ \ \ library_dir = \"$PREFIX\"'g" + else + SEDCMD="${SEDCMD}\n/global[[:blank:]]*{/a\ \ \ \ library_dir = \"$PREFIX\"" + fi + + if [ "$have_library" = "0" ] + then + SEDCMD="${SEDCMD}\ns/^[[:blank:]]*locking_library[[:blank:]]*=.*/\ \ \ \ locking_library = \"$LIB\"/g" + else + SEDCMD="${SEDCMD}\n/global[[:blank:]]*{/a\ \ \ \ locking_library = \"$LIB\"" + fi + + echo -e $SEDCMD > $SCRIPTFILE + sed <$LVMCONF >$TMPFILE -f $SCRIPTFILE + if [ $? != 0 ] + then + echo "sed failed, $LVMCONF not updated" + exit 1 + fi +fi + +# Now we have a suitably editted config file in a temp place, +# backup the original and copy our new one into place. + +cp $LVMCONF $LVMCONF.nocluster +if [ $? != 0 ] + then + echo "failed to backup old config file, $LVMCONF not updated" + exit 2 +fi + +cp $TMPFILE $LVMCONF +if [ $? 
!= 0 ] + then + echo "failed to copy new config file into place, check $LVMCONF is still OK" + exit 3 +fi + +rm -f $SCRIPTFILE $TMPFILE + diff --git a/scripts/clvmd_init b/scripts/clvmd_init new file mode 100755 index 000000000..31eb8cebc --- /dev/null +++ b/scripts/clvmd_init @@ -0,0 +1,90 @@ +#!/bin/bash +# +# /etc/rc.d/init.d/clvmd +# +# Starts the clvm daemon +# NOTE: These startup levels may not be right yet - it depends on where +# the rest of the cluster startup goes. +# +# chkconfig: 345 72 5 +# description: distributes LVM commands in a clustered environment. \ +# a clvmd must be run on all nodes in a cluster for clustered LVM \ +# operations to work. +# processname: clvmd + +# Source function library. +. /etc/init.d/functions + +BINARY=/usr/sbin/clvmd +LOCKFILE=/var/lock/subsys/clvmd + +test -x "$BINARY" || exit 0 + +RETVAL=0 + +# +# See how we were called. +# + +prog="clvmd" + +start() { + # Check if clvmd is already running + if [ ! -f "$LOCKFILE" ]; then + echo -n $"Starting $prog: " + daemon $BINARY + RETVAL=$? + [ $RETVAL -eq 0 ] && touch $LOCKFILE + echo + fi + return $RETVAL +} + +stop() { + echo -n $"Stopping $prog: " + killproc $BINARY + RETVAL=$? + [ $RETVAL -eq 0 ] && rm -f $LOCKFILE + echo + return $RETVAL +} + + +restart() { + stop + start +} + +reload() { + restart +} + +status_clvm() { + status $BINARY +} + +case "$1" in +start) + start + ;; +stop) + stop + ;; +reload|restart) + restart + ;; +condrestart) + if [ -f $LOCKFILE ]; then + restart + fi + ;; +status) + status_clvm + ;; +*) + echo $"Usage: $0 {start|stop|restart|condrestart|status}" + exit 1 +esac + +exit $? +exit $RETVAL