2025-08-24 21:49:29 +03:00 · 2008-01-07 16:16:43 +11:00
parent db6ac79579 d38fbaa38b
commit fd227f5d29
10 changed files with 212 additions and 111 deletions
--- a/ctdb/config/ctdb.sysconfig
+++ b/ctdb/config/ctdb.sysconfig
@ -21,6 +21,15 @@
 #
 # CTDB_PUBLIC_ADDRESSES=/etc/ctdb/public_addresses

+# Should CTDB present the cluster using a single public ip address to clients
+# and multiplex clients across all CONNECTED nodes?
+# This is based on LVS.
+# When this is enabled, the entire cluster will present one single ip address
+# which clients will connect to.
+# CTDB_LVS_PUBLIC_IP=10.1.1.1
+
+
+# IPMUX : OBSOLETE use LVS instead
 # Should ctdb implement a single public ip address across the entire cluster
 # and multiplex incoming connections across the connected nodes
 # When using a single public ip you must also specify the public interface!
--- a/ctdb/config/events.d/50.samba
+++ b/ctdb/config/events.d/50.samba
@ -46,14 +46,19 @@ case $cmd in

 	# make sure samba is not already started
 	service smb stop > /dev/null 2>&1
+	killall -0 -q smbd && {
+	    sleep 1
+	    # make absolutely sure samba is dead
+	    killall -q -9 smbd
+	}

 	# restart the winbind service
 	[ "$CTDB_MANAGES_WINBIND" = "yes" ] && {
 		service winbind stop > /dev/null 2>&1
-		killall -0 -q smbd winbindd && {
+		killall -0 -q winbindd && {
 		    sleep 1
-		    # make absolutely sure samba is dead
-		    killall -q -9 smbd winbindd
+		    # make absolutely sure winbindd is dead
+		    killall -q -9 winbindd
 		}
 		service winbind start
 	}
--- a/ctdb/config/events.d/60.nfs
+++ b/ctdb/config/events.d/60.nfs
@ -55,6 +55,9 @@ case $cmd in
 	;;

     recovered)
+	# if no IPs have changed then we don't need to restart statd
+	[ -f $CTDB_BASE/state/statd/restart ] || exit 0;
+
 	# always restart the lockmanager so that we start with a clusterwide
 	# graceperiod when ip addresses has changed
 	[ -x $CTDB_BASE/statd-callout ] && {
--- a/ctdb/doc/ctdbd.1.xml
+++ b/ctdb/doc/ctdbd.1.xml
@ -20,20 +20,25 @@
 	
 	<cmdsynopsis>
 		<command>ctdbd</command>
-		<arg choice="req">--reclock=&lt;filename&gt;</arg>
-		<arg choice="req">--nlist=&lt;filename&gt;</arg>
-		<arg choice="req">--dbdir=&lt;directory&gt;</arg>
 		<arg choice="opt">-? --help</arg>
-		<arg choice="opt">--usage</arg>
-		<arg choice="opt">-i --interactive</arg>
-		<arg choice="opt">--public-addresses=&lt;filename&gt;</arg>
-		<arg choice="opt">--event-script-dir=&lt;directory&gt;</arg>
-		<arg choice="opt">--logfile=&lt;filename&gt;</arg>
-		<arg choice="opt">--listen=&lt;address&gt;</arg>
-		<arg choice="opt">--transport=&lt;STRING&gt;</arg>
-		<arg choice="opt">--socket=&lt;filename&gt;</arg>
 		<arg choice="opt">-d --debug=&lt;INTEGER&gt;</arg>
+		<arg choice="req">--dbdir=&lt;directory&gt;</arg>
+		<arg choice="req">--dbdir-persistent=&lt;directory&gt;</arg>
+		<arg choice="opt">--event-script-dir=&lt;directory&gt;</arg>
+		<arg choice="opt">-i --interactive</arg>
+		<arg choice="opt">--listen=&lt;address&gt;</arg>
+		<arg choice="opt">--logfile=&lt;filename&gt;</arg>
+		<arg choice="req">--nlist=&lt;filename&gt;</arg>
+		<arg choice="opt">--nosetsched</arg>
+		<arg choice="opt">--public-addresses=&lt;filename&gt;</arg>
+		<arg choice="opt">--public-interface=&lt;interface&gt;</arg>
+		<arg choice="req">--reclock=&lt;filename&gt;</arg>
+		<arg choice="opt">--single-public-ip=&lt;address&gt;</arg>
+		<arg choice="opt">--socket=&lt;filename&gt;</arg>
+		<arg choice="opt">--syslog</arg>
 		<arg choice="opt">--torture</arg>
+		<arg choice="opt">--transport=&lt;STRING&gt;</arg>
+		<arg choice="opt">--usage</arg>
 	</cmdsynopsis>
 	
 </refsynopsisdiv>
@ -69,30 +74,10 @@
        </listitem>
      </varlistentry>

-      <varlistentry><term>--usage</term>
+      <varlistentry><term>-d --debug=&lt;DEBUGLEVEL&gt;</term>
        <listitem>
          <para>
-            Print useage information to the screen.
-          </para>
-        </listitem>
-      </varlistentry>
-
-      <varlistentry><term>--reclock=&lt;filename&gt;</term>
-        <listitem>
-          <para>
-            This is the name of the lock file stored of the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery-master.
-            This file must be stored on shared storage.
-          </para>
-        </listitem>
-      </varlistentry>
-
-      <varlistentry><term>--nlist=&lt;filename&gt;</term>
-        <listitem>
-          <para>
-            This file contains a list of the private ip addresses of every node in the cluster. There is one line/ip address for each node. This file must be the same for all nodes in the cluster.
-          </para>
-          <para>
-            This file is usually /etc/ctdb/nodes .
+            This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile. The default is 0 which will only log important events and errors. A larger number will provide additional logging.
          </para>
        </listitem>
      </varlistentry>
@ -109,22 +94,14 @@
        </listitem>
      </varlistentry>

-      <varlistentry><term>-i --interactive</term>
+      <varlistentry><term>--dbdir-persistent=&lt;directory&gt;</term>
        <listitem>
          <para>
-            By default ctdbd will detach itself from the shell and run in
-            the background as a daemon. This option makes ctdbd to start in interactive mode.
-          </para>
-        </listitem>
-      </varlistentry>
-
-      <varlistentry><term>--public_addresses=&lt;filename&gt;</term>
-        <listitem>
-          <para>
-            When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster. This file contains a list of ip addresses netmasks and interfaces. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes.
+            This is the directory on local storage where ctdbd keeps the local
+            copy of the persistent TDB databases. This directory is local for each node and should not be stored on the shared cluster filesystem.
          </para>
          <para>
-            This is usually the file /etc/ctdb/public_addresses
+            This directory would usually be /etc/ctdb/persistent .
          </para>
        </listitem>
      </varlistentry>
@ -141,10 +118,11 @@
        </listitem>
      </varlistentry>

-      <varlistentry><term>--logfile=&lt;filename&gt;</term>
+      <varlistentry><term>-i --interactive</term>
        <listitem>
          <para>
-            This is the file where ctdbd will write its log. This is usually /var/log/log.ctdb .
+            By default ctdbd will detach itself from the shell and run in
+            the background as a daemon. This option makes ctdbd start in interactive mode.
          </para>
        </listitem>
      </varlistentry>
@ -160,13 +138,92 @@
        </listitem>
      </varlistentry>

-      <varlistentry><term>--transport=&lt;STRING&gt;</term>
+      <varlistentry><term>--logfile=&lt;filename&gt;</term>
        <listitem>
          <para>
-            This option specifies which transport to use for ctdbd internode communications. The default is "tcp".
+            This is the file where ctdbd will write its log. This is usually /var/log/log.ctdb .
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>--nlist=&lt;filename&gt;</term>
+        <listitem>
+          <para>
+            This file contains a list of the private ip addresses of every node in the cluster. There is one line/ip address for each node. This file must be the same for all nodes in the cluster.
          </para>
          <para>
-            Suported transports are "tcp" and "infiniband".
+            This file is usually /etc/ctdb/nodes .
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>--nosetsched</term>
+        <listitem>
+          <para>
+            Normally ctdb will change its scheduler to run as a real-time 
+	    process. This option is used to change this behaviour and have
+	    ctdb run as a normal process.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>--public-addresses=&lt;filename&gt;</term>
+        <listitem>
+          <para>
+            When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster. This file contains a list of ip addresses, netmasks and interfaces. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes.
+          </para>
+          <para>
+            This is usually the file /etc/ctdb/public_addresses
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>--public-interface=&lt;interface&gt;</term>
+        <listitem>
+          <para>
+            This option tells ctdb which interface to attach public-addresses
+	    to and also where to attach the single-public-ip when used.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>--reclock=&lt;filename&gt;</term>
+        <listitem>
+          <para>
+            This is the name of the lock file stored on the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery-master.
+            This file must be stored on shared storage.
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>--single-public-ip=&lt;address&gt;</term>
+        <listitem>
+          <para>
+            This option is used to activate the "ipmux" functionality of ctdb.
+            In this mode, all nodes of the cluster will expose one single
+            ip address, with all incoming traffic to the cluster
+            being passed through the current recmaster. This functionality
+            is similar to using a load-balancing switch.
+          </para>
+          <para>
+            All incoming packets are sent to the recmaster which will multiplex
+            the clients across all available nodes and pass the packets on to
+            a different node in the cluster to manage the connection based
+            on the client's ip address. Outgoing packets however are sent
+            directly from the node that was chosen back to the client.
+            Since all incoming packets are sent through the recmaster this will
+            have a throughput and performance impact when used. This impact
+            in performance primarily affects write-performance while 
+	    read-performance should be mainly unaffected.
+            Only use this feature if your environment is mostly-read 
+            (i.e. most traffic is from the nodes back to the clients) or
+            if it is not important to get maximum write-performance to the
+	    cluster.
+          </para>
+          <para>
+            When using a single public ip, you must also specify the 
+            public-interface so that ctdb knows which interface to attach the 
+            single public ip to.
          </para>
        </listitem>
      </varlistentry>
@ -182,10 +239,10 @@
        </listitem>
      </varlistentry>

-      <varlistentry><term>-d --debug=&lt;DEBUGLEVEL&gt;</term>
+      <varlistentry><term>--syslog</term>
        <listitem>
          <para>
-            This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile. The default is 0 which will only log important events and errors. A larger number will provide additional logging.
+	    Send all log messages to syslog instead of to the ctdb logfile.
          </para>
        </listitem>
      </varlistentry>
@ -200,6 +257,26 @@
          </para>
        </listitem>
      </varlistentry>
+
+      <varlistentry><term>--transport=&lt;STRING&gt;</term>
+        <listitem>
+          <para>
+            This option specifies which transport to use for ctdbd internode communications. The default is "tcp".
+          </para>
+          <para>
+            Supported transports are "tcp" and "infiniband".
+          </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>--usage</term>
+        <listitem>
+          <para>
+            Print usage information to the screen.
+          </para>
+        </listitem>
+      </varlistentry>
+
    </variablelist>
  </refsect1>

--- a/ctdb/packaging/RPM/ctdb.spec
+++ b/ctdb/packaging/RPM/ctdb.spec
@ -101,6 +101,7 @@ fi
 %{_sysconfdir}/ctdb/events.d/60.nfs
 %{_sysconfdir}/ctdb/events.d/61.nfstickle
 %{_sysconfdir}/ctdb/events.d/90.ipmux
+%{_sysconfdir}/ctdb/events.d/91.lvs
 %{_sysconfdir}/ctdb/statd-callout
 %{_sbindir}/ctdbd
 %{_bindir}/ctdb
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@ -170,36 +170,38 @@ ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA ind
 	return 0;
 }

-struct getkeys_params {
+/* 
+   a traverse function for pulling all relevant records from pulldb
+ */
+struct pulldb_data {
 	struct ctdb_context *ctdb;
-	uint32_t lmaster;
-	uint32_t rec_count;
-	struct getkeys_rec {
-		TDB_DATA key;
-		TDB_DATA data;
-	} *recs;
+	struct ctdb_control_pulldb_reply *pulldata;
+	uint32_t len;
+	bool failed;
 };

-static int traverse_getkeys(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
+static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
 {
-	struct getkeys_params *params = (struct getkeys_params *)p;
-	uint32_t lmaster;
+	struct pulldb_data *params = (struct pulldb_data *)p;
+	struct ctdb_rec_data *rec;

-	lmaster = ctdb_lmaster(params->ctdb, &key);
-
-	/* only include this record if the lmaster matches or if
-	   the wildcard lmaster (-1) was specified.
-	*/
-	if ((params->lmaster != CTDB_LMASTER_ANY) && (params->lmaster != lmaster)) {
-		return 0;
+	/* marshall the record and append it to the reply blob; on any failure set 'failed' and abort the traverse */
+	rec = ctdb_marshall_record(params->pulldata, 0, key, NULL, data);
+	if (rec == NULL) {
+		params->failed = true;
+		return -1;
 	}
-
-	params->recs = talloc_realloc(NULL, params->recs, struct getkeys_rec, params->rec_count+1);
-	key.dptr = talloc_memdup(params->recs, key.dptr, key.dsize);
-	data.dptr = talloc_memdup(params->recs, data.dptr, data.dsize);
-	params->recs[params->rec_count].key = key;
-	params->recs[params->rec_count].data = data;
-	params->rec_count++;
+	params->pulldata = talloc_realloc_size(NULL, params->pulldata, rec->length + params->len);
+	if (params->pulldata == NULL) {
+		/* pulldata is NULL here, so it must not be dereferenced for the log message */
+		DEBUG(0,(__location__ " Failed to expand pulldb_data to %u\n", rec->length + params->len));
+		params->failed = true;
+		return -1;
+	}
+	params->pulldata->count++;
+	memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length);
+	params->len += rec->length;
+	talloc_free(rec);

 	return 0;
 }
@ -211,10 +213,8 @@ int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DAT
 {
 	struct ctdb_control_pulldb *pull;
 	struct ctdb_db_context *ctdb_db;
-	struct getkeys_params params;
+	struct pulldb_data params;
 	struct ctdb_control_pulldb_reply *reply;
-	int i;
-	size_t len = 0;

 	if (ctdb->freeze_mode != CTDB_FREEZE_FROZEN) {
 		DEBUG(0,("rejecting ctdb_control_pull_db when not frozen\n"));
@ -225,47 +225,36 @@ int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DAT
 	
 	ctdb_db = find_ctdb_db(ctdb, pull->db_id);
 	if (!ctdb_db) {
-		DEBUG(0,(__location__ " Unknown db\n"));
+		DEBUG(0,(__location__ " Unknown db 0x%08x\n", pull->db_id));
 		return -1;
 	}

-	params.ctdb = ctdb;
-	params.lmaster = pull->lmaster;
+	reply = talloc_zero(outdata, struct ctdb_control_pulldb_reply);
+	CTDB_NO_MEMORY(ctdb, reply);

-	params.rec_count = 0;
-	params.recs = talloc_array(outdata, struct getkeys_rec, 0);
-	CTDB_NO_MEMORY(ctdb, params.recs);
+	reply->db_id = pull->db_id;
+
+	params.ctdb = ctdb;
+	params.pulldata = reply;
+	params.len = offsetof(struct ctdb_control_pulldb_reply, data);
+	params.failed = false;

 	if (ctdb_lock_all_databases_mark(ctdb) != 0) {
 		DEBUG(0,(__location__ " Failed to get lock on entired db - failing\n"));
 		return -1;
 	}

-	tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_getkeys, &params);
+	if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, &params) == -1 || params.failed) { /* also check the flag traverse_pulldb sets when it aborts */
+		DEBUG(0,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
+		ctdb_lock_all_databases_unmark(ctdb);
+		talloc_free(params.pulldata);
+		return -1;
+	}

 	ctdb_lock_all_databases_unmark(ctdb);

-	reply = talloc(outdata, struct ctdb_control_pulldb_reply);
-	CTDB_NO_MEMORY(ctdb, reply);
-
-	reply->db_id = pull->db_id;
-	reply->count = params.rec_count;
-
-	len = offsetof(struct ctdb_control_pulldb_reply, data);
-
-	for (i=0;i<reply->count;i++) {
-		struct ctdb_rec_data *rec;
-		rec = ctdb_marshall_record(outdata, 0, params.recs[i].key, NULL, params.recs[i].data);
-		reply = talloc_realloc_size(outdata, reply, rec->length + len);
-		memcpy(len+(uint8_t *)reply, rec, rec->length);
-		len += rec->length;
-		talloc_free(rec);
-	}
-
-	talloc_free(params.recs);
-
-	outdata->dptr = (uint8_t *)reply;
-	outdata->dsize = len;
+	outdata->dptr = (uint8_t *)params.pulldata;
+	outdata->dsize = params.len;

 	return 0;
 }
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@ -542,7 +542,7 @@ static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
 			struct ctdb_ltdb_header header;
 			if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
 				DEBUG(0,(__location__ " Bad record size %u from node %u\n", 
-					 existing.dsize, srcnode));
+					 (unsigned)existing.dsize, srcnode));
 				free(existing.dptr);
 				talloc_free(tmp_ctx);
 				return -1;
@ -898,6 +898,7 @@ struct recdb_data {
 	struct ctdb_context *ctdb;
 	struct ctdb_control_pulldb_reply *recdata;
 	uint32_t len;
+	bool failed;
 };

 static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
@ -917,10 +918,15 @@ static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,

 	/* add the record to the blob ready to send to the nodes */
 	rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
+	if (rec == NULL) {
+		params->failed = true;
+		return -1;
+	}
 	params->recdata = talloc_realloc_size(NULL, params->recdata, rec->length + params->len);
 	if (params->recdata == NULL) {
 		DEBUG(0,(__location__ " Failed to expand recdata to %u (%u records)\n", 
 			 rec->length + params->len, params->recdata->count));
+		params->failed = true;
 		return -1;
 	}
 	params->recdata->count++;
@ -949,12 +955,20 @@ static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
 	params.ctdb = ctdb;
 	params.recdata = recdata;
 	params.len = offsetof(struct ctdb_control_pulldb_reply, data);
+	params.failed = false;

 	if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
 		DEBUG(0,(__location__ " Failed to traverse recdb database\n"));
+		talloc_free(params.recdata);
 		return -1;
 	}

+	if (params.failed) {
+		DEBUG(0,(__location__ " Failed to traverse recdb database\n"));
+		talloc_free(params.recdata);
+		return -1;		
+	}
+
 	recdata = params.recdata;

 	outdata.dptr = (void *)recdata;
--- a/ctdb/tools/ctdb_diagnostics
+++ b/ctdb/tools/ctdb_diagnostics
@ -160,6 +160,7 @@ EOF
 show_all "/usr/lpp/mmfs/bin/mmlsconfig"
 show_all "/usr/lpp/mmfs/bin/mmlsfs all"
 show_all "/usr/lpp/mmfs/bin/mmlsnsd"
+ show_all "/usr/lpp/mmfs/bin/mmlsnsd -X"
 show_all "/usr/lpp/mmfs/bin/mmfsadm dump version"
 show_all "/usr/lpp/mmfs/bin/mmfsadm dump waiters"
 show_all "/usr/lpp/mmfs/bin/mmlsmount all"
--- a/ctdb/tools/onnode.rsh
+++ b/ctdb/tools/onnode.rsh
@ -19,6 +19,7 @@ MAXNODE=`expr $NUMNODES - 1`

 if [ $NODE = "all" ]; then
    for a in `egrep '^[[:alnum:]]' $NODES`; do
+	echo; echo ">> NODE: $a <<"
 	if [ -f "$SCRIPT" ]; then
 	    rsh $a at -f $SCRIPT now
 	else
--- a/ctdb/tools/onnode.ssh
+++ b/ctdb/tools/onnode.ssh
@ -19,6 +19,7 @@ MAXNODE=`expr $NUMNODES - 1`

 if [ $NODE = "all" ]; then
    for a in `egrep '^[[:alnum:]]' $NODES`; do
+	echo; echo ">> NODE: $a <<"
 	if [ -f "$SCRIPT" ]; then
 	    ssh -n $a at -f $SCRIPT now
 	else