From 277cdbe3d10c4d7dacc69a3c3bad06c0e7fac6df Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Thu, 23 Aug 2007 09:53:10 +1000
Subject: [PATCH 01/15] create an enum to describe the state of a control in
 flight  instead of using the enum that is for calls

(This used to be ctdb commit f9cf7076151af983a1c4ea56fbeb6d94ea508a34)
---
 ctdb/client/ctdb_client.c | 10 ++++++----
 ctdb/doc/ctdb.1           | 10 +++++-----
 ctdb/doc/ctdb.1.html      | 30 +++++++++++++++---------------
 ctdb/doc/ctdbd.1          |  6 +++---
 ctdb/doc/ctdbd.1.html     |  2 +-
 5 files changed, 30 insertions(+), 28 deletions(-)
diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index b569a69c8d8..19b19f489a4 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -661,12 +661,14 @@ int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
 }
 
 
+enum control_state {CTDB_CONTROL_WAIT, CTDB_CONTROL_DONE, CTDB_CONTROL_ERROR};
+
 struct ctdb_client_control_state {
 	struct ctdb_context *ctdb;
 	uint32_t reqid;
 	int32_t status;
 	TDB_DATA outdata;
-	enum call_state state;
+	enum control_state state;
 	char *errormsg;
 };
 
@@ -705,7 +707,7 @@ static void ctdb_client_reply_control(struct ctdb_context *ctdb,
 
 	talloc_steal(state, c);
 
-	state->state = CTDB_CALL_DONE;
+	state->state = CTDB_CONTROL_DONE;
 }
 
 
@@ -758,7 +760,7 @@ int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
 
 	state->ctdb  = ctdb;
 	state->reqid = ctdb_reqid_new(ctdb, state);
-	state->state = CTDB_CALL_WAIT;
+	state->state = CTDB_CONTROL_WAIT;
 	state->errormsg = NULL;
 
 	talloc_set_destructor(state, ctdb_control_destructor);
@@ -796,7 +798,7 @@ int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
 	if (timeout && !timeval_is_zero(timeout)) {
 		event_add_timed(ctdb->ev, state, *timeout, timeout_func, &timed_out);
 	}
-	while ((state->state == CTDB_CALL_WAIT)
+	while ((state->state == CTDB_CONTROL_WAIT)
 	&&	(timed_out == 0) ){
 		event_loop_once(ctdb->ev);
 	}
diff --git a/ctdb/doc/ctdb.1 b/ctdb/doc/ctdb.1
index af75cb28c38..a39de9b6ca7 100644
--- a/ctdb/doc/ctdb.1
+++ b/ctdb/doc/ctdb.1
@@ -1,11 +1,11 @@
 .\"     Title: ctdb
 .\"    Author: 
 .\" Generator: DocBook XSL Stylesheets v1.71.0 <http://docbook.sf.net/>
-.\"      Date: 08/03/2007
+.\"      Date: 08/23/2007
 .\"    Manual: 
 .\"    Source: 
 .\"
-.TH "CTDB" "1" "08/03/2007" "" ""
+.TH "CTDB" "1" "08/23/2007" "" ""
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -31,7 +31,7 @@ The virtual node number is an integer that describes the node in the cluster. Th
 .PP
 \-Y
 .RS 3n
-Produce output in machinereadable form for easier parsing by scripts. Not all commands support this option.
+Produce output in machine readable form for easier parsing by scripts. Not all commands support this option.
 .RE
 .PP
 \-t <timeout>
@@ -78,7 +78,7 @@ Node status reflects the current status of the node. There are four possible sta
 .PP
 OK \- This node is fully functional.
 .PP
-DISCONNECTED \- This node could not be connected through the network and is currently not parcipitating in the cluster. If there is a public IP address associated with this node it should have been taken over by a different node. No services are running on this node.
+DISCONNECTED \- This node could not be connected through the network and is currently not participating in the cluster. If there is a public IP address associated with this node it should have been taken over by a different node. No services are running on this node.
 .PP
 DISABLED \- This node has been administratively disabled. This node is still functional and participates in the CTDB cluster but its IP addresses have been taken over by a different node and no services are currently being hosted.
 .PP
@@ -104,7 +104,7 @@ The generation id is a number that indicates the current generation of a cluster
 \fBVNNMAP\fR
 .RS
 .PP
-The list of Virtual Node Numbers. This is a list of all nodes that actively participates in the cluster and that share the workload of hosting the Clustered TDB database records. Only nodes that are parcipitating in the vnnmap can become lmaster or dmaster for a database record.
+The list of Virtual Node Numbers. This is a list of all nodes that actively participates in the cluster and that share the workload of hosting the Clustered TDB database records. Only nodes that are participating in the vnnmap can become lmaster or dmaster for a database record.
 .RE
 .sp
 .it 1 an-trap
diff --git a/ctdb/doc/ctdb.1.html b/ctdb/doc/ctdb.1.html
index ec514f070ff..23dfda09da7 100644
--- a/ctdb/doc/ctdb.1.html
+++ b/ctdb/doc/ctdb.1.html
@@ -8,7 +8,7 @@
 	    The virtual node number is an integer that describes the node in the
 	    cluster. The first node has virtual node number 0.
           </p></dd><dt><span class="term">-Y</span></dt><dd><p>
-            Produce output in machinereadable form for easier parsing by scripts. Not all commands support this option.
+            Produce output in machine readable form for easier parsing by scripts. Not all commands support this option.
           </p></dd><dt><span class="term">-t &lt;timeout&gt;</span></dt><dd><p>
             How long should ctdb wait for a command to complete before timing out. Default is 3 seconds.
           </p></dd><dt><span class="term">-? --help</span></dt><dd><p>
@@ -24,36 +24,36 @@
             You only need to specify this parameter if you run multiple ctdb 
             daemons on the same physical host and thus can not use the default
             name for the domain socket.
-          </p></dd></dl></div></div><div class="refsect1" lang="en"><a name="id2481133"></a><h2>Administrative Commands</h2><p>
+          </p></dd></dl></div></div><div class="refsect1" lang="en"><a name="id2481134"></a><h2>Administrative Commands</h2><p>
       These are commands used to monitor and administrate a CTDB cluster.
-    </p><div class="refsect2" lang="en"><a name="id2481142"></a><h3>status</h3><p>
+    </p><div class="refsect2" lang="en"><a name="id2481143"></a><h3>status</h3><p>
         This command shows the current status of the ctdb node.
-      </p><div class="refsect3" lang="en"><a name="id2481151"></a><h4>node status</h4><p>
+      </p><div class="refsect3" lang="en"><a name="id2481152"></a><h4>node status</h4><p>
           Node status reflects the current status of the node. There are four possible states:
         </p><p>
           OK - This node is fully functional.
         </p><p>
-          DISCONNECTED - This node could not be connected through the network and is currently not parcipitating in the cluster. If there is a public IP address associated with this node it should have been taken over by a different node. No services are running on this node.
+          DISCONNECTED - This node could not be connected through the network and is currently not participating in the cluster. If there is a public IP address associated with this node it should have been taken over by a different node. No services are running on this node.
         </p><p>
           DISABLED - This node has been administratively disabled. This node is still functional and participates in the CTDB cluster but its IP addresses have been taken over by a different node and no services are currently being hosted.
         </p><p>
           UNHEALTHY - A service provided by this node is malfunctioning and should be investigated. The CTDB daemon itself is operational and participates in the cluster. Its public IP address has been taken over by a different node and no services are currnetly being hosted. All unhealthy nodes should be investigated and require an administrative action to rectify.
         </p><p>
           BANNED - This node failed too many recovery attempts and has been banned from participating in the cluster for a period of RecoveryBanPeriod seconds. Any public IP address has been taken over by other nodes. This node does not provide any services. All banned nodes should be investigated and require an administrative action to rectify. This node does not perticipate in the CTDB cluster but can still be communicated with. I.e. ctdb commands can be sent to it.
-        </p></div><div class="refsect3" lang="en"><a name="id2481202"></a><h4>generation</h4><p>
+        </p></div><div class="refsect3" lang="en"><a name="id2481204"></a><h4>generation</h4><p>
           The generation id is a number that indicates the current generation 
           of a cluster instance. Each time a cluster goes through a 
           reconfiguration or a recovery its generation id will be changed.
-        </p></div><div class="refsect3" lang="en"><a name="id2481215"></a><h4>VNNMAP</h4><p>
+        </p></div><div class="refsect3" lang="en"><a name="id2481216"></a><h4>VNNMAP</h4><p>
           The list of Virtual Node Numbers. This is a list of all nodes that actively participates in the cluster and that share the workload of hosting the Clustered TDB database records.
-          Only nodes that are parcipitating in the vnnmap can become lmaster or dmaster for a database record.
-        </p></div><div class="refsect3" lang="en"><a name="id2481229"></a><h4>Recovery mode</h4><p>
+          Only nodes that are participating in the vnnmap can become lmaster or dmaster for a database record.
+        </p></div><div class="refsect3" lang="en"><a name="id2481230"></a><h4>Recovery mode</h4><p>
           This is the current recovery mode of the cluster. There are two possible modes:
         </p><p>
           NORMAL - The cluster is fully operational.
         </p><p>
           RECOVERY - The cluster databases have all been frozen, pausing all services while the cluster awaits a recovery process to complete. A recovery process should finish within seconds. If a cluster is stuck in the RECOVERY state this would indicate a cluster malfunction which needs to be investigated.
-        </p></div><div class="refsect3" lang="en"><a name="id2481253"></a><h4>Recovery master</h4><p>
+        </p></div><div class="refsect3" lang="en"><a name="id2481254"></a><h4>Recovery master</h4><p>
           This is the cluster node that is currently designated as the recovery master. This node is responsible of monitoring the consistency of the cluster and to perform the actual recovery process when reqired.
         </p></div><p>
 	Example: ctdb status
@@ -94,7 +94,7 @@ Number of nodes:4
 12.1.1.2         1
 12.1.1.3         2
 12.1.1.4         3
-      </pre></div><div class="refsect2" lang="en"><a name="id2481335"></a><h3>getvar &lt;name&gt;</h3><p>
+      </pre></div><div class="refsect2" lang="en"><a name="id2481336"></a><h3>getvar &lt;name&gt;</h3><p>
         Get the runtime value of a tuneable variable.
       </p><p>
 	Example: ctdb getvar MaxRedirectCount
@@ -170,7 +170,7 @@ CTDB version 1
  max_hop_count                      0
  max_call_latency                   4.948321 sec
  max_lockwait_latency               0.000000 sec
-      </pre></div><div class="refsect2" lang="en"><a name="id2528503"></a><h3>statisticsreset</h3><p>
+      </pre></div><div class="refsect2" lang="en"><a name="id2528504"></a><h3>statisticsreset</h3><p>
         This command is used to clear all statistics counters in a node.
       </p><p>
 	Example: ctdb statisticsreset
@@ -178,14 +178,14 @@ CTDB version 1
         Get the current debug level for the node. the debug level controls what information is written to the log file.
       </p></div><div class="refsect2" lang="en"><a name="id2528529"></a><h3>setdebug &lt;debuglevel&gt;</h3><p>
         Set the debug level of a node. This is a number between 0 and 9 and controls what information will be written to the logfile.
-      </p></div><div class="refsect2" lang="en"><a name="id2528540"></a><h3>getpid</h3><p>
+      </p></div><div class="refsect2" lang="en"><a name="id2528541"></a><h3>getpid</h3><p>
         This command will return the process id of the ctdb daemon.
       </p></div><div class="refsect2" lang="en"><a name="id2528551"></a><h3>disable</h3><p>
         This command is used to administratively disable a node in the cluster.
         A disabled node will still participate in the cluster and host
         clustered TDB records but its public ip address has been taken over by
         a different node and it no longer hosts any services.
-      </p></div><div class="refsect2" lang="en"><a name="id2528564"></a><h3>enable</h3><p>
+      </p></div><div class="refsect2" lang="en"><a name="id2528565"></a><h3>enable</h3><p>
         Re-enable a node that has been administratively disabled.
       </p></div><div class="refsect2" lang="en"><a name="id2528575"></a><h3>ban &lt;bantime|0&gt;</h3><p>
         Administratively ban a node for bantime seconds. A bantime of 0 means that the node should be permanently banned. 
@@ -221,7 +221,7 @@ CTDB version 1
       </p></div></div><div class="refsect1" lang="en"><a name="id2528668"></a><h2>Debugging Commands</h2><p>
       These commands are primarily used for CTDB development and testing and
       should not be used for normal administration.
-    </p><div class="refsect2" lang="en"><a name="id2528678"></a><h3>process-exists &lt;pid&gt;</h3><p>
+    </p><div class="refsect2" lang="en"><a name="id2528679"></a><h3>process-exists &lt;pid&gt;</h3><p>
         This command checks if a specific process exists on the CTDB host. This is mainly used by Samba to check if remote instances of samba are still running or not.
       </p></div><div class="refsect2" lang="en"><a name="id2528691"></a><h3>getdbmap</h3><p>
         This command lists all clustered TDB databases that the CTDB daemon has attahced to.
diff --git a/ctdb/doc/ctdbd.1 b/ctdb/doc/ctdbd.1
index a50aa3fe64f..d853126e6e7 100644
--- a/ctdb/doc/ctdbd.1
+++ b/ctdb/doc/ctdbd.1
@@ -1,11 +1,11 @@
 .\"     Title: ctdbd
 .\"    Author: 
 .\" Generator: DocBook XSL Stylesheets v1.71.0 <http://docbook.sf.net/>
-.\"      Date: 07/10/2007
+.\"      Date: 08/23/2007
 .\"    Manual: 
 .\"    Source: 
 .\"
-.TH "CTDBD" "1" "07/10/2007" "" ""
+.TH "CTDBD" "1" "08/23/2007" "" ""
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -179,7 +179,7 @@ There are five possible for a node.
 .PP
 OK \- This node is fully functional.
 .PP
-DISCONNECTED \- This node could not be connected through the network and is currently not parcipitating in the cluster. If there is a public IP address associated with this node it should have been taken over by a different node. No services are running on this node.
+DISCONNECTED \- This node could not be connected through the network and is currently not participating in the cluster. If there is a public IP address associated with this node it should have been taken over by a different node. No services are running on this node.
 .PP
 DISABLED \- This node has been administratively disabled. This node is still functional and participates in the CTDB cluster but its IP addresses have been taken over by a different node and no services are currently being hosted.
 .PP
diff --git a/ctdb/doc/ctdbd.1.html b/ctdb/doc/ctdbd.1.html
index 90b8be1113f..526aff350e1 100644
--- a/ctdb/doc/ctdbd.1.html
+++ b/ctdb/doc/ctdbd.1.html
@@ -138,7 +138,7 @@
       OK - This node is fully functional.
     </p><p>
       DISCONNECTED - This node could not be connected through the network 
-      and is currently not parcipitating in the cluster. If there is a 
+      and is currently not participating in the cluster. If there is a 
       public IP address associated with this node it should have been taken 
       over by a different node. No services are running on this node.
     </p><p>

From 20120c23311b0724d503e9d855e4f671e988e1e7 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Thu, 23 Aug 2007 11:58:09 +1000
Subject: [PATCH 02/15] in ctdb_call_recv() we must check that state is
 non-NULL since ctdb_call() may pass a null pointer to _recv() and this would
 cause a segfault. fortunately there appear to be no critical users of
 this codepath right now, so the risk was more theoretical: IF clients start
 using this call it could segfault.

change ctdb_control() to become fully async so we can later make the
recovery daemon do the expensive controls to nodes in parallel instead
of in sequence

(This used to be ctdb commit 379789cda6ef049f389f10136aaa1b37a4d063a9)
---
 ctdb/client/ctdb_client.c | 145 ++++++++++++++++++++++++--------------
 1 file changed, 93 insertions(+), 52 deletions(-)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 19b19f489a4..a18c567ff29 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -301,6 +301,10 @@ struct ctdb_record_handle {
 */
 int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call)
 {
+	if (state == NULL) {
+		return -1;
+	}
+
 	while (state->state < CTDB_CALL_DONE) {
 		event_loop_once(state->ctdb_db->ctdb->ev);
 	}
@@ -661,7 +665,7 @@ int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
 }
 
 
-enum control_state {CTDB_CONTROL_WAIT, CTDB_CONTROL_DONE, CTDB_CONTROL_ERROR};
+enum control_state {CTDB_CONTROL_WAIT, CTDB_CONTROL_DONE, CTDB_CONTROL_ERROR, CTDB_CONTROL_TIMEOUT};
 
 struct ctdb_client_control_state {
 	struct ctdb_context *ctdb;
@@ -711,15 +715,6 @@ static void ctdb_client_reply_control(struct ctdb_context *ctdb,
 }
 
 
-/* time out handler for ctdb_control */
-static void timeout_func(struct event_context *ev, struct timed_event *te, 
-	struct timeval t, void *private_data)
-{
-	uint32_t *timed_out = (uint32_t *)private_data;
-
-	*timed_out = 1;
-}
-
 /*
   destroy a ctdb_control in client
 */
@@ -729,22 +724,29 @@ static int ctdb_control_destructor(struct ctdb_client_control_state *state)
 	return 0;
 }
 
-/*
-  send a ctdb control message
-  timeout specifies how long we should wait for a reply.
-  if timeout is NULL we wait indefinitely
- */
-int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid, 
-		 uint32_t opcode, uint32_t flags, TDB_DATA data, 
-		 TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status,
-		 struct timeval *timeout,
-		 char **errormsg)
+
+/* time out handler for ctdb_control */
+static void control_timeout_func(struct event_context *ev, struct timed_event *te, 
+	struct timeval t, void *private_data)
+{
+	struct ctdb_client_control_state *state = talloc_get_type(private_data, struct ctdb_client_control_state);
+
+	state->state = CTDB_CONTROL_TIMEOUT;
+}
+
+/* async version of send control request */
+struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb, 
+		uint32_t destnode, uint64_t srvid, 
+		uint32_t opcode, uint32_t flags, TDB_DATA data, 
+		TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status,
+		struct timeval *timeout,
+		char **errormsg)
+
 {
 	struct ctdb_client_control_state *state;
 	struct ctdb_req_control *c;
 	size_t len;
 	int ret;
-	uint32_t timed_out;
 
 	if (errormsg) {
 		*errormsg = NULL;
@@ -756,19 +758,19 @@ int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
 	}
 
 	state = talloc_zero(ctdb, struct ctdb_client_control_state);
-	CTDB_NO_MEMORY(ctdb, state);
+	CTDB_NO_MEMORY_NULL(ctdb, state);
 
-	state->ctdb  = ctdb;
-	state->reqid = ctdb_reqid_new(ctdb, state);
-	state->state = CTDB_CONTROL_WAIT;
-	state->errormsg = NULL;
+	state->ctdb    = ctdb;
+	state->reqid   = ctdb_reqid_new(ctdb, state);
+	state->state   = CTDB_CONTROL_WAIT;
+	state->errormsg= NULL;
 
 	talloc_set_destructor(state, ctdb_control_destructor);
 
 	len = offsetof(struct ctdb_req_control, data) + data.dsize;
 	c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CONTROL, 
 			       len, struct ctdb_req_control);
-	CTDB_NO_MEMORY(ctdb, c);
+	CTDB_NO_MEMORY_NULL(ctdb, c);
 	
 	c->hdr.reqid        = state->reqid;
 	c->hdr.destnode     = destnode;
@@ -785,55 +787,94 @@ int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
 	ret = ctdb_client_queue_pkt(ctdb, &(c->hdr));
 	if (ret != 0) {
 		talloc_free(state);
-		return -1;
+		return NULL;
 	}
 
 	if (flags & CTDB_CTRL_FLAG_NOREPLY) {
 		talloc_free(state);
-		return 0;
+		return NULL;
 	}
 
-	/* semi-async operation */
-	timed_out = 0;
+	/* timeout */
 	if (timeout && !timeval_is_zero(timeout)) {
-		event_add_timed(ctdb->ev, state, *timeout, timeout_func, &timed_out);
+		event_add_timed(ctdb->ev, state, *timeout, control_timeout_func, state);
 	}
-	while ((state->state == CTDB_CONTROL_WAIT)
-	&&	(timed_out == 0) ){
-		event_loop_once(ctdb->ev);
-	}
-	if (timed_out) {
-		talloc_free(state);
-		if (errormsg) {
-			(*errormsg) = talloc_strdup(mem_ctx, "control timed out");
-		} else {
-			DEBUG(0,("ctdb_control timed out\n"));
-		}
+
+	return state;
+}
+
+
+/* async version of receive control reply */
+int ctdb_control_recv(struct ctdb_context *ctdb, 
+		struct ctdb_client_control_state *state, 
+		TALLOC_CTX *mem_ctx,
+		TDB_DATA *outdata, int32_t *status, char **errormsg)
+{
+	if (state == NULL) {
 		return -1;
 	}
 
+	/* loop one event at a time until we either timeout or the control
+	   completes.
+	*/
+	while (state->state == CTDB_CONTROL_WAIT) {
+		event_loop_once(ctdb->ev);
+	}
+	if (state->state != CTDB_CONTROL_DONE) {
+		DEBUG(0,(__location__ " ctdb_control_recv failed\n"));
+		talloc_free(state);
+		return -1;
+	}
+
+	if (state->errormsg) {
+		DEBUG(0,("ctdb_control error: '%s'\n", state->errormsg));
+		if (errormsg) {
+			(*errormsg) = talloc_move(mem_ctx, &state->errormsg);
+		}
+		talloc_free(state);
+		return -1;
+	}
+
+
 	if (outdata) {
 		*outdata = state->outdata;
 		outdata->dptr = talloc_memdup(mem_ctx, outdata->dptr, outdata->dsize);
 	}
 
-	*status = state->status;
-
-	if (!errormsg && state->errormsg) {
-		DEBUG(0,("ctdb_control error: '%s'\n", state->errormsg));
+	if (status) {
+		*status = state->status;
 	}
 
-	if (errormsg && state->errormsg) {
-		(*errormsg) = talloc_move(mem_ctx, &state->errormsg);
-	}
 
 	talloc_free(state);
-
-	return 0;	
+	return 0;
 }
 
 
 
+/*
+  send a ctdb control message
+  timeout specifies how long we should wait for a reply.
+  if timeout is NULL we wait indefinitely
+ */
+int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid, 
+		 uint32_t opcode, uint32_t flags, TDB_DATA data, 
+		 TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status,
+		 struct timeval *timeout,
+		 char **errormsg)
+{
+	struct ctdb_client_control_state *state;
+
+	state = ctdb_control_send(ctdb, destnode, srvid, opcode, 
+			flags, data, mem_ctx, outdata, status,
+			timeout, errormsg);
+	return ctdb_control_recv(ctdb, state, mem_ctx, outdata, status, 
+			errormsg);
+}
+
+
+
+
 /*
   a process exists call. Returns 0 if process exists, -1 otherwise
  */

From 8fd3df2553dbe2454e3aaf4a954a6800c596ba34 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Thu, 23 Aug 2007 13:00:10 +1000
Subject: [PATCH 03/15] hang the ctdb_req_control structure off the
 ctdb_client_control_state struct  so that if we timeout a control we can
 print debug info such as what opcode failed and to which node

we don't need the *status parameter to ctdb_control_send()

create async versions of the getrecmode control

pass a memory context to getrecmode

(This used to be ctdb commit 558b680c82f830fba82c283c78c2de8a0b150b75)
---
 ctdb/client/ctdb_client.c   | 49 +++++++++++++++++++++++++++----------
 ctdb/include/ctdb.h         |  8 +++++-
 ctdb/server/ctdb_recoverd.c |  4 +--
 ctdb/tests/ctdb_bench.c     |  2 +-
 ctdb/tests/ctdb_fetch.c     |  2 +-
 ctdb/tests/ctdb_store.c     |  2 +-
 ctdb/tools/ctdb.c           |  2 +-
 7 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index a18c567ff29..22fb0e319ce 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -674,6 +674,7 @@ struct ctdb_client_control_state {
 	TDB_DATA outdata;
 	enum control_state state;
 	char *errormsg;
+	struct ctdb_req_control *c;
 };
 
 /*
@@ -731,6 +732,8 @@ static void control_timeout_func(struct event_context *ev, struct timed_event *t
 {
 	struct ctdb_client_control_state *state = talloc_get_type(private_data, struct ctdb_client_control_state);
 
+	DEBUG(0,("control timed out. reqid:%d opcode:%d dstnode:%d\n", state->reqid, state->c->opcode, state->c->hdr.destnode));
+
 	state->state = CTDB_CONTROL_TIMEOUT;
 }
 
@@ -738,14 +741,14 @@ static void control_timeout_func(struct event_context *ev, struct timed_event *t
 struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb, 
 		uint32_t destnode, uint64_t srvid, 
 		uint32_t opcode, uint32_t flags, TDB_DATA data, 
-		TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status,
+		TALLOC_CTX *mem_ctx, TDB_DATA *outdata,
 		struct timeval *timeout,
 		char **errormsg)
 
 {
 	struct ctdb_client_control_state *state;
-	struct ctdb_req_control *c;
 	size_t len;
+	struct ctdb_req_control *c;
 	int ret;
 
 	if (errormsg) {
@@ -770,8 +773,8 @@ struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb,
 	len = offsetof(struct ctdb_req_control, data) + data.dsize;
 	c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CONTROL, 
 			       len, struct ctdb_req_control);
+	state->c            = c;	
 	CTDB_NO_MEMORY_NULL(ctdb, c);
-	
 	c->hdr.reqid        = state->reqid;
 	c->hdr.destnode     = destnode;
 	c->hdr.reqid        = state->reqid;
@@ -866,7 +869,7 @@ int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
 	struct ctdb_client_control_state *state;
 
 	state = ctdb_control_send(ctdb, destnode, srvid, opcode, 
-			flags, data, mem_ctx, outdata, status,
+			flags, data, mem_ctx, outdata,
 			timeout, errormsg);
 	return ctdb_control_recv(ctdb, state, mem_ctx, outdata, status, 
 			errormsg);
@@ -984,27 +987,47 @@ int ctdb_ctrl_getvnnmap(struct ctdb_context *ctdb, struct timeval timeout, uint3
 	return 0;
 }
 
-/*
-  get the recovery mode of a remote node
- */
-int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *recmode)
+
+struct ctdb_client_control_state *
+ctdb_ctrl_getrecmode_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+	return ctdb_control_send(ctdb, destnode, 0, 
+			   CTDB_CONTROL_GET_RECMODE, 0, tdb_null, 
+			   mem_ctx, NULL, &timeout, NULL);
+}
+
+int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmode)
 {
 	int ret;
 	int32_t res;
 
-	ret = ctdb_control(ctdb, destnode, 0, 
-			   CTDB_CONTROL_GET_RECMODE, 0, tdb_null, 
-			   NULL, NULL, &res, &timeout, NULL);
+	ret = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &res, NULL);
 	if (ret != 0) {
-		DEBUG(0,(__location__ " ctdb_control for getrecmode failed\n"));
+		DEBUG(0,(__location__ " ctdb_ctrl_getrecmode_recv failed\n"));
 		return -1;
 	}
 
-	*recmode = res;
+	if (recmode) {
+		*recmode = (uint32_t)res;
+	}
 
 	return 0;
 }
 
+/*
+  get the recovery mode of a remote node
+ */
+int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmode)
+{
+	struct ctdb_client_control_state *state;
+
+	state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, timeout, destnode);
+	return ctdb_ctrl_getrecmode_recv(ctdb, mem_ctx, state, recmode);
+}
+
+
+
+
 /*
   set the recovery mode of a remote node
  */
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
index 04339b06663..2acde41dd52 100644
--- a/ctdb/include/ctdb.h
+++ b/ctdb/include/ctdb.h
@@ -302,7 +302,13 @@ int ctdb_ctrl_write_record(struct ctdb_context *ctdb, uint32_t destnode, TALLOC_
 /*
   get the recovery mode of a remote node
  */
-int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *recmode);
+int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmode);
+
+struct ctdb_client_control_state *ctdb_ctrl_getrecmode_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmode);
+
+
 /*
   set the recovery mode of a remote node
  */
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index d710cd2a289..1b1b056cd84 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1108,7 +1108,7 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
 				     CTDB_CURRENT_NODE, &ctdb->recovery_master);
 
 	if (ret == 0) {
-		ret = ctdb_ctrl_getrecmode(ctdb, CONTROL_TIMEOUT(), 
+		ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(), 
 					   CTDB_CURRENT_NODE, &ctdb->recovery_mode);
 	}
 	
@@ -1307,7 +1307,7 @@ again:
 			continue;
 		}
 
-		ret = ctdb_ctrl_getrecmode(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, &recmode);
+		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, &recmode);
 		if (ret != 0) {
 			DEBUG(0, ("Unable to get recmode from node %u\n", vnn));
 			goto again;
diff --git a/ctdb/tests/ctdb_bench.c b/ctdb/tests/ctdb_bench.c
index f1292ee275d..e020c1b62f2 100644
--- a/ctdb/tests/ctdb_bench.c
+++ b/ctdb/tests/ctdb_bench.c
@@ -217,7 +217,7 @@ int main(int argc, const char *argv[])
 	printf("Waiting for cluster\n");
 	while (1) {
 		uint32_t recmode=1;
-		ctdb_ctrl_getrecmode(ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+		ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
 		if (recmode == 0) break;
 		event_loop_once(ev);
 	}
diff --git a/ctdb/tests/ctdb_fetch.c b/ctdb/tests/ctdb_fetch.c
index f57d05d0993..0a6351f1fdf 100644
--- a/ctdb/tests/ctdb_fetch.c
+++ b/ctdb/tests/ctdb_fetch.c
@@ -232,7 +232,7 @@ int main(int argc, const char *argv[])
 	printf("Waiting for cluster\n");
 	while (1) {
 		uint32_t recmode=1;
-		ctdb_ctrl_getrecmode(ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+		ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
 		if (recmode == 0) break;
 		event_loop_once(ev);
 	}
diff --git a/ctdb/tests/ctdb_store.c b/ctdb/tests/ctdb_store.c
index a60e009b910..ea7721a34c2 100644
--- a/ctdb/tests/ctdb_store.c
+++ b/ctdb/tests/ctdb_store.c
@@ -145,7 +145,7 @@ int main(int argc, const char *argv[])
 	printf("Waiting for cluster\n");
 	while (1) {
 		uint32_t recmode=1;
-		ctdb_ctrl_getrecmode(ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+		ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
 		if (recmode == 0) break;
 		event_loop_once(ev);
 	}
diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c
index 849b638a953..9cfe966db92 100644
--- a/ctdb/tools/ctdb.c
+++ b/ctdb/tools/ctdb.c
@@ -292,7 +292,7 @@ static int control_status(struct ctdb_context *ctdb, int argc, const char **argv
 		printf("hash:%d lmaster:%d\n", i, vnnmap->map[i]);
 	}
 
-	ret = ctdb_ctrl_getrecmode(ctdb, TIMELIMIT(), options.vnn, &recmode);
+	ret = ctdb_ctrl_getrecmode(ctdb, ctdb, TIMELIMIT(), options.vnn, &recmode);
 	if (ret != 0) {
 		DEBUG(0, ("Unable to get recmode from node %u\n", options.vnn));
 		return ret;

From 4c13bf0c5f56e3309a6fba3a59912ab48a703944 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Thu, 23 Aug 2007 13:48:39 +1000
Subject: [PATCH 04/15] break checking that the recoverymode on all nodes are
 ok  out into its own function

(This used to be ctdb commit 813cf9a252af96da24122b80f24aabeed2911939)
---
 ctdb/server/ctdb_recoverd.c | 80 +++++++++++++++++++++++++++++--------
 1 file changed, 63 insertions(+), 17 deletions(-)

diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 1b1b056cd84..df46f2fa3f1 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1138,13 +1138,65 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
 }
 
 
+enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_FAILED};
+
+
+/* verify that all nodes are in recovery mode normal */
+static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, TALLOC_CTX *mem_ctx)
+{
+	struct ctdb_client_control_state **ctrl_states;
+	uint32_t recmode;
+	int j, ret;
+	
+	ctrl_states = talloc_array(mem_ctx, struct ctdb_client_control_state *,
+				 nodemap->num);
+	if (!ctrl_states) {
+		DEBUG(0,(__location__ " Failed to allocate temporary ctrl state array\n"));
+		exit(-1);
+	}
+
+
+	/* loop over all active nodes and send an async getrecmode call to 
+	   them*/
+	for (j=0; j<nodemap->num; j++) {
+		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+			ctrl_states[j] = NULL;
+			continue;
+		}
+		ctrl_states[j] = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, 
+					CONTROL_TIMEOUT(), 
+					nodemap->nodes[j].vnn);
+	}
+
+	/* wait for the responses to come back and check that all is ok */
+	for (j=0; j<nodemap->num; j++) {
+		if (ctrl_states[j] == NULL) {
+			continue;
+		}
+		ret = ctdb_ctrl_getrecmode_recv(ctdb, mem_ctx, ctrl_states[j], &recmode);
+		if (ret != 0) {
+			DEBUG(0, ("Unable to get recmode from node %u\n", nodemap->nodes[j].vnn));
+			talloc_free(ctrl_states);
+			return MONITOR_FAILED;
+		}
+		if (recmode != CTDB_RECOVERY_NORMAL) {
+			DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", nodemap->nodes[j].vnn));
+			talloc_free(ctrl_states);
+			return MONITOR_RECOVERY_NEEDED;
+		}
+	}
+
+	talloc_free(ctrl_states);
+	return MONITOR_OK;
+}
+
 
 /*
   the main monitoring loop
  */
 static void monitor_cluster(struct ctdb_context *ctdb)
 {
-	uint32_t vnn, num_active, recmode, recmaster;
+	uint32_t vnn, num_active, recmaster;
 	TALLOC_CTX *mem_ctx=NULL;
 	struct ctdb_node_map *nodemap=NULL;
 	struct ctdb_node_map *remote_nodemap=NULL;
@@ -1302,25 +1354,19 @@ again:
 	/* verify that all active nodes are in normal mode 
 	   and not in recovery mode 
 	 */
-	for (j=0; j<nodemap->num; j++) {
-		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
-			continue;
-		}
-
-		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, &recmode);
-		if (ret != 0) {
-			DEBUG(0, ("Unable to get recmode from node %u\n", vnn));
-			goto again;
-		}
-		if (recmode != CTDB_RECOVERY_NORMAL) {
-			DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", 
-				  nodemap->nodes[j].vnn));
-			do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn);
-			goto again;
-		}
+	/* send a getrecmode call out to every node */
+	switch (verify_recmode(ctdb, nodemap, mem_ctx)) {
+	case MONITOR_RECOVERY_NEEDED:
+		do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn);
+		goto again;
+	case MONITOR_FAILED:
+		goto again;
+	case MONITOR_OK:
+		break;
 	}
 
 
+
 	/* get the nodemap for all active remote nodes and verify
 	   they are the same as for this node
 	 */

From f854b5f87678efa1bbebd8a2c423fc35bb4649a0 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Thu, 23 Aug 2007 19:27:09 +1000
Subject: [PATCH 05/15] try out a slightly different api for controls where you
 provide a callback function which is called upon completion (or timeout) of
 the control.

modify scanning of recmaster in the monitoring_cluster code to try the
api out

(This used to be ctdb commit c37843f1d97b169afec910e7ddb4e5ac12c3015c)
---
 ctdb/client/ctdb_client.c   | 140 +++++++++++++++++++++++------
 ctdb/include/ctdb.h         |  22 ++++-
 ctdb/include/ctdb_private.h |  13 +++
 ctdb/server/ctdb_recoverd.c | 171 +++++++++++++++++++++++++++++-------
 ctdb/tools/ctdb.c           |   6 +-
 5 files changed, 292 insertions(+), 60 deletions(-)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 22fb0e319ce..cc1aa3fb8f5 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -665,8 +665,6 @@ int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
 }
 
 
-enum control_state {CTDB_CONTROL_WAIT, CTDB_CONTROL_DONE, CTDB_CONTROL_ERROR, CTDB_CONTROL_TIMEOUT};
-
 struct ctdb_client_control_state {
 	struct ctdb_context *ctdb;
 	uint32_t reqid;
@@ -675,8 +673,60 @@ struct ctdb_client_control_state {
 	enum control_state state;
 	char *errormsg;
 	struct ctdb_req_control *c;
+
+	/* if we have a callback registered for the completion (or failure) of
+	   this control
+	   if a callback is used, it MUST talloc_free the cb_data passed to it
+	*/
+	control_callback callback;
+	void *cb_private;
 };
 
+/*
+   called when a control completes or timesout to invoke the callback
+   function the user provided
+*/
+static void invoke_control_callback(struct event_context *ev, struct timed_event *te, 
+	struct timeval t, void *private_data)
+{
+	struct ctdb_client_control_state *state;
+	struct ctdb_control_cb_data *cb_data;
+	struct ctdb_context *ctdb;
+	control_callback callback;
+	void *cb_private;
+	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+	int ret;
+
+	state = talloc_get_type(private_data, struct ctdb_client_control_state);
+	talloc_steal(tmp_ctx, state);
+
+	ctdb       = state->ctdb;
+	callback   = state->callback;
+	cb_private = state->cb_private;
+
+	cb_data = talloc_zero(tmp_ctx, struct ctdb_control_cb_data);
+	if (cb_data == NULL) {
+		talloc_free(tmp_ctx);
+		CTDB_NO_MEMORY_VOID(ctdb, cb_data);
+	}
+
+	cb_data->state = state->state;
+	cb_data->vnn   = state->c->hdr.destnode;
+
+	ret = ctdb_control_recv(ctdb, state, cb_data,
+			&cb_data->outdata, 
+			&cb_data->status, 
+			&cb_data->errormsg);
+	/* we dont check ret since we expect that ctdb_control_recv can fail
+	   for example if the control timedout
+
+	   state is always talloc_free()'d inside ctdb_control_recv
+	*/
+
+	callback(cb_data, cb_private);
+	talloc_free(tmp_ctx);
+}
+
 /*
   called when a CTDB_REPLY_CONTROL packet comes in in the client
 
@@ -713,6 +763,13 @@ static void ctdb_client_reply_control(struct ctdb_context *ctdb,
 	talloc_steal(state, c);
 
 	state->state = CTDB_CONTROL_DONE;
+
+	/* if we had a callback registered for this control, pull the response
+	   and call the callback.
+	*/
+	if (state->callback) {
+		event_add_timed(ctdb->ev, state, timeval_zero(), invoke_control_callback, state);
+	}
 }
 
 
@@ -735,6 +792,13 @@ static void control_timeout_func(struct event_context *ev, struct timed_event *t
 	DEBUG(0,("control timed out. reqid:%d opcode:%d dstnode:%d\n", state->reqid, state->c->opcode, state->c->hdr.destnode));
 
 	state->state = CTDB_CONTROL_TIMEOUT;
+
+	/* if we had a callback registered for this control, pull the response
+	   and call the callback.
+	*/
+	if (state->callback) {
+		event_add_timed(state->ctdb->ev, state, timeval_zero(), invoke_control_callback, state);
+	}
 }
 
 /* async version of send control request */
@@ -743,8 +807,8 @@ struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb,
 		uint32_t opcode, uint32_t flags, TDB_DATA data, 
 		TALLOC_CTX *mem_ctx, TDB_DATA *outdata,
 		struct timeval *timeout,
-		char **errormsg)
-
+		char **errormsg,
+		control_callback callback, void *cb_private)
 {
 	struct ctdb_client_control_state *state;
 	size_t len;
@@ -760,13 +824,15 @@ struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb,
 		ctdb_socket_connect(ctdb);
 	}
 
-	state = talloc_zero(ctdb, struct ctdb_client_control_state);
+	state = talloc_zero(mem_ctx, struct ctdb_client_control_state);
 	CTDB_NO_MEMORY_NULL(ctdb, state);
 
-	state->ctdb    = ctdb;
-	state->reqid   = ctdb_reqid_new(ctdb, state);
-	state->state   = CTDB_CONTROL_WAIT;
-	state->errormsg= NULL;
+	state->ctdb       = ctdb;
+	state->reqid      = ctdb_reqid_new(ctdb, state);
+	state->state      = CTDB_CONTROL_WAIT;
+	state->errormsg   = NULL;
+	state->callback   = callback;
+	state->cb_private = cb_private;
 
 	talloc_set_destructor(state, ctdb_control_destructor);
 
@@ -787,6 +853,11 @@ struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb,
 		memcpy(&c->data[0], data.dptr, data.dsize);
 	}
 
+	/* timeout */
+	if (timeout && !timeval_is_zero(timeout)) {
+		event_add_timed(ctdb->ev, state, *timeout, control_timeout_func, state);
+	}
+
 	ret = ctdb_client_queue_pkt(ctdb, &(c->hdr));
 	if (ret != 0) {
 		talloc_free(state);
@@ -798,11 +869,6 @@ struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb,
 		return NULL;
 	}
 
-	/* timeout */
-	if (timeout && !timeval_is_zero(timeout)) {
-		event_add_timed(ctdb->ev, state, *timeout, control_timeout_func, state);
-	}
-
 	return state;
 }
 
@@ -870,7 +936,8 @@ int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
 
 	state = ctdb_control_send(ctdb, destnode, srvid, opcode, 
 			flags, data, mem_ctx, outdata,
-			timeout, errormsg);
+			timeout, errormsg,
+			NULL, NULL);
 	return ctdb_control_recv(ctdb, state, mem_ctx, outdata, status, 
 			errormsg);
 }
@@ -988,12 +1055,16 @@ int ctdb_ctrl_getvnnmap(struct ctdb_context *ctdb, struct timeval timeout, uint3
 }
 
 
+/*
+  get the recovery mode of a remote node
+ */
 struct ctdb_client_control_state *
 ctdb_ctrl_getrecmode_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
 {
 	return ctdb_control_send(ctdb, destnode, 0, 
 			   CTDB_CONTROL_GET_RECMODE, 0, tdb_null, 
-			   mem_ctx, NULL, &timeout, NULL);
+			   mem_ctx, NULL, &timeout, NULL,
+			   NULL, NULL);
 }
 
 int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmode)
@@ -1014,9 +1085,6 @@ int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, st
 	return 0;
 }
 
-/*
-  get the recovery mode of a remote node
- */
 int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmode)
 {
 	struct ctdb_client_control_state *state;
@@ -1051,27 +1119,49 @@ int ctdb_ctrl_setrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint
 	return 0;
 }
 
+
+
 /*
   get the recovery master of a remote node
  */
-int ctdb_ctrl_getrecmaster(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *recmaster)
+struct ctdb_client_control_state *
+ctdb_ctrl_getrecmaster_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, 
+			struct timeval timeout, uint32_t destnode,
+			control_callback callback, void *cb_private)
+{
+	return ctdb_control_send(ctdb, destnode, 0, 
+			   CTDB_CONTROL_GET_RECMASTER, 0, tdb_null, 
+			   mem_ctx, NULL, &timeout, NULL,
+			   callback, cb_private);
+}
+
+int ctdb_ctrl_getrecmaster_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmaster)
 {
 	int ret;
 	int32_t res;
 
-	ret = ctdb_control(ctdb, destnode, 0, 
-			   CTDB_CONTROL_GET_RECMASTER, 0, tdb_null, 
-			   NULL, NULL, &res, &timeout, NULL);
+	ret = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &res, NULL);
 	if (ret != 0) {
-		DEBUG(0,(__location__ " ctdb_control for getrecmaster failed\n"));
+		DEBUG(0,(__location__ " ctdb_ctrl_getrecmaster_recv failed\n"));
 		return -1;
 	}
 
-	*recmaster = res;
+	if (recmaster) {
+		*recmaster = (uint32_t)res;
+	}
 
 	return 0;
 }
 
+int ctdb_ctrl_getrecmaster(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmaster)
+{
+	struct ctdb_client_control_state *state;
+
+	state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx, timeout, destnode, NULL, NULL);
+	return ctdb_ctrl_getrecmaster_recv(ctdb, mem_ctx, state, recmaster);
+}
+
+
 /*
   set the recovery master of a remote node
  */
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
index 2acde41dd52..17cf2c598f4 100644
--- a/ctdb/include/ctdb.h
+++ b/ctdb/include/ctdb.h
@@ -96,6 +96,19 @@ struct ctdb_call_info {
 #define CTDB_BROADCAST_CONNECTED 0xF0000004
 
 
+enum control_state {CTDB_CONTROL_WAIT, CTDB_CONTROL_DONE, CTDB_CONTROL_ERROR, CTDB_CONTROL_TIMEOUT};
+
+struct ctdb_control_cb_data {
+	enum control_state state;
+	uint32_t vnn;
+	int32_t status;
+	TDB_DATA outdata;
+	char *errormsg;
+};
+
+typedef int (*control_callback)(struct ctdb_control_cb_data *cb_data, void *cb_private);
+
+
 struct event_context;
 
 /*
@@ -325,7 +338,14 @@ int ctdb_ctrl_setmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint
 /*
   get the recovery master of a remote node
  */
-int ctdb_ctrl_getrecmaster(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *recmaster);
+int ctdb_ctrl_getrecmaster(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmaster);
+
+struct ctdb_client_control_state *ctdb_ctrl_getrecmaster_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, control_callback callback, void *cb_private);
+
+int ctdb_ctrl_getrecmaster_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmaster);
+
+
+
 /*
   set the recovery master of a remote node
  */
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index 1338ce79a57..1c63daf469b 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -850,6 +850,19 @@ int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
 		 uint32_t opcode, uint32_t flags, TDB_DATA data, 
 		 TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status,
 		 struct timeval *timeout, char **errormsg);
+int ctdb_control_recv(struct ctdb_context *ctdb, 
+		struct ctdb_client_control_state *state, 
+		TALLOC_CTX *mem_ctx,
+		TDB_DATA *outdata, int32_t *status, char **errormsg);
+
+struct ctdb_client_control_state *
+ctdb_control_send(struct ctdb_context *ctdb, 
+		uint32_t destnode, uint64_t srvid, 
+		uint32_t opcode, uint32_t flags, TDB_DATA data, 
+		TALLOC_CTX *mem_ctx, TDB_DATA *outdata,
+		struct timeval *timeout,
+		char **errormsg,
+		control_callback callback, void *cb_private);
 
 
 
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index df46f2fa3f1..c931efcf2ce 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -531,28 +531,33 @@ static void ban_handler(struct ctdb_context *ctdb, uint64_t srvid,
 {
 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
 	struct ctdb_ban_info *b = (struct ctdb_ban_info *)data.dptr;
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
 	uint32_t recmaster;
 	int ret;
 
 	if (data.dsize != sizeof(*b)) {
 		DEBUG(0,("Bad data in ban_handler\n"));
+		talloc_free(mem_ctx);
 		return;
 	}
 
-	ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
+	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
 	if (ret != 0) {
 		DEBUG(0,(__location__ " Failed to find the recmaster\n"));
+		talloc_free(mem_ctx);
 		return;
 	}
 
 	if (recmaster != ctdb->vnn) {
 		DEBUG(0,("We are not the recmaster - ignoring ban request\n"));
+		talloc_free(mem_ctx);
 		return;
 	}
 
 	DEBUG(0,("Node %u has been banned for %u seconds by the administrator\n", 
 		 b->vnn, b->ban_time));
 	ctdb_ban_node(rec, b->vnn, b->ban_time);
+	talloc_free(mem_ctx);
 }
 
 /*
@@ -562,29 +567,34 @@ static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
 			  TDB_DATA data, void *private_data)
 {
 	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
 	uint32_t vnn;
 	int ret;
 	uint32_t recmaster;
 
 	if (data.dsize != sizeof(uint32_t)) {
 		DEBUG(0,("Bad data in unban_handler\n"));
+		talloc_free(mem_ctx);
 		return;
 	}
 	vnn = *(uint32_t *)data.dptr;
 
-	ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
+	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
 	if (ret != 0) {
 		DEBUG(0,(__location__ " Failed to find the recmaster\n"));
+		talloc_free(mem_ctx);
 		return;
 	}
 
 	if (recmaster != ctdb->vnn) {
 		DEBUG(0,("We are not the recmaster - ignoring unban request\n"));
+		talloc_free(mem_ctx);
 		return;
 	}
 
 	DEBUG(0,("Node %u has been unbanned by the administrator\n", vnn));
 	ctdb_unban_node(rec, vnn);
+	talloc_free(mem_ctx);
 }
 
 
@@ -1104,7 +1114,7 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
 
 	nodemap->nodes[i].flags = c->new_flags;
 
-	ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), 
+	ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), 
 				     CTDB_CURRENT_NODE, &ctdb->recovery_master);
 
 	if (ret == 0) {
@@ -1138,20 +1148,22 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
 }
 
 
-enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_FAILED};
+enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 
 
 /* verify that all nodes are in recovery mode normal */
-static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, TALLOC_CTX *mem_ctx)
+static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
 {
 	struct ctdb_client_control_state **ctrl_states;
 	uint32_t recmode;
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
 	int j, ret;
 	
 	ctrl_states = talloc_array(mem_ctx, struct ctdb_client_control_state *,
 				 nodemap->num);
 	if (!ctrl_states) {
 		DEBUG(0,(__location__ " Failed to allocate temporary ctrl state array\n"));
+		talloc_free(mem_ctx);
 		exit(-1);
 	}
 
@@ -1176,20 +1188,125 @@ static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb
 		ret = ctdb_ctrl_getrecmode_recv(ctdb, mem_ctx, ctrl_states[j], &recmode);
 		if (ret != 0) {
 			DEBUG(0, ("Unable to get recmode from node %u\n", nodemap->nodes[j].vnn));
-			talloc_free(ctrl_states);
+			talloc_free(mem_ctx);
 			return MONITOR_FAILED;
 		}
+
 		if (recmode != CTDB_RECOVERY_NORMAL) {
 			DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", nodemap->nodes[j].vnn));
-			talloc_free(ctrl_states);
+			talloc_free(mem_ctx);
 			return MONITOR_RECOVERY_NEEDED;
 		}
 	}
 
-	talloc_free(ctrl_states);
+	talloc_free(mem_ctx);
 	return MONITOR_OK;
 }
 
+struct verify_recmaster_data {
+	uint32_t count;
+	uint32_t vnn;
+	enum monitor_result status;
+};
+
+static int verify_recmaster_callback(struct ctdb_control_cb_data *cb_data, void *cb_private)
+{
+	struct verify_recmaster_data *rmdata = talloc_get_type(cb_private, struct verify_recmaster_data);
+
+
+	/* one more node has responded with recmaster data*/
+	rmdata->count--;
+
+	/* if we failed to get the recmaster, then return an error and let
+	   the main loop try again.
+	*/
+	if (cb_data->state != CTDB_CONTROL_DONE) {
+		if (rmdata->status == MONITOR_OK) {
+			rmdata->status = MONITOR_FAILED;
+		}
+		return 0;
+	}
+
+	/* if we got a response, then the recmaster will be stored in the
+	   status field
+	*/
+	if (cb_data->status != rmdata->vnn) {
+		DEBUG(0,("Node %d does not agree we are the recmaster. Need a new recmaster election\n",cb_data->vnn));
+		rmdata->status = MONITOR_ELECTION_NEEDED;
+	}
+
+	return 0;
+}
+
+static void verify_recmaster_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
+{
+	uint32_t *timedout = (uint32_t *)p;
+
+	*timedout = 1;
+}
+
+/* verify that all nodes agree that we are the recmaster */
+static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn)
+{
+	struct verify_recmaster_data *rmdata;
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+	struct ctdb_client_control_state *state;
+	enum monitor_result status;
+	uint32_t timedout;	
+	int j;
+	
+	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
+	rmdata->count  = 0;
+	rmdata->vnn    = vnn;
+	rmdata->status = MONITOR_OK;
+
+	/* loop over all active nodes and send an async getrecmaster call to 
+	   them*/
+	for (j=0; j<nodemap->num; j++) {
+		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+			continue;
+		}
+		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx, 
+					timeval_zero(),
+					nodemap->nodes[j].vnn,
+					verify_recmaster_callback, rmdata);
+		if (state == NULL) {
+			/* we failed to send the control, treat this as 
+			   an error and try again next iteration
+			*/			
+			DEBUG(0,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
+			talloc_free(mem_ctx);
+			return MONITOR_FAILED;
+		}
+
+		/* one more control to wait for to complete */
+		rmdata->count++;
+	}
+
+
+	/* now wait for up to the maximum number of seconds allowed
+	   or until all nodes we expect a response from has replied
+	*/
+	timedout = 0;
+	event_add_timed(ctdb->ev, rmdata, CONTROL_TIMEOUT(),
+			verify_recmaster_timeout, &timedout);
+
+	while ( (rmdata->count > 0)
+	&&	(timedout == 0) ) {
+		event_loop_once(ctdb->ev);
+	}
+	if (timedout) {
+		DEBUG(0,("Timedout while waiting for getrecmaster replies.\n"));
+		rmdata->status = MONITOR_FAILED;
+	}
+
+
+	status = rmdata->status;
+	talloc_free(mem_ctx);
+	return status;
+}
+
 
 /*
   the main monitoring loop
@@ -1287,7 +1404,7 @@ again:
 
 
 	/* check which node is the recovery master */
-	ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), vnn, &recmaster);
+	ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), vnn, &recmaster);
 	if (ret != 0) {
 		DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", vnn));
 		goto again;
@@ -1328,39 +1445,31 @@ again:
 
 
 	/* verify that all active nodes agree that we are the recmaster */
-	for (j=0; j<nodemap->num; j++) {
-		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
-			continue;
-		}
-		if (nodemap->nodes[j].vnn == vnn) {
-			continue;
-		}
-
-		ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, &recmaster);
-		if (ret != 0) {
-			DEBUG(0, (__location__ " Unable to get recmaster from node %u\n", vnn));
-			goto again;
-		}
-
-		if (recmaster!=vnn) {
-			DEBUG(0, ("Node %u does not agree we are the recmaster. Force reelection\n", 
-				  nodemap->nodes[j].vnn));
-			force_election(rec, mem_ctx, vnn, nodemap);
-			goto again;
-		}
+	switch (verify_recmaster(ctdb, nodemap, vnn)) {
+	case MONITOR_RECOVERY_NEEDED:
+		/* can not happen */
+		goto again;
+	case MONITOR_ELECTION_NEEDED:
+		force_election(rec, mem_ctx, vnn, nodemap);
+		goto again;
+	case MONITOR_OK:
+		break;
+	case MONITOR_FAILED:
+		goto again;
 	}
 
 
 	/* verify that all active nodes are in normal mode 
 	   and not in recovery mode 
 	 */
-	/* send a getrecmode call out to every node */
-	switch (verify_recmode(ctdb, nodemap, mem_ctx)) {
+	switch (verify_recmode(ctdb, nodemap)) {
 	case MONITOR_RECOVERY_NEEDED:
 		do_recovery(rec, mem_ctx, vnn, num_active, nodemap, vnnmap, nodemap->nodes[j].vnn);
 		goto again;
 	case MONITOR_FAILED:
 		goto again;
+	case MONITOR_ELECTION_NEEDED:
+		/* can not happen */
 	case MONITOR_OK:
 		break;
 	}
diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c
index 9cfe966db92..7a6ff94233a 100644
--- a/ctdb/tools/ctdb.c
+++ b/ctdb/tools/ctdb.c
@@ -299,7 +299,7 @@ static int control_status(struct ctdb_context *ctdb, int argc, const char **argv
 	}
 	printf("Recovery mode:%s (%d)\n",recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"RECOVERY",recmode);
 
-	ret = ctdb_ctrl_getrecmaster(ctdb, TIMELIMIT(), options.vnn, &recmaster);
+	ret = ctdb_ctrl_getrecmaster(ctdb, ctdb, TIMELIMIT(), options.vnn, &recmaster);
 	if (ret != 0) {
 		DEBUG(0, ("Unable to get recmaster from node %u\n", options.vnn));
 		return ret;
@@ -544,7 +544,7 @@ static int control_ban(struct ctdb_context *ctdb, int argc, const char **argv)
 
 	ban_time = strtoul(argv[0], NULL, 0);
 
-	ret = ctdb_ctrl_getrecmaster(ctdb, TIMELIMIT(), options.vnn, &recmaster);
+	ret = ctdb_ctrl_getrecmaster(ctdb, ctdb, TIMELIMIT(), options.vnn, &recmaster);
 	if (ret != 0) {
 		DEBUG(0,("Failed to find the recmaster\n"));
 		return -1;
@@ -575,7 +575,7 @@ static int control_unban(struct ctdb_context *ctdb, int argc, const char **argv)
 	uint32_t recmaster;
 	TDB_DATA data;
 
-	ret = ctdb_ctrl_getrecmaster(ctdb, TIMELIMIT(), options.vnn, &recmaster);
+	ret = ctdb_ctrl_getrecmaster(ctdb, ctdb, TIMELIMIT(), options.vnn, &recmaster);
 	if (ret != 0) {
 		DEBUG(0,("Failed to find the recmaster\n"));
 		return -1;

From 62a03ef9d513beabe589118ce16a2cf6acfec541 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Thu, 23 Aug 2007 19:38:54 +1000
Subject: [PATCH 06/15] get rid of the explicit global timeout used in the
 previous example and try this time by relying on the timeouts for the
 individual controls

(This used to be ctdb commit 448a0eb4fd896dc545aa0b4bb2ba4628491578be)
---
 ctdb/server/ctdb_recoverd.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index c931efcf2ce..208db3963d5 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1238,12 +1238,6 @@ static int verify_recmaster_callback(struct ctdb_control_cb_data *cb_data, void
 	return 0;
 }
 
-static void verify_recmaster_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
-{
-	uint32_t *timedout = (uint32_t *)p;
-
-	*timedout = 1;
-}
 
 /* verify that all nodes agree that we are the recmaster */
 static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t vnn)
@@ -1252,7 +1246,6 @@ static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ct
 	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
 	struct ctdb_client_control_state *state;
 	enum monitor_result status;
-	uint32_t timedout;	
 	int j;
 	
 	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
@@ -1268,7 +1261,7 @@ static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ct
 			continue;
 		}
 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx, 
-					timeval_zero(),
+					CONTROL_TIMEOUT(),
 					nodemap->nodes[j].vnn,
 					verify_recmaster_callback, rmdata);
 		if (state == NULL) {
@@ -1288,19 +1281,9 @@ static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ct
 	/* now wait for up to the maximum number of seconds allowed
 	   or until all nodes we expect a response from has replied
 	*/
-	timedout = 0;
-	event_add_timed(ctdb->ev, rmdata, CONTROL_TIMEOUT(),
-			verify_recmaster_timeout, &timedout);
-
-	while ( (rmdata->count > 0)
-	&&	(timedout == 0) ) {
+	while (rmdata->count > 0) {
 		event_loop_once(ctdb->ev);
 	}
-	if (timedout) {
-		DEBUG(0,("Timedout while waiting for getrecmaster replies.\n"));
-		rmdata->status = MONITOR_FAILED;
-	}
-
 
 	status = rmdata->status;
 	talloc_free(mem_ctx);

From 1da9c03b1fdaaa18fcd0682533952b021aaf13ca Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Fri, 24 Aug 2007 09:34:04 +1000
Subject: [PATCH 07/15] comment why we do a talloc_steal

(This used to be ctdb commit aba7972728307e0ae52ccf8c0dd5808110fb92d7)
---
 ctdb/client/ctdb_client.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index cc1aa3fb8f5..65fb10ec0c8 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -760,6 +760,9 @@ static void ctdb_client_reply_control(struct ctdb_context *ctdb,
 						 c->errorlen);
 	}
 
+	/* state->outdata now uses resources from c so we dont want c
+	   to just dissappear from under us while state is still alive
+	*/
 	talloc_steal(state, c);
 
 	state->state = CTDB_CONTROL_DONE;

From 495a6403da4de32c382d885dcc88ea4979942c0b Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Fri, 24 Aug 2007 10:42:06 +1000
Subject: [PATCH 08/15] change the api for managing callbacks to controls so
 that instead of passing it as a parameter we set the callback function
 explicitly from the caller if the ..._send() function returned a valid state
 pointer.

(This used to be ctdb commit aa939570662786455f63299b62c99882cff29d42)
---
 ctdb/client/ctdb_client.c   | 79 +++++++++++--------------------------
 ctdb/include/ctdb.h         | 21 +++++++---
 ctdb/include/ctdb_private.h |  3 +-
 ctdb/server/ctdb_recoverd.c | 21 +++++-----
 4 files changed, 51 insertions(+), 73 deletions(-)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 65fb10ec0c8..ecbe57e342b 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -665,22 +665,6 @@ int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
 }
 
 
-struct ctdb_client_control_state {
-	struct ctdb_context *ctdb;
-	uint32_t reqid;
-	int32_t status;
-	TDB_DATA outdata;
-	enum control_state state;
-	char *errormsg;
-	struct ctdb_req_control *c;
-
-	/* if we have a callback registered for the completion (or failure) of
-	   this control
-	   if a callback is used, it MUST talloc_free the cb_data passed to it
-	*/
-	control_callback callback;
-	void *cb_private;
-};
 
 /*
    called when a control completes or timesout to invoke the callback
@@ -690,10 +674,7 @@ static void invoke_control_callback(struct event_context *ev, struct timed_event
 	struct timeval t, void *private_data)
 {
 	struct ctdb_client_control_state *state;
-	struct ctdb_control_cb_data *cb_data;
 	struct ctdb_context *ctdb;
-	control_callback callback;
-	void *cb_private;
 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
 	int ret;
 
@@ -701,29 +682,12 @@ static void invoke_control_callback(struct event_context *ev, struct timed_event
 	talloc_steal(tmp_ctx, state);
 
 	ctdb       = state->ctdb;
-	callback   = state->callback;
-	cb_private = state->cb_private;
 
-	cb_data = talloc_zero(tmp_ctx, struct ctdb_control_cb_data);
-	if (cb_data == NULL) {
-		talloc_free(tmp_ctx);
-		CTDB_NO_MEMORY_VOID(ctdb, cb_data);
-	}
+	ret = ctdb_control_recv(ctdb, state, state,
+			&state->outdata, 
+			&state->status, 
+			&state->errormsg);
 
-	cb_data->state = state->state;
-	cb_data->vnn   = state->c->hdr.destnode;
-
-	ret = ctdb_control_recv(ctdb, state, cb_data,
-			&cb_data->outdata, 
-			&cb_data->status, 
-			&cb_data->errormsg);
-	/* we dont check ret since we expect that ctdb_control_recv can fail
-	   for example if the control timedout
-
-	   state is always talloc_free()'d inside ctdb_control_recv
-	*/
-
-	callback(cb_data, cb_private);
 	talloc_free(tmp_ctx);
 }
 
@@ -770,7 +734,7 @@ static void ctdb_client_reply_control(struct ctdb_context *ctdb,
 	/* if we had a callback registered for this control, pull the response
 	   and call the callback.
 	*/
-	if (state->callback) {
+	if (state->async.fn) {
 		event_add_timed(ctdb->ev, state, timeval_zero(), invoke_control_callback, state);
 	}
 }
@@ -799,7 +763,7 @@ static void control_timeout_func(struct event_context *ev, struct timed_event *t
 	/* if we had a callback registered for this control, pull the response
 	   and call the callback.
 	*/
-	if (state->callback) {
+	if (state->async.fn) {
 		event_add_timed(state->ctdb->ev, state, timeval_zero(), invoke_control_callback, state);
 	}
 }
@@ -810,8 +774,7 @@ struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb,
 		uint32_t opcode, uint32_t flags, TDB_DATA data, 
 		TALLOC_CTX *mem_ctx, TDB_DATA *outdata,
 		struct timeval *timeout,
-		char **errormsg,
-		control_callback callback, void *cb_private)
+		char **errormsg)
 {
 	struct ctdb_client_control_state *state;
 	size_t len;
@@ -834,8 +797,6 @@ struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb,
 	state->reqid      = ctdb_reqid_new(ctdb, state);
 	state->state      = CTDB_CONTROL_WAIT;
 	state->errormsg   = NULL;
-	state->callback   = callback;
-	state->cb_private = cb_private;
 
 	talloc_set_destructor(state, ctdb_control_destructor);
 
@@ -892,8 +853,12 @@ int ctdb_control_recv(struct ctdb_context *ctdb,
 	while (state->state == CTDB_CONTROL_WAIT) {
 		event_loop_once(ctdb->ev);
 	}
+
 	if (state->state != CTDB_CONTROL_DONE) {
 		DEBUG(0,(__location__ " ctdb_control_recv failed\n"));
+		if (state->async.fn) {
+			state->async.fn(state);
+		}
 		talloc_free(state);
 		return -1;
 	}
@@ -903,11 +868,13 @@ int ctdb_control_recv(struct ctdb_context *ctdb,
 		if (errormsg) {
 			(*errormsg) = talloc_move(mem_ctx, &state->errormsg);
 		}
+		if (state->async.fn) {
+			state->async.fn(state);
+		}
 		talloc_free(state);
 		return -1;
 	}
 
-
 	if (outdata) {
 		*outdata = state->outdata;
 		outdata->dptr = talloc_memdup(mem_ctx, outdata->dptr, outdata->dsize);
@@ -918,6 +885,10 @@ int ctdb_control_recv(struct ctdb_context *ctdb,
 	}
 
 
+
+	if (state->async.fn) {
+		state->async.fn(state);
+	}
 	talloc_free(state);
 	return 0;
 }
@@ -939,8 +910,7 @@ int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid,
 
 	state = ctdb_control_send(ctdb, destnode, srvid, opcode, 
 			flags, data, mem_ctx, outdata,
-			timeout, errormsg,
-			NULL, NULL);
+			timeout, errormsg);
 	return ctdb_control_recv(ctdb, state, mem_ctx, outdata, status, 
 			errormsg);
 }
@@ -1066,8 +1036,7 @@ ctdb_ctrl_getrecmode_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct
 {
 	return ctdb_control_send(ctdb, destnode, 0, 
 			   CTDB_CONTROL_GET_RECMODE, 0, tdb_null, 
-			   mem_ctx, NULL, &timeout, NULL,
-			   NULL, NULL);
+			   mem_ctx, NULL, &timeout, NULL);
 }
 
 int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmode)
@@ -1129,13 +1098,11 @@ int ctdb_ctrl_setrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint
  */
 struct ctdb_client_control_state *
 ctdb_ctrl_getrecmaster_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, 
-			struct timeval timeout, uint32_t destnode,
-			control_callback callback, void *cb_private)
+			struct timeval timeout, uint32_t destnode)
 {
 	return ctdb_control_send(ctdb, destnode, 0, 
 			   CTDB_CONTROL_GET_RECMASTER, 0, tdb_null, 
-			   mem_ctx, NULL, &timeout, NULL,
-			   callback, cb_private);
+			   mem_ctx, NULL, &timeout, NULL);
 }
 
 int ctdb_ctrl_getrecmaster_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmaster)
@@ -1160,7 +1127,7 @@ int ctdb_ctrl_getrecmaster(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struc
 {
 	struct ctdb_client_control_state *state;
 
-	state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx, timeout, destnode, NULL, NULL);
+	state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx, timeout, destnode);
 	return ctdb_ctrl_getrecmaster_recv(ctdb, mem_ctx, state, recmaster);
 }
 
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
index 17cf2c598f4..8f52aa79a0b 100644
--- a/ctdb/include/ctdb.h
+++ b/ctdb/include/ctdb.h
@@ -98,15 +98,24 @@ struct ctdb_call_info {
 
 enum control_state {CTDB_CONTROL_WAIT, CTDB_CONTROL_DONE, CTDB_CONTROL_ERROR, CTDB_CONTROL_TIMEOUT};
 
-struct ctdb_control_cb_data {
-	enum control_state state;
-	uint32_t vnn;
+struct ctdb_client_control_state {
+	struct ctdb_context *ctdb;
+	uint32_t reqid;
 	int32_t status;
 	TDB_DATA outdata;
+	enum control_state state;
 	char *errormsg;
-};
+	struct ctdb_req_control *c;
 
-typedef int (*control_callback)(struct ctdb_control_cb_data *cb_data, void *cb_private);
+	/* callback registered for the completion (or failure) of this control.
+	   it must NOT talloc_free the state passed to it: ctdb_control_recv()
+	   calls it and then frees the state itself
+	*/
+	struct {
+		void (*fn)(struct ctdb_client_control_state *);
+		void *private;
+	} async;	
+};
 
 
 struct event_context;
@@ -340,7 +349,7 @@ int ctdb_ctrl_setmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint
  */
 int ctdb_ctrl_getrecmaster(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmaster);
 
-struct ctdb_client_control_state *ctdb_ctrl_getrecmaster_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, control_callback callback, void *cb_private);
+struct ctdb_client_control_state *ctdb_ctrl_getrecmaster_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode);
 
 int ctdb_ctrl_getrecmaster_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmaster);
 
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index 1c63daf469b..8a61131593d 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -861,8 +861,7 @@ ctdb_control_send(struct ctdb_context *ctdb,
 		uint32_t opcode, uint32_t flags, TDB_DATA data, 
 		TALLOC_CTX *mem_ctx, TDB_DATA *outdata,
 		struct timeval *timeout,
-		char **errormsg,
-		control_callback callback, void *cb_private);
+		char **errormsg);
 
 
 
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 208db3963d5..f23dcea2704 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1209,9 +1209,9 @@ struct verify_recmaster_data {
 	enum monitor_result status;
 };
 
-static int verify_recmaster_callback(struct ctdb_control_cb_data *cb_data, void *cb_private)
+static void verify_recmaster_callback(struct ctdb_client_control_state *state)
 {
-	struct verify_recmaster_data *rmdata = talloc_get_type(cb_private, struct verify_recmaster_data);
+	struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private, struct verify_recmaster_data);
 
 
 	/* one more node has responded with recmaster data*/
@@ -1220,22 +1220,22 @@ static int verify_recmaster_callback(struct ctdb_control_cb_data *cb_data, void
 	/* if we failed to get the recmaster, then return an error and let
 	   the main loop try again.
 	*/
-	if (cb_data->state != CTDB_CONTROL_DONE) {
+	if (state->state != CTDB_CONTROL_DONE) {
 		if (rmdata->status == MONITOR_OK) {
 			rmdata->status = MONITOR_FAILED;
 		}
-		return 0;
+		return;
 	}
 
 	/* if we got a response, then the recmaster will be stored in the
 	   status field
 	*/
-	if (cb_data->status != rmdata->vnn) {
-		DEBUG(0,("Node %d does not agree we are the recmaster. Need a new recmaster election\n",cb_data->vnn));
+	if (state->status != rmdata->vnn) {
+		DEBUG(0,("Node %d does not agree we are the recmaster. Need a new recmaster election\n", state->c->hdr.destnode));
 		rmdata->status = MONITOR_ELECTION_NEEDED;
 	}
 
-	return 0;
+	return;
 }
 
 
@@ -1262,8 +1262,7 @@ static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ct
 		}
 		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx, 
 					CONTROL_TIMEOUT(),
-					nodemap->nodes[j].vnn,
-					verify_recmaster_callback, rmdata);
+					nodemap->nodes[j].vnn);
 		if (state == NULL) {
 			/* we failed to send the control, treat this as 
 			   an error and try again next iteration
@@ -1273,6 +1272,10 @@ static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ct
 			return MONITOR_FAILED;
 		}
 
+		/* set up the callback functions */
+		state->async.fn = verify_recmaster_callback;
+		state->async.private = rmdata;
+
 		/* one more control to wait for to complete */
 		rmdata->count++;
 	}

From de23937368088327059214f72f083f896f4e1fed Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Fri, 24 Aug 2007 10:54:34 +1000
Subject: [PATCH 09/15] cleanup invoke_control_callback.   we dont need to pass
 some of these parameters to _recv() since they are already set

(This used to be ctdb commit 2034dbebb26d7a2d51241943f6ccbe15bb6a5169)
---
 ctdb/client/ctdb_client.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index ecbe57e342b..a3203c21e96 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -674,19 +674,16 @@ static void invoke_control_callback(struct event_context *ev, struct timed_event
 	struct timeval t, void *private_data)
 {
 	struct ctdb_client_control_state *state;
-	struct ctdb_context *ctdb;
 	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
 	int ret;
 
 	state = talloc_get_type(private_data, struct ctdb_client_control_state);
 	talloc_steal(tmp_ctx, state);
 
-	ctdb       = state->ctdb;
-
-	ret = ctdb_control_recv(ctdb, state, state,
-			&state->outdata, 
-			&state->status, 
-			&state->errormsg);
+	ret = ctdb_control_recv(state->ctdb, state, state,
+			NULL, 
+			NULL, 
+			NULL);
 
 	talloc_free(tmp_ctx);
 }

From 6681da31df926625ce73b3e9047caa7ff4065792 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Fri, 24 Aug 2007 15:53:41 +1000
Subject: [PATCH 10/15] add an initial implementation of a service_id structure
 and three controls to  register/unregister/check a server id.

A server id consists of TYPE:VNN:ID, where TYPE is specific to the
application, VNN is the node where the server id was registered, and ID
might be a node-unique identifier such as a pid or similar.


Clients can register a server id for themselves at the local ctdb daemon.
When a client disappears, or when the domain socket connection for the
client drops, any and all server ids registered across that domain
socket will also be automatically removed from the store.

clients can register as many server_ids as they want at the same time
but each TYPE:VNN:ID must be globally unique.

Clients have the option of explicitly unregistering a server id by using
the UNREGISTER control.


Registration and unregistration can only be done by clients at the local
daemon. Clients cannot register their server id at a remote node.


Clients can check whether a server id exists on any ctdb node in the
network by using the CHECK control.

(This used to be ctdb commit d44798feec26147c5cc05922cb2186f0ef0307be)
---
 ctdb/Makefile.in             |   1 +
 ctdb/client/ctdb_client.c    |  87 +++++++++++++++++++++++++++
 ctdb/common/cmdline.c        |   4 ++
 ctdb/include/ctdb.h          |  17 ++++++
 ctdb/include/ctdb_private.h  |  16 ++++-
 ctdb/server/ctdb_control.c   |  14 +++++
 ctdb/server/ctdb_serverids.c | 113 +++++++++++++++++++++++++++++++++++
 ctdb/tools/ctdb.c            |  84 ++++++++++++++++++++++++++
 8 files changed, 335 insertions(+), 1 deletion(-)
 create mode 100644 ctdb/server/ctdb_serverids.c

diff --git a/ctdb/Makefile.in b/ctdb/Makefile.in
index 7499b0d7ff2..d8c06fab9ad 100644
--- a/ctdb/Makefile.in
+++ b/ctdb/Makefile.in
@@ -48,6 +48,7 @@ CTDB_SERVER_OBJ = server/ctdbd.o server/ctdb_daemon.o server/ctdb_lockwait.o \
 	server/ctdb_tunables.o server/ctdb_monitor.o server/ctdb_server.o \
 	server/ctdb_control.o server/ctdb_call.o server/ctdb_ltdb_server.o \
 	server/ctdb_traverse.o server/eventscript.o server/ctdb_takeover.o \
+	server/ctdb_serverids.o \
 	$(CTDB_CLIENT_OBJ) $(CTDB_TCP_OBJ) @INFINIBAND_WRAPPER_OBJ@
 
 TEST_BINS=bin/ctdb_bench bin/ctdb_fetch bin/ctdb_store bin/rb_test \
diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index a3203c21e96..f0b22dbb066 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -2299,6 +2299,93 @@ int ctdb_ctrl_get_tcp_tickles(struct ctdb_context *ctdb,
 	return status;
 }
 
+/*
+  register a server id
+ */
+int ctdb_ctrl_register_server_id(struct ctdb_context *ctdb, 
+		      struct timeval timeout, 
+		      struct ctdb_server_id *id)
+{
+	TDB_DATA data;
+	int32_t res;
+	int ret;
+
+	data.dsize = sizeof(struct ctdb_server_id);
+	data.dptr  = (unsigned char *)id;
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, 
+			CTDB_CONTROL_REGISTER_SERVER_ID, 
+			0, data, NULL,
+			NULL, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(0,(__location__ " ctdb_control for register server id failed\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+  unregister a server id
+ */
+int ctdb_ctrl_unregister_server_id(struct ctdb_context *ctdb, 
+		      struct timeval timeout, 
+		      struct ctdb_server_id *id)
+{
+	TDB_DATA data;
+	int32_t res;
+	int ret;
+
+	data.dsize = sizeof(struct ctdb_server_id);
+	data.dptr  = (unsigned char *)id;
+
+	ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, 
+			CTDB_CONTROL_UNREGISTER_SERVER_ID, 
+			0, data, NULL,
+			NULL, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(0,(__location__ " ctdb_control for unregister server id failed\n"));
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+  check if a server id exists
+ */
+int ctdb_ctrl_check_server_id(struct ctdb_context *ctdb, 
+		      struct timeval timeout, 
+		      uint32_t destnode,
+		      struct ctdb_server_id *id,
+		      uint32_t *status)
+{
+	TDB_DATA data;
+	int32_t res;
+	int ret;
+
+	data.dsize = sizeof(struct ctdb_server_id);
+	data.dptr  = (unsigned char *)id;
+
+	ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_CHECK_SERVER_ID, 
+			0, data, NULL,
+			NULL, &res, &timeout, NULL);
+	if (ret != 0) {
+		DEBUG(0,(__location__ " ctdb_control for check server id failed\n"));
+		return -1;
+	}
+
+	if (res) {
+		*status = 1;
+	} else {
+		*status = 0;
+	}
+
+	return 0;
+}
+
+
 /*
   initialise the ctdb daemon for client applications
 
diff --git a/ctdb/common/cmdline.c b/ctdb/common/cmdline.c
index f72254e2956..fb5c76aeae5 100644
--- a/ctdb/common/cmdline.c
+++ b/ctdb/common/cmdline.c
@@ -23,6 +23,7 @@
 #include "popt.h"
 #include "../include/ctdb.h"
 #include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
 
 /* Handle common command line options for ctdb test progs
  */
@@ -87,6 +88,9 @@ struct ctdb_context *ctdb_cmdline_init(struct event_context *ev)
 		exit(1);
 	}
 
+	/* set up the tree to store server ids */
+	ctdb->server_ids = trbt_create(ctdb, 0);
+
 	return ctdb;
 }
 
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
index 8f52aa79a0b..b7c1f71a0b6 100644
--- a/ctdb/include/ctdb.h
+++ b/ctdb/include/ctdb.h
@@ -405,6 +405,23 @@ int ctdb_ctrl_modflags(struct ctdb_context *ctdb,
 		       uint32_t destnode, 
 		       uint32_t set, uint32_t clear);
 
+enum ctdb_server_id_type { SERVER_TYPE_SAMBA=1 };
+
+struct ctdb_server_id {
+	enum ctdb_server_id_type type;
+	uint32_t vnn;
+	uint32_t server_id;
+};
+int ctdb_ctrl_register_server_id(struct ctdb_context *ctdb,
+		struct timeval timeout,
+		struct ctdb_server_id *id);
+int ctdb_ctrl_unregister_server_id(struct ctdb_context *ctdb, 
+		struct timeval timeout, 
+		struct ctdb_server_id *id);
+int ctdb_ctrl_check_server_id(struct ctdb_context *ctdb,
+		struct timeval timeout, uint32_t destnode, 
+		struct ctdb_server_id *id, uint32_t *status);
+
 int ctdb_socket_connect(struct ctdb_context *ctdb);
 
 #endif
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index 8a61131593d..9e77e58fd75 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -349,6 +349,7 @@ struct ctdb_context {
 	bool do_setsched;
 	void *saved_scheduler_param;
 	struct ctdb_kill_tcp *killtcp;
+	struct _trbt_tree_t *server_ids;	
 };
 
 struct ctdb_db_context {
@@ -451,7 +452,11 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS          = 0,
 		    CTDB_CONTROL_KILL_TCP                = 54,
 		    CTDB_CONTROL_GET_TCP_TICKLE_LIST     = 55,
 		    CTDB_CONTROL_SET_TCP_TICKLE_LIST     = 56,
-};
+		    CTDB_CONTROL_REGISTER_SERVER_ID	 = 57,
+		    CTDB_CONTROL_UNREGISTER_SERVER_ID	 = 58,
+		    CTDB_CONTROL_CHECK_SERVER_ID	 = 59,
+		    CTDB_CONTROL_GET_SERVER_ID_LIST	 = 60,
+};	
 
 /*
   structure passed in ctdb_control_set_rsn_nonempty
@@ -1120,4 +1125,13 @@ int ctdb_ctrl_get_tcp_tickles(struct ctdb_context *ctdb,
 		      uint32_t vnn,
 		      struct ctdb_control_tcp_tickle_list **list);
 
+
+int32_t ctdb_control_register_server_id(struct ctdb_context *ctdb, 
+		      uint32_t client_id,
+		      TDB_DATA indata);
+int32_t ctdb_control_check_server_id(struct ctdb_context *ctdb, 
+		      TDB_DATA indata);
+int32_t ctdb_control_unregister_server_id(struct ctdb_context *ctdb, 
+		      TDB_DATA indata);
+
 #endif
diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c
index e134850cf46..346d0237426 100644
--- a/ctdb/server/ctdb_control.c
+++ b/ctdb/server/ctdb_control.c
@@ -26,6 +26,7 @@
 #include "lib/util/dlinklist.h"
 #include "db_wrap.h"
 
+
 struct ctdb_control_state {
 	struct ctdb_context *ctdb;
 	uint32_t reqid;
@@ -34,6 +35,7 @@ struct ctdb_control_state {
 	unsigned flags;
 };
 
+
 /*
   process a control request
  */
@@ -294,6 +296,18 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
 		/* data size is verified in the called function */
 		return ctdb_control_set_tcp_tickle_list(ctdb, indata);
 
+	case CTDB_CONTROL_REGISTER_SERVER_ID: 
+		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_server_id));
+		return ctdb_control_register_server_id(ctdb, client_id, indata);
+
+	case CTDB_CONTROL_UNREGISTER_SERVER_ID: 
+		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_server_id));
+		return ctdb_control_unregister_server_id(ctdb, indata);
+
+	case CTDB_CONTROL_CHECK_SERVER_ID: 
+		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_server_id));
+		return ctdb_control_check_server_id(ctdb, indata);
+
 	default:
 		DEBUG(0,(__location__ " Unknown CTDB control opcode %u\n", opcode));
 		return -1;
diff --git a/ctdb/server/ctdb_serverids.c b/ctdb/server/ctdb_serverids.c
new file mode 100644
index 00000000000..b406a353f13
--- /dev/null
+++ b/ctdb/server/ctdb_serverids.c
@@ -0,0 +1,113 @@
+/* 
+   ctdb_control protocol code to manage server ids
+
+   Copyright (C) Ronnie Sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "includes.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+
+
+#define SERVER_ID_KEY_SIZE 3
+static uint32_t *get_server_id_key(struct ctdb_server_id *server_id)
+{
+	static uint32_t key[SERVER_ID_KEY_SIZE];
+
+	key[0] = server_id->type;
+	key[1] = server_id->vnn;
+	key[2] = server_id->server_id;
+
+	return &key[0];
+}
+
+/* add a server_id to the tree.
+   if we already had 'data' in the tree then this is a duplicate and we can
+   just talloc_free the structure in parm and leave data in the tree.
+   otherwise, if this is a new node, we return parm and that is inserted
+   into the tree.
+*/
+static void *add_server_id_callback(void *parm, void *data)
+{
+	if (data) {
+		talloc_free(parm);
+		return data;
+	}
+	return parm;
+}
+
+/*
+  register a server id
+  a server id that is registered with ctdb will be automatically unregistered
+  once the client domain socket disappears.
+ */
+int32_t ctdb_control_register_server_id(struct ctdb_context *ctdb, 
+				 uint32_t client_id,
+				 TDB_DATA indata)
+{
+	struct ctdb_server_id *server_id;
+	struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+
+
+	if (client == NULL) {
+		DEBUG(0,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+		return 1;
+	}
+
+	/* hang the server_id structure off client before storing it in the
+	   tree so that is will be automatically destroyed when client
+	   is destroyed. 
+	   when the structure is free'd it will be automatically
+	   removed from the tree
+	*/
+	server_id = talloc_memdup(client, indata.dptr, indata.dsize);
+	CTDB_NO_MEMORY(ctdb, server_id);
+
+	trbt_insertarray32_callback(ctdb->server_ids, SERVER_ID_KEY_SIZE,
+		get_server_id_key(server_id), 
+		add_server_id_callback, server_id);
+
+	return 0;
+}
+
+
+/*
+  check whether a server id exists: returns 1 if it is registered, else 0
+ */
+int32_t ctdb_control_check_server_id(struct ctdb_context *ctdb, 
+				 TDB_DATA indata)
+{
+	struct ctdb_server_id *server_id = (struct ctdb_server_id *)indata.dptr;
+
+	/* compare against NULL: casting the pointer to int32_t truncates it */
+	return trbt_lookuparray32(ctdb->server_ids,
+				SERVER_ID_KEY_SIZE,
+				get_server_id_key(server_id)) != NULL;
+}
+
+/*
+  unregisters a server id
+ */
+int32_t ctdb_control_unregister_server_id(struct ctdb_context *ctdb, 
+				 TDB_DATA indata)
+{
+	struct ctdb_server_id *server_id = (struct ctdb_server_id *)indata.dptr;
+
+	talloc_free(trbt_lookuparray32(ctdb->server_ids, 
+			SERVER_ID_KEY_SIZE,
+			get_server_id_key(server_id)));
+	return 0;
+}
+
+
diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c
index 7a6ff94233a..83b262f520b 100644
--- a/ctdb/tools/ctdb.c
+++ b/ctdb/tools/ctdb.c
@@ -373,6 +373,86 @@ static int kill_tcp(struct ctdb_context *ctdb, int argc, const char **argv)
 	return -1;
 }
 
+
+/*
+  register a server id (args: <vnn> <type> <id>); returns 0 on success
+ */
+static int regsrvid(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+	int ret;
+	struct ctdb_server_id server_id;
+
+	if (argc < 3) {
+		usage();
+	}
+
+	server_id.vnn       = strtoul(argv[0], NULL, 0);
+	server_id.type      = strtoul(argv[1], NULL, 0);
+	server_id.server_id = strtoul(argv[2], NULL, 0);
+
+	ret = ctdb_ctrl_register_server_id(ctdb, TIMELIMIT(), &server_id);
+	if (ret != 0) {
+		DEBUG(0, ("Unable to register server_id from node %u\n", options.vnn));
+		return ret;
+	}
+	return 0;	/* success must not be reported as a failure */
+}
+
+/*
+  unregister a server id (args: <vnn> <type> <id>); returns 0 on success
+ */
+static int unregsrvid(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+	int ret;
+	struct ctdb_server_id server_id;
+
+	if (argc < 3) {
+		usage();
+	}
+
+	server_id.vnn       = strtoul(argv[0], NULL, 0);
+	server_id.type      = strtoul(argv[1], NULL, 0);
+	server_id.server_id = strtoul(argv[2], NULL, 0);
+
+	ret = ctdb_ctrl_unregister_server_id(ctdb, TIMELIMIT(), &server_id);
+	if (ret != 0) {
+		DEBUG(0, ("Unable to unregister server_id from node %u\n", options.vnn));
+		return ret;
+	}
+	return 0;	/* success must not be reported as a failure */
+}
+
+/*
+  check if a server id exists
+ */
+static int chksrvid(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+	uint32_t status;
+	int ret;
+	struct ctdb_server_id server_id;
+
+	if (argc < 3) {
+		usage();
+	}
+
+	server_id.vnn       = strtoul(argv[0], NULL, 0);
+	server_id.type      = strtoul(argv[1], NULL, 0);
+	server_id.server_id = strtoul(argv[2], NULL, 0);
+
+	ret = ctdb_ctrl_check_server_id(ctdb, TIMELIMIT(), options.vnn, &server_id, &status);
+	if (ret != 0) {
+		DEBUG(0, ("Unable to check server_id from node %u\n", options.vnn));
+		return ret;
+	}
+
+	if (status) {
+		printf("Server id %d:%d:%d EXISTS\n", server_id.vnn, server_id.type, server_id.server_id);
+	} else {
+		printf("Server id %d:%d:%d does NOT exist\n", server_id.vnn, server_id.type, server_id.server_id);
+	}
+	return 0;
+}
+
 /*
   send a tcp tickle ack
  */
@@ -964,6 +1044,10 @@ static const struct {
 	{ "killtcp",         kill_tcp,                  false, "kill a tcp connection.", "<srcip:port> <dstip:port>" },
 	{ "tickle",          tickle_tcp,                false, "send a tcp tickle ack", "<srcip:port> <dstip:port>" },
 	{ "gettickles",      control_get_tickles,       false, "get the list of tickles registered for this vnn", "<vnn>" },
+
+	{ "regsrvid",        regsrvid,			false, "register a server id", "<vnn> <type> <id>" },
+	{ "unregsrvid",      unregsrvid,		false, "unregister a server id", "<vnn> <type> <id>" },
+	{ "chksrvid",        chksrvid,			false, "check if a server id exists", "<vnn> <type> <id>" },
 };
 
 /*

From 801bdbdc805180155e7cf5b6b268aaeb348d843d Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Sun, 26 Aug 2007 10:57:02 +1000
Subject: [PATCH 11/15] add a control to pull the server id list off a node

(This used to be ctdb commit 38aa759aa88a042c31b401551f6a713fb7bbe84e)
---
 ctdb/client/ctdb_client.c    | 24 +++++++++++
 ctdb/include/ctdb.h          | 11 +++++
 ctdb/include/ctdb_private.h  |  2 +
 ctdb/server/ctdb_control.c   |  4 ++
 ctdb/server/ctdb_serverids.c | 78 +++++++++++++++++++++++++++++++++++-
 ctdb/tools/ctdb.c            | 25 ++++++++++++
 6 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index f0b22dbb066..0aa0e9b97ed 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -2385,6 +2385,30 @@ int ctdb_ctrl_check_server_id(struct ctdb_context *ctdb,
 	return 0;
 }
 
+/*
+   get the list of server ids that are registered on a node
+*/
+int ctdb_ctrl_get_server_id_list(struct ctdb_context *ctdb,
+		TALLOC_CTX *mem_ctx,
+		struct timeval timeout, uint32_t destnode, 
+		struct ctdb_server_id_list **svid_list)
+{
+	int ret;
+	TDB_DATA outdata;
+	int32_t res;
+
+	ret = ctdb_control(ctdb, destnode, 0, 
+			   CTDB_CONTROL_GET_SERVER_ID_LIST, 0, tdb_null, 
+			   mem_ctx, &outdata, &res, &timeout, NULL);
+	if (ret != 0 || res != 0) {
+		DEBUG(0,(__location__ " ctdb_control for get_server_id_list failed\n"));
+		return -1;
+	}
+
+	*svid_list = (struct ctdb_server_id_list *)talloc_steal(mem_ctx, outdata.dptr);
+		    
+	return 0;
+}
 
 /*
   initialise the ctdb daemon for client applications
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
index b7c1f71a0b6..e46cc68a794 100644
--- a/ctdb/include/ctdb.h
+++ b/ctdb/include/ctdb.h
@@ -412,6 +412,13 @@ struct ctdb_server_id {
 	uint32_t vnn;
 	uint32_t server_id;
 };
+
+struct ctdb_server_id_list {
+	uint32_t num;
+	struct ctdb_server_id server_ids[1];
+};
+
+
 int ctdb_ctrl_register_server_id(struct ctdb_context *ctdb,
 		struct timeval timeout,
 		struct ctdb_server_id *id);
@@ -421,6 +428,10 @@ int ctdb_ctrl_unregister_server_id(struct ctdb_context *ctdb,
 int ctdb_ctrl_check_server_id(struct ctdb_context *ctdb,
 		struct timeval timeout, uint32_t destnode, 
 		struct ctdb_server_id *id, uint32_t *status);
+int ctdb_ctrl_get_server_id_list(struct ctdb_context *ctdb,
+		TALLOC_CTX *mem_ctx,
+		struct timeval timeout, uint32_t destnode, 
+		struct ctdb_server_id_list **svid_list);
 
 int ctdb_socket_connect(struct ctdb_context *ctdb);
 
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
index 9e77e58fd75..516e3111c39 100644
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@@ -1133,5 +1133,7 @@ int32_t ctdb_control_check_server_id(struct ctdb_context *ctdb,
 		      TDB_DATA indata);
 int32_t ctdb_control_unregister_server_id(struct ctdb_context *ctdb, 
 		      TDB_DATA indata);
+int32_t ctdb_control_get_server_id_list(struct ctdb_context *ctdb, 
+		      TDB_DATA *outdata);
 
 #endif
diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c
index 346d0237426..326b8edca66 100644
--- a/ctdb/server/ctdb_control.c
+++ b/ctdb/server/ctdb_control.c
@@ -308,6 +308,10 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
 		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_server_id));
 		return ctdb_control_check_server_id(ctdb, indata);
 
+	case CTDB_CONTROL_GET_SERVER_ID_LIST:
+		CHECK_CONTROL_DATA_SIZE(0);
+		return ctdb_control_get_server_id_list(ctdb, outdata);
+
 	default:
 		DEBUG(0,(__location__ " Unknown CTDB control opcode %u\n", opcode));
 		return -1;
diff --git a/ctdb/server/ctdb_serverids.c b/ctdb/server/ctdb_serverids.c
index b406a353f13..8c8c9308744 100644
--- a/ctdb/server/ctdb_serverids.c
+++ b/ctdb/server/ctdb_serverids.c
@@ -72,8 +72,9 @@ int32_t ctdb_control_register_server_id(struct ctdb_context *ctdb,
 	   when the structure is free'd it will be automatically
 	   removed from the tree
 	*/
-	server_id = talloc_memdup(client, indata.dptr, indata.dsize);
+	server_id = talloc_zero(client, struct ctdb_server_id);
 	CTDB_NO_MEMORY(ctdb, server_id);
+	memcpy(server_id, indata.dptr, sizeof(struct ctdb_server_id));
 
 	trbt_insertarray32_callback(ctdb->server_ids, SERVER_ID_KEY_SIZE,
 		get_server_id_key(server_id), 
@@ -111,3 +112,78 @@ int32_t ctdb_control_unregister_server_id(struct ctdb_context *ctdb,
 }
 
 
+
+
+struct count_server_ids {
+	int count;
+	struct ctdb_server_id_list *list;
+};
+
+static void server_id_count(void *param, void *data)
+{
+	struct count_server_ids *svid = talloc_get_type(param, 
+						struct count_server_ids);
+
+	if (svid == NULL) {
+		DEBUG(0, (__location__ " Got null pointer for svid\n"));
+		return;
+	}
+
+	svid->count++;
+}
+
+static void server_id_store(void *param, void *data)
+{
+	struct count_server_ids *svid = talloc_get_type(param, 
+						struct count_server_ids);
+	struct ctdb_server_id *server_id = talloc_get_type(data, 
+						struct ctdb_server_id);
+
+	if (svid == NULL) {
+		DEBUG(0, (__location__ " Got null pointer for svid\n"));
+		return;
+	}
+
+	if (svid->count >= svid->list->num) {
+		DEBUG(0, (__location__ " size of server id tree changed during traverse\n"));
+		return;
+	}
+
+	memcpy(&svid->list->server_ids[svid->count], server_id, sizeof(struct ctdb_server_id));
+	svid->count++;
+}
+
+/* 
+   returns a list of all registered server ids for a node
+*/
+int32_t ctdb_control_get_server_id_list(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+	struct count_server_ids *svid;
+
+
+	svid = talloc_zero(outdata, struct count_server_ids);
+	CTDB_NO_MEMORY(ctdb, svid);
+
+
+	/* first we must count how many entries we have */
+	trbt_traversearray32(ctdb->server_ids, SERVER_ID_KEY_SIZE,
+			server_id_count, svid);
+
+
+	outdata->dsize = offsetof(struct ctdb_server_id_list, 
+				server_ids)
+			+ sizeof(struct ctdb_server_id) * svid->count;
+	outdata->dptr  = talloc_size(outdata, outdata->dsize);
+	CTDB_NO_MEMORY(ctdb, outdata->dptr);
+
+
+	/* now fill the structure in */
+	svid->list = (struct ctdb_server_id_list *)(outdata->dptr);
+	svid->list->num = svid->count;
+	svid->count=0;
+	trbt_traversearray32(ctdb->server_ids, SERVER_ID_KEY_SIZE,
+			server_id_store, svid);
+
+
+	return 0;
+}
diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c
index 83b262f520b..3e56a97f6e9 100644
--- a/ctdb/tools/ctdb.c
+++ b/ctdb/tools/ctdb.c
@@ -453,6 +453,30 @@ static int chksrvid(struct ctdb_context *ctdb, int argc, const char **argv)
 	return 0;
 }
 
+/*
+  print all server ids that are registered on a node; returns 0 on success
+ */
+static int getsrvids(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+	int i, ret;
+	struct ctdb_server_id_list *server_ids;
+
+	ret = ctdb_ctrl_get_server_id_list(ctdb, ctdb, TIMELIMIT(), options.vnn, &server_ids);
+	if (ret != 0) {
+		DEBUG(0, ("Unable to get server_id list from node %u\n", options.vnn));
+		return ret;
+	}
+
+	for (i=0; i<server_ids->num; i++) {
+		printf("Server id %d:%d:%d\n",
+			server_ids->server_ids[i].vnn,
+			server_ids->server_ids[i].type,
+			server_ids->server_ids[i].server_id);
+	}
+
+	return 0;	/* success must not be reported as a failure */
+}
+
 /*
   send a tcp tickle ack
  */
@@ -1048,6 +1072,7 @@ static const struct {
 	{ "regsrvid",        regsrvid,			false, "register a server id", "<vnn> <type> <id>" },
 	{ "unregsrvid",      unregsrvid,		false, "unregister a server id", "<vnn> <type> <id>" },
 	{ "chksrvid",        chksrvid,			false, "check if a server id exists", "<vnn> <type> <id>" },
+	{ "getsrvids",       getsrvids,			false, "get a list of all server ids"},
 };
 
 /*

From a9c45b256230080ffd52dd7d65756f0444583b12 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Mon, 27 Aug 2007 09:40:10 +1000
Subject: [PATCH 12/15] change the monitoring of recmode in the recovery daemon
 to use a fully async eventdriven api for controls

(This used to be ctdb commit 8d0e43428c507967a0d96e6a4c5c821ac269c546)
---
 ctdb/server/ctdb_recoverd.c | 99 +++++++++++++++++++++++++------------
 1 file changed, 68 insertions(+), 31 deletions(-)

diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index f23dcea2704..38ecf35bdcc 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -1151,58 +1151,95 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
 enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
 
 
-/* verify that all nodes are in recovery mode normal */
-static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+struct verify_recmode_normal_data {
+	uint32_t count;
+	enum monitor_result status;
+};
+
+/* completion callback for one getrecmode control sent by verify_recmode */
+static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
 {
-	struct ctdb_client_control_state **ctrl_states;
-	uint32_t recmode;
-	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
-	int j, ret;
-	
-	ctrl_states = talloc_array(mem_ctx, struct ctdb_client_control_state *,
-				 nodemap->num);
-	if (!ctrl_states) {
-		DEBUG(0,(__location__ " Failed to allocate temporary ctrl state array\n"));
-		talloc_free(mem_ctx);
-		exit(-1);
+	struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private, struct verify_recmode_normal_data);
+
+	/* one more outstanding control has completed, whether or not it succeeded */
+	rmdata->count--;
+
+	/* if the control did not complete successfully, record MONITOR_FAILED
+	   (unless another status was already recorded) and stop here.
+	*/
+	if (state->state != CTDB_CONTROL_DONE) {
+		if (rmdata->status == MONITOR_OK) {
+			rmdata->status = MONITOR_FAILED;
+		}
+		return;
 	}
 
+	/* the control completed: the recmode of the node is stored in the
+	   status field; anything but normal means a recovery is needed
+	*/
+	if (state->status != CTDB_RECOVERY_NORMAL) {
+		DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", state->c->hdr.destnode));
+		rmdata->status = MONITOR_RECOVERY_NEEDED;
+	}
+
+	return;
+}
+
+
+/* verify that all active nodes are in normal recovery mode; returns MONITOR_FAILED if a
+   control cannot be sent, otherwise the status recorded in rmdata by the callbacks */
+static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+{
+	struct verify_recmode_normal_data *rmdata;
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+	struct ctdb_client_control_state *state;
+	enum monitor_result status;
+	int j;
+	
+	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
+	rmdata->count  = 0;
+	rmdata->status = MONITOR_OK;
 
 	/* loop over all active nodes and send an async getrecmode call to 
 	   them*/
 	for (j=0; j<nodemap->num; j++) {
 		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
-			ctrl_states[j] = NULL;
 			continue;
 		}
-		ctrl_states[j] = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, 
+		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, 
 					CONTROL_TIMEOUT(), 
 					nodemap->nodes[j].vnn);
-	}
-
-	/* wait for the responses to come back and check that all is ok */
-	for (j=0; j<nodemap->num; j++) {
-		if (ctrl_states[j] == NULL) {
-			continue;
-		}
-		ret = ctdb_ctrl_getrecmode_recv(ctdb, mem_ctx, ctrl_states[j], &recmode);
-		if (ret != 0) {
-			DEBUG(0, ("Unable to get recmode from node %u\n", nodemap->nodes[j].vnn));
+		if (state == NULL) {
+			/* we failed to send the control, treat this as
+			   an error and try again next iteration
+			*/
+			DEBUG(0,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
 			talloc_free(mem_ctx);
 			return MONITOR_FAILED;
 		}
 
-		if (recmode != CTDB_RECOVERY_NORMAL) {
-			DEBUG(0, (__location__ " Node:%u was in recovery mode. Restart recovery process\n", nodemap->nodes[j].vnn));
-			talloc_free(mem_ctx);
-			return MONITOR_RECOVERY_NEEDED;
-		}
+		/* have the control report its completion to verify_recmode_normal_callback */
+		state->async.fn = verify_recmode_normal_callback;
+		state->async.private = rmdata;
+
+		/* one more outstanding control to wait for */
+		rmdata->count++;
 	}
 
+	/* run the event loop until the callback has brought count back to
+	   zero. NOTE(review): no time limit is enforced here; presumably
+	   CONTROL_TIMEOUT() bounds each control - confirm */
+	while (rmdata->count > 0) {
+		event_loop_once(ctdb->ev);
+	}
+
+	status = rmdata->status;
 	talloc_free(mem_ctx);
-	return MONITOR_OK;
+	return status;
 }
 
+
 struct verify_recmaster_data {
 	uint32_t count;
 	uint32_t vnn;

From 7f02e16143a067b0ee40cdbc60006ccad6c3879a Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Mon, 27 Aug 2007 10:31:22 +1000
Subject: [PATCH 13/15] add async versions of the freeze node control and
 freeze all nodes in parallel

(This used to be ctdb commit f34e89f54d9f4380e76eb1b5b2385a4d8500b505)
---
 ctdb/client/ctdb_client.c   |  39 +++++++++++---
 ctdb/include/ctdb.h         |  11 +++-
 ctdb/server/ctdb_recoverd.c | 104 +++++++++++++++++++++++++++++++-----
 3 files changed, 132 insertions(+), 22 deletions(-)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 0aa0e9b97ed..91351befd49 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -1807,24 +1807,49 @@ int ctdb_ctrl_getpid(struct ctdb_context *ctdb, struct timeval timeout, uint32_t
 
 
 /*
-  freeze a node
+  async freeze send control: issue CTDB_CONTROL_FREEZE to destnode and return the control state
  */
-int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+struct ctdb_client_control_state *
+ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+	return ctdb_control_send(ctdb, destnode, 0, 
+			   CTDB_CONTROL_FREEZE, 0, tdb_null, 
+			   mem_ctx, NULL, &timeout, NULL);
+}
+
+/* async freeze recv control: collect the result of a freeze control.
+   returns 0 if ctdb_control_recv succeeded and the control status was 0,
+   otherwise logs the failure and returns -1 */
+int ctdb_ctrl_freeze_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state)
 {
 	int ret;
 	int32_t res;
 
-	ret = ctdb_control(ctdb, destnode, 0, 
-			   CTDB_CONTROL_FREEZE, 0, tdb_null, 
-			   NULL, NULL, &res, &timeout, NULL);
-	if (ret != 0 || res != 0) {
-		DEBUG(0,(__location__ " ctdb_control freeze failed\n"));
+	ret = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &res, NULL);
+	if ( (ret != 0) || (res != 0) ){
+		DEBUG(0,(__location__ " ctdb_ctrl_freeze_recv failed\n"));
 		return -1;
 	}
 
 	return 0;
 }
 
+/*
+  freeze a node: returns 0 on success, -1 if the control could not be sent or the freeze failed
+ */
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+	struct ctdb_client_control_state *state;
+	int ret;
+
+	state = ctdb_ctrl_freeze_send(ctdb, tmp_ctx, timeout, destnode);
+	ret = (state == NULL) ? -1 : ctdb_ctrl_freeze_recv(ctdb, tmp_ctx, state);
+	talloc_free(tmp_ctx);
+
+	return ret;
+}
+
 /*
   thaw a node
  */
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
index e46cc68a794..133b667cc93 100644
--- a/ctdb/include/ctdb.h
+++ b/ctdb/include/ctdb.h
@@ -379,7 +379,16 @@ int ctdb_dump_db(struct ctdb_db_context *ctdb_db, FILE *f);
  */
 int ctdb_ctrl_getpid(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *pid);
 
-int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, 
+			uint32_t destnode);
+
+struct ctdb_client_control_state *
+ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, 
+			struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_freeze_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, 
+			struct ctdb_client_control_state *state);
+
 int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
 
 int ctdb_ctrl_getvnn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 38ecf35bdcc..64bb0cf5193 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -116,6 +116,87 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t vnn, uint32_t ban_
 	}
 }
 
+enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
+
+
+struct freeze_node_data {
+	uint32_t count;	/* freeze controls sent but not yet completed */
+	enum monitor_result status;	/* MONITOR_OK until a freeze fails */
+};
+
+
+/* completion callback for one freeze control sent by freeze_all_nodes */
+static void freeze_node_callback(struct ctdb_client_control_state *state)
+{
+	struct freeze_node_data *fndata = talloc_get_type(state->async.private, struct freeze_node_data);
+
+	/* one more outstanding freeze control has completed, whether or not it succeeded */
+	fndata->count--;
+
+	/* if the control did not complete or returned a non-zero status, we must trigger another recovery */
+	if ( (state->state != CTDB_CONTROL_DONE) || (state->status != 0) ) {
+		DEBUG(0, (__location__ " Failed to freeze node:%u. recovery failed\n", state->c->hdr.destnode));
+		fndata->status = MONITOR_RECOVERY_NEEDED;
+	}
+
+	return;
+}
+
+
+
+/* freeze all active nodes in parallel; returns MONITOR_OK if every freeze succeeded,
+   MONITOR_RECOVERY_NEEDED if a freeze could not be sent or did not succeed */
+static enum monitor_result freeze_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+{
+	struct freeze_node_data *fndata;
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+	struct ctdb_client_control_state *state;
+	enum monitor_result status;
+	int j;
+	
+	fndata = talloc(mem_ctx, struct freeze_node_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, fndata);
+	fndata->count  = 0;
+	fndata->status = MONITOR_OK;
+
+	/* loop over all active nodes and send an async freeze call to
+	   them */
+	for (j=0; j<nodemap->num; j++) {
+		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+			continue;
+		}
+		state = ctdb_ctrl_freeze_send(ctdb, mem_ctx, 
+					CONTROL_TIMEOUT(), 
+					nodemap->nodes[j].vnn);
+		if (state == NULL) {
+			/* we failed to send the control: give up and report
+			   that a recovery is needed
+			*/
+			DEBUG(0,("Failed to call ctdb_ctrl_freeze_send during recovery\n"));
+			talloc_free(mem_ctx);
+			return MONITOR_RECOVERY_NEEDED;
+		}
+
+		/* have the control report its completion to freeze_node_callback */
+		state->async.fn = freeze_node_callback;
+		state->async.private = fndata;
+
+		/* one more outstanding control to wait for */
+		fndata->count++;
+	}
+
+	/* run the event loop until the callback has brought count back to
+	   zero. NOTE(review): no time limit is enforced here; presumably
+	   CONTROL_TIMEOUT() bounds each control - confirm */
+	while (fndata->count > 0) {
+		event_loop_once(ctdb->ev);
+	}
+
+	status = fndata->status;
+	talloc_free(mem_ctx);
+	return status;
+}
+
 
 /*
   change recovery mode on all nodes
@@ -124,10 +205,15 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
 {
 	int j, ret;
 
-	/* start the freeze process immediately on all nodes */
-	ctdb_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
-		     CTDB_CONTROL_FREEZE, CTDB_CTRL_FLAG_NOREPLY, tdb_null, 
-		     NULL, NULL, NULL, NULL, NULL);
+	/* freeze all nodes */
+	if (rec_mode == CTDB_RECOVERY_ACTIVE) {
+		ret = freeze_all_nodes(ctdb, nodemap);
+		if (ret != MONITOR_OK) {
+			DEBUG(0, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
+			return -1;
+		}
+	}
+
 
 	/* set recovery mode to active on all nodes */
 	for (j=0; j<nodemap->num; j++) {
@@ -136,14 +222,6 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
 			continue;
 		}
 
-		if (rec_mode == CTDB_RECOVERY_ACTIVE) {
-			ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn);
-			if (ret != 0) {
-				DEBUG(0, (__location__ " Unable to freeze node %u\n", nodemap->nodes[j].vnn));
-				return -1;
-			}
-		}
-
 		ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, rec_mode);
 		if (ret != 0) {
 			DEBUG(0, (__location__ " Unable to set recmode on node %u\n", nodemap->nodes[j].vnn));
@@ -1148,8 +1226,6 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
 }
 
 
-enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
-
 
 struct verify_recmode_normal_data {
 	uint32_t count;

From 2c0c94782aaade9cdf552291a00780bfce9530a0 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Mon, 27 Aug 2007 15:03:52 +1000
Subject: [PATCH 14/15] make the ctdb shutdown command use the async _send()
 function to send the shutdown command and return success to the caller if the
 _send() was successful

(This used to be ctdb commit 6bacaf8c7a96044708a6eda10cc8576adb7f5f79)
---
 ctdb/client/ctdb_client.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 91351befd49..85c55971342 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -972,13 +972,12 @@ int ctdb_ctrl_statistics(struct ctdb_context *ctdb, uint32_t destnode, struct ct
  */
 int ctdb_ctrl_shutdown(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
 {
-	int ret;
-	int32_t res;
+	struct ctdb_client_control_state *state;
 
-	ret = ctdb_control(ctdb, destnode, 0, 
-			   CTDB_CONTROL_SHUTDOWN, CTDB_CTRL_FLAG_NOREPLY, tdb_null, 
-			   NULL, NULL, &res, &timeout, NULL);
-	if (ret != 0) {
+	state = ctdb_control_send(ctdb, destnode, 0, 
+			   CTDB_CONTROL_SHUTDOWN, 0, tdb_null, 
+			   NULL, NULL, &timeout, NULL);
+	if (state == NULL) {
 		DEBUG(0,(__location__ " ctdb_control for shutdown failed\n"));
 		return -1;
 	}

From 794fb10634d92e318dffdb0cd5dacd489c64469a Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Mon, 27 Aug 2007 17:33:46 +1000
Subject: [PATCH 15/15] add an extra debug statement when we send a SIGTERM to
 a process

(This used to be ctdb commit a9c1be9cf9efdc69bfc95657b70e9f8b8230cda8)
---
 ctdb/server/eventscript.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ctdb/server/eventscript.c b/ctdb/server/eventscript.c
index afb86d742d2..c7339a355a6 100644
--- a/ctdb/server/eventscript.c
+++ b/ctdb/server/eventscript.c
@@ -234,6 +234,7 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
  */
 static int event_script_destructor(struct ctdb_event_script_state *state)
 {
+	DEBUG(0,(__location__ " Sending SIGTERM to child pid:%d\n", state->child));
 	kill(state->child, SIGTERM);
 	waitpid(state->child, NULL, 0);
 	return 0;