From 37861932ce25e9d336e5e6c6fe42071818b4cba9 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Mon, 7 Jan 2008 16:17:22 +1100 Subject: [PATCH 01/13] merge from ronnie (This used to be ctdb commit 0aa6e04438aa5ec727815689baa19544df042cf7) --- ctdb/include/ctdb_private.h | 1 + ctdb/server/ctdb_recoverd.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index 07dfcbcd68a..9582b6dade2 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -366,6 +366,7 @@ struct ctdb_context { struct _trbt_tree_t *server_ids; const char *event_script_dir; const char *default_public_interface; + pid_t ctdbd_pid; pid_t recoverd_pid; bool done_startup; const char *node_ip; diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index fa19a975a82..8dbf46932b2 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -1821,6 +1821,12 @@ again: /* we only check for recovery once every second */ ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval); + /* verify that the main daemon is still running */ + if (kill(ctdb->ctdbd_pid, 0) != 0) { + DEBUG(0,("CTDB daemon is no longer available. 
Shutting down recovery daemon\n")); + exit(-1); + } + if (rec->election_timeout) { /* an election is in progress */ goto again; @@ -2275,6 +2281,8 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb) return -1; } + ctdb->ctdbd_pid = getpid(); + ctdb->recoverd_pid = fork(); if (ctdb->recoverd_pid == -1) { return -1; From 61fd2d5d6ff769932e6d946a8317cede8909135a Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Mon, 7 Jan 2008 23:41:07 +1100 Subject: [PATCH 02/13] background the smbstatus -n command (This used to be ctdb commit 0a05cc6763aa6e57089d3ce70516d359fcddb0e4) --- ctdb/config/events.d/50.samba | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctdb/config/events.d/50.samba b/ctdb/config/events.d/50.samba index 1c7b79f5d9c..429d475e929 100755 --- a/ctdb/config/events.d/50.samba +++ b/ctdb/config/events.d/50.samba @@ -36,7 +36,7 @@ periodic_cleanup() { # running smbstatus scrubs any dead entries from the connections # and sessionid database echo "`date` Running periodic cleanup of samba databases" - smbstatus -n > /dev/null 2>&1 + smbstatus -n > /dev/null 2>&1 & } case $cmd in From 25bb60f11223c6ef1294cec51f0cd76a215febc4 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Tue, 8 Jan 2008 09:30:11 +1100 Subject: [PATCH 03/13] show start/stop time of recovery on all nodes (This used to be ctdb commit 9f7662279c367eb3e8a58e6f4aeca521e6f1f1d0) --- ctdb/server/ctdb_recover.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 637894dfd15..7f165cf73af 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -509,6 +509,11 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, return -1; } + if (recmode != ctdb->recovery_mode) { + DEBUG(0,(__location__ " Recovery mode set to %s\n", + recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE")); + } + if (recmode != CTDB_RECOVERY_NORMAL || ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) { ctdb->recovery_mode = recmode; 
From 41f63fe16221cda228887da3f191673debb34c36 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Tue, 8 Jan 2008 11:47:29 +1100 Subject: [PATCH 04/13] updated docs from ronnie (This used to be ctdb commit 0390f9e68210dd7f6e750f7a5909a0f97841193c) --- ctdb/doc/ctdbd.1 | 239 ++++++++++++++++++++++++------------------ ctdb/doc/ctdbd.1.html | 100 ++++++++++++------ 2 files changed, 204 insertions(+), 135 deletions(-) diff --git a/ctdb/doc/ctdbd.1 b/ctdb/doc/ctdbd.1 index 5d056355826..3be1330d01e 100644 --- a/ctdb/doc/ctdbd.1 +++ b/ctdb/doc/ctdbd.1 @@ -1,219 +1,250 @@ .\" Title: ctdbd .\" Author: -.\" Generator: DocBook XSL Stylesheets v1.71.0 -.\" Date: 09/14/2007 +.\" Generator: DocBook XSL Stylesheets v1.73.2 +.\" Date: 01/07/2008 .\" Manual: .\" Source: .\" -.TH "CTDBD" "1" "09/14/2007" "" "" +.TH "CTDBD" "1" "01/07/2008" "" "" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) .ad l .SH "NAME" -ctdbd \- The CTDB cluster daemon +ctdbd - The CTDB cluster daemon .SH "SYNOPSIS" .HP 6 \fBctdbd\fR .HP 6 -\fBctdbd\fR {\-\-reclock=} {\-\-nlist=} {\-\-dbdir=} [\-?\ \-\-help] [\-\-usage] [\-i\ \-\-interactive] [\-\-public\-addresses=] [\-\-event\-script\-dir=] [\-\-logfile=] [\-\-listen=
] [\-\-transport=] [\-\-socket=] [\-d\ \-\-debug=] [\-\-torture] +\fBctdbd\fR [\-?\ \-\-help] [\-d\ \-\-debug=] {\-\-dbdir=} {\-\-dbdir\-persistent=} [\-\-event\-script\-dir=] [\-i\ \-\-interactive] [\-\-listen=
] [\-\-logfile=] {\-\-nlist=} [\-\-nosetsched] [\-\-public\-addresses=] [\-\-public\-interface=] {\-\-reclock=} [\-\-single\-public\-ip=
] [\-\-socket=] [\-\-syslog] [\-\-torture] [\-\-transport=] [\-\-usage] .SH "DESCRIPTION" .PP -ctdbd is the main ctdb daemon. +ctdbd is the main ctdb daemon\. .PP -ctdbd provides a clustered version of the TDB database with automatic rebuild/recovery of the databases upon nodefailures. +ctdbd provides a clustered version of the TDB database with automatic rebuild/recovery of the databases upon nodefailures\. .PP -Combined with a cluster filesystem ctdbd provides a full HA environment for services such as clustered Samba and NFS as well as other services. +Combined with a cluster filesystem ctdbd provides a full HA environment for services such as clustered Samba and NFS as well as other services\. .PP -ctdbd provides monitoring of all nodes in the cluster and automatically reconfigures the cluster and recovers upon node failures. +ctdbd provides monitoring of all nodes in the cluster and automatically reconfigures the cluster and recovers upon node failures\. .PP -ctdbd is the main component in clustered Samba that provides a high\-awailability load\-sharing CIFS server cluster. +ctdbd is the main component in clustered Samba that provides a high\-awailability load\-sharing CIFS server cluster\. .SH "OPTIONS" .PP \-? \-\-help -.RS 3n -Print some help text to the screen. +.RS 4 +Print some help text to the screen\. .RE .PP -\-\-usage -.RS 3n -Print useage information to the screen. -.RE -.PP -\-\-reclock= -.RS 3n -This is the name of the lock file stored of the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery\-master. This file must be stored on shared storage. -.RE -.PP -\-\-nlist= -.RS 3n -This file contains a list of the private ip addresses of every node in the cluster. There is one line/ip address for each node. This file must be the same for all nodes in the cluster. -.sp -This file is usually /etc/ctdb/nodes . 
+\-d \-\-debug= +.RS 4 +This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile\. The default is 0 which will only log important events and errors\. A larger number will provide additional logging\. .RE .PP \-\-dbdir= -.RS 3n -This is the directory on local storage where ctdbd keeps the local copy of the TDB databases. This directory is local for each node and should not be stored on the shared cluster filesystem. +.RS 4 +This is the directory on local storage where ctdbd keeps the local copy of the TDB databases\. This directory is local for each node and should not be stored on the shared cluster filesystem\. .sp -This directory would usually be /var/ctdb . +This directory would usually be /var/ctdb \. +.RE +.PP +\-\-dbdir\-persistent= +.RS 4 +This is the directory on local storage where ctdbd keeps the local copy of the persistent TDB databases\. This directory is local for each node and should not be stored on the shared cluster filesystem\. +.sp +This directory would usually be /etc/ctdb/persistent \. +.RE +.PP +\-\-event\-script\-dir= +.RS 4 +This option is used to specify the directory where the CTDB event scripts are stored\. +.sp +This will normally be /etc/ctdb/events\.d which is part of the ctdb distribution\. .RE .PP \-i \-\-interactive -.RS 3n -By default ctdbd will detach itself from the shell and run in the background as a daemon. This option makes ctdbd to start in interactive mode. +.RS 4 +By default ctdbd will detach itself from the shell and run in the background as a daemon\. This option makes ctdbd to start in interactive mode\. +.RE +.PP +\-\-listen=
+.RS 4 +This specifies which ip address ctdb will bind to\. By default ctdbd will bind to the first address it finds in the /etc/ctdb/nodes file and which is also present on the local system in which case you do not need to provide this option\. +.sp +This option is only required when you want to run multiple ctdbd daemons/nodes on the same physical host in which case there would be multiple entries in /etc/ctdb/nodes what would match a local interface\. +.RE +.PP +\-\-logfile= +.RS 4 +This is the file where ctdbd will write its log\. This is usually /var/log/log\.ctdb \. +.RE +.PP +\-\-nlist= +.RS 4 +This file contains a list of the private ip addresses of every node in the cluster\. There is one line/ip address for each node\. This file must be the same for all nodes in the cluster\. +.sp +This file is usually /etc/ctdb/nodes \. +.RE +.PP +\-\-nosetsched +.RS 4 +Normally ctdb will change its scheduler to run as a real\-time process\. This option is used to change this behaviour and have ctdb run as a normal process\. .RE .PP \-\-public_addresses= -.RS 3n -When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster. This file contains a list of ip addresses netmasks and interfaces. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes. +.RS 4 +When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster\. This file contains a list of ip addresses netmasks and interfaces\. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes\. .sp This is usually the file /etc/ctdb/public_addresses .RE .PP -\-\-event\-script\-dir= -.RS 3n -This option is used to specify the directory where the CTDB event scripts are stored. -.sp -This will normally be /etc/ctdb/events.d which is part of the ctdb distribution. 
+\-\-public_interface= +.RS 4 +This option tells ctdb which interface to attach public\-addresses to and also where to attach the single\-public\-ip when used\. .RE .PP -\-\-logfile= -.RS 3n -This is the file where ctdbd will write its log. This is usually /var/log/log.ctdb . +\-\-reclock= +.RS 4 +This is the name of the lock file stored of the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery\-master\. This file must be stored on shared storage\. .RE .PP -\-\-listen=
-.RS 3n -This specifies which ip address ctdb will bind to. By default ctdbd will bind to the first address it finds in the /etc/ctdb/nodes file and which is also present on the local system in which case you do not need to provide this option. +\-\-single\-public\-ip=
+.RS 4 +This option is used to activate the "ipmux" functionality of ctdb\. In this mode, all nodes of the cluster will expose a single ip address from all nodes with all incoming traffic to the cluster being passed through the current recmaster\. This functionality is similar to using a load\-balancing switch\. .sp -This option is only required when you want to run multiple ctdbd daemons/nodes on the same physical host in which case there would be multiple entries in /etc/ctdb/nodes what would match a local interface. -.RE -.PP -\-\-transport= -.RS 3n -This option specifies which transport to use for ctdbd internode communications. The default is "tcp". +All incoming packets are sent to the recmaster which will multiplex the clients across all available nodes and pass the packets on to a different node in the cluster to manage the connection based on the clients ip address\. Outgoing packets however are sent directly from the node that was choosen back to the client\. Since all incoming packets are sent through the recmaster this will have a throughput and performance impact when used\. This impact in performance primarily affects write\-performance while read\-performance should be mainly unaffected\. Only use this feature if your environment is mostly\-read (i\.e\. most traffic is from the nodes back to the clients) or if it is not important to get maximum write\-performance to the cluster\. .sp -Suported transports are "tcp" and "infiniband". +When using a single public ip, you must also specify the public\-interface so that ctdb knows which interface to attach the single public ip to\. .RE .PP \-\-socket= -.RS 3n -This specifies the name of the domain socket that ctdbd will create. This socket is used for local clients to attach to and communicate with the ctdbd daemon. +.RS 4 +This specifies the name of the domain socket that ctdbd will create\. This socket is used for local clients to attach to and communicate with the ctdbd daemon\. 
.sp -The default is /tmp/ctdb.socket . You only need to use this option if you plan to run multiple ctdbd daemons on the same physical host. +The default is /tmp/ctdb\.socket \. You only need to use this option if you plan to run multiple ctdbd daemons on the same physical host\. .RE .PP -\-d \-\-debug= -.RS 3n -This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile. The default is 0 which will only log important events and errors. A larger number will provide additional logging. +\-\-syslog +.RS 4 +Send all log messages to syslog instead of to the ctdb logfile\. .RE .PP \-\-torture -.RS 3n -This option is only used for development and testing of ctdbd. It adds artificial errors and failures to the common codepaths in ctdbd to verify that ctdbd can recover correctly for failures. +.RS 4 +This option is only used for development and testing of ctdbd\. It adds artificial errors and failures to the common codepaths in ctdbd to verify that ctdbd can recover correctly for failures\. .sp -You do NOT want to use this option unless you are developing and testing new functionality in ctdbd. +You do NOT want to use this option unless you are developing and testing new functionality in ctdbd\. +.RE +.PP +\-\-transport= +.RS 4 +This option specifies which transport to use for ctdbd internode communications\. The default is "tcp"\. +.sp +Suported transports are "tcp" and "infiniband"\. +.RE +.PP +\-\-usage +.RS 4 +Print useage information to the screen\. .RE .SH "PRIVATE VS PUBLIC ADDRESSES" .PP -When used for ip takeover in a HA environment, each node in a ctdb cluster has multiple ip addresses assigned to it. One private and one or more public. +When used for ip takeover in a HA environment, each node in a ctdb cluster has multiple ip addresses assigned to it\. One private and one or more public\. 
.SS "Private address" .PP -This is the physical ip address of the node which is configured in linux and attached to a physical interface. This address uniquely identifies a physical node in the cluster and is the ip addresses that ctdbd will use to communicate with the ctdbd daemons on the other nodes in the cluster. +This is the physical ip address of the node which is configured in linux and attached to a physical interface\. This address uniquely identifies a physical node in the cluster and is the ip addresses that ctdbd will use to communicate with the ctdbd daemons on the other nodes in the cluster\. .PP -The private addresses are configured in /etc/ctdb/nodes (unless the \-\-nlist option is used) and contain one line for each node in the cluster. Each line contains the private ip address for one node in the cluster. This file must be the same on all nodes in the cluster. +The private addresses are configured in /etc/ctdb/nodes (unless the \-\-nlist option is used) and contain one line for each node in the cluster\. Each line contains the private ip address for one node in the cluster\. This file must be the same on all nodes in the cluster\. .PP -Since the private addresses are only available to the network when the corresponding node is up and running you should not use these addresses for clients to connect to services provided by the cluster. Instead client applications should only attach to the public addresses since these are guaranteed to always be available. +Since the private addresses are only available to the network when the corresponding node is up and running you should not use these addresses for clients to connect to services provided by the cluster\. Instead client applications should only attach to the public addresses since these are guaranteed to always be available\. 
.PP -When using ip takeover, it is strongly recommended that the private addresses are configured on a private network physically separated from the rest of the network and that this private network is dedicated to CTDB traffic. +When using ip takeover, it is strongly recommended that the private addresses are configured on a private network physically separated from the rest of the network and that this private network is dedicated to CTDB traffic\. Example /etc/ctdb/nodes for a four node cluster: .sp -.RS 3n +.RS 4 .nf - 10.1.1.1 - 10.1.1.2 - 10.1.1.3 - 10.1.1.4 + 10\.1\.1\.1 + 10\.1\.1\.2 + 10\.1\.1\.3 + 10\.1\.1\.4 .fi .RE .SS "Public address" .PP -A public address on the other hand is not attached to an interface. This address is managed by ctdbd itself and is attached/detached to a physical node at runtime. +A public address on the other hand is not attached to an interface\. This address is managed by ctdbd itself and is attached/detached to a physical node at runtime\. .PP -The ctdb cluster will assign/reassign these public addresses across the available healthy nodes in the cluster. When one node fails, its public address will be migrated to and taken over by a different node in the cluster to ensure that all public addresses are always available to clients as long as there are still nodes available capable of hosting this address. +The ctdb cluster will assign/reassign these public addresses across the available healthy nodes in the cluster\. When one node fails, its public address will be migrated to and taken over by a different node in the cluster to ensure that all public addresses are always available to clients as long as there are still nodes available capable of hosting this address\. .PP -These addresses are not physically attached to a specific node. The 'ctdb ip' command can be used to view the current assignment of public addresses and which physical node is currently serving it. 
+These addresses are not physically attached to a specific node\. The \'ctdb ip\' command can be used to view the current assignment of public addresses and which physical node is currently serving it\. .PP -On each node this file contains a list of the public addresses that this node is capable of hosting. The list also contain the netmask and the interface where this address should be attached for the case where you may want to serve data out through multiple different interfaces. +On each node this file contains a list of the public addresses that this node is capable of hosting\. The list also contain the netmask and the interface where this address should be attached for the case where you may want to serve data out through multiple different interfaces\. Example /etc/ctdb/public_addresses for a node that can host 4 public addresses: .sp -.RS 3n +.RS 4 .nf - 11.1.1.1/24 eth0 - 11.1.1.2/24 eth0 - 11.1.2.1/24 eth1 - 11.1.2.2/24 eth1 + 11\.1\.1\.1/24 eth0 + 11\.1\.1\.2/24 eth0 + 11\.1\.2\.1/24 eth1 + 11\.1\.2\.2/24 eth1 .fi .RE .PP -In most cases this file would be the same on all nodes in a cluster but there are exceptions when one may want to use different files on different nodes. +In most cases this file would be the same on all nodes in a cluster but there are exceptions when one may want to use different files on different nodes\. 
Example: 4 nodes partitioned into two subgroups : .sp -.RS 3n +.RS 4 .nf Node 0:/etc/ctdb/public_addresses - 10.1.1.1/24 eth0 - 10.1.1.2/24 eth0 + 10\.1\.1\.1/24 eth0 + 10\.1\.1\.2/24 eth0 Node 1:/etc/ctdb/public_addresses - 10.1.1.1/24 eth0 - 10.1.1.2/24 eth0 + 10\.1\.1\.1/24 eth0 + 10\.1\.1\.2/24 eth0 Node 2:/etc/ctdb/public_addresses - 10.2.1.1/24 eth0 - 10.2.1.2/24 eth0 + 10\.2\.1\.1/24 eth0 + 10\.2\.1\.2/24 eth0 Node 3:/etc/ctdb/public_addresses - 10.2.1.1/24 eth0 - 10.2.1.2/24 eth0 + 10\.2\.1\.1/24 eth0 + 10\.2\.1\.2/24 eth0 .fi .RE .PP -In this example nodes 0 and 1 host two public addresses on the 10.1.1.x network while nodes 2 and 3 host two public addresses for the 10.2.1.x network. +In this example nodes 0 and 1 host two public addresses on the 10\.1\.1\.x network while nodes 2 and 3 host two public addresses for the 10\.2\.1\.x network\. .PP -Ip address 10.1.1.1 can be hosted by either of nodes 0 or 1 and will be available to clients as long as at least one of these two nodes are available. If both nodes 0 and node 1 become unavailable 10.1.1.1 also becomes unavailable. 10.1.1.1 can not be failed over to node 2 or node 3 since these nodes do not have this ip address listed in their public addresses file. +Ip address 10\.1\.1\.1 can be hosted by either of nodes 0 or 1 and will be available to clients as long as at least one of these two nodes are available\. If both nodes 0 and node 1 become unavailable 10\.1\.1\.1 also becomes unavailable\. 10\.1\.1\.1 can not be failed over to node 2 or node 3 since these nodes do not have this ip address listed in their public addresses file\. .SH "NODE STATUS" .PP -The current status of each node in the cluster can be viewed by the 'ctdb status' command. +The current status of each node in the cluster can be viewed by the \'ctdb status\' command\. .PP -There are five possible for a node. +There are five possible for a node\. .PP -OK \- This node is fully functional. +OK \- This node is fully functional\. 
.PP -DISCONNECTED \- This node could not be connected through the network and is currently not particpating in the cluster. If there is a public IP address associated with this node it should have been taken over by a different node. No services are running on this node. +DISCONNECTED \- This node could not be connected through the network and is currently not particpating in the cluster\. If there is a public IP address associated with this node it should have been taken over by a different node\. No services are running on this node\. .PP -DISABLED \- This node has been administratively disabled. This node is still functional and participates in the CTDB cluster but its IP addresses have been taken over by a different node and no services are currently being hosted. +DISABLED \- This node has been administratively disabled\. This node is still functional and participates in the CTDB cluster but its IP addresses have been taken over by a different node and no services are currently being hosted\. .PP -UNHEALTHY \- A service provided by this node is malfunctioning and should be investigated. The CTDB daemon itself is operational and participates in the cluster. Its public IP address has been taken over by a different node and no services are currently being hosted. All unhealthy nodes should be investigated and require an administrative action to rectify. +UNHEALTHY \- A service provided by this node is malfunctioning and should be investigated\. The CTDB daemon itself is operational and participates in the cluster\. Its public IP address has been taken over by a different node and no services are currently being hosted\. All unhealthy nodes should be investigated and require an administrative action to rectify\. .PP -BANNED \- This node failed too many recovery attempts and has been banned from participating in the cluster for a period of RecoveryBanPeriod seconds. Any public IP address has been taken over by other nodes. This node does not provide any services. 
All banned nodes should be investigated and require an administrative action to rectify. This node does not perticipate in the CTDB cluster but can still be communicated with. I.e. ctdb commands can be sent to it. +BANNED \- This node failed too many recovery attempts and has been banned from participating in the cluster for a period of RecoveryBanPeriod seconds\. Any public IP address has been taken over by other nodes\. This node does not provide any services\. All banned nodes should be investigated and require an administrative action to rectify\. This node does not perticipate in the CTDB cluster but can still be communicated with\. I\.e\. ctdb commands can be sent to it\. .SH "SEE ALSO" .PP ctdb(1), onnode(1) \fI\%http://ctdb.samba.org/\fR .SH "COPYRIGHT/LICENSE" .sp -.RS 3n +.RS 4 .nf Copyright (C) Andrew Tridgell 2007 Copyright (C) Ronnie sahlberg 2007 @@ -221,14 +252,14 @@ Copyright (C) Ronnie sahlberg 2007 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at -your option) any later version. +your option) any later version\. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -General Public License for more details. +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE\. See the GNU +General Public License for more details\. You should have received a copy of the GNU General Public License -along with this program; if not, see http://www.gnu.org/licenses/. +along with this program; if not, see http://www\.gnu\.org/licenses/\. .fi .RE diff --git a/ctdb/doc/ctdbd.1.html b/ctdb/doc/ctdbd.1.html index 8a5059e7301..e70f8206b72 100644 --- a/ctdb/doc/ctdbd.1.html +++ b/ctdb/doc/ctdbd.1.html @@ -1,4 +1,4 @@ -ctdbd

Name

ctdbd — The CTDB cluster daemon

Synopsis

ctdbd

ctdbd {--reclock=<filename>} {--nlist=<filename>} {--dbdir=<directory>} [-? --help] [--usage] [-i --interactive] [--public-addresses=<filename>] [--event-script-dir=<directory>] [--logfile=<filename>] [--listen=<address>] [--transport=<STRING>] [--socket=<filename>] [-d --debug=<INTEGER>] [--torture]

DESCRIPTION

+ctdbd

Name

ctdbd — The CTDB cluster daemon

Synopsis

ctdbd

ctdbd [-? --help] [-d --debug=<INTEGER>] {--dbdir=<directory>} {--dbdir-persistent=<directory>} [--event-script-dir=<directory>] [-i --interactive] [--listen=<address>] [--logfile=<filename>] {--nlist=<filename>} [--nosetsched] [--public-addresses=<filename>] [--public-interface=<interface>] {--reclock=<filename>} [--single-public-ip=<address>] [--socket=<filename>] [--syslog] [--torture] [--transport=<STRING>] [--usage]

DESCRIPTION

ctdbd is the main ctdb daemon.

ctdbd provides a clustered version of the TDB database with automatic rebuild/recovery of the databases upon nodefailures. @@ -8,58 +8,96 @@ ctdbd provides monitoring of all nodes in the cluster and automatically reconfigures the cluster and recovers upon node failures.

ctdbd is the main component in clustered Samba that provides a high-awailability load-sharing CIFS server cluster. -

OPTIONS

-? --help

+

OPTIONS

-? --help

Print some help text to the screen. -

--usage

- Print useage information to the screen. -

--reclock=<filename>

- This is the name of the lock file stored of the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery-master. - This file must be stored on shared storage. -

--nlist=<filename>

- This file contains a list of the private ip addresses of every node in the cluster. There is one line/ip address for each node. This file must be the same for all nodes in the cluster. -

- This file is usually /etc/ctdb/nodes . +

-d --debug=<DEBUGLEVEL>

+ This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile. The default is 0 which will only log important events and errors. A larger number will provide additional logging.

--dbdir=<directory>

This is the directory on local storage where ctdbd keeps the local copy of the TDB databases. This directory is local for each node and should not be stored on the shared cluster filesystem.

This directory would usually be /var/ctdb . -

-i --interactive

- By default ctdbd will detach itself from the shell and run in - the background as a daemon. This option makes ctdbd to start in interactive mode. -

--public_addresses=<filename>

- When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster. This file contains a list of ip addresses netmasks and interfaces. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes. +

--dbdir-persistent=<directory>

+ This is the directory on local storage where ctdbd keeps the local + copy of the persistent TDB databases. This directory is local for each node and should not be stored on the shared cluster filesystem.

- This is usually the file /etc/ctdb/public_addresses + This directory would usually be /etc/ctdb/persistent .

--event-script-dir=<directory>

This option is used to specify the directory where the CTDB event scripts are stored.

This will normally be /etc/ctdb/events.d which is part of the ctdb distribution. -

--logfile=<filename>

- This is the file where ctdbd will write its log. This is usually /var/log/log.ctdb . +

-i --interactive

+ By default ctdbd will detach itself from the shell and run in + the background as a daemon. This option makes ctdbd to start in interactive mode.

--listen=<address>

This specifies which ip address ctdb will bind to. By default ctdbd will bind to the first address it finds in the /etc/ctdb/nodes file and which is also present on the local system in which case you do not need to provide this option.

This option is only required when you want to run multiple ctdbd daemons/nodes on the same physical host in which case there would be multiple entries in /etc/ctdb/nodes what would match a local interface. -

--transport=<STRING>

- This option specifies which transport to use for ctdbd internode communications. The default is "tcp". +

--logfile=<filename>

+ This is the file where ctdbd will write its log. This is usually /var/log/log.ctdb . +

--nlist=<filename>

+ This file contains a list of the private ip addresses of every node in the cluster. There is one line/ip address for each node. This file must be the same for all nodes in the cluster.

- Suported transports are "tcp" and "infiniband". + This file is usually /etc/ctdb/nodes . +

--nosetsched

+ Normally ctdb will change its scheduler to run as a real-time + process. This option is used to change this behaviour and have + ctdb run as a normal process. +

--public_addresses=<filename>

+ When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster. This file contains a list of ip addresses netmasks and interfaces. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes. +

+ This is usually the file /etc/ctdb/public_addresses +

--public_interface=<interface>

+ This option tells ctdb which interface to attach public-addresses + to and also where to attach the single-public-ip when used. +

--reclock=<filename>

+ This is the name of the lock file stored of the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery-master. + This file must be stored on shared storage. +

--single-public-ip=<address>

+ This option is used to activate the "ipmux" functionality of ctdb. + In this mode, all nodes of the cluster will expose a single + ip address from all nodes with all incoming traffic to the cluster + being passed through the current recmaster. This functionality + is similar to using a load-balancing switch. +

+ All incoming packets are sent to the recmaster which will multiplex + the clients across all available nodes and pass the packets on to + a different node in the cluster to manage the connection based + on the clients ip address. Outgoing packets however are sent + directly from the node that was choosen back to the client. + Since all incoming packets are sent through the recmaster this will + have a throughput and performance impact when used. This impact + in performance primarily affects write-performance while + read-performance should be mainly unaffected. + Only use this feature if your environment is mostly-read + (i.e. most traffic is from the nodes back to the clients) or + if it is not important to get maximum write-performance to the + cluster. +

+ When using a single public ip, you must also specify the + public-interface so that ctdb knows which interface to attach the + single public ip to.

--socket=<filename>

This specifies the name of the domain socket that ctdbd will create. This socket is used for local clients to attach to and communicate with the ctdbd daemon.

The default is /tmp/ctdb.socket . You only need to use this option if you plan to run multiple ctdbd daemons on the same physical host. -

-d --debug=<DEBUGLEVEL>

- This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile. The default is 0 which will only log important events and errors. A larger number will provide additional logging. +

--syslog

+ Send all log messages to syslog instead of to the ctdb logfile.

--torture

This option is only used for development and testing of ctdbd. It adds artificial errors and failures to the common codepaths in ctdbd to verify that ctdbd can recover correctly for failures.

You do NOT want to use this option unless you are developing and testing new functionality in ctdbd. -

Private vs Public addresses

+

--transport=<STRING>

+ This option specifies which transport to use for ctdbd internode communications. The default is "tcp". +

+ Supported transports are "tcp" and "infiniband". +

--usage

+ Print usage information to the screen. +

Private vs Public addresses

When used for ip takeover in a HA environment, each node in a ctdb cluster has multiple ip addresses assigned to it. One private and one or more public. -

Private address

+

Private address

This is the physical ip address of the node which is configured in linux and attached to a physical interface. This address uniquely identifies a physical node in the cluster and is the ip addresses @@ -89,7 +127,7 @@ 10.1.1.2 10.1.1.3 10.1.1.4 -

Public address

+

Public address

A public address on the other hand is not attached to an interface. This address is managed by ctdbd itself and is attached/detached to a physical node at runtime. @@ -150,7 +188,7 @@ unavailable. 10.1.1.1 can not be failed over to node 2 or node 3 since these nodes do not have this ip address listed in their public addresses file. -

Node status

+

Node status

The current status of each node in the cluster can be viewed by the 'ctdb status' command.

@@ -181,10 +219,10 @@ investigated and require an administrative action to rectify. This node does not participate in the CTDB cluster but can still be communicated with. I.e. ctdb commands can be sent to it. -

SEE ALSO

+

SEE ALSO

ctdb(1), onnode(1) - http://ctdb.samba.org/ -

COPYRIGHT/LICENSE

COPYRIGHT/LICENSE


Copyright (C) Andrew Tridgell 2007
Copyright (C) Ronnie sahlberg 2007

From 96100fcae632731bbafa831a79a48922005c8238 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Tue, 8 Jan 2008 17:23:27 +1100 Subject: [PATCH 05/13] added two new ctdb commands: ctdb vacuum : vacuums all the databases, deleting any zero length ctdb records ctdb repack : repacks all the databases, resulting in a perfectly packed database with no freelist entries (This used to be ctdb commit 3532119c84ab3247051ed6ba21ba3243ae2f6bf4) --- ctdb/Makefile.in | 4 +- ctdb/client/ctdb_client.c | 43 ++- ctdb/include/ctdb.h | 8 +- ctdb/include/ctdb_private.h | 8 + ctdb/lib/tdb/common/freelist.c | 23 ++ ctdb/lib/tdb/include/tdb.h | 1 + ctdb/server/ctdb_control.c | 3 + ctdb/server/ctdb_recover.c | 87 +++++ ctdb/server/ctdb_recoverd.c | 91 +++++ ctdb/tools/ctdb.c | 3 +- ctdb/tools/ctdb_vacuum.c | 619 +++++++++++++++++++++++++++++++++ 11 files changed, 883 insertions(+), 7 deletions(-) create mode 100644 ctdb/tools/ctdb_vacuum.c diff --git a/ctdb/Makefile.in b/ctdb/Makefile.in index 0a2e58e5a94..3e2b1e1b192 100644 --- a/ctdb/Makefile.in +++ b/ctdb/Makefile.in @@ -90,9 +90,9 @@ bin/ctdb_ipmux: $(CTDB_CLIENT_OBJ) utils/ipmux/ipmux.o @echo Linking $@ @$(CC) $(CFLAGS) -o $@ utils/ipmux/ipmux.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS) $(IPQ_LIBS) -bin/ctdb: $(CTDB_CLIENT_OBJ) tools/ctdb.o +bin/ctdb: $(CTDB_CLIENT_OBJ) tools/ctdb.o tools/ctdb_vacuum.o @echo Linking $@ - @$(CC) $(CFLAGS) -o $@ tools/ctdb.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS) + @$(CC) $(CFLAGS) -o $@ tools/ctdb.o tools/ctdb_vacuum.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS) bin/smnotify: utils/smnotify/gen_xdr.o utils/smnotify/gen_smnotify.o utils/smnotify/smnotify.o @echo Linking $@ diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c index 20d75d1373e..a6336f9c84b 100644 --- a/ctdb/client/ctdb_client.c +++ b/ctdb/client/ctdb_client.c @@ -155,6 +155,9 @@ struct ctdb_client_call_state { uint32_t reqid; struct ctdb_db_context *ctdb_db; struct ctdb_call call; + struct { + void (*fn)(struct ctdb_client_call_state *); + } async; 
}; /* @@ -187,6 +190,10 @@ static void ctdb_client_reply_call(struct ctdb_context *ctdb, struct ctdb_req_he talloc_steal(state, c); state->state = CTDB_CALL_DONE; + + if (state->async.fn) { + state->async.fn(state); + } } static void ctdb_client_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr); @@ -377,7 +384,8 @@ static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db This call never blocks. */ struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, - struct ctdb_call *call) + struct ctdb_call *call, + void (*callback)(struct ctdb_client_call_state *)) { struct ctdb_client_call_state *state; struct ctdb_context *ctdb = ctdb_db->ctdb; @@ -404,6 +412,9 @@ struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, state = ctdb_client_call_local_send(ctdb_db, call, &header, &data); talloc_free(data.dptr); ctdb_ltdb_unlock(ctdb_db, call->key); + if (state) { + state->async.fn = callback; + } return state; } @@ -446,6 +457,8 @@ struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, ctdb_client_queue_pkt(ctdb, &c->hdr); + state->async.fn = callback; + return state; } @@ -457,7 +470,7 @@ int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call) { struct ctdb_client_call_state *state; - state = ctdb_call_send(ctdb_db, call); + state = ctdb_call_send(ctdb_db, call, NULL); return ctdb_call_recv(state, call); } @@ -1575,6 +1588,22 @@ int ctdb_statistics_reset(struct ctdb_context *ctdb, uint32_t destnode) return 0; } +/* + this is the dummy null procedure that all databases support +*/ +static int ctdb_null_func(struct ctdb_call_info *call) +{ + return 0; +} + +/* + this is a plain fetch procedure that all databases support +*/ +static int ctdb_fetch_func(struct ctdb_call_info *call) +{ + call->reply_data = &call->record_data; + return 0; +} /* attach to a specific database - client call @@ -1632,6 +1661,10 @@ struct ctdb_db_context *ctdb_attach(struct 
ctdb_context *ctdb, const char *name, DLIST_ADD(ctdb->db_list, ctdb_db); + /* add well known functions */ + ctdb_set_call(ctdb_db, ctdb_null_func, CTDB_NULL_FUNC); + ctdb_set_call(ctdb_db, ctdb_fetch_func, CTDB_FETCH_FUNC); + return ctdb_db; } @@ -1641,12 +1674,15 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name, */ int ctdb_set_call(struct ctdb_db_context *ctdb_db, ctdb_fn_t fn, uint32_t id) { + struct ctdb_registered_call *call; + +#if 0 TDB_DATA data; int32_t status; struct ctdb_control_set_call c; int ret; - struct ctdb_registered_call *call; + /* this is no longer valid with the separate daemon architecture */ c.db_id = ctdb_db->db_id; c.fn = fn; c.id = id; @@ -1660,6 +1696,7 @@ int ctdb_set_call(struct ctdb_db_context *ctdb_db, ctdb_fn_t fn, uint32_t id) DEBUG(0,("ctdb_set_call failed for call %u\n", id)); return -1; } +#endif /* also register locally */ call = talloc(ctdb_db, struct ctdb_registered_call); diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h index 14f75b4c822..ed38535224b 100644 --- a/ctdb/include/ctdb.h +++ b/ctdb/include/ctdb.h @@ -85,6 +85,11 @@ struct ctdb_call_info { */ #define CTDB_SRVID_UNBAN_NODE 0xF600000000000000LL +/* + a message to tell the recovery daemon to fetch a set of records + */ +#define CTDB_SRVID_VACUUM_FETCH 0xF700000000000000LL + /* used on the domain socket, send a pdu to the local daemon */ #define CTDB_CURRENT_NODE 0xF0000001 @@ -225,7 +230,8 @@ int ctdb_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid, int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call); -struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, struct ctdb_call *call); +struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, struct ctdb_call *call, + void (*callback)(struct ctdb_client_call_state *)); int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call); /* send a ctdb message */ diff --git 
a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index 9582b6dade2..aa4cc96c98e 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -484,6 +484,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0, CTDB_CONTROL_TRANSACTION_START = 65, CTDB_CONTROL_TRANSACTION_COMMIT = 66, CTDB_CONTROL_WIPE_DATABASE = 67, + CTDB_CONTROL_DELETE_RECORD = 68, }; /* @@ -1187,4 +1188,11 @@ int32_t ctdb_control_transaction_start(struct ctdb_context *ctdb, uint32_t id); int32_t ctdb_control_transaction_commit(struct ctdb_context *ctdb, uint32_t id); int32_t ctdb_control_wipe_database(struct ctdb_context *ctdb, TDB_DATA indata); + +int ctdb_vacuum(struct ctdb_context *ctdb, int argc, const char **argv); +int ctdb_repack(struct ctdb_context *ctdb, int argc, const char **argv); + +int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata); + + #endif diff --git a/ctdb/lib/tdb/common/freelist.c b/ctdb/lib/tdb/common/freelist.c index 48e64c2b4cf..358545ed575 100644 --- a/ctdb/lib/tdb/common/freelist.c +++ b/ctdb/lib/tdb/common/freelist.c @@ -342,3 +342,26 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_st return 0; } + + +/* + return the size of the freelist - used to decide if we should repack +*/ +int tdb_freelist_size(struct tdb_context *tdb) +{ + tdb_off_t ptr; + int count=0; + + if (tdb_lock(tdb, -1, F_RDLCK) == -1) { + return -1; + } + + ptr = FREELIST_TOP; + while (ptr != 0 && tdb_ofs_read(tdb, ptr, &ptr) == 0) { + count++; + + } + + tdb_unlock(tdb, -1, F_RDLCK); + return count; +} diff --git a/ctdb/lib/tdb/include/tdb.h b/ctdb/lib/tdb/include/tdb.h index f6d4b4b1f45..371381049e9 100644 --- a/ctdb/lib/tdb/include/tdb.h +++ b/ctdb/lib/tdb/include/tdb.h @@ -156,6 +156,7 @@ void tdb_dump_all(struct tdb_context *tdb); int tdb_printfreelist(struct tdb_context *tdb); int tdb_validate_freelist(struct tdb_context *tdb, int *pnum_entries); int tdb_wipe_all(struct tdb_context *tdb); +int 
tdb_freelist_size(struct tdb_context *tdb); extern TDB_DATA tdb_null; diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c index f2fd6ee641f..4e013a530ea 100644 --- a/ctdb/server/ctdb_control.c +++ b/ctdb/server/ctdb_control.c @@ -321,6 +321,9 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_wipe_database)); return ctdb_control_wipe_database(ctdb, indata); + case CTDB_CONTROL_DELETE_RECORD: + return ctdb_control_delete_record(ctdb, indata); + default: DEBUG(0,(__location__ " Unknown CTDB control opcode %u\n", opcode)); return -1; diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 7f165cf73af..97a7d0251f5 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -636,3 +636,90 @@ bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep) } +/* + delete a record as part of the vacuum process + only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn + use non-blocking locks + */ +int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata) +{ + struct ctdb_rec_data *rec = (struct ctdb_rec_data *)indata.dptr; + struct ctdb_db_context *ctdb_db; + TDB_DATA key, data; + struct ctdb_ltdb_header *hdr, *hdr2; + + if (indata.dsize < sizeof(uint32_t) || indata.dsize != rec->length) { + DEBUG(0,(__location__ " Bad record size in ctdb_control_delete_record\n")); + return -1; + } + + ctdb_db = find_ctdb_db(ctdb, rec->reqid); + if (!ctdb_db) { + DEBUG(0,(__location__ " Unknown db 0x%08x\n", rec->reqid)); + return -1; + } + + key.dsize = rec->keylen; + key.dptr = &rec->data[0]; + data.dsize = rec->datalen; + data.dptr = &rec->data[rec->keylen]; + + if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) { + DEBUG(2,(__location__ " Called delete on record where we are lmaster\n")); + return -1; + } + + if (data.dsize != sizeof(struct ctdb_ltdb_header)) { + DEBUG(0,(__location__ " Bad record size\n")); + return -1; + } + + hdr 
= (struct ctdb_ltdb_header *)data.dptr; + + /* use a non-blocking lock */ + if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) { + return -1; + } + + data = tdb_fetch(ctdb_db->ltdb->tdb, key); + if (data.dptr == NULL) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + return 0; + } + + if (data.dsize < sizeof(struct ctdb_ltdb_header)) { + tdb_delete(ctdb_db->ltdb->tdb, key); + DEBUG(0,(__location__ " Deleted corrupt record\n")); + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + free(data.dptr); + return 0; + } + + hdr2 = (struct ctdb_ltdb_header *)data.dptr; + + if (hdr2->rsn > hdr->rsn) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + DEBUG(2,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n", + (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn)); + free(data.dptr); + return -1; + } + + if (hdr2->dmaster == ctdb->pnn) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + DEBUG(2,(__location__ " Attempted delete record where we are the dmaster\n")); + free(data.dptr); + return -1; + } + + if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + DEBUG(2,(__location__ " Failed to delete record\n")); + free(data.dptr); + return -1; + } + + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + free(data.dptr); + return 0; +} diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 8dbf46932b2..c7086468add 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -701,6 +701,94 @@ static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid, } +/* + called when a vacuum fetch has completed - just free it + */ +static void vacuum_fetch_callback(struct ctdb_client_call_state *state) +{ + talloc_free(state); +} + + +/* + handler for vacuum fetch +*/ +static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid, + TDB_DATA data, void *private_data) +{ + struct ctdb_call call; + struct ctdb_control_pulldb_reply *recs; + int ret, i; + TALLOC_CTX *tmp_ctx = 
talloc_new(ctdb); + const char *name; + struct ctdb_dbid_map *dbmap=NULL; + bool persistent = false; + struct ctdb_db_context *ctdb_db; + struct ctdb_rec_data *r; + + recs = (struct ctdb_control_pulldb_reply *)data.dptr; + + /* work out if the database is persistent */ + ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap); + if (ret != 0) { + DEBUG(0, (__location__ " Unable to get dbids from local node\n")); + talloc_free(tmp_ctx); + return; + } + + for (i=0;inum;i++) { + if (dbmap->dbs[i].dbid == recs->db_id) { + persistent = dbmap->dbs[i].persistent; + break; + } + } + if (i == dbmap->num) { + DEBUG(0, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id)); + talloc_free(tmp_ctx); + return; + } + + /* find the name of this database */ + if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) { + DEBUG(0,(__location__ " Failed to get name of db 0x%x\n", recs->db_id)); + talloc_free(tmp_ctx); + return; + } + + /* attach to it */ + ctdb_db = ctdb_attach(ctdb, name, persistent); + if (ctdb_db == NULL) { + DEBUG(0,(__location__ " Failed to attach to database '%s'\n", name)); + talloc_free(tmp_ctx); + return; + } + + + ZERO_STRUCT(call); + call.call_id = CTDB_NULL_FUNC; + call.flags = CTDB_IMMEDIATE_MIGRATION; + + r = (struct ctdb_rec_data *)&recs->data[0]; + + for (i=0; + icount; + r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r), i++) { + struct ctdb_client_call_state *state; + + call.key.dptr = &r->data[0]; + call.key.dsize = r->keylen; + + state = ctdb_call_send(ctdb_db, &call, vacuum_fetch_callback); + if (state == NULL) { + DEBUG(0,(__location__ " Failed to setup vacuum fetch call\n")); + talloc_free(tmp_ctx); + return; + } + } + + talloc_free(tmp_ctx); +} + /* called when ctdb_wait_timeout should finish @@ -1806,6 +1894,9 @@ static void monitor_cluster(struct ctdb_context *ctdb) /* and one for when nodes are unbanned */ ctdb_set_message_handler(ctdb, 
CTDB_SRVID_UNBAN_NODE, unban_handler, rec); + + /* register a message port for vacuum fetch */ + ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec); again: if (mem_ctx) { diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c index a1ce35aace2..6255d7bf642 100644 --- a/ctdb/tools/ctdb.c +++ b/ctdb/tools/ctdb.c @@ -1025,7 +1025,6 @@ static int control_dumpmemory(struct ctdb_context *ctdb, int argc, const char ** CTDB_CTRL_FLAG_NOREPLY, tdb_null, NULL, NULL, NULL, NULL, NULL); } - static const struct { const char *name; int (*fn)(struct ctdb_context *, int, const char **); @@ -1068,6 +1067,8 @@ static const struct { { "unregsrvid", unregsrvid, false, "unregister a server id", " " }, { "chksrvid", chksrvid, false, "check if a server id exists", " " }, { "getsrvids", getsrvids, false, "get a list of all server ids"}, + { "vacuum", ctdb_vacuum, false, "vacuum the databases of empty records", "[max_records]"}, + { "repack", ctdb_repack, false, "repack all databases", "[max_freelist]"}, }; /* diff --git a/ctdb/tools/ctdb_vacuum.c b/ctdb/tools/ctdb_vacuum.c new file mode 100644 index 00000000000..6e88bac6eda --- /dev/null +++ b/ctdb/tools/ctdb_vacuum.c @@ -0,0 +1,619 @@ +/* + ctdb control tool - database vacuum + + Copyright (C) Andrew Tridgell 2008 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . 
+*/ + +#include "includes.h" +#include "lib/events/events.h" +#include "system/filesys.h" +#include "system/network.h" +#include "../include/ctdb.h" +#include "../include/ctdb_private.h" +#include "db_wrap.h" + +/* should be tunable */ +#define TIMELIMIT() timeval_current_ofs(10, 0) + +struct async_data { + uint32_t count; + uint32_t fail_count; +}; + +static void async_callback(struct ctdb_client_control_state *state) +{ + struct async_data *data = talloc_get_type(state->async.private_data, struct async_data); + int ret; + int32_t res; + + /* one more node has responded with recmode data */ + data->count--; + + /* if we failed to push the db, then return an error and let + the main loop try again. + */ + if (state->state != CTDB_CONTROL_DONE) { + data->fail_count++; + return; + } + + state->async.fn = NULL; + + ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL); + if ((ret != 0) || (res != 0)) { + data->fail_count++; + } +} + +static void async_add(struct async_data *data, struct ctdb_client_control_state *state) +{ + /* set up the callback functions */ + state->async.fn = async_callback; + state->async.private_data = data; + + /* one more control to wait for to complete */ + data->count++; +} + + +/* wait for up to the maximum number of seconds allowed + or until all nodes we expect a response from has replied +*/ +static int async_wait(struct ctdb_context *ctdb, struct async_data *data) +{ + while (data->count > 0) { + event_loop_once(ctdb->ev); + } + if (data->fail_count != 0) { + DEBUG(0,("Async wait failed - fail_count=%u\n", data->fail_count)); + return -1; + } + return 0; +} + +/* + perform a simple control on nodes in the vnn map except ourselves. 
+ The control cannot return data + */ +static int async_control_on_vnnmap(struct ctdb_context *ctdb, enum ctdb_controls opcode, + TDB_DATA data) +{ + struct async_data *async_data; + struct ctdb_client_control_state *state; + int j; + struct timeval timeout = TIMELIMIT(); + + async_data = talloc_zero(ctdb, struct async_data); + CTDB_NO_MEMORY_FATAL(ctdb, async_data); + + /* loop over all active nodes and send an async control to each of them */ + for (j=0; jvnn_map->size; j++) { + uint32_t pnn = ctdb->vnn_map->map[j]; + if (pnn == ctdb->pnn) { + continue; + } + state = ctdb_control_send(ctdb, pnn, 0, opcode, + 0, data, async_data, NULL, &timeout, NULL); + if (state == NULL) { + DEBUG(0,(__location__ " Failed to call async control %u\n", (unsigned)opcode)); + talloc_free(async_data); + return -1; + } + + async_add(async_data, state); + } + + if (async_wait(ctdb, async_data) != 0) { + talloc_free(async_data); + return -1; + } + + talloc_free(async_data); + return 0; +} + + +/* + vacuum one record + */ +static int ctdb_vacuum_one(struct ctdb_context *ctdb, TDB_DATA key, struct ctdb_db_context *ctdb_db) +{ + TDB_DATA data; + struct ctdb_ltdb_header *hdr; + struct ctdb_rec_data *rec; + uint64_t rsn; + + if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) { + /* the chain is busy - come back later */ + return 0; + } + + data = tdb_fetch(ctdb_db->ltdb->tdb, key); + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + if (data.dptr == NULL) { + return 0; + } + if (data.dsize != sizeof(struct ctdb_ltdb_header)) { + free(data.dptr); + return 0; + } + + + hdr = (struct ctdb_ltdb_header *)data.dptr; + rsn = hdr->rsn; + + /* if we are not the lmaster and the dmaster then skip the record */ + if (hdr->dmaster != ctdb->pnn || + ctdb_lmaster(ctdb, &key) != ctdb->pnn) { + free(data.dptr); + return 0; + } + + rec = ctdb_marshall_record(ctdb, ctdb_db->db_id, key, hdr, tdb_null); + free(data.dptr); + if (rec == NULL) { + /* try it again later */ + return 0; + } + + data.dptr = (void 
*)rec; + data.dsize = rec->length; + + if (async_control_on_vnnmap(ctdb, CTDB_CONTROL_DELETE_RECORD, data) != 0) { + /* one or more nodes failed to delete a record - no problem! */ + talloc_free(rec); + return 0; + } + + talloc_free(rec); + + /* its deleted on all other nodes - refetch, check and delete */ + if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) { + /* the chain is busy - come back later */ + return 0; + } + + data = tdb_fetch(ctdb_db->ltdb->tdb, key); + if (data.dptr == NULL) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + return 0; + } + if (data.dsize != sizeof(struct ctdb_ltdb_header)) { + free(data.dptr); + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + return 0; + } + + hdr = (struct ctdb_ltdb_header *)data.dptr; + + /* if we are not the lmaster and the dmaster then skip the record */ + if (hdr->dmaster != ctdb->pnn || + ctdb_lmaster(ctdb, &key) != ctdb->pnn || + rsn != hdr->rsn) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + free(data.dptr); + return 0; + } + + tdb_delete(ctdb_db->ltdb->tdb, key); + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + free(data.dptr); + + return 0; +} + + +/* + vacuum records for which we are the lmaster + */ +static int ctdb_vacuum_local(struct ctdb_context *ctdb, struct ctdb_control_pulldb_reply *list, + struct ctdb_db_context *ctdb_db) +{ + struct ctdb_rec_data *r; + int i; + + r = (struct ctdb_rec_data *)&list->data[0]; + + for (i=0; + icount; + r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r), i++) { + TDB_DATA key; + key.dptr = &r->data[0]; + key.dsize = r->keylen; + if (ctdb_vacuum_one(ctdb, key, ctdb_db) != 0) { + return -1; + } + } + + return 0; +} + +/* + a list of records to possibly delete + */ +struct vacuum_data { + uint32_t vacuum_limit; + struct ctdb_context *ctdb; + struct ctdb_control_pulldb_reply **list; + bool traverse_error; + uint32_t total; +}; + +/* + traverse function for vacuuming + */ +static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private) +{ 
+ struct vacuum_data *vdata = talloc_get_type(private, struct vacuum_data); + uint32_t lmaster; + struct ctdb_ltdb_header *hdr; + struct ctdb_rec_data *rec; + size_t old_size; + + lmaster = ctdb_lmaster(vdata->ctdb, &key); + if (lmaster >= vdata->ctdb->vnn_map->size) { + return 0; + } + + if (data.dsize != sizeof(struct ctdb_ltdb_header)) { + /* its not a deleted record */ + return 0; + } + + hdr = (struct ctdb_ltdb_header *)data.dptr; + + if (hdr->dmaster != vdata->ctdb->pnn) { + return 0; + } + + + /* add the record to the blob ready to send to the nodes */ + rec = ctdb_marshall_record(vdata->list[lmaster], 0, key, NULL, tdb_null); + if (rec == NULL) { + DEBUG(0,(__location__ " Out of memory\n")); + vdata->traverse_error = true; + return -1; + } + old_size = talloc_get_size(vdata->list[lmaster]); + vdata->list[lmaster] = talloc_realloc_size(NULL, vdata->list[lmaster], + old_size + rec->length); + if (vdata->list[lmaster] == NULL) { + DEBUG(0,(__location__ " Failed to expand\n")); + vdata->traverse_error = true; + return -1; + } + vdata->list[lmaster]->count++; + memcpy(old_size+(uint8_t *)vdata->list[lmaster], rec, rec->length); + talloc_free(rec); + + vdata->total++; + + /* don't gather too many records */ + if (vdata->vacuum_limit != 0 && + vdata->total == vdata->vacuum_limit) { + return -1; + } + + return 0; +} + + +/* vacuum one database */ +static int ctdb_vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb_node_map *map, + bool persistent, uint32_t vacuum_limit) +{ + struct ctdb_db_context *ctdb_db; + const char *name; + struct vacuum_data *vdata; + int i; + + vdata = talloc_zero(ctdb, struct vacuum_data); + if (vdata == NULL) { + DEBUG(0,(__location__ " Out of memory\n")); + return -1; + } + + vdata->ctdb = ctdb; + vdata->vacuum_limit = vacuum_limit; + + if (ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, db_id, vdata, &name) != 0) { + DEBUG(0,(__location__ " Failed to get name of db 0x%x\n", db_id)); + talloc_free(vdata); + return 
-1; + } + + ctdb_db = ctdb_attach(ctdb, name, persistent); + if (ctdb_db == NULL) { + DEBUG(0,(__location__ " Failed to attach to database '%s'\n", name)); + talloc_free(vdata); + return -1; + } + + /* the list needs to be of length num_nodes */ + vdata->list = talloc_array(vdata, struct ctdb_control_pulldb_reply *, ctdb->vnn_map->size); + if (vdata->list == NULL) { + DEBUG(0,(__location__ " Out of memory\n")); + talloc_free(vdata); + return -1; + } + for (i=0;ivnn_map->size;i++) { + vdata->list[i] = (struct ctdb_control_pulldb_reply *) + talloc_zero_size(vdata->list, + offsetof(struct ctdb_control_pulldb_reply, data)); + if (vdata->list[i] == NULL) { + DEBUG(0,(__location__ " Out of memory\n")); + talloc_free(vdata); + return -1; + } + vdata->list[i]->db_id = db_id; + } + + /* traverse, looking for records that might be able to be vacuumed */ + if (tdb_traverse_read(ctdb_db->ltdb->tdb, vacuum_traverse, vdata) == -1 || + vdata->traverse_error) { + DEBUG(0,(__location__ " Traverse error in vacuuming '%s'\n", name)); + talloc_free(vdata); + return -1; + } + + + for (i=0;ivnn_map->size;i++) { + if (vdata->list[i]->count == 0) { + continue; + } + + printf("Found %u records for lmaster %u\n", vdata->list[i]->count, i); + + /* for records where we are not the lmaster, tell the lmaster to fetch the record */ + if (ctdb->vnn_map->map[i] != ctdb->pnn) { + TDB_DATA data; + data.dsize = talloc_get_size(vdata->list[i]); + data.dptr = (void *)vdata->list[i]; + if (ctdb_send_message(ctdb, ctdb->vnn_map->map[i], CTDB_SRVID_VACUUM_FETCH, data) != 0) { + DEBUG(0,(__location__ " Failed to send vacuum fetch message to %u\n", + ctdb->vnn_map->map[i])); + talloc_free(vdata); + return -1; + } + continue; + } + + /* for records where we are the lmaster, we can try to delete them */ + if (ctdb_vacuum_local(ctdb, vdata->list[i], ctdb_db) != 0) { + DEBUG(0,(__location__ " Deletion error in vacuuming '%s'\n", name)); + talloc_free(vdata); + return -1; + } + } + + /* this ensures we run our 
event queue */ + ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE); + + talloc_free(vdata); + + return 0; +} + + +/* + vacuum all our databases + */ +int ctdb_vacuum(struct ctdb_context *ctdb, int argc, const char **argv) +{ + struct ctdb_dbid_map *dbmap=NULL; + struct ctdb_node_map *nodemap=NULL; + int ret, i, pnn; + uint32_t vacuum_limit = 100; + + if (argc > 0) { + vacuum_limit = atoi(argv[0]); + } + + ret = ctdb_ctrl_getdbmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &dbmap); + if (ret != 0) { + DEBUG(0, ("Unable to get dbids from local node\n")); + return ret; + } + + ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap); + if (ret != 0) { + DEBUG(0, ("Unable to get nodemap from local node\n")); + return ret; + } + + ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map); + if (ret != 0) { + DEBUG(0, ("Unable to get vnnmap from local node\n")); + return ret; + } + + pnn = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE); + if (pnn == -1) { + DEBUG(0, ("Unable to get pnn from local node\n")); + return -1; + } + ctdb->pnn = pnn; + + for (i=0;inum;i++) { + if (ctdb_vacuum_db(ctdb, dbmap->dbs[i].dbid, nodemap, + dbmap->dbs[i].persistent, vacuum_limit) != 0) { + DEBUG(0,("Failed to vacuum db 0x%x\n", dbmap->dbs[i].dbid)); + return -1; + } + } + + return 0; +} + +struct traverse_state { + bool error; + struct tdb_context *dest_db; +}; + +/* + traverse function for repacking + */ +static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private) +{ + struct traverse_state *state = (struct traverse_state *)private; + if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) { + state->error = true; + return -1; + } + return 0; +} + +/* + repack a tdb + */ +static int ctdb_repack_tdb(struct tdb_context *tdb) +{ + struct tdb_context *tmp_db; + struct traverse_state state; + + if (tdb_transaction_start(tdb) != 0) { + DEBUG(0,(__location__ " Failed to start 
transaction\n")); + return -1; + } + + tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0); + if (tmp_db == NULL) { + DEBUG(0,(__location__ " Failed to create tmp_db\n")); + tdb_transaction_cancel(tdb); + return -1; + } + + state.error = false; + state.dest_db = tmp_db; + + if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) { + DEBUG(0,(__location__ " Failed to traverse copying out\n")); + tdb_transaction_cancel(tdb); + tdb_close(tmp_db); + return -1; + } + + if (state.error) { + DEBUG(0,(__location__ " Error during traversal\n")); + tdb_transaction_cancel(tdb); + tdb_close(tmp_db); + return -1; + } + + if (tdb_wipe_all(tdb) != 0) { + DEBUG(0,(__location__ " Failed to wipe database\n")); + tdb_transaction_cancel(tdb); + tdb_close(tmp_db); + return -1; + } + + state.error = false; + state.dest_db = tdb; + + if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) { + DEBUG(0,(__location__ " Failed to traverse copying back\n")); + tdb_transaction_cancel(tdb); + tdb_close(tmp_db); + return -1; + } + + if (state.error) { + DEBUG(0,(__location__ " Error during second traversal\n")); + tdb_transaction_cancel(tdb); + tdb_close(tmp_db); + return -1; + } + + tdb_close(tmp_db); + + if (tdb_transaction_commit(tdb) != 0) { + DEBUG(0,(__location__ " Failed to commit\n")); + return -1; + } + + return 0; +} + + +/* repack one database */ +static int ctdb_repack_db(struct ctdb_context *ctdb, uint32_t db_id, + bool persistent, uint32_t repack_limit) +{ + struct ctdb_db_context *ctdb_db; + const char *name; + int size; + + if (ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, db_id, ctdb, &name) != 0) { + DEBUG(0,(__location__ " Failed to get name of db 0x%x\n", db_id)); + return -1; + } + + ctdb_db = ctdb_attach(ctdb, name, persistent); + if (ctdb_db == NULL) { + DEBUG(0,(__location__ " Failed to attach to database '%s'\n", name)); + return -1; + } + + size = tdb_freelist_size(ctdb_db->ltdb->tdb); + if (size == -1) { + 
DEBUG(0,(__location__ " Failed to get freelist size for '%s'\n", name)); + return -1; + } + + if (size <= repack_limit) { + return 0; + } + + DEBUG(0,("Repacking %s with %u freelist entries\n", name, size)); + + if (ctdb_repack_tdb(ctdb_db->ltdb->tdb) != 0) { + DEBUG(0,(__location__ " Failed to repack '%s'\n", name)); + return -1; + } + + return 0; +} + + +/* + repack all our databases + */ +int ctdb_repack(struct ctdb_context *ctdb, int argc, const char **argv) +{ + struct ctdb_dbid_map *dbmap=NULL; + int ret, i; + uint32_t repack_limit = 100; + + if (argc > 0) { + repack_limit = atoi(argv[0]); + } + + ret = ctdb_ctrl_getdbmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &dbmap); + if (ret != 0) { + DEBUG(0, ("Unable to get dbids from local node\n")); + return ret; + } + + for (i=0;inum;i++) { + if (ctdb_repack_db(ctdb, dbmap->dbs[i].dbid, + dbmap->dbs[i].persistent, repack_limit) != 0) { + DEBUG(0,("Failed to repack db 0x%x\n", dbmap->dbs[i].dbid)); + return -1; + } + } + + return 0; +} From 1c91398aeffe0d918cd3065fffe0f348c086a530 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Tue, 8 Jan 2008 21:28:42 +1100 Subject: [PATCH 06/13] ensure the recovery daemon is not clagged up by vacuum calls (This used to be ctdb commit ff7e80e247bf5a86adda0ef850d901478449675b) --- ctdb/client/ctdb_client.c | 23 +----- ctdb/include/ctdb.h | 3 +- ctdb/include/ctdb_private.h | 15 ++++ ctdb/server/ctdb_recoverd.c | 146 ++++++++++++++++++++++++++++++------ ctdb/tools/ctdb_vacuum.c | 4 +- 5 files changed, 142 insertions(+), 49 deletions(-) diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c index a6336f9c84b..7f897325c30 100644 --- a/ctdb/client/ctdb_client.c +++ b/ctdb/client/ctdb_client.c @@ -147,19 +147,6 @@ static int ctdb_client_queue_pkt(struct ctdb_context *ctdb, struct ctdb_req_head } -/* - state of a in-progress ctdb call in client -*/ -struct ctdb_client_call_state { - enum call_state state; - uint32_t reqid; - struct ctdb_db_context *ctdb_db; - struct 
ctdb_call call; - struct { - void (*fn)(struct ctdb_client_call_state *); - } async; -}; - /* called when a CTDB_REPLY_CALL packet comes in in the client @@ -384,8 +371,7 @@ static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db This call never blocks. */ struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, - struct ctdb_call *call, - void (*callback)(struct ctdb_client_call_state *)) + struct ctdb_call *call) { struct ctdb_client_call_state *state; struct ctdb_context *ctdb = ctdb_db->ctdb; @@ -412,9 +398,6 @@ struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, state = ctdb_client_call_local_send(ctdb_db, call, &header, &data); talloc_free(data.dptr); ctdb_ltdb_unlock(ctdb_db, call->key); - if (state) { - state->async.fn = callback; - } return state; } @@ -457,8 +440,6 @@ struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, ctdb_client_queue_pkt(ctdb, &c->hdr); - state->async.fn = callback; - return state; } @@ -470,7 +451,7 @@ int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call) { struct ctdb_client_call_state *state; - state = ctdb_call_send(ctdb_db, call, NULL); + state = ctdb_call_send(ctdb_db, call); return ctdb_call_recv(state, call); } diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h index ed38535224b..f8d0db3d174 100644 --- a/ctdb/include/ctdb.h +++ b/ctdb/include/ctdb.h @@ -230,8 +230,7 @@ int ctdb_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid, int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call); -struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, struct ctdb_call *call, - void (*callback)(struct ctdb_client_call_state *)); +struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, struct ctdb_call *call); int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call); /* send a ctdb message */ diff --git 
a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index aa4cc96c98e..f59ffeeed0d 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -1003,6 +1003,21 @@ struct ctdb_control_wipe_database { uint32_t transaction_id; }; +/* + state of a in-progress ctdb call in client +*/ +struct ctdb_client_call_state { + enum call_state state; + uint32_t reqid; + struct ctdb_db_context *ctdb_db; + struct ctdb_call call; + struct { + void (*fn)(struct ctdb_client_call_state *); + void *private; + } async; +}; + + int32_t ctdb_control_traverse_start(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata, uint32_t srcnode); int32_t ctdb_control_traverse_all(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata); diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index c7086468add..eeb6b77561b 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -28,6 +28,7 @@ #include "../include/ctdb.h" #include "../include/ctdb_private.h" #include "db_wrap.h" +#include "dlinklist.h" struct ban_state { @@ -50,6 +51,7 @@ struct ctdb_recoverd { uint32_t node_flags; struct timed_event *send_election_te; struct timed_event *election_timeout; + struct vacuum_info *vacuum_info; }; #define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0) @@ -701,12 +703,94 @@ static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid, } +struct vacuum_info { + struct vacuum_info *next, *prev; + struct ctdb_recoverd *rec; + uint32_t srcnode; + struct ctdb_db_context *ctdb_db; + struct ctdb_control_pulldb_reply *recs; + struct ctdb_rec_data *r; +}; + +static void vacuum_fetch_next(struct vacuum_info *v); + /* - called when a vacuum fetch has completed - just free it + called when a vacuum fetch has completed - just free it and do the next one */ static void vacuum_fetch_callback(struct ctdb_client_call_state *state) { + struct vacuum_info *v = talloc_get_type(state->async.private, struct vacuum_info); 
talloc_free(state); + vacuum_fetch_next(v); +} + + +/* + process the next element from the vacuum list +*/ +static void vacuum_fetch_next(struct vacuum_info *v) +{ + struct ctdb_call call; + struct ctdb_rec_data *r; + + while (v->recs->count) { + struct ctdb_client_call_state *state; + TDB_DATA data; + struct ctdb_ltdb_header *hdr; + + ZERO_STRUCT(call); + call.call_id = CTDB_NULL_FUNC; + call.flags = CTDB_IMMEDIATE_MIGRATION; + + r = v->r; + v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r); + v->recs->count--; + + call.key.dptr = &r->data[0]; + call.key.dsize = r->keylen; + + /* ensure we don't block this daemon - just skip a record if we can't get + the chainlock */ + if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) { + continue; + } + + data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key); + if (data.dptr == NULL || data.dsize < sizeof(struct ctdb_ltdb_header)) { + tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key); + continue; + } + + hdr = (struct ctdb_ltdb_header *)data.dptr; + if (hdr->dmaster == v->rec->ctdb->pnn) { + /* its already local */ + tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key); + continue; + } + + state = ctdb_call_send(v->ctdb_db, &call); + tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key); + if (state == NULL) { + DEBUG(0,(__location__ " Failed to setup vacuum fetch call\n")); + talloc_free(v); + return; + } + state->async.fn = vacuum_fetch_callback; + state->async.private = v; + return; + } + + talloc_free(v); +} + + +/* + destroy a vacuum info structure + */ +static int vacuum_info_destructor(struct vacuum_info *v) +{ + DLIST_REMOVE(v->rec->vacuum_info, v); + return 0; } @@ -716,7 +800,7 @@ static void vacuum_fetch_callback(struct ctdb_client_call_state *state) static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data, void *private_data) { - struct ctdb_call call; + struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd); struct ctdb_control_pulldb_reply *recs; 
int ret, i; TALLOC_CTX *tmp_ctx = talloc_new(ctdb); @@ -725,8 +809,24 @@ static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid, bool persistent = false; struct ctdb_db_context *ctdb_db; struct ctdb_rec_data *r; + uint32_t srcnode; + struct vacuum_info *v; recs = (struct ctdb_control_pulldb_reply *)data.dptr; + r = (struct ctdb_rec_data *)&recs->data[0]; + + if (recs->count == 0) { + return; + } + + srcnode = r->reqid; + + for (v=rec->vacuum_info;v;v=v->next) { + if (srcnode == v->srcnode) { + /* we're already working on records from this node */ + return; + } + } /* work out if the database is persistent */ ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap); @@ -762,31 +862,29 @@ static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid, talloc_free(tmp_ctx); return; } - - ZERO_STRUCT(call); - call.call_id = CTDB_NULL_FUNC; - call.flags = CTDB_IMMEDIATE_MIGRATION; - - r = (struct ctdb_rec_data *)&recs->data[0]; - - for (i=0; - icount; - r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r), i++) { - struct ctdb_client_call_state *state; - - call.key.dptr = &r->data[0]; - call.key.dsize = r->keylen; - - state = ctdb_call_send(ctdb_db, &call, vacuum_fetch_callback); - if (state == NULL) { - DEBUG(0,(__location__ " Failed to setup vacuum fetch call\n")); - talloc_free(tmp_ctx); - return; - } + v = talloc_zero(rec, struct vacuum_info); + if (v == NULL) { + DEBUG(0,(__location__ " Out of memory\n")); + return; } - talloc_free(tmp_ctx); + v->rec = rec; + v->srcnode = srcnode; + v->ctdb_db = ctdb_db; + v->recs = talloc_memdup(v, recs, data.dsize); + if (v->recs == NULL) { + DEBUG(0,(__location__ " Out of memory\n")); + talloc_free(v); + return; + } + v->r = (struct ctdb_rec_data *)&v->recs->data[0]; + + DLIST_ADD(rec->vacuum_info, v); + + talloc_set_destructor(v, vacuum_info_destructor); + + vacuum_fetch_next(v); } diff --git a/ctdb/tools/ctdb_vacuum.c b/ctdb/tools/ctdb_vacuum.c index 
6e88bac6eda..8fa5d79ac0b 100644 --- a/ctdb/tools/ctdb_vacuum.c +++ b/ctdb/tools/ctdb_vacuum.c @@ -281,7 +281,7 @@ static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, /* add the record to the blob ready to send to the nodes */ - rec = ctdb_marshall_record(vdata->list[lmaster], 0, key, NULL, tdb_null); + rec = ctdb_marshall_record(vdata->list[lmaster], vdata->ctdb->pnn, key, NULL, tdb_null); if (rec == NULL) { DEBUG(0,(__location__ " Out of memory\n")); vdata->traverse_error = true; @@ -416,7 +416,7 @@ int ctdb_vacuum(struct ctdb_context *ctdb, int argc, const char **argv) struct ctdb_dbid_map *dbmap=NULL; struct ctdb_node_map *nodemap=NULL; int ret, i, pnn; - uint32_t vacuum_limit = 100; + uint32_t vacuum_limit = 0; if (argc > 0) { vacuum_limit = atoi(argv[0]); From 9559249e15bd6e8c7496f71270e7a778ea7b957b Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Tue, 8 Jan 2008 22:31:48 +1100 Subject: [PATCH 07/13] ensure the main daemon doesn't use a blocking lock on the freelist (This used to be ctdb commit 73f8257906b09e6516f675883d8e7a3c455ad869) --- ctdb/server/ctdb_recover.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index 97a7d0251f5..3bc7a3744c1 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -647,6 +647,11 @@ int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata) struct ctdb_db_context *ctdb_db; TDB_DATA key, data; struct ctdb_ltdb_header *hdr, *hdr2; + + /* these are really internal tdb functions - but we need them here for + non-blocking lock of the freelist */ + int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype); + int tdb_unlock(struct tdb_context *tdb, int list, int ltype); if (indata.dsize < sizeof(uint32_t) || indata.dsize != rec->length) { DEBUG(0,(__location__ " Bad record size in ctdb_control_delete_record\n")); @@ -688,8 +693,11 @@ int32_t 
ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata) } if (data.dsize < sizeof(struct ctdb_ltdb_header)) { - tdb_delete(ctdb_db->ltdb->tdb, key); - DEBUG(0,(__location__ " Deleted corrupt record\n")); + if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) { + tdb_delete(ctdb_db->ltdb->tdb, key); + tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK); + DEBUG(0,(__location__ " Deleted corrupt record\n")); + } tdb_chainunlock(ctdb_db->ltdb->tdb, key); free(data.dptr); return 0; @@ -712,13 +720,21 @@ int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata) return -1; } + if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) { + tdb_chainunlock(ctdb_db->ltdb->tdb, key); + free(data.dptr); + return -1; + } + if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) { + tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK); tdb_chainunlock(ctdb_db->ltdb->tdb, key); DEBUG(2,(__location__ " Failed to delete record\n")); free(data.dptr); return -1; } + tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK); tdb_chainunlock(ctdb_db->ltdb->tdb, key); free(data.dptr); return 0; From 0ee375ad665d2bf4116e638d8a25143278422c0e Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Tue, 8 Jan 2008 22:36:44 +1100 Subject: [PATCH 08/13] this is not an error - it just means the record was busy (This used to be ctdb commit 749451a4e97330d0fc35f5366dcc61aa500f7ce9) --- ctdb/tools/ctdb_vacuum.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ctdb/tools/ctdb_vacuum.c b/ctdb/tools/ctdb_vacuum.c index 8fa5d79ac0b..7de55dfc506 100644 --- a/ctdb/tools/ctdb_vacuum.c +++ b/ctdb/tools/ctdb_vacuum.c @@ -78,7 +78,6 @@ static int async_wait(struct ctdb_context *ctdb, struct async_data *data) event_loop_once(ctdb->ev); } if (data->fail_count != 0) { - DEBUG(0,("Async wait failed - fail_count=%u\n", data->fail_count)); return -1; } return 0; From 673a2b46f9f62a1e33a1e66064667e87b30969ae Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Tue, 8 Jan 2008 23:02:43 +1100 Subject: [PATCH 
09/13] nicer outut from repack and vacuum (This used to be ctdb commit 446c76bc332fe1366c32898fb77279a902d7159c) --- ctdb/tools/ctdb.c | 2 ++ ctdb/tools/ctdb_vacuum.c | 28 +++++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c index 6255d7bf642..bbd2f0ec28b 100644 --- a/ctdb/tools/ctdb.c +++ b/ctdb/tools/ctdb.c @@ -1117,6 +1117,8 @@ int main(int argc, const char *argv[]) struct event_context *ev; const char *control; + setlinebuf(stdout); + /* set some defaults */ options.timelimit = 3; options.pnn = CTDB_CURRENT_NODE; diff --git a/ctdb/tools/ctdb_vacuum.c b/ctdb/tools/ctdb_vacuum.c index 7de55dfc506..b4c232766a5 100644 --- a/ctdb/tools/ctdb_vacuum.c +++ b/ctdb/tools/ctdb_vacuum.c @@ -128,7 +128,8 @@ static int async_control_on_vnnmap(struct ctdb_context *ctdb, enum ctdb_controls /* vacuum one record */ -static int ctdb_vacuum_one(struct ctdb_context *ctdb, TDB_DATA key, struct ctdb_db_context *ctdb_db) +static int ctdb_vacuum_one(struct ctdb_context *ctdb, TDB_DATA key, + struct ctdb_db_context *ctdb_db, uint32_t *count) { TDB_DATA data; struct ctdb_ltdb_header *hdr; @@ -211,6 +212,8 @@ static int ctdb_vacuum_one(struct ctdb_context *ctdb, TDB_DATA key, struct ctdb_ tdb_chainunlock(ctdb_db->ltdb->tdb, key); free(data.dptr); + (*count)++; + return 0; } @@ -219,7 +222,7 @@ static int ctdb_vacuum_one(struct ctdb_context *ctdb, TDB_DATA key, struct ctdb_ vacuum records for which we are the lmaster */ static int ctdb_vacuum_local(struct ctdb_context *ctdb, struct ctdb_control_pulldb_reply *list, - struct ctdb_db_context *ctdb_db) + struct ctdb_db_context *ctdb_db, uint32_t *count) { struct ctdb_rec_data *r; int i; @@ -232,7 +235,7 @@ static int ctdb_vacuum_local(struct ctdb_context *ctdb, struct ctdb_control_pull TDB_DATA key; key.dptr = &r->data[0]; key.dsize = r->keylen; - if (ctdb_vacuum_one(ctdb, key, ctdb_db) != 0) { + if (ctdb_vacuum_one(ctdb, key, ctdb_db, count) != 0) { return -1; } } @@ 
-374,11 +377,11 @@ static int ctdb_vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb continue; } - printf("Found %u records for lmaster %u\n", vdata->list[i]->count, i); - /* for records where we are not the lmaster, tell the lmaster to fetch the record */ if (ctdb->vnn_map->map[i] != ctdb->pnn) { TDB_DATA data; + printf("Found %u records for lmaster %u in '%s'\n", vdata->list[i]->count, i, name); + data.dsize = talloc_get_size(vdata->list[i]); data.dptr = (void *)vdata->list[i]; if (ctdb_send_message(ctdb, ctdb->vnn_map->map[i], CTDB_SRVID_VACUUM_FETCH, data) != 0) { @@ -389,13 +392,24 @@ static int ctdb_vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb } continue; } + } + + for (i=0;ivnn_map->size;i++) { + uint32_t count = 0; + + if (vdata->list[i]->count == 0) { + continue; + } /* for records where we are the lmaster, we can try to delete them */ - if (ctdb_vacuum_local(ctdb, vdata->list[i], ctdb_db) != 0) { + if (ctdb_vacuum_local(ctdb, vdata->list[i], ctdb_db, &count) != 0) { DEBUG(0,(__location__ " Deletion error in vacuuming '%s'\n", name)); talloc_free(vdata); return -1; } + if (count != 0) { + printf("Deleted %u records on this node from '%s'\n", count, name); + } } /* this ensures we run our event queue */ @@ -576,7 +590,7 @@ static int ctdb_repack_db(struct ctdb_context *ctdb, uint32_t db_id, return 0; } - DEBUG(0,("Repacking %s with %u freelist entries\n", name, size)); + printf("Repacking %s with %u freelist entries\n", name, size); if (ctdb_repack_tdb(ctdb_db->ltdb->tdb) != 0) { DEBUG(0,(__location__ " Failed to repack '%s'\n", name)); From bb3f77d61d14d1beadee576e55a0c444872cd37c Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Wed, 9 Jan 2008 08:28:18 +1100 Subject: [PATCH 10/13] changed default vacuum limit (This used to be ctdb commit 7ca2977c12cf7938da639a17a0f857d7029d749c) --- ctdb/tools/ctdb_vacuum.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ctdb/tools/ctdb_vacuum.c 
b/ctdb/tools/ctdb_vacuum.c index b4c232766a5..86d9c4c5f19 100644 --- a/ctdb/tools/ctdb_vacuum.c +++ b/ctdb/tools/ctdb_vacuum.c @@ -608,7 +608,8 @@ int ctdb_repack(struct ctdb_context *ctdb, int argc, const char **argv) { struct ctdb_dbid_map *dbmap=NULL; int ret, i; - uint32_t repack_limit = 100; + /* a reasonable default limit to prevent us using too much memory */ + uint32_t repack_limit = 10000; if (argc > 0) { repack_limit = atoi(argv[0]); From 010e257e6457fe0fd9ace9e7227227f3ae99a26f Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Wed, 9 Jan 2008 08:29:19 +1100 Subject: [PATCH 11/13] increase version number (This used to be ctdb commit 8aa1d26a83fd781e641fa23b14bbfd1c238de0b6) --- ctdb/packaging/RPM/ctdb.spec | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ctdb/packaging/RPM/ctdb.spec b/ctdb/packaging/RPM/ctdb.spec index d8fa622d36f..410a1044e82 100644 --- a/ctdb/packaging/RPM/ctdb.spec +++ b/ctdb/packaging/RPM/ctdb.spec @@ -5,7 +5,7 @@ Vendor: Samba Team Packager: Samba Team Name: ctdb Version: 1.0 -Release: 20 +Release: 21 Epoch: 0 License: GNU GPL version 3 Group: System Environment/Daemons @@ -118,6 +118,8 @@ fi %{_includedir}/ctdb_private.h %changelog +* Wed Jan 09 2008 : Version 1.0.21 + - added ctdb vacuum and ctdb repack code * Sun Jan 06 2008 : Version 1.0.20 - new transaction based recovery code * Sat Jan 05 2008 : Version 1.0.19 From 4879d465fa01ecf1183d5fa6db4bfc12b1039200 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Wed, 9 Jan 2008 08:41:27 +1100 Subject: [PATCH 12/13] forgot this file (This used to be ctdb commit d715bef49a88f9084f53b3c88307848ed8434a50) --- ctdb/config/events.d/91.lvs | 86 +++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100755 ctdb/config/events.d/91.lvs diff --git a/ctdb/config/events.d/91.lvs b/ctdb/config/events.d/91.lvs new file mode 100755 index 00000000000..48b546fd584 --- /dev/null +++ b/ctdb/config/events.d/91.lvs @@ -0,0 +1,86 @@ +#!/bin/sh +# script 
to manage the lvs ip multiplexer for a single public address cluster + +. $CTDB_BASE/functions +loadconfig ctdb + +[ -z "$CTDB_LVS_PUBLIC_IP" ] && exit 0 +[ -z "$CTDB_PUBLIC_INTERFACE" ] && exit 0 + +cmd="$1" +shift + +PATH=/usr/bin:/bin:/usr/sbin:/sbin:$PATH + +case $cmd in + startup) + ipvsadm -D -t $CTDB_LVS_PUBLIC_IP:0 + ipvsadm -D -u $CTDB_LVS_PUBLIC_IP:0 + + ip addr add $CTDB_LVS_PUBLIC_IP/32 dev lo scope host >/dev/null 2>/dev/null + + # do not respond to ARPs that are for ip addresses with scope 'host' + echo 3 > /proc/sys/net/ipv4/conf/all/arp_ignore + # do not send out arp requests from loopback addresses + echo 2 > /proc/sys/net/ipv4/conf/all/arp_announce + ;; + + shutdown) + ipvsadm -D -t $CTDB_LVS_PUBLIC_IP:0 + ipvsadm -D -u $CTDB_LVS_PUBLIC_IP:0 + + # remove the ip + ip addr del $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null + + # flush our route cache + echo 1 > /proc/sys/net/ipv4/route/flush + ;; + + takeip) + ;; + + releaseip) + ;; + + recovered) + # kill off any tcp connections + ipvsadm -D -t $CTDB_LVS_PUBLIC_IP:0 + ipvsadm -D -u $CTDB_LVS_PUBLIC_IP:0 + kill_tcp_connections $CTDB_LVS_PUBLIC_IP + + # are we the recmaster ? 
+ ctdb isnotrecmaster >/dev/null 2>/dev/null || { + # change the ip address to have scope host so we wont respond + # to arps + ip addr del $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null + ip addr add $CTDB_LVS_PUBLIC_IP/32 dev lo scope host >/dev/null 2>/dev/null + exit 0 + } + + # change the scope so we start responding to arps + ip addr del $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null + ip addr add $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null + + ipvsadm -A -t $CTDB_LVS_PUBLIC_IP:0 -p 9999 -s lc + ipvsadm -A -u $CTDB_LVS_PUBLIC_IP:0 -p 9999 -s lc + + ctdb status 2>/dev/null | egrep "^pnn:" | grep -v DISCONNECTED | grep -v "(THIS NODE)" | sed -e "s/^pnn:[0-9]* //" -e "s/[ ].*//" | while read IP; do + ipvsadm -a -t $CTDB_LVS_PUBLIC_IP:0 -r $IP -g + ipvsadm -a -u $CTDB_LVS_PUBLIC_IP:0 -r $IP -g + done + ipvsadm -a -t $CTDB_LVS_PUBLIC_IP:0 -r 127.0.0.1 + ipvsadm -a -u $CTDB_LVS_PUBLIC_IP:0 -r 127.0.0.1 + + # send out a gratious arp so our peers will update their arp tables + ctdb gratiousarp $CTDB_LVS_PUBLIC_IP $CTDB_PUBLIC_INTERFACE >/dev/null 2>/dev/null + + # flush our route cache + echo 1 > /proc/sys/net/ipv4/route/flush + ;; + + monitor) + ;; + +esac + +exit 0 From fa77de5b342cc438fefac22a565de3d97410eba8 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Wed, 9 Jan 2008 08:50:03 +1100 Subject: [PATCH 13/13] needs to be in Makefile.in too (This used to be ctdb commit b3dfdf28fa682a55d177564774cde3af8c260d8e) --- ctdb/Makefile.in | 1 + 1 file changed, 1 insertion(+) diff --git a/ctdb/Makefile.in b/ctdb/Makefile.in index 3e2b1e1b192..ba0d1ba2e56 100644 --- a/ctdb/Makefile.in +++ b/ctdb/Makefile.in @@ -181,6 +181,7 @@ install: all ${INSTALLCMD} -m 755 config/events.d/60.nfs $(DESTDIR)$(etcdir)/ctdb/events.d ${INSTALLCMD} -m 755 config/events.d/61.nfstickle $(DESTDIR)$(etcdir)/ctdb/events.d ${INSTALLCMD} -m 755 config/events.d/90.ipmux $(DESTDIR)$(etcdir)/ctdb/events.d + ${INSTALLCMD} -m 755 config/events.d/91.lvs 
$(DESTDIR)$(etcdir)/ctdb/events.d ${INSTALLCMD} -m 755 tools/ctdb_diagnostics $(DESTDIR)$(bindir) ${INSTALLCMD} -m 755 tools/onnode.ssh $(DESTDIR)$(bindir) ${INSTALLCMD} -m 755 tools/onnode.rsh $(DESTDIR)$(bindir)