commit 9862338dd2

    merge from tridge

    (This used to be ctdb commit 3e40441fbd0b2924def97b04b738f9c186c71c14)
@@ -90,9 +90,9 @@ bin/ctdb_ipmux: $(CTDB_CLIENT_OBJ) utils/ipmux/ipmux.o
	@echo Linking $@
	@$(CC) $(CFLAGS) -o $@ utils/ipmux/ipmux.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS) $(IPQ_LIBS)

bin/ctdb: $(CTDB_CLIENT_OBJ) tools/ctdb.o
bin/ctdb: $(CTDB_CLIENT_OBJ) tools/ctdb.o tools/ctdb_vacuum.o
	@echo Linking $@
	@$(CC) $(CFLAGS) -o $@ tools/ctdb.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
	@$(CC) $(CFLAGS) -o $@ tools/ctdb.o tools/ctdb_vacuum.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)

bin/smnotify: utils/smnotify/gen_xdr.o utils/smnotify/gen_smnotify.o utils/smnotify/smnotify.o
	@echo Linking $@
@@ -181,6 +181,7 @@ install: all
	${INSTALLCMD} -m 755 config/events.d/60.nfs $(DESTDIR)$(etcdir)/ctdb/events.d
	${INSTALLCMD} -m 755 config/events.d/61.nfstickle $(DESTDIR)$(etcdir)/ctdb/events.d
	${INSTALLCMD} -m 755 config/events.d/90.ipmux $(DESTDIR)$(etcdir)/ctdb/events.d
	${INSTALLCMD} -m 755 config/events.d/91.lvs $(DESTDIR)$(etcdir)/ctdb/events.d
	${INSTALLCMD} -m 755 tools/ctdb_diagnostics $(DESTDIR)$(bindir)
	${INSTALLCMD} -m 755 tools/onnode.ssh $(DESTDIR)$(bindir)
	${INSTALLCMD} -m 755 tools/onnode.rsh $(DESTDIR)$(bindir)
@@ -147,16 +147,6 @@ static int ctdb_client_queue_pkt(struct ctdb_context *ctdb, struct ctdb_req_head
}


/*
  state of a in-progress ctdb call in client
*/
struct ctdb_client_call_state {
	enum call_state state;
	uint32_t reqid;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_call call;
};

/*
  called when a CTDB_REPLY_CALL packet comes in in the client

@@ -187,6 +177,10 @@ static void ctdb_client_reply_call(struct ctdb_context *ctdb, struct ctdb_req_he
	talloc_steal(state, c);

	state->state = CTDB_CALL_DONE;

	if (state->async.fn) {
		state->async.fn(state);
	}
}

static void ctdb_client_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);

@@ -377,7 +371,7 @@ static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db
  This call never blocks.
*/
struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db,
					      struct ctdb_call *call)
					      struct ctdb_call *call)
{
	struct ctdb_client_call_state *state;
	struct ctdb_context *ctdb = ctdb_db->ctdb;
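The three hunks above rework the client call path for asynchronous use: struct ctdb_client_call_state moves out of ctdb_client.c into ctdb_private.h (see the header hunk further down) and gains an async.fn/async.private pair, which ctdb_client_reply_call() now invokes when the CTDB_REPLY_CALL packet arrives. A minimal sketch of how a caller can use the new hook; my_context, my_call_done and my_send_async are hypothetical names, not part of this commit:

/* Sketch only: ctdb_call_send() returns immediately; the reply is
   delivered by the daemon later, at which point ctdb_client_reply_call()
   fires state->async.fn if it was set. */
struct my_context {
	int calls_pending;	/* hypothetical bookkeeping */
};

static void my_call_done(struct ctdb_client_call_state *state)
{
	struct my_context *ctx = state->async.private;
	ctx->calls_pending--;
	talloc_free(state);	/* the reply packet was stolen onto state */
}

static int my_send_async(struct ctdb_db_context *ctdb_db,
			 struct ctdb_call *call, struct my_context *ctx)
{
	struct ctdb_client_call_state *state = ctdb_call_send(ctdb_db, call);
	if (state == NULL) {
		return -1;
	}
	state->async.fn      = my_call_done;
	state->async.private = ctx;
	ctx->calls_pending++;
	return 0;
}

This is exactly the pattern the new vacuum code in ctdb_recoverd.c follows with vacuum_fetch_callback() further down.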
@@ -1575,6 +1569,22 @@ int ctdb_statistics_reset(struct ctdb_context *ctdb, uint32_t destnode)
	return 0;
}

/*
  this is the dummy null procedure that all databases support
*/
static int ctdb_null_func(struct ctdb_call_info *call)
{
	return 0;
}

/*
  this is a plain fetch procedure that all databases support
*/
static int ctdb_fetch_func(struct ctdb_call_info *call)
{
	call->reply_data = &call->record_data;
	return 0;
}

/*
  attach to a specific database - client call

@@ -1632,6 +1642,10 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name,

	DLIST_ADD(ctdb->db_list, ctdb_db);

	/* add well known functions */
	ctdb_set_call(ctdb_db, ctdb_null_func, CTDB_NULL_FUNC);
	ctdb_set_call(ctdb_db, ctdb_fetch_func, CTDB_FETCH_FUNC);

	return ctdb_db;
}

@@ -1641,12 +1655,15 @@ struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb, const char *name,
*/
int ctdb_set_call(struct ctdb_db_context *ctdb_db, ctdb_fn_t fn, uint32_t id)
{
	struct ctdb_registered_call *call;

#if 0
	TDB_DATA data;
	int32_t status;
	struct ctdb_control_set_call c;
	int ret;
	struct ctdb_registered_call *call;

	/* this is no longer valid with the separate daemon architecture */
	c.db_id = ctdb_db->db_id;
	c.fn = fn;
	c.id = id;
@@ -1660,6 +1677,7 @@ int ctdb_set_call(struct ctdb_db_context *ctdb_db, ctdb_fn_t fn, uint32_t id)
		DEBUG(0,("ctdb_set_call failed for call %u\n", id));
		return -1;
	}
#endif

	/* also register locally */
	call = talloc(ctdb_db, struct ctdb_registered_call);
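The ctdb_attach() hunk above registers two well-known call functions on every attached database. CTDB_NULL_FUNC does nothing, but calling it with CTDB_IMMEDIATE_MIGRATION set is how this commit pulls a record's dmaster role onto the local node. A sketch of that idiom, lifted from the vacuum_fetch_next() logic later in this commit (migrate_record_here is a hypothetical wrapper name):

/* Sketch: migrate a record to the local node by invoking the
   always-registered null function with immediate migration requested. */
static struct ctdb_client_call_state *migrate_record_here(
	struct ctdb_db_context *ctdb_db, TDB_DATA key)
{
	struct ctdb_call call;

	ZERO_STRUCT(call);
	call.call_id = CTDB_NULL_FUNC;		/* registered in ctdb_attach() */
	call.flags   = CTDB_IMMEDIATE_MIGRATION;
	call.key     = key;

	return ctdb_call_send(ctdb_db, &call);	/* complete via state->async.fn */
}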
@@ -36,7 +36,7 @@ periodic_cleanup() {
	# running smbstatus scrubs any dead entries from the connections
	# and sessionid database
	echo "`date` Running periodic cleanup of samba databases"
	smbstatus -n > /dev/null 2>&1
	smbstatus -n > /dev/null 2>&1 &
}

case $cmd in
ctdb/config/events.d/91.lvs (new executable file, 86 lines)
@@ -0,0 +1,86 @@
#!/bin/sh
# script to manage the lvs ip multiplexer for a single public address cluster

. $CTDB_BASE/functions
loadconfig ctdb

[ -z "$CTDB_LVS_PUBLIC_IP" ] && exit 0
[ -z "$CTDB_PUBLIC_INTERFACE" ] && exit 0

cmd="$1"
shift

PATH=/usr/bin:/bin:/usr/sbin:/sbin:$PATH

case $cmd in
    startup)
	ipvsadm -D -t $CTDB_LVS_PUBLIC_IP:0
	ipvsadm -D -u $CTDB_LVS_PUBLIC_IP:0

	ip addr add $CTDB_LVS_PUBLIC_IP/32 dev lo scope host >/dev/null 2>/dev/null

	# do not respond to ARPs that are for ip addresses with scope 'host'
	echo 3 > /proc/sys/net/ipv4/conf/all/arp_ignore
	# do not send out arp requests from loopback addresses
	echo 2 > /proc/sys/net/ipv4/conf/all/arp_announce
	;;

    shutdown)
	ipvsadm -D -t $CTDB_LVS_PUBLIC_IP:0
	ipvsadm -D -u $CTDB_LVS_PUBLIC_IP:0

	# remove the ip
	ip addr del $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null

	# flush our route cache
	echo 1 > /proc/sys/net/ipv4/route/flush
	;;

    takeip)
	;;

    releaseip)
	;;

    recovered)
	# kill off any tcp connections
	ipvsadm -D -t $CTDB_LVS_PUBLIC_IP:0
	ipvsadm -D -u $CTDB_LVS_PUBLIC_IP:0
	kill_tcp_connections $CTDB_LVS_PUBLIC_IP

	# are we the recmaster ?
	ctdb isnotrecmaster >/dev/null 2>/dev/null || {
	    # change the ip address to have scope host so we wont respond
	    # to arps
	    ip addr del $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null
	    ip addr add $CTDB_LVS_PUBLIC_IP/32 dev lo scope host >/dev/null 2>/dev/null
	    exit 0
	}

	# change the scope so we start responding to arps
	ip addr del $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null
	ip addr add $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null

	ipvsadm -A -t $CTDB_LVS_PUBLIC_IP:0 -p 9999 -s lc
	ipvsadm -A -u $CTDB_LVS_PUBLIC_IP:0 -p 9999 -s lc

	ctdb status 2>/dev/null | egrep "^pnn:" | grep -v DISCONNECTED | grep -v "(THIS NODE)" | sed -e "s/^pnn:[0-9]* //" -e "s/[ ].*//" | while read IP; do
	    ipvsadm -a -t $CTDB_LVS_PUBLIC_IP:0 -r $IP -g
	    ipvsadm -a -u $CTDB_LVS_PUBLIC_IP:0 -r $IP -g
	done
	ipvsadm -a -t $CTDB_LVS_PUBLIC_IP:0 -r 127.0.0.1
	ipvsadm -a -u $CTDB_LVS_PUBLIC_IP:0 -r 127.0.0.1

	# send out a gratious arp so our peers will update their arp tables
	ctdb gratiousarp $CTDB_LVS_PUBLIC_IP $CTDB_PUBLIC_INTERFACE >/dev/null 2>/dev/null

	# flush our route cache
	echo 1 > /proc/sys/net/ipv4/route/flush
	;;

    monitor)
	;;

esac

exit 0
ctdb/doc/ctdbd.1 (239 lines)
@@ -1,219 +1,250 @@
.\" Title: ctdbd
.\" Author:
.\" Generator: DocBook XSL Stylesheets v1.71.0 <http://docbook.sf.net/>
.\" Date: 09/14/2007
.\" Generator: DocBook XSL Stylesheets v1.73.2 <http://docbook.sf.net/>
.\" Date: 01/07/2008
.\" Manual:
.\" Source:
.\"
.TH "CTDBD" "1" "09/14/2007" "" ""
.TH "CTDBD" "1" "01/07/2008" "" ""
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
.ad l
.SH "NAME"
ctdbd \- The CTDB cluster daemon
ctdbd - The CTDB cluster daemon
.SH "SYNOPSIS"
.HP 6
\fBctdbd\fR
.HP 6
\fBctdbd\fR {\-\-reclock=<filename>} {\-\-nlist=<filename>} {\-\-dbdir=<directory>} [\-?\ \-\-help] [\-\-usage] [\-i\ \-\-interactive] [\-\-public\-addresses=<filename>] [\-\-event\-script\-dir=<directory>] [\-\-logfile=<filename>] [\-\-listen=<address>] [\-\-transport=<STRING>] [\-\-socket=<filename>] [\-d\ \-\-debug=<INTEGER>] [\-\-torture]
\fBctdbd\fR [\-?\ \-\-help] [\-d\ \-\-debug=<INTEGER>] {\-\-dbdir=<directory>} {\-\-dbdir\-persistent=<directory>} [\-\-event\-script\-dir=<directory>] [\-i\ \-\-interactive] [\-\-listen=<address>] [\-\-logfile=<filename>] {\-\-nlist=<filename>} [\-\-nosetsched] [\-\-public\-addresses=<filename>] [\-\-public\-interface=<interface>] {\-\-reclock=<filename>} [\-\-single\-public\-ip=<address>] [\-\-socket=<filename>] [\-\-syslog] [\-\-torture] [\-\-transport=<STRING>] [\-\-usage]
.SH "DESCRIPTION"
.PP
ctdbd is the main ctdb daemon.
ctdbd is the main ctdb daemon\.
.PP
ctdbd provides a clustered version of the TDB database with automatic rebuild/recovery of the databases upon nodefailures.
ctdbd provides a clustered version of the TDB database with automatic rebuild/recovery of the databases upon nodefailures\.
.PP
Combined with a cluster filesystem ctdbd provides a full HA environment for services such as clustered Samba and NFS as well as other services.
Combined with a cluster filesystem ctdbd provides a full HA environment for services such as clustered Samba and NFS as well as other services\.
.PP
ctdbd provides monitoring of all nodes in the cluster and automatically reconfigures the cluster and recovers upon node failures.
ctdbd provides monitoring of all nodes in the cluster and automatically reconfigures the cluster and recovers upon node failures\.
.PP
ctdbd is the main component in clustered Samba that provides a high\-awailability load\-sharing CIFS server cluster.
ctdbd is the main component in clustered Samba that provides a high\-awailability load\-sharing CIFS server cluster\.
.SH "OPTIONS"
.PP
\-? \-\-help
.RS 3n
Print some help text to the screen.
.RS 4
Print some help text to the screen\.
.RE
.PP
\-\-usage
.RS 3n
Print useage information to the screen.
.RE
.PP
\-\-reclock=<filename>
.RS 3n
This is the name of the lock file stored of the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery\-master. This file must be stored on shared storage.
.RE
.PP
\-\-nlist=<filename>
.RS 3n
This file contains a list of the private ip addresses of every node in the cluster. There is one line/ip address for each node. This file must be the same for all nodes in the cluster.
.sp
This file is usually /etc/ctdb/nodes .
\-d \-\-debug=<DEBUGLEVEL>
.RS 4
This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile\. The default is 0 which will only log important events and errors\. A larger number will provide additional logging\.
.RE
.PP
\-\-dbdir=<directory>
.RS 3n
This is the directory on local storage where ctdbd keeps the local copy of the TDB databases. This directory is local for each node and should not be stored on the shared cluster filesystem.
.RS 4
This is the directory on local storage where ctdbd keeps the local copy of the TDB databases\. This directory is local for each node and should not be stored on the shared cluster filesystem\.
.sp
This directory would usually be /var/ctdb .
This directory would usually be /var/ctdb \.
.RE
.PP
\-\-dbdir\-persistent=<directory>
.RS 4
This is the directory on local storage where ctdbd keeps the local copy of the persistent TDB databases\. This directory is local for each node and should not be stored on the shared cluster filesystem\.
.sp
This directory would usually be /etc/ctdb/persistent \.
.RE
.PP
\-\-event\-script\-dir=<directory>
.RS 4
This option is used to specify the directory where the CTDB event scripts are stored\.
.sp
This will normally be /etc/ctdb/events\.d which is part of the ctdb distribution\.
.RE
.PP
\-i \-\-interactive
.RS 3n
By default ctdbd will detach itself from the shell and run in the background as a daemon. This option makes ctdbd to start in interactive mode.
.RS 4
By default ctdbd will detach itself from the shell and run in the background as a daemon\. This option makes ctdbd to start in interactive mode\.
.RE
.PP
\-\-listen=<address>
.RS 4
This specifies which ip address ctdb will bind to\. By default ctdbd will bind to the first address it finds in the /etc/ctdb/nodes file and which is also present on the local system in which case you do not need to provide this option\.
.sp
This option is only required when you want to run multiple ctdbd daemons/nodes on the same physical host in which case there would be multiple entries in /etc/ctdb/nodes what would match a local interface\.
.RE
.PP
\-\-logfile=<filename>
.RS 4
This is the file where ctdbd will write its log\. This is usually /var/log/log\.ctdb \.
.RE
.PP
\-\-nlist=<filename>
.RS 4
This file contains a list of the private ip addresses of every node in the cluster\. There is one line/ip address for each node\. This file must be the same for all nodes in the cluster\.
.sp
This file is usually /etc/ctdb/nodes \.
.RE
.PP
\-\-nosetsched
.RS 4
Normally ctdb will change its scheduler to run as a real\-time process\. This option is used to change this behaviour and have ctdb run as a normal process\.
.RE
.PP
\-\-public_addresses=<filename>
.RS 3n
When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster. This file contains a list of ip addresses netmasks and interfaces. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes.
.RS 4
When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster\. This file contains a list of ip addresses netmasks and interfaces\. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes\.
.sp
This is usually the file /etc/ctdb/public_addresses
.RE
.PP
\-\-event\-script\-dir=<directory>
.RS 3n
This option is used to specify the directory where the CTDB event scripts are stored.
.sp
This will normally be /etc/ctdb/events.d which is part of the ctdb distribution.
\-\-public_interface=<interface>
.RS 4
This option tells ctdb which interface to attach public\-addresses to and also where to attach the single\-public\-ip when used\.
.RE
.PP
\-\-logfile=<filename>
.RS 3n
This is the file where ctdbd will write its log. This is usually /var/log/log.ctdb .
\-\-reclock=<filename>
.RS 4
This is the name of the lock file stored of the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery\-master\. This file must be stored on shared storage\.
.RE
.PP
\-\-listen=<address>
.RS 3n
This specifies which ip address ctdb will bind to. By default ctdbd will bind to the first address it finds in the /etc/ctdb/nodes file and which is also present on the local system in which case you do not need to provide this option.
\-\-single\-public\-ip=<address>
.RS 4
This option is used to activate the "ipmux" functionality of ctdb\. In this mode, all nodes of the cluster will expose a single ip address from all nodes with all incoming traffic to the cluster being passed through the current recmaster\. This functionality is similar to using a load\-balancing switch\.
.sp
This option is only required when you want to run multiple ctdbd daemons/nodes on the same physical host in which case there would be multiple entries in /etc/ctdb/nodes what would match a local interface.
.RE
.PP
\-\-transport=<STRING>
.RS 3n
This option specifies which transport to use for ctdbd internode communications. The default is "tcp".
All incoming packets are sent to the recmaster which will multiplex the clients across all available nodes and pass the packets on to a different node in the cluster to manage the connection based on the clients ip address\. Outgoing packets however are sent directly from the node that was choosen back to the client\. Since all incoming packets are sent through the recmaster this will have a throughput and performance impact when used\. This impact in performance primarily affects write\-performance while read\-performance should be mainly unaffected\. Only use this feature if your environment is mostly\-read (i\.e\. most traffic is from the nodes back to the clients) or if it is not important to get maximum write\-performance to the cluster\.
.sp
Suported transports are "tcp" and "infiniband".
When using a single public ip, you must also specify the public\-interface so that ctdb knows which interface to attach the single public ip to\.
.RE
.PP
\-\-socket=<filename>
.RS 3n
This specifies the name of the domain socket that ctdbd will create. This socket is used for local clients to attach to and communicate with the ctdbd daemon.
.RS 4
This specifies the name of the domain socket that ctdbd will create\. This socket is used for local clients to attach to and communicate with the ctdbd daemon\.
.sp
The default is /tmp/ctdb.socket . You only need to use this option if you plan to run multiple ctdbd daemons on the same physical host.
The default is /tmp/ctdb\.socket \. You only need to use this option if you plan to run multiple ctdbd daemons on the same physical host\.
.RE
.PP
\-d \-\-debug=<DEBUGLEVEL>
.RS 3n
This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile. The default is 0 which will only log important events and errors. A larger number will provide additional logging.
\-\-syslog
.RS 4
Send all log messages to syslog instead of to the ctdb logfile\.
.RE
.PP
\-\-torture
.RS 3n
This option is only used for development and testing of ctdbd. It adds artificial errors and failures to the common codepaths in ctdbd to verify that ctdbd can recover correctly for failures.
.RS 4
This option is only used for development and testing of ctdbd\. It adds artificial errors and failures to the common codepaths in ctdbd to verify that ctdbd can recover correctly for failures\.
.sp
You do NOT want to use this option unless you are developing and testing new functionality in ctdbd.
You do NOT want to use this option unless you are developing and testing new functionality in ctdbd\.
.RE
.PP
\-\-transport=<STRING>
.RS 4
This option specifies which transport to use for ctdbd internode communications\. The default is "tcp"\.
.sp
Suported transports are "tcp" and "infiniband"\.
.RE
.PP
\-\-usage
.RS 4
Print useage information to the screen\.
.RE
.SH "PRIVATE VS PUBLIC ADDRESSES"
.PP
When used for ip takeover in a HA environment, each node in a ctdb cluster has multiple ip addresses assigned to it. One private and one or more public.
When used for ip takeover in a HA environment, each node in a ctdb cluster has multiple ip addresses assigned to it\. One private and one or more public\.
.SS "Private address"
.PP
This is the physical ip address of the node which is configured in linux and attached to a physical interface. This address uniquely identifies a physical node in the cluster and is the ip addresses that ctdbd will use to communicate with the ctdbd daemons on the other nodes in the cluster.
This is the physical ip address of the node which is configured in linux and attached to a physical interface\. This address uniquely identifies a physical node in the cluster and is the ip addresses that ctdbd will use to communicate with the ctdbd daemons on the other nodes in the cluster\.
.PP
The private addresses are configured in /etc/ctdb/nodes (unless the \-\-nlist option is used) and contain one line for each node in the cluster. Each line contains the private ip address for one node in the cluster. This file must be the same on all nodes in the cluster.
The private addresses are configured in /etc/ctdb/nodes (unless the \-\-nlist option is used) and contain one line for each node in the cluster\. Each line contains the private ip address for one node in the cluster\. This file must be the same on all nodes in the cluster\.
.PP
Since the private addresses are only available to the network when the corresponding node is up and running you should not use these addresses for clients to connect to services provided by the cluster. Instead client applications should only attach to the public addresses since these are guaranteed to always be available.
Since the private addresses are only available to the network when the corresponding node is up and running you should not use these addresses for clients to connect to services provided by the cluster\. Instead client applications should only attach to the public addresses since these are guaranteed to always be available\.
.PP
When using ip takeover, it is strongly recommended that the private addresses are configured on a private network physically separated from the rest of the network and that this private network is dedicated to CTDB traffic.
When using ip takeover, it is strongly recommended that the private addresses are configured on a private network physically separated from the rest of the network and that this private network is dedicated to CTDB traffic\.

Example /etc/ctdb/nodes for a four node cluster:

.sp
.RS 3n
.RS 4
.nf
10.1.1.1
10.1.1.2
10.1.1.3
10.1.1.4
10\.1\.1\.1
10\.1\.1\.2
10\.1\.1\.3
10\.1\.1\.4

.fi
.RE
.SS "Public address"
.PP
A public address on the other hand is not attached to an interface. This address is managed by ctdbd itself and is attached/detached to a physical node at runtime.
A public address on the other hand is not attached to an interface\. This address is managed by ctdbd itself and is attached/detached to a physical node at runtime\.
.PP
The ctdb cluster will assign/reassign these public addresses across the available healthy nodes in the cluster. When one node fails, its public address will be migrated to and taken over by a different node in the cluster to ensure that all public addresses are always available to clients as long as there are still nodes available capable of hosting this address.
The ctdb cluster will assign/reassign these public addresses across the available healthy nodes in the cluster\. When one node fails, its public address will be migrated to and taken over by a different node in the cluster to ensure that all public addresses are always available to clients as long as there are still nodes available capable of hosting this address\.
.PP
These addresses are not physically attached to a specific node. The 'ctdb ip' command can be used to view the current assignment of public addresses and which physical node is currently serving it.
These addresses are not physically attached to a specific node\. The \'ctdb ip\' command can be used to view the current assignment of public addresses and which physical node is currently serving it\.
.PP
On each node this file contains a list of the public addresses that this node is capable of hosting. The list also contain the netmask and the interface where this address should be attached for the case where you may want to serve data out through multiple different interfaces.
On each node this file contains a list of the public addresses that this node is capable of hosting\. The list also contain the netmask and the interface where this address should be attached for the case where you may want to serve data out through multiple different interfaces\.

Example /etc/ctdb/public_addresses for a node that can host 4 public addresses:

.sp
.RS 3n
.RS 4
.nf
11.1.1.1/24 eth0
11.1.1.2/24 eth0
11.1.2.1/24 eth1
11.1.2.2/24 eth1
11\.1\.1\.1/24 eth0
11\.1\.1\.2/24 eth0
11\.1\.2\.1/24 eth1
11\.1\.2\.2/24 eth1

.fi
.RE
.PP
In most cases this file would be the same on all nodes in a cluster but there are exceptions when one may want to use different files on different nodes.
In most cases this file would be the same on all nodes in a cluster but there are exceptions when one may want to use different files on different nodes\.

Example: 4 nodes partitioned into two subgroups :

.sp
.RS 3n
.RS 4
.nf
Node 0:/etc/ctdb/public_addresses
10.1.1.1/24 eth0
10.1.1.2/24 eth0
10\.1\.1\.1/24 eth0
10\.1\.1\.2/24 eth0

Node 1:/etc/ctdb/public_addresses
10.1.1.1/24 eth0
10.1.1.2/24 eth0
10\.1\.1\.1/24 eth0
10\.1\.1\.2/24 eth0

Node 2:/etc/ctdb/public_addresses
10.2.1.1/24 eth0
10.2.1.2/24 eth0
10\.2\.1\.1/24 eth0
10\.2\.1\.2/24 eth0

Node 3:/etc/ctdb/public_addresses
10.2.1.1/24 eth0
10.2.1.2/24 eth0
10\.2\.1\.1/24 eth0
10\.2\.1\.2/24 eth0

.fi
.RE
.PP
In this example nodes 0 and 1 host two public addresses on the 10.1.1.x network while nodes 2 and 3 host two public addresses for the 10.2.1.x network.
In this example nodes 0 and 1 host two public addresses on the 10\.1\.1\.x network while nodes 2 and 3 host two public addresses for the 10\.2\.1\.x network\.
.PP
Ip address 10.1.1.1 can be hosted by either of nodes 0 or 1 and will be available to clients as long as at least one of these two nodes are available. If both nodes 0 and node 1 become unavailable 10.1.1.1 also becomes unavailable. 10.1.1.1 can not be failed over to node 2 or node 3 since these nodes do not have this ip address listed in their public addresses file.
Ip address 10\.1\.1\.1 can be hosted by either of nodes 0 or 1 and will be available to clients as long as at least one of these two nodes are available\. If both nodes 0 and node 1 become unavailable 10\.1\.1\.1 also becomes unavailable\. 10\.1\.1\.1 can not be failed over to node 2 or node 3 since these nodes do not have this ip address listed in their public addresses file\.
.SH "NODE STATUS"
.PP
The current status of each node in the cluster can be viewed by the 'ctdb status' command.
The current status of each node in the cluster can be viewed by the \'ctdb status\' command\.
.PP
There are five possible for a node.
There are five possible for a node\.
.PP
OK \- This node is fully functional.
OK \- This node is fully functional\.
.PP
DISCONNECTED \- This node could not be connected through the network and is currently not particpating in the cluster. If there is a public IP address associated with this node it should have been taken over by a different node. No services are running on this node.
DISCONNECTED \- This node could not be connected through the network and is currently not particpating in the cluster\. If there is a public IP address associated with this node it should have been taken over by a different node\. No services are running on this node\.
.PP
DISABLED \- This node has been administratively disabled. This node is still functional and participates in the CTDB cluster but its IP addresses have been taken over by a different node and no services are currently being hosted.
DISABLED \- This node has been administratively disabled\. This node is still functional and participates in the CTDB cluster but its IP addresses have been taken over by a different node and no services are currently being hosted\.
.PP
UNHEALTHY \- A service provided by this node is malfunctioning and should be investigated. The CTDB daemon itself is operational and participates in the cluster. Its public IP address has been taken over by a different node and no services are currently being hosted. All unhealthy nodes should be investigated and require an administrative action to rectify.
UNHEALTHY \- A service provided by this node is malfunctioning and should be investigated\. The CTDB daemon itself is operational and participates in the cluster\. Its public IP address has been taken over by a different node and no services are currently being hosted\. All unhealthy nodes should be investigated and require an administrative action to rectify\.
.PP
BANNED \- This node failed too many recovery attempts and has been banned from participating in the cluster for a period of RecoveryBanPeriod seconds. Any public IP address has been taken over by other nodes. This node does not provide any services. All banned nodes should be investigated and require an administrative action to rectify. This node does not perticipate in the CTDB cluster but can still be communicated with. I.e. ctdb commands can be sent to it.
BANNED \- This node failed too many recovery attempts and has been banned from participating in the cluster for a period of RecoveryBanPeriod seconds\. Any public IP address has been taken over by other nodes\. This node does not provide any services\. All banned nodes should be investigated and require an administrative action to rectify\. This node does not perticipate in the CTDB cluster but can still be communicated with\. I\.e\. ctdb commands can be sent to it\.
.SH "SEE ALSO"
.PP
ctdb(1), onnode(1)
\fI\%http://ctdb.samba.org/\fR
.SH "COPYRIGHT/LICENSE"
.sp
.RS 3n
.RS 4
.nf
Copyright (C) Andrew Tridgell 2007
Copyright (C) Ronnie sahlberg 2007
@@ -221,14 +252,14 @@ Copyright (C) Ronnie sahlberg 2007
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
your option) any later version\.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE\. See the GNU
General Public License for more details\.

You should have received a copy of the GNU General Public License
along with this program; if not, see http://www.gnu.org/licenses/.
along with this program; if not, see http://www\.gnu\.org/licenses/\.
.fi
.RE
@@ -1,4 +1,4 @@
<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>ctdbd</title><meta name="generator" content="DocBook XSL Stylesheets V1.71.0"></head><body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF"><div class="refentry" lang="en"><a name="ctdbd.1"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>ctdbd — The CTDB cluster daemon</p></div><div class="refsynopsisdiv"><h2>Synopsis</h2><div class="cmdsynopsis"><p><code class="command">ctdbd</code> </p></div><div class="cmdsynopsis"><p><code class="command">ctdbd</code> {--reclock=<filename>} {--nlist=<filename>} {--dbdir=<directory>} [-? --help] [--usage] [-i --interactive] [--public-addresses=<filename>] [--event-script-dir=<directory>] [--logfile=<filename>] [--listen=<address>] [--transport=<STRING>] [--socket=<filename>] [-d --debug=<INTEGER>] [--torture]</p></div></div><div class="refsect1" lang="en"><a name="id2480886"></a><h2>DESCRIPTION</h2><p>
<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>ctdbd</title><meta name="generator" content="DocBook XSL Stylesheets V1.73.2"></head><body bgcolor="white" text="black" link="#0000FF" vlink="#840084" alink="#0000FF"><div class="refentry" lang="en"><a name="ctdbd.1"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>ctdbd — The CTDB cluster daemon</p></div><div class="refsynopsisdiv"><h2>Synopsis</h2><div class="cmdsynopsis"><p><code class="command">ctdbd</code> </p></div><div class="cmdsynopsis"><p><code class="command">ctdbd</code> [-? --help] [-d --debug=<INTEGER>] {--dbdir=<directory>} {--dbdir-persistent=<directory>} [--event-script-dir=<directory>] [-i --interactive] [--listen=<address>] [--logfile=<filename>] {--nlist=<filename>} [--nosetsched] [--public-addresses=<filename>] [--public-interface=<interface>] {--reclock=<filename>} [--single-public-ip=<address>] [--socket=<filename>] [--syslog] [--torture] [--transport=<STRING>] [--usage]</p></div></div><div class="refsect1" lang="en"><a name="id2506368"></a><h2>DESCRIPTION</h2><p>
ctdbd is the main ctdb daemon.
</p><p>
ctdbd provides a clustered version of the TDB database with automatic rebuild/recovery of the databases upon nodefailures.
@@ -8,58 +8,96 @@
ctdbd provides monitoring of all nodes in the cluster and automatically reconfigures the cluster and recovers upon node failures.
</p><p>
ctdbd is the main component in clustered Samba that provides a high-awailability load-sharing CIFS server cluster.
</p></div><div class="refsect1" lang="en"><a name="id2481092"></a><h2>OPTIONS</h2><div class="variablelist"><dl><dt><span class="term">-? --help</span></dt><dd><p>
</p></div><div class="refsect1" lang="en"><a name="id2506399"></a><h2>OPTIONS</h2><div class="variablelist"><dl><dt><span class="term">-? --help</span></dt><dd><p>
Print some help text to the screen.
</p></dd><dt><span class="term">--usage</span></dt><dd><p>
Print useage information to the screen.
</p></dd><dt><span class="term">--reclock=<filename></span></dt><dd><p>
This is the name of the lock file stored of the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery-master.
This file must be stored on shared storage.
</p></dd><dt><span class="term">--nlist=<filename></span></dt><dd><p>
This file contains a list of the private ip addresses of every node in the cluster. There is one line/ip address for each node. This file must be the same for all nodes in the cluster.
</p><p>
This file is usually /etc/ctdb/nodes .
</p></dd><dt><span class="term">-d --debug=<DEBUGLEVEL></span></dt><dd><p>
This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile. The default is 0 which will only log important events and errors. A larger number will provide additional logging.
</p></dd><dt><span class="term">--dbdir=<directory></span></dt><dd><p>
This is the directory on local storage where ctdbd keeps the local
copy of the TDB databases. This directory is local for each node and should not be stored on the shared cluster filesystem.
</p><p>
This directory would usually be /var/ctdb .
</p></dd><dt><span class="term">-i --interactive</span></dt><dd><p>
By default ctdbd will detach itself from the shell and run in
the background as a daemon. This option makes ctdbd to start in interactive mode.
</p></dd><dt><span class="term">--public_addresses=<filename></span></dt><dd><p>
When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster. This file contains a list of ip addresses netmasks and interfaces. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes.
</p></dd><dt><span class="term">--dbdir-persistent=<directory></span></dt><dd><p>
This is the directory on local storage where ctdbd keeps the local
copy of the persistent TDB databases. This directory is local for each node and should not be stored on the shared cluster filesystem.
</p><p>
This is usually the file /etc/ctdb/public_addresses
This directory would usually be /etc/ctdb/persistent .
</p></dd><dt><span class="term">--event-script-dir=<directory></span></dt><dd><p>
This option is used to specify the directory where the CTDB event
scripts are stored.
</p><p>
This will normally be /etc/ctdb/events.d which is part of the ctdb distribution.
</p></dd><dt><span class="term">--logfile=<filename></span></dt><dd><p>
This is the file where ctdbd will write its log. This is usually /var/log/log.ctdb .
</p></dd><dt><span class="term">-i --interactive</span></dt><dd><p>
By default ctdbd will detach itself from the shell and run in
the background as a daemon. This option makes ctdbd to start in interactive mode.
</p></dd><dt><span class="term">--listen=<address></span></dt><dd><p>
This specifies which ip address ctdb will bind to. By default ctdbd will bind to the first address it finds in the /etc/ctdb/nodes file and which is also present on the local system in which case you do not need to provide this option.
</p><p>
This option is only required when you want to run multiple ctdbd daemons/nodes on the same physical host in which case there would be multiple entries in /etc/ctdb/nodes what would match a local interface.
</p></dd><dt><span class="term">--transport=<STRING></span></dt><dd><p>
This option specifies which transport to use for ctdbd internode communications. The default is "tcp".
</p></dd><dt><span class="term">--logfile=<filename></span></dt><dd><p>
This is the file where ctdbd will write its log. This is usually /var/log/log.ctdb .
</p></dd><dt><span class="term">--nlist=<filename></span></dt><dd><p>
This file contains a list of the private ip addresses of every node in the cluster. There is one line/ip address for each node. This file must be the same for all nodes in the cluster.
</p><p>
Suported transports are "tcp" and "infiniband".
This file is usually /etc/ctdb/nodes .
</p></dd><dt><span class="term">--nosetsched</span></dt><dd><p>
Normally ctdb will change its scheduler to run as a real-time
process. This option is used to change this behaviour and have
ctdb run as a normal process.
</p></dd><dt><span class="term">--public_addresses=<filename></span></dt><dd><p>
When used with IP takeover this specifies a file containing the public ip addresses to use on the cluster. This file contains a list of ip addresses netmasks and interfaces. When ctdb is operational it will distribute these public ip addresses evenly across the available nodes.
</p><p>
This is usually the file /etc/ctdb/public_addresses
</p></dd><dt><span class="term">--public_interface=<interface></span></dt><dd><p>
This option tells ctdb which interface to attach public-addresses
to and also where to attach the single-public-ip when used.
</p></dd><dt><span class="term">--reclock=<filename></span></dt><dd><p>
This is the name of the lock file stored of the shared cluster filesystem that ctdbd uses to arbitrate which node has the role of recovery-master.
This file must be stored on shared storage.
</p></dd><dt><span class="term">--single-public-ip=<address></span></dt><dd><p>
This option is used to activate the "ipmux" functionality of ctdb.
In this mode, all nodes of the cluster will expose a single
ip address from all nodes with all incoming traffic to the cluster
being passed through the current recmaster. This functionality
is similar to using a load-balancing switch.
</p><p>
All incoming packets are sent to the recmaster which will multiplex
the clients across all available nodes and pass the packets on to
a different node in the cluster to manage the connection based
on the clients ip address. Outgoing packets however are sent
directly from the node that was choosen back to the client.
Since all incoming packets are sent through the recmaster this will
have a throughput and performance impact when used. This impact
in performance primarily affects write-performance while
read-performance should be mainly unaffected.
Only use this feature if your environment is mostly-read
(i.e. most traffic is from the nodes back to the clients) or
if it is not important to get maximum write-performance to the
cluster.
</p><p>
When using a single public ip, you must also specify the
public-interface so that ctdb knows which interface to attach the
single public ip to.
</p></dd><dt><span class="term">--socket=<filename></span></dt><dd><p>
This specifies the name of the domain socket that ctdbd will create. This socket is used for local clients to attach to and communicate with the ctdbd daemon.
</p><p>
The default is /tmp/ctdb.socket . You only need to use this option if you plan to run multiple ctdbd daemons on the same physical host.
</p></dd><dt><span class="term">-d --debug=<DEBUGLEVEL></span></dt><dd><p>
This option sets the debuglevel on the ctdbd daemon which controls what will be written to the logfile. The default is 0 which will only log important events and errors. A larger number will provide additional logging.
</p></dd><dt><span class="term">--syslog</span></dt><dd><p>
Send all log messages to syslog instead of to the ctdb logfile.
</p></dd><dt><span class="term">--torture</span></dt><dd><p>
This option is only used for development and testing of ctdbd. It adds artificial errors and failures to the common codepaths in ctdbd to verify that ctdbd can recover correctly for failures.
</p><p>
You do NOT want to use this option unless you are developing and testing new functionality in ctdbd.
</p></dd></dl></div></div><div class="refsect1" lang="en"><a name="id2528417"></a><h2>Private vs Public addresses</h2><p>
</p></dd><dt><span class="term">--transport=<STRING></span></dt><dd><p>
This option specifies which transport to use for ctdbd internode communications. The default is "tcp".
</p><p>
Suported transports are "tcp" and "infiniband".
</p></dd><dt><span class="term">--usage</span></dt><dd><p>
Print useage information to the screen.
</p></dd></dl></div></div><div class="refsect1" lang="en"><a name="id2553928"></a><h2>Private vs Public addresses</h2><p>
When used for ip takeover in a HA environment, each node in a ctdb
cluster has multiple ip addresses assigned to it. One private and one or more public.
</p><div class="refsect2" lang="en"><a name="id2528427"></a><h3>Private address</h3><p>
</p><div class="refsect2" lang="en"><a name="id2553939"></a><h3>Private address</h3><p>
This is the physical ip address of the node which is configured in
linux and attached to a physical interface. This address uniquely
identifies a physical node in the cluster and is the ip addresses
@@ -89,7 +127,7 @@
10.1.1.2
10.1.1.3
10.1.1.4
</pre></div><div class="refsect2" lang="en"><a name="id2528476"></a><h3>Public address</h3><p>
</pre></div><div class="refsect2" lang="en"><a name="id2553987"></a><h3>Public address</h3><p>
A public address on the other hand is not attached to an interface.
This address is managed by ctdbd itself and is attached/detached to
a physical node at runtime.
@@ -150,7 +188,7 @@
unavailable. 10.1.1.1 can not be failed over to node 2 or node 3 since
these nodes do not have this ip address listed in their public
addresses file.
</p></div></div><div class="refsect1" lang="en"><a name="id2528564"></a><h2>Node status</h2><p>
</p></div></div><div class="refsect1" lang="en"><a name="id2554069"></a><h2>Node status</h2><p>
The current status of each node in the cluster can be viewed by the
'ctdb status' command.
</p><p>
@@ -181,10 +219,10 @@
investigated and require an administrative action to rectify. This node
does not perticipate in the CTDB cluster but can still be communicated
with. I.e. ctdb commands can be sent to it.
</p></div><div class="refsect1" lang="en"><a name="id2528621"></a><h2>SEE ALSO</h2><p>
</p></div><div class="refsect1" lang="en"><a name="id2554131"></a><h2>SEE ALSO</h2><p>
ctdb(1), onnode(1)
<a href="http://ctdb.samba.org/" target="_top">http://ctdb.samba.org/</a>
</p></div><div class="refsect1" lang="en"><a name="id2528634"></a><h2>COPYRIGHT/LICENSE</h2><div class="literallayout"><p><br>
<a class="ulink" href="http://ctdb.samba.org/" target="_top">http://ctdb.samba.org/</a>
</p></div><div class="refsect1" lang="en"><a name="id2554144"></a><h2>COPYRIGHT/LICENSE</h2><div class="literallayout"><p><br>
Copyright (C) Andrew Tridgell 2007<br>
Copyright (C) Ronnie sahlberg 2007<br>
<br>
@@ -85,6 +85,11 @@ struct ctdb_call_info {
*/
#define CTDB_SRVID_UNBAN_NODE 0xF600000000000000LL

/*
  a message to tell the recovery daemon to fetch a set of records
*/
#define CTDB_SRVID_VACUUM_FETCH 0xF700000000000000LL


/* used on the domain socket, send a pdu to the local daemon */
#define CTDB_CURRENT_NODE 0xF0000001
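CTDB_SRVID_VACUUM_FETCH is the message id the vacuum tool uses to hand records to the recovery daemon, which registers vacuum_fetch_handler() for it (see the ctdb_recoverd.c hunks below). A sketch of the sending side, assuming the existing ctdb_send_message() client call; the wrapper name and marshalling are illustrative, not from this commit:

/* Sketch: point the recovery daemon on a record's lmaster at a batch of
   records so it can migrate them locally and later delete them. */
static int send_vacuum_fetch(struct ctdb_context *ctdb, uint32_t lmaster,
			     struct ctdb_control_pulldb_reply *recs,
			     size_t size)
{
	TDB_DATA data;

	data.dptr  = (uint8_t *)recs;
	data.dsize = size;

	return ctdb_send_message(ctdb, lmaster, CTDB_SRVID_VACUUM_FETCH, data);
}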
@@ -366,6 +366,7 @@ struct ctdb_context {
	struct _trbt_tree_t *server_ids;
	const char *event_script_dir;
	const char *default_public_interface;
	pid_t ctdbd_pid;
	pid_t recoverd_pid;
	bool done_startup;
	const char *node_ip;
@@ -483,6 +484,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0,
		    CTDB_CONTROL_TRANSACTION_START = 65,
		    CTDB_CONTROL_TRANSACTION_COMMIT = 66,
		    CTDB_CONTROL_WIPE_DATABASE = 67,
		    CTDB_CONTROL_DELETE_RECORD = 68,
};

/*
@@ -1001,6 +1003,21 @@ struct ctdb_control_wipe_database {
	uint32_t transaction_id;
};

/*
  state of a in-progress ctdb call in client
*/
struct ctdb_client_call_state {
	enum call_state state;
	uint32_t reqid;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_call call;
	struct {
		void (*fn)(struct ctdb_client_call_state *);
		void *private;
	} async;
};


int32_t ctdb_control_traverse_start(struct ctdb_context *ctdb, TDB_DATA indata,
				    TDB_DATA *outdata, uint32_t srcnode);
int32_t ctdb_control_traverse_all(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata);
@@ -1186,4 +1203,11 @@ int32_t ctdb_control_transaction_start(struct ctdb_context *ctdb, uint32_t id);
int32_t ctdb_control_transaction_commit(struct ctdb_context *ctdb, uint32_t id);
int32_t ctdb_control_wipe_database(struct ctdb_context *ctdb, TDB_DATA indata);


int ctdb_vacuum(struct ctdb_context *ctdb, int argc, const char **argv);
int ctdb_repack(struct ctdb_context *ctdb, int argc, const char **argv);

int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata);


#endif
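ctdb_vacuum() and ctdb_repack() are the entry points behind the new `ctdb vacuum` and `ctdb repack` commands (matching the tools/ctdb_vacuum.o Makefile hunk at the top and the 1.0.21 changelog entry below). Roughly how the ctdb tool might dispatch to them; this command-table layout is illustrative only, not the exact structure in tools/ctdb.c:

/* Sketch of a command table mapping tool sub-commands to the new
   entry points; the names and fields here are hypothetical. */
static const struct {
	const char *name;
	int (*fn)(struct ctdb_context *, int, const char **);
	const char *help;
} vacuum_commands[] = {
	{ "vacuum", ctdb_vacuum, "vacuum databases of deleted records" },
	{ "repack", ctdb_repack, "repack databases to shrink the freelist" },
};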
@@ -342,3 +342,26 @@ tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_st
	return 0;
}



/*
  return the size of the freelist - used to decide if we should repack
*/
int tdb_freelist_size(struct tdb_context *tdb)
{
	tdb_off_t ptr;
	int count=0;

	if (tdb_lock(tdb, -1, F_RDLCK) == -1) {
		return -1;
	}

	ptr = FREELIST_TOP;
	while (ptr != 0 && tdb_ofs_read(tdb, ptr, &ptr) == 0) {
		count++;
	}

	tdb_unlock(tdb, -1, F_RDLCK);
	return count;
}
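tdb_freelist_size() walks the freelist chain under a read lock purely to count entries; per its own comment, the caller uses the count to decide whether a repack is worthwhile. A sketch of that decision, with an illustrative threshold (the actual limit is whatever the repack caller chooses):

/* Sketch: repack once the freelist has grown past some threshold,
   since a long freelist means the tdb is badly fragmented. */
#define MY_REPACK_THRESHOLD 10000	/* hypothetical value */

static bool should_repack(struct tdb_context *tdb)
{
	int free_entries = tdb_freelist_size(tdb);
	if (free_entries == -1) {
		return false;	/* could not take the freelist read lock */
	}
	return free_entries > MY_REPACK_THRESHOLD;
}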
@@ -156,6 +156,7 @@ void tdb_dump_all(struct tdb_context *tdb);
int tdb_printfreelist(struct tdb_context *tdb);
int tdb_validate_freelist(struct tdb_context *tdb, int *pnum_entries);
int tdb_wipe_all(struct tdb_context *tdb);
int tdb_freelist_size(struct tdb_context *tdb);

extern TDB_DATA tdb_null;
@@ -5,7 +5,7 @@ Vendor: Samba Team
Packager: Samba Team <samba@samba.org>
Name: ctdb
Version: 1.0
Release: 20
Release: 21
Epoch: 0
License: GNU GPL version 3
Group: System Environment/Daemons
@@ -118,6 +118,8 @@ fi
%{_includedir}/ctdb_private.h

%changelog
* Wed Jan 09 2008 : Version 1.0.21
- added ctdb vacuum and ctdb repack code
* Sun Jan 06 2008 : Version 1.0.20
- new transaction based recovery code
* Sat Jan 05 2008 : Version 1.0.19
@@ -321,6 +321,9 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
		CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_wipe_database));
		return ctdb_control_wipe_database(ctdb, indata);

	case CTDB_CONTROL_DELETE_RECORD:
		return ctdb_control_delete_record(ctdb, indata);

	default:
		DEBUG(0,(__location__ " Unknown CTDB control opcode %u\n", opcode));
		return -1;
@@ -509,6 +509,11 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
		return -1;
	}

	if (recmode != ctdb->recovery_mode) {
		DEBUG(0,(__location__ " Recovery mode set to %s\n",
			 recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));
	}

	if (recmode != CTDB_RECOVERY_NORMAL ||
	    ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
		ctdb->recovery_mode = recmode;
@@ -631,3 +636,106 @@ bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
}


/*
  delete a record as part of the vacuum process
  only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
  use non-blocking locks
*/
int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata)
{
	struct ctdb_rec_data *rec = (struct ctdb_rec_data *)indata.dptr;
	struct ctdb_db_context *ctdb_db;
	TDB_DATA key, data;
	struct ctdb_ltdb_header *hdr, *hdr2;

	/* these are really internal tdb functions - but we need them here for
	   non-blocking lock of the freelist */
	int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
	int tdb_unlock(struct tdb_context *tdb, int list, int ltype);

	if (indata.dsize < sizeof(uint32_t) || indata.dsize != rec->length) {
		DEBUG(0,(__location__ " Bad record size in ctdb_control_delete_record\n"));
		return -1;
	}

	ctdb_db = find_ctdb_db(ctdb, rec->reqid);
	if (!ctdb_db) {
		DEBUG(0,(__location__ " Unknown db 0x%08x\n", rec->reqid));
		return -1;
	}

	key.dsize = rec->keylen;
	key.dptr = &rec->data[0];
	data.dsize = rec->datalen;
	data.dptr = &rec->data[rec->keylen];

	if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
		DEBUG(2,(__location__ " Called delete on record where we are lmaster\n"));
		return -1;
	}

	if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
		DEBUG(0,(__location__ " Bad record size\n"));
		return -1;
	}

	hdr = (struct ctdb_ltdb_header *)data.dptr;

	/* use a non-blocking lock */
	if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
		return -1;
	}

	data = tdb_fetch(ctdb_db->ltdb->tdb, key);
	if (data.dptr == NULL) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		return 0;
	}

	if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
		if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
			tdb_delete(ctdb_db->ltdb->tdb, key);
			tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
			DEBUG(0,(__location__ " Deleted corrupt record\n"));
		}
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		free(data.dptr);
		return 0;
	}

	hdr2 = (struct ctdb_ltdb_header *)data.dptr;

	if (hdr2->rsn > hdr->rsn) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DEBUG(2,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
			 (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));
		free(data.dptr);
		return -1;
	}

	if (hdr2->dmaster == ctdb->pnn) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DEBUG(2,(__location__ " Attempted delete record where we are the dmaster\n"));
		free(data.dptr);
		return -1;
	}

	if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		free(data.dptr);
		return -1;
	}

	if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
		tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		DEBUG(2,(__location__ " Failed to delete record\n"));
		free(data.dptr);
		return -1;
	}

	tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
	tdb_chainunlock(ctdb_db->ltdb->tdb, key);
	free(data.dptr);
	return 0;
}
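ctdb_control_delete_record() above unpacks its input as a struct ctdb_rec_data: a fixed header carrying length, reqid (reused here for the db_id), keylen and datalen, followed by the key bytes and then the data bytes, where the data is the record's ctdb_ltdb_header whose rsn is compared against the local copy. A sketch of the matching packing side, inferred from that unpacking; pack_delete_record is a hypothetical helper, and offsetof comes from <stddef.h>:

/* Sketch: build the wire blob for CTDB_CONTROL_DELETE_RECORD. */
static struct ctdb_rec_data *pack_delete_record(TALLOC_CTX *mem_ctx,
						uint32_t db_id, TDB_DATA key,
						struct ctdb_ltdb_header *hdr)
{
	size_t len = offsetof(struct ctdb_rec_data, data) +
		     key.dsize + sizeof(*hdr);
	struct ctdb_rec_data *rec = talloc_size(mem_ctx, len);

	if (rec == NULL) {
		return NULL;
	}
	rec->length  = len;
	rec->reqid   = db_id;		/* receiver looks the db up via reqid */
	rec->keylen  = key.dsize;
	rec->datalen = sizeof(*hdr);	/* receiver insists on exactly one header */
	memcpy(&rec->data[0], key.dptr, key.dsize);
	memcpy(&rec->data[key.dsize], hdr, sizeof(*hdr));
	return rec;
}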
@ -28,6 +28,7 @@
|
||||
#include "../include/ctdb.h"
|
||||
#include "../include/ctdb_private.h"
|
||||
#include "db_wrap.h"
|
||||
#include "dlinklist.h"
|
||||
|
||||
|
||||
struct ban_state {
|
||||
@ -50,6 +51,7 @@ struct ctdb_recoverd {
|
||||
uint32_t node_flags;
|
||||
struct timed_event *send_election_te;
|
||||
struct timed_event *election_timeout;
|
||||
struct vacuum_info *vacuum_info;
|
||||
};
|
||||
|
||||
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
|
||||
@ -701,6 +703,190 @@ static void unban_handler(struct ctdb_context *ctdb, uint64_t srvid,
|
||||
}
|
||||
|
||||
|
||||
struct vacuum_info {
	struct vacuum_info *next, *prev;
	struct ctdb_recoverd *rec;
	uint32_t srcnode;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_control_pulldb_reply *recs;
	struct ctdb_rec_data *r;
};

static void vacuum_fetch_next(struct vacuum_info *v);

/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	struct vacuum_info *v = talloc_get_type(state->async.private, struct vacuum_info);
	talloc_free(state);
	vacuum_fetch_next(v);
}


/*
  process the next element from the vacuum list
 */
static void vacuum_fetch_next(struct vacuum_info *v)
{
	struct ctdb_call call;
	struct ctdb_rec_data *r;

	while (v->recs->count) {
		struct ctdb_client_call_state *state;
		TDB_DATA data;
		struct ctdb_ltdb_header *hdr;

		ZERO_STRUCT(call);
		call.call_id = CTDB_NULL_FUNC;
		call.flags = CTDB_IMMEDIATE_MIGRATION;

		r = v->r;
		v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
		v->recs->count--;

		call.key.dptr = &r->data[0];
		call.key.dsize = r->keylen;

		/* ensure we don't block this daemon - just skip a record if we can't get
		   the chainlock */
		if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
			continue;
		}

		data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
		if (data.dptr == NULL || data.dsize < sizeof(struct ctdb_ltdb_header)) {
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			free(data.dptr);
			continue;
		}

		hdr = (struct ctdb_ltdb_header *)data.dptr;
		if (hdr->dmaster == v->rec->ctdb->pnn) {
			/* it's already local */
			tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
			free(data.dptr);
			continue;
		}
		free(data.dptr);

		state = ctdb_call_send(v->ctdb_db, &call);
		tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
		if (state == NULL) {
			DEBUG(0,(__location__ " Failed to set up vacuum fetch call\n"));
			talloc_free(v);
			return;
		}
		state->async.fn = vacuum_fetch_callback;
		state->async.private = v;
		return;
	}

	talloc_free(v);
}

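vacuum_fetch_next steps through the marshalled blob purely by pointer arithmetic: each record starts with its total length, so adding r->length to the record's own address yields the next record. A runnable sketch of that walk, using a simplified stand-in layout (the real struct ctdb_rec_data also carries reqid and datalen fields):

#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdint.h>

/* simplified stand-in for struct ctdb_rec_data - illustration only */
struct rec {
	uint32_t length;	/* total size of this record, header included */
	uint32_t keylen;
	uint8_t  data[1];	/* key bytes follow */
};

static void walk(uint8_t *blob, uint32_t count)
{
	struct rec *r = (struct rec *)blob;
	while (count--) {
		printf("key: %.*s\n", (int)r->keylen, (char *)&r->data[0]);
		/* step to the next packed record */
		r = (struct rec *)(r->length + (uint8_t *)r);
	}
}

int main(void)
{
	uint8_t blob[64];
	uint8_t *p = blob;
	const char *keys[] = { "abc", "de" };
	int i;

	for (i = 0; i < 2; i++) {
		struct rec *r = (struct rec *)p;
		r->keylen = strlen(keys[i]);
		/* pad so the next header stays 4-byte aligned */
		r->length = (offsetof(struct rec, data) + r->keylen + 3) & ~3u;
		memcpy(&r->data[0], keys[i], r->keylen);
		p += r->length;
	}
	walk(blob, 2);
	return 0;
}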
/*
  destroy a vacuum info structure
 */
static int vacuum_info_destructor(struct vacuum_info *v)
{
	DLIST_REMOVE(v->rec->vacuum_info, v);
	return 0;
}

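talloc runs this destructor just before the memory is released, so a plain talloc_free() on a vacuum_info is enough to unlink it from rec->vacuum_info. The same self-unlinking pattern in isolation, as a runnable sketch (the node type and list head are illustrative, standing in for vacuum_info and DLIST):

#include <stdio.h>
#include <talloc.h>

struct node {
	struct node *next, *prev;
	struct node **list;	/* head pointer we are linked into */
};

/* runs automatically from talloc_free(), before the memory goes away */
static int node_destructor(struct node *n)
{
	if (n->prev) n->prev->next = n->next;
	else *n->list = n->next;
	if (n->next) n->next->prev = n->prev;
	return 0;	/* 0 = allow the free to proceed */
}

int main(void)
{
	struct node *head = NULL;
	struct node *n = talloc_zero(NULL, struct node);
	n->list = &head;
	n->next = head;
	head = n;
	talloc_set_destructor(n, node_destructor);
	talloc_free(n);		/* unlinks itself first */
	printf("head=%p\n", (void *)head);
	return 0;
}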
/*
  handler for vacuum fetch
 */
static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid,
				 TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
	struct ctdb_control_pulldb_reply *recs;
	int ret, i;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	const char *name;
	struct ctdb_dbid_map *dbmap=NULL;
	bool persistent = false;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_rec_data *r;
	uint32_t srcnode;
	struct vacuum_info *v;

	recs = (struct ctdb_control_pulldb_reply *)data.dptr;
	r = (struct ctdb_rec_data *)&recs->data[0];

	if (recs->count == 0) {
		talloc_free(tmp_ctx);
		return;
	}

	srcnode = r->reqid;

	for (v=rec->vacuum_info;v;v=v->next) {
		if (srcnode == v->srcnode) {
			/* we're already working on records from this node */
			talloc_free(tmp_ctx);
			return;
		}
	}

	/* work out if the database is persistent */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(0, (__location__ " Unable to get dbids from local node\n"));
		talloc_free(tmp_ctx);
		return;
	}

	for (i=0;i<dbmap->num;i++) {
		if (dbmap->dbs[i].dbid == recs->db_id) {
			persistent = dbmap->dbs[i].persistent;
			break;
		}
	}
	if (i == dbmap->num) {
		DEBUG(0, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
		talloc_free(tmp_ctx);
		return;
	}

	/* find the name of this database */
	if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
		DEBUG(0,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
		talloc_free(tmp_ctx);
		return;
	}

	/* attach to it */
	ctdb_db = ctdb_attach(ctdb, name, persistent);
	if (ctdb_db == NULL) {
		DEBUG(0,(__location__ " Failed to attach to database '%s'\n", name));
		talloc_free(tmp_ctx);
		return;
	}

	v = talloc_zero(rec, struct vacuum_info);
	if (v == NULL) {
		DEBUG(0,(__location__ " Out of memory\n"));
		talloc_free(tmp_ctx);
		return;
	}

	v->rec = rec;
	v->srcnode = srcnode;
	v->ctdb_db = ctdb_db;
	v->recs = talloc_memdup(v, recs, data.dsize);
	if (v->recs == NULL) {
		DEBUG(0,(__location__ " Out of memory\n"));
		talloc_free(v);
		talloc_free(tmp_ctx);
		return;
	}
	v->r = (struct ctdb_rec_data *)&v->recs->data[0];

	DLIST_ADD(rec->vacuum_info, v);

	talloc_set_destructor(v, vacuum_info_destructor);

	vacuum_fetch_next(v);
	talloc_free(tmp_ctx);
}

/*
  called when ctdb_wait_timeout should finish
@@ -1806,6 +1992,9 @@ static void monitor_cluster(struct ctdb_context *ctdb)

	/* and one for when nodes are unbanned */
	ctdb_set_message_handler(ctdb, CTDB_SRVID_UNBAN_NODE, unban_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

again:
	if (mem_ctx) {
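The vacuum fetch port follows the generic CTDB message pattern: a handler bound to a 64-bit srvid, delivered the raw TDB_DATA payload. A hedged sketch of that pattern with a made-up srvid (MY_SRVID, my_handler and dest_pnn are illustrative, not CTDB constants):

/* illustrative srvid - not a real CTDB constant */
#define MY_SRVID 0xFE000001ULL

static void my_handler(struct ctdb_context *ctdb, uint64_t srvid,
		       TDB_DATA data, void *private_data)
{
	DEBUG(0,("got %u bytes on srvid %llu\n",
		 (unsigned)data.dsize, (unsigned long long)srvid));
}

/* after connecting to the daemon: */
ctdb_set_message_handler(ctdb, MY_SRVID, my_handler, NULL);

/* any node can then deliver to it (dest_pnn obtained elsewhere): */
TDB_DATA msg = { .dptr = (uint8_t *)"ping", .dsize = 4 };
ctdb_send_message(ctdb, dest_pnn, MY_SRVID, msg);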
@@ -1821,6 +2010,12 @@ again:
	/* we only check for recovery once every second */
	ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval);

	/* verify that the main daemon is still running */
	if (kill(ctdb->ctdbd_pid, 0) != 0) {
		DEBUG(0,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
		exit(-1);
	}

	if (rec->election_timeout) {
		/* an election is in progress */
		goto again;
@@ -2275,6 +2470,8 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb)
		return -1;
	}

	ctdb->ctdbd_pid = getpid();

	ctdb->recoverd_pid = fork();
	if (ctdb->recoverd_pid == -1) {
		return -1;
@@ -1025,7 +1025,6 @@ static int control_dumpmemory(struct ctdb_context *ctdb, int argc, const char **
			   CTDB_CTRL_FLAG_NOREPLY, tdb_null, NULL, NULL, NULL, NULL, NULL);
}


static const struct {
	const char *name;
	int (*fn)(struct ctdb_context *, int, const char **);
@@ -1068,6 +1067,8 @@ static const struct {
	{ "unregsrvid", unregsrvid, false, "unregister a server id", "<pnn> <type> <id>" },
	{ "chksrvid", chksrvid, false, "check if a server id exists", "<pnn> <type> <id>" },
	{ "getsrvids", getsrvids, false, "get a list of all server ids"},
	{ "vacuum", ctdb_vacuum, false, "vacuum the databases of empty records", "[max_records]"},
	{ "repack", ctdb_repack, false, "repack all databases", "[max_freelist]"},
};

/*
@@ -1116,6 +1117,8 @@ int main(int argc, const char *argv[])
	struct event_context *ev;
	const char *control;

	setlinebuf(stdout);

	/* set some defaults */
	options.timelimit = 3;
	options.pnn = CTDB_CURRENT_NODE;
ctdb/tools/ctdb_vacuum.c (new file, 633 lines)
@@ -0,0 +1,633 @@
/*
   ctdb control tool - database vacuum

   Copyright (C) Andrew Tridgell 2008

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/

#include "includes.h"
#include "lib/events/events.h"
#include "system/filesys.h"
#include "system/network.h"
#include "../include/ctdb.h"
#include "../include/ctdb_private.h"
#include "db_wrap.h"

/* should be tunable */
#define TIMELIMIT() timeval_current_ofs(10, 0)

struct async_data {
	uint32_t count;
	uint32_t fail_count;
};

static void async_callback(struct ctdb_client_control_state *state)
{
	struct async_data *data = talloc_get_type(state->async.private_data, struct async_data);
	int ret;
	int32_t res;

	/* one more node has responded to our control */
	data->count--;

	/* if the control failed on a node, count it as a failure and
	   let the caller decide whether to retry the whole pass
	 */
	if (state->state != CTDB_CONTROL_DONE) {
		data->fail_count++;
		return;
	}

	state->async.fn = NULL;

	ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL);
	if ((ret != 0) || (res != 0)) {
		data->fail_count++;
	}
}

static void async_add(struct async_data *data, struct ctdb_client_control_state *state)
{
	/* set up the callback functions */
	state->async.fn = async_callback;
	state->async.private_data = data;

	/* one more control to wait for */
	data->count++;
}


/* wait until all nodes we expect a response from have replied; the
   per-control timeouts passed to ctdb_control_send() bound how long
   this can take
 */
static int async_wait(struct ctdb_context *ctdb, struct async_data *data)
{
	while (data->count > 0) {
		event_loop_once(ctdb->ev);
	}
	if (data->fail_count != 0) {
		return -1;
	}
	return 0;
}
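async_wait relies on each control's own timeout to guarantee the loop terminates. If an explicit overall deadline is wanted instead, the same loop can be bounded; a sketch, assuming the timeval_expired() helper from the samba utility code:

/* bounded variant of async_wait - stops at 'deadline' even if some
   replies are still outstanding */
static int async_wait_until(struct ctdb_context *ctdb, struct async_data *data,
			    struct timeval deadline)
{
	while (data->count > 0 && !timeval_expired(&deadline)) {
		event_loop_once(ctdb->ev);
	}
	return (data->count == 0 && data->fail_count == 0) ? 0 : -1;
}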
/*
  perform a simple control on all nodes in the vnn map except ourselves.
  The control must not return data.
 */
static int async_control_on_vnnmap(struct ctdb_context *ctdb, enum ctdb_controls opcode,
				   TDB_DATA data)
{
	struct async_data *async_data;
	struct ctdb_client_control_state *state;
	int j;
	struct timeval timeout = TIMELIMIT();

	async_data = talloc_zero(ctdb, struct async_data);
	CTDB_NO_MEMORY_FATAL(ctdb, async_data);

	/* loop over all active nodes and send an async control to each of them */
	for (j=0; j<ctdb->vnn_map->size; j++) {
		uint32_t pnn = ctdb->vnn_map->map[j];
		if (pnn == ctdb->pnn) {
			continue;
		}
		state = ctdb_control_send(ctdb, pnn, 0, opcode,
					  0, data, async_data, NULL, &timeout, NULL);
		if (state == NULL) {
			DEBUG(0,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
			talloc_free(async_data);
			return -1;
		}

		async_add(async_data, state);
	}

	if (async_wait(ctdb, async_data) != 0) {
		talloc_free(async_data);
		return -1;
	}

	talloc_free(async_data);
	return 0;
}

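Any no-reply control whose payload fits in one flat TDB_DATA can ride this helper. ctdb_vacuum_one below uses it to broadcast CTDB_CONTROL_DELETE_RECORD; the call shape is simply:

	/* sketch: 'rec' is a marshalled record, as in ctdb_vacuum_one below */
	data.dptr  = (void *)rec;
	data.dsize = rec->length;
	if (async_control_on_vnnmap(ctdb, CTDB_CONTROL_DELETE_RECORD, data) != 0) {
		/* at least one node refused - the record stays for a later pass */
	}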
/*
  vacuum one record
 */
static int ctdb_vacuum_one(struct ctdb_context *ctdb, TDB_DATA key,
			   struct ctdb_db_context *ctdb_db, uint32_t *count)
{
	TDB_DATA data;
	struct ctdb_ltdb_header *hdr;
	struct ctdb_rec_data *rec;
	uint64_t rsn;

	if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
		/* the chain is busy - come back later */
		return 0;
	}

	data = tdb_fetch(ctdb_db->ltdb->tdb, key);
	tdb_chainunlock(ctdb_db->ltdb->tdb, key);
	if (data.dptr == NULL) {
		return 0;
	}
	if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
		free(data.dptr);
		return 0;
	}


	hdr = (struct ctdb_ltdb_header *)data.dptr;
	rsn = hdr->rsn;

	/* skip the record unless we are both the dmaster and the lmaster */
	if (hdr->dmaster != ctdb->pnn ||
	    ctdb_lmaster(ctdb, &key) != ctdb->pnn) {
		free(data.dptr);
		return 0;
	}

	rec = ctdb_marshall_record(ctdb, ctdb_db->db_id, key, hdr, tdb_null);
	free(data.dptr);
	if (rec == NULL) {
		/* try it again later */
		return 0;
	}

	data.dptr = (void *)rec;
	data.dsize = rec->length;

	if (async_control_on_vnnmap(ctdb, CTDB_CONTROL_DELETE_RECORD, data) != 0) {
		/* one or more nodes failed to delete the record - leave it
		   for a later pass */
		talloc_free(rec);
		return 0;
	}

	talloc_free(rec);

	/* it's deleted on all other nodes - refetch, check and delete locally */
	if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
		/* the chain is busy - come back later */
		return 0;
	}

	data = tdb_fetch(ctdb_db->ltdb->tdb, key);
	if (data.dptr == NULL) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		return 0;
	}
	if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
		free(data.dptr);
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		return 0;
	}

	hdr = (struct ctdb_ltdb_header *)data.dptr;

	/* skip the record unless we are still both dmaster and lmaster
	   and the rsn is unchanged */
	if (hdr->dmaster != ctdb->pnn ||
	    ctdb_lmaster(ctdb, &key) != ctdb->pnn ||
	    rsn != hdr->rsn) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, key);
		free(data.dptr);
		return 0;
	}

	tdb_delete(ctdb_db->ltdb->tdb, key);
	tdb_chainunlock(ctdb_db->ltdb->tdb, key);
	free(data.dptr);

	(*count)++;

	return 0;
}

/*
  vacuum records for which we are the lmaster
 */
static int ctdb_vacuum_local(struct ctdb_context *ctdb, struct ctdb_control_pulldb_reply *list,
			     struct ctdb_db_context *ctdb_db, uint32_t *count)
{
	struct ctdb_rec_data *r;
	int i;

	r = (struct ctdb_rec_data *)&list->data[0];

	for (i=0;
	     i<list->count;
	     r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r), i++) {
		TDB_DATA key;
		key.dptr = &r->data[0];
		key.dsize = r->keylen;
		if (ctdb_vacuum_one(ctdb, key, ctdb_db, count) != 0) {
			return -1;
		}
	}

	return 0;
}

/*
  a list of records to possibly delete
 */
struct vacuum_data {
	uint32_t vacuum_limit;
	struct ctdb_context *ctdb;
	struct ctdb_control_pulldb_reply **list;
	bool traverse_error;
	uint32_t total;
};

/*
  traverse function for vacuuming
 */
static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
{
	struct vacuum_data *vdata = talloc_get_type(private, struct vacuum_data);
	uint32_t lmaster;
	struct ctdb_ltdb_header *hdr;
	struct ctdb_rec_data *rec;
	size_t old_size;

	lmaster = ctdb_lmaster(vdata->ctdb, &key);
	if (lmaster >= vdata->ctdb->vnn_map->size) {
		return 0;
	}

	if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
		/* it's not a deleted record */
		return 0;
	}

	hdr = (struct ctdb_ltdb_header *)data.dptr;

	if (hdr->dmaster != vdata->ctdb->pnn) {
		return 0;
	}


	/* add the record to the blob ready to send to the nodes */
	rec = ctdb_marshall_record(vdata->list[lmaster], vdata->ctdb->pnn, key, NULL, tdb_null);
	if (rec == NULL) {
		DEBUG(0,(__location__ " Out of memory\n"));
		vdata->traverse_error = true;
		return -1;
	}
	old_size = talloc_get_size(vdata->list[lmaster]);
	vdata->list[lmaster] = talloc_realloc_size(NULL, vdata->list[lmaster],
						   old_size + rec->length);
	if (vdata->list[lmaster] == NULL) {
		DEBUG(0,(__location__ " Failed to expand record list\n"));
		vdata->traverse_error = true;
		return -1;
	}
	vdata->list[lmaster]->count++;
	memcpy(old_size+(uint8_t *)vdata->list[lmaster], rec, rec->length);
	talloc_free(rec);

	vdata->total++;

	/* don't gather too many records */
	if (vdata->vacuum_limit != 0 &&
	    vdata->total == vdata->vacuum_limit) {
		return -1;
	}

	return 0;
}

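The grow-and-append step above (talloc_get_size to find the old end, talloc_realloc_size to extend, memcpy at the old end) is a general talloc idiom. In isolation, as a runnable sketch:

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <talloc.h>

/* append 'len' bytes to a talloc'd buffer, growing it in place */
static uint8_t *buf_append(TALLOC_CTX *ctx, uint8_t *buf,
			   const void *bytes, size_t len)
{
	size_t old_size = talloc_get_size(buf);
	buf = talloc_realloc_size(ctx, buf, old_size + len);
	if (buf == NULL) {
		return NULL;
	}
	memcpy(buf + old_size, bytes, len);
	return buf;
}

int main(void)
{
	uint8_t *buf = talloc_size(NULL, 0);
	buf = buf_append(NULL, buf, "abc", 3);
	buf = buf_append(NULL, buf, "de", 2);
	printf("%zu bytes: %.5s\n", talloc_get_size(buf), (char *)buf);
	talloc_free(buf);
	return 0;
}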
/* vacuum one database */
static int ctdb_vacuum_db(struct ctdb_context *ctdb, uint32_t db_id, struct ctdb_node_map *map,
			  bool persistent, uint32_t vacuum_limit)
{
	struct ctdb_db_context *ctdb_db;
	const char *name;
	struct vacuum_data *vdata;
	int i;

	vdata = talloc_zero(ctdb, struct vacuum_data);
	if (vdata == NULL) {
		DEBUG(0,(__location__ " Out of memory\n"));
		return -1;
	}

	vdata->ctdb = ctdb;
	vdata->vacuum_limit = vacuum_limit;

	if (ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, db_id, vdata, &name) != 0) {
		DEBUG(0,(__location__ " Failed to get name of db 0x%x\n", db_id));
		talloc_free(vdata);
		return -1;
	}

	ctdb_db = ctdb_attach(ctdb, name, persistent);
	if (ctdb_db == NULL) {
		DEBUG(0,(__location__ " Failed to attach to database '%s'\n", name));
		talloc_free(vdata);
		return -1;
	}

	/* the list needs to be of length num_nodes */
	vdata->list = talloc_array(vdata, struct ctdb_control_pulldb_reply *, ctdb->vnn_map->size);
	if (vdata->list == NULL) {
		DEBUG(0,(__location__ " Out of memory\n"));
		talloc_free(vdata);
		return -1;
	}
	for (i=0;i<ctdb->vnn_map->size;i++) {
		vdata->list[i] = (struct ctdb_control_pulldb_reply *)
			talloc_zero_size(vdata->list,
					 offsetof(struct ctdb_control_pulldb_reply, data));
		if (vdata->list[i] == NULL) {
			DEBUG(0,(__location__ " Out of memory\n"));
			talloc_free(vdata);
			return -1;
		}
		vdata->list[i]->db_id = db_id;
	}

	/* traverse, looking for records that might be able to be vacuumed */
	if (tdb_traverse_read(ctdb_db->ltdb->tdb, vacuum_traverse, vdata) == -1 ||
	    vdata->traverse_error) {
		DEBUG(0,(__location__ " Traverse error in vacuuming '%s'\n", name));
		talloc_free(vdata);
		return -1;
	}


	for (i=0;i<ctdb->vnn_map->size;i++) {
		if (vdata->list[i]->count == 0) {
			continue;
		}

		/* for records where we are not the lmaster, tell the lmaster to fetch the record */
		if (ctdb->vnn_map->map[i] != ctdb->pnn) {
			TDB_DATA data;
			printf("Found %u records for lmaster %u in '%s'\n", vdata->list[i]->count, i, name);

			data.dsize = talloc_get_size(vdata->list[i]);
			data.dptr  = (void *)vdata->list[i];
			if (ctdb_send_message(ctdb, ctdb->vnn_map->map[i], CTDB_SRVID_VACUUM_FETCH, data) != 0) {
				DEBUG(0,(__location__ " Failed to send vacuum fetch message to %u\n",
					 ctdb->vnn_map->map[i]));
				talloc_free(vdata);
				return -1;
			}
			continue;
		}
	}

	for (i=0;i<ctdb->vnn_map->size;i++) {
		uint32_t count = 0;

		if (vdata->list[i]->count == 0) {
			continue;
		}

		/* for records where we are the lmaster, we can try to delete them */
		if (ctdb_vacuum_local(ctdb, vdata->list[i], ctdb_db, &count) != 0) {
			DEBUG(0,(__location__ " Deletion error in vacuuming '%s'\n", name));
			talloc_free(vdata);
			return -1;
		}
		if (count != 0) {
			printf("Deleted %u records on this node from '%s'\n", count, name);
		}
	}

	/* this ensures we run our event queue */
	ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);

	talloc_free(vdata);

	return 0;
}

/*
  vacuum all our databases
 */
int ctdb_vacuum(struct ctdb_context *ctdb, int argc, const char **argv)
{
	struct ctdb_dbid_map *dbmap=NULL;
	struct ctdb_node_map *nodemap=NULL;
	int ret, i, pnn;
	uint32_t vacuum_limit = 0;

	if (argc > 0) {
		vacuum_limit = atoi(argv[0]);
	}

	ret = ctdb_ctrl_getdbmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &dbmap);
	if (ret != 0) {
		DEBUG(0, ("Unable to get dbids from local node\n"));
		return ret;
	}

	ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap);
	if (ret != 0) {
		DEBUG(0, ("Unable to get nodemap from local node\n"));
		return ret;
	}

	ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
	if (ret != 0) {
		DEBUG(0, ("Unable to get vnnmap from local node\n"));
		return ret;
	}

	pnn = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
	if (pnn == -1) {
		DEBUG(0, ("Unable to get pnn from local node\n"));
		return -1;
	}
	ctdb->pnn = pnn;

	for (i=0;i<dbmap->num;i++) {
		if (ctdb_vacuum_db(ctdb, dbmap->dbs[i].dbid, nodemap,
				   dbmap->dbs[i].persistent, vacuum_limit) != 0) {
			DEBUG(0,("Failed to vacuum db 0x%x\n", dbmap->dbs[i].dbid));
			return -1;
		}
	}

	return 0;
}

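ctdb_vacuum and ctdb_repack are wired to the "vacuum" and "repack" entries in the ctdb tool's command table shown earlier, so from the shell the operations look like this (the numeric arguments are the optional limits):

	ctdb vacuum 1000	# gather at most 1000 deleted records per database
	ctdb repack 10000	# repack databases whose freelist exceeds 10000 entries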
struct traverse_state {
	bool error;
	struct tdb_context *dest_db;
};

/*
  traverse function for repacking
 */
static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
{
	struct traverse_state *state = (struct traverse_state *)private;
	if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
		state->error = true;
		return -1;
	}
	return 0;
}

/*
  repack a tdb
 */
static int ctdb_repack_tdb(struct tdb_context *tdb)
{
	struct tdb_context *tmp_db;
	struct traverse_state state;

	if (tdb_transaction_start(tdb) != 0) {
		DEBUG(0,(__location__ " Failed to start transaction\n"));
		return -1;
	}

	tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
	if (tmp_db == NULL) {
		DEBUG(0,(__location__ " Failed to create tmp_db\n"));
		tdb_transaction_cancel(tdb);
		return -1;
	}

	state.error = false;
	state.dest_db = tmp_db;

	if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
		DEBUG(0,(__location__ " Failed to traverse copying out\n"));
		tdb_transaction_cancel(tdb);
		tdb_close(tmp_db);
		return -1;
	}

	if (state.error) {
		DEBUG(0,(__location__ " Error during traversal\n"));
		tdb_transaction_cancel(tdb);
		tdb_close(tmp_db);
		return -1;
	}

	if (tdb_wipe_all(tdb) != 0) {
		DEBUG(0,(__location__ " Failed to wipe database\n"));
		tdb_transaction_cancel(tdb);
		tdb_close(tmp_db);
		return -1;
	}

	state.error = false;
	state.dest_db = tdb;

	if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
		DEBUG(0,(__location__ " Failed to traverse copying back\n"));
		tdb_transaction_cancel(tdb);
		tdb_close(tmp_db);
		return -1;
	}

	if (state.error) {
		DEBUG(0,(__location__ " Error during second traversal\n"));
		tdb_transaction_cancel(tdb);
		tdb_close(tmp_db);
		return -1;
	}

	tdb_close(tmp_db);

	if (tdb_transaction_commit(tdb) != 0) {
		DEBUG(0,(__location__ " Failed to commit\n"));
		return -1;
	}

	return 0;
}

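Because every step of ctdb_repack_tdb happens inside one tdb transaction, a crash or error anywhere before the commit leaves the original database untouched. The skeleton of that all-or-nothing pattern, as a runnable sketch against a plain tdb:

#include <stdio.h>
#include <fcntl.h>
#include <tdb.h>

/* apply a destructive rewrite to 'tdb' atomically: either every step
   lands, or none do */
static int rewrite_atomically(struct tdb_context *tdb)
{
	if (tdb_transaction_start(tdb) != 0) {
		return -1;
	}
	if (tdb_wipe_all(tdb) != 0) {
		tdb_transaction_cancel(tdb);	/* nothing is lost */
		return -1;
	}
	/* ... repopulate the database here ... */
	return tdb_transaction_commit(tdb);	/* all-or-nothing */
}

int main(void)
{
	struct tdb_context *tdb = tdb_open("repack.tdb", 0, 0, O_RDWR|O_CREAT, 0600);
	if (tdb == NULL) {
		return 1;
	}
	printf("rewrite: %d\n", rewrite_atomically(tdb));
	tdb_close(tdb);
	return 0;
}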
/* repack one database */
static int ctdb_repack_db(struct ctdb_context *ctdb, uint32_t db_id,
			  bool persistent, uint32_t repack_limit)
{
	struct ctdb_db_context *ctdb_db;
	const char *name;
	int size;

	if (ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, db_id, ctdb, &name) != 0) {
		DEBUG(0,(__location__ " Failed to get name of db 0x%x\n", db_id));
		return -1;
	}

	ctdb_db = ctdb_attach(ctdb, name, persistent);
	if (ctdb_db == NULL) {
		DEBUG(0,(__location__ " Failed to attach to database '%s'\n", name));
		return -1;
	}

	size = tdb_freelist_size(ctdb_db->ltdb->tdb);
	if (size == -1) {
		DEBUG(0,(__location__ " Failed to get freelist size for '%s'\n", name));
		return -1;
	}

	if (size <= repack_limit) {
		return 0;
	}

	printf("Repacking %s with %u freelist entries\n", name, size);

	if (ctdb_repack_tdb(ctdb_db->ltdb->tdb) != 0) {
		DEBUG(0,(__location__ " Failed to repack '%s'\n", name));
		return -1;
	}

	return 0;
}

/*
  repack all our databases
 */
int ctdb_repack(struct ctdb_context *ctdb, int argc, const char **argv)
{
	struct ctdb_dbid_map *dbmap=NULL;
	int ret, i;
	/* a reasonable default limit to prevent us using too much memory */
	uint32_t repack_limit = 10000;

	if (argc > 0) {
		repack_limit = atoi(argv[0]);
	}

	ret = ctdb_ctrl_getdbmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &dbmap);
	if (ret != 0) {
		DEBUG(0, ("Unable to get dbids from local node\n"));
		return ret;
	}

	for (i=0;i<dbmap->num;i++) {
		if (ctdb_repack_db(ctdb, dbmap->dbs[i].dbid,
				   dbmap->dbs[i].persistent, repack_limit) != 0) {
			DEBUG(0,("Failed to repack db 0x%x\n", dbmap->dbs[i].dbid));
			return -1;
		}
	}

	return 0;
}