2025-02-05 21:57:51 +03:00 · 2011-08-08 13:25:40 +10:00 · 2011-08-08 13:25:40 +10:00 · 46c908d542
commit 46c908d542
parent 394bbe8454 e097b7f8ff
38 changed files with 1668 additions and 287 deletions
--- a/ctdb/common/ctdb_io.c
+++ b/ctdb/common/ctdb_io.c
@ -81,12 +81,17 @@ static void dump_packet(unsigned char *data, size_t len)

 /*
  called when an incoming connection is readable
+  This function MUST be safe for reentry via the queue callback!
 */
 static void queue_io_read(struct ctdb_queue *queue)
 {
 	int num_ready = 0;
-	ssize_t nread, totread, partlen;
-	uint8_t *data, *data_base;
+	uint32_t sz_bytes_req;
+	uint32_t pkt_size;
+	uint32_t pkt_bytes_remaining;
+	uint32_t to_read;
+	ssize_t nread;
+	uint8_t *data;

 	if (ioctl(queue->fd, FIONREAD, &num_ready) != 0) {
 		return;
@ -96,93 +101,77 @@ static void queue_io_read(struct ctdb_queue *queue)
 		goto failed;
 	}

-
-	queue->partial.data = talloc_realloc_size(queue, queue->partial.data, 
-						  num_ready + queue->partial.length);
-
 	if (queue->partial.data == NULL) {
-		DEBUG(DEBUG_ERR,("%s: read error alloc failed for %u\n",
-			queue->name, num_ready + queue->partial.length));
-		goto failed;
+		/* starting fresh, allocate buf for size bytes */
+		sz_bytes_req = sizeof(pkt_size);
+		queue->partial.data = talloc_size(queue, sz_bytes_req);
+		if (queue->partial.data == NULL) {
+			DEBUG(DEBUG_ERR,("read error alloc failed for %u\n",
+					 sz_bytes_req));
+			goto failed;
+		}
+	} else if (queue->partial.length < sizeof(pkt_size)) {
+		/* yet to find out the packet length */
+		sz_bytes_req = sizeof(pkt_size) - queue->partial.length;
+	} else {
+		/* partial packet, length known, full buf allocated */
+		sz_bytes_req = 0;
 	}
-
-	nread = read(queue->fd, queue->partial.data + queue->partial.length, num_ready);
-	if (nread <= 0) {
-		DEBUG(DEBUG_ERR,("%s: read error nread=%d\n",
-				 queue->name, (int)nread));
-		goto failed;
-	}
-	totread = nread;
-	partlen = queue->partial.length;
-
 	data = queue->partial.data;
-	nread += queue->partial.length;
+
+	if (sz_bytes_req > 0) {
+		to_read = MIN(sz_bytes_req, num_ready);
+		nread = read(queue->fd, data + queue->partial.length,
+			     to_read);
+		if (nread <= 0) {
+			DEBUG(DEBUG_ERR,("read error nread=%d\n", (int)nread));
+			goto failed;
+		}
+		queue->partial.length += nread;
+		if (nread < sz_bytes_req) {
+			/* not enough to know the length */
+			DEBUG(DEBUG_DEBUG,("Partial packet length read\n"));
+			return;
+		}
+		num_ready -= nread;
+	}
+	/* the length counts itself: validate it before using it as a size */
+	pkt_size = *(uint32_t *)data;
+	if (pkt_size < sizeof(pkt_size)) {
+		DEBUG(DEBUG_CRIT,("Invalid packet of length %u\n", pkt_size));
+		goto failed;
+	}
+	if (sz_bytes_req > 0) {
+		/* size now known, allocate buffer for the full packet */
+		queue->partial.data = talloc_realloc_size(queue, data,
+							  pkt_size);
+		if (queue->partial.data == NULL) {
+			DEBUG(DEBUG_ERR,("read error alloc failed for %u\n",
+					 pkt_size));
+			goto failed;
+		}
+		data = queue->partial.data;
+	}
+	pkt_bytes_remaining = pkt_size - queue->partial.length;
+	to_read = MIN(pkt_bytes_remaining, num_ready);
+	if (to_read > 0) {
+		/* a zero-byte read() returns 0 and would look like EOF */
+		nread = read(queue->fd, data + queue->partial.length, to_read);
+		if (nread <= 0) {
+			DEBUG(DEBUG_ERR,("read error nread=%d\n", (int)nread));
+			goto failed;
+		}
+		queue->partial.length += nread;
+	}
+	if (queue->partial.length < pkt_size) {
+		DEBUG(DEBUG_DEBUG,("Partial packet data read\n"));
+		return;
+	}

 	queue->partial.data = NULL;
 	queue->partial.length = 0;
-
-	if (nread >= 4 && *(uint32_t *)data == nread) {
-		/* it is the responsibility of the incoming packet
-		 function to free 'data' */
-		queue->callback(data, nread, queue->private_data);
-		return;
-	}
-
-	data_base = data;
-
-	while (nread >= 4 && *(uint32_t *)data <= nread) {
-		/* we have at least one packet */
-		uint8_t *d2;
-		uint32_t len;
-		bool destroyed = false;
-
-		len = *(uint32_t *)data;
-		if (len == 0) {
-			/* bad packet! treat as EOF */
-			DEBUG(DEBUG_CRIT,("%s: Invalid packet of length 0 (nread = %zu, totread = %zu, partlen = %zu)\n",
-					  queue->name, nread, totread, partlen));
-			dump_packet(data_base, totread + partlen);
-			goto failed;
-		}
-		d2 = talloc_memdup(queue, data, len);
-		if (d2 == NULL) {
-			DEBUG(DEBUG_ERR,("%s: read error memdup failed for %u\n",
-					 queue->name, len));
-			/* sigh */
-			goto failed;
-		}
-
-		queue->destroyed = &destroyed;
-		queue->callback(d2, len, queue->private_data);
-		/* If callback freed us, don't do anything else. */
-		if (destroyed) {
-			return;
-		}
-		queue->destroyed = NULL;
-
-		data += len;
-		nread -= len;		
-	}
-
-	if (nread > 0) {
-		/* we have only part of a packet */
-		if (data_base == data) {
-			queue->partial.data = data;
-			queue->partial.length = nread;
-		} else {
-			queue->partial.data = talloc_memdup(queue, data, nread);
-			if (queue->partial.data == NULL) {
-				DEBUG(DEBUG_ERR,("%s: read error memdup partial failed for %u\n",
-						 queue->name, (unsigned)nread));
-				goto failed;
-			}
-			queue->partial.length = nread;
-			talloc_free(data_base);
-		}
-		return;
-	}
-
-	talloc_free(data_base);
+	/* it is the responsibility of the callback to free 'data' */
+	queue->callback(data, pkt_size, queue->private_data);
 	return;

 failed:
--- a/ctdb/server/ctdb_monitor.c
+++ b/ctdb/server/ctdb_monitor.c
@ -302,11 +302,8 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
 		exit(11);
 	}
 	ctdb->db_persistent_check_errors = 0;
-	DEBUG(DEBUG_NOTICE,(__location__
-			   "ctdb_start_monitoring: ctdb_recheck_persistent_health() OK\n"));

 	DEBUG(DEBUG_NOTICE,(__location__ " Recoveries finished. Running the \"startup\" event.\n"));
-	DEBUG(DEBUG_ERR,(__location__ " Allow clients to attach to databases.\n"));
 	event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 			     timeval_current(),
 			     ctdb_check_health, ctdb);
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@ -1638,7 +1638,6 @@ static int do_recovery(struct ctdb_recoverd *rec,
 		DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
 		rec->need_takeover_run = true;
 	}
-	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));

 	/* execute the "recovered" event script on all nodes */
 	ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@ -2633,8 +2633,6 @@ int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,

 	iface = ctdb_find_iface(ctdb, info->name);
 	if (iface == NULL) {
-		DEBUG(DEBUG_ERR, (__location__ "iface[%s] is unknown\n",
-				  info->name));
 		return -1;
 	}

--- a/ctdb/server/ctdbd.c
+++ b/ctdb/server/ctdbd.c
@ -196,7 +196,7 @@ int main(int argc, const char *argv[])
 		exit(1);
 	}

-	DEBUG(DEBUG_NOTICE,("Starting CTDB daemon\n"));
+	DEBUG(DEBUG_NOTICE,("CTDB starting on node\n"));

 	gettimeofday(&ctdb->ctdbd_start_time, NULL);
 	gettimeofday(&ctdb->last_recovery_started, NULL);
--- a/ctdb/tests/complex/01_ctdb_nfs_skip_share_check.sh
+++ b/ctdb/tests/complex/01_ctdb_nfs_skip_share_check.sh
@ -103,15 +103,9 @@ try_command_on_node $test_node "echo \"function exportfs () { echo $foo_dir 127.

 n="$rc_local_d/nfs-skip-share-check"
 n_contents='loadconfig() {
-    name="$1"
-    if [ -f /etc/sysconfig/$name ]; then
-	. /etc/sysconfig/$name
-    elif [ -f /etc/default/$name ]; then
-	. /etc/default/$name
-    elif [ -f $CTDB_BASE/sysconfig/$name ]; then
-	. $CTDB_BASE/sysconfig/$name
-    fi
-    if [ "$name" = "ctdb" ] ; then
+    _loadconfig "$@"
+
+    if [ "$1" = "ctdb" -o "$1" = "nfs" ] ; then
        CTDB_NFS_SKIP_SHARE_CHECK=no
    fi
 }
--- a/ctdb/tests/complex/02_ctdb_samba_skip_share_check.sh
+++ b/ctdb/tests/complex/02_ctdb_samba_skip_share_check.sh
@ -108,15 +108,9 @@ try_command_on_node $test_node "echo 'function testparm () { tp=\$(which testpar

 n="$rc_local_d/samba-skip-share-check"
 n_contents='loadconfig() {
-    name="$1"
-    if [ -f /etc/sysconfig/$name ]; then
-	. /etc/sysconfig/$name
-    elif [ -f /etc/default/$name ]; then
-	. /etc/default/$name
-    elif [ -f $CTDB_BASE/sysconfig/$name ]; then
-	. $CTDB_BASE/sysconfig/$name
-    fi
-    if [ "$name" = "ctdb" ] ; then
+    _loadconfig "$@"
+
+    if [ "$1" = "ctdb" -o "$1" = "samba" ] ; then
        CTDB_SAMBA_SKIP_SHARE_CHECK=no
    fi
 }
--- a/ctdb/tests/onnode/0001.sh
+++ b/ctdb/tests/onnode/0001.sh
@ -0,0 +1,24 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE all hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+
+>> NODE: 192.168.1.101 <<
+-n 192.168.1.101 hostname
+
+>> NODE: 192.168.1.102 <<
+-n 192.168.1.102 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0002.sh
+++ b/ctdb/tests/onnode/0002.sh
@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE -q all hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+-n 192.168.1.101 hostname
+-n 192.168.1.102 hostname
+-n 192.168.1.103 hostname
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0003.sh
+++ b/ctdb/tests/onnode/0003.sh
@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE -p all hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+[192.168.1.101] -n 192.168.1.101 hostname
+[192.168.1.102] -n 192.168.1.102 hostname
+[192.168.1.103] -n 192.168.1.103 hostname
+[192.168.1.104] -n 192.168.1.104 hostname
+EOF
+
+simple_test -s $cmd
--- a/ctdb/tests/onnode/0004.sh
+++ b/ctdb/tests/onnode/0004.sh
@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE -pq all hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+-n 192.168.1.101 hostname
+-n 192.168.1.102 hostname
+-n 192.168.1.103 hostname
+-n 192.168.1.104 hostname
+EOF
+
+simple_test -s $cmd
--- a/ctdb/tests/onnode/0005.sh
+++ b/ctdb/tests/onnode/0005.sh
@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE 3 hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0006.sh
+++ b/ctdb/tests/onnode/0006.sh
@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE -v 3 hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0070.sh
+++ b/ctdb/tests/onnode/0070.sh
@ -0,0 +1,32 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE ok hostname"
+
+define_test "$cmd" "all nodes OK"
+
+ctdb_set_output <<EOF
+:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:
+:0:192.168.1.101:0:0:0:0:0:0:
+:1:192.168.1.102:0:0:0:0:0:0:
+:2:192.168.1.103:0:0:0:0:0:0:
+:3:192.168.1.104:0:0:0:0:0:0:
+EOF
+
+required_result <<EOF
+
+>> NODE: 192.168.1.101 <<
+-n 192.168.1.101 hostname
+
+>> NODE: 192.168.1.102 <<
+-n 192.168.1.102 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0071.sh
+++ b/ctdb/tests/onnode/0071.sh
@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE ok hostname"
+
+define_test "$cmd" "2nd node disconnected"
+
+# Fake ctdb output: node 1 (192.168.1.102) has the Disconnected flag set.
+ctdb_set_output <<EOF
+:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:
+:0:192.168.1.101:0:0:0:0:0:0:
+:1:192.168.1.102:1:0:0:0:0:0:
+:2:192.168.1.103:0:0:0:0:0:0:
+:3:192.168.1.104:0:0:0:0:0:0:
+EOF
+
+required_result <<EOF
+
+>> NODE: 192.168.1.101 <<
+-n 192.168.1.101 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0072.sh
+++ b/ctdb/tests/onnode/0072.sh
@ -0,0 +1,29 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE ok hostname"
+
+define_test "$cmd" "2nd node disconnected, extra status columns"
+
+ctdb_set_output <<EOF
+:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:X1:X2:X3:X4:
+:0:192.168.1.101:0:0:0:0:0:0:0:0:0:0:
+:1:192.168.1.102:1:0:0:0:0:0:0:0:0:0:
+:2:192.168.1.103:0:0:0:0:0:0:0:0:0:0:
+:3:192.168.1.104:0:0:0:0:0:0:0:0:0:0:
+EOF
+
+required_result <<EOF
+
+>> NODE: 192.168.1.101 <<
+-n 192.168.1.101 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0075.sh
+++ b/ctdb/tests/onnode/0075.sh
@ -0,0 +1,29 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE con hostname"
+
+define_test "$cmd" "1st node disconnected"
+
+ctdb_set_output <<EOF
+:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:
+:0:192.168.1.101:1:0:0:0:0:0:
+:1:192.168.1.102:0:0:0:0:0:0:
+:2:192.168.1.103:0:0:0:0:0:0:
+:3:192.168.1.104:0:0:0:0:0:0:
+EOF
+
+required_result <<EOF
+
+>> NODE: 192.168.1.102 <<
+-n 192.168.1.102 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0080.sh
+++ b/ctdb/tests/onnode/0080.sh
@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE recmaster hostname"
+
+define_test "$cmd" "node 1 (192.168.1.102) is recmaster"
+
+ctdb_set_output <<EOF
+1
+EOF
+
+required_result <<EOF
+-n 192.168.1.102 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0081.sh
+++ b/ctdb/tests/onnode/0081.sh
@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE lvsmaster hostname"
+
+define_test "$cmd" "no lvsmaster"
+
+ctdb_set_output 255 <<EOF
+There is no LVS master
+EOF
+
+required_result 1 <<EOF
+onnode: No lvsmaster available
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0090.sh
+++ b/ctdb/tests/onnode/0090.sh
@ -0,0 +1,21 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE natgw hostname"
+
+define_test "$cmd" "no natgw"
+
+ctdb_set_output <<EOF
+-1 0.0.0.0
+:0:192.168.1.101:0:0:0:0:0:
+:1:192.168.1.102:0:0:0:0:0:
+:2:192.168.1.103:0:0:0:0:0:
+:3:192.168.1.104:0:0:0:0:0:
+EOF
+
+required_result 1 <<EOF
+onnode: No natgwlist available
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/0091.sh
+++ b/ctdb/tests/onnode/0091.sh
@ -0,0 +1,21 @@
+#!/bin/sh
+
+. "${ONNODE_TESTS_DIR}/common.sh"
+
+cmd="$ONNODE natgw hostname"
+
+define_test "$cmd" "node 2 (192.168.1.103) is natgw"
+
+ctdb_set_output <<EOF
+2 192.168.1.103
+:0:192.168.1.101:0:0:0:0:0:
+:1:192.168.1.102:0:0:0:0:0:
+:2:192.168.1.103:0:0:0:0:0:
+:3:192.168.1.104:0:0:0:0:0:
+EOF
+
+required_result <<EOF
+-n 192.168.1.103 hostname
+EOF
+
+simple_test $cmd
--- a/ctdb/tests/onnode/README
+++ b/ctdb/tests/onnode/README
@ -0,0 +1,38 @@
+onnode unit tests
+=================
+
+Examples:
+
+* ./run_tests.sh
+
+  Run all tests, displaying output.
+
+* ./run_tests.sh -s
+
+  Run all tests, displaying output and a summary.
+
+* ./run_tests.sh -sq
+
+  Run all tests, displaying only a summary.
+
+* ONNODE=onnode-buggy-001 ./run_tests.sh -s
+
+  Run against stubs/onnode-buggy-001 instead of default onnode version.
+
+  Add more buggy versions of onnode to this directory as bugs are
+  fixed to enable test validation using this feature.
+
+* ./run_tests.sh ./009*.sh
+
+  Run only the specified tests.
+
+* ONNODE="bash -x stubs/onnode-buggy-001" ./run_tests.sh ./0090.sh
+  ONNODE="bash -x ../../tools/onnode" ./run_tests.sh ./0090.sh
+
+  Debug the specified test or test failure.  The test will fail
+  because the bash trace output will be included in the test output.
+  However, this at least makes it easy to trace onnode while running
+  the test...
+
+  To see if the test passes, the -x can be dropped... so command-line
+  editing can be kept to a minimum.
--- a/ctdb/tests/onnode/common.sh
+++ b/ctdb/tests/onnode/common.sh
@ -0,0 +1,103 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+# Set indirectly by run_tests at top level.
+unset CTDB_NODES_SOCKETS
+
+# Default to just "onnode".
+: ${ONNODE:=onnode}
+
+# Augment PATH with relevant stubs/ directories.
+
+if [ -d "${ONNODE_TESTS_DIR}/stubs" ] ; then
+    PATH="${ONNODE_TESTS_DIR}/stubs:$PATH"
+fi
+
+export ONNODE_TESTCASE_DIR=$(dirname "$0")
+if [ $(basename "$ONNODE_TESTCASE_DIR") = "onnode" ] ; then
+    # Just a test script, no testcase subdirectory.
+    ONNODE_TESTCASE_DIR="$ONNODE_TESTS_DIR"
+else
+    if [ -d "${ONNODE_TESTCASE_DIR}/stubs" ] ; then
+	PATH="${ONNODE_TESTCASE_DIR}/stubs:$PATH"
+    fi
+fi
+
+# Find CTDB nodes file.
+if [ -z "$CTDB_NODES_FILE" ] ; then
+    if [ -r "${ONNODE_TESTCASE_DIR}/nodes" ] ; then
+	CTDB_NODES_FILE="${ONNODE_TESTCASE_DIR}/nodes"
+    elif [ -r "${ONNODE_TESTS_DIR}/nodes" ] ; then
+	CTDB_NODES_FILE="${ONNODE_TESTS_DIR}/nodes"
+    else
+	CTDB_NODES_FILE="${CTDB_BASE:-/etc/ctdb}/nodes"
+    fi
+fi
+
+export CTDB_NODES_FILE
+
+export ONNODE_TESTS_VAR_DIR="${ONNODE_TESTS_DIR}/var"
+mkdir -p "$ONNODE_TESTS_VAR_DIR"
+
+if [ -z "$CTDB_BASE" ] ; then
+    export CTDB_BASE=$(dirname "$CTDB_NODES_FILE")
+fi
+
+define_test ()
+{
+    _f="$0"
+    _f="${_f#./}"  # strip leading ./
+    _f="${_f%%/*}" # if subdir, strip off file
+    _f="${_f%.sh}" # strip off .sh suffix if any
+
+    echo "$_f $1 - $2"
+}
+
+# Set output for the fake ctdb command: stdin is saved as the output to
+# replay.  Optional 1st argument is the return code (default 0).
+ctdb_set_output ()
+{
+    _out="$ONNODE_TESTS_VAR_DIR/ctdb.out"
+    cat >"$_out"
+    _rc="$ONNODE_TESTS_VAR_DIR/ctdb.rc"
+    echo "${1:-0}" >"$_rc"
+    # Expand the paths now (simple_test reuses _out/_rc), but keep them quoted.
+    trap "rm -f '$_out' '$_rc'" 0
+}
+
+required_result ()
+{
+    required_rc="${1:-0}"
+    required_output=$(cat)
+}
+
+# Run command "$@" and compare its output (stdout+stderr) and exit code
+# with required_result.  A leading -s sorts the actual output first.
+simple_test ()
+{
+    _sort="cat"
+    if [ "$1" = "-s" ] ; then
+	shift ; _sort="sort"
+    fi
+    _out=$("$@" 2>&1)
+    _rc=$?
+    _out=$(printf '%s\n' "$_out" | $_sort )
+    if [ "$_out" = "$required_output" ] && [ "$_rc" = "$required_rc" ] ; then
+	echo "PASSED"
+    else
+	cat <<EOF
+CTDB_NODES_FILE="${CTDB_NODES_FILE}"
+CTDB_BASE="$CTDB_BASE"
+$(which ctdb)
+
+##################################################
+Required output (Exit status: ${required_rc}):
+##################################################
+$required_output
+##################################################
+Actual output (Exit status: ${_rc}):
+##################################################
+$_out
+EOF
+	return 1
+    fi
+}
--- a/ctdb/tests/onnode/nodes
+++ b/ctdb/tests/onnode/nodes
@ -0,0 +1,4 @@
+192.168.1.101
+192.168.1.102
+192.168.1.103
+192.168.1.104
--- a/ctdb/tests/onnode/run_tests.sh
+++ b/ctdb/tests/onnode/run_tests.sh
@ -0,0 +1,31 @@
+#!/bin/sh
+
+# Run onnode unit tests (default: all ./NNNN.sh and ./NNNN/run_test.sh).
+
+cd "$(dirname "$0")" || exit 1
+export ONNODE_TESTS_DIR=$(pwd)
+
+test_dir=$(dirname "$ONNODE_TESTS_DIR")
+
+opts="-d"
+
+for i ; do
+    case "$i" in
+	-*)
+	    opts="$opts $i"
+	    shift
+	    ;;
+	*)
+	    break
+    esac
+done
+
+tests=""
+if [ -z "$*" ] ; then
+    tests=$(ls ./[0-9][0-9][0-9][0-9].sh ./[0-9][0-9][0-9][0-9]/run_test.sh 2>/dev/null)
+fi
+
+"$test_dir/scripts/run_tests" $opts "$@" $tests || exit 1
+
+echo "All OK"
+exit 0
--- a/ctdb/tests/onnode/stubs/ctdb
+++ b/ctdb/tests/onnode/stubs/ctdb
@ -0,0 +1,33 @@
+#!/bin/sh
+
+# Fake ctdb client for onnode tests.
+
+cmd=$(echo "$*" | sed -r -e 's@[[:space:]]+@_@g')
+
+out="${ONNODE_TESTS_VAR_DIR}/ctdb.out"
+if [ -r "$out" ] ; then
+    cat "$out"
+
+    rc="${ONNODE_TESTS_VAR_DIR}/ctdb.rc"
+    if [ -r "$rc" ] ; then
+	exit $(cat "$rc")
+    fi
+
+    exit 0
+fi
+
+f="${ONNODE_TESTCASE_DIR}/ctdb.d/${cmd}.sh"
+if [ -x "$f" ] ; then
+    "$f"
+    exit $?
+fi
+
+f="${ONNODE_TESTCASE_DIR}/ctdb.d/${cmd}.out"
+if [ -r "$f" ] ; then
+    cat "$f"
+    exit 0
+fi
+
+echo "fake ctdb: no implementation for \"$*\""
+
+exit 1
--- a/ctdb/tests/onnode/stubs/onnode-buggy-001
+++ b/ctdb/tests/onnode/stubs/onnode-buggy-001
@ -0,0 +1,376 @@
+#!/bin/bash
+
+# Run commands on CTDB nodes.
+
+# See http://ctdb.samba.org/ for more information about CTDB.
+
+# Copyright (C) Martin Schwenke  2008
+
+# Based on an earlier script by Andrew Tridgell and Ronnie Sahlberg.
+
+# Copyright (C) Andrew Tridgell  2007
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+   
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+   
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+prog=$(basename $0)
+
+usage ()
+{
+    cat >&2 <<EOF
+Usage: onnode [OPTION] ... <NODES> <COMMAND> ...
+  options:
+    -c          Run in current working directory on specified nodes.
+    -o <prefix> Save standard output from each node to file <prefix>.<ip>
+    -p          Run command in parallel on specified nodes.
+    -q          Do not print node addresses (overrides -v).
+    -n          Allow nodes to be specified by name.
+    -f          Specify nodes file, overrides CTDB_NODES_FILE.
+    -v          Print node address even for a single node.
+  <NODES>       "all", "any", "ok" (or "healthy"), "con" (or "connected"),
+                "rm" (or "recmaster"), "lvs" (or "lvsmaster"),
+                "natgw" (or "natgwlist"); or
+                a node number (0 base); or
+                a hostname (if -n is specified); or
+                list (comma separated) of <NODES>; or
+                range (hyphen separated) of node numbers.
+EOF
+    exit 1
+
+}
+
+invalid_nodespec ()
+{
+    echo "Invalid <nodespec>" >&2 ; echo >&2
+    usage
+}
+
+# Defaults.
+current=false
+parallel=false
+verbose=false
+quiet=false
+prefix=""
+names_ok=false
+
+ctdb_base="${CTDB_BASE:-/etc/ctdb}"
+
+parse_options ()
+{
+    # $POSIXLY_CORRECT means that the command passed to onnode can
+    # take options and getopt won't reorder things to make them
+    # options ot onnode.
+    local temp
+    # Not on the previous line - local returns 0!
+    temp=$(POSIXLY_CORRECT=1 getopt -n "$prog" -o "cf:hno:pqv" -l help -- "$@")
+
+    [ $? != 0 ] && usage
+
+    eval set -- "$temp"
+
+    while true ; do
+	case "$1" in
+	    -c) current=true ; shift ;;
+	    -f) CTDB_NODES_FILE="$2" ; shift 2 ;;
+	    -n) names_ok=true ; shift ;;
+	    -o) prefix="$2" ; shift 2 ;;
+	    -p) parallel=true ; shift ;;
+	    -q) quiet=true ; shift ;;
+	    -v) verbose=true ; shift ;;
+	    --) shift ; break ;;
+	    -h|--help|*) usage ;; # Shouldn't happen, so this is reasonable.
+	esac
+    done
+
+    [ $# -lt 2 ] && usage
+
+    nodespec="$1" ; shift
+    command="$@"
+}
+
+echo_nth ()
+{
+    local n="$1" ; shift
+
+    shift $n
+    local node="$1"
+
+    if [ -n "$node" -a "$node" != "#DEAD" ] ; then
+	echo $node
+    else
+	echo "${prog}: \"node ${n}\" does not exist" >&2
+	exit 1
+    fi
+}
+
+parse_nodespec ()
+{
+    # Subshell avoids hacks to restore $IFS.
+    (
+	IFS=","
+	for i in $1 ; do
+	    case "$i" in
+		*-*) seq "${i%-*}" "${i#*-}" 2>/dev/null || invalid_nodespec ;;
+		# Separate lines for readability.
+		all|any|ok|healthy|con|connected) echo "$i" ;;
+		rm|recmaster|lvs|lvsmaster|natgw|natgwlist) echo "$i" ;;
+		*)
+		    [ $i -gt -1 ] 2>/dev/null || $names_ok || invalid_nodespec
+		    echo $i
+	    esac
+	done
+    )
+}
+
+ctdb_status_output="" # cache
+get_nodes_with_status ()
+{
+    local all_nodes="$1"
+    local status="$2"
+
+    local bits
+    case "$status" in
+	healthy)
+	    bits="0:0:0:0:0:0"
+	    ;;
+	connected)
+	    bits="0:[0-1]:[0-1]:[0-1]:[0-1]:[0-1]"
+	    ;;
+	*)
+	    invalid_nodespec
+    esac
+
+    if [ -z "$ctdb_status_output" ] ; then
+	# FIXME: need to do something if $CTDB_NODES_SOCKETS is set.
+	ctdb_status_output=$(ctdb -Y status 2>/dev/null)
+	if [ $? -ne 0 ] ; then
+	    echo "${prog}: unable to get status of CTDB nodes" >&2
+	    exit 1
+	fi
+	ctdb_status_output="${ctdb_status_output#* }"
+    fi
+
+    local nodes=""
+    local i
+    for i in $ctdb_status_output ; do
+	# Try removing bits from end.
+	local t="${i%:${bits}:}"
+	if [ "$t" != "$i" ] ; then
+	    # Succeeded.  Get address.  NOTE: this is an optimisation.
+	    # It might be better to get the node number and then get
+	    # the nth node to get the address.  This would make things
+	    # more consistent if $ctdb_base/nodes actually contained
+	    # hostnames.
+	    nodes="${nodes} ${t#:*:}"
+	fi
+    done
+
+    echo $nodes
+}
+
+ctdb_props="" # cache
+get_node_with_property ()
+{
+    local all_nodes="$1"
+    local prop="$2"
+
+    local prop_node=""
+    if [ "${ctdb_props##:${prop}:}" = "$ctdb_props" ] ; then
+	prop_node=$(ctdb "$prop" -Y 2>/dev/null)
+	# We only want the first line.
+	local nl="
+"
+	prop_node="${prop_node%%${nl}*}"
+	if [ $? -eq 0 ] ; then
+	    ctdb_props="${ctdb_props}${ctdb_props:+ }:${prop}:${prop_node}"
+	else
+	    prop_node=""
+	fi
+    else
+	prop_node="${ctdb_props##:${prop}:}"
+	prop_node="${prop_node%% *}"
+    fi
+    if [ -n "$prop_node" ] ; then
+	echo_nth "$prop_node" $all_nodes
+    else
+	echo "${prog}: No ${prop} available" >&2
+	exit 1
+    fi
+}
+
+get_any_available_node ()
+{
+    local all_nodes="$1"
+
+    # We do a recursive onnode to find which nodes are up and running.
+    local out=$($0 -pq all ctdb pnn 2>&1)
+    local line
+    while read line ; do 
+	local pnn="${line#PNN:}"
+	if [ "$pnn" != "$line" ] ; then
+	    echo_nth "$pnn" $all_nodes
+	    return 0
+	fi
+	# Else must be an error message from a down node.
+    done <<<"$out"
+    return 1
+}
+
+get_nodes ()
+{
+    local all_nodes
+
+    if [ -n "$CTDB_NODES_SOCKETS" ] ; then 
+	all_nodes="$CTDB_NODES_SOCKETS"
+    else
+	local f="${ctdb_base}/nodes"
+	if [ -n "$CTDB_NODES_FILE" ] ; then
+	    f="$CTDB_NODES_FILE"
+	    if [ ! -e "$f" -a "${f#/}" = "$f" ] ; then
+		# $f is relative, try in $ctdb_base
+		f="${ctdb_base}/${f}"
+	    fi
+	fi
+
+	if [ ! -r "$f" ] ; then
+	    echo "${prog}: unable to open nodes file  \"${f}\"" >&2
+	    exit 1
+	fi
+
+	all_nodes=$(sed -e 's@#.*@@g' -e 's@ *@@g' -e 's@^$@#DEAD@' "$f")
+    fi
+
+    local nodes=""
+    local n
+    for n in $(parse_nodespec "$1") ; do
+	[ $? != 0 ] && exit 1  # Required to catch exit in above subshell.
+	case "$n" in
+	    all)
+		echo "${all_nodes//#DEAD/}"
+		;;
+	    any)
+		get_any_available_node "$all_nodes" || exit 1
+		;;
+	    ok|healthy) 
+		get_nodes_with_status "$all_nodes" "healthy" || exit 1
+		;;
+	    con|connected) 
+		get_nodes_with_status "$all_nodes" "connected" || exit 1
+		;;
+	    rm|recmaster)
+		get_node_with_property "$all_nodes" "recmaster" || exit 1
+		;;
+	    lvs|lvsmaster)
+		get_node_with_property "$all_nodes" "lvsmaster" || exit 1
+		;;
+	    natgw|natgwlist)
+		get_node_with_property "$all_nodes" "natgwlist" || exit 1
+		;;
+	    [0-9]|[0-9][0-9]|[0-9][0-9][0-9])
+		echo_nth $n $all_nodes
+		;;
+	    *)
+		$names_ok || invalid_nodespec
+		echo $n
+	esac
+    done
+}
+
+fakessh ()
+{
+    CTDB_SOCKET="$1" sh -c "$2" 3>/dev/null
+}
+
+stdout_filter ()
+{
+    if [ -n "$prefix" ] ; then
+	cat >"${prefix}.${n//\//_}"
+    elif $verbose && $parallel ; then
+	sed -e "s@^@[$n] @"
+    else
+	cat
+    fi
+}
+
+stderr_filter ()
+{
+    if $verbose && $parallel ; then
+	sed -e "s@^@[$n] @"
+    else
+	cat
+    fi
+}
+
+######################################################################
+
+parse_options "$@"
+
+$current && command="cd $PWD && $command"
+
+ssh_opts=
+if [ -n "$CTDB_NODES_SOCKETS" ] ; then
+    SSH=fakessh
+else 
+    # Could "2>/dev/null || true" but want to see errors from typos in file.
+    [ -r "${ctdb_base}/onnode.conf" ] && . "${ctdb_base}/onnode.conf"
+    [ -n "$SSH" ] || SSH=ssh
+    if [ "$SSH" = "ssh" ] ; then
+	ssh_opts="-n"
+    else
+	: # rsh? All bets are off!
+    fi
+fi
+
+######################################################################
+
+nodes=$(get_nodes "$nodespec")
+[ $? != 0 ] && exit 1   # Required to catch exit in above subshell.
+
+if $quiet ; then
+    verbose=false
+else
+    # If $nodes contains a space or a newline then assume multiple nodes.
+    nl="
+"
+    [ "$nodes" != "${nodes%[ ${nl}]*}" ] && verbose=true
+fi
+
+pids=""
+trap 'kill -TERM $pids 2>/dev/null' INT TERM
+# There's a small race here where the kill can fail if no processes
+# have been added to $pids and the script is interrupted.  However,
+# the part of the window where it matter is very small.
+retcode=0
+for n in $nodes ; do
+    set -o pipefail 2>/dev/null
+    if $parallel ; then
+	{ exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS $n "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; } &
+	pids="${pids} $!"
+    else
+	if $verbose ; then
+	    echo >&2 ; echo ">> NODE: $n <<" >&2
+	fi
+
+	{ exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS $n "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; }
+	[ $? = 0 ] || retcode=$?
+    fi
+done
+
+$parallel && {
+    for p in $pids; do
+	wait $p
+	[ $? = 0 ] || retcode=$?
+    done
+}
+
+exit $retcode
--- a/ctdb/tests/onnode/stubs/ssh
+++ b/ctdb/tests/onnode/stubs/ssh
@ -0,0 +1,2 @@
+#!/bin/sh
+echo "$*"
--- a/ctdb/tests/scripts/ctdb_test_functions.bash
+++ b/ctdb/tests/scripts/ctdb_test_functions.bash
@ -53,6 +53,54 @@ test_exit ()
    exit $(($testfailures+0))
 }

+# Report suspicious clock jumps recorded in the per-node time logs.
+# Scans the last 20 "<seconds> <nanoseconds>" samples from each node and
+# complains when consecutive samples are >= threshold deciseconds apart;
+# on any jump, dumps date/top/slabinfo from the client and all nodes.
+ctdb_check_time_logs ()
+{
+    local threshold=20
+    local jump=false ds_prev="" ds_curr="" node="" out line
+
+    out=$(onnode all tail -n 20 /var/log/ctdb.test.time.log 2>&1)
+
+    if [ $? -eq 0 ] ; then
+	while read line ; do
+	    case "$line" in
+		\>\>\ NODE:\ *\ \<\<)
+		    node="${line#>> NODE: }"
+		    node=${node% <<*}
+		    ds_prev=""
+		    ;;
+		[0-9]*\ [0-9]*)
+		    # Looks like a sample; other lines (e.g. errors) are skipped.
+		    set -- $line
+		    # Deciseconds: seconds plus the first digit of the nanoseconds.
+		    ds_curr="$1${2:0:1}"
+		    if [ -n "$ds_prev" ] && \
+			[ $(($ds_curr - $ds_prev)) -ge $threshold ] ; then
+			echo "Node $node had time jump of $(($ds_curr - $ds_prev))ds between $(date +'%T' -d @${ds_prev%?}) and $(date +'%T' -d @${ds_curr%?})"
+			jump=true
+		    fi
+		    ds_prev="$ds_curr"
+		    ;;
+	    esac
+	done <<<"$out"
+    else
+	echo Error getting time logs
+    fi
+    if $jump ; then
+	echo "Check time sync (test client first):"
+	date
+	onnode -p all date
+	echo "Information from test client:"
+	hostname
+	top -b -n 1
+	echo "Information from cluster nodes:"
+	onnode all "top -b -n 1 ; echo '/proc/slabinfo' ; cat /proc/slabinfo"
+    fi
+}
+
 ctdb_test_exit ()
 {
    local status=$?
@ -68,6 +116,10 @@ ctdb_test_exit ()

    echo "*** TEST COMPLETED (RC=$status) AT $(date '+%F %T'), CLEANING UP..."

+    if [ -n "$CTDB_TEST_REAL_CLUSTER" -a $status -ne 0 ] ; then
+	ctdb_check_time_logs
+    fi
+
    eval "$ctdb_test_exit_hook" || true
    unset ctdb_test_exit_hook

@ -856,6 +908,8 @@ restart_ctdb ()
 	
    onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1

+    local debug_out=$(onnode -p all ctdb status -Y 2>&1; onnode -p all ctdb scriptstatus 2>&1)
+
    echo "Setting RerecoveryTimeout to 1"
    onnode -pq all "$CTDB setvar RerecoveryTimeout 1"

@ -869,6 +923,13 @@ restart_ctdb ()
    onnode -q 0 $CTDB recover

    echo "ctdb is ready"
+
+    if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
+	echo "OUCH!  Cluster is UNHEALTHY again..."
+	echo "$debug_out"
+	# Try to make the calling test fail
+	status=1
+    fi
 }

 ctdb_restart_when_done ()
--- a/ctdb/tests/scripts/run_tests
+++ b/ctdb/tests/scripts/run_tests
@ -18,8 +18,10 @@ EOF
 ######################################################################

 with_summary=false
+with_desc=false
+quiet=false

-temp=$(getopt -n "$prog" -o "xhs" -l help -- "$@")
+temp=$(getopt -n "$prog" -o "xdhqs" -l help -- "$@")

 [ $? != 0 ] && usage

@ -28,12 +30,20 @@ eval set -- "$temp"
 while true ; do
    case "$1" in
 	-x) set -x; shift ;;
+	-d) with_desc=true ; shift ;;  # 4th line of output is description
+	-q) quiet=true ; shift ;;
 	-s) with_summary=true ; shift ;;
 	--) shift ; break ;;
 	*) usage ;;
    esac
 done

+if $quiet ; then
+    show_progress() { cat >/dev/null ; }
+else
+    show_progress() { cat ; }
+fi
+
 ######################################################################

 tests_total=0
@ -43,22 +53,40 @@ summary=""
 rows=$(if tty -s ; then stty size ; else echo x 80 ; fi | sed -e 's@.* @@' -e 's@^0$@80@')
 ww=$((rows - 7))

+tf=$(mktemp)
+sf=$(mktemp)
+
+set -o pipefail
+
 for f; do
    [ -x $f ] || fail "test \"$f\" is not executable"
    tests_total=$(($tests_total + 1))
-    if ctdb_test_run "$f" ; then
-	tests_passed=$(($tests_passed + 1))
-	t="PASSED"
-    else
-	t="FAILED"
+    ctdb_test_run "$f" | tee "$tf" | show_progress
+    status=$?
+    if $with_summary ; then
+	if [ $status -eq 0 ] ; then
+	    tests_passed=$(($tests_passed + 1))
+	    t=" PASSED "
+	else
+	    t="*FAILED*"
+	fi
+	if $with_desc ; then
+	    desc=$(tail -n +4 $tf | head -n 1)
+	    f="$desc"
+	fi
+	echo "$t $f" >>"$sf"
    fi
-    summary=$(printf "%s\n%-${ww}s%s" "$summary" "$f" "$t")
 done

+rm -f "$tf"
+
 if $with_summary ; then
-    echo "$summary"
+    echo
+    cat "$sf"
    echo
    echo "${tests_passed}/${tests_total} tests passed"
 fi

+rm -f "$sf"
+
 test_exit
--- a/ctdb/tests/simple/00_ctdb_onnode.sh
+++ b/ctdb/tests/simple/00_ctdb_onnode.sh
@ -27,3 +27,12 @@ ctdb_test_init "$@"

 echo "Checking connectivity between nodes..."
 onnode all onnode all true
+
+# We're seeing some weirdness with CTDB controls timing out.  We're
+# wondering if time is jumping forward, so this creates a time log on
+# each node that we can examine later if tests fail weirdly.
+if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
+    echo "Starting time logging on each node..."
+    f="/var/log/ctdb.test.time.log"
+    onnode -p all "[ -f $f ] || while : ; do date '+%s %N' ; sleep 1 ; done >$f 2>&1 </dev/null &"  &
+fi
--- a/ctdb/tests/takeover/ctdb_takeover.py
+++ b/ctdb/tests/takeover/ctdb_takeover.py
@ -2,7 +2,7 @@

 # ctdb ip takeover code

-# Copyright (C) Martin Schwenke 2010
+# Copyright (C) Martin Schwenke, Ronnie Sahlberg 2010, 2011

 # Based on original CTDB C code:
 #
@ -29,6 +29,11 @@ import sys
 from optparse import OptionParser
 import copy
 import random
+import itertools
+
+# For parsing IP addresses
+import socket
+import struct

 options = None

@ -44,6 +49,9 @@ def process_args(extra_options=[]):
    parser.add_option("--ni",
                      action="store_true", dest="no_ip_failback", default=False,
                      help="turn on no_ip_failback")
+    parser.add_option("-L", "--lcp2",
+                      action="store_true", dest="lcp2", default=False,
+                      help="use LCP2 IP rebalancing algorithm [default: %default]")
    parser.add_option("-b", "--balance",
                      action="store_true", dest="balance", default=False,
                      help="show (im)balance information after each event")
@ -54,14 +62,11 @@ def process_args(extra_options=[]):
                      action="store_false", dest="show", default=True,
                      help="don't show IP address layout after each event")
    parser.add_option("-v", "--verbose",
-                      action="store_true", dest="verbose", default=False,
+                      action="count", dest="verbose", default=0,
                      help="print information and actions taken to stdout")
-    parser.add_option("--hack",
-                      action="store", type="int", dest="hack", default=0,
-                      help="apply a hack (see the code!!!)")
    parser.add_option("-r", "--retries",
                      action="store", type="int", dest="retries", default=5,
-                      help="number of retry loops for rebalancing [default: %default]")
+                      help="number of retry loops for rebalancing non-deterministic failback [default: %default]")
    parser.add_option("-i", "--iterations",
                      action="store", type="int", dest="iterations",
                      default=1000,
@ -69,6 +74,9 @@ def process_args(extra_options=[]):
    parser.add_option("-o", "--odds",
                      action="store", type="int", dest="odds", default=4,
                      help="make the chances of a failover 1 in ODDS [default: %default]")
+    parser.add_option("-A", "--aggressive",
+                      action="store_true", dest="aggressive", default=False,
+                      help="apply ODDS to try to flip each node [default: %default]")

    def seed_callback(option, opt, value, parser):
        random.seed(value)
@ -78,47 +86,165 @@ def process_args(extra_options=[]):

    parser.add_option("-x", "--exit",
                      action="store_true", dest="exit", default=False,
-                      help="exit on the 1st gratuitous IP move")
-    
+                      help="exit on the 1st gratuitous IP move or IP imbalance")
+    parser.add_option("-H", "--hard-imbalance-limit",
+                      action="store", type="int", dest="hard_limit", default=1,
+                      help="exceeding this limit causes termination  [default: %default]")
+    parser.add_option("-S", "--soft-imbalance-limit",
+                      action="store", type="int", dest="soft_limit", default=1,
+                      help="exceeding this limit increments a counter [default: %default]")
+
    (options, args) = parser.parse_args()

    if len(args) != 0:
        parser.error("too many argumentss")

-def print_begin(t):
-    print "=" * 40
+def print_begin(t, delim='='):
+    print delim * 40
    print "%s:" % (t)

 def print_end():
    print "-" * 40

 def verbose_begin(t):
-    if options.verbose:
+    if options.verbose > 0:
        print_begin(t)

 def verbose_end():
-    if options.verbose:
+    if options.verbose > 0:
        print_end()

 def verbose_print(t):
-    if options.verbose:
+    if options.verbose > 0:
        if not type(t) == list:
            t = [t]
        if t != []:
            print "\n".join([str(i) for i in t])

+# more than this and we switch to the logging module...  :-)
+def debug_begin(t):
+    """Open a '-' delimited section titled t when verbosity exceeds 1."""
+    if not options.verbose > 1:
+        return
+    print_begin(t, '-')
+
+def debug_end():
+    """Close a debug section when verbosity exceeds 1."""
+    if not options.verbose > 1:
+        return
+    print_end()
+
+def debug_print(t):
+    """Print t when verbosity exceeds 1.
+
+    If t is a list then each item is printed on its own line and an
+    empty list prints nothing.  Anything else is printed via str()."""
+    if not options.verbose > 1:
+        return
+    lines = t if type(t) == list else [t]
+    if lines != []:
+        print "\n".join([str(i) for i in lines])
+
+def ip_to_list_of_ints(ip):
+    """Return the IP address string ip as a list of 16 integer octets.
+
+    ip is first parsed as IPv6.  If that fails it is parsed as IPv4
+    and padded with 12 leading zero octets.  socket.error is raised
+    if the IPv4 parse fails as well."""
+
+    # Be lazy... but only expose errors in IPv4 addresses, since
+    # they'll be more commonly used.  :-)
+    try:
+        l = socket.inet_pton(socket.AF_INET6, ip)
+    except socket.error:
+        # Only a parse failure should trigger the IPv4 fallback; a
+        # bare except would also swallow KeyboardInterrupt.
+        #
+        # Pad with leading 0s.  This makes IPv4 addresses comparable
+        # with IPv6 but reduces the overall effectiveness of the
+        # algorithm.  The alternative would be to treat these
+        # addresses separately while trying to keep all the IPs in
+        # overall balance.
+        l = (b"\0" * 12) + socket.inet_pton(socket.AF_INET, ip)
+
+    # bytearray yields each octet as an integer.
+    return list(bytearray(l))
+
+def ip_distance(ip1, ip2):
+    """Calculate the distance between 2 IPs.
+
+    This is the length, in bits, of the longest common prefix of the
+    2 IPs.  It is calculated by XOR-ing the IPs together octet by
+    octet and counting the number of leading zero bits."""
+
+    octets1 = ip_to_list_of_ints(ip1)
+    octets2 = ip_to_list_of_ints(ip2)
+
+    prefix_len = 0
+    for (a, b) in zip(octets1, octets2):
+        differing = a ^ b
+        if differing != 0:
+            # bin() gives a minimal length '0bNNN' string, so what is
+            # left of 8 bits after dropping the '0b' is the number of
+            # leading zeroes in this octet.
+            prefix_len += 8 - (len(bin(differing)) - 2)
+            break
+        # Identical octets share all 8 bits.
+        prefix_len += 8
+
+    return prefix_len
+
+def ip_distance_2_sum(ip, ips):
+    """Return the sum of the squared IP distances from ip to each of ips.
+
+    This could be made more efficient by inlining ip_distance() into
+    the loop.  However, that would result in some loss of clarity
+    and also will not be necessary in a C implementation."""
+
+    return sum([ip_distance(ip, other) ** 2 for other in ips])
+
+def imbalance_metric(ips):
+    """Return the imbalance metric for a group of IPs.
+
+    This is the sum of squares of the IP distances between each pair
+    of IPs.  ips must be a sequence (it is sliced) when it holds more
+    than 1 IP.  Fewer than 2 IPs give 0."""
+    if len(ips) < 2:
+        return 0
+
+    # Pair each IP with the IPs after it so that every pair is
+    # counted exactly once.  Iterating instead of recursing keeps the
+    # stack depth constant however many IPs there are.
+    total = 0
+    for (idx, ip) in enumerate(ips):
+        total += ip_distance_2_sum(ip, ips[idx + 1:])
+    return total
+
+def mean(l):
+    """Return the arithmetic mean of the numbers in l as a float."""
+    total = float(sum(l))
+    return total / len(l)

 class Node(object):
+    """A simulated cluster node and the public IPs it is hosting."""
+
     def __init__(self, public_addresses):
-        self.public_addresses = set(public_addresses)
+        # List of lists allows groups of IPs to be passed in.  They're
+        # not actually used in the algorithm but are just used by
+        # calculate_imbalance() for checking the simulation.  Note
+        # that people can pass in garbage and make this code
+        # fail... but we're all friends here in simulation world...
+        # :-)
+        #
+        # An empty list (a node with no public addresses, which
+        # __str__ marks with "*") is treated as a flat list so that
+        # it is never indexed.
+        if len(public_addresses) == 0 or \
+                type(public_addresses[0]) is str:
+            self.public_addresses = set(public_addresses)
+            self.ip_groups = []
+        else:
+            # flatten
+            self.public_addresses = set([i for s in public_addresses for i in s])
+            self.ip_groups = public_addresses
+
         self.current_addresses = set()
         self.healthy = True
+        # -1 means that the imbalance has not been calculated yet.
+        self.imbalance = -1
+
+    def __str__(self):
+        # Flag is "*" for no public addresses, " " for healthy and
+        # "#" for unhealthy.  The imbalance is only shown for LCP2.
+        return "%s %s%s" % \
+            ("*" if len(self.public_addresses) == 0 else \
+                 (" " if self.healthy else "#"),
+             sorted(list(self.current_addresses)),
+             " %d" % self.imbalance if options.lcp2 else "")

     def can_node_serve_ip(self, ip):
         return ip in self.public_addresses

-    def node_ip_coverage(self):
-        return len(self.current_addresses)
+    def node_ip_coverage(self, ips=None):
+        """Return the number of IPs currently hosted by this node.
+
+        If ips is given then only hosted IPs that are in ips are
+        counted."""
+        return len([a for a in self.current_addresses if ips is None or a in ips])
+
+    def set_imbalance(self, imbalance=-1):
+        """Set the imbalance metric to the given value.  If none given
+        then calculate it."""
+
+        if imbalance != -1:
+            self.imbalance = imbalance
+        else:
+            self.imbalance = imbalance_metric(list(self.current_addresses))
+
+    def get_imbalance(self):
+        return self.imbalance

 class Cluster(object):
    def __init__(self):
@ -131,27 +257,46 @@ class Cluster(object):
        self.ip_moves = []
        self.grat_ip_moves = []
        self.imbalance = []
+        self.imbalance_groups = []
+        self.imbalance_count = 0
+        self.imbalance_groups_count = itertools.repeat(0)
+        self.imbalance_metric = []
        self.events = -1
        self.num_unhealthy = []

        self.prev = None

    def __str__(self):
-        return "\n".join(["%2d %s %s" %
-                          (i,
-                           "*" if len(n.public_addresses) == 0 else \
-                               (" " if n.healthy else "#"),
-                           sorted(list(n.current_addresses)))
-                          for (i, n) in enumerate(self.nodes)])
+        return "\n".join(["%2d %s" % (i, n) \
+                              for (i, n) in enumerate(self.nodes)])
+
+    def have_ip_groups(self):
+        """Return True if the 1st node was given IP groups.
+
+        This is naive.  It assumes that IP groups are indicated by
+        the 1st node having IP groups."""
+        first = self.nodes[0]
+        return len(first.ip_groups) > 0

    def print_statistics(self):
        print_begin("STATISTICS")
-        print "Events:              %6d" % self.events
-        print "Total IP moves:      %6d" % sum(self.ip_moves)
-        print "Gratuitous IP moves: %6d" % sum(self.grat_ip_moves)
-        print "Max imbalance:       %6d" % max(self.imbalance)
-        print "Final imbalance:     %6d" % self.imbalance[-1]
-        print "Maximum unhealthy:   %6d" % max(self.num_unhealthy)
+        print "Events:                      %6d" % self.events
+        print "Total IP moves:              %6d" % sum(self.ip_moves)
+        print "Gratuitous IP moves:         %6d" % sum(self.grat_ip_moves)
+        print "Max imbalance:               %6d" % max(self.imbalance)
+        if self.have_ip_groups():
+            print "Max group imbalance counts:    ", map(max, zip(*self.imbalance_groups))
+        print "Mean imbalance:              %f" % mean(self.imbalance)
+        if self.have_ip_groups():
+            print "Mean group imbalances counts:   ", map(mean, zip(*self.imbalance_groups))
+        print "Final imbalance:             %6d" % self.imbalance[-1]
+        if self.have_ip_groups():
+            print "Final group imbalances:         ", self.imbalance_groups[-1]
+        if options.lcp2:
+            print "Max LCP2 imbalance  :        %6d" % max(self.imbalance_metric)
+        print "Soft imbalance count:        %6d" % self.imbalance_count
+        if self.have_ip_groups():
+            print "Soft imbalance group counts:    ", self.imbalance_groups_count
+        if options.lcp2:
+            print "Final LCP2 imbalance  :      %6d" % self.imbalance_metric[-1]
+        print "Maximum unhealthy:           %6d" % max(self.num_unhealthy)
        print_end()

    def find_pnn_with_ip(self, ip):
@ -178,7 +323,7 @@ class Cluster(object):
            verbose_print(pnn)

        verbose_end()
-        
+
    def unhealthy(self, *pnns):

        verbose_begin("UNHEALTHY")
@ -191,27 +336,42 @@ class Cluster(object):

    def do_something_random(self):

+        """Make random node(s) healthy or unhealthy.

-        """Make a random node healthy or unhealthy.
+        If options.aggressive is False then: If all nodes are healthy
+        or unhealthy, then invert one of them; otherwise, there's a 1
+        in options.odds chance of making another node unhealthy.

-        If all nodes are healthy or unhealthy, then invert one of
-        them.  Otherwise, there's a 1 in options.odds chance of making
-        another node unhealthy."""
+        If options.aggressive is True then: For each node there is a 1
+        in options.odds chance of flipping the state of that node
+        between healthy and unhealthy."""

-        num_nodes = len(self.nodes)
-        healthy_pnns = [i for (i,n) in enumerate(self.nodes) if n.healthy]
-        num_healthy = len(healthy_pnns)
+        if not options.aggressive:
+            num_nodes = len(self.nodes)
+            healthy_pnns = [i for (i,n) in enumerate(self.nodes) if n.healthy]
+            num_healthy = len(healthy_pnns)

-        if num_nodes == num_healthy:
-            self.unhealthy(random.randint(0, num_nodes-1))
-        elif num_healthy == 0:
-            self.healthy(random.randint(0, num_nodes-1))
-        elif random.randint(1, options.odds) == 1:
-            self.unhealthy(random.choice(healthy_pnns))
+            if num_nodes == num_healthy:
+                self.unhealthy(random.randint(0, num_nodes-1))
+            elif num_healthy == 0:
+                self.healthy(random.randint(0, num_nodes-1))
+            elif random.randint(1, options.odds) == 1:
+                self.unhealthy(random.choice(healthy_pnns))
+            else:
+                all_pnns = range(num_nodes)
+                unhealthy_pnns = sorted(list(set(all_pnns) - set(healthy_pnns)))
+                self.healthy(random.choice(unhealthy_pnns))
        else:
-            all_pnns = range(num_nodes)
-            unhealthy_pnns = sorted(list(set(all_pnns) - set(healthy_pnns)))
-            self.healthy(random.choice(unhealthy_pnns))
+            # We need to make at least one change or we retry...
+            changed = False
+            while not changed:
+                for (pnn, n) in enumerate(self.nodes):
+                    if random.randint(1, options.odds) == 1:
+                        changed = True
+                        if n.healthy:
+                            self.unhealthy(pnn)
+                        else:
+                            self.healthy(pnn)

    def random_iterations(self):
        i = 1
@ -219,35 +379,26 @@ class Cluster(object):
            verbose_begin("EVENT %d" % i)
            verbose_end()
            self.do_something_random()
-            if self.recover() and options.exit > 0:
+            if self.recover() and options.exit:
                break
            i += 1

        self.print_statistics()

-    def calculate_imbalance(self):
+    def imbalance_for_ips(self, ips):

        imbalance = 0

-        assigned = sorted([ip
-                           for n in self.nodes
-                           for ip in n.current_addresses])
+        maxnode = -1
+        minnode = -1

-        for ip in assigned:
-
-            num_capable = 0
-            maxnode = -1
-            minnode = -1
+        for ip in ips:
            for (i, n) in enumerate(self.nodes):
-                if not n.healthy:
+
+                if not n.healthy or not n.can_node_serve_ip(ip):
                    continue

-                if not n.can_node_serve_ip(ip):
-                    continue
-
-                num_capable += 1
-
-                num = n.node_ip_coverage()
+                num = n.node_ip_coverage(ips)

                if maxnode == -1 or num > maxnum:
                    maxnode = i
@ -256,24 +407,42 @@ class Cluster(object):
                if minnode == -1 or num < minnum:
                    minnode = i
                    minnum = num
-            
-            if maxnode == -1:
+
+            if maxnode == -1 or minnode == -1:
                continue

            i = maxnum - minnum
-            if maxnum - minnum < 2:
-                i = 0
+            #if i < 2:
+            #    i = 0
            imbalance = max([imbalance, i])

        return imbalance

+
+    def calculate_imbalance(self):
+        """Return (imbalance, group_imbalances) for the current layout.
+
+        imbalance is imbalance_for_ips() over all assigned IPs.
+        group_imbalances holds imbalance_for_ips() for each IP group
+        of the 1st node, in order, and is empty without IP groups."""
+
+        # First, do all the assigned IPs.
+        assigned = []
+        for n in self.nodes:
+            assigned.extend(n.current_addresses)
+        overall = self.imbalance_for_ips(sorted(assigned))
+
+        # FIXME?  If dealing with IP groups, assume the nodes are all
+        # the same.
+        per_group = [self.imbalance_for_ips(group)
+                     for group in self.nodes[0].ip_groups]
+
+        return (overall, per_group)
+
+
    def diff(self):
        """Calculate differences in IP assignments between self and prev.

        Gratuitous IP moves (from a healthy node to a healthy node)
-        are prefix by !!.  Any gratuitous IP moves cause this function
-        to return False.  If there are no gratuitous moves then it
-        will return True."""
+        are prefixed by !!."""

        ip_moves = 0
        grat_ip_moves = 0
@ -297,33 +466,6 @@ class Cluster(object):
                                   (prefix, ip, old, new))

        return (ip_moves, grat_ip_moves, details)
-                    
-    def find_least_loaded_node(self, ip):
-        """Just like find_takeover_node but doesn't care about health."""
-        pnn = -1
-        min = 0
-        for (i, n) in enumerate(self.nodes):
-            if not n.can_node_serve_ip(ip):
-                continue
-
-            num = n.node_ip_coverage()
-
-            if (pnn == -1):
-                pnn = i
-                min = num
-            else:
-                if num < min:
-                    pnn = i
-                    min = num
-
-        if pnn == -1:
-            verbose_print("Could not find node to take over public address %s" % ip)
-            return False
-
-        self.nodes[pnn].current_addresses.add(ip)
-
-        verbose_print("%s -> %d" % (ip, pnn))
-        return True

    def find_takeover_node(self, ip):

@ -355,6 +497,190 @@ class Cluster(object):
        verbose_print("%s -> %d" % (ip, pnn))
        return True

+    def basic_allocate_unassigned(self):
+        """Pass each unassigned public IP, in sorted order, to
+        find_takeover_node()."""
+
+        in_use = set()
+        for n in self.nodes:
+            in_use |= n.current_addresses
+
+        for ip in sorted(self.all_public_ips - in_use):
+            self.find_takeover_node(ip)
+
+    def basic_failback(self, retries_l):
+        """Try one rebalancing step of the basic (non-LCP2) algorithm.
+
+        For each assigned IP, in sorted order, look at the healthy
+        nodes that can serve it.  If the busiest of them hosts at
+        least 2 more IPs than the idlest, and fewer than
+        options.retries steps have been taken, then the 1st (sorted)
+        IP is removed from the busiest node, leaving it unassigned.
+
+        retries_l is a 1-element list holding the retry count, which
+        is incremented in place.  Returns True if an IP was removed
+        and False otherwise."""
+
+        assigned = sorted([ip
+                           for n in self.nodes
+                           for ip in n.current_addresses])
+        for ip in assigned:
+
+            # (pnn, IP count) for each healthy node that can serve ip.
+            loads = [(pnn, n.node_ip_coverage())
+                     for (pnn, n) in enumerate(self.nodes)
+                     if n.healthy and n.can_node_serve_ip(ip)]
+
+            if loads == []:
+                print "Could not find maxnode. May not be able to serve ip", ip
+                continue
+
+            # max() keeps the 1st of several equally busy nodes.
+            (maxnode, maxnum) = max(loads, key=lambda load: load[1])
+            minnum = min([num for (pnn, num) in loads])
+
+            if maxnum <= minnum + 1 or retries_l[0] >= options.retries:
+                continue
+
+            # Remove the 1st ip from maxnode
+            realloc = sorted(list(self.nodes[maxnode].current_addresses))[0]
+            verbose_print("%s <- %d" % (realloc, maxnode))
+            self.nodes[maxnode].current_addresses.remove(realloc)
+            # Redo the outer loop.
+            retries_l[0] += 1
+            return True
+
+        return False
+
+
+    def lcp2_allocate_unassigned(self):
+        """Assign unassigned public IPs to healthy nodes using LCP2.
+
+        Each pass considers every (unassigned IP, healthy capable
+        node) pair and performs the single assignment that adds the
+        smallest sum of squared IP distances to its node.  On ties
+        the 1st pair found wins.  Passes repeat until no IPs remain
+        or none of them can be placed.  IPs that can not be placed
+        are reported via verbose_print()."""
+
+        # Assign as many unassigned addresses as possible.  Keep
+        # selecting the optimal assignment until we don't manage to
+        # assign anything.
+        assigned = set([ip for n in self.nodes for ip in n.current_addresses])
+        unassigned = sorted(list(self.all_public_ips - assigned))
+
+        while len(unassigned) > 0:
+            debug_begin(" CONSIDERING MOVES (UNASSIGNED)")
+
+            # Cheapest assignment found in this pass, if any.
+            minnode = -1
+            mindsum = 0
+            minip = None
+
+            for ip in unassigned:
+                for (dstnode, n) in enumerate(self.nodes):
+                    if not (n.can_node_serve_ip(ip) and n.healthy):
+                        continue
+
+                    # What would this IP add to the node's imbalance?
+                    dstdsum = ip_distance_2_sum(ip, n.current_addresses)
+                    debug_print(" %s -> %d [+%d]" % (ip, dstnode, dstdsum))
+
+                    if (minnode == -1) or (dstdsum < mindsum):
+                        minnode = dstnode
+                        mindsum = dstdsum
+                        minip = ip
+            debug_end()
+
+            if minnode == -1:
+                # No node can take any of the remaining IPs.
+                break
+
+            self.nodes[minnode].current_addresses.add(minip)
+            self.nodes[minnode].set_imbalance(self.nodes[minnode].get_imbalance() + mindsum)
+            verbose_print("%s -> %d [+%d]" % (minip, minnode, mindsum))
+            unassigned.remove(minip)
+
+        for ip in unassigned:
+            verbose_print("Could not find node to take over public address %s" % ip)
+
+    def lcp2_failback(self, targets):
+        """Try to move one IP off the most imbalanced node.
+
+        The source is the node with the highest imbalance metric
+        (the 1st such node on ties).  targets holds the node numbers
+        that may receive an IP.  A move is only considered if the
+        destination ends up with a lower imbalance than the source
+        started with and the IP costs the destination less than it
+        costs the source.  Of those, the move with the lowest combined
+        source and destination imbalance is made.
+
+        Returns True if an IP was moved and False otherwise."""
+
+        # Get the node with the highest imbalance metric.
+        srcnode = -1
+        maximbl = 0
+        for (pnn, n) in enumerate(self.nodes):
+            b = n.get_imbalance()
+            if (srcnode == -1) or (b > maximbl):
+                srcnode = pnn
+                maximbl = b
+
+        # This means that all nodes had 0 or 1 addresses, so can't
+        # be imbalanced.
+        if maximbl == 0:
+            return False
+
+        # We'll need this a few times...
+        ips = self.nodes[srcnode].current_addresses
+
+        # Find an IP and destination node that best reduces imbalance.
+        optimum = None
+        debug_begin(" CONSIDERING MOVES FROM %d [%d]" % (srcnode, maximbl))
+        for ip in ips:
+            # What is this IP address costing the source node?
+            srcdsum = ip_distance_2_sum(ip, ips - set([ip]))
+            srcimbl = maximbl - srcdsum
+
+            # Consider what this IP address would cost each potential
+            # destination node.  Destination nodes are limited to
+            # those that are newly healthy, since we don't want to
+            # do gratuitous failover of IPs just to make minor
+            # balance improvements.
+            for dstnode in targets:
+                if self.nodes[dstnode].can_node_serve_ip(ip) and \
+                        self.nodes[dstnode].healthy:
+                    dstdsum = ip_distance_2_sum(ip, self.nodes[dstnode].current_addresses)
+                    dstimbl = self.nodes[dstnode].get_imbalance() + dstdsum
+                    debug_print(" %d [%d] -> %s -> %d [+%d]" % \
+                                    (srcnode,
+                                     srcimbl - self.nodes[srcnode].get_imbalance(),
+                                     ip,
+                                     dstnode,
+                                     dstimbl - self.nodes[dstnode].get_imbalance()))
+
+                    # Only a strictly better candidate replaces the
+                    # current optimum, so the 1st one found wins ties.
+                    if (dstimbl < maximbl) and (dstdsum < srcdsum):
+                        if optimum is None:
+                            optimum = (ip, srcnode, srcimbl, dstnode, dstimbl)
+                        else:
+                            (x, sn, si, dn, di) = optimum
+                            if (srcimbl + dstimbl) < (si + di):
+                                optimum = (ip, srcnode, srcimbl, dstnode, dstimbl)
+        debug_end()
+
+        if optimum is not None:
+            # We found a move that makes things better...
+            (ip, srcnode, srcimbl, dstnode, dstimbl) = optimum
+            # Remember the old values so that the changes can be shown.
+            ini_srcimbl = self.nodes[srcnode].get_imbalance()
+            ini_dstimbl = self.nodes[dstnode].get_imbalance()
+
+            self.nodes[srcnode].current_addresses.remove(ip)
+            self.nodes[srcnode].set_imbalance(srcimbl)
+
+            self.nodes[dstnode].current_addresses.add(ip)
+            self.nodes[dstnode].set_imbalance(dstimbl)
+
+            verbose_print("%d [%d] -> %s -> %d [+%d]" % \
+                              (srcnode,
+                               srcimbl - ini_srcimbl,
+                               ip,
+                               dstnode,
+                               dstimbl - ini_dstimbl))
+
+            return True
+
+        return False
+
+
    def ctdb_takeover_run(self):

        self.events += 1
@ -369,22 +695,11 @@ class Cluster(object):
            # Remap everything.
            addr_list = sorted(list(self.all_public_ips))
            for (i, ip) in enumerate(addr_list):
-                if options.hack == 1:
-                    self.quietly_remove_ip(ip)
-                    self.find_least_loaded_node(ip)
-                elif options.hack == 2:
-                    pnn = i % len(self.nodes)
-                    if ip in self.nodes[pnn].public_addresses:
-                        self.quietly_remove_ip(ip)
-                        # Add addresses to new node.
-                        self.nodes[pnn].current_addresses.add(ip)
-                        verbose_print("%s -> %d" % (ip, pnn))
-                else:
-                    self.quietly_remove_ip(ip)
-                    # Add addresses to new node.
-                    pnn = i % len(self.nodes)
-                    self.nodes[pnn].current_addresses.add(ip)
-                    verbose_print("%s -> %d" % (ip, pnn))
+                self.quietly_remove_ip(ip)
+                # Add addresses to new node.
+                pnn = i % len(self.nodes)
+                self.nodes[pnn].current_addresses.add(ip)
+                verbose_print("%s -> %d" % (ip, pnn))

        # Remove public addresses from unhealthy nodes.
        for (pnn, n) in enumerate(self.nodes):
@ -399,69 +714,39 @@ class Cluster(object):
                           for ip in n.current_addresses - n.public_addresses])
            n.current_addresses &= n.public_addresses

-        # We'll only retry the balancing act up to 5 times.
-        retries = 0
+        if options.lcp2:
+            newly_healthy = [pnn for (pnn, n) in enumerate(self.nodes)
+                             if len(n.current_addresses) == 0 and n.healthy]
+            for n in self.nodes:
+                n.set_imbalance()
+
+        # We'll only retry the balancing act up to options.retries
+        # times (for the basic non-deterministic algorithm).  This
+        # nonsense gives us a reference on the retries count in
+        # Python.  It will be easier in C.  :-)
+        # For LCP2 we reassign as many IPs from heavily "loaded" nodes
+        # to nodes that are newly healthy, looping until we fail to
+        # reassign an IP.
+        retries_l = [0]
        should_loop = True
        while should_loop:
            should_loop = False

-            assigned = set([ip for n in self.nodes for ip in n.current_addresses])
-            unassigned = sorted(list(self.all_public_ips - assigned))
+            if options.lcp2:
+                self.lcp2_allocate_unassigned()
+            else:
+                self.basic_allocate_unassigned()

-            for ip in unassigned:
-                self.find_takeover_node(ip)
-
-            if self.no_ip_failback:
+            if self.no_ip_failback or self.deterministic_public_ips:
                break

-            assigned = sorted([ip
-                               for n in self.nodes
-                               for ip in n.current_addresses])
-            for ip in assigned:
-
-                maxnode = -1
-                minnode = -1
-                for (i, n) in enumerate(self.nodes):
-                    if not n.healthy:
-                        continue
-
-                    if not n.can_node_serve_ip(ip):
-                        continue
-
-                    num = n.node_ip_coverage()
-
-                    if maxnode == -1:
-                        maxnode = i
-                        maxnum = num
-                    else:
-                        if num > maxnum:
-                            maxnode = i
-                            maxnum = num
-                    if minnode == -1:
-                        minnode = i
-                        minnum = num
-                    else:
-                        if num < minnum:
-                            minnode = i
-                            minnum = num
-
-                if maxnode == -1:
-                    print "Could not maxnode. May not be able to serve ip", ip
-                    continue
-
-                if self.deterministic_public_ips:
-                    continue
-
-                if maxnum > minnum + 1 and retries < options.retries:
-                    # Remove the 1st ip from maxnode
-                    t = sorted(list(self.nodes[maxnode].current_addresses))
-                    realloc = t[0]
-                    verbose_print("%s <- %d" % (realloc, maxnode))
-                    self.nodes[maxnode].current_addresses.remove(realloc)
-                    retries += 1
-                    # Redo the outer loop.
-                    should_loop = True
+            if options.lcp2:
+                if len(newly_healthy) == 0:
                    break
+                should_loop = self.lcp2_failback(newly_healthy)
+            else:
+                should_loop = self.basic_failback(retries_l)
+

    def recover(self):
        verbose_begin("TAKEOVER")
@ -482,11 +767,31 @@ class Cluster(object):
                print "\n".join(details)
                print_end()

-        imbalance = self.calculate_imbalance()
+        (imbalance, imbalance_groups) = self.calculate_imbalance()
        self.imbalance.append(imbalance)
+        self.imbalance_groups.append(imbalance_groups)
+
+        if imbalance > options.soft_limit:
+            self.imbalance_count += 1
+
+        # There must be a cleaner way...
+        t = []
+        for (c, i) in zip(self.imbalance_groups_count, imbalance_groups):
+            if i > options.soft_limit:
+                t.append(c + i)
+            else:
+                t.append(c)
+        self.imbalance_groups_count = t
+
+        imbalance_metric = max([n.get_imbalance() for n in self.nodes])
+        self.imbalance_metric.append(imbalance_metric)
        if options.balance:
            print_begin("IMBALANCE")
-            print imbalance
+            print "ALL IPS:", imbalance
+            if self.have_ip_groups():
+                print "IP GROUPS:", imbalance_groups
+            if options.lcp2:
+                print "LCP2 IMBALANCE:", imbalance_metric
            print_end()

        num_unhealthy = len(self.nodes) - \
@ -501,4 +806,7 @@ class Cluster(object):
        self.prev = None
        self.prev = copy.deepcopy(self)

-        return grat_ip_moves
+        # True is bad: gratuitous IP moves or imbalance over the hard limit.
+        return (grat_ip_moves > 0) or \
+            (not self.have_ip_groups() and imbalance > options.hard_limit) or \
+            (self.have_ip_groups() and (max(imbalance_groups) > options.hard_limit))
--- a/ctdb/tests/takeover/ip_groups1.py
+++ b/ctdb/tests/takeover/ip_groups1.py
@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+# 2 IP groups, both on the same 5 nodes, with each group on different
+# interfaces/VLANs.  One group has many more addresses to test how
+# well an "imbalanced" configuration will balance...
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses20 = ['192.168.20.%d' % n for n in range(1, 13)]
+addresses128 = ['192.168.128.%d' % n for n in range(1, 5)]
+
+c = Cluster()
+
+for i in range(5):
+    c.add_node(Node([addresses20, addresses128]))
+
+#for i in range(3):
+#    c.add_node(Node([addresses20]))
+
+
+c.recover()
+
+c.random_iterations()
--- a/ctdb/tests/takeover/ip_groups2.py
+++ b/ctdb/tests/takeover/ip_groups2.py
@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+# 2 groups of addresses, combined into 1 pool so the checking
+# algorithm doesn't know about the groups, across 2 nodes.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses20 = ['192.168.20.%d' % n for n in range(1, 13)]
+addresses21 = ['192.168.21.%d' % n for n in range(1, 5)]
+
+c = Cluster()
+
+for i in range(2):
+    c.add_node(Node(addresses20 + addresses21))
+
+c.recover()
+
+c.random_iterations()
--- a/ctdb/tests/takeover/ip_groups3.py
+++ b/ctdb/tests/takeover/ip_groups3.py
@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+# 4 IP groups, across 10 nodes, with each group on different
+# interfaces/VLANs.  80 addresses in total but not evenly balanced, to
+# help check some of the more extreme behaviour.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses1 = ['192.168.1.%d' % n for n in range(1, 41)]
+addresses2 = ['192.168.2.%d' % n for n in range(1, 21)]
+addresses3 = ['192.168.3.%d' % n for n in range(1, 11)]
+addresses4 = ['192.168.4.%d' % n for n in range(1, 11)]
+
+# Try detecting imbalance with square root of number of nodes?  Or
+# just with a parameter indicating how much imbalance you're willing
+# to accept...
+
+c = Cluster()
+
+for i in range(10):
+    c.add_node(Node([addresses1, addresses2, addresses3, addresses4]))
+
+c.recover()
+
+c.random_iterations()
--- a/ctdb/tests/takeover/ip_groups4.py
+++ b/ctdb/tests/takeover/ip_groups4.py
@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+# 2 IP groups, across 2 nodes, with each group on different
+# interfaces.  4 addresses per group.  A nice little canonical 2 node
+# configuration.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses1 = ['192.168.1.%d' % n for n in range(1, 5)]
+addresses2 = ['192.168.2.%d' % n for n in range(1, 5)]
+
+# Try detecting imbalance with square root of number of nodes?  Or
+# just with a parameter indicating how much imbalance you're willing
+# to accept...
+
+c = Cluster()
+
+for i in range(2):
+    c.add_node(Node([addresses1, addresses2]))
+
+c.recover()
+
+c.random_iterations()
--- a/ctdb/tests/takeover/ip_groups5.py
+++ b/ctdb/tests/takeover/ip_groups5.py
@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+# 1 IP group, to test backward compatibility of LCP2 algorithm.  16
+# addresses across 4 nodes.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses1 = ['192.168.1.%d' % n for n in range(1, 17)]
+
+# Try detecting imbalance with square root of number of nodes?  Or
+# just with a parameter indicating how much imbalance you're willing
+# to accept...
+
+c = Cluster()
+
+for i in range(4):
+    c.add_node(Node(addresses1))
+
+c.recover()
+
+c.random_iterations()
--- a/ctdb/tools/ctdb.c
+++ b/ctdb/tools/ctdb.c
@ -893,6 +893,7 @@ static int control_natgwlist(struct ctdb_context *ctdb, int argc, const char **a
 		/* or if we still can not find any */
 		if (i == nodemap->num) {
 			printf("-1 0.0.0.0\n");
+			ret = 2; /* matches ENOENT */
 		}
 	}

@ -910,7 +911,7 @@ static int control_natgwlist(struct ctdb_context *ctdb, int argc, const char **a
 		       !!(nodemap->nodes[i].flags&NODE_FLAGS_STOPPED));
 	}

-	return 0;
+	return ret;
 }

 /*