mirror of
https://github.com/samba-team/samba.git
synced 2025-02-05 21:57:51 +03:00
Merge remote-tracking branch 'origin/master' into eventscripts_relative
(This used to be ctdb commit b723f23fc9c38e75b91d43306d606be26c55d31d)
This commit is contained in:
commit
46c908d542
@ -81,12 +81,17 @@ static void dump_packet(unsigned char *data, size_t len)
|
||||
|
||||
/*
|
||||
called when an incoming connection is readable
|
||||
This function MUST be safe for reentry via the queue callback!
|
||||
*/
|
||||
static void queue_io_read(struct ctdb_queue *queue)
|
||||
{
|
||||
int num_ready = 0;
|
||||
ssize_t nread, totread, partlen;
|
||||
uint8_t *data, *data_base;
|
||||
uint32_t sz_bytes_req;
|
||||
uint32_t pkt_size;
|
||||
uint32_t pkt_bytes_remaining;
|
||||
uint32_t to_read;
|
||||
ssize_t nread;
|
||||
uint8_t *data;
|
||||
|
||||
if (ioctl(queue->fd, FIONREAD, &num_ready) != 0) {
|
||||
return;
|
||||
@ -96,93 +101,77 @@ static void queue_io_read(struct ctdb_queue *queue)
|
||||
goto failed;
|
||||
}
|
||||
|
||||
|
||||
queue->partial.data = talloc_realloc_size(queue, queue->partial.data,
|
||||
num_ready + queue->partial.length);
|
||||
|
||||
if (queue->partial.data == NULL) {
|
||||
DEBUG(DEBUG_ERR,("%s: read error alloc failed for %u\n",
|
||||
queue->name, num_ready + queue->partial.length));
|
||||
goto failed;
|
||||
/* starting fresh, allocate buf for size bytes */
|
||||
sz_bytes_req = sizeof(pkt_size);
|
||||
queue->partial.data = talloc_size(queue, sz_bytes_req);
|
||||
if (queue->partial.data == NULL) {
|
||||
DEBUG(DEBUG_ERR,("read error alloc failed for %u\n",
|
||||
sz_bytes_req));
|
||||
goto failed;
|
||||
}
|
||||
} else if (queue->partial.length < sizeof(pkt_size)) {
|
||||
/* yet to find out the packet length */
|
||||
sz_bytes_req = sizeof(pkt_size) - queue->partial.length;
|
||||
} else {
|
||||
/* partial packet, length known, full buf allocated */
|
||||
sz_bytes_req = 0;
|
||||
}
|
||||
|
||||
nread = read(queue->fd, queue->partial.data + queue->partial.length, num_ready);
|
||||
if (nread <= 0) {
|
||||
DEBUG(DEBUG_ERR,("%s: read error nread=%d\n",
|
||||
queue->name, (int)nread));
|
||||
goto failed;
|
||||
}
|
||||
totread = nread;
|
||||
partlen = queue->partial.length;
|
||||
|
||||
data = queue->partial.data;
|
||||
nread += queue->partial.length;
|
||||
|
||||
if (sz_bytes_req > 0) {
|
||||
to_read = MIN(sz_bytes_req, num_ready);
|
||||
nread = read(queue->fd, data + queue->partial.length,
|
||||
to_read);
|
||||
if (nread <= 0) {
|
||||
DEBUG(DEBUG_ERR,("read error nread=%d\n", (int)nread));
|
||||
goto failed;
|
||||
}
|
||||
queue->partial.length += nread;
|
||||
|
||||
if (nread < sz_bytes_req) {
|
||||
/* not enough to know the length */
|
||||
DEBUG(DEBUG_DEBUG,("Partial packet length read\n"));
|
||||
return;
|
||||
}
|
||||
/* size now known, allocate buffer for the full packet */
|
||||
queue->partial.data = talloc_realloc_size(queue, data,
|
||||
*(uint32_t *)data);
|
||||
if (queue->partial.data == NULL) {
|
||||
DEBUG(DEBUG_ERR,("read error alloc failed for %u\n",
|
||||
*(uint32_t *)data));
|
||||
goto failed;
|
||||
}
|
||||
data = queue->partial.data;
|
||||
num_ready -= nread;
|
||||
}
|
||||
|
||||
pkt_size = *(uint32_t *)data;
|
||||
if (pkt_size == 0) {
|
||||
DEBUG(DEBUG_CRIT,("Invalid packet of length 0\n"));
|
||||
goto failed;
|
||||
}
|
||||
|
||||
pkt_bytes_remaining = pkt_size - queue->partial.length;
|
||||
to_read = MIN(pkt_bytes_remaining, num_ready);
|
||||
nread = read(queue->fd, data + queue->partial.length,
|
||||
to_read);
|
||||
if (nread <= 0) {
|
||||
DEBUG(DEBUG_ERR,("read error nread=%d\n",
|
||||
(int)nread));
|
||||
goto failed;
|
||||
}
|
||||
queue->partial.length += nread;
|
||||
|
||||
if (queue->partial.length < pkt_size) {
|
||||
DEBUG(DEBUG_DEBUG,("Partial packet data read\n"));
|
||||
return;
|
||||
}
|
||||
|
||||
queue->partial.data = NULL;
|
||||
queue->partial.length = 0;
|
||||
|
||||
if (nread >= 4 && *(uint32_t *)data == nread) {
|
||||
/* it is the responsibility of the incoming packet
|
||||
function to free 'data' */
|
||||
queue->callback(data, nread, queue->private_data);
|
||||
return;
|
||||
}
|
||||
|
||||
data_base = data;
|
||||
|
||||
while (nread >= 4 && *(uint32_t *)data <= nread) {
|
||||
/* we have at least one packet */
|
||||
uint8_t *d2;
|
||||
uint32_t len;
|
||||
bool destroyed = false;
|
||||
|
||||
len = *(uint32_t *)data;
|
||||
if (len == 0) {
|
||||
/* bad packet! treat as EOF */
|
||||
DEBUG(DEBUG_CRIT,("%s: Invalid packet of length 0 (nread = %zu, totread = %zu, partlen = %zu)\n",
|
||||
queue->name, nread, totread, partlen));
|
||||
dump_packet(data_base, totread + partlen);
|
||||
goto failed;
|
||||
}
|
||||
d2 = talloc_memdup(queue, data, len);
|
||||
if (d2 == NULL) {
|
||||
DEBUG(DEBUG_ERR,("%s: read error memdup failed for %u\n",
|
||||
queue->name, len));
|
||||
/* sigh */
|
||||
goto failed;
|
||||
}
|
||||
|
||||
queue->destroyed = &destroyed;
|
||||
queue->callback(d2, len, queue->private_data);
|
||||
/* If callback freed us, don't do anything else. */
|
||||
if (destroyed) {
|
||||
return;
|
||||
}
|
||||
queue->destroyed = NULL;
|
||||
|
||||
data += len;
|
||||
nread -= len;
|
||||
}
|
||||
|
||||
if (nread > 0) {
|
||||
/* we have only part of a packet */
|
||||
if (data_base == data) {
|
||||
queue->partial.data = data;
|
||||
queue->partial.length = nread;
|
||||
} else {
|
||||
queue->partial.data = talloc_memdup(queue, data, nread);
|
||||
if (queue->partial.data == NULL) {
|
||||
DEBUG(DEBUG_ERR,("%s: read error memdup partial failed for %u\n",
|
||||
queue->name, (unsigned)nread));
|
||||
goto failed;
|
||||
}
|
||||
queue->partial.length = nread;
|
||||
talloc_free(data_base);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
talloc_free(data_base);
|
||||
/* it is the responsibility of the callback to free 'data' */
|
||||
queue->callback(data, pkt_size, queue->private_data);
|
||||
return;
|
||||
|
||||
failed:
|
||||
|
@ -302,11 +302,8 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
|
||||
exit(11);
|
||||
}
|
||||
ctdb->db_persistent_check_errors = 0;
|
||||
DEBUG(DEBUG_NOTICE,(__location__
|
||||
"ctdb_start_monitoring: ctdb_recheck_persistent_health() OK\n"));
|
||||
|
||||
DEBUG(DEBUG_NOTICE,(__location__ " Recoveries finished. Running the \"startup\" event.\n"));
|
||||
DEBUG(DEBUG_ERR,(__location__ " Allow clients to attach to databases.\n"));
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current(),
|
||||
ctdb_check_health, ctdb);
|
||||
|
@ -1638,7 +1638,6 @@ static int do_recovery(struct ctdb_recoverd *rec,
|
||||
DEBUG(DEBUG_ERR, (__location__ " Unable to setup public takeover addresses. ctdb_takeover_run() failed.\n"));
|
||||
rec->need_takeover_run = true;
|
||||
}
|
||||
DEBUG(DEBUG_NOTICE, (__location__ " Recovery - takeip finished\n"));
|
||||
|
||||
/* execute the "recovered" event script on all nodes */
|
||||
ret = run_recovered_eventscript(ctdb, nodemap, "do_recovery");
|
||||
|
@ -2633,8 +2633,6 @@ int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
|
||||
|
||||
iface = ctdb_find_iface(ctdb, info->name);
|
||||
if (iface == NULL) {
|
||||
DEBUG(DEBUG_ERR, (__location__ "iface[%s] is unknown\n",
|
||||
info->name));
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -196,7 +196,7 @@ int main(int argc, const char *argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("Starting CTDB daemon\n"));
|
||||
DEBUG(DEBUG_NOTICE,("CTDB starting on node\n"));
|
||||
|
||||
gettimeofday(&ctdb->ctdbd_start_time, NULL);
|
||||
gettimeofday(&ctdb->last_recovery_started, NULL);
|
||||
|
@ -103,15 +103,9 @@ try_command_on_node $test_node "echo \"function exportfs () { echo $foo_dir 127.
|
||||
|
||||
n="$rc_local_d/nfs-skip-share-check"
|
||||
n_contents='loadconfig() {
|
||||
name="$1"
|
||||
if [ -f /etc/sysconfig/$name ]; then
|
||||
. /etc/sysconfig/$name
|
||||
elif [ -f /etc/default/$name ]; then
|
||||
. /etc/default/$name
|
||||
elif [ -f $CTDB_BASE/sysconfig/$name ]; then
|
||||
. $CTDB_BASE/sysconfig/$name
|
||||
fi
|
||||
if [ "$name" = "ctdb" ] ; then
|
||||
_loadconfig "$@"
|
||||
|
||||
if [ "$1" = "ctdb" -o "$1" = "nfs" ] ; then
|
||||
CTDB_NFS_SKIP_SHARE_CHECK=no
|
||||
fi
|
||||
}
|
||||
|
@ -108,15 +108,9 @@ try_command_on_node $test_node "echo 'function testparm () { tp=\$(which testpar
|
||||
|
||||
n="$rc_local_d/samba-skip-share-check"
|
||||
n_contents='loadconfig() {
|
||||
name="$1"
|
||||
if [ -f /etc/sysconfig/$name ]; then
|
||||
. /etc/sysconfig/$name
|
||||
elif [ -f /etc/default/$name ]; then
|
||||
. /etc/default/$name
|
||||
elif [ -f $CTDB_BASE/sysconfig/$name ]; then
|
||||
. $CTDB_BASE/sysconfig/$name
|
||||
fi
|
||||
if [ "$name" = "ctdb" ] ; then
|
||||
_loadconfig "$@"
|
||||
|
||||
if [ "$1" = "ctdb" ] ; then
|
||||
CTDB_SAMBA_SKIP_SHARE_CHECK=no
|
||||
fi
|
||||
}
|
||||
|
24
ctdb/tests/onnode/0001.sh
Executable file
24
ctdb/tests/onnode/0001.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE all hostname"
|
||||
|
||||
define_test "$cmd" "all nodes OK"
|
||||
|
||||
required_result <<EOF
|
||||
|
||||
>> NODE: 192.168.1.101 <<
|
||||
-n 192.168.1.101 hostname
|
||||
|
||||
>> NODE: 192.168.1.102 <<
|
||||
-n 192.168.1.102 hostname
|
||||
|
||||
>> NODE: 192.168.1.103 <<
|
||||
-n 192.168.1.103 hostname
|
||||
|
||||
>> NODE: 192.168.1.104 <<
|
||||
-n 192.168.1.104 hostname
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
16
ctdb/tests/onnode/0002.sh
Executable file
16
ctdb/tests/onnode/0002.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE -q all hostname"
|
||||
|
||||
define_test "$cmd" "all nodes OK"
|
||||
|
||||
required_result <<EOF
|
||||
-n 192.168.1.101 hostname
|
||||
-n 192.168.1.102 hostname
|
||||
-n 192.168.1.103 hostname
|
||||
-n 192.168.1.104 hostname
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
16
ctdb/tests/onnode/0003.sh
Executable file
16
ctdb/tests/onnode/0003.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE -p all hostname"
|
||||
|
||||
define_test "$cmd" "all nodes OK"
|
||||
|
||||
required_result <<EOF
|
||||
[192.168.1.101] -n 192.168.1.101 hostname
|
||||
[192.168.1.102] -n 192.168.1.102 hostname
|
||||
[192.168.1.103] -n 192.168.1.103 hostname
|
||||
[192.168.1.104] -n 192.168.1.104 hostname
|
||||
EOF
|
||||
|
||||
simple_test -s $cmd
|
16
ctdb/tests/onnode/0004.sh
Executable file
16
ctdb/tests/onnode/0004.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE -pq all hostname"
|
||||
|
||||
define_test "$cmd" "all nodes OK"
|
||||
|
||||
required_result <<EOF
|
||||
-n 192.168.1.101 hostname
|
||||
-n 192.168.1.102 hostname
|
||||
-n 192.168.1.103 hostname
|
||||
-n 192.168.1.104 hostname
|
||||
EOF
|
||||
|
||||
simple_test -s $cmd
|
13
ctdb/tests/onnode/0005.sh
Executable file
13
ctdb/tests/onnode/0005.sh
Executable file
@ -0,0 +1,13 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE 3 hostname"
|
||||
|
||||
define_test "$cmd" "all nodes OK"
|
||||
|
||||
required_result <<EOF
|
||||
-n 192.168.1.104 hostname
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
15
ctdb/tests/onnode/0006.sh
Executable file
15
ctdb/tests/onnode/0006.sh
Executable file
@ -0,0 +1,15 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE -v 3 hostname"
|
||||
|
||||
define_test "$cmd" "all nodes OK"
|
||||
|
||||
required_result <<EOF
|
||||
|
||||
>> NODE: 192.168.1.104 <<
|
||||
-n 192.168.1.104 hostname
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
32
ctdb/tests/onnode/0070.sh
Executable file
32
ctdb/tests/onnode/0070.sh
Executable file
@ -0,0 +1,32 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE ok hostname"
|
||||
|
||||
define_test "$cmd" "all nodes OK"
|
||||
|
||||
ctdb_set_output <<EOF
|
||||
:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:
|
||||
:0:192.168.1.101:0:0:0:0:0:0:
|
||||
:1:192.168.1.102:0:0:0:0:0:0:
|
||||
:2:192.168.1.103:0:0:0:0:0:0:
|
||||
:3:192.168.1.104:0:0:0:0:0:0:
|
||||
EOF
|
||||
|
||||
required_result <<EOF
|
||||
|
||||
>> NODE: 192.168.1.101 <<
|
||||
-n 192.168.1.101 hostname
|
||||
|
||||
>> NODE: 192.168.1.102 <<
|
||||
-n 192.168.1.102 hostname
|
||||
|
||||
>> NODE: 192.168.1.103 <<
|
||||
-n 192.168.1.103 hostname
|
||||
|
||||
>> NODE: 192.168.1.104 <<
|
||||
-n 192.168.1.104 hostname
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
30
ctdb/tests/onnode/0071.sh
Executable file
30
ctdb/tests/onnode/0071.sh
Executable file
@ -0,0 +1,30 @@
|
||||
#!/bin/sh

# onnode test: "ok" nodespec when the 2nd node (192.168.1.102) is reported
# as disconnected by the fake ctdb status output.  Only the three remaining
# nodes are expected to be contacted.

. "${ONNODE_TESTS_DIR}/common.sh"

cmd="$ONNODE ok hostname"

define_test "$cmd" "2nd node disconnected"

# Canned "ctdb -Y status" output; node 1 has the Disconnected column set.
ctdb_set_output <<EOF
:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:
:0:192.168.1.101:0:0:0:0:0:0:
:1:192.168.1.102:1:0:0:0:0:0:
:2:192.168.1.103:0:0:0:0:0:0:
:3:192.168.1.104:0:0:0:0:0:0:
EOF

required_result <<EOF

>> NODE: 192.168.1.101 <<
-n 192.168.1.101 hostname

>> NODE: 192.168.1.103 <<
-n 192.168.1.103 hostname

>> NODE: 192.168.1.104 <<
-n 192.168.1.104 hostname
EOF

simple_test $cmd
|
29
ctdb/tests/onnode/0072.sh
Executable file
29
ctdb/tests/onnode/0072.sh
Executable file
@ -0,0 +1,29 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE ok hostname"
|
||||
|
||||
define_test "$cmd" "2nd node disconnected, extra status columns"
|
||||
|
||||
ctdb_set_output <<EOF
|
||||
:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:X1:X2:X3:X4:
|
||||
:0:192.168.1.101:0:0:0:0:0:0:0:0:0:0:
|
||||
:1:192.168.1.102:1:0:0:0:0:0:0:0:0:0:
|
||||
:2:192.168.1.103:0:0:0:0:0:0:0:0:0:0:
|
||||
:3:192.168.1.104:0:0:0:0:0:0:0:0:0:0:
|
||||
EOF
|
||||
|
||||
required_result <<EOF
|
||||
|
||||
>> NODE: 192.168.1.101 <<
|
||||
-n 192.168.1.101 hostname
|
||||
|
||||
>> NODE: 192.168.1.103 <<
|
||||
-n 192.168.1.103 hostname
|
||||
|
||||
>> NODE: 192.168.1.104 <<
|
||||
-n 192.168.1.104 hostname
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
29
ctdb/tests/onnode/0075.sh
Executable file
29
ctdb/tests/onnode/0075.sh
Executable file
@ -0,0 +1,29 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE con hostname"
|
||||
|
||||
define_test "$cmd" "1st node disconnected"
|
||||
|
||||
ctdb_set_output <<EOF
|
||||
:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:
|
||||
:0:192.168.1.101:1:0:0:0:0:0:
|
||||
:1:192.168.1.102:0:0:0:0:0:0:
|
||||
:2:192.168.1.103:0:0:0:0:0:0:
|
||||
:3:192.168.1.104:0:0:0:0:0:0:
|
||||
EOF
|
||||
|
||||
required_result <<EOF
|
||||
|
||||
>> NODE: 192.168.1.102 <<
|
||||
-n 192.168.1.102 hostname
|
||||
|
||||
>> NODE: 192.168.1.103 <<
|
||||
-n 192.168.1.103 hostname
|
||||
|
||||
>> NODE: 192.168.1.104 <<
|
||||
-n 192.168.1.104 hostname
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
17
ctdb/tests/onnode/0080.sh
Executable file
17
ctdb/tests/onnode/0080.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE recmaster hostname"
|
||||
|
||||
define_test "$cmd" "node 1 (192.168.1.102) is recmaster"
|
||||
|
||||
ctdb_set_output <<EOF
|
||||
1
|
||||
EOF
|
||||
|
||||
required_result <<EOF
|
||||
-n 192.168.1.102 hostname
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
17
ctdb/tests/onnode/0081.sh
Executable file
17
ctdb/tests/onnode/0081.sh
Executable file
@ -0,0 +1,17 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE lvsmaster hostname"
|
||||
|
||||
define_test "$cmd" "no lvsmaster"
|
||||
|
||||
ctdb_set_output 255 <<EOF
|
||||
There is no LVS master
|
||||
EOF
|
||||
|
||||
required_result 1 <<EOF
|
||||
onnode: No lvsmaster available
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
21
ctdb/tests/onnode/0090.sh
Executable file
21
ctdb/tests/onnode/0090.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE natgw hostname"
|
||||
|
||||
define_test "$cmd" "no natgw"
|
||||
|
||||
ctdb_set_output <<EOF
|
||||
-1 0.0.0.0
|
||||
:0:192.168.1.101:0:0:0:0:0:
|
||||
:1:192.168.1.102:0:0:0:0:0:
|
||||
:2:192.168.1.103:0:0:0:0:0:
|
||||
:3:192.168.1.104:0:0:0:0:0:
|
||||
EOF
|
||||
|
||||
required_result 1 <<EOF
|
||||
onnode: No natgwlist available
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
21
ctdb/tests/onnode/0091.sh
Executable file
21
ctdb/tests/onnode/0091.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/bin/sh
|
||||
|
||||
. "${ONNODE_TESTS_DIR}/common.sh"
|
||||
|
||||
cmd="$ONNODE natgw hostname"
|
||||
|
||||
define_test "$cmd" "node 2 (192.168.1.103) is natgw"
|
||||
|
||||
ctdb_set_output <<EOF
|
||||
2 192.168.1.103
|
||||
:0:192.168.1.101:0:0:0:0:0:
|
||||
:1:192.168.1.102:0:0:0:0:0:
|
||||
:2:192.168.1.103:0:0:0:0:0:
|
||||
:3:192.168.1.104:0:0:0:0:0:
|
||||
EOF
|
||||
|
||||
required_result <<EOF
|
||||
-n 192.168.1.103 hostname
|
||||
EOF
|
||||
|
||||
simple_test $cmd
|
38
ctdb/tests/onnode/README
Normal file
38
ctdb/tests/onnode/README
Normal file
@ -0,0 +1,38 @@
|
||||
onnode unit tests
|
||||
=================
|
||||
|
||||
Examples:
|
||||
|
||||
* ./run_tests.sh
|
||||
|
||||
Run all tests, displaying output.
|
||||
|
||||
* ./run_tests.sh -s
|
||||
|
||||
Run all tests, displaying output and a summary.
|
||||
|
||||
* ./run_tests.sh -sq
|
||||
|
||||
Run all tests, displaying only a summary.
|
||||
|
||||
* ONNODE=onnode-buggy-001 ./run_tests.sh -s
|
||||
|
||||
Run against stubs/onnode-buggy-001 instead of default onnode version.
|
||||
|
||||
Add more buggy versions of onnode to this directory as bugs are
|
||||
fixed to enable test validation using this feature.
|
||||
|
||||
* ./run_tests.sh ./009*.sh
|
||||
|
||||
Run only the specified tests.
|
||||
|
||||
* ONNODE="bash -x stubs/onnode-buggy-001" ./run_tests.sh ./0090.sh
|
||||
ONNODE="bash -x ../../tools/onnode" ./run_tests.sh ./0090.sh
|
||||
|
||||
Debug the specified test or test failure. The test will fail
|
||||
because the bash trace output will be included in the test output.
|
||||
However, this at least makes it easy to trace onnode while running
|
||||
the test...
|
||||
|
||||
To see if the test passes, the -x can be dropped... so command-line
|
||||
editing can be kept to a minimum.
|
103
ctdb/tests/onnode/common.sh
Normal file
103
ctdb/tests/onnode/common.sh
Normal file
@ -0,0 +1,103 @@
|
||||
# Hey Emacs, this is a -*- shell-script -*- !!! :-)
|
||||
|
||||
# Set indirectly by run_tests at top level.
|
||||
unset CTDB_NODES_SOCKETS
|
||||
|
||||
# Default to just "onnode".
|
||||
: ${ONNODE:=onnode}
|
||||
|
||||
# Augment PATH with relevant stubs/ directories.
|
||||
|
||||
if [ -d "${ONNODE_TESTS_DIR}/stubs" ] ; then
|
||||
PATH="${ONNODE_TESTS_DIR}/stubs:$PATH"
|
||||
fi
|
||||
|
||||
export ONNODE_TESTCASE_DIR=$(dirname "$0")
|
||||
if [ $(basename "$ONNODE_TESTCASE_DIR") = "onnode" ] ; then
|
||||
# Just a test script, no testcase subdirectory.
|
||||
ONNODE_TESTCASE_DIR="$ONNODE_TESTS_DIR"
|
||||
else
|
||||
if [ -d "${ONNODE_TESTCASE_DIR}/stubs" ] ; then
|
||||
PATH="${ONNODE_TESTCASE_DIR}/stubs:$PATH"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Find CTDB nodes file.
|
||||
if [ -z "$CTDB_NODES_FILE" ] ; then
|
||||
if [ -r "${ONNODE_TESTCASE_DIR}/nodes" ] ; then
|
||||
CTDB_NODES_FILE="${ONNODE_TESTCASE_DIR}/nodes"
|
||||
elif [ -r "${ONNODE_TESTS_DIR}/nodes" ] ; then
|
||||
CTDB_NODES_FILE="${ONNODE_TESTS_DIR}/nodes"
|
||||
else
|
||||
CTDB_NODES_FILE="${CTDB_BASE:-/etc/ctdb}/nodes"
|
||||
fi
|
||||
fi
|
||||
|
||||
export CTDB_NODES_FILE
|
||||
|
||||
export ONNODE_TESTS_VAR_DIR="${ONNODE_TESTS_DIR}/var"
|
||||
mkdir -p "$ONNODE_TESTS_VAR_DIR"
|
||||
|
||||
if [ -z "$CTDB_BASE" ] ; then
|
||||
export CTDB_BASE=$(dirname "$CTDB_NODES_FILE")
|
||||
fi
|
||||
|
||||
define_test ()
|
||||
{
|
||||
_f="$0"
|
||||
_f="${_f#./}" # strip leading ./
|
||||
_f="${_f%%/*}" # if subdir, strip off file
|
||||
_f="${_f%.sh}" # strip off .sh suffix if any
|
||||
|
||||
echo "$_f $1 - $2"
|
||||
}
|
||||
|
||||
# Set output for ctdb command. Optional 1st argument is return code.
|
||||
ctdb_set_output ()
{
    # Standard input is stored as the output the fake ctdb stub prints.
    _out="$ONNODE_TESTS_VAR_DIR/ctdb.out"
    cat >"$_out"

    # The exit code the stub returns; defaults to 0 when no argument given.
    _rc="$ONNODE_TESTS_VAR_DIR/ctdb.rc"
    echo "${1:-0}" >"$_rc"

    # Remove both files when the calling shell exits.  The paths are
    # quoted inside the trap string so that a tests directory containing
    # whitespace is not word-split by rm.
    trap "rm -f '$_out' '$_rc'" 0
}
|
||||
|
||||
required_result ()
|
||||
{
|
||||
required_rc="${1:-0}"
|
||||
required_output=$(cat)
|
||||
}
|
||||
|
||||
simple_test ()
{
    # Run the given command, capturing stdout and stderr together, and
    # compare the output and exit status with $required_output and
    # $required_rc.  A leading "-s" argument sorts the output before the
    # comparison.  Prints "PASSED" on a match; otherwise prints a report
    # of required versus actual results and returns 1.
    _sort="cat"
    if [ "$1" = "-s" ] ; then
        shift
        _sort="sort"
    fi
    _out=$("$@" 2>&1)
    _rc=$?
    _out=$(echo "$_out" | $_sort )

    # Two quoted tests instead of a single "[ ... -a ... ]": the -a form is
    # obsolescent and ambiguous for some operand values, and the unquoted
    # status operands turned an empty $required_rc into a syntax error.
    if [ "$_out" = "$required_output" ] && [ "$_rc" = "$required_rc" ] ; then
        echo "PASSED"
    else
        cat <<EOF
CTDB_NODES_FILE="${CTDB_NODES_FILE}"
CTDB_BASE="$CTDB_BASE"
$(which ctdb)

##################################################
Required output (Exit status: ${required_rc}):
##################################################
$required_output
##################################################
Actual output (Exit status: ${_rc}):
##################################################
$_out
EOF
        return 1
    fi
}
|
4
ctdb/tests/onnode/nodes
Normal file
4
ctdb/tests/onnode/nodes
Normal file
@ -0,0 +1,4 @@
|
||||
192.168.1.101
|
||||
192.168.1.102
|
||||
192.168.1.103
|
||||
192.168.1.104
|
31
ctdb/tests/onnode/run_tests.sh
Executable file
31
ctdb/tests/onnode/run_tests.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/bin/sh

# Run some onnode unit tests.

# Everything below assumes the current directory is the one containing
# this script, so quote the path (it may contain whitespace) and give up
# if the cd fails.
cd "$(dirname "$0")" || exit 1
ONNODE_TESTS_DIR=$(pwd)
export ONNODE_TESTS_DIR

test_dir=$(dirname "$ONNODE_TESTS_DIR")

# Options handed to the top-level run_tests script.  Leading "-*"
# arguments from our own command line are appended to them.
opts="-d"

for i ; do
    case "$i" in
        -*)
            opts="$opts $i"
            shift
            ;;
        *)
            break
            ;;
    esac
done

# With no tests named on the command line, run every NNNN.sh script and
# every NNNN/run_test.sh found in this directory.
tests=""
if [ -z "$*" ] ; then
    tests=$(ls ./[0-9][0-9][0-9][0-9].sh ./[0-9][0-9][0-9][0-9]/run_test.sh 2>/dev/null)
fi

"$test_dir/scripts/run_tests" $opts "$@" $tests || exit 1

echo "All OK"
exit 0
|
33
ctdb/tests/onnode/stubs/ctdb
Executable file
33
ctdb/tests/onnode/stubs/ctdb
Executable file
@ -0,0 +1,33 @@
|
||||
#!/bin/sh
|
||||
|
||||
# Fake ctdb client for onnode tests.
|
||||
|
||||
cmd=$(echo "$*" | sed -r -e 's@[[:space:]]+@_@g')
|
||||
|
||||
out="${ONNODE_TESTS_VAR_DIR}/ctdb.out"
|
||||
if [ -r "$out" ] ; then
|
||||
cat "$out"
|
||||
|
||||
rc="${ONNODE_TESTS_VAR_DIR}/ctdb.rc"
|
||||
if [ -r "$rc" ] ; then
|
||||
exit $(cat "$rc")
|
||||
fi
|
||||
|
||||
exit 0
|
||||
fi
|
||||
|
||||
f="${ONNODE_TESTCASE_DIR}/ctdb.d/${cmd}.sh"
|
||||
if [ -x "$f" ] ; then
|
||||
"$f"
|
||||
exit $?
|
||||
fi
|
||||
|
||||
f="${ONNODE_TESTCASE_DIR}/ctdb.d/${cmd}.out"
|
||||
if [ -r "$f" ] ; then
|
||||
cat "$f"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "fake ctdb: no implementation for \"$*\""
|
||||
|
||||
exit 1
|
376
ctdb/tests/onnode/stubs/onnode-buggy-001
Executable file
376
ctdb/tests/onnode/stubs/onnode-buggy-001
Executable file
@ -0,0 +1,376 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Run commands on CTDB nodes.
|
||||
|
||||
# See http://ctdb.samba.org/ for more information about CTDB.
|
||||
|
||||
# Copyright (C) Martin Schwenke 2008
|
||||
|
||||
# Based on an earlier script by Andrew Tridgell and Ronnie Sahlberg.
|
||||
|
||||
# Copyright (C) Andrew Tridgell 2007
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
prog=$(basename $0)
|
||||
|
||||
usage ()
|
||||
{
|
||||
cat >&2 <<EOF
|
||||
Usage: onnode [OPTION] ... <NODES> <COMMAND> ...
|
||||
options:
|
||||
-c Run in current working directory on specified nodes.
|
||||
-o <prefix> Save standard output from each node to file <prefix>.<ip>
|
||||
-p Run command in parallel on specified nodes.
|
||||
-q Do not print node addresses (overrides -v).
|
||||
-n Allow nodes to be specified by name.
|
||||
-f Specify nodes file, overrides CTDB_NODES_FILE.
|
||||
-v Print node address even for a single node.
|
||||
<NODES> "all", "any", "ok" (or "healthy"), "con" (or "connected"),
|
||||
"rm" (or "recmaster"), "lvs" (or "lvsmaster"),
|
||||
"natgw" (or "natgwlist"); or
|
||||
a node number (0 base); or
|
||||
a hostname (if -n is specified); or
|
||||
list (comma separated) of <NODES>; or
|
||||
range (hyphen separated) of node numbers.
|
||||
EOF
|
||||
exit 1
|
||||
|
||||
}
|
||||
|
||||
invalid_nodespec ()
|
||||
{
|
||||
echo "Invalid <nodespec>" >&2 ; echo >&2
|
||||
usage
|
||||
}
|
||||
|
||||
# Defaults.
|
||||
current=false
|
||||
parallel=false
|
||||
verbose=false
|
||||
quiet=false
|
||||
prefix=""
|
||||
names_ok=false
|
||||
|
||||
ctdb_base="${CTDB_BASE:-/etc/ctdb}"
|
||||
|
||||
parse_options ()
|
||||
{
|
||||
# $POSIXLY_CORRECT means that the command passed to onnode can
|
||||
# take options and getopt won't reorder things to make them
|
||||
# options to onnode.
|
||||
local temp
|
||||
# Not on the previous line - local returns 0!
|
||||
temp=$(POSIXLY_CORRECT=1 getopt -n "$prog" -o "cf:hno:pqv" -l help -- "$@")
|
||||
|
||||
[ $? != 0 ] && usage
|
||||
|
||||
eval set -- "$temp"
|
||||
|
||||
while true ; do
|
||||
case "$1" in
|
||||
-c) current=true ; shift ;;
|
||||
-f) CTDB_NODES_FILE="$2" ; shift 2 ;;
|
||||
-n) names_ok=true ; shift ;;
|
||||
-o) prefix="$2" ; shift 2 ;;
|
||||
-p) parallel=true ; shift ;;
|
||||
-q) quiet=true ; shift ;;
|
||||
-v) verbose=true ; shift ;;
|
||||
--) shift ; break ;;
|
||||
-h|--help|*) usage ;; # Shouldn't happen, so this is reasonable.
|
||||
esac
|
||||
done
|
||||
|
||||
[ $# -lt 2 ] && usage
|
||||
|
||||
nodespec="$1" ; shift
|
||||
command="$@"
|
||||
}
|
||||
|
||||
echo_nth ()
|
||||
{
|
||||
local n="$1" ; shift
|
||||
|
||||
shift $n
|
||||
local node="$1"
|
||||
|
||||
if [ -n "$node" -a "$node" != "#DEAD" ] ; then
|
||||
echo $node
|
||||
else
|
||||
echo "${prog}: \"node ${n}\" does not exist" >&2
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
parse_nodespec ()
|
||||
{
|
||||
# Subshell avoids hacks to restore $IFS.
|
||||
(
|
||||
IFS=","
|
||||
for i in $1 ; do
|
||||
case "$i" in
|
||||
*-*) seq "${i%-*}" "${i#*-}" 2>/dev/null || invalid_nodespec ;;
|
||||
# Separate lines for readability.
|
||||
all|any|ok|healthy|con|connected) echo "$i" ;;
|
||||
rm|recmaster|lvs|lvsmaster|natgw|natgwlist) echo "$i" ;;
|
||||
*)
|
||||
[ $i -gt -1 ] 2>/dev/null || $names_ok || invalid_nodespec
|
||||
echo $i
|
||||
esac
|
||||
done
|
||||
)
|
||||
}
|
||||
|
||||
ctdb_status_output="" # cache
|
||||
get_nodes_with_status ()
|
||||
{
|
||||
local all_nodes="$1"
|
||||
local status="$2"
|
||||
|
||||
local bits
|
||||
case "$status" in
|
||||
healthy)
|
||||
bits="0:0:0:0:0:0"
|
||||
;;
|
||||
connected)
|
||||
bits="0:[0-1]:[0-1]:[0-1]:[0-1]:[0-1]"
|
||||
;;
|
||||
*)
|
||||
invalid_nodespec
|
||||
esac
|
||||
|
||||
if [ -z "$ctdb_status_output" ] ; then
|
||||
# FIXME: need to do something if $CTDB_NODES_SOCKETS is set.
|
||||
ctdb_status_output=$(ctdb -Y status 2>/dev/null)
|
||||
if [ $? -ne 0 ] ; then
|
||||
echo "${prog}: unable to get status of CTDB nodes" >&2
|
||||
exit 1
|
||||
fi
|
||||
ctdb_status_output="${ctdb_status_output#* }"
|
||||
fi
|
||||
|
||||
local nodes=""
|
||||
local i
|
||||
for i in $ctdb_status_output ; do
|
||||
# Try removing bits from end.
|
||||
local t="${i%:${bits}:}"
|
||||
if [ "$t" != "$i" ] ; then
|
||||
# Succeeded. Get address. NOTE: this is an optimisation.
|
||||
# It might be better to get the node number and then get
|
||||
# the nth node to get the address. This would make things
|
||||
# more consistent if $ctdb_base/nodes actually contained
|
||||
# hostnames.
|
||||
nodes="${nodes} ${t#:*:}"
|
||||
fi
|
||||
done
|
||||
|
||||
echo $nodes
|
||||
}
|
||||
|
||||
ctdb_props="" # cache
|
||||
get_node_with_property ()
|
||||
{
|
||||
local all_nodes="$1"
|
||||
local prop="$2"
|
||||
|
||||
local prop_node=""
|
||||
if [ "${ctdb_props##:${prop}:}" = "$ctdb_props" ] ; then
|
||||
prop_node=$(ctdb "$prop" -Y 2>/dev/null)
|
||||
# We only want the first line.
|
||||
local nl="
|
||||
"
|
||||
prop_node="${prop_node%%${nl}*}"
|
||||
if [ $? -eq 0 ] ; then
|
||||
ctdb_props="${ctdb_props}${ctdb_props:+ }:${prop}:${prop_node}"
|
||||
else
|
||||
prop_node=""
|
||||
fi
|
||||
else
|
||||
prop_node="${ctdb_props##:${prop}:}"
|
||||
prop_node="${prop_node%% *}"
|
||||
fi
|
||||
if [ -n "$prop_node" ] ; then
|
||||
echo_nth "$prop_node" $all_nodes
|
||||
else
|
||||
echo "${prog}: No ${prop} available" >&2
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
get_any_available_node ()
|
||||
{
|
||||
local all_nodes="$1"
|
||||
|
||||
# We do a recursive onnode to find which nodes are up and running.
|
||||
local out=$($0 -pq all ctdb pnn 2>&1)
|
||||
local line
|
||||
while read line ; do
|
||||
local pnn="${line#PNN:}"
|
||||
if [ "$pnn" != "$line" ] ; then
|
||||
echo_nth "$pnn" $all_nodes
|
||||
return 0
|
||||
fi
|
||||
# Else must be an error message from a down node.
|
||||
done <<<"$out"
|
||||
return 1
|
||||
}
|
||||
|
||||
get_nodes ()
|
||||
{
|
||||
local all_nodes
|
||||
|
||||
if [ -n "$CTDB_NODES_SOCKETS" ] ; then
|
||||
all_nodes="$CTDB_NODES_SOCKETS"
|
||||
else
|
||||
local f="${ctdb_base}/nodes"
|
||||
if [ -n "$CTDB_NODES_FILE" ] ; then
|
||||
f="$CTDB_NODES_FILE"
|
||||
if [ ! -e "$f" -a "${f#/}" = "$f" ] ; then
|
||||
# $f is relative, try in $ctdb_base
|
||||
f="${ctdb_base}/${f}"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ ! -r "$f" ] ; then
|
||||
echo "${prog}: unable to open nodes file \"${f}\"" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
all_nodes=$(sed -e 's@#.*@@g' -e 's@ *@@g' -e 's@^$@#DEAD@' "$f")
|
||||
fi
|
||||
|
||||
local nodes=""
|
||||
local n
|
||||
for n in $(parse_nodespec "$1") ; do
|
||||
[ $? != 0 ] && exit 1 # Required to catch exit in above subshell.
|
||||
case "$n" in
|
||||
all)
|
||||
echo "${all_nodes//#DEAD/}"
|
||||
;;
|
||||
any)
|
||||
get_any_available_node "$all_nodes" || exit 1
|
||||
;;
|
||||
ok|healthy)
|
||||
get_nodes_with_status "$all_nodes" "healthy" || exit 1
|
||||
;;
|
||||
con|connected)
|
||||
get_nodes_with_status "$all_nodes" "connected" || exit 1
|
||||
;;
|
||||
rm|recmaster)
|
||||
get_node_with_property "$all_nodes" "recmaster" || exit 1
|
||||
;;
|
||||
lvs|lvsmaster)
|
||||
get_node_with_property "$all_nodes" "lvsmaster" || exit 1
|
||||
;;
|
||||
natgw|natgwlist)
|
||||
get_node_with_property "$all_nodes" "natgwlist" || exit 1
|
||||
;;
|
||||
[0-9]|[0-9][0-9]|[0-9][0-9][0-9])
|
||||
echo_nth $n $all_nodes
|
||||
;;
|
||||
*)
|
||||
$names_ok || invalid_nodespec
|
||||
echo $n
|
||||
esac
|
||||
done
|
||||
}
|
||||
|
||||
fakessh ()
|
||||
{
|
||||
CTDB_SOCKET="$1" sh -c "$2" 3>/dev/null
|
||||
}
|
||||
|
||||
stdout_filter ()
|
||||
{
|
||||
if [ -n "$prefix" ] ; then
|
||||
cat >"${prefix}.${n//\//_}"
|
||||
elif $verbose && $parallel ; then
|
||||
sed -e "s@^@[$n] @"
|
||||
else
|
||||
cat
|
||||
fi
|
||||
}
|
||||
|
||||
stderr_filter ()
|
||||
{
|
||||
if $verbose && $parallel ; then
|
||||
sed -e "s@^@[$n] @"
|
||||
else
|
||||
cat
|
||||
fi
|
||||
}
|
||||
|
||||
######################################################################
|
||||
|
||||
parse_options "$@"
|
||||
|
||||
$current && command="cd $PWD && $command"
|
||||
|
||||
ssh_opts=
|
||||
if [ -n "$CTDB_NODES_SOCKETS" ] ; then
|
||||
SSH=fakessh
|
||||
else
|
||||
# Could "2>/dev/null || true" but want to see errors from typos in file.
|
||||
[ -r "${ctdb_base}/onnode.conf" ] && . "${ctdb_base}/onnode.conf"
|
||||
[ -n "$SSH" ] || SSH=ssh
|
||||
if [ "$SSH" = "ssh" ] ; then
|
||||
ssh_opts="-n"
|
||||
else
|
||||
: # rsh? All bets are off!
|
||||
fi
|
||||
fi
|
||||
|
||||
######################################################################
|
||||
|
||||
nodes=$(get_nodes "$nodespec")
|
||||
[ $? != 0 ] && exit 1 # Required to catch exit in above subshell.
|
||||
|
||||
if $quiet ; then
|
||||
verbose=false
|
||||
else
|
||||
# If $nodes contains a space or a newline then assume multiple nodes.
|
||||
nl="
|
||||
"
|
||||
[ "$nodes" != "${nodes%[ ${nl}]*}" ] && verbose=true
|
||||
fi
|
||||
|
||||
pids=""
|
||||
trap 'kill -TERM $pids 2>/dev/null' INT TERM
|
||||
# There's a small race here where the kill can fail if no processes
|
||||
# have been added to $pids and the script is interrupted. However,
|
||||
# the part of the window where it matters is very small.
|
||||
retcode=0
|
||||
for n in $nodes ; do
|
||||
set -o pipefail 2>/dev/null
|
||||
if $parallel ; then
|
||||
{ exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS $n "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; } &
|
||||
pids="${pids} $!"
|
||||
else
|
||||
if $verbose ; then
|
||||
echo >&2 ; echo ">> NODE: $n <<" >&2
|
||||
fi
|
||||
|
||||
{ exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS $n "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; }
|
||||
[ $? = 0 ] || retcode=$?
|
||||
fi
|
||||
done
|
||||
|
||||
$parallel && {
|
||||
for p in $pids; do
|
||||
wait $p
|
||||
[ $? = 0 ] || retcode=$?
|
||||
done
|
||||
}
|
||||
|
||||
exit $retcode
|
2
ctdb/tests/onnode/stubs/ssh
Executable file
2
ctdb/tests/onnode/stubs/ssh
Executable file
@ -0,0 +1,2 @@
|
||||
#!/bin/sh
|
||||
echo "$*"
|
@ -53,6 +53,54 @@ test_exit ()
|
||||
exit $(($testfailures+0))
|
||||
}
|
||||
|
||||
# Look for forward time jumps in the per-node time logs and, if any are
# found, print time-sync and load diagnostics.
#
# Reads the last 20 lines of /var/log/ctdb.test.time.log from every
# node via "onnode all".  Each data line is expected to hold two
# space-separated numeric fields.  The first field and the first digit
# of the second are joined into one number; the "ds" in the report and
# the date(1) conversions below treat that as a deciseconds timestamp.
# A jump is reported when consecutive samples on the same node differ
# by at least $threshold.
ctdb_check_time_logs ()
{
    local threshold=20

    local jump=false
    local ds_prev=""
    local ds_curr=""
    local node=""
    local out
    local line

    # Declare "out" separately so the status tested here is that of
    # onnode rather than that of "local".
    if out=$(onnode all tail -n 20 /var/log/ctdb.test.time.log 2>&1) ; then
        while read line ; do
            case "$line" in
                \>\>\ NODE:\ *\ \<\<)
                    # Node header line: extract the node and start a
                    # fresh sequence of samples.
                    node="${line#>> NODE: }"
                    node=${node% <<*}
                    ds_prev=""
                    ;;
                *\ *)
                    set -- $line
                    ds_curr="$1${2:0:1}"
                    # stderr is captured too, so skip anything that is
                    # not purely numeric instead of feeding it to $(( )).
                    case "$ds_curr" in
                        ""|*[!0-9]*) continue ;;
                    esac
                    if [ -n "$ds_prev" ] && \
                        [ $(($ds_curr - $ds_prev)) -ge $threshold ] ; then
                        # ${var%?} drops the last digit to get seconds.
                        echo "Node $node had time jump of $(($ds_curr - $ds_prev))ds between $(date +'%T' -d @${ds_prev%?}) and $(date +'%T' -d @${ds_curr%?})"
                        jump=true
                    fi
                    ds_prev="$ds_curr"
                    ;;
            esac
        done <<<"$out"
    else
        echo Error getting time logs
    fi
    if $jump ; then
        echo "Check time sync (test client first):"
        date
        onnode -p all date
        echo "Information from test client:"
        hostname
        top -b -n 1
        echo "Information from cluster nodes:"
        onnode all "top -b -n 1 ; echo '/proc/slabinfo' ; cat /proc/slabinfo"
    fi
}
|
||||
|
||||
ctdb_test_exit ()
|
||||
{
|
||||
local status=$?
|
||||
@ -68,6 +116,10 @@ ctdb_test_exit ()
|
||||
|
||||
echo "*** TEST COMPLETED (RC=$status) AT $(date '+%F %T'), CLEANING UP..."
|
||||
|
||||
if [ -n "$CTDB_TEST_REAL_CLUSTER" -a $status -ne 0 ] ; then
|
||||
ctdb_check_time_logs
|
||||
fi
|
||||
|
||||
eval "$ctdb_test_exit_hook" || true
|
||||
unset ctdb_test_exit_hook
|
||||
|
||||
@ -856,6 +908,8 @@ restart_ctdb ()
|
||||
|
||||
onnode -q 1 $CTDB_TEST_WRAPPER wait_until_healthy || return 1
|
||||
|
||||
local debug_out=$(onnode -p all ctdb status -Y 2>&1; onnode -p all ctdb scriptstatus 2>&1)
|
||||
|
||||
echo "Setting RerecoveryTimeout to 1"
|
||||
onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
|
||||
|
||||
@ -869,6 +923,13 @@ restart_ctdb ()
|
||||
onnode -q 0 $CTDB recover
|
||||
|
||||
echo "ctdb is ready"
|
||||
|
||||
if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
|
||||
echo "OUCH! Cluster is UNHEALTHY again..."
|
||||
echo "$debug_out"
|
||||
# Try to make the calling test fail
|
||||
status=1
|
||||
fi
|
||||
}
|
||||
|
||||
ctdb_restart_when_done ()
|
||||
|
@ -18,8 +18,10 @@ EOF
|
||||
######################################################################
|
||||
|
||||
with_summary=false
|
||||
with_desc=false
|
||||
quiet=false
|
||||
|
||||
temp=$(getopt -n "$prog" -o "xhs" -l help -- "$@")
|
||||
temp=$(getopt -n "$prog" -o "xdhqs" -l help -- "$@")
|
||||
|
||||
[ $? != 0 ] && usage
|
||||
|
||||
@ -28,12 +30,20 @@ eval set -- "$temp"
|
||||
while true ; do
|
||||
case "$1" in
|
||||
-x) set -x; shift ;;
|
||||
-d) with_desc=true ; shift ;; # 4th line of output is description
|
||||
-q) quiet=true ; shift ;;
|
||||
-s) with_summary=true ; shift ;;
|
||||
--) shift ; break ;;
|
||||
*) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
if $quiet ; then
|
||||
show_progress() { cat >/dev/null ; }
|
||||
else
|
||||
show_progress() { cat ; }
|
||||
fi
|
||||
|
||||
######################################################################
|
||||
|
||||
tests_total=0
|
||||
@ -43,22 +53,40 @@ summary=""
|
||||
rows=$(if tty -s ; then stty size ; else echo x 80 ; fi | sed -e 's@.* @@' -e 's@^0$@80@')
|
||||
ww=$((rows - 7))
|
||||
|
||||
tf=$(mktemp)
|
||||
sf=$(mktemp)
|
||||
|
||||
set -o pipefail
|
||||
|
||||
for f; do
|
||||
[ -x $f ] || fail "test \"$f\" is not executable"
|
||||
tests_total=$(($tests_total + 1))
|
||||
if ctdb_test_run "$f" ; then
|
||||
tests_passed=$(($tests_passed + 1))
|
||||
t="PASSED"
|
||||
else
|
||||
t="FAILED"
|
||||
ctdb_test_run "$f" | tee "$tf" | show_progress
|
||||
status=$?
|
||||
if $with_summary ; then
|
||||
if [ $status -eq 0 ] ; then
|
||||
tests_passed=$(($tests_passed + 1))
|
||||
t=" PASSED "
|
||||
else
|
||||
t="*FAILED*"
|
||||
fi
|
||||
if $with_desc ; then
|
||||
desc=$(tail -n +4 $tf | head -n 1)
|
||||
f="$desc"
|
||||
fi
|
||||
echo "$t $f" >>"$sf"
|
||||
fi
|
||||
summary=$(printf "%s\n%-${ww}s%s" "$summary" "$f" "$t")
|
||||
done
|
||||
|
||||
rm -f "$tf"
|
||||
|
||||
if $with_summary ; then
|
||||
echo "$summary"
|
||||
echo
|
||||
cat "$sf"
|
||||
echo
|
||||
echo "${tests_passed}/${tests_total} tests passed"
|
||||
fi
|
||||
|
||||
rm -f "$sf"
|
||||
|
||||
test_exit
|
||||
|
@ -27,3 +27,12 @@ ctdb_test_init "$@"
|
||||
|
||||
echo "Checking connectivity between nodes..."
|
||||
onnode all onnode all true
|
||||
|
||||
# We're seeing some weirdness with CTDB controls timing out. We're
|
||||
# wondering if time is jumping forward, so this creates a time log on
|
||||
# each node that we can examine later if tests fail weirdly.
|
||||
if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
|
||||
echo "Starting time logging on each node..."
|
||||
f="/var/log/ctdb.test.time.log"
|
||||
onnode -p all "[ -f $f ] || while : ; do date '+%s %N' ; sleep 1 ; done >$f 2>&1 </dev/null &" &
|
||||
fi
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
# ctdb ip takeover code
|
||||
|
||||
# Copyright (C) Martin Schwenke 2010
|
||||
# Copyright (C) Martin Schwenke, Ronnie Sahlberg 2010, 2011
|
||||
|
||||
# Based on original CTDB C code:
|
||||
#
|
||||
@ -29,6 +29,11 @@ import sys
|
||||
from optparse import OptionParser
|
||||
import copy
|
||||
import random
|
||||
import itertools
|
||||
|
||||
# For parsing IP addresses
|
||||
import socket
|
||||
import struct
|
||||
|
||||
options = None
|
||||
|
||||
@ -44,6 +49,9 @@ def process_args(extra_options=[]):
|
||||
parser.add_option("--ni",
|
||||
action="store_true", dest="no_ip_failback", default=False,
|
||||
help="turn on no_ip_failback")
|
||||
parser.add_option("-L", "--lcp2",
|
||||
action="store_true", dest="lcp2", default=False,
|
||||
help="use LCP2 IP rebalancing algorithm [default: %default]")
|
||||
parser.add_option("-b", "--balance",
|
||||
action="store_true", dest="balance", default=False,
|
||||
help="show (im)balance information after each event")
|
||||
@ -54,14 +62,11 @@ def process_args(extra_options=[]):
|
||||
action="store_false", dest="show", default=True,
|
||||
help="don't show IP address layout after each event")
|
||||
parser.add_option("-v", "--verbose",
|
||||
action="store_true", dest="verbose", default=False,
|
||||
action="count", dest="verbose", default=0,
|
||||
help="print information and actions taken to stdout")
|
||||
parser.add_option("--hack",
|
||||
action="store", type="int", dest="hack", default=0,
|
||||
help="apply a hack (see the code!!!)")
|
||||
parser.add_option("-r", "--retries",
|
||||
action="store", type="int", dest="retries", default=5,
|
||||
help="number of retry loops for rebalancing [default: %default]")
|
||||
help="number of retry loops for rebalancing non-deterministic failback [default: %default]")
|
||||
parser.add_option("-i", "--iterations",
|
||||
action="store", type="int", dest="iterations",
|
||||
default=1000,
|
||||
@ -69,6 +74,9 @@ def process_args(extra_options=[]):
|
||||
parser.add_option("-o", "--odds",
|
||||
action="store", type="int", dest="odds", default=4,
|
||||
help="make the chances of a failover 1 in ODDS [default: %default]")
|
||||
parser.add_option("-A", "--aggressive",
|
||||
action="store_true", dest="aggressive", default=False,
|
||||
help="apply ODDS to try to flip each node [default: %default]")
|
||||
|
||||
def seed_callback(option, opt, value, parser):
|
||||
random.seed(value)
|
||||
@ -78,47 +86,165 @@ def process_args(extra_options=[]):
|
||||
|
||||
parser.add_option("-x", "--exit",
|
||||
action="store_true", dest="exit", default=False,
|
||||
help="exit on the 1st gratuitous IP move")
|
||||
|
||||
help="exit on the 1st gratuitous IP move or IP imbalance")
|
||||
parser.add_option("-H", "--hard-imbalance-limit",
|
||||
action="store", type="int", dest="hard_limit", default=1,
|
||||
help="exceeding this limit causes termination [default: %default]")
|
||||
parser.add_option("-S", "--soft-imbalance-limit",
|
||||
action="store", type="int", dest="soft_limit", default=1,
|
||||
help="exceeding this limit increments a counter [default: %default]")
|
||||
|
||||
(options, args) = parser.parse_args()
|
||||
|
||||
if len(args) != 0:
|
||||
parser.error("too many argumentss")
|
||||
|
||||
def print_begin(t):
|
||||
print "=" * 40
|
||||
def print_begin(t, delim='='):
|
||||
print delim * 40
|
||||
print "%s:" % (t)
|
||||
|
||||
def print_end():
|
||||
print "-" * 40
|
||||
|
||||
def verbose_begin(t):
|
||||
if options.verbose:
|
||||
if options.verbose > 0:
|
||||
print_begin(t)
|
||||
|
||||
def verbose_end():
|
||||
if options.verbose:
|
||||
if options.verbose > 0:
|
||||
print_end()
|
||||
|
||||
def verbose_print(t):
|
||||
if options.verbose:
|
||||
if options.verbose > 0:
|
||||
if not type(t) == list:
|
||||
t = [t]
|
||||
if t != []:
|
||||
print "\n".join([str(i) for i in t])
|
||||
|
||||
# more than this and we switch to the logging module... :-)
|
||||
def debug_begin(t):
    """Print a '-'-delimited section header for t, but only when
    options.verbose is greater than 1."""
    if options.verbose <= 1:
        return
    print_begin(t, '-')
|
||||
|
||||
def debug_end():
    """Print the section footer, but only when options.verbose is
    greater than 1."""
    if options.verbose <= 1:
        return
    print_end()
|
||||
|
||||
def debug_print(t):
|
||||
if options.verbose > 1:
|
||||
if not type(t) == list:
|
||||
t = [t]
|
||||
if t != []:
|
||||
print "\n".join([str(i) for i in t])
|
||||
|
||||
def ip_to_list_of_ints(ip):
    """Convert an IP address string into a list of 16 octets (ints).

    IPv6 addresses are converted directly.  Anything that does not
    parse as IPv6 is parsed as IPv4 and padded with 12 leading zero
    octets so that both families have the same length.  An error from
    the IPv4 parse propagates to the caller."""

    # Be lazy... but only expose errors in IPv4 addresses, since
    # they'll be more commonly used.  :-)
    try:
        packed = socket.inet_pton(socket.AF_INET6, ip)
    except (socket.error, ValueError):
        # Pad with leading 0s.  This makes IPv4 addresses comparable
        # with IPv6 but reduces the overall effectiveness of the
        # algorithm.  The alternative would be to treat these
        # addresses separately while trying to keep all the IPs in
        # overall balance.
        packed = b"\0" * 12 + socket.inet_pton(socket.AF_INET, ip)

    # bytearray yields ints whether the packed value is str or bytes.
    return list(bytearray(packed))
|
||||
|
||||
def ip_distance(ip1, ip2):
    """Calculate the distance between 2 IPs.

    This is the length, in bits, of the longest common prefix between
    the IPs.  It is calculated by XOR-ing the 2 IPs together, octet by
    octet, and counting the number of leading zero bits."""

    octets1 = ip_to_list_of_ints(ip1)
    octets2 = ip_to_list_of_ints(ip2)

    prefix_len = 0
    for (a, b) in zip(octets1, octets2):
        differing = a ^ b
        if differing != 0:
            # Count the leading zero bits of the first differing
            # octet, then stop.
            mask = 0x80
            while not (differing & mask):
                prefix_len += 1
                mask >>= 1
            break
        # Identical octets contribute all 8 bits.
        prefix_len += 8

    return prefix_len
|
||||
|
||||
def ip_distance_2_sum(ip, ips):
    """Return the sum of the squared IP distances between ip and each
    address in ips (0 when ips is empty).

    This could be made more efficient by inserting the ip_distance
    calculation into the loop in this function.  However, that would
    result in some loss of clarity and also will not be necessary in a
    C implementation."""

    return sum([ip_distance(ip, other) ** 2 for other in ips])
|
||||
|
||||
def imbalance_metric(ips):
    """Return the imbalance metric for a group of IPs.

    This is the sum of squares of the IP distances between each pair
    of IPs.  ips must be a sequence that supports slicing; fewer than
    2 addresses give 0."""

    # Iterate instead of recursing on the tail of the list, so the
    # number of addresses is not bounded by the recursion limit.
    total = 0
    for (k, ip) in enumerate(ips):
        # Pair each address only with those after it, so every pair
        # is counted once.
        total += ip_distance_2_sum(ip, ips[k + 1:])
    return total
|
||||
|
||||
def mean(l):
    """Return the arithmetic mean of the numbers in l as a float.

    Raises ZeroDivisionError when l is empty."""
    total = float(sum(l))
    return total / len(l)
|
||||
|
||||
class Node(object):
|
||||
def __init__(self, public_addresses):
|
||||
self.public_addresses = set(public_addresses)
|
||||
# List of list allows groups of IPs to be passed in. They're
|
||||
# not actually used in the algorithm but are just used by
|
||||
# calculate_imbalance() for checking the simulation. Note
|
||||
# that people can pass in garbage and make this code
|
||||
# fail... but we're all friends here in simulation world...
|
||||
# :-)
|
||||
if type(public_addresses[0]) is str:
|
||||
self.public_addresses = set(public_addresses)
|
||||
self.ip_groups = []
|
||||
else:
|
||||
# flatten
|
||||
self.public_addresses = set([i for s in public_addresses for i in s])
|
||||
self.ip_groups = public_addresses
|
||||
|
||||
self.current_addresses = set()
|
||||
self.healthy = True
|
||||
self.imbalance = -1
|
||||
|
||||
def __str__(self):
|
||||
return "%s %s%s" % \
|
||||
("*" if len(self.public_addresses) == 0 else \
|
||||
(" " if self.healthy else "#"),
|
||||
sorted(list(self.current_addresses)),
|
||||
" %d" % self.imbalance if options.lcp2 else "")
|
||||
|
||||
def can_node_serve_ip(self, ip):
|
||||
return ip in self.public_addresses
|
||||
|
||||
def node_ip_coverage(self):
|
||||
return len(self.current_addresses)
|
||||
def node_ip_coverage(self, ips=None):
|
||||
return len([a for a in self.current_addresses if ips == None or a in ips])
|
||||
|
||||
def set_imbalance(self, imbalance=-1):
|
||||
"""Set the imbalance metric to the given value. If none given
|
||||
then calculate it."""
|
||||
|
||||
if imbalance != -1:
|
||||
self.imbalance = imbalance
|
||||
else:
|
||||
self.imbalance = imbalance_metric(list(self.current_addresses))
|
||||
|
||||
def get_imbalance(self):
|
||||
return self.imbalance
|
||||
|
||||
class Cluster(object):
|
||||
def __init__(self):
|
||||
@ -131,27 +257,46 @@ class Cluster(object):
|
||||
self.ip_moves = []
|
||||
self.grat_ip_moves = []
|
||||
self.imbalance = []
|
||||
self.imbalance_groups = []
|
||||
self.imbalance_count = 0
|
||||
self.imbalance_groups_count = itertools.repeat(0)
|
||||
self.imbalance_metric = []
|
||||
self.events = -1
|
||||
self.num_unhealthy = []
|
||||
|
||||
self.prev = None
|
||||
|
||||
def __str__(self):
|
||||
return "\n".join(["%2d %s %s" %
|
||||
(i,
|
||||
"*" if len(n.public_addresses) == 0 else \
|
||||
(" " if n.healthy else "#"),
|
||||
sorted(list(n.current_addresses)))
|
||||
for (i, n) in enumerate(self.nodes)])
|
||||
return "\n".join(["%2d %s" % (i, n) \
|
||||
for (i, n) in enumerate(self.nodes)])
|
||||
|
||||
# This is naive. It assumes that IP groups are indicated by the
|
||||
# 1st node having IP groups.
|
||||
def have_ip_groups(self):
    """Return True when the first node has at least one IP group."""
    first_node = self.nodes[0]
    return len(first_node.ip_groups) > 0
|
||||
|
||||
def print_statistics(self):
|
||||
print_begin("STATISTICS")
|
||||
print "Events: %6d" % self.events
|
||||
print "Total IP moves: %6d" % sum(self.ip_moves)
|
||||
print "Gratuitous IP moves: %6d" % sum(self.grat_ip_moves)
|
||||
print "Max imbalance: %6d" % max(self.imbalance)
|
||||
print "Final imbalance: %6d" % self.imbalance[-1]
|
||||
print "Maximum unhealthy: %6d" % max(self.num_unhealthy)
|
||||
print "Events: %6d" % self.events
|
||||
print "Total IP moves: %6d" % sum(self.ip_moves)
|
||||
print "Gratuitous IP moves: %6d" % sum(self.grat_ip_moves)
|
||||
print "Max imbalance: %6d" % max(self.imbalance)
|
||||
if self.have_ip_groups():
|
||||
print "Max group imbalance counts: ", map(max, zip(*self.imbalance_groups))
|
||||
print "Mean imbalance: %f" % mean(self.imbalance)
|
||||
if self.have_ip_groups():
|
||||
print "Mean group imbalances counts: ", map(mean, zip(*self.imbalance_groups))
|
||||
print "Final imbalance: %6d" % self.imbalance[-1]
|
||||
if self.have_ip_groups():
|
||||
print "Final group imbalances: ", self.imbalance_groups[-1]
|
||||
if options.lcp2:
|
||||
print "Max LCP2 imbalance : %6d" % max(self.imbalance_metric)
|
||||
print "Soft imbalance count: %6d" % self.imbalance_count
|
||||
if self.have_ip_groups():
|
||||
print "Soft imbalance group counts: ", self.imbalance_groups_count
|
||||
if options.lcp2:
|
||||
print "Final LCP2 imbalance : %6d" % self.imbalance_metric[-1]
|
||||
print "Maximum unhealthy: %6d" % max(self.num_unhealthy)
|
||||
print_end()
|
||||
|
||||
def find_pnn_with_ip(self, ip):
|
||||
@ -178,7 +323,7 @@ class Cluster(object):
|
||||
verbose_print(pnn)
|
||||
|
||||
verbose_end()
|
||||
|
||||
|
||||
def unhealthy(self, *pnns):
|
||||
|
||||
verbose_begin("UNHEALTHY")
|
||||
@ -191,27 +336,42 @@ class Cluster(object):
|
||||
|
||||
def do_something_random(self):
|
||||
|
||||
"""Make random node(s) healthy or unhealthy.
|
||||
|
||||
"""Make a random node healthy or unhealthy.
|
||||
If options.aggressive is False then: If all nodes are healthy
|
||||
or unhealthy, then invert one of them; otherwise, there's a 1
|
||||
in options.odds chance of making another node unhealthy.
|
||||
|
||||
If all nodes are healthy or unhealthy, then invert one of
|
||||
them. Otherwise, there's a 1 in options.odds chance of making
|
||||
another node unhealthy."""
|
||||
If options.aggressive is True then: For each node there is a 1
|
||||
in options.odds chance of flipping the state of that node
|
||||
between healthy and unhealthy."""
|
||||
|
||||
num_nodes = len(self.nodes)
|
||||
healthy_pnns = [i for (i,n) in enumerate(self.nodes) if n.healthy]
|
||||
num_healthy = len(healthy_pnns)
|
||||
if not options.aggressive:
|
||||
num_nodes = len(self.nodes)
|
||||
healthy_pnns = [i for (i,n) in enumerate(self.nodes) if n.healthy]
|
||||
num_healthy = len(healthy_pnns)
|
||||
|
||||
if num_nodes == num_healthy:
|
||||
self.unhealthy(random.randint(0, num_nodes-1))
|
||||
elif num_healthy == 0:
|
||||
self.healthy(random.randint(0, num_nodes-1))
|
||||
elif random.randint(1, options.odds) == 1:
|
||||
self.unhealthy(random.choice(healthy_pnns))
|
||||
if num_nodes == num_healthy:
|
||||
self.unhealthy(random.randint(0, num_nodes-1))
|
||||
elif num_healthy == 0:
|
||||
self.healthy(random.randint(0, num_nodes-1))
|
||||
elif random.randint(1, options.odds) == 1:
|
||||
self.unhealthy(random.choice(healthy_pnns))
|
||||
else:
|
||||
all_pnns = range(num_nodes)
|
||||
unhealthy_pnns = sorted(list(set(all_pnns) - set(healthy_pnns)))
|
||||
self.healthy(random.choice(unhealthy_pnns))
|
||||
else:
|
||||
all_pnns = range(num_nodes)
|
||||
unhealthy_pnns = sorted(list(set(all_pnns) - set(healthy_pnns)))
|
||||
self.healthy(random.choice(unhealthy_pnns))
|
||||
# We need to make at least one change or we retry...
|
||||
changed = False
|
||||
while not changed:
|
||||
for (pnn, n) in enumerate(self.nodes):
|
||||
if random.randint(1, options.odds) == 1:
|
||||
changed = True
|
||||
if n.healthy:
|
||||
self.unhealthy(pnn)
|
||||
else:
|
||||
self.healthy(pnn)
|
||||
|
||||
def random_iterations(self):
|
||||
i = 1
|
||||
@ -219,35 +379,26 @@ class Cluster(object):
|
||||
verbose_begin("EVENT %d" % i)
|
||||
verbose_end()
|
||||
self.do_something_random()
|
||||
if self.recover() and options.exit > 0:
|
||||
if self.recover() and options.exit:
|
||||
break
|
||||
i += 1
|
||||
|
||||
self.print_statistics()
|
||||
|
||||
def calculate_imbalance(self):
|
||||
def imbalance_for_ips(self, ips):
|
||||
|
||||
imbalance = 0
|
||||
|
||||
assigned = sorted([ip
|
||||
for n in self.nodes
|
||||
for ip in n.current_addresses])
|
||||
maxnode = -1
|
||||
minnode = -1
|
||||
|
||||
for ip in assigned:
|
||||
|
||||
num_capable = 0
|
||||
maxnode = -1
|
||||
minnode = -1
|
||||
for ip in ips:
|
||||
for (i, n) in enumerate(self.nodes):
|
||||
if not n.healthy:
|
||||
|
||||
if not n.healthy or not n.can_node_serve_ip(ip):
|
||||
continue
|
||||
|
||||
if not n.can_node_serve_ip(ip):
|
||||
continue
|
||||
|
||||
num_capable += 1
|
||||
|
||||
num = n.node_ip_coverage()
|
||||
num = n.node_ip_coverage(ips)
|
||||
|
||||
if maxnode == -1 or num > maxnum:
|
||||
maxnode = i
|
||||
@ -256,24 +407,42 @@ class Cluster(object):
|
||||
if minnode == -1 or num < minnum:
|
||||
minnode = i
|
||||
minnum = num
|
||||
|
||||
if maxnode == -1:
|
||||
|
||||
if maxnode == -1 or minnode == -1:
|
||||
continue
|
||||
|
||||
i = maxnum - minnum
|
||||
if maxnum - minnum < 2:
|
||||
i = 0
|
||||
#if i < 2:
|
||||
# i = 0
|
||||
imbalance = max([imbalance, i])
|
||||
|
||||
return imbalance
|
||||
|
||||
|
||||
def calculate_imbalance(self):
    """Return (overall, groups): the imbalance across all currently
    assigned IPs, and a list with the imbalance of each IP group of
    the first node, in order."""

    # First, do all the assigned IPs.
    assigned = []
    for node in self.nodes:
        assigned.extend(node.current_addresses)
    assigned.sort()

    overall = self.imbalance_for_ips(assigned)

    # FIXME?  If dealing with IP groups, assume the nodes are all
    # the same.
    per_group = [self.imbalance_for_ips(group)
                 for group in self.nodes[0].ip_groups]

    return (overall, per_group)
|
||||
|
||||
|
||||
def diff(self):
|
||||
"""Calculate differences in IP assignments between self and prev.
|
||||
|
||||
Gratuitous IP moves (from a healthy node to a healthy node)
|
||||
are prefix by !!. Any gratuitous IP moves cause this function
|
||||
to return False. If there are no gratuitous moves then it
|
||||
will return True."""
|
||||
are prefixed by !!."""
|
||||
|
||||
ip_moves = 0
|
||||
grat_ip_moves = 0
|
||||
@ -297,33 +466,6 @@ class Cluster(object):
|
||||
(prefix, ip, old, new))
|
||||
|
||||
return (ip_moves, grat_ip_moves, details)
|
||||
|
||||
def find_least_loaded_node(self, ip):
|
||||
"""Just like find_takeover_node but doesn't care about health."""
|
||||
pnn = -1
|
||||
min = 0
|
||||
for (i, n) in enumerate(self.nodes):
|
||||
if not n.can_node_serve_ip(ip):
|
||||
continue
|
||||
|
||||
num = n.node_ip_coverage()
|
||||
|
||||
if (pnn == -1):
|
||||
pnn = i
|
||||
min = num
|
||||
else:
|
||||
if num < min:
|
||||
pnn = i
|
||||
min = num
|
||||
|
||||
if pnn == -1:
|
||||
verbose_print("Could not find node to take over public address %s" % ip)
|
||||
return False
|
||||
|
||||
self.nodes[pnn].current_addresses.add(ip)
|
||||
|
||||
verbose_print("%s -> %d" % (ip, pnn))
|
||||
return True
|
||||
|
||||
def find_takeover_node(self, ip):
|
||||
|
||||
@ -355,6 +497,190 @@ class Cluster(object):
|
||||
verbose_print("%s -> %d" % (ip, pnn))
|
||||
return True
|
||||
|
||||
def basic_allocate_unassigned(self):
    """Call find_takeover_node() for each public IP that no node
    currently hosts, in sorted order."""

    in_use = set()
    for node in self.nodes:
        in_use.update(node.current_addresses)

    for ip in sorted(self.all_public_ips - in_use):
        self.find_takeover_node(ip)
|
||||
|
||||
def basic_failback(self, retries_l):
|
||||
|
||||
assigned = sorted([ip
|
||||
for n in self.nodes
|
||||
for ip in n.current_addresses])
|
||||
for ip in assigned:
|
||||
|
||||
maxnode = -1
|
||||
minnode = -1
|
||||
for (i, n) in enumerate(self.nodes):
|
||||
if not n.healthy:
|
||||
continue
|
||||
|
||||
if not n.can_node_serve_ip(ip):
|
||||
continue
|
||||
|
||||
num = n.node_ip_coverage()
|
||||
|
||||
if maxnode == -1:
|
||||
maxnode = i
|
||||
maxnum = num
|
||||
else:
|
||||
if num > maxnum:
|
||||
maxnode = i
|
||||
maxnum = num
|
||||
if minnode == -1:
|
||||
minnode = i
|
||||
minnum = num
|
||||
else:
|
||||
if num < minnum:
|
||||
minnode = i
|
||||
minnum = num
|
||||
|
||||
if maxnode == -1:
|
||||
print "Could not find maxnode. May not be able to serve ip", ip
|
||||
continue
|
||||
|
||||
#if self.deterministic_public_ips:
|
||||
# continue
|
||||
|
||||
if maxnum > minnum + 1 and retries_l[0] < options.retries:
|
||||
# Remove the 1st ip from maxnode
|
||||
t = sorted(list(self.nodes[maxnode].current_addresses))
|
||||
realloc = t[0]
|
||||
verbose_print("%s <- %d" % (realloc, maxnode))
|
||||
self.nodes[maxnode].current_addresses.remove(realloc)
|
||||
# Redo the outer loop.
|
||||
retries_l[0] += 1
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def lcp2_allocate_unassigned(self):
    """Assign unassigned public IPs using the LCP2 metric.

    Repeatedly pick the (IP, node) pair with the smallest increase in
    the node's imbalance, assign it, and update that node's imbalance.
    Stop when no healthy node can serve any remaining IP; the
    remaining IPs are then reported via verbose_print()."""

    # Assign as many unassigned addresses as possible.  Keep
    # selecting the optimal assignment until we don't manage to
    # assign anything.
    in_use = set([ip for n in self.nodes for ip in n.current_addresses])
    unassigned = sorted(list(self.all_public_ips - in_use))

    while len(unassigned) > 0:
        debug_begin(" CONSIDERING MOVES (UNASSIGNED)")

        # Best candidate so far, as (dstdsum, dstnode, ip).
        best = None

        for ip in unassigned:
            for (dstnode, node) in enumerate(self.nodes):
                if not (node.can_node_serve_ip(ip) and node.healthy):
                    continue
                dstdsum = ip_distance_2_sum(ip, node.current_addresses)
                dstimbl = node.get_imbalance() + dstdsum
                debug_print(" %s -> %d [+%d]" % \
                                (ip,
                                 dstnode,
                                 dstimbl - node.get_imbalance()))

                # Only a strictly smaller cost replaces the current
                # best, so the first candidate found wins ties.
                if best is None or dstdsum < best[0]:
                    best = (dstdsum, dstnode, ip)
        debug_end()

        if best is None:
            # Nothing could be assigned this time around, so stop.
            break

        (mindsum, minnode, minip) = best
        target = self.nodes[minnode]
        target.current_addresses.add(minip)
        target.set_imbalance(target.get_imbalance() + mindsum)
        verbose_print("%s -> %d [+%d]" % (minip, minnode, mindsum))
        unassigned.remove(minip)

    for ip in unassigned:
        verbose_print("Could not find node to take over public address %s" % ip)
|
||||
|
||||
def lcp2_failback(self, targets):
    """Try to move one IP from the most imbalanced node to a target.

    targets holds the node numbers that may receive an address.  The
    move chosen is the one giving the lowest combined source and
    destination imbalance, among moves that leave the destination
    below the source's current imbalance and cost the destination
    less than they cost the source.

    Returns True if an IP was moved, False otherwise."""

    # Get the node with the highest imbalance metric.
    srcnode = -1
    maximbl = 0
    for (pnn, node) in enumerate(self.nodes):
        imbl = node.get_imbalance()
        if srcnode == -1 or imbl > maximbl:
            (srcnode, maximbl) = (pnn, imbl)

    # This means that all nodes had 0 or 1 addresses, so can't
    # be imbalanced.
    if maximbl == 0:
        return False

    # We'll need this a few times...
    ips = self.nodes[srcnode].current_addresses

    # Find an IP and destination node that best reduces imbalance.
    optimum = None
    debug_begin(" CONSIDERING MOVES FROM %d [%d]" % (srcnode, maximbl))
    for ip in ips:
        # What is this IP address costing the source node?
        srcdsum = ip_distance_2_sum(ip, ips - set([ip]))
        srcimbl = maximbl - srcdsum

        # Consider what this IP address would cost each potential
        # destination node.  Destination nodes are limited to
        # those that are newly healthy, since we don't want to
        # do gratuitous failover of IPs just to make minor
        # balance improvements.
        for dstnode in targets:
            dst = self.nodes[dstnode]
            if not (dst.can_node_serve_ip(ip) and dst.healthy):
                continue

            dstdsum = ip_distance_2_sum(ip, dst.current_addresses)
            dstimbl = dst.get_imbalance() + dstdsum
            debug_print(" %d [%d] -> %s -> %d [+%d]" % \
                            (srcnode,
                             srcimbl - self.nodes[srcnode].get_imbalance(),
                             ip,
                             dstnode,
                             dstimbl - dst.get_imbalance()))

            if not ((dstimbl < maximbl) and (dstdsum < srcdsum)):
                continue

            # Keep the first acceptable move; replace it only with a
            # strictly better combined imbalance.
            if optimum is None or \
                    (srcimbl + dstimbl) < (optimum[2] + optimum[4]):
                optimum = (ip, srcnode, srcimbl, dstnode, dstimbl)
    debug_end()

    if optimum is None:
        return False

    # We found a move that makes things better...
    (ip, srcnode, srcimbl, dstnode, dstimbl) = optimum
    src = self.nodes[srcnode]
    dst = self.nodes[dstnode]
    ini_srcimbl = src.get_imbalance()
    ini_dstimbl = dst.get_imbalance()

    src.current_addresses.remove(ip)
    src.set_imbalance(srcimbl)

    dst.current_addresses.add(ip)
    dst.set_imbalance(dstimbl)

    verbose_print("%d [%d] -> %s -> %d [+%d]" % \
                      (srcnode,
                       srcimbl - ini_srcimbl,
                       ip,
                       dstnode,
                       dstimbl - ini_dstimbl))

    return True
|
||||
|
||||
|
||||
def ctdb_takeover_run(self):
|
||||
|
||||
self.events += 1
|
||||
@ -369,22 +695,11 @@ class Cluster(object):
|
||||
# Remap everything.
|
||||
addr_list = sorted(list(self.all_public_ips))
|
||||
for (i, ip) in enumerate(addr_list):
|
||||
if options.hack == 1:
|
||||
self.quietly_remove_ip(ip)
|
||||
self.find_least_loaded_node(ip)
|
||||
elif options.hack == 2:
|
||||
pnn = i % len(self.nodes)
|
||||
if ip in self.nodes[pnn].public_addresses:
|
||||
self.quietly_remove_ip(ip)
|
||||
# Add addresses to new node.
|
||||
self.nodes[pnn].current_addresses.add(ip)
|
||||
verbose_print("%s -> %d" % (ip, pnn))
|
||||
else:
|
||||
self.quietly_remove_ip(ip)
|
||||
# Add addresses to new node.
|
||||
pnn = i % len(self.nodes)
|
||||
self.nodes[pnn].current_addresses.add(ip)
|
||||
verbose_print("%s -> %d" % (ip, pnn))
|
||||
self.quietly_remove_ip(ip)
|
||||
# Add addresses to new node.
|
||||
pnn = i % len(self.nodes)
|
||||
self.nodes[pnn].current_addresses.add(ip)
|
||||
verbose_print("%s -> %d" % (ip, pnn))
|
||||
|
||||
# Remove public addresses from unhealthy nodes.
|
||||
for (pnn, n) in enumerate(self.nodes):
|
||||
@ -399,69 +714,39 @@ class Cluster(object):
|
||||
for ip in n.current_addresses - n.public_addresses])
|
||||
n.current_addresses &= n.public_addresses
|
||||
|
||||
# We'll only retry the balancing act up to 5 times.
|
||||
retries = 0
|
||||
if options.lcp2:
|
||||
newly_healthy = [pnn for (pnn, n) in enumerate(self.nodes)
|
||||
if len(n.current_addresses) == 0 and n.healthy]
|
||||
for n in self.nodes:
|
||||
n.set_imbalance()
|
||||
|
||||
# We'll only retry the balancing act up to options.retries
|
||||
# times (for the basic non-deterministic algorithm). This
|
||||
# nonsense gives us a reference on the retries count in
|
||||
# Python. It will be easier in C. :-)
|
||||
# For LCP2 we reassign as many IPs as possible from heavily "loaded" nodes
|
||||
# to nodes that are newly healthy, looping until we fail to
|
||||
# reassign an IP.
|
||||
retries_l = [0]
|
||||
should_loop = True
|
||||
while should_loop:
|
||||
should_loop = False
|
||||
|
||||
assigned = set([ip for n in self.nodes for ip in n.current_addresses])
|
||||
unassigned = sorted(list(self.all_public_ips - assigned))
|
||||
if options.lcp2:
|
||||
self.lcp2_allocate_unassigned()
|
||||
else:
|
||||
self.basic_allocate_unassigned()
|
||||
|
||||
for ip in unassigned:
|
||||
self.find_takeover_node(ip)
|
||||
|
||||
if self.no_ip_failback:
|
||||
if self.no_ip_failback or self.deterministic_public_ips:
|
||||
break
|
||||
|
||||
assigned = sorted([ip
|
||||
for n in self.nodes
|
||||
for ip in n.current_addresses])
|
||||
for ip in assigned:
|
||||
|
||||
maxnode = -1
|
||||
minnode = -1
|
||||
for (i, n) in enumerate(self.nodes):
|
||||
if not n.healthy:
|
||||
continue
|
||||
|
||||
if not n.can_node_serve_ip(ip):
|
||||
continue
|
||||
|
||||
num = n.node_ip_coverage()
|
||||
|
||||
if maxnode == -1:
|
||||
maxnode = i
|
||||
maxnum = num
|
||||
else:
|
||||
if num > maxnum:
|
||||
maxnode = i
|
||||
maxnum = num
|
||||
if minnode == -1:
|
||||
minnode = i
|
||||
minnum = num
|
||||
else:
|
||||
if num < minnum:
|
||||
minnode = i
|
||||
minnum = num
|
||||
|
||||
if maxnode == -1:
|
||||
print "Could not maxnode. May not be able to serve ip", ip
|
||||
continue
|
||||
|
||||
if self.deterministic_public_ips:
|
||||
continue
|
||||
|
||||
if maxnum > minnum + 1 and retries < options.retries:
|
||||
# Remove the 1st ip from maxnode
|
||||
t = sorted(list(self.nodes[maxnode].current_addresses))
|
||||
realloc = t[0]
|
||||
verbose_print("%s <- %d" % (realloc, maxnode))
|
||||
self.nodes[maxnode].current_addresses.remove(realloc)
|
||||
retries += 1
|
||||
# Redo the outer loop.
|
||||
should_loop = True
|
||||
if options.lcp2:
|
||||
if len(newly_healthy) == 0:
|
||||
break
|
||||
should_loop = self.lcp2_failback(newly_healthy)
|
||||
else:
|
||||
should_loop = self.basic_failback(retries_l)
|
||||
|
||||
|
||||
def recover(self):
|
||||
verbose_begin("TAKEOVER")
|
||||
@ -482,11 +767,31 @@ class Cluster(object):
|
||||
print "\n".join(details)
|
||||
print_end()
|
||||
|
||||
imbalance = self.calculate_imbalance()
|
||||
(imbalance, imbalance_groups) = self.calculate_imbalance()
|
||||
self.imbalance.append(imbalance)
|
||||
self.imbalance_groups.append(imbalance_groups)
|
||||
|
||||
if imbalance > options.soft_limit:
|
||||
self.imbalance_count += 1
|
||||
|
||||
# There must be a cleaner way...
|
||||
t = []
|
||||
for (c, i) in zip(self.imbalance_groups_count, imbalance_groups):
|
||||
if i > options.soft_limit:
|
||||
t.append(c + i)
|
||||
else:
|
||||
t.append(c)
|
||||
self.imbalance_groups_count = t
|
||||
|
||||
imbalance_metric = max([n.get_imbalance() for n in self.nodes])
|
||||
self.imbalance_metric.append(imbalance_metric)
|
||||
if options.balance:
|
||||
print_begin("IMBALANCE")
|
||||
print imbalance
|
||||
print "ALL IPS:", imbalance
|
||||
if self.have_ip_groups():
|
||||
print "IP GROUPS:", imbalance_groups
|
||||
if options.lcp2:
|
||||
print "LCP2 IMBALANCE:", imbalance_metric
|
||||
print_end()
|
||||
|
||||
num_unhealthy = len(self.nodes) - \
|
||||
@ -501,4 +806,7 @@ class Cluster(object):
|
||||
self.prev = None
|
||||
self.prev = copy.deepcopy(self)
|
||||
|
||||
return grat_ip_moves
|
||||
# True is bad!
|
||||
return (grat_ip_moves > 0) or \
|
||||
(not self.have_ip_groups() and imbalance > options.hard_limit) or \
|
||||
(self.have_ip_groups() and (max(imbalance_groups) > options.hard_limit))
|
||||
|
25
ctdb/tests/takeover/ip_groups1.py
Executable file
25
ctdb/tests/takeover/ip_groups1.py
Executable file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# 2 IP groups, both on the same 5 nodes, with each group on different
|
||||
# interfaces/VLANs. One group has many more addresses to test how
|
||||
# well an "imbalanced" configuration will balance...
|
||||
|
||||
from ctdb_takeover import Cluster, Node, process_args
|
||||
|
||||
process_args()
|
||||
|
||||
addresses20 = ['192.168.20.%d' % n for n in range(1, 13)]
|
||||
addresses128 = ['192.168.128.%d' % n for n in range(1, 5)]
|
||||
|
||||
c = Cluster()
|
||||
|
||||
for i in range(5):
|
||||
c.add_node(Node([addresses20, addresses128]))
|
||||
|
||||
#for i in range(3):
|
||||
# c.add_node(Node([addresses20]))
|
||||
|
||||
|
||||
c.recover()
|
||||
|
||||
c.random_iterations()
|
20
ctdb/tests/takeover/ip_groups2.py
Executable file
20
ctdb/tests/takeover/ip_groups2.py
Executable file
@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# 2 groups of addresses, combined into 1 pool so the checking
|
||||
# algorithm doesn't know about the groups, across 2 nodes.
|
||||
|
||||
from ctdb_takeover import Cluster, Node, process_args
|
||||
|
||||
process_args()
|
||||
|
||||
addresses20 = ['192.168.20.%d' % n for n in range(1, 13)]
|
||||
addresses21 = ['192.168.21.%d' % n for n in range(1, 5)]
|
||||
|
||||
c = Cluster()
|
||||
|
||||
for i in range(2):
|
||||
c.add_node(Node(addresses20 + addresses21))
|
||||
|
||||
c.recover()
|
||||
|
||||
c.random_iterations()
|
27
ctdb/tests/takeover/ip_groups3.py
Executable file
27
ctdb/tests/takeover/ip_groups3.py
Executable file
@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# 4 IP groups, across 10 nodes, with each group on different
|
||||
# interfaces/VLANs. 80 addresses in total but not evenly balanced, to
|
||||
# help check some of the more extreme behaviour.
|
||||
|
||||
from ctdb_takeover import Cluster, Node, process_args
|
||||
|
||||
process_args()
|
||||
|
||||
addresses1 = ['192.168.1.%d' % n for n in range(1, 41)]
|
||||
addresses2 = ['192.168.2.%d' % n for n in range(1, 21)]
|
||||
addresses3 = ['192.168.3.%d' % n for n in range(1, 11)]
|
||||
addresses4 = ['192.168.4.%d' % n for n in range(1, 11)]
|
||||
|
||||
# Try detecting imbalance with square root of number of nodes? Or
|
||||
# just with a parameter indicating how unbalanced you're willing to
|
||||
# accept...
|
||||
|
||||
c = Cluster()
|
||||
|
||||
for i in range(10):
|
||||
c.add_node(Node([addresses1, addresses2, addresses3, addresses4]))
|
||||
|
||||
c.recover()
|
||||
|
||||
c.random_iterations()
|
25
ctdb/tests/takeover/ip_groups4.py
Executable file
25
ctdb/tests/takeover/ip_groups4.py
Executable file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# 2 IP groups, across 2 nodes, with each group on different
|
||||
# interfaces. 4 addresses per group. A nice little canonical 2 node
|
||||
# configuration.
|
||||
|
||||
from ctdb_takeover import Cluster, Node, process_args
|
||||
|
||||
process_args()
|
||||
|
||||
addresses1 = ['192.168.1.%d' % n for n in range(1, 5)]
|
||||
addresses2 = ['192.168.2.%d' % n for n in range(1, 5)]
|
||||
|
||||
# Try detecting imbalance with square root of number of nodes? Or
|
||||
# just with a parameter indicating how unbalanced you're willing to
|
||||
# accept...
|
||||
|
||||
c = Cluster()
|
||||
|
||||
for i in range(2):
|
||||
c.add_node(Node([addresses1, addresses2]))
|
||||
|
||||
c.recover()
|
||||
|
||||
c.random_iterations()
|
23
ctdb/tests/takeover/ip_groups5.py
Executable file
23
ctdb/tests/takeover/ip_groups5.py
Executable file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# 1 IP group, to test backward compatibility of LCP2 algorithm. 16
|
||||
# addresses across 4 nodes.
|
||||
|
||||
from ctdb_takeover import Cluster, Node, process_args
|
||||
|
||||
process_args()
|
||||
|
||||
addresses1 = ['192.168.1.%d' % n for n in range(1, 17)]
|
||||
|
||||
# Try detecting imbalance with square root of number of nodes? Or
|
||||
# just with a parameter indicating how unbalanced you're willing to
|
||||
# accept...
|
||||
|
||||
c = Cluster()
|
||||
|
||||
for i in range(4):
|
||||
c.add_node(Node(addresses1))
|
||||
|
||||
c.recover()
|
||||
|
||||
c.random_iterations()
|
@ -893,6 +893,7 @@ static int control_natgwlist(struct ctdb_context *ctdb, int argc, const char **a
|
||||
/* or if we still can not find any */
|
||||
if (i == nodemap->num) {
|
||||
printf("-1 0.0.0.0\n");
|
||||
ret = 2; /* matches ENOENT */
|
||||
}
|
||||
}
|
||||
|
||||
@ -910,7 +911,7 @@ static int control_natgwlist(struct ctdb_context *ctdb, int argc, const char **a
|
||||
!!(nodemap->nodes[i].flags&NODE_FLAGS_STOPPED));
|
||||
}
|
||||
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
|
Loading…
x
Reference in New Issue
Block a user