From 3499c8c673e468be589a5ef6cce31b08d8b489cc Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 9 Jul 2007 12:33:00 +1000 Subject: [PATCH 1/9] when checking the nodemap flags for consitency while monitoring the cluster, we cant check that both the BANNED and the DISCONNECTED flags are both set at the same time since if a node becomes banned just before it is DISCONNECTED there is no guarantee that all other nodes will have seen the BANNED flag. So we must first check the DISCONNECTED flag only and only if the DISCONNECTED flag is not set should we check the BANNED flag. othervise this can cause a recovery loop while some nodes thing the disconnected node is DISCONNECTED|BANNED and other think it is just DISCONNECTED (This used to be ctdb commit 0967b2fff376ead631d98e78b3a97253fc109c69) --- ctdb/server/ctdb_recoverd.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index ef867efac12..19e843ef3f2 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -1325,9 +1325,17 @@ again: vnnmap, nodemap->nodes[j].vnn); goto again; } - if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) != - (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) { - DEBUG(0, (__location__ " Remote node:%u has different nodemap flags for %d (0x%x vs 0x%x)\n", + if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) != + (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) { + DEBUG(0, (__location__ " Remote node:%u has different nodemap disconnected flag for %d (0x%x vs 0x%x)\n", + nodemap->nodes[j].vnn, i, + remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags)); + do_recovery(rec, mem_ctx, vnn, num_active, nodemap, + vnnmap, nodemap->nodes[j].vnn); + goto again; + } else if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_BANNED) != + (nodemap->nodes[i].flags & NODE_FLAGS_BANNED)) { + DEBUG(0, (__location__ " Remote node:%u has different nodemap banned flag for %d (0x%x vs 0x%x)\n", nodemap->nodes[j].vnn, i, remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, From b871c3e365bd62a4c8ad9bac8c253dba4b894c9d Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 9 Jul 2007 12:55:15 +1000 Subject: [PATCH 2/9] a better way to fix the DISCONNECT|BANNED vs DISCONNECT bug (This used to be ctdb commit 5c638d7731c5a268de02d3a37828ac7aec9a12de) --- ctdb/server/ctdb_recoverd.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 19e843ef3f2..bba963b9d1d 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -385,11 +385,6 @@ static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node for (i=0;inum;i++) { struct ctdb_node_flag_change c; TDB_DATA data; - uint32_t flags = nodemap->nodes[i].flags; - - if (flags & NODE_FLAGS_DISCONNECTED) { - continue; - } c.vnn = nodemap->nodes[i].vnn; c.flags = nodemap->nodes[i].flags; @@ -1325,17 +1320,9 @@ again: vnnmap, nodemap->nodes[j].vnn); goto again; } - if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) != - (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) { - DEBUG(0, (__location__ " Remote node:%u has different nodemap disconnected flag for %d (0x%x vs 0x%x)\n", - nodemap->nodes[j].vnn, i, - remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags)); - do_recovery(rec, mem_ctx, vnn, num_active, nodemap, - vnnmap, nodemap->nodes[j].vnn); - goto again; - } else if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_BANNED) != - (nodemap->nodes[i].flags & NODE_FLAGS_BANNED)) { - DEBUG(0, (__location__ " Remote node:%u has different nodemap banned flag for %d (0x%x vs 0x%x)\n", + if ((remote_nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) != + (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) { + DEBUG(0, (__location__ " Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)\n", nodemap->nodes[j].vnn, i, remote_nodemap->nodes[i].flags, nodemap->nodes[i].flags)); do_recovery(rec, mem_ctx, vnn, num_active, nodemap, From 69f3a09e6f338b54c27a79f48950b7508bc07bc2 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 9 Jul 2007 13:21:17 +1000 Subject: [PATCH 3/9] when a remote node has sent us a message to update the flags for a node, dont let those messages modify the DISCONNECTED flag. the DISCONNECTED flag must be managed locally since it describes whether the local node can communicate with the remote node or not (This used to be ctdb commit 5650673205d335a32d4f27f66847ea66752a00f0) --- ctdb/server/ctdb_recoverd.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index bba963b9d1d..0a07244fe30 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -1068,11 +1068,20 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid, return; } - if (nodemap->nodes[i].flags != c->flags) { + /* Dont let messages from remote nodes change the DISCONNECTED flag. + This flag is handled locally based on whether the local node + can communicate with the node or not. + */ + c->flags &= ~NODE_FLAGS_DISCONNECTED; + + /* check whether the flags (except for the DISCONNECTED flag have changed */ + if ((nodemap->nodes[i].flags&(~NODE_FLAGS_DISCONNECTED)) != c->flags) { DEBUG(0,("Node %u has changed flags - now 0x%x\n", c->vnn, c->flags)); } - nodemap->nodes[i].flags = c->flags; + /* Update the flags but leave the DISCONNECTED flag as is */ + nodemap->nodes[i].flags = c->flags + | (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED); ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_master); From a859723912fd4c204007a136d6b82b5cc4145081 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Mon, 9 Jul 2007 17:40:15 +1000 Subject: [PATCH 4/9] nicer handling of DISCONNECTED flag when we update the node flags from a remote message (This used to be ctdb commit 9a50ad22be61a09761ffda89de91ef3221917c84) --- ctdb/server/ctdb_recoverd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 0a07244fe30..14808268782 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -1073,15 +1073,15 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid, can communicate with the node or not. */ c->flags &= ~NODE_FLAGS_DISCONNECTED; + if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) { + c->flags |= NODE_FLAGS_DISCONNECTED; + } - /* check whether the flags (except for the DISCONNECTED flag have changed */ - if ((nodemap->nodes[i].flags&(~NODE_FLAGS_DISCONNECTED)) != c->flags) { + if (nodemap->nodes[i].flags != c->flags) { DEBUG(0,("Node %u has changed flags - now 0x%x\n", c->vnn, c->flags)); } - /* Update the flags but leave the DISCONNECTED flag as is */ - nodemap->nodes[i].flags = c->flags - | (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED); + nodemap->nodes[i].flags = c->flags; ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_master); From dbc66d054bfe2acbf8eb3bee57b05ef3fed9f734 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 10 Jul 2007 09:45:14 +1000 Subject: [PATCH 5/9] dont restart the tcp service after a ip takeover, it is more efficient to just kill off the tcp connections (This used to be ctdb commit bc481c3f1a44c50648488c4f8a7f15ec395d446f) --- ctdb/config/events.d/60.nfs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs index 48b50f126b3..47f8a903a6b 100755 --- a/ctdb/config/events.d/60.nfs +++ b/ctdb/config/events.d/60.nfs @@ -61,12 +61,18 @@ case $cmd in ;; recovered) - # restart NFS to ensure that all TCP connections to the released ip - # are closed + # RST all tcp connections used for NFS to ensure that they do + # not survive in ESTABLISHED state across a failover/failback + # and create an ack storm [ -f /etc/ctdb/state/nfs/restart ] && { - ( service nfs status > /dev/null 2>&1 && - service nfs restart > /dev/null 2>&1 && - service nfslock restart > /dev/null 2>&1 ) & + netstat -tn |egrep '^tcp.*\s+[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:2049\s+.*ESTABLISHED' | awk '{print $4" "$5}' | while read dest src; do + srcip=`echo $src | cut -d: -f1` + srcport=`echo $src | cut -d: -f2` + destip=`echo $dest | cut -d: -f1` + destport=`echo $dest | cut -d: -f2` + ctdb killtcp $srcip:$srcport $destip:$destport + ctdb killtcp $destip:$destport $srcip:$srcport + done } > /dev/null 2>&1 /bin/rm -f /etc/ctdb/state/nfs/restart From 1c32f65ee03008d4f0339785c69bed8f156baebe Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 10 Jul 2007 10:07:26 +1000 Subject: [PATCH 6/9] run the ctdb killtcp in the background (This used to be ctdb commit d6a514c2b3d427099ed669eef104146608378fa8) --- ctdb/config/events.d/60.nfs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs index 47f8a903a6b..5d03b7c75db 100755 --- a/ctdb/config/events.d/60.nfs +++ b/ctdb/config/events.d/60.nfs @@ -70,8 +70,8 @@ case $cmd in srcport=`echo $src | cut -d: -f2` destip=`echo $dest | cut -d: -f1` destport=`echo $dest | cut -d: -f2` - ctdb killtcp $srcip:$srcport $destip:$destport - ctdb killtcp $destip:$destport $srcip:$srcport + ctdb killtcp $srcip:$srcport $destip:$destport >/dev/null 2>&1 & + ctdb killtcp $destip:$destport $srcip:$srcport >/dev/null 2>&1 & done } > /dev/null 2>&1 /bin/rm -f /etc/ctdb/state/nfs/restart From d81bca20729759e357ed61781454b601fdb58686 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 10 Jul 2007 10:24:20 +1000 Subject: [PATCH 7/9] make it possible to specify how many times ctdb killtcp will try to RST the tcp connection change the 60.nfs script to run ctdb killtcp in the foreground so we dont get lots of these running in parallel when there are a lot of tcp connections to rst (This used to be ctdb commit d81616214752882242f2886e94681972a790db80) --- ctdb/config/events.d/60.nfs | 4 ++-- ctdb/tools/ctdb.c | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs index 5d03b7c75db..7a7dcd389bd 100755 --- a/ctdb/config/events.d/60.nfs +++ b/ctdb/config/events.d/60.nfs @@ -70,8 +70,8 @@ case $cmd in srcport=`echo $src | cut -d: -f2` destip=`echo $dest | cut -d: -f1` destport=`echo $dest | cut -d: -f2` - ctdb killtcp $srcip:$srcport $destip:$destport >/dev/null 2>&1 & - ctdb killtcp $destip:$destport $srcip:$srcport >/dev/null 2>&1 & + ctdb killtcp $srcip:$srcport $destip:$destport 1 >/dev/null 2>&1 + ctdb killtcp $destip:$destport $srcip:$srcport 1 >/dev/null 2>&1 done } > /dev/null 2>&1 /bin/rm -f /etc/ctdb/state/nfs/restart diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c index 371f6e867a6..4e0378d7b9c 100644 --- a/ctdb/tools/ctdb.c +++ b/ctdb/tools/ctdb.c @@ -308,10 +308,10 @@ static int control_status(struct ctdb_context *ctdb, int argc, const char **argv */ static int kill_tcp(struct ctdb_context *ctdb, int argc, const char **argv) { - int i, ret; + int i, ret, numrst; struct sockaddr_in src, dst; - if (argc < 2) { + if (argc < 3) { usage(); } @@ -325,7 +325,9 @@ static int kill_tcp(struct ctdb_context *ctdb, int argc, const char **argv) return -1; } - for (i=0;i<5;i++) { + numrst = strtoul(argv[2], NULL, 0); + + for (i=0;iev, &src, &dst); printf("ret:%d\n", ret); @@ -889,7 +891,7 @@ static const struct { { "recover", control_recover, true, "force recovery" }, { "freeze", control_freeze, true, "freeze all databases" }, { "thaw", control_thaw, true, "thaw all databases" }, - { "killtcp", kill_tcp, false, "kill a tcp connection", " " }, + { "killtcp", kill_tcp, false, "kill a tcp connection. Try times.", " " }, { "tickle", tickle_tcp, false, "send a tcp tickle ack", " " }, }; From 26fc2ebd4bfafc9f2875fbb87ade7d1d96138d70 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 10 Jul 2007 12:43:46 +1000 Subject: [PATCH 8/9] update the documentation for NFS to mention that the lock manager must run on the same port on all nodes. remove the CTDB_MANAGES_NFSLOCK variable that is no longer used (This used to be ctdb commit 389a503c44c999e46caa344c4bc073336e797909) --- ctdb/web/nfs.html | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ctdb/web/nfs.html b/ctdb/web/nfs.html index caff4001f8a..6bec33bc648 100644 --- a/ctdb/web/nfs.html +++ b/ctdb/web/nfs.html @@ -47,16 +47,18 @@ which causes problems on some clients.
This file should look something like :
   CTDB_MANAGES_NFS=yes
-  CTDB_MANAGES_NFSLOCK=yes
+  LOCKD_TCPPORT=599
+  LOCKD_UDPPORT=599
   STATD_SHARED_DIRECTORY=/gpfs0/nfs-state
-  STATD_HOSTNAME=\"ctdb -P $STATD_SHARED_DIRECTORY/192.168.1.1 -H /etc/ctdb/statd-callout -p 97\"
+  STATD_HOSTNAME="ctdb -P $STATD_SHARED_DIRECTORY/192.168.1.1 -H /etc/ctdb/statd-callout -p 97"
 
The CTDB_MANAGES_NFS line tells the events scripts that CTDB is to manage startup and shutdown of the NFS and NFSLOCK services.
-The CTDB_MANAGES_NFSLOCK line tells the events scripts that CTDB is also to manage the nfs lock manager.
+With this set to yes, CTDB will start/stop/restart these services as required.

-With these set to yes, CTDB will start/stop/restart these services as required.

+You need to make sure that the lock manager runs on the same port on all nodes in the cluster since some clients will have "issues" and take very long to recover if the port suddenly changes.
+599 above is only an example. You can run the lock manager on any available port as long as you use the same port on all nodes.

STATD_SHARED_DIRECTORY is the shared directory where statd and the statd-callout script expects that the state variables and lists of clients to notify are found.
From ed1a52b29301bb2fdff4498f6eb38b4a1e49d7e0 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 10 Jul 2007 13:09:35 +1000 Subject: [PATCH 9/9] use the socketkiller to kill off all lock manager sessions as well (This used to be ctdb commit 980b090001ed3a77001e2a3bfc1b03833498f434) --- ctdb/config/events.d/60.nfs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs index 7a7dcd389bd..7f0710c4006 100755 --- a/ctdb/config/events.d/60.nfs +++ b/ctdb/config/events.d/60.nfs @@ -61,10 +61,24 @@ case $cmd in ;; recovered) - # RST all tcp connections used for NFS to ensure that they do - # not survive in ESTABLISHED state across a failover/failback - # and create an ack storm + [ -f /etc/ctdb/state/nfs/restart ] && [ ! -z "$LOCKD_TCPPORT" ] && { + # RST all tcp connections used for NLM to ensure that they do + # not survive in ESTABLISHED state across a failover/failback + # and create an ack storm + netstat -tn |egrep "^tcp.*\s+[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:${LOCKD_TCPPORT}\s+.*ESTABLISHED" | awk '{print $4" "$5}' | while read dest src; do + srcip=`echo $src | cut -d: -f1` + srcport=`echo $src | cut -d: -f2` + destip=`echo $dest | cut -d: -f1` + destport=`echo $dest | cut -d: -f2` + ctdb killtcp $srcip:$srcport $destip:$destport 1 >/dev/null 2>&1 +# ctdb killtcp $destip:$destport $srcip:$srcport 1 >/dev/null 2>&1 + done + } > /dev/null 2>&1 + [ -f /etc/ctdb/state/nfs/restart ] && { + # RST all tcp connections used for NFS to ensure that they do + # not survive in ESTABLISHED state across a failover/failback + # and create an ack storm netstat -tn |egrep '^tcp.*\s+[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:2049\s+.*ESTABLISHED' | awk '{print $4" "$5}' | while read dest src; do srcip=`echo $src | cut -d: -f1` srcport=`echo $src | cut -d: -f2`