diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index 01146f3f43e..64caeb30ec4 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -506,7 +506,8 @@ struct ctdb_control_tcp_vnn { */ struct ctdb_node_flag_change { uint32_t vnn; - uint32_t flags; + uint32_t new_flags; + uint32_t old_flags; }; /* diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c index f216ca4365d..6f09acea17b 100644 --- a/ctdb/server/ctdb_daemon.c +++ b/ctdb/server/ctdb_daemon.c @@ -51,7 +51,7 @@ static void flag_change_handler(struct ctdb_context *ctdb, uint64_t srvid, /* don't get the disconnected flag from the other node */ ctdb->nodes[c->vnn]->flags = (ctdb->nodes[c->vnn]->flags&NODE_FLAGS_DISCONNECTED) - | (c->flags & ~NODE_FLAGS_DISCONNECTED); + | (c->new_flags & ~NODE_FLAGS_DISCONNECTED); DEBUG(2,("Node flags for node %u are now 0x%x\n", c->vnn, ctdb->nodes[c->vnn]->flags)); /* make sure we don't hold any IPs when we shouldn't */ diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index ec5244703c0..243961d228f 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -103,6 +103,9 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) timeval_current_ofs(ctdb->tunable.monitor_interval, 0), ctdb_check_health, ctdb); + c.vnn = ctdb->vnn; + c.old_flags = node->flags; + if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) { DEBUG(0,("monitor event failed - disabling node\n")); node->flags |= NODE_FLAGS_UNHEALTHY; @@ -114,8 +117,7 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) return; } - c.vnn = ctdb->vnn; - c.flags = node->flags; + c.new_flags = node->flags; data.dptr = (uint8_t *)&c; data.dsize = sizeof(c); @@ -206,7 +208,8 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) /* if we have been banned, go into recovery mode */ c.vnn = ctdb->vnn; - c.flags = node->flags; + c.old_flags = old_flags; + c.new_flags = node->flags; data.dptr = (uint8_t *)&c; data.dsize = sizeof(c); diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 5cb985521d7..812214088e7 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -386,7 +386,8 @@ static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node TDB_DATA data; c.vnn = nodemap->nodes[i].vnn; - c.flags = nodemap->nodes[i].flags; + c.old_flags = nodemap->nodes[i].flags; + c.new_flags = nodemap->nodes[i].flags; data.dptr = (uint8_t *)&c; data.dsize = sizeof(c); @@ -815,7 +816,7 @@ static int do_recovery(struct ctdb_recoverd *rec, /* send a message to all clients telling them that the cluster has been reconfigured */ - ctdb_send_message(ctdb, CTDB_BROADCAST_ALL, CTDB_SRVID_RECONFIGURE, tdb_null); + ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null); DEBUG(0, (__location__ " Recovery complete\n")); @@ -1045,6 +1046,7 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid, struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr; struct ctdb_node_map *nodemap=NULL; TALLOC_CTX *tmp_ctx; + uint32_t changed_flags; int i; if (data.dsize != sizeof(*c)) { @@ -1067,20 +1069,22 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid, return; } + changed_flags = c->old_flags ^ c->new_flags; + /* Dont let messages from remote nodes change the DISCONNECTED flag. This flag is handled locally based on whether the local node can communicate with the node or not. */ - c->flags &= ~NODE_FLAGS_DISCONNECTED; + c->new_flags &= ~NODE_FLAGS_DISCONNECTED; if (nodemap->nodes[i].flags&NODE_FLAGS_DISCONNECTED) { - c->flags |= NODE_FLAGS_DISCONNECTED; + c->new_flags |= NODE_FLAGS_DISCONNECTED; } - if (nodemap->nodes[i].flags != c->flags) { - DEBUG(0,("Node %u has changed flags - now 0x%x\n", c->vnn, c->flags)); + if (nodemap->nodes[i].flags != c->new_flags) { + DEBUG(0,("Node %u has changed flags - now 0x%x was 0x%x\n", c->vnn, c->new_flags, c->old_flags)); } - nodemap->nodes[i].flags = c->flags; + nodemap->nodes[i].flags = c->new_flags; ret = ctdb_ctrl_getrecmaster(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_master); @@ -1094,9 +1098,21 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid, ctdb->recovery_master == ctdb->vnn && ctdb->recovery_mode == CTDB_RECOVERY_NORMAL && ctdb->takeover.enabled) { - ret = ctdb_takeover_run(ctdb, nodemap); - if (ret != 0) { - DEBUG(0, (__location__ " Unable to setup public takeover addresses\n")); + /* Only do the takeover run if the perm disabled or unhealthy + flags changed since these will cause an ip failover but not + a recovery. + If the node became disconnected or banned this will also + lead to an ip address failover but that is handled + during recovery + */ + if (changed_flags & NODE_FLAGS_DISABLED) { + ret = ctdb_takeover_run(ctdb, nodemap); + if (ret != 0) { + DEBUG(0, (__location__ " Unable to setup public takeover addresses\n")); + } + /* send a message to all clients telling them that the + cluster has been reconfigured */ + ctdb_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null); } }