From 8241d3f9cf8d4140a7fed32d632e8b4b31382116 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 9 Dec 2008 10:45:14 +1100 Subject: [PATCH 1/2] update to the flags handling make sure to abort the monitoring and restart if we failed to get the nodemap from a remote node (This used to be ctdb commit 4eac0214e732e6c2f867d66ec71d4406680dbb94) --- ctdb/server/ctdb_recoverd.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c index 468977c4021..540749d12f8 100644 --- a/ctdb/server/ctdb_recoverd.c +++ b/ctdb/server/ctdb_recoverd.c @@ -2278,19 +2278,9 @@ static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_p static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_node_map *nodemap, - struct ctdb_node_map ***remote_nodemaps) + struct ctdb_node_map **remote_nodemaps) { uint32_t *nodes; - int i; - - *remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num); - if (*remote_nodemaps == NULL) { - DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n")); - return -1; - } - for(i=0; i<nodemap->num; i++) { - (*remote_nodemaps)[i] = NULL; - } nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true); if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP, @@ -2298,7 +2288,7 @@ static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, CONTROL_TIMEOUT(), false, tdb_null, async_getnodemap_callback, NULL, - *remote_nodemaps) != 0) { + remote_nodemaps) != 0) { DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n")); return -1; @@ -2651,7 +2641,15 @@ again: /* get the nodemap for all active remote nodes */ - if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, &remote_nodemaps) != 0) { + remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num); + if (remote_nodemaps == NULL) { + DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote 
nodemap array\n")); + goto again; + } + for(i=0; i<nodemap->num; i++) { + remote_nodemaps[i] = NULL; + } + if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) { DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n")); goto again; } @@ -2663,6 +2661,11 @@ again: continue; } + if (remote_nodemaps[j] == NULL) { + DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j)); + goto again; + } + /* if the nodes disagree on how many nodes there are then this is a good reason to try recovery */ From 762d4be8f929bba2afb6a02f3620eff636382486 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 9 Dec 2008 12:03:42 +1100 Subject: [PATCH 2/2] add a helper that waits until the cluster is no longer in recovery mode and returns the generation number. change the ban/unban logic to wait until we are not in recovery before it bans/unbans the node. also wait until after the cluster has recovered from the ban/unban before returning so that the cluster is in recovery mode == normal when the command returns. this makes it much easier to script things ... 
(This used to be ctdb commit 39c77371a2f995025a584691fe61af12dc6ed5d7) --- ctdb/tools/ctdb.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c index 27fa7de2e19..bb663a67678 100644 --- a/ctdb/tools/ctdb.c +++ b/ctdb/tools/ctdb.c @@ -1255,6 +1255,49 @@ static int control_enable(struct ctdb_context *ctdb, int argc, const char **argv return 0; } +static uint32_t get_generation(struct ctdb_context *ctdb) +{ + struct ctdb_vnn_map *vnnmap=NULL; + int ret; + + /* wait until the recmaster is not in recovery mode */ + while (1) { + uint32_t recmode, recmaster; + + if (vnnmap != NULL) { + talloc_free(vnnmap); + vnnmap = NULL; + } + + /* get the recmaster */ + ret = ctdb_ctrl_getrecmaster(ctdb, ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, &recmaster); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Unable to get recmaster from node %u\n", options.pnn)); + exit(10); + } + + /* get recovery mode */ + ret = ctdb_ctrl_getrecmode(ctdb, ctdb, TIMELIMIT(), recmaster, &recmode); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Unable to get recmode from node %u\n", options.pnn)); + exit(10); + } + + /* get the current generation number */ + ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), recmaster, ctdb, &vnnmap); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Unable to get vnnmap from recmaster (%u)\n", recmaster)); + exit(10); + } + + if ((recmode == CTDB_RECOVERY_NORMAL) + && (vnnmap->generation != 1)){ + return vnnmap->generation; + } + sleep(1); + } +} + /* ban a node from the cluster */ @@ -1264,10 +1307,27 @@ static int control_ban(struct ctdb_context *ctdb, int argc, const char **argv) struct ctdb_ban_info b; TDB_DATA data; uint32_t ban_time; + struct ctdb_node_map *nodemap=NULL; + uint32_t generation, next_generation; if (argc < 1) { usage(); } + + /* record the current generation number */ + generation = get_generation(ctdb); + + + /* verify the node exists */ + ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), 
CTDB_CURRENT_NODE, ctdb, &nodemap); + if (ret != 0) { + DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n")); + return ret; + } + if (options.pnn >= nodemap->num) { + DEBUG(DEBUG_ERR, ("Node %u does not exist\n", options.pnn)); + return ret; + } /* verify we can access the node */ ret = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), options.pnn); @@ -1276,6 +1336,11 @@ static int control_ban(struct ctdb_context *ctdb, int argc, const char **argv) return -1; } + if (nodemap->nodes[options.pnn].flags & NODE_FLAGS_BANNED) { + DEBUG(DEBUG_ERR,("Node %u is already banned.\n", options.pnn)); + return -1; + } + ban_time = strtoul(argv[0], NULL, 0); b.pnn = options.pnn; @@ -1289,7 +1354,16 @@ static int control_ban(struct ctdb_context *ctdb, int argc, const char **argv) DEBUG(DEBUG_ERR,("Failed to ban node %u\n", options.pnn)); return -1; } - + + /* wait until we are in a new generation */ + while (1) { + next_generation = get_generation(ctdb); + if (next_generation != generation) { + return 0; + } + sleep(1); + } + return 0; } @@ -1301,6 +1375,10 @@ static int control_unban(struct ctdb_context *ctdb, int argc, const char **argv) { int ret; TDB_DATA data; + uint32_t generation, next_generation; + + /* record the current generation number */ + generation = get_generation(ctdb); /* verify we can access the node */ ret = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), options.pnn); @@ -1318,6 +1396,15 @@ static int control_unban(struct ctdb_context *ctdb, int argc, const char **argv) return -1; } + /* wait until we are in a new generation */ + while (1) { + next_generation = get_generation(ctdb); + if (next_generation != generation) { + return 0; + } + sleep(1); + } + return 0; }