1
0
mirror of https://github.com/samba-team/samba.git synced 2025-02-02 09:47:23 +03:00

ctdb-tools: Sanity check changes before processing "reloadnodes"

"ctdb reloadnodes" currently does no sanity checking of the nodes
file.  This can cause chaos if a line is deleted from the nodes file
rather than commented out.  It also repeatedly produces a spurious
warning for each deleted node, even if the node was deleted a long
time ago.

Instead compare the nodemap with the contents of the local nodes file
to sanity check before attempting any reloads.  Note that this is
still imperfect if the nodes files are inconsistent across nodes but
it is better.  Also ensure that any nodes that are to be deleted are
already disconnected.  Avoid trying to talk to deleted nodes.

The current implementation is a bit unfortunate when it comes to
deleting nodes.  The most obvious alternative to the above complexity
would be to reloadnodes on the specified node first, then fetch the
node map (in which newly deleted nodes would be marked as such) and
then handle the remote nodes.  However, the implementation of
reloadnodes is asynchronous and it only actions the reload after 1
second.  This is presumably to avoid the recovery master noticing the
inconsistency between nodemaps and triggering a recovery before all
nodes have had their nodemaps updated.

Note that this recovery can still occur if the check is done at an
inconvenient time.  A better long term approach might be to quiesce
the recovery master checks while reloadnodes is in progress.

Update a unit test to reflect the change.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
Martin Schwenke 2015-01-30 20:14:26 +11:00 committed by Amitay Isaacs
parent 2cb2aa58d0
commit 1cebd75f62
2 changed files with 98 additions and 6 deletions

View File

@ -11,12 +11,10 @@ setup_nodes <<EOF
EOF
ok <<EOF
Reloading nodes file on node 1
ctdb_ctrl_reload_nodes_file: node 1
Reloading nodes file on node 2
ctdb_ctrl_reload_nodes_file: node 2
Reloading nodes file on node 0
ctdb_ctrl_reload_nodes_file: node 0
Node 0 is unchanged
Node 1 is unchanged
Node 2 is unchanged
No change in nodes file, skipping unnecessary reload
EOF
simple_test <<EOF

View File

@ -6259,6 +6259,85 @@ static bool check_all_node_files_are_identical(struct ctdb_context *ctdb,
return ret;
}
/*
reload the nodes file on the local node
*/
static bool sanity_check_nodes_file_changes(TALLOC_CTX *mem_ctx,
struct ctdb_node_map *nodemap,
struct ctdb_node_map *file_nodemap)
{
int i;
bool should_abort = false;
bool have_changes = false;
for (i=0; i<nodemap->num; i++) {
if (i >= file_nodemap->num) {
DEBUG(DEBUG_ERR,
("ERROR: Node %u (%s) missing from nodes file\n",
nodemap->nodes[i].pnn,
ctdb_addr_to_str(&nodemap->nodes[i].addr)));
should_abort = true;
continue;
}
if ((nodemap->nodes[i].flags & NODE_FLAGS_DELETED) ==
(file_nodemap->nodes[i].flags & NODE_FLAGS_DELETED)) {
if (!ctdb_same_ip(&nodemap->nodes[i].addr,
&file_nodemap->nodes[i].addr)) {
DEBUG(DEBUG_ERR,
("ERROR: Node %u has changed IP address (was %s, now %s)\n",
nodemap->nodes[i].pnn,
/* ctdb_addr_to_str() returns a static */
talloc_strdup(mem_ctx,
ctdb_addr_to_str(&nodemap->nodes[i].addr)),
ctdb_addr_to_str(&file_nodemap->nodes[i].addr)));
should_abort = true;
} else {
DEBUG(DEBUG_INFO,
("Node %u is unchanged\n",
nodemap->nodes[i].pnn));
if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
DEBUG(DEBUG_WARNING,
("WARNING: Node %u is disconnected. You MUST fix this node manually!\n",
nodemap->nodes[i].pnn));
}
}
continue;
}
if (file_nodemap->nodes[i].flags & NODE_FLAGS_DELETED) {
DEBUG(DEBUG_NOTICE,
("Node %u is DELETED\n",
nodemap->nodes[i].pnn));
have_changes = true;
if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
DEBUG(DEBUG_ERR,
("ERROR: Node %u is still connected\n",
nodemap->nodes[i].pnn));
should_abort = true;
}
} else if (nodemap->nodes[i].flags & NODE_FLAGS_DELETED) {
DEBUG(DEBUG_NOTICE,
("Node %u is UNDELETED\n", nodemap->nodes[i].pnn));
have_changes = true;
}
}
if (should_abort) {
DEBUG(DEBUG_ERR,
("ERROR: Nodes will not be reloaded due to previous error\n"));
talloc_free(mem_ctx);
exit(1);
}
/* Leftover nodes in file are NEW */
for (; i < file_nodemap->num; i++) {
DEBUG(DEBUG_NOTICE, ("Node %u is NEW\n",
file_nodemap->nodes[i].pnn));
have_changes = true;
}
return have_changes;
}
static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const char **argv)
{
int i, ret;
@ -6268,6 +6347,10 @@ static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const
assert_current_node_only(ctdb);
/* Load both the current nodemap and the contents of the local
* nodes file. Compare and sanity check them before doing
* anything. */
ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap);
if (ret != 0) {
DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
@ -6286,6 +6369,12 @@ static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const
return -1;
}
if (!sanity_check_nodes_file_changes(tmp_ctx, nodemap, file_nodemap)) {
DEBUG(DEBUG_NOTICE,
("No change in nodes file, skipping unnecessary reload\n"));
talloc_free(tmp_ctx);
return 0;
}
/* Now make the changes */
@ -6294,6 +6383,9 @@ static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const
if (nodemap->nodes[i].pnn == options.pnn) {
continue;
}
if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
continue;
}
DEBUG(DEBUG_NOTICE, ("Reloading nodes file on node %u\n", nodemap->nodes[i].pnn));
ret = ctdb_ctrl_reload_nodes_file(ctdb, TIMELIMIT(),
nodemap->nodes[i].pnn);
@ -6312,6 +6404,8 @@ static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const
/* initiate a recovery */
control_recover(ctdb, argc, argv);
talloc_free(tmp_ctx);
return 0;
}