mirror of
https://github.com/samba-team/samba.git
synced 2025-02-02 09:47:23 +03:00
ctdb-tools: Sanity check changes before processing "reloadnodes"
"ctdb reloadnodes" currently does no sanity checking of the nodes file. This can cause chaos if a line is deleted from the nodes file rather than commented out. It also repeatedly produces a spurious warning for each deleted node, even if the node was deleted a long time ago. Instead compare the nodemap with the contents of the local nodes file to sanity check before attempting any reloads. Note that this is still imperfect if the nodes files are inconsistent across nodes but it is better. Also ensure that any nodes that are to be deleted are already disconnected. Avoid trying to talk to deleted nodes. The current implementation is a bit unfortunate when it comes to deleting nodes. The most obvious alternative to the above complexity would be to reloadnodes on the specified node first, then fetch the node map (in which newly deleted nodes would be marked as such) and then handle the remote nodes. However, the implementation of reloadnodes is asynchronous and it only actions the reload after 1 second. This is presumably to avoid the recovery master noticing the inconsistency between nodemaps and triggering a recovery before all nodes have had their nodemaps updated. Note that this recovery can still occur if the check is done at an inconvenient time. A better long term approach might be to quiesce the recovery master checks while reloadnodes is in progress. Update a unit test to reflect the change. Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
parent
2cb2aa58d0
commit
1cebd75f62
@ -11,12 +11,10 @@ setup_nodes <<EOF
|
||||
EOF
|
||||
|
||||
ok <<EOF
|
||||
Reloading nodes file on node 1
|
||||
ctdb_ctrl_reload_nodes_file: node 1
|
||||
Reloading nodes file on node 2
|
||||
ctdb_ctrl_reload_nodes_file: node 2
|
||||
Reloading nodes file on node 0
|
||||
ctdb_ctrl_reload_nodes_file: node 0
|
||||
Node 0 is unchanged
|
||||
Node 1 is unchanged
|
||||
Node 2 is unchanged
|
||||
No change in nodes file, skipping unnecessary reload
|
||||
EOF
|
||||
|
||||
simple_test <<EOF
|
||||
|
@ -6259,6 +6259,85 @@ static bool check_all_node_files_are_identical(struct ctdb_context *ctdb,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
reload the nodes file on the local node
|
||||
*/
|
||||
static bool sanity_check_nodes_file_changes(TALLOC_CTX *mem_ctx,
|
||||
struct ctdb_node_map *nodemap,
|
||||
struct ctdb_node_map *file_nodemap)
|
||||
{
|
||||
int i;
|
||||
bool should_abort = false;
|
||||
bool have_changes = false;
|
||||
|
||||
for (i=0; i<nodemap->num; i++) {
|
||||
if (i >= file_nodemap->num) {
|
||||
DEBUG(DEBUG_ERR,
|
||||
("ERROR: Node %u (%s) missing from nodes file\n",
|
||||
nodemap->nodes[i].pnn,
|
||||
ctdb_addr_to_str(&nodemap->nodes[i].addr)));
|
||||
should_abort = true;
|
||||
continue;
|
||||
}
|
||||
if ((nodemap->nodes[i].flags & NODE_FLAGS_DELETED) ==
|
||||
(file_nodemap->nodes[i].flags & NODE_FLAGS_DELETED)) {
|
||||
if (!ctdb_same_ip(&nodemap->nodes[i].addr,
|
||||
&file_nodemap->nodes[i].addr)) {
|
||||
DEBUG(DEBUG_ERR,
|
||||
("ERROR: Node %u has changed IP address (was %s, now %s)\n",
|
||||
nodemap->nodes[i].pnn,
|
||||
/* ctdb_addr_to_str() returns a static */
|
||||
talloc_strdup(mem_ctx,
|
||||
ctdb_addr_to_str(&nodemap->nodes[i].addr)),
|
||||
ctdb_addr_to_str(&file_nodemap->nodes[i].addr)));
|
||||
should_abort = true;
|
||||
} else {
|
||||
DEBUG(DEBUG_INFO,
|
||||
("Node %u is unchanged\n",
|
||||
nodemap->nodes[i].pnn));
|
||||
if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
|
||||
DEBUG(DEBUG_WARNING,
|
||||
("WARNING: Node %u is disconnected. You MUST fix this node manually!\n",
|
||||
nodemap->nodes[i].pnn));
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (file_nodemap->nodes[i].flags & NODE_FLAGS_DELETED) {
|
||||
DEBUG(DEBUG_NOTICE,
|
||||
("Node %u is DELETED\n",
|
||||
nodemap->nodes[i].pnn));
|
||||
have_changes = true;
|
||||
if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
|
||||
DEBUG(DEBUG_ERR,
|
||||
("ERROR: Node %u is still connected\n",
|
||||
nodemap->nodes[i].pnn));
|
||||
should_abort = true;
|
||||
}
|
||||
} else if (nodemap->nodes[i].flags & NODE_FLAGS_DELETED) {
|
||||
DEBUG(DEBUG_NOTICE,
|
||||
("Node %u is UNDELETED\n", nodemap->nodes[i].pnn));
|
||||
have_changes = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (should_abort) {
|
||||
DEBUG(DEBUG_ERR,
|
||||
("ERROR: Nodes will not be reloaded due to previous error\n"));
|
||||
talloc_free(mem_ctx);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* Leftover nodes in file are NEW */
|
||||
for (; i < file_nodemap->num; i++) {
|
||||
DEBUG(DEBUG_NOTICE, ("Node %u is NEW\n",
|
||||
file_nodemap->nodes[i].pnn));
|
||||
have_changes = true;
|
||||
}
|
||||
|
||||
return have_changes;
|
||||
}
|
||||
|
||||
static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const char **argv)
|
||||
{
|
||||
int i, ret;
|
||||
@ -6268,6 +6347,10 @@ static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const
|
||||
|
||||
assert_current_node_only(ctdb);
|
||||
|
||||
/* Load both the current nodemap and the contents of the local
|
||||
* nodes file. Compare and sanity check them before doing
|
||||
* anything. */
|
||||
|
||||
ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
|
||||
@ -6286,6 +6369,12 @@ static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!sanity_check_nodes_file_changes(tmp_ctx, nodemap, file_nodemap)) {
|
||||
DEBUG(DEBUG_NOTICE,
|
||||
("No change in nodes file, skipping unnecessary reload\n"));
|
||||
talloc_free(tmp_ctx);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Now make the changes */
|
||||
|
||||
@ -6294,6 +6383,9 @@ static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const
|
||||
if (nodemap->nodes[i].pnn == options.pnn) {
|
||||
continue;
|
||||
}
|
||||
if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
|
||||
continue;
|
||||
}
|
||||
DEBUG(DEBUG_NOTICE, ("Reloading nodes file on node %u\n", nodemap->nodes[i].pnn));
|
||||
ret = ctdb_ctrl_reload_nodes_file(ctdb, TIMELIMIT(),
|
||||
nodemap->nodes[i].pnn);
|
||||
@ -6312,6 +6404,8 @@ static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const
|
||||
/* initiate a recovery */
|
||||
control_recover(ctdb, argc, argv);
|
||||
|
||||
talloc_free(tmp_ctx);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user