1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-11 05:18:09 +03:00

ctdb-recovery: Ban a node that causes recovery failure

... instead of applying banning credits.

There have been a couple of cases where recovery repeatedly takes just
over 2 minutes to fail.  Therefore, banning credits expire between
failures and a continuously problematic node is never banned,
resulting in endless recoveries.  This is because it takes 2
applications of banning credits before a node is banned, which
generally involves 2 recovery failures.

The recovery helper makes up to 3 attempts to recover each database
during a single run.  If a node causes 3 failures then this is really
equivalent to 3 recovery failures in the model that existed before the
recovery helper added retries.  In that case the node would have been
banned after 2 failures.

So, instead of applying banning credits to the "most failing" node,
simply ban it directly from the recovery helper.

If multiple nodes are causing recovery failures then this can cause a
node to be banned more quickly than it might otherwise have been, even
pre-recovery-helper.  However, 90 seconds (i.e. 3 failures) is a long
time to be in recovery, so banning earlier seems like the best
approach.

BUG: https://bugzilla.samba.org/show_bug.cgi?id=13670

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>

Autobuild-User(master): Amitay Isaacs <amitay@samba.org>
Autobuild-Date(master): Mon Nov  5 06:52:33 CET 2018 on sn-devel-144
This commit is contained in:
Martin Schwenke 2018-10-29 14:33:08 +11:00 committed by Amitay Isaacs
parent 3338a3e257
commit 27df4f002a

View File

@@ -2571,22 +2571,28 @@ static void recovery_db_recovery_done(struct tevent_req *subreq)
/* If pulling database fails multiple times */
if (max_credits >= NUM_RETRIES) {
struct ctdb_req_message message;
struct ctdb_ban_state ban_state = {
.pnn = max_pnn,
.time = state->tun_list->recovery_ban_period,
};
D_ERR("Assigning banning credits to node %u\n",
max_pnn);
D_ERR("Banning node %u for %u seconds\n",
ban_state.pnn,
ban_state.time);
message.srvid = CTDB_SRVID_BANNING;
message.data.pnn = max_pnn;
subreq = ctdb_client_message_send(
state, state->ev, state->client,
ctdb_client_pnn(state->client),
&message);
ctdb_req_control_set_ban_state(&request,
&ban_state);
subreq = ctdb_client_control_send(state,
state->ev,
state->client,
ban_state.pnn,
TIMEOUT(),
&request);
if (tevent_req_nomem(subreq, req)) {
return;
}
tevent_req_set_callback(subreq, recovery_failed_done,
tevent_req_set_callback(subreq,
recovery_failed_done,
req);
} else {
tevent_req_error(req, EIO);
@@ -2609,15 +2615,25 @@ static void recovery_failed_done(struct tevent_req *subreq)
{
struct tevent_req *req = tevent_req_callback_data(
subreq, struct tevent_req);
struct recovery_state *state = tevent_req_data(
req, struct recovery_state);
struct ctdb_reply_control *reply;
int ret;
bool status;
status = ctdb_client_message_recv(subreq, &ret);
status = ctdb_client_control_recv(subreq, &ret, state, &reply);
TALLOC_FREE(subreq);
if (! status) {
D_ERR("failed to assign banning credits, ret=%d\n", ret);
D_ERR("failed to ban node, ret=%d\n", ret);
goto done;
}
ret = ctdb_reply_control_set_ban_state(reply);
if (ret != 0) {
D_ERR("control SET_BAN_STATE failed, ret=%d\n", ret);
}
done:
tevent_req_error(req, EIO);
}