mirror of
https://github.com/samba-team/samba.git
synced 2024-12-22 13:34:15 +03:00
ctdb-recovery: Ban a node that causes recovery failure
... instead of applying banning credits. There have been a couple of cases where recovery repeatedly takes just over 2 minutes to fail. Therefore, banning credits expire between failures and a continuously problematic node is never banned, resulting in endless recoveries. This is because it takes 2 applications of banning credits before a node is banned, which generally involves 2 recovery failures. The recovery helper makes up to 3 attempts to recover each database during a single run. If a node causes 3 failures then this is really equivalent to 3 recovery failures in the model that existed before the recovery helper added retries. In that case the node would have been banned after 2 failures. So, instead of applying banning credits to the "most failing" node, simply ban it directly from the recovery helper. If multiple nodes are causing recovery failures then this can cause a node to be banned more quickly than it might otherwise have been, even pre-recovery-helper. However, 90 seconds (i.e. 3 failures) is a long time to be in recovery, so banning earlier seems like the best approach. BUG: https://bugzilla.samba.org/show_bug.cgi?id=13670 Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Amitay Isaacs <amitay@gmail.com> Autobuild-User(master): Amitay Isaacs <amitay@samba.org> Autobuild-Date(master): Mon Nov 5 06:52:33 CET 2018 on sn-devel-144
This commit is contained in:
parent
3338a3e257
commit
27df4f002a
@ -2571,22 +2571,28 @@ static void recovery_db_recovery_done(struct tevent_req *subreq)
|
||||
|
||||
/* If pulling database fails multiple times */
|
||||
if (max_credits >= NUM_RETRIES) {
|
||||
struct ctdb_req_message message;
|
||||
struct ctdb_ban_state ban_state = {
|
||||
.pnn = max_pnn,
|
||||
.time = state->tun_list->recovery_ban_period,
|
||||
};
|
||||
|
||||
D_ERR("Assigning banning credits to node %u\n",
|
||||
max_pnn);
|
||||
D_ERR("Banning node %u for %u seconds\n",
|
||||
ban_state.pnn,
|
||||
ban_state.time);
|
||||
|
||||
message.srvid = CTDB_SRVID_BANNING;
|
||||
message.data.pnn = max_pnn;
|
||||
|
||||
subreq = ctdb_client_message_send(
|
||||
state, state->ev, state->client,
|
||||
ctdb_client_pnn(state->client),
|
||||
&message);
|
||||
ctdb_req_control_set_ban_state(&request,
|
||||
&ban_state);
|
||||
subreq = ctdb_client_control_send(state,
|
||||
state->ev,
|
||||
state->client,
|
||||
ban_state.pnn,
|
||||
TIMEOUT(),
|
||||
&request);
|
||||
if (tevent_req_nomem(subreq, req)) {
|
||||
return;
|
||||
}
|
||||
tevent_req_set_callback(subreq, recovery_failed_done,
|
||||
tevent_req_set_callback(subreq,
|
||||
recovery_failed_done,
|
||||
req);
|
||||
} else {
|
||||
tevent_req_error(req, EIO);
|
||||
@ -2609,15 +2615,25 @@ static void recovery_failed_done(struct tevent_req *subreq)
|
||||
{
|
||||
struct tevent_req *req = tevent_req_callback_data(
|
||||
subreq, struct tevent_req);
|
||||
struct recovery_state *state = tevent_req_data(
|
||||
req, struct recovery_state);
|
||||
struct ctdb_reply_control *reply;
|
||||
int ret;
|
||||
bool status;
|
||||
|
||||
status = ctdb_client_message_recv(subreq, &ret);
|
||||
status = ctdb_client_control_recv(subreq, &ret, state, &reply);
|
||||
TALLOC_FREE(subreq);
|
||||
if (! status) {
|
||||
D_ERR("failed to assign banning credits, ret=%d\n", ret);
|
||||
D_ERR("failed to ban node, ret=%d\n", ret);
|
||||
goto done;
|
||||
}
|
||||
|
||||
ret = ctdb_reply_control_set_ban_state(reply);
|
||||
if (ret != 0) {
|
||||
D_ERR("control SET_BAN_STATE failed, ret=%d\n", ret);
|
||||
}
|
||||
|
||||
done:
|
||||
tevent_req_error(req, EIO);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user