mirror of
https://github.com/samba-team/samba.git
synced 2025-01-08 21:18:16 +03:00
eventscript: handle banning within the callbacks
Currently the timeout handler in eventscript.c does the banning if a timeout happens. However, because monitor events are different, it has to special case them. As we call the callback anyway in this case, we should make that handle -ETIME as it sees fit: for everyone but the monitor event, we simply ban ourselves. The more complicated monitor event banning logic is now in ctdb_monitor.c where it belongs. Note: I wrapped the other bans in "if (status == -ETIME)", though they should probably ban themselves on any error. This change should be a noop. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (This used to be ctdb commit 9ecee127e19a9e7cae114a66f3514ee7a75276c5)
This commit is contained in:
parent
5190932507
commit
928b8dcb31
@ -123,6 +123,17 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
|
||||
rddata.dptr = (uint8_t *)&rd;
|
||||
rddata.dsize = sizeof(rd);
|
||||
|
||||
if (status == -ETIME) {
|
||||
ctdb->event_script_timeouts++;
|
||||
|
||||
if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) {
|
||||
DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_ban_count));
|
||||
} else {
|
||||
/* We pretend this is OK. */
|
||||
status = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
|
||||
DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n"));
|
||||
node->flags |= NODE_FLAGS_UNHEALTHY;
|
||||
|
@ -935,6 +935,9 @@ static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, vo
|
||||
|
||||
if (status != 0) {
|
||||
DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
|
||||
if (status == -ETIME) {
|
||||
ctdb_ban_self(ctdb);
|
||||
}
|
||||
}
|
||||
|
||||
ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
|
||||
@ -1210,6 +1213,9 @@ static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void
|
||||
if (status != 0) {
|
||||
DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
|
||||
ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
|
||||
if (status == -ETIME) {
|
||||
ctdb_ban_self(ctdb);
|
||||
}
|
||||
}
|
||||
|
||||
ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
|
||||
|
@ -128,6 +128,9 @@ static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
|
||||
struct ctdb_tcp_array *tcparray;
|
||||
|
||||
if (status != 0) {
|
||||
if (status == -ETIME) {
|
||||
ctdb_ban_self(ctdb);
|
||||
}
|
||||
DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
|
||||
ctdb_addr_to_str(state->addr),
|
||||
state->vnn->iface));
|
||||
@ -323,6 +326,10 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status,
|
||||
talloc_get_type(private_data, struct takeover_callback_state);
|
||||
TDB_DATA data;
|
||||
|
||||
if (status == -ETIME) {
|
||||
ctdb_ban_self(ctdb);
|
||||
}
|
||||
|
||||
/* send a message to all clients of this node telling them
|
||||
that the cluster has been reconfigured and they should
|
||||
release any sockets on this IP */
|
||||
|
@ -633,57 +633,24 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
|
||||
DEBUG(DEBUG_ERR,("Event script timed out : %s %s count : %u pid : %d\n",
|
||||
call_names[state->call], state->options, ctdb->event_script_timeouts, state->child));
|
||||
|
||||
state->cb_status = -ETIME;
|
||||
|
||||
if (kill(state->child, 0) != 0) {
|
||||
DEBUG(DEBUG_ERR,("Event script child process already dead, errno %s(%d)\n", strerror(errno), errno));
|
||||
state->child = 0;
|
||||
state->cb_status = -ETIME;
|
||||
talloc_free(state);
|
||||
return;
|
||||
}
|
||||
|
||||
if (state->call == CTDB_EVENT_MONITOR) {
|
||||
/* if it is a monitor event, we allow it to "hang" a few times
|
||||
before we declare it a failure and ban ourself (and make
|
||||
ourself unhealthy)
|
||||
*/
|
||||
DEBUG(DEBUG_ERR, (__location__ " eventscript for monitor event timedout.\n"));
|
||||
|
||||
ctdb->event_script_timeouts++;
|
||||
|
||||
if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) {
|
||||
DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_ban_count));
|
||||
state->cb_status = -ETIME;
|
||||
} else {
|
||||
state->cb_status = 0;
|
||||
}
|
||||
} else if (state->call == CTDB_EVENT_STARTUP) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " eventscript for startup event timedout.\n"));
|
||||
state->cb_status = -ETIME;
|
||||
} else {
|
||||
/* if it is not a monitor or a startup event we ban ourself
|
||||
immediately
|
||||
*/
|
||||
DEBUG(DEBUG_ERR, (__location__ " eventscript for NON-monitor/NON-startup event timedout. Immediately banning ourself for %d seconds\n", ctdb->tunable.recovery_ban_period));
|
||||
|
||||
ctdb_ban_self(ctdb);
|
||||
|
||||
state->cb_status = -ETIME;
|
||||
}
|
||||
|
||||
if (state->call == CTDB_EVENT_MONITOR || state->call == CTDB_EVENT_STATUS) {
|
||||
struct ctdb_monitor_script_status *script;
|
||||
|
||||
if (ctdb->current_monitor_status_ctx == NULL) {
|
||||
talloc_free(state);
|
||||
return;
|
||||
}
|
||||
if (ctdb->current_monitor_status_ctx != NULL) {
|
||||
script = ctdb->current_monitor_status_ctx->scripts;
|
||||
if (script != NULL) {
|
||||
script->status = state->cb_status;
|
||||
}
|
||||
|
||||
script = ctdb->current_monitor_status_ctx->scripts;
|
||||
if (script != NULL) {
|
||||
script->status = state->cb_status;
|
||||
ctdb_control_event_script_finished(ctdb);
|
||||
}
|
||||
|
||||
ctdb_control_event_script_finished(ctdb);
|
||||
}
|
||||
|
||||
talloc_free(state);
|
||||
@ -930,6 +897,14 @@ int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_eventscript_call
|
||||
|
||||
while (status.done == false && event_loop_once(ctdb->ev) == 0) /* noop */;
|
||||
|
||||
if (status.status == -ETIME) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " eventscript for '%s' timedout."
|
||||
" Immediately banning ourself for %d seconds\n",
|
||||
call_names[call],
|
||||
ctdb->tunable.recovery_ban_period));
|
||||
ctdb_ban_self(ctdb);
|
||||
}
|
||||
|
||||
return status.status;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user