From 928b8dcb319ca7d3f7e14f1ff3c90fe33bbbad5a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 7 Dec 2009 23:48:57 +1030 Subject: [PATCH] eventscript: handle banning within the callbacks Currently the timeout handler in eventscript.c does the banning if a timeout happens. However, because monitor events are different, it has to special case them. As we call the callback anyway in this case, we should make that handle -ETIME as it sees fit: for everyone but the monitor event, we simply ban ourselves. The more complicated monitor event banning logic is now in ctdb_monitor.c where it belongs. Note: I wrapped the other bans in "if (status == -ETIME)", though they should probably ban themselves on any error. This change should be a noop. Signed-off-by: Rusty Russell (This used to be ctdb commit 9ecee127e19a9e7cae114a66f3514ee7a75276c5) --- ctdb/server/ctdb_monitor.c | 11 +++++++ ctdb/server/ctdb_recover.c | 6 ++++ ctdb/server/ctdb_takeover.c | 7 +++++ ctdb/server/eventscript.c | 57 +++++++++++-------------------------- 4 files changed, 40 insertions(+), 41 deletions(-) diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index f4223772b6c..c7bbb48a255 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -123,6 +123,17 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) rddata.dptr = (uint8_t *)&rd; rddata.dsize = sizeof(rd); + if (status == -ETIME) { + ctdb->event_script_timeouts++; + + if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) { + DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_ban_count)); + } else { + /* We pretend this is OK. */ + status = 0; + } + } + if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) { DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n")); node->flags |= NODE_FLAGS_UNHEALTHY; diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c index da11bdb8e0e..8568e8bbe07 100644 --- a/ctdb/server/ctdb_recover.c +++ b/ctdb/server/ctdb_recover.c @@ -935,6 +935,9 @@ static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, vo if (status != 0) { DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status)); + if (status == -ETIME) { + ctdb_ban_self(ctdb); + } } ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL); @@ -1210,6 +1213,9 @@ static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void if (status != 0) { DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status)); ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED; + if (status == -ETIME) { + ctdb_ban_self(ctdb); + } } ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL); diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c index 7cb8383114b..7357c0ca541 100644 --- a/ctdb/server/ctdb_takeover.c +++ b/ctdb/server/ctdb_takeover.c @@ -128,6 +128,9 @@ static void takeover_ip_callback(struct ctdb_context *ctdb, int status, struct ctdb_tcp_array *tcparray; if (status != 0) { + if (status == -ETIME) { + ctdb_ban_self(ctdb); + } DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n", ctdb_addr_to_str(state->addr), state->vnn->iface)); @@ -323,6 +326,10 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status, talloc_get_type(private_data, struct takeover_callback_state); TDB_DATA data; + if (status == -ETIME) { + ctdb_ban_self(ctdb); + } + /* send a message to all clients of this node telling them that the cluster has been reconfigured and they should release any sockets on this IP */ diff --git a/ctdb/server/eventscript.c b/ctdb/server/eventscript.c index a6d3950bada..5dca0cea578 100644 --- a/ctdb/server/eventscript.c +++ b/ctdb/server/eventscript.c @@ -633,57 +633,24 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve DEBUG(DEBUG_ERR,("Event script timed out : %s %s count : %u pid : %d\n", call_names[state->call], state->options, ctdb->event_script_timeouts, state->child)); + state->cb_status = -ETIME; + if (kill(state->child, 0) != 0) { DEBUG(DEBUG_ERR,("Event script child process already dead, errno %s(%d)\n", strerror(errno), errno)); state->child = 0; - state->cb_status = -ETIME; - talloc_free(state); - return; - } - - if (state->call == CTDB_EVENT_MONITOR) { - /* if it is a monitor event, we allow it to "hang" a few times - before we declare it a failure and ban ourself (and make - ourself unhealthy) - */ - DEBUG(DEBUG_ERR, (__location__ " eventscript for monitor event timedout.\n")); - - ctdb->event_script_timeouts++; - - if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) { - DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_ban_count)); - state->cb_status = -ETIME; - } else { - state->cb_status = 0; - } - } else if (state->call == CTDB_EVENT_STARTUP) { - DEBUG(DEBUG_ERR, (__location__ " eventscript for startup event timedout.\n")); - state->cb_status = -ETIME; - } else { - /* if it is not a monitor or a startup event we ban ourself - immediately - */ - DEBUG(DEBUG_ERR, (__location__ " eventscript for NON-monitor/NON-startup event timedout. Immediately banning ourself for %d seconds\n", ctdb->tunable.recovery_ban_period)); - - ctdb_ban_self(ctdb); - - state->cb_status = -ETIME; } if (state->call == CTDB_EVENT_MONITOR || state->call == CTDB_EVENT_STATUS) { struct ctdb_monitor_script_status *script; - if (ctdb->current_monitor_status_ctx == NULL) { - talloc_free(state); - return; - } + if (ctdb->current_monitor_status_ctx != NULL) { + script = ctdb->current_monitor_status_ctx->scripts; + if (script != NULL) { + script->status = state->cb_status; + } - script = ctdb->current_monitor_status_ctx->scripts; - if (script != NULL) { - script->status = state->cb_status; + ctdb_control_event_script_finished(ctdb); } - - ctdb_control_event_script_finished(ctdb); } talloc_free(state); @@ -930,6 +897,14 @@ int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_eventscript_call while (status.done == false && event_loop_once(ctdb->ev) == 0) /* noop */; + if (status.status == -ETIME) { + DEBUG(DEBUG_ERR, (__location__ " eventscript for '%s' timedout." + " Immediately banning ourself for %d seconds\n", + call_names[call], + ctdb->tunable.recovery_ban_period)); + ctdb_ban_self(ctdb); + } + return status.status; }