From 779468ab3f0dfdda57230fafb51aa4510818780c Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Fri, 13 Jun 2008 13:18:06 +1000 Subject: [PATCH] if an event script hangs EventScriptBanCount consecutive times in a row, the node will ban itself for the default recovery ban period (This used to be ctdb commit 7239d7ecd54037b11eddf47328a3129d281e7d4a) --- ctdb/include/ctdb_private.h | 2 ++ ctdb/server/ctdb_tunables.c | 1 + ctdb/server/eventscript.c | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index e78b683905a..b19012f1ec0 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -102,6 +102,7 @@ struct ctdb_tunable { uint32_t monitor_interval; uint32_t tickle_update_interval; uint32_t script_timeout; + uint32_t script_ban_count; /* ban after this many consec timeouts*/ uint32_t recovery_grace_period; uint32_t recovery_ban_period; uint32_t database_hash_size; @@ -410,6 +411,7 @@ struct ctdb_context { struct ctdb_monitor_state *monitor; struct ctdb_log_state *log; int start_as_disabled; + uint32_t event_script_timeouts; /* counting how many consecutive times an eventscript has timedout */ TALLOC_CTX *eventscripts_ctx; /* a context to hold data for the RUN_EVENTSCRIPTS control */ }; diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c index 9518b2233cb..d138137afdc 100644 --- a/ctdb/server/ctdb_tunables.c +++ b/ctdb/server/ctdb_tunables.c @@ -38,6 +38,7 @@ static const struct { { "MonitorInterval", 15, offsetof(struct ctdb_tunable, monitor_interval) }, { "TickleUpdateInterval",20, offsetof(struct ctdb_tunable, tickle_update_interval) }, { "EventScriptTimeout", 20, offsetof(struct ctdb_tunable, script_timeout) }, + { "EventScriptBanCount", 5, offsetof(struct ctdb_tunable, script_ban_count) }, { "RecoveryGracePeriod", 60, offsetof(struct ctdb_tunable, recovery_grace_period) }, { "RecoveryBanPeriod", 300, 
offsetof(struct ctdb_tunable, recovery_ban_period) }, { "DatabaseHashSize", 10000, offsetof(struct ctdb_tunable, database_hash_size) }, diff --git a/ctdb/server/eventscript.c b/ctdb/server/eventscript.c index ff26dd76e12..0e4af037c5a 100644 --- a/ctdb/server/eventscript.c +++ b/ctdb/server/eventscript.c @@ -222,6 +222,27 @@ static void ctdb_event_script_handler(struct event_context *ev, struct fd_event talloc_set_destructor(state, NULL); talloc_free(state); callback(ctdb, status, private_data); + + ctdb->event_script_timeouts = 0; +} + +static void ctdb_ban_self(struct ctdb_context *ctdb, uint32_t ban_period) +{ + int ret; + struct ctdb_ban_info b; + TDB_DATA data; + + b.pnn = ctdb->pnn; + b.ban_time = ban_period; + + data.dptr = (uint8_t *)&b; + data.dsize = sizeof(b); + + ret = ctdb_daemon_send_message(ctdb, CTDB_BROADCAST_CONNECTED, + CTDB_SRVID_BAN_NODE, data); + if (ret != 0) { + DEBUG(DEBUG_ERR,(__location__ " Failed to send ban message\n")); + } } @@ -234,9 +255,17 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve void *private_data = state->private_data; struct ctdb_context *ctdb = state->ctdb; - DEBUG(DEBUG_ERR,("event script timed out : %s\n", state->options)); + DEBUG(DEBUG_ERR,("Event script timed out : %s count : %u\n", state->options, ctdb->event_script_timeouts)); + talloc_free(state); callback(ctdb, -1, private_data); + + ctdb->event_script_timeouts++; + if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) { + ctdb->event_script_timeouts = 0; + DEBUG(DEBUG_ERR, ("Maximum timeout count reached for eventscript. 
Banning self for %d seconds\n", ctdb->tunable.recovery_ban_period)); + ctdb_ban_self(ctdb, ctdb->tunable.recovery_ban_period); + } } /* @@ -308,7 +337,7 @@ static int ctdb_event_script_callback_v(struct ctdb_context *ctdb, if (!timeval_is_zero(&timeout)) { event_add_timed(ctdb->ev, state, timeout, ctdb_event_script_timeout, state); } else { - DEBUG(DEBUG_ERR, (__location__ " eventscript %s called with no timeout\n", fmt)); + DEBUG(DEBUG_ERR, (__location__ " eventscript %s called with no timeout\n", state->options)); } return 0;