1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-11 05:18:09 +03:00

Use more liberal handling of event scripts timing out.

If the event script that timed out was for the "monitor" event, then
we still return SUCCESS to the caller that invoked the eventscript, despite the timeout.
Only consider the eventscript for "monitor" to have failed with an error
if and only if it actually terminated with an error, or if it timed out 5 times in a row and hung.

(This used to be ctdb commit 60f3c04bd8b20ecbe937ffed08875cdc6898b422)
This commit is contained in:
Ronnie Sahlberg 2008-07-07 20:38:59 +10:00
parent 6eff9289d7
commit 6bfbec28a4
2 changed files with 25 additions and 7 deletions

View File

@ -38,7 +38,7 @@ static const struct {
{ "MonitorInterval", 15, offsetof(struct ctdb_tunable, monitor_interval) }, { "MonitorInterval", 15, offsetof(struct ctdb_tunable, monitor_interval) },
{ "TickleUpdateInterval",20, offsetof(struct ctdb_tunable, tickle_update_interval) }, { "TickleUpdateInterval",20, offsetof(struct ctdb_tunable, tickle_update_interval) },
{ "EventScriptTimeout", 20, offsetof(struct ctdb_tunable, script_timeout) }, { "EventScriptTimeout", 20, offsetof(struct ctdb_tunable, script_timeout) },
{ "EventScriptBanCount", 3, offsetof(struct ctdb_tunable, script_ban_count) }, { "EventScriptBanCount", 5, offsetof(struct ctdb_tunable, script_ban_count) },
{ "RecoveryGracePeriod", 60, offsetof(struct ctdb_tunable, recovery_grace_period) }, { "RecoveryGracePeriod", 60, offsetof(struct ctdb_tunable, recovery_grace_period) },
{ "RecoveryBanPeriod", 300, offsetof(struct ctdb_tunable, recovery_ban_period) }, { "RecoveryBanPeriod", 300, offsetof(struct ctdb_tunable, recovery_ban_period) },
{ "DatabaseHashSize", 10000, offsetof(struct ctdb_tunable, database_hash_size) }, { "DatabaseHashSize", 10000, offsetof(struct ctdb_tunable, database_hash_size) },

View File

@ -257,15 +257,33 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
DEBUG(DEBUG_ERR,("Event script timed out : %s count : %u\n", state->options, ctdb->event_script_timeouts)); DEBUG(DEBUG_ERR,("Event script timed out : %s count : %u\n", state->options, ctdb->event_script_timeouts));
talloc_free(state); if (!strcmp(state->options, "monitor")) {
callback(ctdb, -1, private_data); /* if it is a monitor event, we allow it to "hang" a few times
before we declare it a failure and ban ourself (and make
ourself unhealthy)
*/
DEBUG(DEBUG_ERR, (__location__ " eventscript for monitor event timedout.\n"));
ctdb->event_script_timeouts++; ctdb->event_script_timeouts++;
if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) { if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) {
ctdb->event_script_timeouts = 0; ctdb->event_script_timeouts = 0;
DEBUG(DEBUG_ERR, ("Maximum timeout count reached for eventscript. Banning self for %d seconds\n", ctdb->tunable.recovery_ban_period)); DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Banning self for %d seconds\n", ctdb->tunable.script_ban_count, ctdb->tunable.recovery_ban_period));
ctdb_ban_self(ctdb, ctdb->tunable.recovery_ban_period); ctdb_ban_self(ctdb, ctdb->tunable.recovery_ban_period);
callback(ctdb, -1, private_data);
} else {
callback(ctdb, 0, private_data);
} }
} else if (!strcmp(state->options, "startup")) {
DEBUG(DEBUG_ERR, (__location__ " eventscript for startup event timedout.\n"));
callback(ctdb, -1, private_data);
} else {
/* if it is not a monitor event we ban ourself immediately */
DEBUG(DEBUG_ERR, (__location__ " eventscript for NON-monitor/NON-startup event timedout. Immediately banning ourself for %d seconds\n", ctdb->tunable.recovery_ban_period));
ctdb_ban_self(ctdb, ctdb->tunable.recovery_ban_period);
callback(ctdb, -1, private_data);
}
talloc_free(state);
} }
/* /*