From 538f519dba737319dae6ea6d1451297048af8bda Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Thu, 10 Jan 2008 14:40:56 +1100 Subject: [PATCH 1/2] exponential backoff in health monitoring for faster startup (This used to be ctdb commit 1b04a1f675f73b48366ba98803a58c3d8df1b6e1) --- ctdb/include/ctdb_private.h | 4 +- ctdb/server/ctdb_control.c | 2 +- ctdb/server/ctdb_monitor.c | 87 ++++++++++++++++++++++--------------- ctdb/server/ctdb_tunables.c | 1 - ctdb/server/ctdbd.c | 1 - 5 files changed, 56 insertions(+), 39 deletions(-) diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index a876100d41f..e984078b5e7 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -324,8 +324,6 @@ enum ctdb_freeze_mode {CTDB_FREEZE_NONE, CTDB_FREEZE_PENDING, CTDB_FREEZE_FROZEN struct ctdb_context { struct event_context *ev; uint32_t recovery_mode; - uint32_t monitoring_mode; - TALLOC_CTX *monitor_context; TALLOC_CTX *tickle_update_context; struct ctdb_tunable tunable; enum ctdb_freeze_mode freeze_mode; @@ -370,6 +368,7 @@ struct ctdb_context { pid_t recoverd_pid; bool done_startup; const char *node_ip; + struct ctdb_monitor_state *monitor; }; struct ctdb_db_context { @@ -1211,5 +1210,6 @@ int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata); void ctdb_block_signal(int signum); void ctdb_unblock_signal(int signum); +int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb); #endif diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c index 4e013a530ea..35266bcde23 100644 --- a/ctdb/server/ctdb_control.c +++ b/ctdb/server/ctdb_control.c @@ -224,7 +224,7 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, return ctdb_control_set_recmode(ctdb, c, indata, async_reply, errormsg); case CTDB_CONTROL_GET_MONMODE: - return ctdb->monitoring_mode; + return ctdb_monitoring_mode(ctdb); case CTDB_CONTROL_SHUTDOWN: ctdb_stop_recoverd(ctdb); diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index bdb3d45eda0..f5fd807cf76 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -24,6 +24,12 @@ #include "system/wait.h" #include "../include/ctdb_private.h" +struct ctdb_monitor_state { + uint32_t monitoring_mode; + TALLOC_CTX *monitor_context; + uint32_t next_interval; +}; + /* see if any nodes are dead */ @@ -75,7 +81,7 @@ static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_eve node->tx_cnt = 0; } - event_add_timed(ctdb->ev, ctdb->monitor_context, + event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), ctdb_check_for_dead_nodes, ctdb); } @@ -99,18 +105,21 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p) if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) { DEBUG(0,("monitor event failed - disabling node\n")); node->flags |= NODE_FLAGS_UNHEALTHY; + ctdb->monitor->next_interval = 1; } else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) { DEBUG(0,("monitor event OK - node re-enabled\n")); node->flags &= ~NODE_FLAGS_UNHEALTHY; + ctdb->monitor->next_interval = 1; } - if (node->flags & NODE_FLAGS_UNHEALTHY) { - next_interval = ctdb->tunable.monitor_retry; - } else { - next_interval = ctdb->tunable.monitor_interval; + next_interval = ctdb->monitor->next_interval; + + ctdb->monitor->next_interval *= 2; + if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) { + ctdb->monitor->next_interval = ctdb->tunable.monitor_interval; } - event_add_timed(ctdb->ev, ctdb->monitor_context, + event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, timeval_current_ofs(next_interval, 0), ctdb_check_health, ctdb); @@ -140,18 +149,12 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p } else if (status == 0) { DEBUG(0,("startup event OK - enabling monitoring\n")); ctdb->done_startup = true; + ctdb->monitor->next_interval = 1; } - if (ctdb->done_startup) { - event_add_timed(ctdb->ev, ctdb->monitor_context, - timeval_zero(), - ctdb_check_health, ctdb); - } else { - event_add_timed(ctdb->ev, ctdb->monitor_context, - timeval_current_ofs(ctdb->tunable.monitor_interval, 0), - ctdb_check_health, ctdb); - } - + event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(ctdb->monitor->next_interval, 0), + ctdb_check_health, ctdb); } @@ -165,9 +168,9 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te, int ret; if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL || - (ctdb->monitoring_mode == CTDB_MONITORING_DISABLED && ctdb->done_startup)) { - event_add_timed(ctdb->ev, ctdb->monitor_context, - timeval_current_ofs(ctdb->tunable.monitor_interval, 0), + (ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED && ctdb->done_startup)) { + event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, + timeval_current_ofs(ctdb->monitor->next_interval, 0), ctdb_check_health, ctdb); return; } @@ -175,18 +178,18 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te, if (!ctdb->done_startup) { ret = ctdb_event_script_callback(ctdb, timeval_current_ofs(ctdb->tunable.script_timeout, 0), - ctdb->monitor_context, ctdb_startup_callback, + ctdb->monitor->monitor_context, ctdb_startup_callback, ctdb, "startup"); } else { ret = ctdb_event_script_callback(ctdb, timeval_current_ofs(ctdb->tunable.script_timeout, 0), - ctdb->monitor_context, ctdb_health_callback, + ctdb->monitor->monitor_context, ctdb_health_callback, ctdb, "monitor"); } if (ret != 0) { DEBUG(0,("Unable to launch monitor event script\n")); - event_add_timed(ctdb->ev, ctdb->monitor_context, + event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, timeval_current_ofs(ctdb->tunable.monitor_retry, 0), ctdb_check_health, ctdb); } @@ -198,7 +201,7 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te, */ void ctdb_disable_monitoring(struct ctdb_context *ctdb) { - ctdb->monitoring_mode = CTDB_MONITORING_DISABLED; + ctdb->monitor->monitoring_mode = CTDB_MONITORING_DISABLED; DEBUG(2,("Monitoring has been disabled\n")); } @@ -207,7 +210,8 @@ void ctdb_disable_monitoring(struct ctdb_context *ctdb) */ void ctdb_enable_monitoring(struct ctdb_context *ctdb) { - ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE; + ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE; + ctdb->monitor->next_interval = 1; DEBUG(2,("Monitoring has been enabled\n")); } @@ -216,10 +220,11 @@ void ctdb_enable_monitoring(struct ctdb_context *ctdb) */ void ctdb_stop_monitoring(struct ctdb_context *ctdb) { - talloc_free(ctdb->monitor_context); - ctdb->monitor_context = NULL; + talloc_free(ctdb->monitor->monitor_context); + ctdb->monitor->monitor_context = NULL; - ctdb->monitoring_mode = CTDB_MONITORING_DISABLED; + ctdb->monitor->monitoring_mode = CTDB_MONITORING_DISABLED; + ctdb->monitor->next_interval = 1; DEBUG(0,("Monitoring has been stopped\n")); } @@ -230,26 +235,29 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb) { struct timed_event *te; - if (ctdb->monitoring_mode == CTDB_MONITORING_ACTIVE) { + if (ctdb->monitor != NULL) { return; } - ctdb_stop_monitoring(ctdb); + ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state); + CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor); - ctdb->monitor_context = talloc_new(ctdb); - CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor_context); + ctdb->monitor->next_interval = 1; - te = event_add_timed(ctdb->ev, ctdb->monitor_context, + ctdb->monitor->monitor_context = talloc_new(ctdb->monitor); + CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context); + + te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), ctdb_check_for_dead_nodes, ctdb); CTDB_NO_MEMORY_FATAL(ctdb, te); - te = event_add_timed(ctdb->ev, ctdb->monitor_context, + te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, timeval_current_ofs(ctdb->tunable.monitor_retry, 0), ctdb_check_health, ctdb); CTDB_NO_MEMORY_FATAL(ctdb, te); - ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE; + ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE; DEBUG(0,("Monitoring has been started\n")); } @@ -304,3 +312,14 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata) return 0; } + +/* + return the monitoring mode + */ +int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb) +{ + if (ctdb->monitor == NULL) { + return CTDB_MONITORING_DISABLED; + } + return ctdb->monitor->monitoring_mode; +} diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c index 246ce942ccd..0fe16c9aaf1 100644 --- a/ctdb/server/ctdb_tunables.c +++ b/ctdb/server/ctdb_tunables.c @@ -36,7 +36,6 @@ static const struct { { "ElectionTimeout", 3, offsetof(struct ctdb_tunable, election_timeout) }, { "TakeoverTimeout", 5, offsetof(struct ctdb_tunable, takeover_timeout) }, { "MonitorInterval", 15, offsetof(struct ctdb_tunable, monitor_interval) }, - { "MonitorRetry", 5, offsetof(struct ctdb_tunable, monitor_retry) }, { "TickleUpdateInterval",20, offsetof(struct ctdb_tunable, tickle_update_interval) }, { "EventScriptTimeout", 20, offsetof(struct ctdb_tunable, script_timeout) }, { "RecoveryGracePeriod", 60, offsetof(struct ctdb_tunable, recovery_grace_period) }, diff --git a/ctdb/server/ctdbd.c b/ctdb/server/ctdbd.c index a386cea8aee..de711bb3d25 100644 --- a/ctdb/server/ctdbd.c +++ b/ctdb/server/ctdbd.c @@ -156,7 +156,6 @@ int main(int argc, const char *argv[]) ctdb->upcalls = &ctdb_upcalls; ctdb->idr = idr_init(ctdb); ctdb->recovery_lock_fd = -1; - ctdb->monitoring_mode = CTDB_MONITORING_DISABLED; ctdb_tunables_set_defaults(ctdb); From b866a147d2645c645caa3846fc398223de2e935d Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Thu, 10 Jan 2008 14:49:43 +1100 Subject: [PATCH 2/2] get rid of monitor_retry as well (This used to be ctdb commit c957cf9c1d99d5d3f4ca726f7a867c829660a2b7) --- ctdb/include/ctdb_private.h | 1 - ctdb/server/ctdb_monitor.c | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index e984078b5e7..62764a1fbc9 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -80,7 +80,6 @@ struct ctdb_tunable { uint32_t election_timeout; uint32_t takeover_timeout; uint32_t monitor_interval; - uint32_t monitor_retry; uint32_t tickle_update_interval; uint32_t script_timeout; uint32_t recovery_grace_period; diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c index f5fd807cf76..9120324f849 100644 --- a/ctdb/server/ctdb_monitor.c +++ b/ctdb/server/ctdb_monitor.c @@ -189,8 +189,9 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te, if (ret != 0) { DEBUG(0,("Unable to launch monitor event script\n")); + ctdb->monitor->next_interval = 1; event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, - timeval_current_ofs(ctdb->tunable.monitor_retry, 0), + timeval_current_ofs(1, 0), ctdb_check_health, ctdb); } } @@ -253,7 +254,7 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb) CTDB_NO_MEMORY_FATAL(ctdb, te); te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, - timeval_current_ofs(ctdb->tunable.monitor_retry, 0), + timeval_current_ofs(1, 0), ctdb_check_health, ctdb); CTDB_NO_MEMORY_FATAL(ctdb, te);