1
0
mirror of https://github.com/samba-team/samba.git synced 2024-12-23 17:34:34 +03:00

exponential backoff in health monitoring for faster startup

(This used to be ctdb commit 1b04a1f675f73b48366ba98803a58c3d8df1b6e1)
This commit is contained in:
Andrew Tridgell 2008-01-10 14:40:56 +11:00
parent bccfdc0838
commit 538f519dba
5 changed files with 56 additions and 39 deletions

View File

@ -324,8 +324,6 @@ enum ctdb_freeze_mode {CTDB_FREEZE_NONE, CTDB_FREEZE_PENDING, CTDB_FREEZE_FROZEN
struct ctdb_context {
struct event_context *ev;
uint32_t recovery_mode;
uint32_t monitoring_mode;
TALLOC_CTX *monitor_context;
TALLOC_CTX *tickle_update_context;
struct ctdb_tunable tunable;
enum ctdb_freeze_mode freeze_mode;
@ -370,6 +368,7 @@ struct ctdb_context {
pid_t recoverd_pid;
bool done_startup;
const char *node_ip;
struct ctdb_monitor_state *monitor;
};
struct ctdb_db_context {
@ -1211,5 +1210,6 @@ int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata);
void ctdb_block_signal(int signum);
void ctdb_unblock_signal(int signum);
int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);
#endif

View File

@ -224,7 +224,7 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
return ctdb_control_set_recmode(ctdb, c, indata, async_reply, errormsg);
case CTDB_CONTROL_GET_MONMODE:
return ctdb->monitoring_mode;
return ctdb_monitoring_mode(ctdb);
case CTDB_CONTROL_SHUTDOWN:
ctdb_stop_recoverd(ctdb);

View File

@ -24,6 +24,12 @@
#include "system/wait.h"
#include "../include/ctdb_private.h"
/*
  state of the node health monitor, reached through ctdb->monitor
 */
struct ctdb_monitor_state {
/* CTDB_MONITORING_ACTIVE or CTDB_MONITORING_DISABLED */
uint32_t monitoring_mode;
/* talloc parent handed to event_add_timed() and ctdb_event_script_callback()
   for everything the monitor schedules; ctdb_stop_monitoring() frees it */
TALLOC_CTX *monitor_context;
/* offset given to timeval_current_ofs() for the next health check
   (presumably seconds - confirm). Reset to 1 when the node changes health
   state or monitoring is enabled/stopped/started; doubled after each health
   callback and capped at tunable.monitor_interval */
uint32_t next_interval;
};
/*
see if any nodes are dead
*/
@ -75,7 +81,7 @@ static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_eve
node->tx_cnt = 0;
}
event_add_timed(ctdb->ev, ctdb->monitor_context,
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
ctdb_check_for_dead_nodes, ctdb);
}
@ -99,18 +105,21 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
DEBUG(0,("monitor event failed - disabling node\n"));
node->flags |= NODE_FLAGS_UNHEALTHY;
ctdb->monitor->next_interval = 1;
} else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) {
DEBUG(0,("monitor event OK - node re-enabled\n"));
node->flags &= ~NODE_FLAGS_UNHEALTHY;
ctdb->monitor->next_interval = 1;
}
if (node->flags & NODE_FLAGS_UNHEALTHY) {
next_interval = ctdb->tunable.monitor_retry;
} else {
next_interval = ctdb->tunable.monitor_interval;
next_interval = ctdb->monitor->next_interval;
ctdb->monitor->next_interval *= 2;
if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) {
ctdb->monitor->next_interval = ctdb->tunable.monitor_interval;
}
event_add_timed(ctdb->ev, ctdb->monitor_context,
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(next_interval, 0),
ctdb_check_health, ctdb);
@ -140,18 +149,12 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p
} else if (status == 0) {
DEBUG(0,("startup event OK - enabling monitoring\n"));
ctdb->done_startup = true;
ctdb->monitor->next_interval = 1;
}
if (ctdb->done_startup) {
event_add_timed(ctdb->ev, ctdb->monitor_context,
timeval_zero(),
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(ctdb->monitor->next_interval, 0),
ctdb_check_health, ctdb);
} else {
event_add_timed(ctdb->ev, ctdb->monitor_context,
timeval_current_ofs(ctdb->tunable.monitor_interval, 0),
ctdb_check_health, ctdb);
}
}
@ -165,9 +168,9 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
int ret;
if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL ||
(ctdb->monitoring_mode == CTDB_MONITORING_DISABLED && ctdb->done_startup)) {
event_add_timed(ctdb->ev, ctdb->monitor_context,
timeval_current_ofs(ctdb->tunable.monitor_interval, 0),
(ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED && ctdb->done_startup)) {
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(ctdb->monitor->next_interval, 0),
ctdb_check_health, ctdb);
return;
}
@ -175,18 +178,18 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
if (!ctdb->done_startup) {
ret = ctdb_event_script_callback(ctdb,
timeval_current_ofs(ctdb->tunable.script_timeout, 0),
ctdb->monitor_context, ctdb_startup_callback,
ctdb->monitor->monitor_context, ctdb_startup_callback,
ctdb, "startup");
} else {
ret = ctdb_event_script_callback(ctdb,
timeval_current_ofs(ctdb->tunable.script_timeout, 0),
ctdb->monitor_context, ctdb_health_callback,
ctdb->monitor->monitor_context, ctdb_health_callback,
ctdb, "monitor");
}
if (ret != 0) {
DEBUG(0,("Unable to launch monitor event script\n"));
event_add_timed(ctdb->ev, ctdb->monitor_context,
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(ctdb->tunable.monitor_retry, 0),
ctdb_check_health, ctdb);
}
@ -198,7 +201,7 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
*/
void ctdb_disable_monitoring(struct ctdb_context *ctdb)
{
/* mark monitoring as disabled and log it at debug level 2 */
/* NOTE(review): the two assignments below look like the removed and added
   sides of a diff hunk whose +/- markers were lost (first writes the
   ctdb_context field, second the ctdb_monitor_state field) - confirm which
   one is live */
ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
/* NOTE(review): ctdb->monitor is dereferenced without a NULL check, while
   ctdb_monitoring_mode() treats NULL as possible - verify callers only run
   after ctdb_start_monitoring() */
ctdb->monitor->monitoring_mode = CTDB_MONITORING_DISABLED;
DEBUG(2,("Monitoring has been disabled\n"));
}
@ -207,7 +210,8 @@ void ctdb_disable_monitoring(struct ctdb_context *ctdb)
*/
void ctdb_enable_monitoring(struct ctdb_context *ctdb)
{
/* mark monitoring as active, restart the health-check backoff and log it at
   debug level 2 */
/* NOTE(review): the two monitoring_mode assignments look like the removed and
   added sides of a diff hunk whose +/- markers were lost - confirm which one
   is live */
ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
/* NOTE(review): no NULL check on ctdb->monitor here, although
   ctdb_monitoring_mode() guards against NULL - verify call ordering */
ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
/* go back to the shortest interval so the next checks are scheduled soon */
ctdb->monitor->next_interval = 1;
DEBUG(2,("Monitoring has been enabled\n"));
}
@ -216,10 +220,11 @@ void ctdb_enable_monitoring(struct ctdb_context *ctdb)
*/
void ctdb_stop_monitoring(struct ctdb_context *ctdb)
{
/* free the monitor's talloc context, mark monitoring disabled, reset the
   backoff interval and log it at debug level 0 */
/* NOTE(review): each ctdb->X / ctdb->monitor->X pair below looks like the
   removed and added sides of a diff hunk whose +/- markers were lost -
   confirm which side is live */
talloc_free(ctdb->monitor_context);
ctdb->monitor_context = NULL;
/* NOTE(review): ctdb->monitor is dereferenced without a NULL check, while
   ctdb_monitoring_mode() treats NULL as possible */
talloc_free(ctdb->monitor->monitor_context);
ctdb->monitor->monitor_context = NULL;
ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
ctdb->monitor->monitoring_mode = CTDB_MONITORING_DISABLED;
ctdb->monitor->next_interval = 1;
/* NOTE(review): ctdb->monitor itself is neither freed nor set to NULL here,
   so a later ctdb_start_monitoring() that bails out on
   "ctdb->monitor != NULL" would not recreate monitor_context or re-arm the
   timers - verify that stop followed by start really restarts monitoring */
DEBUG(0,("Monitoring has been stopped\n"));
}
@ -230,26 +235,29 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb)
{
/* allocate the monitor state, arm the dead-node and health-check timers and
   mark monitoring active */
/* NOTE(review): this body appears to interleave the removed and added sides
   of a diff hunk whose +/- markers were lost (e.g. two opening "if" lines for
   one early return, ctdb->monitor_context next to
   ctdb->monitor->monitor_context) - confirm which lines are live before
   relying on the notes below */
struct timed_event *te;
if (ctdb->monitoring_mode == CTDB_MONITORING_ACTIVE) {
if (ctdb->monitor != NULL) {
return;
}
/* NOTE(review): if this call survives together with the
   "ctdb->monitor != NULL" early return, ctdb->monitor is NULL at this point
   and ctdb_stop_monitoring() dereferences it */
ctdb_stop_monitoring(ctdb);
ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);
ctdb->monitor_context = talloc_new(ctdb);
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor_context);
/* start the health-check backoff at its shortest interval */
ctdb->monitor->next_interval = 1;
te = event_add_timed(ctdb->ev, ctdb->monitor_context,
/* monitor_context is a talloc child of the monitor state */
ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
/* first dead-node check after tunable.keepalive_interval */
te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
ctdb_check_for_dead_nodes, ctdb);
CTDB_NO_MEMORY_FATAL(ctdb, te);
/* first health check after tunable.monitor_retry */
te = event_add_timed(ctdb->ev, ctdb->monitor_context,
te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(ctdb->tunable.monitor_retry, 0),
ctdb_check_health, ctdb);
CTDB_NO_MEMORY_FATAL(ctdb, te);
ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
DEBUG(0,("Monitoring has been started\n"));
}
@ -304,3 +312,14 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
return 0;
}
/*
  report the current monitoring mode

  when no monitor state has been allocated yet (ctdb->monitor is NULL)
  monitoring is reported as CTDB_MONITORING_DISABLED
 */
int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb)
{
	if (ctdb->monitor != NULL) {
		return ctdb->monitor->monitoring_mode;
	}

	return CTDB_MONITORING_DISABLED;
}

View File

@ -36,7 +36,6 @@ static const struct {
{ "ElectionTimeout", 3, offsetof(struct ctdb_tunable, election_timeout) },
{ "TakeoverTimeout", 5, offsetof(struct ctdb_tunable, takeover_timeout) },
{ "MonitorInterval", 15, offsetof(struct ctdb_tunable, monitor_interval) },
{ "MonitorRetry", 5, offsetof(struct ctdb_tunable, monitor_retry) },
{ "TickleUpdateInterval",20, offsetof(struct ctdb_tunable, tickle_update_interval) },
{ "EventScriptTimeout", 20, offsetof(struct ctdb_tunable, script_timeout) },
{ "RecoveryGracePeriod", 60, offsetof(struct ctdb_tunable, recovery_grace_period) },

View File

@ -156,7 +156,6 @@ int main(int argc, const char *argv[])
ctdb->upcalls = &ctdb_upcalls;
ctdb->idr = idr_init(ctdb);
ctdb->recovery_lock_fd = -1;
ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
ctdb_tunables_set_defaults(ctdb);