mirror of
https://github.com/samba-team/samba.git
synced 2025-01-11 05:18:09 +03:00
exponential backoff in health monitoring for faster startup
(This used to be ctdb commit 1b04a1f675f73b48366ba98803a58c3d8df1b6e1)
This commit is contained in:
parent
bccfdc0838
commit
538f519dba
@ -324,8 +324,6 @@ enum ctdb_freeze_mode {CTDB_FREEZE_NONE, CTDB_FREEZE_PENDING, CTDB_FREEZE_FROZEN
|
||||
struct ctdb_context {
|
||||
struct event_context *ev;
|
||||
uint32_t recovery_mode;
|
||||
uint32_t monitoring_mode;
|
||||
TALLOC_CTX *monitor_context;
|
||||
TALLOC_CTX *tickle_update_context;
|
||||
struct ctdb_tunable tunable;
|
||||
enum ctdb_freeze_mode freeze_mode;
|
||||
@ -370,6 +368,7 @@ struct ctdb_context {
|
||||
pid_t recoverd_pid;
|
||||
bool done_startup;
|
||||
const char *node_ip;
|
||||
struct ctdb_monitor_state *monitor;
|
||||
};
|
||||
|
||||
struct ctdb_db_context {
|
||||
@ -1211,5 +1210,6 @@ int32_t ctdb_control_delete_record(struct ctdb_context *ctdb, TDB_DATA indata);
|
||||
|
||||
void ctdb_block_signal(int signum);
|
||||
void ctdb_unblock_signal(int signum);
|
||||
int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);
|
||||
|
||||
#endif
|
||||
|
@ -224,7 +224,7 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
|
||||
return ctdb_control_set_recmode(ctdb, c, indata, async_reply, errormsg);
|
||||
|
||||
case CTDB_CONTROL_GET_MONMODE:
|
||||
return ctdb->monitoring_mode;
|
||||
return ctdb_monitoring_mode(ctdb);
|
||||
|
||||
case CTDB_CONTROL_SHUTDOWN:
|
||||
ctdb_stop_recoverd(ctdb);
|
||||
|
@ -24,6 +24,12 @@
|
||||
#include "system/wait.h"
|
||||
#include "../include/ctdb_private.h"
|
||||
|
||||
/*
  state of the node health monitoring. ctdb_start_monitoring() allocates
  one instance and stores it in ctdb->monitor
*/
struct ctdb_monitor_state {
|
||||
/* CTDB_MONITORING_ACTIVE or CTDB_MONITORING_DISABLED */
uint32_t monitoring_mode;
|
||||
/* talloc context handed to event_add_timed() and
   ctdb_event_script_callback() for the monitoring events; freed by
   ctdb_stop_monitoring() */
TALLOC_CTX *monitor_context;
|
||||
/* delay passed to timeval_current_ofs() before the next health check
   (presumably seconds - confirm). Reset to 1 on start/enable/stop and
   on health state changes, doubled after each monitor run and capped
   at tunable.monitor_interval */
uint32_t next_interval;
|
||||
};
|
||||
|
||||
/*
|
||||
see if any nodes are dead
|
||||
*/
|
||||
@ -75,7 +81,7 @@ static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_eve
|
||||
node->tx_cnt = 0;
|
||||
}
|
||||
|
||||
event_add_timed(ctdb->ev, ctdb->monitor_context,
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
|
||||
ctdb_check_for_dead_nodes, ctdb);
|
||||
}
|
||||
@ -99,18 +105,21 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
|
||||
if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
|
||||
DEBUG(0,("monitor event failed - disabling node\n"));
|
||||
node->flags |= NODE_FLAGS_UNHEALTHY;
|
||||
ctdb->monitor->next_interval = 1;
|
||||
} else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) {
|
||||
DEBUG(0,("monitor event OK - node re-enabled\n"));
|
||||
node->flags &= ~NODE_FLAGS_UNHEALTHY;
|
||||
ctdb->monitor->next_interval = 1;
|
||||
}
|
||||
|
||||
if (node->flags & NODE_FLAGS_UNHEALTHY) {
|
||||
next_interval = ctdb->tunable.monitor_retry;
|
||||
} else {
|
||||
next_interval = ctdb->tunable.monitor_interval;
|
||||
next_interval = ctdb->monitor->next_interval;
|
||||
|
||||
ctdb->monitor->next_interval *= 2;
|
||||
if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) {
|
||||
ctdb->monitor->next_interval = ctdb->tunable.monitor_interval;
|
||||
}
|
||||
|
||||
event_add_timed(ctdb->ev, ctdb->monitor_context,
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(next_interval, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
|
||||
@ -140,18 +149,12 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p
|
||||
} else if (status == 0) {
|
||||
DEBUG(0,("startup event OK - enabling monitoring\n"));
|
||||
ctdb->done_startup = true;
|
||||
ctdb->monitor->next_interval = 1;
|
||||
}
|
||||
|
||||
if (ctdb->done_startup) {
|
||||
event_add_timed(ctdb->ev, ctdb->monitor_context,
|
||||
timeval_zero(),
|
||||
ctdb_check_health, ctdb);
|
||||
} else {
|
||||
event_add_timed(ctdb->ev, ctdb->monitor_context,
|
||||
timeval_current_ofs(ctdb->tunable.monitor_interval, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
}
|
||||
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(ctdb->monitor->next_interval, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
}
|
||||
|
||||
|
||||
@ -165,9 +168,9 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
|
||||
int ret;
|
||||
|
||||
if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL ||
|
||||
(ctdb->monitoring_mode == CTDB_MONITORING_DISABLED && ctdb->done_startup)) {
|
||||
event_add_timed(ctdb->ev, ctdb->monitor_context,
|
||||
timeval_current_ofs(ctdb->tunable.monitor_interval, 0),
|
||||
(ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED && ctdb->done_startup)) {
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(ctdb->monitor->next_interval, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
return;
|
||||
}
|
||||
@ -175,18 +178,18 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
|
||||
if (!ctdb->done_startup) {
|
||||
ret = ctdb_event_script_callback(ctdb,
|
||||
timeval_current_ofs(ctdb->tunable.script_timeout, 0),
|
||||
ctdb->monitor_context, ctdb_startup_callback,
|
||||
ctdb->monitor->monitor_context, ctdb_startup_callback,
|
||||
ctdb, "startup");
|
||||
} else {
|
||||
ret = ctdb_event_script_callback(ctdb,
|
||||
timeval_current_ofs(ctdb->tunable.script_timeout, 0),
|
||||
ctdb->monitor_context, ctdb_health_callback,
|
||||
ctdb->monitor->monitor_context, ctdb_health_callback,
|
||||
ctdb, "monitor");
|
||||
}
|
||||
|
||||
if (ret != 0) {
|
||||
DEBUG(0,("Unable to launch monitor event script\n"));
|
||||
event_add_timed(ctdb->ev, ctdb->monitor_context,
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(ctdb->tunable.monitor_retry, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
}
|
||||
@ -198,7 +201,7 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
|
||||
*/
|
||||
/*
  mark monitoring as disabled and log it at debug level 2. The timed
  events are left in place; ctdb_check_health() only reschedules itself
  while the mode is CTDB_MONITORING_DISABLED and startup is done.

  NOTE(review): this listing looks like a diff whose +/- markers were
  lost, so both the old and the new form of the assignment appear below.
*/
void ctdb_disable_monitoring(struct ctdb_context *ctdb)
|
||||
{
|
||||
/* presumably the pre-change line: mode field stored directly on ctdb */
ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
|
||||
/* NOTE(review): ctdb->monitor is dereferenced without a NULL check,
   while ctdb_monitoring_mode() treats a NULL ctdb->monitor as a valid
   state - confirm this is only reachable after ctdb_start_monitoring() */
ctdb->monitor->monitoring_mode = CTDB_MONITORING_DISABLED;
|
||||
DEBUG(2,("Monitoring has been disabled\n"));
|
||||
}
|
||||
|
||||
@ -207,7 +210,8 @@ void ctdb_disable_monitoring(struct ctdb_context *ctdb)
|
||||
*/
|
||||
/*
  mark monitoring as active again, restart the health check backoff at
  an interval of 1 and log it at debug level 2.

  NOTE(review): this listing looks like a diff whose +/- markers were
  lost, so both the old and the new form of the assignment appear below.
*/
void ctdb_enable_monitoring(struct ctdb_context *ctdb)
|
||||
{
|
||||
/* presumably the pre-change line: mode field stored directly on ctdb */
ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
|
||||
/* NOTE(review): ctdb->monitor is dereferenced without a NULL check -
   confirm this is only reachable after ctdb_start_monitoring() */
ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
|
||||
/* start the backoff over so the next scheduled check comes quickly */
ctdb->monitor->next_interval = 1;
|
||||
DEBUG(2,("Monitoring has been enabled\n"));
|
||||
}
|
||||
|
||||
@ -216,10 +220,11 @@ void ctdb_enable_monitoring(struct ctdb_context *ctdb)
|
||||
*/
|
||||
/*
  stop monitoring: free the talloc context that the monitoring timed
  events were added under, mark the mode as disabled, reset the backoff
  interval to 1 and log it at debug level 0.

  NOTE(review): this listing looks like a diff whose +/- markers were
  lost, so old (ctdb->...) and new (ctdb->monitor->...) lines both appear.

  NOTE(review): ctdb->monitor is dereferenced without a NULL check -
  confirm this cannot run before ctdb_start_monitoring().

  NOTE(review): ctdb->monitor itself stays allocated here, while
  ctdb_start_monitoring() returns early when ctdb->monitor != NULL; it
  looks like a stop followed by a start would not re-create
  monitor_context or re-add the timed events - confirm.
*/
void ctdb_stop_monitoring(struct ctdb_context *ctdb)
|
||||
{
|
||||
/* presumably the pre-change pair: context stored directly on ctdb */
talloc_free(ctdb->monitor_context);
|
||||
ctdb->monitor_context = NULL;
|
||||
/* new form: the context lives inside the monitor state; clear the
   pointer after freeing so it is not left dangling */
talloc_free(ctdb->monitor->monitor_context);
|
||||
ctdb->monitor->monitor_context = NULL;
|
||||
|
||||
/* presumably the pre-change line */
ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
|
||||
ctdb->monitor->monitoring_mode = CTDB_MONITORING_DISABLED;
|
||||
/* restart the backoff for whenever monitoring resumes */
ctdb->monitor->next_interval = 1;
|
||||
DEBUG(0,("Monitoring has been stopped\n"));
|
||||
}
|
||||
|
||||
@ -230,26 +235,29 @@ void ctdb_start_monitoring(struct ctdb_context *ctdb)
|
||||
{
|
||||
struct timed_event *te;
|
||||
|
||||
if (ctdb->monitoring_mode == CTDB_MONITORING_ACTIVE) {
|
||||
if (ctdb->monitor != NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
ctdb_stop_monitoring(ctdb);
|
||||
ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);
|
||||
|
||||
ctdb->monitor_context = talloc_new(ctdb);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor_context);
|
||||
ctdb->monitor->next_interval = 1;
|
||||
|
||||
te = event_add_timed(ctdb->ev, ctdb->monitor_context,
|
||||
ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
|
||||
|
||||
te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(ctdb->tunable.keepalive_interval, 0),
|
||||
ctdb_check_for_dead_nodes, ctdb);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, te);
|
||||
|
||||
te = event_add_timed(ctdb->ev, ctdb->monitor_context,
|
||||
te = event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(ctdb->tunable.monitor_retry, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, te);
|
||||
|
||||
ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
|
||||
ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
|
||||
DEBUG(0,("Monitoring has been started\n"));
|
||||
}
|
||||
|
||||
@ -304,3 +312,14 @@ int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
return the monitoring mode
|
||||
returns CTDB_MONITORING_DISABLED while ctdb->monitor has not been
allocated, otherwise the monitoring_mode stored in the monitor state.
the CTDB_CONTROL_GET_MONMODE case of ctdb_control_dispatch() returns
this value
*/
|
||||
int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb)
|
||||
{
|
||||
/* ctdb->monitor is only allocated by ctdb_start_monitoring(), so a
   NULL pointer is reported as disabled */
if (ctdb->monitor == NULL) {
|
||||
return CTDB_MONITORING_DISABLED;
|
||||
}
|
||||
return ctdb->monitor->monitoring_mode;
|
||||
}
|
||||
|
@ -36,7 +36,6 @@ static const struct {
|
||||
{ "ElectionTimeout", 3, offsetof(struct ctdb_tunable, election_timeout) },
|
||||
{ "TakeoverTimeout", 5, offsetof(struct ctdb_tunable, takeover_timeout) },
|
||||
{ "MonitorInterval", 15, offsetof(struct ctdb_tunable, monitor_interval) },
|
||||
{ "MonitorRetry", 5, offsetof(struct ctdb_tunable, monitor_retry) },
|
||||
{ "TickleUpdateInterval",20, offsetof(struct ctdb_tunable, tickle_update_interval) },
|
||||
{ "EventScriptTimeout", 20, offsetof(struct ctdb_tunable, script_timeout) },
|
||||
{ "RecoveryGracePeriod", 60, offsetof(struct ctdb_tunable, recovery_grace_period) },
|
||||
|
@ -156,7 +156,6 @@ int main(int argc, const char *argv[])
|
||||
ctdb->upcalls = &ctdb_upcalls;
|
||||
ctdb->idr = idr_init(ctdb);
|
||||
ctdb->recovery_lock_fd = -1;
|
||||
ctdb->monitoring_mode = CTDB_MONITORING_DISABLED;
|
||||
|
||||
ctdb_tunables_set_defaults(ctdb);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user