mirror of
https://github.com/samba-team/samba.git
synced 2024-12-23 17:34:34 +03:00
ctdb/daemon: Untangle serialisation of 1st recovery -> startup -> monitor
At the moment ctdb_check_health() is overloaded to wait until the first recovery is complete, handle the "startup" event and also actually handle monitoring. This is untidy and hard to follow. Instead, have the daemon explicitly wait for the first recovery after the "setup" event. When the first recovery is complete, schedule a function to handle the "startup" event. When the "startup" event succeeds, explicitly enable monitoring. Signed-off-by: Martin Schwenke <martin@meltin.net> Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
parent
50e00b3e52
commit
e6304d1e1a
@ -1067,7 +1067,7 @@ uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb);
|
||||
void ctdb_disable_monitoring(struct ctdb_context *ctdb);
|
||||
void ctdb_enable_monitoring(struct ctdb_context *ctdb);
|
||||
void ctdb_stop_monitoring(struct ctdb_context *ctdb);
|
||||
void ctdb_start_monitoring(struct ctdb_context *ctdb);
|
||||
void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb);
|
||||
void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb);
|
||||
void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode);
|
||||
void ctdb_start_keepalive(struct ctdb_context *ctdb);
|
||||
|
@ -84,9 +84,6 @@ static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
|
||||
/* start monitoring for connected/disconnected nodes */
|
||||
ctdb_start_keepalive(ctdb);
|
||||
|
||||
/* start monitoring for node health */
|
||||
ctdb_start_monitoring(ctdb);
|
||||
|
||||
/* start periodic update of tcp tickle lists */
|
||||
ctdb_start_tcp_tickle_update(ctdb);
|
||||
|
||||
@ -1048,8 +1045,6 @@ static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
|
||||
}
|
||||
ctdb_run_notification_script(ctdb, "setup");
|
||||
|
||||
ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);
|
||||
|
||||
/* tell all other nodes we've just started up */
|
||||
ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL,
|
||||
0, CTDB_CONTROL_STARTUP, 0,
|
||||
@ -1063,6 +1058,8 @@ static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
|
||||
}
|
||||
|
||||
ctdb_start_periodic_events(ctdb);
|
||||
|
||||
ctdb_wait_for_first_recovery(ctdb);
|
||||
}
|
||||
|
||||
static struct timeval tevent_before_wait_ts;
|
||||
|
@ -196,6 +196,8 @@ after_change_status:
|
||||
}
|
||||
|
||||
|
||||
static void ctdb_run_startup(struct event_context *ev, struct timed_event *te,
|
||||
struct timeval t, void *private_data);
|
||||
/*
|
||||
called when the startup event script finishes
|
||||
*/
|
||||
@ -203,18 +205,58 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p
|
||||
{
|
||||
if (status != 0) {
|
||||
DEBUG(DEBUG_ERR,("startup event failed\n"));
|
||||
} else if (status == 0) {
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(5, 0),
|
||||
ctdb_run_startup, ctdb);
|
||||
return;
|
||||
}
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n"));
|
||||
ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING);
|
||||
ctdb->monitor->next_interval = 2;
|
||||
ctdb_run_notification_script(ctdb, "startup");
|
||||
}
|
||||
|
||||
ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
|
||||
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(ctdb->monitor->next_interval, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
}
|
||||
|
||||
static void ctdb_run_startup(struct event_context *ev, struct timed_event *te,
|
||||
struct timeval t, void *private_data)
|
||||
{
|
||||
struct ctdb_context *ctdb = talloc_get_type(private_data,
|
||||
struct ctdb_context);
|
||||
int ret;
|
||||
|
||||
/* This is necessary to avoid the "startup" event colliding
|
||||
* with the "ipreallocated" event from the takeover run
|
||||
* following the first recovery. We might as well serialise
|
||||
* these things if we can.
|
||||
*/
|
||||
if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) {
|
||||
DEBUG(DEBUG_NOTICE,
|
||||
("Not yet in startup runstate. Wait one more second\n"));
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(1, 0),
|
||||
ctdb_run_startup, ctdb);
|
||||
return;
|
||||
}
|
||||
|
||||
DEBUG(DEBUG_NOTICE,("Running the \"startup\" event.\n"));
|
||||
ret = ctdb_event_script_callback(ctdb,
|
||||
ctdb->monitor->monitor_context,
|
||||
ctdb_startup_callback,
|
||||
ctdb, CTDB_EVENT_STARTUP, "%s", "");
|
||||
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,("Unable to launch startup event script\n"));
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(5, 0),
|
||||
ctdb_run_startup, ctdb);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
wait until we have finished initial recoveries before we start the
|
||||
@ -302,8 +344,7 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
|
||||
ctdb->db_persistent_check_errors = 0;
|
||||
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current(),
|
||||
ctdb_check_health, ctdb);
|
||||
timeval_current(), ctdb_run_startup, ctdb);
|
||||
}
|
||||
|
||||
|
||||
@ -314,59 +355,35 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
|
||||
struct timeval t, void *private_data)
|
||||
{
|
||||
struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
|
||||
bool skip_monitoring = false;
|
||||
int ret = 0;
|
||||
|
||||
if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) {
|
||||
DEBUG(DEBUG_NOTICE,("Not yet in startup runstate. Wait one more second\n"));
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(1, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL ||
|
||||
(ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED &&
|
||||
ctdb->runstate == CTDB_RUNSTATE_RUNNING)) {
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(ctdb->monitor->next_interval, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ctdb->runstate == CTDB_RUNSTATE_STARTUP) {
|
||||
DEBUG(DEBUG_NOTICE,("Recoveries finished. Running the \"startup\" event.\n"));
|
||||
ret = ctdb_event_script_callback(ctdb,
|
||||
ctdb->monitor->monitor_context, ctdb_startup_callback,
|
||||
ctdb,
|
||||
CTDB_EVENT_STARTUP, "%s", "");
|
||||
ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED) {
|
||||
skip_monitoring = true;
|
||||
} else {
|
||||
int i;
|
||||
int skip_monitoring = 0;
|
||||
|
||||
if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
|
||||
skip_monitoring = 1;
|
||||
DEBUG(DEBUG_ERR,("Skip monitoring during recovery\n"));
|
||||
}
|
||||
for (i=1; i<=NUM_DB_PRIORITIES; i++) {
|
||||
if (ctdb->freeze_handles[i] != NULL) {
|
||||
DEBUG(DEBUG_ERR,("Skip monitoring since databases are frozen\n"));
|
||||
skip_monitoring = 1;
|
||||
DEBUG(DEBUG_ERR,
|
||||
("Skip monitoring since databases are frozen\n"));
|
||||
skip_monitoring = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (skip_monitoring != 0) {
|
||||
}
|
||||
|
||||
if (skip_monitoring) {
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(ctdb->monitor->next_interval, 0),
|
||||
ctdb_check_health, ctdb);
|
||||
return;
|
||||
} else {
|
||||
ret = ctdb_event_script_callback(ctdb,
|
||||
ctdb->monitor->monitor_context, ctdb_health_callback,
|
||||
ctdb,
|
||||
CTDB_EVENT_MONITOR, "%s", "");
|
||||
}
|
||||
}
|
||||
|
||||
ret = ctdb_event_script_callback(ctdb,
|
||||
ctdb->monitor->monitor_context,
|
||||
ctdb_health_callback,
|
||||
ctdb, CTDB_EVENT_MONITOR, "%s", "");
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n"));
|
||||
ctdb->monitor->next_interval = 5;
|
||||
@ -412,26 +429,19 @@ void ctdb_stop_monitoring(struct ctdb_context *ctdb)
|
||||
/*
|
||||
start watching for nodes that might be dead
|
||||
*/
|
||||
void ctdb_start_monitoring(struct ctdb_context *ctdb)
|
||||
void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb)
|
||||
{
|
||||
if (ctdb->monitor != NULL) {
|
||||
return;
|
||||
}
|
||||
ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);
|
||||
|
||||
ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);
|
||||
|
||||
ctdb->monitor->next_interval = 5;
|
||||
|
||||
ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
|
||||
|
||||
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
|
||||
timeval_current_ofs(1, 0),
|
||||
ctdb_wait_until_recovered, ctdb);
|
||||
|
||||
ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
|
||||
DEBUG(DEBUG_NOTICE,("Monitoring has been started\n"));
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user