1
0
mirror of https://github.com/samba-team/samba.git synced 2024-12-23 17:34:34 +03:00

ctdb/daemon: Untangle serialisation of 1st recovery -> startup -> monitor

At the moment ctdb_check_health() is overloaded to wait until the
first recovery is complete, handle the "startup" event and also
actually handle monitoring.  This is untidy and hard to follow.

Instead, have the daemon explicitly wait for the first recovery after
the "setup" event.  When the first recovery is complete, schedule a
function to handle the "startup" event.  When the "startup" event
succeeds, explicitly enable monitoring.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
This commit is contained in:
Martin Schwenke 2013-12-18 15:37:11 +11:00 committed by Amitay Isaacs
parent 50e00b3e52
commit e6304d1e1a
3 changed files with 76 additions and 69 deletions

View File

@ -1067,7 +1067,7 @@ uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb);
void ctdb_disable_monitoring(struct ctdb_context *ctdb); void ctdb_disable_monitoring(struct ctdb_context *ctdb);
void ctdb_enable_monitoring(struct ctdb_context *ctdb); void ctdb_enable_monitoring(struct ctdb_context *ctdb);
void ctdb_stop_monitoring(struct ctdb_context *ctdb); void ctdb_stop_monitoring(struct ctdb_context *ctdb);
void ctdb_start_monitoring(struct ctdb_context *ctdb); void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb);
void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb); void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb);
void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode); void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode);
void ctdb_start_keepalive(struct ctdb_context *ctdb); void ctdb_start_keepalive(struct ctdb_context *ctdb);

View File

@ -84,9 +84,6 @@ static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
/* start monitoring for connected/disconnected nodes */ /* start monitoring for connected/disconnected nodes */
ctdb_start_keepalive(ctdb); ctdb_start_keepalive(ctdb);
/* start monitoring for node health */
ctdb_start_monitoring(ctdb);
/* start periodic update of tcp tickle lists */ /* start periodic update of tcp tickle lists */
ctdb_start_tcp_tickle_update(ctdb); ctdb_start_tcp_tickle_update(ctdb);
@ -1048,8 +1045,6 @@ static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
} }
ctdb_run_notification_script(ctdb, "setup"); ctdb_run_notification_script(ctdb, "setup");
ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);
/* tell all other nodes we've just started up */ /* tell all other nodes we've just started up */
ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL,
0, CTDB_CONTROL_STARTUP, 0, 0, CTDB_CONTROL_STARTUP, 0,
@ -1063,6 +1058,8 @@ static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
} }
ctdb_start_periodic_events(ctdb); ctdb_start_periodic_events(ctdb);
ctdb_wait_for_first_recovery(ctdb);
} }
static struct timeval tevent_before_wait_ts; static struct timeval tevent_before_wait_ts;

View File

@ -196,6 +196,8 @@ after_change_status:
} }
static void ctdb_run_startup(struct event_context *ev, struct timed_event *te,
struct timeval t, void *private_data);
/* /*
called when the startup event script finishes called when the startup event script finishes
*/ */
@ -203,18 +205,58 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p
{ {
if (status != 0) { if (status != 0) {
DEBUG(DEBUG_ERR,("startup event failed\n")); DEBUG(DEBUG_ERR,("startup event failed\n"));
} else if (status == 0) { event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n")); timeval_current_ofs(5, 0),
ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING); ctdb_run_startup, ctdb);
ctdb->monitor->next_interval = 2; return;
ctdb_run_notification_script(ctdb, "startup");
} }
DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n"));
ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING);
ctdb->monitor->next_interval = 2;
ctdb_run_notification_script(ctdb, "startup");
ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(ctdb->monitor->next_interval, 0), timeval_current_ofs(ctdb->monitor->next_interval, 0),
ctdb_check_health, ctdb); ctdb_check_health, ctdb);
} }
/*
  timed event handler that launches the "startup" event script

  If the daemon has not yet reached CTDB_RUNSTATE_STARTUP this handler
  re-arms itself to fire again in 1 second.  If the event script can
  not be launched it re-arms itself to fire again in 5 seconds.  On a
  successful launch, ctdb_startup_callback is registered as the
  completion callback for the script.
 */
static void ctdb_run_startup(struct event_context *ev, struct timed_event *te,
			     struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data,
						    struct ctdb_context);
	int rc;

	if (ctdb->runstate >= CTDB_RUNSTATE_STARTUP) {
		DEBUG(DEBUG_NOTICE,("Running the \"startup\" event.\n"));
		rc = ctdb_event_script_callback(ctdb,
						ctdb->monitor->monitor_context,
						ctdb_startup_callback,
						ctdb, CTDB_EVENT_STARTUP, "%s", "");
		if (rc == 0) {
			/* Launched; ctdb_startup_callback takes over from here */
			return;
		}

		/* Launch failed: try again a little later */
		DEBUG(DEBUG_ERR,("Unable to launch startup event script\n"));
		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
				timeval_current_ofs(5, 0),
				ctdb_run_startup, ctdb);
		return;
	}

	/* Holding off until the startup runstate is reached is
	 * necessary to avoid the "startup" event colliding with the
	 * "ipreallocated" event from the takeover run following the
	 * first recovery.  We might as well serialise these things if
	 * we can.
	 */
	DEBUG(DEBUG_NOTICE,
	      ("Not yet in startup runstate. Wait one more second\n"));
	event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
			timeval_current_ofs(1, 0),
			ctdb_run_startup, ctdb);
}
/* /*
wait until we have finished initial recoveries before we start the wait until we have finished initial recoveries before we start the
@ -302,8 +344,7 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
ctdb->db_persistent_check_errors = 0; ctdb->db_persistent_check_errors = 0;
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current(), timeval_current(), ctdb_run_startup, ctdb);
ctdb_check_health, ctdb);
} }
@ -314,65 +355,41 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
struct timeval t, void *private_data) struct timeval t, void *private_data)
{ {
struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context); struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
bool skip_monitoring = false;
int ret = 0; int ret = 0;
if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) { if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL ||
DEBUG(DEBUG_NOTICE,("Not yet in startup runstate. Wait one more second\n")); ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED) {
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, skip_monitoring = true;
timeval_current_ofs(1, 0), } else {
ctdb_check_health, ctdb); int i;
return; for (i=1; i<=NUM_DB_PRIORITIES; i++) {
if (ctdb->freeze_handles[i] != NULL) {
DEBUG(DEBUG_ERR,
("Skip monitoring since databases are frozen\n"));
skip_monitoring = true;
break;
}
}
} }
if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL || if (skip_monitoring) {
(ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED &&
ctdb->runstate == CTDB_RUNSTATE_RUNNING)) {
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(ctdb->monitor->next_interval, 0), timeval_current_ofs(ctdb->monitor->next_interval, 0),
ctdb_check_health, ctdb); ctdb_check_health, ctdb);
return; return;
} }
if (ctdb->runstate == CTDB_RUNSTATE_STARTUP) { ret = ctdb_event_script_callback(ctdb,
DEBUG(DEBUG_NOTICE,("Recoveries finished. Running the \"startup\" event.\n")); ctdb->monitor->monitor_context,
ret = ctdb_event_script_callback(ctdb, ctdb_health_callback,
ctdb->monitor->monitor_context, ctdb_startup_callback, ctdb, CTDB_EVENT_MONITOR, "%s", "");
ctdb,
CTDB_EVENT_STARTUP, "%s", "");
} else {
int i;
int skip_monitoring = 0;
if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
skip_monitoring = 1;
DEBUG(DEBUG_ERR,("Skip monitoring during recovery\n"));
}
for (i=1; i<=NUM_DB_PRIORITIES; i++) {
if (ctdb->freeze_handles[i] != NULL) {
DEBUG(DEBUG_ERR,("Skip monitoring since databases are frozen\n"));
skip_monitoring = 1;
break;
}
}
if (skip_monitoring != 0) {
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(ctdb->monitor->next_interval, 0),
ctdb_check_health, ctdb);
return;
} else {
ret = ctdb_event_script_callback(ctdb,
ctdb->monitor->monitor_context, ctdb_health_callback,
ctdb,
CTDB_EVENT_MONITOR, "%s", "");
}
}
if (ret != 0) { if (ret != 0) {
DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n")); DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n"));
ctdb->monitor->next_interval = 5; ctdb->monitor->next_interval = 5;
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(5, 0), timeval_current_ofs(5, 0),
ctdb_check_health, ctdb); ctdb_check_health, ctdb);
} }
} }
@ -412,26 +429,19 @@ void ctdb_stop_monitoring(struct ctdb_context *ctdb)
/* /*
start watching for nodes that might be dead start watching for nodes that might be dead
*/ */
void ctdb_start_monitoring(struct ctdb_context *ctdb) void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb)
{ {
if (ctdb->monitor != NULL) { ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);
return;
}
ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state); ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor); CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);
ctdb->monitor->next_interval = 5;
ctdb->monitor->monitor_context = talloc_new(ctdb->monitor); ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context); CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
timeval_current_ofs(1, 0), timeval_current_ofs(1, 0),
ctdb_wait_until_recovered, ctdb); ctdb_wait_until_recovered, ctdb);
ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
DEBUG(DEBUG_NOTICE,("Monitoring has been started\n"));
} }