2024-12-23 17:34:34 +03:00 · 2013-12-18 15:37:11 +11:00 · 2013-12-18 15:37:11 +11:00 · e6304d1e1a
commit e6304d1e1a
parent 50e00b3e52
3 changed files with 76 additions and 69 deletions
--- a/ctdb/include/ctdb_private.h
+++ b/ctdb/include/ctdb_private.h
@ -1067,7 +1067,7 @@ uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb);
 void ctdb_disable_monitoring(struct ctdb_context *ctdb);
 void ctdb_enable_monitoring(struct ctdb_context *ctdb);
 void ctdb_stop_monitoring(struct ctdb_context *ctdb);
-void ctdb_start_monitoring(struct ctdb_context *ctdb);
+void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb);
 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb);
 void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode);
 void ctdb_start_keepalive(struct ctdb_context *ctdb);
--- a/ctdb/server/ctdb_daemon.c
+++ b/ctdb/server/ctdb_daemon.c
@ -84,9 +84,6 @@ static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
 	/* start monitoring for connected/disconnected nodes */
 	ctdb_start_keepalive(ctdb);
 	/* start monitoring for node health */
 	ctdb_start_monitoring(ctdb);
 	/* start periodic update of tcp tickle lists */
       	ctdb_start_tcp_tickle_update(ctdb);
@ -1048,8 +1045,6 @@ static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
 	}
 	ctdb_run_notification_script(ctdb, "setup");
 	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);
 	/* tell all other nodes we've just started up */
 	ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL,
 				 0, CTDB_CONTROL_STARTUP, 0,
@ -1063,6 +1058,8 @@ static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
 	}
 	ctdb_start_periodic_events(ctdb);
 	ctdb_wait_for_first_recovery(ctdb);
 }
 static struct timeval tevent_before_wait_ts;
--- a/ctdb/server/ctdb_monitor.c
+++ b/ctdb/server/ctdb_monitor.c
@ -196,6 +196,8 @@ after_change_status:
 }
 static void ctdb_run_startup(struct event_context *ev, struct timed_event *te,
 			     struct timeval t, void *private_data);
 /*
  called when the startup event script finishes
 */
@ -203,18 +205,58 @@ static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p
 {
 	if (status != 0) {
 		DEBUG(DEBUG_ERR,("startup event failed\n"));
-	} else if (status == 0) {
+		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 				timeval_current_ofs(5, 0),
 				ctdb_run_startup, ctdb);
 		return;
 	}
 	DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n"));
 	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING);
 	ctdb->monitor->next_interval = 2;
 	ctdb_run_notification_script(ctdb, "startup");
-	}
+
 	ctdb->monitor->monitoring_mode = CTDB_MONITORING_ACTIVE;
 	event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 			timeval_current_ofs(ctdb->monitor->next_interval, 0),
 			ctdb_check_health, ctdb);
 }
 /*
  timed-event handler that launches the "startup" event script

  private_data is the struct ctdb_context; ev, te and t are not used.

  While ctdb->runstate is below CTDB_RUNSTATE_STARTUP the handler only
  re-arms itself to fire again in 1 second.  Otherwise it starts the
  CTDB_EVENT_STARTUP script, passing ctdb_startup_callback as the
  callback.  If that call returns non-zero an error is logged and the
  handler re-arms itself to fire again in 5 seconds.
 */
 static void ctdb_run_startup(struct event_context *ev, struct timed_event *te,
 			     struct timeval t, void *private_data)
 {
 	struct ctdb_context *ctdb = talloc_get_type(private_data,
 						    struct ctdb_context);
 	int ret;
 	/* This is necessary to avoid the "startup" event colliding
 	 * with the "ipreallocated" event from the takeover run
 	 * following the first recovery.  We might as well serialise
 	 * these things if we can.
 	 */
 	if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) {
 		DEBUG(DEBUG_NOTICE,
 		      ("Not yet in startup runstate. Wait one more second\n"));
 		/* poll again in 1 second; nothing else is done this time */
 		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 				timeval_current_ofs(1, 0),
 				ctdb_run_startup, ctdb);
 		return;
 	}
 	DEBUG(DEBUG_NOTICE,("Running the \"startup\" event.\n"));
 	ret = ctdb_event_script_callback(ctdb,
 					 ctdb->monitor->monitor_context,
 					 ctdb_startup_callback,
 					 ctdb, CTDB_EVENT_STARTUP, "%s", "");
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR,("Unable to launch startup event script\n"));
 		/* the script could not be started: try again in 5 seconds */
 		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 				timeval_current_ofs(5, 0),
 				ctdb_run_startup, ctdb);
 	}
 }
 /*
  wait until we have finished initial recoveries before we start the
@ -302,8 +344,7 @@ static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_eve
 	ctdb->db_persistent_check_errors = 0;
 	event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
-			     timeval_current(),
+			timeval_current(), ctdb_run_startup, ctdb);
 			     ctdb_check_health, ctdb);
 }
@ -314,59 +355,35 @@ static void ctdb_check_health(struct event_context *ev, struct timed_event *te,
 			      struct timeval t, void *private_data)
 {
 	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
 	bool skip_monitoring = false;
 	int ret = 0;
 	if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) {
 		DEBUG(DEBUG_NOTICE,("Not yet in startup runstate. Wait one more second\n"));
 		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 				timeval_current_ofs(1, 0), 
 				ctdb_check_health, ctdb);
 		return;
 	}
 	if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL ||
-	    (ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED &&
+	    ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED) {
-	     ctdb->runstate == CTDB_RUNSTATE_RUNNING)) {
+		skip_monitoring = true;
 		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 				timeval_current_ofs(ctdb->monitor->next_interval, 0), 
 				ctdb_check_health, ctdb);
 		return;
 	}
 	if (ctdb->runstate == CTDB_RUNSTATE_STARTUP) {
 		DEBUG(DEBUG_NOTICE,("Recoveries finished. Running the \"startup\" event.\n"));
 		ret = ctdb_event_script_callback(ctdb, 
 						 ctdb->monitor->monitor_context, ctdb_startup_callback, 
 						 ctdb,
 						 CTDB_EVENT_STARTUP, "%s", "");
 	} else {
 		int i;
 		int skip_monitoring = 0;
 		if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
 			skip_monitoring = 1;
 			DEBUG(DEBUG_ERR,("Skip monitoring during recovery\n"));
 		}
 		for (i=1; i<=NUM_DB_PRIORITIES; i++) {
 			if (ctdb->freeze_handles[i] != NULL) {
-				DEBUG(DEBUG_ERR,("Skip monitoring since databases are frozen\n"));
+				DEBUG(DEBUG_ERR,
-				skip_monitoring = 1;
+				      ("Skip monitoring since databases are frozen\n"));
 				skip_monitoring = true;
 				break;
 			}
 		}
-		if (skip_monitoring != 0) {
+	}
 	if (skip_monitoring) {
 		event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 				timeval_current_ofs(ctdb->monitor->next_interval, 0),
 				ctdb_check_health, ctdb);
 		return;
 		} else {
 			ret = ctdb_event_script_callback(ctdb, 
 					ctdb->monitor->monitor_context, ctdb_health_callback,
 					ctdb,
 					CTDB_EVENT_MONITOR, "%s", "");
 		}
 	}
 	ret = ctdb_event_script_callback(ctdb,
 					 ctdb->monitor->monitor_context,
 					 ctdb_health_callback,
 					 ctdb, CTDB_EVENT_MONITOR, "%s", "");
 	if (ret != 0) {
 		DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n"));
 		ctdb->monitor->next_interval = 5;
@ -412,26 +429,19 @@ void ctdb_stop_monitoring(struct ctdb_context *ctdb)
 /*
  start watching for nodes that might be dead
 */
-void ctdb_start_monitoring(struct ctdb_context *ctdb)
+void ctdb_wait_for_first_recovery(struct ctdb_context *ctdb)
 {
-	if (ctdb->monitor != NULL) {
+	ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);
 		return;
 	}
 	ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
 	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);
 	ctdb->monitor->next_interval = 5;
 	ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
 	CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
 	event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
 			timeval_current_ofs(1, 0),
 			ctdb_wait_until_recovered, ctdb);
 	ctdb->monitor->monitoring_mode  = CTDB_MONITORING_ACTIVE;
 	DEBUG(DEBUG_NOTICE,("Monitoring has been started\n"));
 }