2025-01-08 21:18:16 +03:00 · 2009-12-07 23:48:57 +10:30 · 2009-12-07 23:48:57 +10:30 · 928b8dcb31
commit 928b8dcb31
parent 5190932507
4 changed files with 40 additions and 41 deletions
--- a/ctdb/server/ctdb_monitor.c
+++ b/ctdb/server/ctdb_monitor.c
@ -123,6 +123,17 @@ static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
 	rddata.dptr = (uint8_t *)&rd;
 	rddata.dsize = sizeof(rd);

+	if (status == -ETIME) {
+		ctdb->event_script_timeouts++;
+
+		if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) {
+			DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_ban_count));
+		} else {
+			/* We pretend this is OK. */
+			status = 0;
+		}
+	}
+
 	if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
 		DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n"));
 		node->flags |= NODE_FLAGS_UNHEALTHY;
--- a/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@ -935,6 +935,9 @@ static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, vo

 	if (status != 0) {
 		DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
+		if (status == -ETIME) {
+			ctdb_ban_self(ctdb);
+		}
 	}

 	ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
@ -1210,6 +1213,9 @@ static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void
 	if (status != 0) {
 		DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
 		ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
+		if (status == -ETIME) {
+			ctdb_ban_self(ctdb);
+		}
 	}

 	ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
--- a/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@ -128,6 +128,9 @@ static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
 	struct ctdb_tcp_array *tcparray;

 	if (status != 0) {
+		if (status == -ETIME) {
+			ctdb_ban_self(ctdb);
+		}
 		DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
 			ctdb_addr_to_str(state->addr),
 			state->vnn->iface));
@ -323,6 +326,10 @@ static void release_ip_callback(struct ctdb_context *ctdb, int status,
 		talloc_get_type(private_data, struct takeover_callback_state);
 	TDB_DATA data;

+	if (status == -ETIME) {
+		ctdb_ban_self(ctdb);
+	}
+
 	/* send a message to all clients of this node telling them
 	   that the cluster has been reconfigured and they should
 	   release any sockets on this IP */
--- a/ctdb/server/eventscript.c
+++ b/ctdb/server/eventscript.c
@ -633,57 +633,24 @@ static void ctdb_event_script_timeout(struct event_context *ev, struct timed_eve
 	DEBUG(DEBUG_ERR,("Event script timed out : %s %s count : %u  pid : %d\n",
 			 call_names[state->call], state->options, ctdb->event_script_timeouts, state->child));

+	state->cb_status = -ETIME;
+
 	if (kill(state->child, 0) != 0) {
 		DEBUG(DEBUG_ERR,("Event script child process already dead, errno %s(%d)\n", strerror(errno), errno));
 		state->child = 0;
-		state->cb_status = -ETIME;
-		talloc_free(state);
-		return;
-	}
-
-	if (state->call == CTDB_EVENT_MONITOR) {
-		/* if it is a monitor event, we allow it to "hang" a few times
-		   before we declare it a failure and ban ourself (and make
-		   ourself unhealthy)
-		*/
-		DEBUG(DEBUG_ERR, (__location__ " eventscript for monitor event timedout.\n"));
-
-		ctdb->event_script_timeouts++;
-
-		if (ctdb->event_script_timeouts > ctdb->tunable.script_ban_count) {
-			DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_ban_count));
-			state->cb_status = -ETIME;
-		} else {
-			state->cb_status = 0;
-		}
-	} else if (state->call == CTDB_EVENT_STARTUP) {
-		DEBUG(DEBUG_ERR, (__location__ " eventscript for startup event timedout.\n"));
-		state->cb_status = -ETIME;
-	} else {
-		/* if it is not a monitor or a startup event we ban ourself
-		   immediately
-		*/
-		DEBUG(DEBUG_ERR, (__location__ " eventscript for NON-monitor/NON-startup event timedout. Immediately banning ourself for %d seconds\n", ctdb->tunable.recovery_ban_period));
-
-		ctdb_ban_self(ctdb);
-
-		state->cb_status = -ETIME;
 	}

 	if (state->call == CTDB_EVENT_MONITOR || state->call == CTDB_EVENT_STATUS) {
 		struct ctdb_monitor_script_status *script;

-		if (ctdb->current_monitor_status_ctx == NULL) {
-			talloc_free(state);
-			return;
-		}
+		if (ctdb->current_monitor_status_ctx != NULL) {
+			script = ctdb->current_monitor_status_ctx->scripts;
+			if (script != NULL) {
+				script->status = state->cb_status;
+			}

-		script = ctdb->current_monitor_status_ctx->scripts;
-		if (script != NULL) {
-			script->status = state->cb_status;
+			ctdb_control_event_script_finished(ctdb);
 		}
-
-		ctdb_control_event_script_finished(ctdb);
 	}

 	talloc_free(state);
@ -930,6 +897,14 @@ int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_eventscript_call

 	while (status.done == false && event_loop_once(ctdb->ev) == 0) /* noop */;

+	if (status.status == -ETIME) {
+		DEBUG(DEBUG_ERR, (__location__ " eventscript for '%s' timedout."
+				  " Immediately banning ourself for %d seconds\n",
+				  call_names[call],
+				  ctdb->tunable.recovery_ban_period));
+		ctdb_ban_self(ctdb);
+	}
+
 	return status.status;
 }