mirror of
https://github.com/samba-team/samba.git
synced 2024-12-22 13:34:15 +03:00
additional monitoring between the two daemons.
We currently only monitor that the daemons are running by kill(0, pid) and by verifying that the domain socket between them is ok. This is not sufficient, since we can have a situation where the recovery daemon is hung. This new code monitors that the recovery daemon is operating. If the recovery daemon hangs, we log this and shut down the main daemon. (This used to be ctdb commit cd69d292292eaab3aac0e9d9fc57cb621597c63c)
This commit is contained in:
parent
7a78a78a1c
commit
6474f3278d
@ -3280,3 +3280,21 @@ again:
|
||||
talloc_free(h);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
recovery daemon ping to main daemon
|
||||
*/
|
||||
int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb)
|
||||
{
|
||||
int ret;
|
||||
int32_t res;
|
||||
|
||||
ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_PING, 0, tdb_null,
|
||||
ctdb, NULL, &res, NULL, NULL);
|
||||
if (ret != 0 || res != 0) {
|
||||
DEBUG(DEBUG_ERR,("Failed to send recd ping\n"));
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -566,4 +566,6 @@ int ctdb_transaction_store(struct ctdb_transaction_handle *h,
|
||||
TDB_DATA key, TDB_DATA data);
|
||||
int ctdb_transaction_commit(struct ctdb_transaction_handle *h);
|
||||
|
||||
int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb);
|
||||
|
||||
#endif
|
||||
|
@ -114,6 +114,7 @@ struct ctdb_tunable {
|
||||
uint32_t reclock_ping_period;
|
||||
uint32_t no_ip_failback;
|
||||
uint32_t verbose_memory_names;
|
||||
uint32_t recd_ping_timeout;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -417,6 +418,7 @@ struct ctdb_context {
|
||||
int start_as_disabled;
|
||||
uint32_t event_script_timeouts; /* counting how many consecutive times an eventscript has timedout */
|
||||
TALLOC_CTX *eventscripts_ctx; /* a context to hold data for the RUN_EVENTSCRIPTS control */
|
||||
TALLOC_CTX *recd_ping_ctx;
|
||||
};
|
||||
|
||||
struct ctdb_db_context {
|
||||
@ -550,6 +552,7 @@ enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS = 0,
|
||||
CTDB_CONTROL_TRANS2_FINISHED = 84,
|
||||
CTDB_CONTROL_TRANS2_ERROR = 85,
|
||||
CTDB_CONTROL_TRANS2_COMMIT_RETRY = 86,
|
||||
CTDB_CONTROL_RECD_PING = 87,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -1378,5 +1381,6 @@ int32_t ctdb_control_trans2_error(struct ctdb_context *ctdb,
|
||||
char *ctdb_addr_to_str(ctdb_sock_addr *addr);
|
||||
void ctdb_canonicalize_ip(const ctdb_sock_addr *ip, ctdb_sock_addr *cip);
|
||||
|
||||
int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb);
|
||||
|
||||
#endif
|
||||
|
@ -406,6 +406,10 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb,
|
||||
case CTDB_CONTROL_TRANS2_FINISHED:
|
||||
return ctdb_control_trans2_finished(ctdb, c);
|
||||
|
||||
case CTDB_CONTROL_RECD_PING:
|
||||
CHECK_CONTROL_DATA_SIZE(0);
|
||||
return ctdb_control_recd_ping(ctdb);
|
||||
|
||||
default:
|
||||
DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
|
||||
return -1;
|
||||
|
@ -103,6 +103,9 @@ static void ctdb_start_transport(struct ctdb_context *ctdb)
|
||||
|
||||
/* start periodic update of tcp tickle lists */
|
||||
ctdb_start_tcp_tickle_update(ctdb);
|
||||
|
||||
/* start listening for recovery daemon pings */
|
||||
ctdb_control_recd_ping(ctdb);
|
||||
}
|
||||
|
||||
static void block_signal(int signum)
|
||||
|
@ -971,3 +971,41 @@ int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outda
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
|
||||
{
|
||||
struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
|
||||
|
||||
DEBUG(DEBUG_ERR, (__location__ " Recovery daemon ping timeout. Shutting down ctdb daemon\n"));
|
||||
|
||||
ctdb_stop_recoverd(ctdb);
|
||||
ctdb_stop_keepalive(ctdb);
|
||||
ctdb_stop_monitoring(ctdb);
|
||||
ctdb_release_all_ips(ctdb);
|
||||
if (ctdb->methods != NULL) {
|
||||
ctdb->methods->shutdown(ctdb);
|
||||
}
|
||||
ctdb_event_script(ctdb, "shutdown");
|
||||
DEBUG(DEBUG_ERR, (__location__ " Recovery daemon ping timeout. Daemon has been shut down.\n"));
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/* The recovery daemon will ping us at regular intervals.
|
||||
If we havent been pinged for a while we assume the recovery
|
||||
daemon is inoperable and we shut down.
|
||||
*/
|
||||
int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
|
||||
{
|
||||
talloc_free(ctdb->recd_ping_ctx);
|
||||
|
||||
ctdb->recd_ping_ctx = talloc_new(ctdb);
|
||||
CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_ctx);
|
||||
|
||||
if (ctdb->tunable.recd_ping_timeout != 0) {
|
||||
event_add_timed(ctdb->ev, ctdb->recd_ping_ctx,
|
||||
timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
|
||||
ctdb_recd_ping_timeout, ctdb);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -2317,6 +2317,9 @@ again:
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
/* ping the local daemon to tell it we are alive */
|
||||
ctdb_ctrl_recd_ping(ctdb);
|
||||
|
||||
if (rec->election_timeout) {
|
||||
/* an election is in progress */
|
||||
goto again;
|
||||
|
@ -50,6 +50,7 @@ static const struct {
|
||||
{ "ReclockPingPeriod", 60, offsetof(struct ctdb_tunable, reclock_ping_period) },
|
||||
{ "NoIPFailback", 0, offsetof(struct ctdb_tunable, no_ip_failback) },
|
||||
{ "VerboseMemoryNames", 0, offsetof(struct ctdb_tunable, verbose_memory_names) },
|
||||
{ "RecdPingTimeout", 60, offsetof(struct ctdb_tunable, recd_ping_timeout) },
|
||||
};
|
||||
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user