mirror of
https://github.com/samba-team/samba.git
synced 2025-02-02 09:47:23 +03:00
Add a new file <reclock>.pnn in which each recovery daemon locks the single byte at offset == pnn. This offers an alternative way to detect which nodes are active, instead of relying on the CONNECTED flag being accurate.
(This used to be ctdb commit 21d3319eaf463e2a00637d440ee2d4d15f53bf09)
This commit is contained in:
parent
4adeafef11
commit
e0036942bc
@ -2720,3 +2720,38 @@ uint32_t *list_of_active_nodes(struct ctdb_context *ctdb,
|
||||
|
||||
return nodes;
|
||||
}
|
||||
|
||||
/*
|
||||
this is used to test if a pnn lock exists and if it exists will return
|
||||
the number of connections that pnn has reported or -1 if that recovery
|
||||
daemon is not running.
|
||||
*/
|
||||
int
|
||||
ctdb_read_pnn_lock(int fd, int32_t pnn)
|
||||
{
|
||||
struct flock lock;
|
||||
char c;
|
||||
|
||||
lock.l_type = F_WRLCK;
|
||||
lock.l_whence = SEEK_SET;
|
||||
lock.l_start = pnn;
|
||||
lock.l_len = 1;
|
||||
lock.l_pid = 0;
|
||||
|
||||
if (fcntl(fd, F_GETLK, &lock) != 0) {
|
||||
DEBUG(DEBUG_ERR, (__location__ " F_GETLK failed with %s\n", strerror(errno)));
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (lock.l_type == F_UNLCK) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (pread(fd, &c, 1, pnn) == -1) {
|
||||
DEBUG(DEBUG_CRIT,(__location__ " failed read pnn count - %s\n", strerror(errno)));
|
||||
return -1;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
|
@ -517,4 +517,6 @@ uint32_t *list_of_vnnmap_nodes(struct ctdb_context *ctdb,
|
||||
TALLOC_CTX *mem_ctx,
|
||||
bool include_self);
|
||||
|
||||
int ctdb_read_pnn_lock(int fd, int32_t pnn);
|
||||
|
||||
#endif
|
||||
|
@ -41,6 +41,7 @@ struct ban_state {
|
||||
*/
|
||||
struct ctdb_recoverd {
|
||||
struct ctdb_context *ctdb;
|
||||
int rec_file_fd;
|
||||
uint32_t last_culprit;
|
||||
uint32_t culprit_counter;
|
||||
struct timeval first_recover_time;
|
||||
@ -1957,6 +1958,72 @@ static enum monitor_result verify_recmaster(struct ctdb_context *ctdb, struct ct
|
||||
return status;
|
||||
}
|
||||
|
||||
/*
|
||||
this function writes the number of connected nodes we have for this pnn
|
||||
to the pnn slot in the reclock file
|
||||
*/
|
||||
static void
|
||||
ctdb_recoverd_write_pnn_connect_count(struct ctdb_recoverd *rec, const char count)
|
||||
{
|
||||
struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
|
||||
|
||||
if (pwrite(rec->rec_file_fd, &count, 1, ctdb->pnn) == -1) {
|
||||
DEBUG(DEBUG_CRIT, (__location__ " Failed to write pnn count\n"));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
this function opens the reclock file and sets a byterage lock for the single
|
||||
byte at position pnn+1.
|
||||
the existence/non-existence of such a lock provides an alternative mechanism
|
||||
to know whether a remote node(recovery daemon) is running or not.
|
||||
*/
|
||||
static void
|
||||
ctdb_recoverd_get_pnn_lock(struct ctdb_recoverd *rec)
|
||||
{
|
||||
struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
|
||||
struct flock lock;
|
||||
char *pnnfile = NULL;
|
||||
|
||||
DEBUG(DEBUG_INFO, ("Setting PNN lock for pnn:%d\n", ctdb->pnn));
|
||||
|
||||
if (rec->rec_file_fd != -1) {
|
||||
DEBUG(DEBUG_CRIT, (__location__ " rec_lock_fd is already open. Aborting\n"));
|
||||
exit(10);
|
||||
}
|
||||
|
||||
pnnfile = talloc_asprintf(rec, "%s.pnn", ctdb->recovery_lock_file);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, pnnfile);
|
||||
|
||||
rec->rec_file_fd = open(pnnfile, O_RDWR|O_CREAT, 0600);
|
||||
if (rec->rec_file_fd == -1) {
|
||||
DEBUG(DEBUG_CRIT,(__location__ " Unable to open %s - (%s)\n",
|
||||
pnnfile, strerror(errno)));
|
||||
exit(10);
|
||||
}
|
||||
|
||||
set_close_on_exec(rec->rec_file_fd);
|
||||
lock.l_type = F_WRLCK;
|
||||
lock.l_whence = SEEK_SET;
|
||||
lock.l_start = ctdb->pnn;
|
||||
lock.l_len = 1;
|
||||
lock.l_pid = 0;
|
||||
|
||||
if (fcntl(rec->rec_file_fd, F_SETLK, &lock) != 0) {
|
||||
close(rec->rec_file_fd);
|
||||
rec->rec_file_fd = -1;
|
||||
DEBUG(DEBUG_CRIT,(__location__ " Failed to get pnn lock on '%s'\n", pnnfile));
|
||||
exit(10);
|
||||
}
|
||||
|
||||
|
||||
DEBUG(DEBUG_NOTICE,(__location__ " Got pnn lock on '%s'\n", pnnfile));
|
||||
|
||||
talloc_free(pnnfile);
|
||||
|
||||
/* we start out with 0 connected nodes */
|
||||
ctdb_recoverd_write_pnn_connect_count(rec, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
the main monitoring loop
|
||||
@ -1986,6 +2053,10 @@ static void monitor_cluster(struct ctdb_context *ctdb)
|
||||
|
||||
rec->priority_time = timeval_current();
|
||||
|
||||
/* open the rec file fd and lock our slot */
|
||||
rec->rec_file_fd = -1;
|
||||
ctdb_recoverd_get_pnn_lock(rec);
|
||||
|
||||
/* register a message port for recovery elections */
|
||||
ctdb_set_message_handler(ctdb, CTDB_SRVID_RECOVERY, election_handler, rec);
|
||||
|
||||
|
@ -858,8 +858,10 @@ static int control_getdbmap(struct ctdb_context *ctdb, int argc, const char **ar
|
||||
*/
|
||||
static int control_getreclock(struct ctdb_context *ctdb, int argc, const char **argv)
|
||||
{
|
||||
int ret;
|
||||
int i, ret, fd;
|
||||
const char *reclock;
|
||||
struct ctdb_node_map *nodemap=NULL;
|
||||
char *pnnfile;
|
||||
|
||||
ret = ctdb_ctrl_getreclock(ctdb, TIMELIMIT(), options.pnn, ctdb, &reclock);
|
||||
if (ret != 0) {
|
||||
@ -867,7 +869,40 @@ static int control_getreclock(struct ctdb_context *ctdb, int argc, const char **
|
||||
return ret;
|
||||
}
|
||||
|
||||
DEBUG(DEBUG_ERR, ("Reclock file : %s\n", reclock));
|
||||
ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, ctdb, &nodemap);
|
||||
if (ret != 0) {
|
||||
DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
pnnfile = talloc_asprintf(ctdb, "%s.pnn", reclock);
|
||||
CTDB_NO_MEMORY(ctdb, pnnfile);
|
||||
|
||||
fd = open(pnnfile, O_RDONLY);
|
||||
if (fd == -1) {
|
||||
DEBUG(DEBUG_CRIT,(__location__ " Failed to open reclock pnn file %s - (%s)\n",
|
||||
pnnfile, strerror(errno)));
|
||||
exit(10);
|
||||
}
|
||||
|
||||
|
||||
printf("Reclock file : %s\n", reclock);
|
||||
for (i=0; i<nodemap->num; i++) {
|
||||
int count;
|
||||
|
||||
count = ctdb_read_pnn_lock(fd, nodemap->nodes[i].pnn);
|
||||
|
||||
printf("pnn:%d %-16s", nodemap->nodes[i].pnn,
|
||||
inet_ntoa(nodemap->nodes[i].sin.sin_addr));
|
||||
if (count == -1) {
|
||||
printf(" NOT ACTIVE\n");
|
||||
} else {
|
||||
printf(" ACTIVE with %d connections\n", count);
|
||||
}
|
||||
}
|
||||
|
||||
close(fd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user