From 2b7e4013cd4a323c958dc6b0966431551adc6dfd Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Wed, 13 Apr 2005 13:50:07 +0000 Subject: [PATCH] Make clvmd work around some "limitations" in gulm's node state notifications. Also make clvmd debuglog timestamps a little more helpful. --- WHATS_NEW | 1 + daemons/clvmd/clvmd-gulm.c | 58 +++++++++++++++++++++++++++----------- daemons/clvmd/clvmd.c | 27 +++++++++++++++--- daemons/clvmd/clvmd.h | 3 +- daemons/clvmd/tcp-comms.c | 2 ++ 5 files changed, 69 insertions(+), 22 deletions(-) diff --git a/WHATS_NEW b/WHATS_NEW index 090eb3d7d..e1f1d5d40 100644 --- a/WHATS_NEW +++ b/WHATS_NEW @@ -7,6 +7,7 @@ Version 2.01.10 - Scan ramdisks too and allow non-O_DIRECT fallback. Annotate, tidy and extend list.h. Alignment tidying. + Make clvmd work around some "bugs" in gulm's node state notifications. Version 2.01.09 - 4th April 2005 ================================ diff --git a/daemons/clvmd/clvmd-gulm.c b/daemons/clvmd/clvmd-gulm.c index e574f38b3..ac2af654e 100644 --- a/daemons/clvmd/clvmd-gulm.c +++ b/daemons/clvmd/clvmd-gulm.c @@ -60,8 +60,9 @@ static struct hash_table *node_hash; /* hash list of outstanding lock requests */ static struct hash_table *lock_hash; -/* Copy of the current core state */ -static uint8_t current_corestate; +/* Copy of the current quorate state */ +static uint8_t gulm_quorate = 0; +static enum {INIT_NOTDONE, INIT_DONE, INIT_WAITQUORATE} init_state = INIT_NOTDONE; /* Number of active nodes */ static int num_nodes; @@ -312,12 +313,16 @@ static int core_login_reply(void *misc, uint64_t gen, uint32_t error, uint32_t r if (error) exit(error); - current_corestate = corestate; + /* Get the current core state (for quorum) */ + lg_core_corestate(gulm_if); + return 0; } static void set_node_state(struct node_info *ninfo, char *csid, uint8_t nodestate) { + int oldstate = ninfo->state; + if (nodestate == lg_core_Logged_in) { /* Don't clobber NODE_CLVMD state */ @@ -339,11 +344,17 @@ static void 
set_node_state(struct node_info *ninfo, char *csid, uint8_t nodestat if (ninfo->state != NODE_DOWN) num_nodes--; ninfo->state = NODE_DOWN; - tcp_remove_client(csid); } } - DEBUGLOG("set_node_state, '%s' state = %d, num_nodes=%d\n", - ninfo->name, ninfo->state, num_nodes); + /* Gulm doesn't always send node DOWN events, so even if this is a node UP we must + * assume (ahem) that it previously went down at some time. So we close + * the sockets here to make sure that we don't have any dead connections + * to that node. + */ + tcp_remove_client(csid); + + DEBUGLOG("set_node_state, '%s' state = %d (oldstate=%d), num_nodes=%d\n", + ninfo->name, ninfo->state, oldstate, num_nodes); } static struct node_info *add_or_set_node(char *name, struct in6_addr *ip, uint8_t state) @@ -400,7 +411,16 @@ static int core_nodelist(void *misc, lglcb_t type, char *name, struct in6_addr * char ourcsid[GULM_MAX_CSID_LEN]; DEBUGLOG("Got Nodelist, stop\n"); - clvmd_cluster_init_completed(); + if (gulm_quorate) + { + clvmd_cluster_init_completed(); + init_state = INIT_DONE; + } + else + { + if (init_state == INIT_NOTDONE) + init_state = INIT_WAITQUORATE; + } /* Mark ourself as up */ _get_our_csid(ourcsid); @@ -418,10 +438,15 @@ static int core_nodelist(void *misc, lglcb_t type, char *name, struct in6_addr * static int core_statechange(void *misc, uint8_t corestate, uint8_t quorate, struct in6_addr *masterip, char *mastername) { - DEBUGLOG("CORE Got statechange corestate:%#x mastername:%s\n", - corestate, mastername); + DEBUGLOG("CORE Got statechange. 
quorate:%d, corestate:%x mastername:%s\n", + quorate, corestate, mastername); - current_corestate = corestate; + gulm_quorate = quorate; + if (quorate && init_state == INIT_WAITQUORATE) + { + clvmd_cluster_init_completed(); + init_state = INIT_DONE; + } return 0; } @@ -474,7 +499,7 @@ static int lock_login_reply(void *misc, uint32_t error, uint8_t which) lock_start_flag = 0; pthread_mutex_unlock(&lock_start_mutex); } - + return 0; } @@ -615,7 +640,11 @@ void gulm_add_up_node(char *csid) } DEBUGLOG("gulm_add_up_node %s\n", ninfo->name); + + if (ninfo->state == NODE_DOWN) + num_nodes++; ninfo->state = NODE_CLVMD; + return; } @@ -853,12 +882,7 @@ static int _sync_unlock(const char *resource, int lockid) static int _is_quorate() { - if (current_corestate == lg_core_Slave || - current_corestate == lg_core_Master || - current_corestate == lg_core_Client) - return 1; - else - return 0; + return gulm_quorate; } /* Get all the cluster node names & IPs from CCS and diff --git a/daemons/clvmd/clvmd.c b/daemons/clvmd/clvmd.c index c615ea43f..3b6600599 100644 --- a/daemons/clvmd/clvmd.c +++ b/daemons/clvmd/clvmd.c @@ -280,6 +280,7 @@ int main(int argc, char *argv[]) child_init_signal(DFAIL_MALLOC); newfd->fd = local_sock; + newfd->removeme = 0; newfd->type = LOCAL_RENDEZVOUS; newfd->callback = local_rendezvous_callback; newfd->next = local_client_head.next; @@ -346,6 +347,7 @@ static int local_rendezvous_callback(struct local_client *thisfd, char *buf, newfd->fd = client_fd; newfd->type = LOCAL_SOCK; newfd->xid = 0; + newfd->removeme = 0; newfd->callback = local_sock_callback; newfd->bits.localsock.replies = NULL; newfd->bits.localsock.expected_replies = 0; @@ -519,6 +521,20 @@ static void main_loop(int local_sock, int cmd_timeout) for (thisfd = &local_client_head; thisfd != NULL; thisfd = thisfd->next) { + + if (thisfd->removeme) { + struct local_client *free_fd; + lastfd->next = thisfd->next; + free_fd = thisfd; + thisfd = lastfd; + + DEBUGLOG("removeme set for fd %d\n", 
free_fd->fd); + + /* Queue cleanup, this also frees the client struct */ + add_to_lvmqueue(free_fd, NULL, 0, NULL); + break; + } + if (FD_ISSET(thisfd->fd, &in)) { struct local_client *newfd; int ret; @@ -905,6 +921,7 @@ static int read_from_local_sock(struct local_client *thisfd) DEBUGLOG("creating pipe, [%d, %d]\n", comms_pipe[0], comms_pipe[1]); newfd->fd = comms_pipe[0]; + newfd->removeme = 0; newfd->type = THREAD_PIPE; newfd->callback = local_pipe_callback; newfd->next = thisfd->next; @@ -1061,8 +1078,8 @@ void process_remote_command(struct clvm_header *msg, int msglen, int fd, /* Get the node name as we /may/ need it later */ clops->name_from_csid(csid, nodename); - DEBUGLOG("process_remote_command %d for clientid 0x%x on node %s\n", - msg->cmd, msg->clientid, nodename); + DEBUGLOG("process_remote_command %d for clientid 0x%x XID %d on node %s\n", + msg->cmd, msg->clientid, msg->xid, nodename); /* Is the data to be found in the system LV ? */ if (msg->flags & CLVMD_FLAG_SYSTEMLV) { @@ -1575,9 +1592,10 @@ static int send_message(void *buf, int msglen, char *csid, int fd, static int process_work_item(struct lvm_thread_cmd *cmd) { - /* If msg is NULL then this is a cleanup request */ if (cmd->msg == NULL) { + DEBUGLOG("process_work_item: free fd %d\n", cmd->client->fd); + close(cmd->client->fd); cmd_client_cleanup(cmd->client); free(cmd->client); return 0; @@ -1638,7 +1656,8 @@ static void *lvm_thread_fn(void *arg) pthread_mutex_unlock(&lvm_thread_mutex); process_work_item(cmd); - free(cmd->msg); + if (cmd->msg) + free(cmd->msg); free(cmd); pthread_mutex_lock(&lvm_thread_mutex); diff --git a/daemons/clvmd/clvmd.h b/daemons/clvmd/clvmd.h index e4203552a..12e421707 100644 --- a/daemons/clvmd/clvmd.h +++ b/daemons/clvmd/clvmd.h @@ -86,6 +86,7 @@ struct local_client { struct local_client *next; unsigned short xid; fd_callback_t callback; + uint8_t removeme; union { struct localsock_bits localsock; @@ -95,7 +96,7 @@ struct local_client { }; #ifdef DEBUG -#define 
DEBUGLOG(fmt, args...) fprintf(stderr, "CLVMD[%x]: %ld ", (int)pthread_self(), time(NULL) ); fprintf(stderr, fmt, ## args) +#define DEBUGLOG(fmt, args...) {time_t P; time(&P); fprintf(stderr, "CLVMD[%x]: %.15s ", (int)pthread_self(), ctime(&P)+4 ); fprintf(stderr, fmt, ## args);} #else #define DEBUGLOG(fmt, args...) #endif diff --git a/daemons/clvmd/tcp-comms.c b/daemons/clvmd/tcp-comms.c index 3b1aa1ac2..8dcee67cb 100644 --- a/daemons/clvmd/tcp-comms.c +++ b/daemons/clvmd/tcp-comms.c @@ -105,6 +105,7 @@ void tcp_remove_client(char *csid) if (client) { hash_remove_binary(sock_hash, csid, GULM_MAX_CSID_LEN); + client->removeme = 1; } /* Look for a mangled one too */ @@ -114,6 +115,7 @@ void tcp_remove_client(char *csid) if (client) { hash_remove_binary(sock_hash, csid, GULM_MAX_CSID_LEN); + client->removeme = 1; } /* Put it back as we found it */