1
0
mirror of https://github.com/samba-team/samba.git synced 2025-01-24 02:04:21 +03:00

add dead node detection so that if a node does not generate any

keepalive traffic for x seconds, it is deemed dead.


This triggers a recovery after a while if a ctdbd has been STOPPED,
but it doesn't recover automatically when the node reappears.

(This used to be ctdb commit d6324afe0d13b5e21d06e347caca433c6b36a32a)
This commit is contained in:
Ronnie Sahlberg 2007-05-18 19:19:35 +10:00
parent d7c8c15d72
commit db4c479568
6 changed files with 51 additions and 1 deletions

View File

@ -30,7 +30,7 @@ CTDB_COMMON_OBJ = common/ctdb.o common/ctdb_daemon.o common/ctdb_client.o \
common/ctdb_call.o common/ctdb_ltdb.o common/ctdb_lockwait.o \
common/ctdb_message.o common/cmdline.o common/ctdb_control.o \
lib/util/debug.o common/ctdb_recover.o common/ctdb_recoverd.o \
common/ctdb_freeze.o common/ctdb_traverse.o
common/ctdb_freeze.o common/ctdb_traverse.o common/ctdb_monitor.o
CTDB_TCP_OBJ = tcp/tcp_connect.o tcp/tcp_io.o tcp/tcp_init.o

View File

@ -127,6 +127,7 @@ static int ctdb_add_node(struct ctdb_context *ctdb, char *nstr)
}
ctdb->num_nodes++;
node->dead_count = 0;
return 0;
}
@ -342,6 +343,11 @@ void ctdb_recv_pkt(struct ctdb_context *ctdb, uint8_t *data, uint32_t length)
ctdb_reply_control(ctdb, hdr);
break;
case CTDB_REQ_KEEPALIVE:
ctdb->status.keepalive_packets_recv++;
ctdb_request_keepalive(ctdb, hdr);
break;
default:
DEBUG(0,("%s: Packet with unknown operation %d\n",
__location__, hdr->operation));

View File

@ -782,3 +782,25 @@ int ctdb_daemon_call_recv(struct ctdb_call_state *state, struct ctdb_call *call)
}
/*
  Build a single keepalive packet and queue it for another node.

  ctdb     - daemon context; its status.keepalive_packets_sent counter
             is incremented for every packet handed to the queue
  mem_ctx  - talloc context the temporary packet is allocated under
  destnode - node number written into the packet header as destination

  The packet is a bare request header (struct ctdb_req_keepalive has no
  payload).  reqid is set to 0; this function registers no request id.
  Allocation failure is handled by CTDB_NO_MEMORY_FATAL.
*/
void ctdb_send_keepalive(struct ctdb_context *ctdb,
			 TALLOC_CTX *mem_ctx,
			 uint32_t destnode)
{
	struct ctdb_req_keepalive *pkt;

	pkt = ctdb_transport_allocate(ctdb, mem_ctx, CTDB_REQ_KEEPALIVE,
				      sizeof(*pkt),
				      struct ctdb_req_keepalive);
	CTDB_NO_MEMORY_FATAL(ctdb, pkt);

	/* fill in the only header fields this function sets */
	pkt->hdr.reqid = 0;
	pkt->hdr.destnode = destnode;

	/* account for the packet, then hand it to the transport queue */
	ctdb->status.keepalive_packets_sent++;
	ctdb_queue_packet(ctdb, &pkt->hdr);

	/* the local packet is released as soon as it has been queued */
	talloc_free(pkt);
}

View File

@ -749,6 +749,9 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork)
fde = event_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
ctdb_accept_client, ctdb);
/* start monitoring for dead nodes */
ctdb_start_monitoring(ctdb);
ctdb_main_loop(ctdb);
return 0;

View File

@ -89,6 +89,10 @@ struct ctdb_node {
uint32_t vnn;
#define NODE_FLAGS_CONNECTED 0x00000001
uint32_t flags;
/* used by the dead node monitoring */
uint32_t dead_count;
uint32_t rx_cnt;
};
/*
@ -143,6 +147,8 @@ struct ctdb_status {
uint32_t client_packets_recv;
uint32_t node_packets_sent;
uint32_t node_packets_recv;
uint32_t keepalive_packets_sent;
uint32_t keepalive_packets_recv;
struct {
uint32_t req_call;
uint32_t reply_call;
@ -302,6 +308,9 @@ struct ctdb_db_context {
the traverse */
#define CTDB_TRAVERSE_TIMEOUT 20
/* timeout between dead-node monitoring events */
#define CTDB_MONITORING_TIMEOUT 5
/* number of consecutive calls from the same node before we give them
the record */
@ -410,6 +419,7 @@ enum ctdb_operation {
CTDB_REQ_FINISHED,
CTDB_REQ_CONTROL,
CTDB_REPLY_CONTROL,
CTDB_REQ_KEEPALIVE,
/* only used on the domain socket */
CTDB_REQ_CONNECT_WAIT = 1000,
@ -533,6 +543,9 @@ struct ctdb_reply_control {
uint8_t data[1];
};
/*
  packet layout used for CTDB_REQ_KEEPALIVE: it carries only the common
  request header and has no payload of its own
*/
struct ctdb_req_keepalive {
struct ctdb_req_header hdr;
};
/* internal prototypes */
void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
@ -697,6 +710,7 @@ void *_ctdb_reqid_find(struct ctdb_context *ctdb, uint32_t reqid, const char *ty
void ctdb_reqid_remove(struct ctdb_context *ctdb, uint32_t reqid);
void ctdb_request_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
void ctdb_request_keepalive(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
void ctdb_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
int ctdb_daemon_send_control(struct ctdb_context *ctdb, uint32_t destnode,
@ -804,4 +818,7 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb);
uint32_t ctdb_get_num_connected_nodes(struct ctdb_context *ctdb);
int ctdb_start_monitoring(struct ctdb_context *ctdb);
void ctdb_send_keepalive(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, uint32_t destnode);
#endif

View File

@ -110,6 +110,8 @@ static void show_status(struct ctdb_status *s)
STATUS_FIELD(client_packets_recv),
STATUS_FIELD(node_packets_sent),
STATUS_FIELD(node_packets_recv),
STATUS_FIELD(keepalive_packets_sent),
STATUS_FIELD(keepalive_packets_recv),
STATUS_FIELD(node.req_call),
STATUS_FIELD(node.reply_call),
STATUS_FIELD(node.req_dmaster),