mirror of
https://github.com/samba-team/samba.git
synced 2025-01-24 02:04:21 +03:00
add dead node detection so that if a node does not generate any
keepalive traffic for x seconds it is deemed dead. This triggers a recovery after a while if a ctdbd has been STOPPED, but it doesn't recover automatically when the node reappears. (This used to be ctdb commit d6324afe0d13b5e21d06e347caca433c6b36a32a)
This commit is contained in:
parent
d7c8c15d72
commit
db4c479568
@ -30,7 +30,7 @@ CTDB_COMMON_OBJ = common/ctdb.o common/ctdb_daemon.o common/ctdb_client.o \
|
||||
common/ctdb_call.o common/ctdb_ltdb.o common/ctdb_lockwait.o \
|
||||
common/ctdb_message.o common/cmdline.o common/ctdb_control.o \
|
||||
lib/util/debug.o common/ctdb_recover.o common/ctdb_recoverd.o \
|
||||
common/ctdb_freeze.o common/ctdb_traverse.o
|
||||
common/ctdb_freeze.o common/ctdb_traverse.o common/ctdb_monitor.o
|
||||
|
||||
CTDB_TCP_OBJ = tcp/tcp_connect.o tcp/tcp_io.o tcp/tcp_init.o
|
||||
|
||||
|
@ -127,6 +127,7 @@ static int ctdb_add_node(struct ctdb_context *ctdb, char *nstr)
|
||||
}
|
||||
|
||||
ctdb->num_nodes++;
|
||||
node->dead_count = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -342,6 +343,11 @@ void ctdb_recv_pkt(struct ctdb_context *ctdb, uint8_t *data, uint32_t length)
|
||||
ctdb_reply_control(ctdb, hdr);
|
||||
break;
|
||||
|
||||
case CTDB_REQ_KEEPALIVE:
|
||||
ctdb->status.keepalive_packets_recv++;
|
||||
ctdb_request_keepalive(ctdb, hdr);
|
||||
break;
|
||||
|
||||
default:
|
||||
DEBUG(0,("%s: Packet with unknown operation %d\n",
|
||||
__location__, hdr->operation));
|
||||
|
@ -782,3 +782,25 @@ int ctdb_daemon_call_recv(struct ctdb_call_state *state, struct ctdb_call *call)
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
send a keepalive packet to the other node
|
||||
*/
|
||||
void ctdb_send_keepalive(struct ctdb_context *ctdb,
|
||||
TALLOC_CTX *mem_ctx,
|
||||
uint32_t destnode)
|
||||
{
|
||||
struct ctdb_req_keepalive *r;
|
||||
|
||||
r = ctdb_transport_allocate(ctdb, mem_ctx, CTDB_REQ_KEEPALIVE,
|
||||
sizeof(struct ctdb_req_keepalive),
|
||||
struct ctdb_req_keepalive);
|
||||
CTDB_NO_MEMORY_FATAL(ctdb, r);
|
||||
r->hdr.destnode = destnode;
|
||||
r->hdr.reqid = 0;
|
||||
|
||||
ctdb->status.keepalive_packets_sent++;
|
||||
|
||||
ctdb_queue_packet(ctdb, &r->hdr);
|
||||
|
||||
talloc_free(r);
|
||||
}
|
||||
|
@ -749,6 +749,9 @@ int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork)
|
||||
fde = event_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
|
||||
ctdb_accept_client, ctdb);
|
||||
|
||||
/* start monitoring for dead nodes */
|
||||
ctdb_start_monitoring(ctdb);
|
||||
|
||||
ctdb_main_loop(ctdb);
|
||||
|
||||
return 0;
|
||||
|
@ -89,6 +89,10 @@ struct ctdb_node {
|
||||
uint32_t vnn;
|
||||
#define NODE_FLAGS_CONNECTED 0x00000001
|
||||
uint32_t flags;
|
||||
|
||||
/* used by the dead node monitoring */
|
||||
uint32_t dead_count;
|
||||
uint32_t rx_cnt;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -143,6 +147,8 @@ struct ctdb_status {
|
||||
uint32_t client_packets_recv;
|
||||
uint32_t node_packets_sent;
|
||||
uint32_t node_packets_recv;
|
||||
uint32_t keepalive_packets_sent;
|
||||
uint32_t keepalive_packets_recv;
|
||||
struct {
|
||||
uint32_t req_call;
|
||||
uint32_t reply_call;
|
||||
@ -302,6 +308,9 @@ struct ctdb_db_context {
|
||||
the traverse */
|
||||
#define CTDB_TRAVERSE_TIMEOUT 20
|
||||
|
||||
/* timeout between dead-node monitoring events */
|
||||
#define CTDB_MONITORING_TIMEOUT 5
|
||||
|
||||
|
||||
/* number of consecutive calls from the same node before we give them
|
||||
the record */
|
||||
@ -410,6 +419,7 @@ enum ctdb_operation {
|
||||
CTDB_REQ_FINISHED,
|
||||
CTDB_REQ_CONTROL,
|
||||
CTDB_REPLY_CONTROL,
|
||||
CTDB_REQ_KEEPALIVE,
|
||||
|
||||
/* only used on the domain socket */
|
||||
CTDB_REQ_CONNECT_WAIT = 1000,
|
||||
@ -533,6 +543,9 @@ struct ctdb_reply_control {
|
||||
uint8_t data[1];
|
||||
};
|
||||
|
||||
/*
  Packet type allocated by ctdb_send_keepalive() for the
  CTDB_REQ_KEEPALIVE operation. It holds nothing beyond the common
  request header.
*/
struct ctdb_req_keepalive {
|
||||
struct ctdb_req_header hdr;
|
||||
};
|
||||
|
||||
/* internal prototypes */
|
||||
void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
|
||||
@ -697,6 +710,7 @@ void *_ctdb_reqid_find(struct ctdb_context *ctdb, uint32_t reqid, const char *ty
|
||||
void ctdb_reqid_remove(struct ctdb_context *ctdb, uint32_t reqid);
|
||||
|
||||
void ctdb_request_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
|
||||
void ctdb_request_keepalive(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
|
||||
void ctdb_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
|
||||
|
||||
int ctdb_daemon_send_control(struct ctdb_context *ctdb, uint32_t destnode,
|
||||
@ -804,4 +818,7 @@ int ctdb_start_recoverd(struct ctdb_context *ctdb);
|
||||
|
||||
uint32_t ctdb_get_num_connected_nodes(struct ctdb_context *ctdb);
|
||||
|
||||
int ctdb_start_monitoring(struct ctdb_context *ctdb);
|
||||
void ctdb_send_keepalive(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, uint32_t destnode);
|
||||
|
||||
#endif
|
||||
|
@ -110,6 +110,8 @@ static void show_status(struct ctdb_status *s)
|
||||
STATUS_FIELD(client_packets_recv),
|
||||
STATUS_FIELD(node_packets_sent),
|
||||
STATUS_FIELD(node_packets_recv),
|
||||
STATUS_FIELD(keepalive_packets_sent),
|
||||
STATUS_FIELD(keepalive_packets_recv),
|
||||
STATUS_FIELD(node.req_call),
|
||||
STATUS_FIELD(node.reply_call),
|
||||
STATUS_FIELD(node.req_dmaster),
|
||||
|
Loading…
x
Reference in New Issue
Block a user