mirror of https://github.com/samba-team/samba.git
better timeout handling for calls, controls and traverses
(This used to be ctdb commit 63346a6c59d4821b4c443939b5d88db8cd20f5fe)
commit 15bc97cdaa
parent 31cd92dc7e
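The change replaces the single, disabled CTDB_REQ_TIMEOUT (it was defined to 0, so the #if CTDB_REQ_TIMEOUT blocks compiled to nothing) with three per-operation timeouts and adds matching counters to ctdb_status. The shape is the same in each case: when the request is queued, a timed event is registered with event_add_timed(); when it fires, the relevant timeouts counter is bumped and the operation is failed (controls), cut short (traverses) or re-armed and possibly re-issued (calls). Below is a minimal standalone sketch of that expiry-handling shape; the types and names are invented for illustration and are not the ctdb or Samba event API.

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>
    #include <time.h>

    /* Hypothetical stand-ins; the real code uses Samba's event_add_timed()
       and timeval_current_ofs(), and counts into ctdb_status.timeouts. */
    struct req {
        uint32_t reqid;
        time_t   deadline;   /* absolute expiry time */
        int      failed;
    };

    struct counters {
        uint32_t timeouts;
    };

    /* What a timeout handler does in outline: count the event, then either
       give the request more time or report failure to the caller. */
    static void on_timeout(struct req *r, struct counters *c, bool retryable)
    {
        c->timeouts++;
        if (retryable) {
            r->deadline = time(NULL) + 2;   /* re-arm, like the call path */
            return;
        }
        r->failed = 1;                      /* fail, like the control path */
    }

    int main(void)
    {
        struct counters c = { 0 };
        struct req r = { .reqid = 1, .deadline = time(NULL) + 2, .failed = 0 };

        on_timeout(&r, &c, true);    /* pretend the timer fired while retryable */
        on_timeout(&r, &c, false);   /* and again once it is not */
        printf("timeouts counted: %u, failed: %d\n", c.timeouts, r.failed);
        return 0;
    }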
@@ -613,17 +613,36 @@ static int ctdb_call_destructor(struct ctdb_call_state *state)
 /*
   called when a ctdb_call times out
 */
-void ctdb_call_timeout(struct event_context *ev, struct timed_event *te,
-                       struct timeval t, void *private_data)
+static void ctdb_call_timeout(struct event_context *ev, struct timed_event *te,
+                              struct timeval t, void *private_data)
 {
         struct ctdb_call_state *state = talloc_get_type(private_data, struct ctdb_call_state);
-        DEBUG(0,(__location__ " call timeout for reqid %d\n", state->c->hdr.reqid));
-        state->state = CTDB_CALL_ERROR;
-        ctdb_set_error(state->ctdb_db->ctdb, "ctdb_call %u timed out",
-                       state->c->hdr.reqid);
-        if (state->async.fn) {
-                state->async.fn(state);
+        struct ctdb_context *ctdb = state->ctdb_db->ctdb;
+
+        ctdb->status.timeouts.call++;
+
+        event_add_timed(ctdb->ev, state, timeval_current_ofs(CTDB_CALL_TIMEOUT, 0),
+                        ctdb_call_timeout, state);
+
+        if (ctdb->vnn_map->generation == state->generation ||
+            ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+                /* the call is just being slow, or we are curently
+                   recovering, give it more time */
+                return;
         }
+
+        /* the generation count changed - the call must be re-issued */
+        state->generation = ctdb->vnn_map->generation;
+
+        /* use a new reqid, in case the old reply does eventually come in */
+        ctdb_reqid_remove(ctdb, state->reqid);
+        state->reqid = ctdb_reqid_new(ctdb, state);
+        state->c->hdr.reqid = state->reqid;
+
+        /* send the packet to ourselves, it will be redirected appropriately */
+        state->c->hdr.destnode = ctdb->vnn;
+
+        ctdb_queue_packet(ctdb, &state->c->hdr);
 }
 
 /*
@@ -697,15 +716,6 @@ struct ctdb_call_state *ctdb_daemon_call_send_remote(struct ctdb_db_context *ctd
         CTDB_NO_MEMORY_NULL(ctdb, state->c);
         state->c->hdr.destnode = header->dmaster;
 
-#if 0
-        /*always sending the remote call straight to the lmaster
-          improved performance slightly in some tests.
-          worth investigating further in the future
-        */
-        state->c->hdr.destnode = ctdb_lmaster(ctdb_db->ctdb, &(call->key));
-#endif
-
-
         /* this limits us to 16k outstanding messages - not unreasonable */
         state->c->hdr.reqid = state->reqid;
         state->c->flags = call->flags;
@@ -723,13 +733,12 @@ struct ctdb_call_state *ctdb_daemon_call_send_remote(struct ctdb_db_context *ctd
 
         state->state = CTDB_CALL_WAIT;
         state->ctdb_db = ctdb_db;
+        state->generation = ctdb->vnn_map->generation;
 
         ctdb_queue_packet(ctdb, &state->c->hdr);
 
-#if CTDB_REQ_TIMEOUT
-        event_add_timed(ctdb->ev, state, timeval_current_ofs(CTDB_REQ_TIMEOUT, 0),
+        event_add_timed(ctdb->ev, state, timeval_current_ofs(CTDB_CALL_TIMEOUT, 0),
                         ctdb_call_timeout, state);
-#endif
         return state;
 }
 
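The notable part of the new call path above is that a timeout is no longer treated as an error: the handler re-arms itself every CTDB_CALL_TIMEOUT seconds, leaves the call alone while the vnn_map generation is unchanged or a recovery is running, and only re-issues it, under a fresh reqid, once the generation has changed, so a late reply to the old reqid cannot be mistaken for the new attempt. A simplified, hypothetical model of just that decision (not the real ctdb structures):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical, simplified model of the re-issue decision in the new
       ctdb_call_timeout(); names and types here are illustrative only. */
    struct call_state {
        uint32_t reqid;
        uint32_t generation;   /* cluster generation when the call was last sent */
    };

    static uint32_t next_reqid = 1000;

    /* Returns true if the call was re-issued, false if it was left to wait. */
    static bool on_call_timeout(struct call_state *call,
                                uint32_t current_generation,
                                bool in_recovery)
    {
        if (current_generation == call->generation || in_recovery) {
            /* the call is just slow, or a recovery is in progress: give it
               more time (the real code re-arms the timer first) */
            return false;
        }

        /* the generation changed: re-issue under a fresh reqid so a late
           reply to the old id cannot be mistaken for the new attempt */
        call->generation = current_generation;
        call->reqid      = next_reqid++;
        return true;
    }

    int main(void)
    {
        struct call_state call = { .reqid = 1, .generation = 5 };

        printf("same generation -> re-issued? %d\n", on_call_timeout(&call, 5, false));
        printf("new generation  -> re-issued? %d\n", on_call_timeout(&call, 6, false));
        printf("call now has reqid %u, generation %u\n", call.reqid, call.generation);
        return 0;
    }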
@@ -531,6 +531,21 @@ static int ctdb_control_destructor(struct ctdb_control_state *state)
         return 0;
 }
 
+/*
+  handle a timeout of a control
+ */
+static void ctdb_control_timeout(struct event_context *ev, struct timed_event *te,
+                                 struct timeval t, void *private_data)
+{
+        struct ctdb_control_state *state = talloc_get_type(private_data, struct ctdb_control_state);
+
+        state->ctdb->status.timeouts.control++;
+
+        state->callback(state->ctdb, -1, tdb_null, state->private_data);
+        talloc_free(state);
+}
+
+
 /*
   send a control message to a node
 */
@@ -586,8 +601,8 @@ int ctdb_daemon_send_control(struct ctdb_context *ctdb, uint32_t destnode,
                 return 0;
         }
 
-#if CTDB_REQ_TIMEOUT
-        event_add_timed(ctdb->ev, state, timeval_current_ofs(CTDB_REQ_TIMEOUT, 0),
+#if CTDB_CONTROL_TIMEOUT
+        event_add_timed(ctdb->ev, state, timeval_current_ofs(CTDB_CONTROL_TIMEOUT, 0),
                         ctdb_control_timeout, state);
 #endif
 
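Controls, by contrast, are not retried: when CTDB_CONTROL_TIMEOUT expires, the handler counts the timeout, reports failure to the caller by invoking its callback with a -1 status and empty data, and frees the request state. A rough standalone sketch of that shape, with plain malloc/free standing in for talloc and an int status in place of the real callback signature:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    /* Hypothetical, simplified model of the control-timeout path; the real
       code passes tdb_null and uses talloc, neither reproduced here. */
    typedef void (*control_cb)(int32_t status, void *private_data);

    struct control_state {
        control_cb callback;
        void *private_data;
    };

    static uint32_t control_timeouts;

    static void control_timeout(struct control_state *state)
    {
        control_timeouts++;                       /* status.timeouts.control++ */
        state->callback(-1, state->private_data); /* report failure to the caller */
        free(state);                              /* the real code talloc_free()s */
    }

    static void print_status(int32_t status, void *private_data)
    {
        (void)private_data;
        printf("control completed with status %d\n", status);
    }

    int main(void)
    {
        struct control_state *state = malloc(sizeof(*state));
        if (state == NULL) {
            return 1;
        }
        state->callback = print_status;
        state->private_data = NULL;
        control_timeout(state);                   /* pretend the timer fired */
        printf("control timeouts counted: %u\n", control_timeouts);
        return 0;
    }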
@@ -147,8 +147,6 @@ struct ctdb_traverse_local_handle *ctdb_traverse_local(struct ctdb_db_context *c
         struct ctdb_traverse_local_handle *h;
         int ret;
 
-        ctdb_db->ctdb->status.traverse_calls++;
-
         h = talloc_zero(ctdb_db, struct ctdb_traverse_local_handle);
         if (h == NULL) {
                 return NULL;
@@ -223,6 +221,18 @@ struct ctdb_traverse_all {
         uint32_t vnn;
 };
 
+/* called when a traverse times out */
+static void ctdb_traverse_all_timeout(struct event_context *ev, struct timed_event *te,
+                                      struct timeval t, void *private_data)
+{
+        struct ctdb_traverse_all_handle *state = talloc_get_type(private_data, struct ctdb_traverse_all_handle);
+
+        state->ctdb->status.timeouts.traverse++;
+
+        state->callback(state->private_data, tdb_null, tdb_null);
+        talloc_free(state);
+}
+
 /*
   setup a cluster-wide non-blocking traverse of a ctdb. The
   callback function will be called on every record in the local
@@ -269,6 +279,10 @@ struct ctdb_traverse_all_handle *ctdb_daemon_traverse_all(struct ctdb_db_context
                 return NULL;
         }
 
+        /* timeout the traverse */
+        event_add_timed(ctdb->ev, state, timeval_current_ofs(CTDB_TRAVERSE_TIMEOUT, 0),
+                        ctdb_traverse_all_timeout, state);
+
         return state;
 }
 
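A traverse that runs past CTDB_TRAVERSE_TIMEOUT is cut short rather than reported as an error: the handler counts the timeout and calls the traverse callback with empty key and data (tdb_null), presumably signalling the end of the traverse to the consumer, then frees the handle. A toy illustration of stopping a long iteration at a deadline, using hypothetical types:

    #include <stdio.h>
    #include <time.h>

    /* Hypothetical record-visit callback; a NULL key here plays the role
       that tdb_null plays in ctdb: "the traverse is finished". */
    typedef void (*visit_fn)(const char *key, const char *data, void *private_data);

    static void print_record(const char *key, const char *data, void *private_data)
    {
        int *count = private_data;
        (void)data;
        if (key == NULL) {
            printf("traverse finished after %d records\n", *count);
            return;
        }
        (*count)++;
    }

    /* Walk a fake table of records, but stop early once the deadline passes. */
    static void traverse_with_deadline(const char *keys[], const char *vals[],
                                       int n, time_t deadline,
                                       visit_fn fn, void *private_data)
    {
        for (int i = 0; i < n; i++) {
            if (time(NULL) > deadline) {
                break;                    /* cut the traverse short */
            }
            fn(keys[i], vals[i], private_data);
        }
        fn(NULL, NULL, private_data);     /* signal completion either way */
    }

    int main(void)
    {
        const char *keys[] = { "a", "b", "c" };
        const char *vals[] = { "1", "2", "3" };
        int count = 0;

        traverse_with_deadline(keys, vals, 3, time(NULL) + 20, print_record, &count);
        return 0;
    }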
@@ -21,6 +21,7 @@
 #include "includes.h"
 #include "lib/events/events.h"
 #include "system/filesys.h"
+#include "system/time.h"
 #include "popt.h"
 #include "cmdline.h"
 #include "../include/ctdb.h"
@@ -175,10 +175,14 @@ struct ctdb_status {
                 uint32_t register_srvid;
                 uint32_t deregister_srvid;
         } controls;
+        struct {
+                uint32_t call;
+                uint32_t control;
+                uint32_t traverse;
+        } timeouts;
         uint32_t total_calls;
         uint32_t pending_calls;
         uint32_t lockwait_calls;
-        uint32_t traverse_calls;
         uint32_t pending_lockwait_calls;
         uint32_t memory_used;
         uint32_t __last_counter; /* hack for control_status_all */
@@ -279,8 +283,18 @@ struct ctdb_db_context {
                   ctdb_fatal(ctdb, "Out of memory in " __location__ ); \
           }} while (0)
 
-/* arbitrary maximum timeout for ctdb operations */
-#define CTDB_REQ_TIMEOUT 0
+/* timeout for ctdb call operations. When this timeout expires we
+   check if the generation count has changed, and if it has then
+   re-issue the call */
+#define CTDB_CALL_TIMEOUT 2
+
+/* timeout for ctdb control calls */
+#define CTDB_CONTROL_TIMEOUT 10
+
+/* timeout for ctdb traverse calls. When this is reached we cut short
+   the traverse */
+#define CTDB_TRAVERSE_TIMEOUT 20
+
 
 /* number of consecutive calls from the same node before we give them
    the record */
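The three new constants are plain seconds values; each one is turned into an absolute expiry via timeval_current_ofs() at the point where the corresponding timed event is registered, which is why calls re-check every 2 seconds, controls fail after 10, and traverses are cut short after 20. A small illustration of that arithmetic, using a hypothetical expiry_in() helper rather than Samba's timeval_current_ofs():

    #include <stdio.h>
    #include <sys/time.h>

    #define CTDB_CALL_TIMEOUT      2   /* seconds; re-check/re-issue interval for calls */
    #define CTDB_CONTROL_TIMEOUT  10   /* seconds; controls fail outright on expiry */
    #define CTDB_TRAVERSE_TIMEOUT 20   /* seconds; traverses are cut short on expiry */

    /* Hypothetical equivalent of "now plus an offset in seconds/microseconds". */
    static struct timeval expiry_in(long secs, long usecs)
    {
        struct timeval tv;
        gettimeofday(&tv, NULL);
        tv.tv_sec  += secs;
        tv.tv_usec += usecs;
        if (tv.tv_usec >= 1000000) {
            tv.tv_sec++;
            tv.tv_usec -= 1000000;
        }
        return tv;
    }

    int main(void)
    {
        struct timeval call_deadline = expiry_in(CTDB_CALL_TIMEOUT, 0);
        printf("call timer fires at %ld.%06ld\n",
               (long)call_deadline.tv_sec, (long)call_deadline.tv_usec);
        return 0;
    }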
@@ -356,6 +370,7 @@ struct ctdb_call_state {
         struct ctdb_db_context *ctdb_db;
         const char *errmsg;
         struct ctdb_call call;
+        uint32_t generation;
         struct {
                 void (*fn)(struct ctdb_call_state *);
                 void *private_data;
@@ -135,10 +135,12 @@ static void show_status(struct ctdb_status *s)
                 STATUS_FIELD(controls.set_seqnum_frequency),
                 STATUS_FIELD(controls.register_srvid),
                 STATUS_FIELD(controls.deregister_srvid),
+                STATUS_FIELD(timeouts.call),
+                STATUS_FIELD(timeouts.control),
+                STATUS_FIELD(timeouts.traverse),
                 STATUS_FIELD(total_calls),
                 STATUS_FIELD(pending_calls),
                 STATUS_FIELD(lockwait_calls),
-                STATUS_FIELD(traverse_calls),
                 STATUS_FIELD(pending_lockwait_calls),
                 STATUS_FIELD(memory_used),
                 STATUS_FIELD(max_hop_count),