From 3fc279760c82b4df070e3a16893824e66a6cc7ca Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Tue, 17 Apr 2007 11:20:00 +1000 Subject: [PATCH 1/2] better error handling in ctdb_ltdb_lock_fetch_requeue() (This used to be ctdb commit 1952be19f625dbe257050acebf468e7e6eb0da8c) --- ctdb/common/ctdb_ltdb.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/ctdb/common/ctdb_ltdb.c b/ctdb/common/ctdb_ltdb.c index 132108e69b1..c523d5f9952 100644 --- a/ctdb/common/ctdb_ltdb.c +++ b/ctdb/common/ctdb_ltdb.c @@ -244,6 +244,12 @@ static void lock_fetch_callback(void *p) immediately satisfied until it can get the lock. This means that the main ctdb daemon will not block waiting for a chainlock held by a client + + There are 3 possible return values: + + 0: means that it got the lock immediately. + -1: means that it failed to get the lock, and won't retry + -2: means that it failed to get the lock immediately, but will retry */ int ctdb_ltdb_lock_fetch_requeue(struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, @@ -255,6 +261,12 @@ int ctdb_ltdb_lock_fetch_requeue(struct ctdb_db_context *ctdb_db, ret = tdb_chainlock_nonblock(tdb, key); + if (ret != 0 && + !(errno == EACCES || errno == EAGAIN || errno == EDEADLK)) { + /* a hard failure - don't try again */ + return -1; + } + /* first the non-contended path */ if (ret == 0) { ret = ctdb_ltdb_fetch(ctdb_db, key, header, hdr, data); @@ -273,6 +285,11 @@ int ctdb_ltdb_lock_fetch_requeue(struct ctdb_db_context *ctdb_db, /* we get an extra reference to the packet here, to stop it being freed in the top level packet handler */ - (void)talloc_reference(ctdb_db, hdr); - return 0; + if (talloc_reference(ctdb_db, hdr) == NULL) { + talloc_free(h); + return -1; + } + + /* now tell the caller than we will retry asynchronously */ + return -2; } From 3c2ebff3cbfb398aef9d6d2cc8c82a5892f630c8 Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Tue, 17 Apr 2007 11:26:59 +1000 Subject: [PATCH 2/2] partial merge from volker (some overlaps removed) (This used to be ctdb commit c4747460a8e0017acfd2a97a632ecd9395562d4f) --- ctdb/common/ctdb_daemon.c | 24 +++++++++++++++++ ctdb/common/ctdb_lockwait.c | 54 ++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 24 deletions(-) diff --git a/ctdb/common/ctdb_daemon.c b/ctdb/common/ctdb_daemon.c index 945030d77e2..aec988115a9 100644 --- a/ctdb/common/ctdb_daemon.c +++ b/ctdb/common/ctdb_daemon.c @@ -186,6 +186,24 @@ static void daemon_request_fetch_lock(struct ctdb_client *client, struct client_fetch_lock_data *fl_data; ctdb_db = find_ctdb_db(client->ctdb, f->db_id); + if (ctdb_db == NULL) { + struct ctdb_reply_fetch_lock r; + + ZERO_STRUCT(r); + r.hdr.length = sizeof(r); + r.hdr.ctdb_magic = CTDB_MAGIC; + r.hdr.ctdb_version = CTDB_VERSION; + r.hdr.operation = CTDB_REPLY_FETCH_LOCK; + r.hdr.reqid = f->hdr.reqid; + r.state = -1; + + /* + * Ignore the result, there's not much we can do anyway. + */ + ctdb_queue_send(client->queue, (uint8_t *)&r.hdr, + r.hdr.length); + return; + } key.dsize = f->keylen; key.dptr = &f->key[0]; @@ -220,6 +238,12 @@ static void daemon_request_store_unlock(struct ctdb_client *client, int res; ctdb_db = find_ctdb_db(client->ctdb, f->db_id); + if (ctdb_db == NULL) { + ctdb_set_error(client->ctdb, "Could not find database %i", + f->db_id); + res = -1; + goto done; + } /* write the data to ltdb */ key.dsize = f->keylen; diff --git a/ctdb/common/ctdb_lockwait.c b/ctdb/common/ctdb_lockwait.c index 304a0a413ea..77cb6a82e20 100644 --- a/ctdb/common/ctdb_lockwait.c +++ b/ctdb/common/ctdb_lockwait.c @@ -71,48 +71,54 @@ static int lockwait_destructor(struct lockwait_handle *h) */ struct lockwait_handle *ctdb_lockwait(struct ctdb_db_context *ctdb_db, TDB_DATA key, - void (*callback)(void *), void *private_data) + void (*callback)(void *private_data), + void *private_data) { - struct lockwait_handle *h; + struct lockwait_handle *result; int ret; - h = talloc_zero(ctdb_db, struct lockwait_handle); - if (h == NULL) { + if (!(result = talloc_zero(ctdb_db, struct lockwait_handle))) { return NULL; } - ret = pipe(h->fd); + ret = pipe(result->fd); + if (ret != 0) { - talloc_free(h); + talloc_free(result); return NULL; } - h->child = fork(); - if (h->child == (pid_t)-1) { - close(h->fd[0]); - close(h->fd[1]); - talloc_free(h); + result->child = fork(); + + if (result->child == (pid_t)-1) { + close(result->fd[0]); + close(result->fd[1]); + talloc_free(result); return NULL; } - h->callback = callback; - h->private_data = private_data; + result->callback = callback; + result->private_data = private_data; - if (h->child == 0) { - struct tdb_context *tdb = ctdb_db->ltdb->tdb; - /* in child */ - tdb_chainlock(tdb, key); - _exit(0); + if (result->child == 0) { + close(result->fd[0]); + /* + * Do we need a tdb_reopen here? + */ + tdb_chainlock(ctdb_db->ltdb->tdb, key); + exit(0); } - close(h->fd[1]); - talloc_set_destructor(h, lockwait_destructor); + close(result->fd[1]); + talloc_set_destructor(result, lockwait_destructor); - h->fde = event_add_fd(ctdb_db->ctdb->ev, h, h->fd[0], EVENT_FD_READ, lockwait_handler, h); - if (h->fde == NULL) { - talloc_free(h); + result->fde = event_add_fd(ctdb_db->ctdb->ev, result, result->fd[0], + EVENT_FD_READ, lockwait_handler, + (void *)result); + if (result->fde == NULL) { + talloc_free(result); return NULL; } - return h; + return result; }