From b7410c95cfbf53b2197611a0950450f7251643fc Mon Sep 17 00:00:00 2001 From: David Teigland Date: Wed, 16 Sep 2015 15:22:21 -0500 Subject: [PATCH] lvmlockd: unlock lv if command fails before lock completes If lvmlockd acquires an lv lock for a command, but the command exits before the reply, then the command has not activated the lv and lvmlockd should unlock it. This only applies when the lv was not already locked. (There will always be a chance that the lv lock is held while the lv is not active, i.e. if the command fails in the small window between getting the lv lock and before doing the activation. In that case, rerunning the activation command corrects the inconsistency.) This commit helps by automatically clearing the inconsistency (lv locked by not activated) in the most common case when the lv lock operation is slow to complete and the command is canceled by the user. This commit also adds and cleans up references to the client id in a bunch of log messages, which is useful to follow processing on each independent lock request. --- daemons/lvmlockd/lvmlockd-core.c | 142 ++++++++++++++++++++------- daemons/lvmlockd/lvmlockd-internal.h | 3 + 2 files changed, 109 insertions(+), 36 deletions(-) diff --git a/daemons/lvmlockd/lvmlockd-core.c b/daemons/lvmlockd/lvmlockd-core.c index 7d5b15873..487b6ec39 100644 --- a/daemons/lvmlockd/lvmlockd-core.c +++ b/daemons/lvmlockd/lvmlockd-core.c @@ -1031,10 +1031,12 @@ static int res_lock(struct lockspace *ls, struct resource *r, struct action *act memset(&vb, 0, sizeof(vb)); + r->last_client_id = act->client_id; + if (r->type == LD_RT_LV) - log_debug("S %s R %s res_lock mode %s (%s)", ls->name, r->name, mode_str(act->mode), act->lv_name); + log_debug("S %s R %s res_lock cl %u mode %s (%s)", ls->name, r->name, act->client_id, mode_str(act->mode), act->lv_name); else - log_debug("S %s R %s res_lock mode %s", ls->name, r->name, mode_str(act->mode)); + log_debug("S %s R %s res_lock cl %u mode %s", ls->name, r->name, act->client_id, mode_str(act->mode)); if (r->mode == LD_LK_SH && act->mode == LD_LK_SH) goto add_lk; @@ -1045,10 +1047,10 @@ static int res_lock(struct lockspace *ls, struct resource *r, struct action *act rv = lm_lock(ls, r, act->mode, act, &vb, retry, act->flags & LD_AF_ADOPT); if (r->use_vb) - log_debug("S %s R %s res_lock %d read vb %x %x %u", + log_debug("S %s R %s res_lock rv %d read vb %x %x %u", ls->name, r->name, rv, vb.version, vb.flags, vb.r_version); else - log_debug("S %s R %s res_lock %d", ls->name, r->name, rv); + log_debug("S %s R %s res_lock rv %d", ls->name, r->name, rv); if (rv < 0) return rv; @@ -1298,6 +1300,26 @@ add_lk: lk->client_id = 0; } + /* + * LV_LOCK means the action acquired the lv lock in the lock manager + * (as opposed to finding that the lv lock was already held). If + * the client for this LV_LOCK action fails before we send the result, + * then we automatically unlock the lv since the lv wasn't activated. + * (There will always be an odd chance the lv lock is held while the + * lv is not active, but this helps.) The most common case where this + * is helpful is when the lv lock operation is slow/delayed and the + * command is canceled by the user. + * + * LV_UNLOCK means the lv unlock action was generated by lvmlockd when + * it tried to send the reply for an lv lock action (with LV_LOCK set), + * and failed to send the reply to the client/command. The + * last_client_id saved on the resource is compared to this LV_UNLOCK + * action before the auto unlock is done in case another action locked + * the lv between the failed client lock action and the auto unlock. + */ + if (r->type == LD_RT_LV) + act->flags |= LD_AF_LV_LOCK; + list_add_tail(&lk->list, &r->locks); return rv; @@ -1309,7 +1331,9 @@ static int res_convert(struct lockspace *ls, struct resource *r, uint32_t r_version; int rv; - log_debug("S %s R %s res_convert mode %d", ls->name, r->name, act->mode); + r->last_client_id = act->client_id; + + log_debug("S %s R %s res_convert cl %u mode %d", ls->name, r->name, act->client_id, act->mode); if (act->mode == LD_LK_EX && lk->mode == LD_LK_SH && r->sh_count > 1) return -EAGAIN; @@ -1387,7 +1411,7 @@ static int res_cancel(struct lockspace *ls, struct resource *r, return -ENOENT; do_cancel: - log_debug("S %s R %s res_cancel client %d", ls->name, r->name, cact->client_id); + log_debug("S %s R %s res_cancel cl %u", ls->name, r->name, cact->client_id); cact->result = -ECANCELED; list_del(&cact->list); add_client_result(cact); @@ -1435,22 +1459,32 @@ static int res_unlock(struct lockspace *ls, struct resource *r, } if (act->op != LD_OP_CLOSE) - log_debug("S %s R %s res_unlock no locks", ls->name, r->name); + log_debug("S %s R %s res_unlock cl %u no locks", ls->name, r->name, act->client_id); return -ENOENT; do_unlock: + if ((act->flags & LD_AF_LV_UNLOCK) && (r->last_client_id != act->client_id)) { + log_debug("S %s R %s res_unlock cl %u for failed client ignored, last client %u", + ls->name, r->name, act->client_id, r->last_client_id); + return -ENOENT; + } + + r->last_client_id = act->client_id; + if (act->op == LD_OP_CLOSE) - log_debug("S %s R %s res_unlock from close", ls->name, r->name); + log_debug("S %s R %s res_unlock cl %u from close", ls->name, r->name, act->client_id); else if (r->type == LD_RT_LV) - log_debug("S %s R %s res_unlock (%s)", ls->name, r->name, act->lv_name); + log_debug("S %s R %s res_unlock cl %u (%s)", ls->name, r->name, act->client_id, act->lv_name); else - log_debug("S %s R %s res_unlock", ls->name, r->name); + log_debug("S %s R %s res_unlock cl %u", ls->name, r->name, act->client_id); /* send unlock to lm when last sh lock is unlocked */ if (lk->mode == LD_LK_SH) { r->sh_count--; - if (r->sh_count > 0) + if (r->sh_count > 0) { + log_debug("S %s R %s res_unlock sh_count %u", ls->name, r->name, r->sh_count); goto rem_lk; + } } if ((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) { @@ -1498,14 +1532,14 @@ static int res_update(struct lockspace *ls, struct resource *r, lk = find_lock_client(r, act->client_id); if (!lk) { - log_error("S %s R %s res_update client %u lock not found", + log_error("S %s R %s res_update cl %u lock not found", ls->name, r->name, act->client_id); return -ENOENT; } if (r->mode != LD_LK_EX) { - log_error("S %s R %s res_update version on non-ex lock", - ls->name, r->name); + log_error("S %s R %s res_update cl %u version on non-ex lock", + ls->name, r->name, act->client_id); return -EINVAL; } @@ -1516,7 +1550,7 @@ static int res_update(struct lockspace *ls, struct resource *r, else lk->version = act->version; - log_debug("S %s R %s res_update lk version to %u", ls->name, r->name, lk->version); + log_debug("S %s R %s res_update cl %u lk version to %u", ls->name, r->name, act->client_id, lk->version); return 0; } @@ -1716,6 +1750,7 @@ static void res_process(struct lockspace *ls, struct resource *r, continue; } else { /* success */ + r->last_client_id = act->client_id; act->result = -EALREADY; list_del(&act->list); add_client_result(act); @@ -1750,6 +1785,7 @@ static void res_process(struct lockspace *ls, struct resource *r, continue; } else { /* success */ + r->last_client_id = act->client_id; act->result = -EALREADY; list_del(&act->list); add_client_result(act); @@ -1787,6 +1823,7 @@ static void res_process(struct lockspace *ls, struct resource *r, if ((lk->mode == LD_LK_EX) || (lk->mode == LD_LK_SH && act->mode == LD_LK_SH)) { + r->last_client_id = act->client_id; act->result = 0; list_del(&act->list); add_client_result(act); @@ -1794,6 +1831,7 @@ static void res_process(struct lockspace *ls, struct resource *r, /* persistent lock is sh, transient request is ex */ /* FIXME: can we remove this case? do a convert here? */ log_debug("res_process %s existing persistent lock new transient", r->name); + r->last_client_id = act->client_id; act->result = -EEXIST; list_del(&act->list); add_client_result(act); @@ -1828,10 +1866,12 @@ static void res_process(struct lockspace *ls, struct resource *r, if (lk->mode != act->mode) { /* FIXME: convert and change to persistent? */ log_debug("res_process %s existing transient lock new persistent", r->name); + r->last_client_id = act->client_id; act->result = -EEXIST; list_del(&act->list); add_client_result(act); } else { + r->last_client_id = act->client_id; lk->flags |= LD_LF_PERSISTENT; lk->client_id = 0; act->result = 0; @@ -1880,7 +1920,6 @@ static void res_process(struct lockspace *ls, struct resource *r, * lv lock conflicts won't be transient so don't retry them. */ - if (r->mode == LD_LK_EX) return; @@ -3491,14 +3530,14 @@ static void client_resume(struct client *cl) if (!cl->poll_ignore || cl->fd == -1 || cl->pi == -1) { /* shouldn't happen */ - log_error("client_resume %d bad state ig %d fd %d pi %d", + log_error("client_resume %u bad state ig %d fd %d pi %d", cl->id, cl->poll_ignore, cl->fd, cl->pi); return; } pthread_mutex_lock(&pollfd_mutex); if (pollfd[cl->pi].fd != POLL_FD_IGNORE) { - log_error("client_resume %d pi %d fd %d not IGNORE", + log_error("client_resume %u pi %d fd %d not IGNORE", cl->id, cl->pi, cl->fd); } pollfd[cl->pi].fd = cl->fd; @@ -3509,16 +3548,17 @@ static void client_resume(struct client *cl) } /* called from client_thread, cl->mutex is held */ -static void client_send_result(struct client *cl, struct action *act) +static int client_send_result(struct client *cl, struct action *act) { response res; char result_flags[128]; int dump_len = 0; int dump_fd = -1; + int rv = 0; if (cl->dead) { - log_debug("client send %d skip dead", cl->id); - return; + log_debug("send cl %u skip dead", cl->id); + return -1; } memset(result_flags, 0, sizeof(result_flags)); @@ -3580,7 +3620,7 @@ static void client_send_result(struct client *cl, struct action *act) if (act->lv_args[0]) lv_args = act->lv_args; - log_debug("send %s[%d.%u] %s %s rv %d vg_args %s lv_args %s", + log_debug("send %s[%d] cl %u %s %s rv %d vg_args %s lv_args %s", cl->name[0] ? cl->name : "client", cl->pid, cl->id, op_str(act->op), rt_str(act->rt), act->result, vg_args ? vg_args : "", lv_args ? lv_args : ""); @@ -3612,7 +3652,7 @@ static void client_send_result(struct client *cl, struct action *act) else act->result = -EINVAL; - log_debug("send %s[%d.%u] dump result %d dump_len %d", + log_debug("send %s[%d] cl %u dump result %d dump_len %d", cl->name[0] ? cl->name : "client", cl->pid, cl->id, act->result, dump_len); @@ -3625,7 +3665,7 @@ static void client_send_result(struct client *cl, struct action *act) * A normal reply. */ - log_debug("send %s[%d.%u] %s %s rv %d %s %s", + log_debug("send %s[%d] cl %u %s %s rv %d %s %s", cl->name[0] ? cl->name : "client", cl->pid, cl->id, op_str(act->op), rt_str(act->rt), act->result, (act->result == -ENOLS) ? "ENOLS" : "", result_flags); @@ -3639,7 +3679,13 @@ static void client_send_result(struct client *cl, struct action *act) NULL); } - buffer_write(cl->fd, &res.buffer); + if (!buffer_write(cl->fd, &res.buffer)) { + rv = -errno; + if (rv >= 0) + rv = -1; + log_debug("send cl %u fd %d error %d", cl->id, cl->fd, rv); + } + buffer_destroy(&res.buffer); client_resume(cl); @@ -3649,6 +3695,8 @@ static void client_send_result(struct client *cl, struct action *act) send_dump_buf(dump_fd, dump_len); close(dump_fd); } + + return rv; } /* called from client_thread */ @@ -4303,10 +4351,10 @@ static void client_recv_action(struct client *cl) rv = buffer_read(cl->fd, &req.buffer); if (!rv) { if (errno == ECONNRESET) { - log_debug("client recv %d ECONNRESET", cl->id); + log_debug("client recv %u ECONNRESET", cl->id); cl->dead = 1; } else { - log_error("client recv %d buffer_read error %d", cl->id, errno); + log_error("client recv %u buffer_read error %d", cl->id, errno); } buffer_destroy(&req.buffer); client_resume(cl); @@ -4315,7 +4363,7 @@ static void client_recv_action(struct client *cl) req.cft = dm_config_from_string(req.buffer.mem); if (!req.cft) { - log_error("client recv %d config_from_string error", cl->id); + log_error("client recv %u config_from_string error", cl->id); buffer_destroy(&req.buffer); client_resume(cl); return; @@ -4324,7 +4372,7 @@ static void client_recv_action(struct client *cl) str = daemon_request_str(req, "request", NULL); rv = str_to_op_rt(str, &op, &rt); if (rv < 0) { - log_error("client recv %d bad request name \"%s\"", cl->id, str ? str : ""); + log_error("client recv %u bad request name \"%s\"", cl->id, str ? str : ""); dm_config_destroy(req.cft); buffer_destroy(&req.buffer); client_resume(cl); @@ -4447,7 +4495,7 @@ static void client_recv_action(struct client *cl) dm_config_destroy(req.cft); buffer_destroy(&req.buffer); - log_debug("recv %s[%d.%u] %s %s \"%s\" mode %s flags %x", + log_debug("recv %s[%d] cl %u %s %s \"%s\" mode %s flags %x", cl->name[0] ? cl->name : "client", cl->pid, cl->id, op_str(act->op), rt_str(act->rt), act->vg_name, mode_str(act->mode), opts); @@ -4514,6 +4562,8 @@ static void *client_thread_main(void *arg_in) { struct client *cl; struct action *act; + struct action *act_un; + int rv; while (1) { pthread_mutex_lock(&client_mutex); @@ -4537,11 +4587,30 @@ static void *client_thread_main(void *arg_in) if (cl) { pthread_mutex_lock(&cl->mutex); - client_send_result(cl, act); + rv = client_send_result(cl, act); pthread_mutex_unlock(&cl->mutex); } else { - log_debug("no client for result"); + log_debug("no client %u for result", act->client_id); + rv = -1; } + + /* + * The client failed after we acquired an LV lock for + * it, but before getting this reply saying it's done. + * So the lv will not be active and we should release + * the lv lock it requested. + */ + if ((rv < 0) && (act->flags & LD_AF_LV_LOCK)) { + log_debug("auto unlock lv for failed client %u", act->client_id); + if ((act_un = alloc_action())) { + memcpy(act_un, act, sizeof(struct action)); + act_un->mode = LD_LK_UN; + act_un->flags |= LD_AF_LV_UNLOCK; + act_un->flags &= ~LD_AF_LV_LOCK; + add_lock_action(act_un); + } + } + free_action(act); continue; } @@ -4571,6 +4640,7 @@ static void *client_thread_main(void *arg_in) log_debug("client rem %d pi %d fd %d ig %d", cl->id, cl->pi, cl->fd, cl->poll_ignore); */ + /* * If cl->dead was set in main_loop, then the * fd has already been closed and the pollfd @@ -5620,7 +5690,7 @@ static void process_listener(int poll_fd) list_add_tail(&cl->list, &client_list); pthread_mutex_unlock(&client_mutex); - log_debug("client add id %d pi %d fd %d", cl->id, cl->pi, cl->fd); + log_debug("new cl %u pi %d fd %d", cl->id, cl->pi, cl->fd); } /* @@ -5763,14 +5833,14 @@ static int main_loop(daemon_state *ds_arg) if (cl->recv) { /* should not happen */ - log_error("main client %d already recv", cl->id); + log_error("main client %u already recv", cl->id); } else if (cl->dead) { /* should not happen */ - log_error("main client %d already dead", cl->id); + log_error("main client %u already dead", cl->id); } else if (is_dead) { - log_debug("close %s[%d.%u] fd %d", + log_debug("close %s[%d] cl %u fd %d", cl->name[0] ? cl->name : "client", cl->pid, cl->id, cl->fd); cl->dead = 1; diff --git a/daemons/lvmlockd/lvmlockd-internal.h b/daemons/lvmlockd/lvmlockd-internal.h index a90dd997d..3afa75a5f 100644 --- a/daemons/lvmlockd/lvmlockd-internal.h +++ b/daemons/lvmlockd/lvmlockd-internal.h @@ -103,6 +103,8 @@ struct client { #define LD_AF_DUP_GL_LS 0x00002000 #define LD_AF_ADOPT 0x00010000 #define LD_AF_WARN_GL_REMOVED 0x00020000 +#define LD_AF_LV_LOCK 0x00040000 +#define LD_AF_LV_UNLOCK 0x00080000 /* * Number of times to repeat a lock request after @@ -141,6 +143,7 @@ struct resource { int8_t mode; unsigned int sh_count; /* number of sh locks on locks list */ uint32_t version; + uint32_t last_client_id; /* last client_id to lock or unlock resource */ unsigned int lm_init : 1; /* lm_data is initialized */ unsigned int adopt : 1; /* temp flag in remove_inactive_lvs */ unsigned int version_zero_valid : 1;