1
0
mirror of git://sourceware.org/git/lvm2.git synced 2025-07-29 19:41:56 +03:00

lvmlockd: use new sanlock_acquire2 to return owner info

Use the new sanlock_acquire2() which returns info about the owner
of a lease.  Pass this info back to the lvm command, where it's
initially used to print the host_id of a host holding a lock
when it cannot be acquired.
This commit is contained in:
David Teigland
2025-03-21 11:36:49 -05:00
parent 0217887fcd
commit 9b51b3d3f1
5 changed files with 267 additions and 127 deletions

View File

@ -57,5 +57,6 @@ static inline void lvmlockd_close(daemon_handle h)
#define EORPHAN 222
#define EADOPT_NONE 223
#define EADOPT_RETRY 224
#define EIOTIMEOUT 225
#endif /* _LVM_LVMLOCKD_CLIENT_H */

View File

@ -1108,14 +1108,15 @@ static int lm_add_resource(struct lockspace *ls, struct resource *r)
}
static int lm_lock(struct lockspace *ls, struct resource *r, int mode, struct action *act,
struct val_blk *vb_out, int *retry, int adopt_only, int adopt_ok)
struct val_blk *vb_out, int *retry, struct owner *owner,
int adopt_only, int adopt_ok)
{
int rv = -1;
if (ls->lm_type == LD_LM_DLM)
rv = lm_lock_dlm(ls, r, mode, vb_out, adopt_only, adopt_ok);
else if (ls->lm_type == LD_LM_SANLOCK)
rv = lm_lock_sanlock(ls, r, mode, vb_out, retry, adopt_only, adopt_ok);
rv = lm_lock_sanlock(ls, r, mode, vb_out, retry, owner, adopt_only, adopt_ok);
else if (ls->lm_type == LD_LM_IDM)
rv = lm_lock_idm(ls, r, mode, vb_out, act->lv_uuid,
&act->pvs, adopt_only, adopt_ok);
@ -1264,7 +1265,7 @@ static void add_work_action(struct action *act)
pthread_mutex_unlock(&worker_mutex);
}
static int res_lock(struct lockspace *ls, struct resource *r, struct action *act, int *retry)
static int res_lock(struct lockspace *ls, struct resource *r, struct action *act, int *retry, struct owner *owner)
{
struct lock *lk;
struct val_blk vb;
@ -1289,7 +1290,7 @@ static int res_lock(struct lockspace *ls, struct resource *r, struct action *act
if (r->type == LD_RT_LV && act->lv_args[0])
memcpy(r->lv_args, act->lv_args, MAX_ARGS);
rv = lm_lock(ls, r, act->mode, act, &vb, retry,
rv = lm_lock(ls, r, act->mode, act, &vb, retry, owner,
act->flags & LD_AF_ADOPT_ONLY ? 1 : 0,
act->flags & LD_AF_ADOPT ? 1 : 0);
@ -1901,6 +1902,7 @@ out:
static void res_process(struct lockspace *ls, struct resource *r,
struct list_head *act_close_list, int *retry_out)
{
struct owner owner = { 0 };
struct action *act, *safe, *act_close;
struct lock *lk;
uint32_t unlock_by_client_id = 0;
@ -2189,8 +2191,15 @@ static void res_process(struct lockspace *ls, struct resource *r,
if (act->op == LD_OP_LOCK && act->mode == LD_LK_SH) {
lm_retry = 0;
memset(&owner, 0, sizeof(owner));
rv = res_lock(ls, r, act, &lm_retry, &owner);
/* TODO: if lock fails because it's owned by a failed host,
and persistent reservations are enabled, then remove the
pr of failed host_id, tell sanlock the host_id is now
dead, and retry lock request. */
rv = res_lock(ls, r, act, &lm_retry);
if ((rv == -EAGAIN) &&
(act->retries <= act->max_retries) &&
(lm_retry || (r->type != LD_RT_LV))) {
@ -2199,6 +2208,8 @@ static void res_process(struct lockspace *ls, struct resource *r,
act->retries++;
*retry_out = 1;
} else {
if (rv == -EAGAIN)
memcpy(&act->owner, &owner, sizeof(owner));
act->result = rv;
list_del(&act->list);
add_client_result(act);
@ -2222,8 +2233,10 @@ static void res_process(struct lockspace *ls, struct resource *r,
list_for_each_entry_safe(act, safe, &r->actions, list) {
if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX) {
lm_retry = 0;
memset(&owner, 0, sizeof(owner));
rv = res_lock(ls, r, act, &lm_retry, &owner);
rv = res_lock(ls, r, act, &lm_retry);
if ((rv == -EAGAIN) &&
(act->retries <= act->max_retries) &&
(lm_retry || (r->type != LD_RT_LV))) {
@ -2232,6 +2245,8 @@ static void res_process(struct lockspace *ls, struct resource *r,
act->retries++;
*retry_out = 1;
} else {
if (rv == -EAGAIN)
memcpy(&act->owner, &owner, sizeof(owner));
act->result = rv;
list_del(&act->list);
add_client_result(act);
@ -4235,6 +4250,31 @@ static int client_send_result(struct client *cl, struct action *act)
"result = " FMTd64, (int64_t) act->result,
"dump_len = " FMTd64, (int64_t) dump_len,
NULL);
} else if (act->op == LD_OP_LOCK && act->owner.host_id) {
/*
* lock reply with owner info
*/
log_debug("send %s[%d][%u] %s%s%s result %d owner %u %u %u %s %s",
cl->name[0] ? cl->name : "client", cl->pid, cl->id,
op_mode_str(act->op, act->mode), act->rt ? "_" : "", rt_str(act->rt), act->result,
act->owner.host_id, act->owner.generation, act->owner.timestamp,
act->owner.state[0] ? act->owner.state : "",
act->owner.name[0] ? act->owner.name : "");
res = daemon_reply_simple("OK",
"op = " FMTd64, (int64_t) act->op,
"lock_type = %s", lm_str(act->lm_type),
"op_result = " FMTd64, (int64_t) act->result,
"lm_result = " FMTd64, (int64_t) act->lm_rv,
"owner_host_id = " FMTd64, (int64_t) act->owner.host_id,
"owner_generation = " FMTd64, (int64_t) act->owner.generation,
"owner_timestamp = " FMTd64, (int64_t) act->owner.timestamp,
"owner_state = %s", act->owner.state[0] ? act->owner.state : "none",
"owner_name = %s", act->owner.name[0] ? act->owner.name : "none",
"result_flags = %s", result_flags[0] ? result_flags : "none",
NULL);
} else {
/*
* A normal reply.

View File

@ -130,6 +130,17 @@ struct pvs {
int num;
};
#define OWNER_NAME_SIZE 64
#define OWNER_STATE_SIZE 32

/*
 * Info about the host holding a lease, returned by the lock manager
 * (sanlock_acquire2) when a lock request fails because another host
 * owns the lock.  Passed back to the lvm command in the lock reply.
 */
struct owner {
	uint32_t host_id;		/* lease owner's sanlock host_id */
	uint32_t generation;		/* owner's host generation number */
	uint32_t timestamp;		/* owner's last lease renewal time */
	char state[OWNER_STATE_SIZE];	/* host state string, e.g. LIVE/FAIL/DEAD */
	char name[OWNER_NAME_SIZE];	/* owner name from the lease, if available */
};
struct action {
struct list_head list;
uint32_t client_id;
@ -154,6 +165,7 @@ struct action {
char vg_args[MAX_ARGS+1];
char lv_args[MAX_ARGS+1];
char prev_lv_args[MAX_ARGS+1];
struct owner owner;
struct pvs pvs; /* PV list for idm */
};
@ -553,7 +565,7 @@ int lm_add_lockspace_sanlock(struct lockspace *ls, int adopt_only, int adopt_ok,
int lm_rem_lockspace_sanlock(struct lockspace *ls, int free_vg);
int lm_add_resource_sanlock(struct lockspace *ls, struct resource *r);
int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
struct val_blk *vb_out, int *retry,
struct val_blk *vb_out, int *retry, struct owner *owner,
int adopt_only, int adopt_ok);
int lm_convert_sanlock(struct lockspace *ls, struct resource *r,
int ld_mode, uint32_t r_version);
@ -617,7 +629,7 @@ static inline int lm_add_resource_sanlock(struct lockspace *ls, struct resource
}
static inline int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
struct val_blk *vb_out, int *retry,
struct val_blk *vb_out, int *retry, struct owner *owner,
int adopt_only, int adopt_ok)
{
return -1;

View File

@ -27,6 +27,8 @@
/* FIXME: copied from sanlock header until the sanlock update is more widespread */
#define SANLK_ADD_NODELAY 0x00000002
#define SANLOCK_HAS_ACQUIRE2 1
#include <stddef.h>
#include <poll.h>
#include <errno.h>
@ -1797,13 +1799,33 @@ int lm_rem_resource_sanlock(struct lockspace *ls, struct resource *r)
return 0;
}
/*
 * Translate the sanlock host state (low bits of sanlk_host.flags)
 * into a short printable string for owner reporting.
 */
static const char *_host_flags_to_str(uint32_t flags)
{
	switch (flags & SANLK_HOST_MASK) {
	case SANLK_HOST_FREE:
		return "FREE";
	case SANLK_HOST_LIVE:
		return "LIVE";
	case SANLK_HOST_FAIL:
		return "FAIL";
	case SANLK_HOST_DEAD:
		return "DEAD";
	case SANLK_HOST_UNKNOWN:
		return "UNKNOWN";
	default:
		return "ERROR";
	}
}
int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
struct val_blk *vb_out, int *retry, int adopt_only, int adopt_ok)
struct val_blk *vb_out, int *retry, struct owner *owner,
int adopt_only, int adopt_ok)
{
struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data;
struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data;
struct sanlk_resource *rs;
struct sanlk_options opt;
struct sanlk_host owner_host = { 0 };
char *owner_name = NULL;
uint64_t lock_lv_offset;
uint32_t flags = 0;
struct val_blk vb = { 0 };
@ -1907,7 +1929,11 @@ int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
memset(&opt, 0, sizeof(opt));
sprintf(opt.owner_name, "%s", "lvmlockd");
#ifdef SANLOCK_HAS_ACQUIRE2
rv = sanlock_acquire2(lms->sock, -1, flags, rs, &opt, &owner_host, &owner_name);
#else
rv = sanlock_acquire(lms->sock, -1, flags, 1, &rs, &opt);
#endif
/*
* errors: translate the sanlock error number to an lvmlockd error.
@ -1915,17 +1941,6 @@ int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
* this function to code that doesn't recognize sanlock error numbers.
*/
if (rv == -EAGAIN) {
/*
* It appears that sanlock_acquire returns EAGAIN when we request
* a shared lock but the lock is held ex by another host.
* There's no point in retrying this case, just return an error.
*/
log_debug("%s:%s lock_san acquire mode %d rv EAGAIN", ls->name, r->name, ld_mode);
*retry = 0;
return -EAGAIN;
}
if ((rv == -EMSGSIZE) && (r->type == LD_RT_LV)) {
/*
* sanlock tried to read beyond the end of the device,
@ -1962,64 +1977,68 @@ int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
return -EADOPT_NONE;
}
if (rv == SANLK_ACQUIRE_IDLIVE || rv == SANLK_ACQUIRE_OWNED || rv == SANLK_ACQUIRE_OTHER) {
if (rv == SANLK_ACQUIRE_IDLIVE ||
rv == SANLK_ACQUIRE_OWNED ||
rv == SANLK_ACQUIRE_OTHER ||
rv == SANLK_ACQUIRE_OWNED_RETRY ||
rv == -EAGAIN) {
/*
* The lock is held by another host. These failures can
* happen while multiple hosts are concurrently acquiring
* shared locks. We want to retry a couple times in this
* case because we'll probably get the sh lock.
* EAGAIN: when a shared lock is held, and we request an ex lock.
*
* I believe these are also the errors when requesting an
* ex lock that another host holds ex. We want to report
* something like: "lock is held by another host" in this case.
* Retry is pointless here.
* OWNED_RETRY: the lock is held by a failed but not yet dead host.
* Retrying will eventually find the host is dead (and the lock is
* granted), or another host has acquired it.
*
* We can't distinguish between the two cases above,
* so if requesting a sh lock, retry a couple times,
* otherwise don't.
* Multiple hosts all requesting shared locks can also result in
* some transient errors here (shared locks involve acquiring the
* paxos lease ex for a short period, which means two hosts both
* requesting sh at once can cause one to fail here.)
* Retry here to attempt to cover these transient failures.
*
* The command also has its own configurable retry logic.
* The intention is to handle actual lock contention retries
* from the command, and the transient failures from concurrent
* shared requests here. We don't actually know when a failure
* was related to the transient concurrent sh, so we just guess
* it was if we were requesting a sh lock.
*/
log_debug("%s:%s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv);
*retry = (ld_mode == LD_LK_SH) ? 1 : 0;
if (rv == SANLK_ACQUIRE_OWNED_RETRY)
*retry = 0;
if (owner && owner_host.host_id) {
const char *host_state;
owner->host_id = (uint32_t)owner_host.host_id;
owner->generation = (uint32_t)owner_host.generation;
owner->timestamp = (uint32_t)owner_host.timestamp;
if ((host_state = _host_flags_to_str(owner_host.flags)))
dm_strncpy(owner->state, host_state, OWNER_STATE_SIZE-1);
if (owner_name) {
dm_strncpy(owner->name, owner_name, OWNER_NAME_SIZE-1);
free(owner_name);
}
log_debug("%s:%s lock_san acquire mode %d lock held %d owner %u %u %u %s %s",
ls->name, r->name, ld_mode, rv,
owner->host_id, owner->generation, owner->timestamp,
owner->state, owner->name ?: "");
} else {
log_debug("%s:%s lock_san acquire mode %d lock held %d",
ls->name, r->name, ld_mode, rv);
}
return -EAGAIN;
}
if (rv == SANLK_AIO_TIMEOUT) {
/*
* sanlock got an i/o timeout when trying to acquire the
* lease on disk.
*/
log_debug("%s:%s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv);
log_debug("%s:%s lock_san acquire mode %d io timeout", ls->name, r->name, ld_mode);
*retry = 0;
return -EAGAIN;
}
if (rv == SANLK_DBLOCK_LVER || rv == SANLK_DBLOCK_MBAL) {
/*
* There was contention with another host for the lease,
* and we lost.
*/
log_debug("%s:%s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv);
*retry = 0;
return -EAGAIN;
}
if (rv == SANLK_ACQUIRE_OWNED_RETRY) {
/*
* The lock is held by a failed host, and will eventually
* expire. If we retry we'll eventually acquire the lock
* (or find someone else has acquired it). The EAGAIN retry
* attempts for SH locks above would not be sufficient for
* the length of expiration time. We could add a longer
* retry time here to cover the full expiration time and block
* the activation command for that long. For now just return
* the standard error indicating that another host still owns
* the lease. FIXME: return a different error number so the
command can print a different error indicating that the
* owner of the lease is in the process of expiring?
*/
log_debug("%s:%s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv);
*retry = 0;
return -EAGAIN;
return -EIOTIMEOUT;
}
if (rv < 0) {
@ -2162,8 +2181,6 @@ int lm_convert_sanlock(struct lockspace *ls, struct resource *r,
case SANLK_ACQUIRE_OWNED_RETRY:
case SANLK_ACQUIRE_OTHER:
case SANLK_AIO_TIMEOUT:
case SANLK_DBLOCK_LVER:
case SANLK_DBLOCK_MBAL:
/* expected errors from known/normal cases like lock contention or io timeouts */
log_debug("%s:%s convert_san error %d", ls->name, r->name, rv);
return -EAGAIN;

View File

@ -32,6 +32,12 @@ struct lvmlockd_pvs {
int num;
};
/*
 * Owner info for a lock held by another host, extracted from the
 * lvmlockd reply (owner_host_id / owner_generation / owner_name
 * fields) and used to report who holds a contended lock.
 */
struct owner {
	uint32_t host_id;	/* sanlock host_id of the lock holder */
	uint32_t generation;	/* holder's host generation number */
	char *name;		/* holder name; allocated from cmd->mem pool */
};
void lvmlockd_set_socket(const char *sock)
{
_lvmlockd_socket = sock;
@ -132,6 +138,21 @@ static void _flags_str_to_lockd_flags(const char *flags_str, uint32_t *lockd_fla
*lockd_flags |= LD_RF_SH_EXISTS;
}
/*
 * Format owner info as a short suffix for log messages, e.g.
 * " (host_id 3)", or "" when no owner info is available.
 *
 * Returns a pointer to a static buffer: not reentrant/thread-safe,
 * and the result is only valid until the next call.  That matches
 * its use here, where commands are single-threaded.
 */
static char *_owner_str(struct owner *owner)
{
	static char log_owner_str[128];

	if (!owner || !owner->host_id)
		return (char *)"";

	log_owner_str[0] = '\0';

	/* Use a --lockopt setting to print all owner details? */

	/* snprintf always NUL-terminates, so pass the full buffer size. */
	snprintf(log_owner_str, sizeof(log_owner_str), " (host_id %u)", owner->host_id);

	return log_owner_str;
}
/*
* evaluate the reply from lvmlockd, check for errors, extract
* the result and lockd_flags returned by lvmlockd.
@ -146,10 +167,11 @@ static void _flags_str_to_lockd_flags(const char *flags_str, uint32_t *lockd_fla
*/
#define NO_LOCKD_RESULT (-1000)
static int _lockd_result(const char *req_name, daemon_reply reply, int *result, uint32_t *lockd_flags)
static int _lockd_result(struct cmd_context *cmd, const char *req_name, daemon_reply reply,
int *result, uint32_t *lockd_flags, struct owner *owner)
{
int reply_result;
const char *flags_str = NULL;
const char *str;
*result = -1;
@ -172,8 +194,15 @@ static int _lockd_result(const char *req_name, daemon_reply reply, int *result,
*result = reply_result;
if (lockd_flags) {
if ((flags_str = daemon_reply_str(reply, "result_flags", NULL)))
_flags_str_to_lockd_flags(flags_str, lockd_flags);
if ((str = daemon_reply_str(reply, "result_flags", NULL)))
_flags_str_to_lockd_flags(str, lockd_flags);
}
if (owner) {
owner->host_id = (uint32_t)daemon_reply_int(reply, "owner_host_id", 0);
owner->generation = (uint32_t)daemon_reply_int(reply, "owner_generation", 0);
if ((str = daemon_reply_str(reply, "owner_name", "none")))
owner->name = dm_pool_strdup(cmd->mem, str);
}
log_debug("lockd %s result: %d", req_name, reply_result);
@ -389,7 +418,8 @@ static int _lockd_request(struct cmd_context *cmd,
const char *opts,
const struct lvmlockd_pvs *lock_pvs,
int *result,
uint32_t *lockd_flags)
uint32_t *lockd_flags,
struct owner *owner)
{
const char *cmd_name = get_cmd_name();
daemon_reply reply;
@ -426,7 +456,7 @@ static int _lockd_request(struct cmd_context *cmd,
"lv_lock_args = %s", lv_lock_args ?: "none",
NULL);
if (!_lockd_result(req_name, reply, result, lockd_flags))
if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner))
goto fail;
/*
@ -446,7 +476,7 @@ static int _lockd_request(struct cmd_context *cmd,
"vg_lock_args = %s", vg_lock_args ?: "none",
NULL);
if (!_lockd_result(req_name, reply, result, lockd_flags))
if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner))
goto fail;
/*
@ -464,7 +494,7 @@ static int _lockd_request(struct cmd_context *cmd,
"vg_lock_type = %s", vg_lock_type ?: "none",
NULL);
if (!_lockd_result(req_name, reply, result, lockd_flags))
if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner))
goto fail;
log_debug("lockd %s %s result %d %x",
@ -735,7 +765,7 @@ static int _handle_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg)
"lv_size_bytes = " FMTd64, (int64_t) lv_size_bytes,
NULL);
if (!_lockd_result("find_free_lock", reply, &result, NULL)) {
if (!_lockd_result(cmd, "find_free_lock", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -790,7 +820,7 @@ static int _init_vg(struct cmd_context *cmd, struct volume_group *vg,
"vg_lock_type = %s", lock_type,
NULL);
if (!_lockd_result("init_vg", reply, &result, NULL)) {
if (!_lockd_result(cmd, "init_vg", reply, &result, NULL, NULL)) {
ret = 0;
result = -ELOCKD;
} else {
@ -987,7 +1017,7 @@ static int _init_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg, in
"opts = %s", opts ?: "none",
NULL);
if (!_lockd_result("init_vg", reply, &result, NULL)) {
if (!_lockd_result(cmd, "init_vg", reply, &result, NULL, NULL)) {
ret = 0;
result = -ELOCKD;
} else {
@ -1089,7 +1119,7 @@ static int _free_vg(struct cmd_context *cmd, struct volume_group *vg)
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("free_vg", reply, &result, &lockd_flags)) {
if (!_lockd_result(cmd, "free_vg", reply, &result, &lockd_flags, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1143,7 +1173,7 @@ static int _busy_vg(struct cmd_context *cmd, struct volume_group *vg)
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("busy_vg", reply, &result, &lockd_flags)) {
if (!_lockd_result(cmd, "busy_vg", reply, &result, &lockd_flags, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1217,7 +1247,7 @@ static int _free_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg)
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("free_vg", reply, &result, &lockd_flags)) {
if (!_lockd_result(cmd, "free_vg", reply, &result, &lockd_flags, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1497,7 +1527,7 @@ int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg, int *exists
NULL);
}
if (!_lockd_result("start_vg", reply, &result, &lockd_flags)) {
if (!_lockd_result(cmd, "start_vg", reply, &result, &lockd_flags, NULL)) {
ret = 0;
result = -ELOCKD;
} else {
@ -1566,7 +1596,7 @@ int lockd_stop_vg(struct cmd_context *cmd, struct volume_group *vg)
"vg_name = %s", vg->name,
NULL);
if (!_lockd_result("stop_vg", reply, &result, NULL)) {
if (!_lockd_result(cmd, "stop_vg", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1612,7 +1642,7 @@ int lockd_start_wait(struct cmd_context *cmd)
"pid = " FMTd64, (int64_t) getpid(),
NULL);
if (!_lockd_result("start_wait", reply, &result, NULL)) {
if (!_lockd_result(cmd, "start_wait", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1683,6 +1713,7 @@ int lockd_start_wait(struct cmd_context *cmd)
int lockd_global_create(struct cmd_context *cmd, const char *def_mode, const char *vg_lock_type)
{
struct owner owner = { 0 };
const char *mode = NULL;
uint32_t lockd_flags;
int retries = 0;
@ -1730,15 +1761,18 @@ int lockd_global_create(struct cmd_context *cmd, const char *def_mode, const cha
req:
if (!_lockd_request(cmd, "lock_gl",
NULL, vg_lock_type, NULL, NULL, NULL, NULL, mode, NULL,
NULL, &result, &lockd_flags)) {
NULL, &result, &lockd_flags, &owner)) {
/* No result from lvmlockd, it is probably not running. */
log_error("Global lock failed: check that lvmlockd is running.");
return 0;
}
if (result == -EAGAIN) {
if (result == -EAGAIN || result == -EIOTIMEOUT) {
if (retries < find_config_tree_int(cmd, global_lvmlockd_lock_retries_CFG, NULL)) {
log_warn("Retrying %s global lock", mode);
if (result == -EIOTIMEOUT)
log_warn("Retrying global lock: io timeout");
else
log_warn("Retrying global lock: held by other host%s", _owner_str(&owner));
sleep(1);
retries++;
goto req;
@ -1821,8 +1855,10 @@ int lockd_global_create(struct cmd_context *cmd, const char *def_mode, const cha
if (result < 0) {
if (result == -ESTARTING)
log_error("Global lock failed: lockspace is starting.");
else if (result == -EIOTIMEOUT)
log_error("Global lock failed: io timeout");
else if (result == -EAGAIN)
log_error("Global lock failed: held by other host.");
log_error("Global lock failed: held by other host%s", _owner_str(&owner));
else if (result == -EPROTONOSUPPORT)
log_error("VG create failed: lock manager %s is not supported by lvmlockd.", vg_lock_type);
else
@ -1925,6 +1961,7 @@ out:
int lockd_global(struct cmd_context *cmd, const char *def_mode)
{
struct owner owner = { 0 };
const char *mode = NULL;
const char *opts = NULL;
uint32_t lockd_flags;
@ -1976,7 +2013,7 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
if (!_lockd_request(cmd, "lock_gl",
NULL, NULL, NULL, NULL, NULL, NULL, mode, opts,
NULL, &result, &lockd_flags)) {
NULL, &result, &lockd_flags, &owner)) {
/* No result from lvmlockd, it is probably not running. */
/* We don't care if an unlock fails. */
@ -1993,9 +2030,12 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
return 0;
}
if (result == -EAGAIN) {
if (result == -EAGAIN || result == -EIOTIMEOUT) {
if (retries < find_config_tree_int(cmd, global_lvmlockd_lock_retries_CFG, NULL)) {
log_warn("Retrying %s global lock", mode);
if (result == -EIOTIMEOUT)
log_warn("Retrying global lock: io timeout");
else
log_warn("Retrying global lock: held by other host%s", _owner_str(&owner));
sleep(1);
retries++;
goto req;
@ -2038,10 +2078,12 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
result == -ESTARTING ||
result == -EVGKILLED ||
result == -ELOCKIO ||
result == -EIOTIMEOUT ||
result == -ELMERR ||
result == -EORPHAN ||
result == -EADOPT_RETRY ||
result == -EADOPT_NONE) {
result == -EADOPT_NONE ||
result == -EAGAIN) {
/*
* If an ex global lock fails, then the command fails.
*/
@ -2052,6 +2094,8 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
log_error("Global lock failed: check that global lockspace is started");
else if (result == -ELOCKIO)
log_error("Global lock failed: storage errors for sanlock leases");
else if (result == -EIOTIMEOUT)
log_error("Global lock failed: io timeout");
else if (result == -ELMERR)
log_error("Global lock failed: lock manager error");
else if (result == -EVGKILLED)
@ -2062,6 +2106,8 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
log_error("Global lock failed: adopt found no orphan");
else if (result == -EADOPT_RETRY)
log_error("Global lock failed: adopt found other mode");
else if (result == -EAGAIN)
log_error("Global lock failed: held by other host%s", _owner_str(&owner));
else
log_error("Global lock failed: error %d", result);
@ -2085,6 +2131,11 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
goto allow;
}
if (result == -EIOTIMEOUT) {
log_warn("Skipping global lock: io timeout");
goto allow;
}
if ((lockd_flags & LD_RF_NO_GL_LS) && (lockd_flags & LD_RF_WARN_GL_REMOVED)) {
log_warn("Skipping global lock: VG with global lock was removed");
goto allow;
@ -2110,12 +2161,16 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
goto allow;
}
if (result == -EAGAIN) {
log_warn("Skipping global lock: held by other host%s", _owner_str(&owner));
goto allow;
}
if ((lockd_flags & LD_RF_NO_GL_LS) || (lockd_flags & LD_RF_NO_LOCKSPACES)) {
log_debug("Skipping global lock: lockspace not found or started");
goto allow;
}
/*
* This is for completeness. If we reach here, then
* a specific check for the error should be added above
@ -2129,21 +2184,13 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
log_warn("Duplicate sanlock global locks should be corrected");
if (result < 0) {
if (result == -EAGAIN) {
/*
* Most of the time, retries should avoid this case.
*/
log_error("Global lock failed: held by other host.");
return 0;
} else {
/*
* We don't intend to reach this. We should check
* any known/possible error specifically and print
* a more helpful message. This is for completeness.
*/
log_error("Global lock failed: error %d.", result);
return 0;
}
/*
* We don't intend to reach this. We should check
* any known/possible error specifically and print
* a more helpful message. This is for completeness.
*/
log_error("Global lock failed: error %d.", result);
return 0;
}
allow:
@ -2194,6 +2241,7 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
uint32_t flags, uint32_t *lockd_state)
{
struct owner owner = { 0 };
const char *mode = NULL;
const char *opts = NULL;
uint32_t lockd_flags;
@ -2293,7 +2341,7 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
if (!_lockd_request(cmd, "lock_vg",
vg_name, NULL, NULL, NULL, NULL, NULL, mode, opts,
NULL, &result, &lockd_flags)) {
NULL, &result, &lockd_flags, &owner)) {
/*
* No result from lvmlockd, it is probably not running.
* Decide if it is ok to continue without a lock in
@ -2305,9 +2353,12 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
return 1;
}
if (result == -EAGAIN) {
if (result == -EAGAIN || result == -EIOTIMEOUT) {
if (retries < find_config_tree_int(cmd, global_lvmlockd_lock_retries_CFG, NULL)) {
log_warn("Retrying %s lock on VG %s", mode, vg_name);
if (result == -EIOTIMEOUT)
log_warn("Retrying lock on VG %s: io timeout", vg_name);
else
log_warn("Retrying lock on VG %s: held by other host%s", vg_name, _owner_str(&owner));
sleep(1);
retries++;
goto req;
@ -2381,6 +2432,19 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
}
}
if (result == -EIOTIMEOUT) {
if (!strcmp(mode, "un"))
goto out;
else if (!strcmp(mode, "sh")) {
log_warn("VG %s lock skipped: io timeout", vg_name);
goto out;
} else {
log_error("VG %s lock failed: io timeout", vg_name);
ret = 0;
goto out;
}
}
/*
* The lock is held by another host, and retries have been unsuccessful.
*/
@ -2388,10 +2452,10 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
if (!strcmp(mode, "un"))
goto out;
else if (!strcmp(mode, "sh")) {
log_warn("VG %s lock skipped: held by other host.", vg_name);
log_warn("VG %s lock skipped: held by other host%s", vg_name, _owner_str(&owner));
goto out;
} else {
log_error("VG %s lock failed: held by other host.", vg_name);
log_error("VG %s lock failed: held by other host%s", vg_name, _owner_str(&owner));
ret = 0;
goto out;
}
@ -2522,7 +2586,7 @@ int lockd_vg_update(struct volume_group *vg)
"version = " FMTd64, (int64_t) vg->seqno,
NULL);
if (!_lockd_result("vg_update", reply, &result, NULL)) {
if (!_lockd_result(vg->cmd, "vg_update", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -2554,7 +2618,7 @@ static int _query_lv(struct cmd_context *cmd, struct volume_group *vg,
"lv_lock_args = %s", lock_args ?: "none",
NULL);
if (!_lockd_result("query_lock_lv", reply, &result, NULL)) {
if (!_lockd_result(cmd, "query_lock_lv", reply, &result, NULL, NULL)) {
/* No result from lvmlockd, it is probably not running. */
log_error("Lock query failed for LV %s/%s", vg->name, lv_name);
return 0;
@ -2621,6 +2685,7 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
const char *lv_name, struct id *lv_id,
const char *lock_args, const char *def_mode, uint32_t flags)
{
struct owner owner = { 0 };
char lv_uuid[64] __attribute__((aligned(8)));
char opt_buf[64] = {};
const char *opts = NULL;
@ -2718,7 +2783,7 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
if (!_lockd_request(cmd, "lock_lv",
vg->name, vg->lock_type, vg->lock_args,
lv_name, lv_uuid, lock_args, mode, opts,
&lock_pvs, &result, &lockd_flags)) {
&lock_pvs, &result, &lockd_flags, NULL)) {
_lockd_free_pv_list(&lock_pvs);
/* No result from lvmlockd, it is probably not running. */
log_error("Locking failed for LV %s/%s", vg->name, lv_name);
@ -2729,7 +2794,7 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
if (!_lockd_request(cmd, "lock_lv",
vg->name, vg->lock_type, vg->lock_args,
lv_name, lv_uuid, lock_args, mode, opts,
NULL, &result, &lockd_flags)) {
NULL, &result, &lockd_flags, &owner)) {
/* No result from lvmlockd, it is probably not running. */
log_error("Locking failed for LV %s/%s", vg->name, lv_name);
return 0;
@ -2744,7 +2809,12 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
return 1;
if (result == -EAGAIN) {
log_error("LV locked by other host: %s/%s", vg->name, lv_name);
log_error("LV locked by other host: %s/%s%s", vg->name, lv_name, _owner_str(&owner));
return 0;
}
if (result == -EIOTIMEOUT) {
log_error("LV %s/%s lock failed: io timeout.", vg->name, lv_name);
return 0;
}
@ -3649,7 +3719,7 @@ static int _init_lv_sanlock(struct cmd_context *cmd, struct volume_group *vg,
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("init_lv", reply, &result, NULL)) {
if (!_lockd_result(cmd, "init_lv", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -3724,7 +3794,7 @@ static int _free_lv(struct cmd_context *cmd, struct volume_group *vg,
"lv_lock_args = %s", lock_args ?: "none",
NULL);
if (!_lockd_result("free_lv", reply, &result, NULL)) {
if (!_lockd_result(cmd, "free_lv", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -3989,7 +4059,7 @@ int lockd_rename_vg_before(struct cmd_context *cmd, struct volume_group *vg)
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("rename_vg_before", reply, &result, NULL)) {
if (!_lockd_result(cmd, "rename_vg_before", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -4054,7 +4124,7 @@ int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("rename_vg_final", reply, &result, NULL)) {
if (!_lockd_result(cmd, "rename_vg_final", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -4095,7 +4165,7 @@ const char *lockd_running_lock_type(struct cmd_context *cmd, int *found_multiple
"pid = " FMTd64, (int64_t) getpid(),
NULL);
if (!_lockd_result("running_lm", reply, &result, NULL)) {
if (!_lockd_result(cmd, "running_lm", reply, &result, NULL, NULL)) {
log_error("Failed to get result from lvmlockd");
goto out;
}
@ -4216,7 +4286,7 @@ int lockd_lv_refresh(struct cmd_context *cmd, struct lvresize_params *lp)
"path = %s", path,
NULL);
if (!_lockd_result("refresh_lv", reply, &result, NULL)) {
if (!_lockd_result(cmd, "refresh_lv", reply, &result, NULL, NULL)) {
/* No result from lvmlockd, it is probably not running. */
log_error("LV refresh failed for LV %s", path);
return 0;