1
0
mirror of git://sourceware.org/git/lvm2.git synced 2025-07-29 19:41:56 +03:00

lvmlockd: use new sanlock_acquire2 to return owner info

Use the new sanlock_acquire2() which returns info about the owner
of a lease.  Pass this info back to the lvm command, where it's
initially used to print the host_id of a host holding a lock
when it cannot be acquired.
This commit is contained in:
David Teigland
2025-03-21 11:36:49 -05:00
parent 0217887fcd
commit 9b51b3d3f1
5 changed files with 267 additions and 127 deletions

View File

@ -57,5 +57,6 @@ static inline void lvmlockd_close(daemon_handle h)
#define EORPHAN 222
#define EADOPT_NONE 223
#define EADOPT_RETRY 224
#define EIOTIMEOUT 225
#endif /* _LVM_LVMLOCKD_CLIENT_H */

View File

@ -1108,14 +1108,15 @@ static int lm_add_resource(struct lockspace *ls, struct resource *r)
}
static int lm_lock(struct lockspace *ls, struct resource *r, int mode, struct action *act,
struct val_blk *vb_out, int *retry, int adopt_only, int adopt_ok)
struct val_blk *vb_out, int *retry, struct owner *owner,
int adopt_only, int adopt_ok)
{
int rv = -1;
if (ls->lm_type == LD_LM_DLM)
rv = lm_lock_dlm(ls, r, mode, vb_out, adopt_only, adopt_ok);
else if (ls->lm_type == LD_LM_SANLOCK)
rv = lm_lock_sanlock(ls, r, mode, vb_out, retry, adopt_only, adopt_ok);
rv = lm_lock_sanlock(ls, r, mode, vb_out, retry, owner, adopt_only, adopt_ok);
else if (ls->lm_type == LD_LM_IDM)
rv = lm_lock_idm(ls, r, mode, vb_out, act->lv_uuid,
&act->pvs, adopt_only, adopt_ok);
@ -1264,7 +1265,7 @@ static void add_work_action(struct action *act)
pthread_mutex_unlock(&worker_mutex);
}
static int res_lock(struct lockspace *ls, struct resource *r, struct action *act, int *retry)
static int res_lock(struct lockspace *ls, struct resource *r, struct action *act, int *retry, struct owner *owner)
{
struct lock *lk;
struct val_blk vb;
@ -1289,7 +1290,7 @@ static int res_lock(struct lockspace *ls, struct resource *r, struct action *act
if (r->type == LD_RT_LV && act->lv_args[0])
memcpy(r->lv_args, act->lv_args, MAX_ARGS);
rv = lm_lock(ls, r, act->mode, act, &vb, retry,
rv = lm_lock(ls, r, act->mode, act, &vb, retry, owner,
act->flags & LD_AF_ADOPT_ONLY ? 1 : 0,
act->flags & LD_AF_ADOPT ? 1 : 0);
@ -1901,6 +1902,7 @@ out:
static void res_process(struct lockspace *ls, struct resource *r,
struct list_head *act_close_list, int *retry_out)
{
struct owner owner = { 0 };
struct action *act, *safe, *act_close;
struct lock *lk;
uint32_t unlock_by_client_id = 0;
@ -2189,8 +2191,15 @@ static void res_process(struct lockspace *ls, struct resource *r,
if (act->op == LD_OP_LOCK && act->mode == LD_LK_SH) {
lm_retry = 0;
memset(&owner, 0, sizeof(owner));
rv = res_lock(ls, r, act, &lm_retry, &owner);
/* TODO: if lock fails because it's owned by a failed host,
and persistent reservations are enabled, then remove the
pr of failed host_id, tell sanlock the host_id is now
dead, and retry lock request. */
rv = res_lock(ls, r, act, &lm_retry);
if ((rv == -EAGAIN) &&
(act->retries <= act->max_retries) &&
(lm_retry || (r->type != LD_RT_LV))) {
@ -2199,6 +2208,8 @@ static void res_process(struct lockspace *ls, struct resource *r,
act->retries++;
*retry_out = 1;
} else {
if (rv == -EAGAIN)
memcpy(&act->owner, &owner, sizeof(owner));
act->result = rv;
list_del(&act->list);
add_client_result(act);
@ -2222,8 +2233,10 @@ static void res_process(struct lockspace *ls, struct resource *r,
list_for_each_entry_safe(act, safe, &r->actions, list) {
if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX) {
lm_retry = 0;
memset(&owner, 0, sizeof(owner));
rv = res_lock(ls, r, act, &lm_retry, &owner);
rv = res_lock(ls, r, act, &lm_retry);
if ((rv == -EAGAIN) &&
(act->retries <= act->max_retries) &&
(lm_retry || (r->type != LD_RT_LV))) {
@ -2232,6 +2245,8 @@ static void res_process(struct lockspace *ls, struct resource *r,
act->retries++;
*retry_out = 1;
} else {
if (rv == -EAGAIN)
memcpy(&act->owner, &owner, sizeof(owner));
act->result = rv;
list_del(&act->list);
add_client_result(act);
@ -4235,6 +4250,31 @@ static int client_send_result(struct client *cl, struct action *act)
"result = " FMTd64, (int64_t) act->result,
"dump_len = " FMTd64, (int64_t) dump_len,
NULL);
} else if (act->op == LD_OP_LOCK && act->owner.host_id) {
/*
* lock reply with owner info
*/
log_debug("send %s[%d][%u] %s%s%s result %d owner %u %u %u %s %s",
cl->name[0] ? cl->name : "client", cl->pid, cl->id,
op_mode_str(act->op, act->mode), act->rt ? "_" : "", rt_str(act->rt), act->result,
act->owner.host_id, act->owner.generation, act->owner.timestamp,
act->owner.state[0] ? act->owner.state : "",
act->owner.name[0] ? act->owner.name : "");
res = daemon_reply_simple("OK",
"op = " FMTd64, (int64_t) act->op,
"lock_type = %s", lm_str(act->lm_type),
"op_result = " FMTd64, (int64_t) act->result,
"lm_result = " FMTd64, (int64_t) act->lm_rv,
"owner_host_id = " FMTd64, (int64_t) act->owner.host_id,
"owner_generation = " FMTd64, (int64_t) act->owner.generation,
"owner_timestamp = " FMTd64, (int64_t) act->owner.timestamp,
"owner_state = %s", act->owner.state[0] ? act->owner.state : "none",
"owner_name = %s", act->owner.name[0] ? act->owner.name : "none",
"result_flags = %s", result_flags[0] ? result_flags : "none",
NULL);
} else {
/*
* A normal reply.

View File

@ -130,6 +130,17 @@ struct pvs {
int num;
};
#define OWNER_NAME_SIZE 64
#define OWNER_STATE_SIZE 32

/*
 * Info about the host holding a lease, returned by the lock manager
 * (sanlock_acquire2) when a lock request fails because another host
 * owns the lock.  Passed back to the lvm command in the lock reply.
 */
struct owner {
	uint32_t host_id;		/* lease owner's sanlock host_id */
	uint32_t generation;		/* owner's host generation number */
	uint32_t timestamp;		/* owner's last lease renewal time */
	char state[OWNER_STATE_SIZE];	/* host state string, e.g. LIVE/FAIL/DEAD */
	char name[OWNER_NAME_SIZE];	/* owner name from the lease, if available */
};
struct action {
struct list_head list;
uint32_t client_id;
@ -154,6 +165,7 @@ struct action {
char vg_args[MAX_ARGS+1];
char lv_args[MAX_ARGS+1];
char prev_lv_args[MAX_ARGS+1];
struct owner owner;
struct pvs pvs; /* PV list for idm */
};
@ -553,7 +565,7 @@ int lm_add_lockspace_sanlock(struct lockspace *ls, int adopt_only, int adopt_ok,
int lm_rem_lockspace_sanlock(struct lockspace *ls, int free_vg);
int lm_add_resource_sanlock(struct lockspace *ls, struct resource *r);
int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
struct val_blk *vb_out, int *retry,
struct val_blk *vb_out, int *retry, struct owner *owner,
int adopt_only, int adopt_ok);
int lm_convert_sanlock(struct lockspace *ls, struct resource *r,
int ld_mode, uint32_t r_version);
@ -617,7 +629,7 @@ static inline int lm_add_resource_sanlock(struct lockspace *ls, struct resource
}
static inline int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
struct val_blk *vb_out, int *retry,
struct val_blk *vb_out, int *retry, struct owner *owner,
int adopt_only, int adopt_ok)
{
return -1;

View File

@ -27,6 +27,8 @@
/* FIXME: copied from sanlock header until the sanlock update is more widespread */
#define SANLK_ADD_NODELAY 0x00000002
#define SANLOCK_HAS_ACQUIRE2 1
#include <stddef.h>
#include <poll.h>
#include <errno.h>
@ -1797,13 +1799,33 @@ int lm_rem_resource_sanlock(struct lockspace *ls, struct resource *r)
return 0;
}
/*
 * Translate the sanlock host state (low bits of sanlk_host.flags)
 * into a short printable string for owner reporting.
 */
static const char *_host_flags_to_str(uint32_t flags)
{
	switch (flags & SANLK_HOST_MASK) {
	case SANLK_HOST_FREE:
		return "FREE";
	case SANLK_HOST_LIVE:
		return "LIVE";
	case SANLK_HOST_FAIL:
		return "FAIL";
	case SANLK_HOST_DEAD:
		return "DEAD";
	case SANLK_HOST_UNKNOWN:
		return "UNKNOWN";
	default:
		return "ERROR";
	}
}
int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
struct val_blk *vb_out, int *retry, int adopt_only, int adopt_ok)
struct val_blk *vb_out, int *retry, struct owner *owner,
int adopt_only, int adopt_ok)
{
struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data;
struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data;
struct sanlk_resource *rs;
struct sanlk_options opt;
struct sanlk_host owner_host = { 0 };
char *owner_name = NULL;
uint64_t lock_lv_offset;
uint32_t flags = 0;
struct val_blk vb = { 0 };
@ -1907,7 +1929,11 @@ int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
memset(&opt, 0, sizeof(opt));
sprintf(opt.owner_name, "%s", "lvmlockd");
#ifdef SANLOCK_HAS_ACQUIRE2
rv = sanlock_acquire2(lms->sock, -1, flags, rs, &opt, &owner_host, &owner_name);
#else
rv = sanlock_acquire(lms->sock, -1, flags, 1, &rs, &opt);
#endif
/*
* errors: translate the sanlock error number to an lvmlockd error.
@ -1915,17 +1941,6 @@ int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
* this function to code that doesn't recognize sanlock error numbers.
*/
if (rv == -EAGAIN) {
/*
* It appears that sanlock_acquire returns EAGAIN when we request
* a shared lock but the lock is held ex by another host.
* There's no point in retrying this case, just return an error.
*/
log_debug("%s:%s lock_san acquire mode %d rv EAGAIN", ls->name, r->name, ld_mode);
*retry = 0;
return -EAGAIN;
}
if ((rv == -EMSGSIZE) && (r->type == LD_RT_LV)) {
/*
* sanlock tried to read beyond the end of the device,
@ -1962,64 +1977,68 @@ int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode,
return -EADOPT_NONE;
}
if (rv == SANLK_ACQUIRE_IDLIVE || rv == SANLK_ACQUIRE_OWNED || rv == SANLK_ACQUIRE_OTHER) {
if (rv == SANLK_ACQUIRE_IDLIVE ||
rv == SANLK_ACQUIRE_OWNED ||
rv == SANLK_ACQUIRE_OTHER ||
rv == SANLK_ACQUIRE_OWNED_RETRY ||
rv == -EAGAIN) {
/*
* The lock is held by another host. These failures can
* happen while multiple hosts are concurrently acquiring
* shared locks. We want to retry a couple times in this
* case because we'll probably get the sh lock.
* EAGAIN: when a shared lock is held, and we request an ex lock.
*
* I believe these are also the errors when requesting an
* ex lock that another host holds ex. We want to report
* something like: "lock is held by another host" in this case.
* Retry is pointless here.
* OWNED_RETRY: the lock is held by a failed but not yet dead host.
* Retrying will eventually find the host is dead (and the lock is
* granted), or another host has acquired it.
*
* We can't distinguish between the two cases above,
* so if requesting a sh lock, retry a couple times,
* otherwise don't.
* Multiple hosts all requesting shared locks can also result in
* some transient errors here (shared locks involve acquiring the
* paxos lease ex for a short period, which means two hosts both
* requesting sh at once can cause one to fail here.)
* Retry here to attempt to cover these transient failures.
*
* The command also has its own configurable retry logic.
* The intention is to handle actual lock contention retries
* from the command, and the transient failures from concurrent
* shared requests here. We don't actually know when a failure
* was related to the transient concurrent sh, so we just guess
* it was if we were requesting a sh lock.
*/
log_debug("%s:%s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv);
*retry = (ld_mode == LD_LK_SH) ? 1 : 0;
if (rv == SANLK_ACQUIRE_OWNED_RETRY)
*retry = 0;
if (owner && owner_host.host_id) {
const char *host_state;
owner->host_id = (uint32_t)owner_host.host_id;
owner->generation = (uint32_t)owner_host.generation;
owner->timestamp = (uint32_t)owner_host.timestamp;
if ((host_state = _host_flags_to_str(owner_host.flags)))
dm_strncpy(owner->state, host_state, OWNER_STATE_SIZE-1);
if (owner_name) {
dm_strncpy(owner->name, owner_name, OWNER_NAME_SIZE-1);
free(owner_name);
}
log_debug("%s:%s lock_san acquire mode %d lock held %d owner %u %u %u %s %s",
ls->name, r->name, ld_mode, rv,
owner->host_id, owner->generation, owner->timestamp,
owner->state, owner->name ?: "");
} else {
log_debug("%s:%s lock_san acquire mode %d lock held %d",
ls->name, r->name, ld_mode, rv);
}
return -EAGAIN;
}
if (rv == SANLK_AIO_TIMEOUT) {
/*
* sanlock got an i/o timeout when trying to acquire the
* lease on disk.
*/
log_debug("%s:%s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv);
log_debug("%s:%s lock_san acquire mode %d io timeout", ls->name, r->name, ld_mode);
*retry = 0;
return -EAGAIN;
}
if (rv == SANLK_DBLOCK_LVER || rv == SANLK_DBLOCK_MBAL) {
/*
* There was contention with another host for the lease,
* and we lost.
*/
log_debug("%s:%s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv);
*retry = 0;
return -EAGAIN;
}
if (rv == SANLK_ACQUIRE_OWNED_RETRY) {
/*
* The lock is held by a failed host, and will eventually
* expire. If we retry we'll eventually acquire the lock
* (or find someone else has acquired it). The EAGAIN retry
* attempts for SH locks above would not be sufficient for
* the length of expiration time. We could add a longer
* retry time here to cover the full expiration time and block
* the activation command for that long. For now just return
* the standard error indicating that another host still owns
* the lease. FIXME: return a different error number so the
command can print a different error indicating that the
* owner of the lease is in the process of expiring?
*/
log_debug("%s:%s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv);
*retry = 0;
return -EAGAIN;
return -EIOTIMEOUT;
}
if (rv < 0) {
@ -2162,8 +2181,6 @@ int lm_convert_sanlock(struct lockspace *ls, struct resource *r,
case SANLK_ACQUIRE_OWNED_RETRY:
case SANLK_ACQUIRE_OTHER:
case SANLK_AIO_TIMEOUT:
case SANLK_DBLOCK_LVER:
case SANLK_DBLOCK_MBAL:
/* expected errors from known/normal cases like lock contention or io timeouts */
log_debug("%s:%s convert_san error %d", ls->name, r->name, rv);
return -EAGAIN;

View File

@ -32,6 +32,12 @@ struct lvmlockd_pvs {
int num;
};
/*
 * Owner info for a lock held by another host, extracted from the
 * lvmlockd reply (owner_host_id / owner_generation / owner_name
 * fields) and used to report who holds a contended lock.
 */
struct owner {
	uint32_t host_id;	/* sanlock host_id of the lock holder */
	uint32_t generation;	/* holder's host generation number */
	char *name;		/* holder name; allocated from cmd->mem pool */
};
void lvmlockd_set_socket(const char *sock)
{
_lvmlockd_socket = sock;
@ -132,6 +138,21 @@ static void _flags_str_to_lockd_flags(const char *flags_str, uint32_t *lockd_fla
*lockd_flags |= LD_RF_SH_EXISTS;
}
/*
 * Format owner info as a short suffix for log messages, e.g.
 * " (host_id 3)", or "" when no owner info is available.
 *
 * Returns a pointer to a static buffer: not reentrant/thread-safe,
 * and the result is only valid until the next call.  That matches
 * its use here, where commands are single-threaded.
 */
static char *_owner_str(struct owner *owner)
{
	static char log_owner_str[128];

	if (!owner || !owner->host_id)
		return (char *)"";

	log_owner_str[0] = '\0';

	/* Use a --lockopt setting to print all owner details? */

	/* snprintf always NUL-terminates, so pass the full buffer size. */
	snprintf(log_owner_str, sizeof(log_owner_str), " (host_id %u)", owner->host_id);

	return log_owner_str;
}
/*
* evaluate the reply from lvmlockd, check for errors, extract
* the result and lockd_flags returned by lvmlockd.
@ -146,10 +167,11 @@ static void _flags_str_to_lockd_flags(const char *flags_str, uint32_t *lockd_fla
*/
#define NO_LOCKD_RESULT (-1000)
static int _lockd_result(const char *req_name, daemon_reply reply, int *result, uint32_t *lockd_flags)
static int _lockd_result(struct cmd_context *cmd, const char *req_name, daemon_reply reply,
int *result, uint32_t *lockd_flags, struct owner *owner)
{
int reply_result;
const char *flags_str = NULL;
const char *str;
*result = -1;
@ -172,8 +194,15 @@ static int _lockd_result(const char *req_name, daemon_reply reply, int *result,
*result = reply_result;
if (lockd_flags) {
if ((flags_str = daemon_reply_str(reply, "result_flags", NULL)))
_flags_str_to_lockd_flags(flags_str, lockd_flags);
if ((str = daemon_reply_str(reply, "result_flags", NULL)))
_flags_str_to_lockd_flags(str, lockd_flags);
}
if (owner) {
owner->host_id = (uint32_t)daemon_reply_int(reply, "owner_host_id", 0);
owner->generation = (uint32_t)daemon_reply_int(reply, "owner_generation", 0);
if ((str = daemon_reply_str(reply, "owner_name", "none")))
owner->name = dm_pool_strdup(cmd->mem, str);
}
log_debug("lockd %s result: %d", req_name, reply_result);
@ -389,7 +418,8 @@ static int _lockd_request(struct cmd_context *cmd,
const char *opts,
const struct lvmlockd_pvs *lock_pvs,
int *result,
uint32_t *lockd_flags)
uint32_t *lockd_flags,
struct owner *owner)
{
const char *cmd_name = get_cmd_name();
daemon_reply reply;
@ -426,7 +456,7 @@ static int _lockd_request(struct cmd_context *cmd,
"lv_lock_args = %s", lv_lock_args ?: "none",
NULL);
if (!_lockd_result(req_name, reply, result, lockd_flags))
if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner))
goto fail;
/*
@ -446,7 +476,7 @@ static int _lockd_request(struct cmd_context *cmd,
"vg_lock_args = %s", vg_lock_args ?: "none",
NULL);
if (!_lockd_result(req_name, reply, result, lockd_flags))
if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner))
goto fail;
/*
@ -464,7 +494,7 @@ static int _lockd_request(struct cmd_context *cmd,
"vg_lock_type = %s", vg_lock_type ?: "none",
NULL);
if (!_lockd_result(req_name, reply, result, lockd_flags))
if (!_lockd_result(cmd, req_name, reply, result, lockd_flags, owner))
goto fail;
log_debug("lockd %s %s result %d %x",
@ -735,7 +765,7 @@ static int _handle_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg)
"lv_size_bytes = " FMTd64, (int64_t) lv_size_bytes,
NULL);
if (!_lockd_result("find_free_lock", reply, &result, NULL)) {
if (!_lockd_result(cmd, "find_free_lock", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -790,7 +820,7 @@ static int _init_vg(struct cmd_context *cmd, struct volume_group *vg,
"vg_lock_type = %s", lock_type,
NULL);
if (!_lockd_result("init_vg", reply, &result, NULL)) {
if (!_lockd_result(cmd, "init_vg", reply, &result, NULL, NULL)) {
ret = 0;
result = -ELOCKD;
} else {
@ -987,7 +1017,7 @@ static int _init_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg, in
"opts = %s", opts ?: "none",
NULL);
if (!_lockd_result("init_vg", reply, &result, NULL)) {
if (!_lockd_result(cmd, "init_vg", reply, &result, NULL, NULL)) {
ret = 0;
result = -ELOCKD;
} else {
@ -1089,7 +1119,7 @@ static int _free_vg(struct cmd_context *cmd, struct volume_group *vg)
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("free_vg", reply, &result, &lockd_flags)) {
if (!_lockd_result(cmd, "free_vg", reply, &result, &lockd_flags, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1143,7 +1173,7 @@ static int _busy_vg(struct cmd_context *cmd, struct volume_group *vg)
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("busy_vg", reply, &result, &lockd_flags)) {
if (!_lockd_result(cmd, "busy_vg", reply, &result, &lockd_flags, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1217,7 +1247,7 @@ static int _free_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg)
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("free_vg", reply, &result, &lockd_flags)) {
if (!_lockd_result(cmd, "free_vg", reply, &result, &lockd_flags, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1497,7 +1527,7 @@ int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg, int *exists
NULL);
}
if (!_lockd_result("start_vg", reply, &result, &lockd_flags)) {
if (!_lockd_result(cmd, "start_vg", reply, &result, &lockd_flags, NULL)) {
ret = 0;
result = -ELOCKD;
} else {
@ -1566,7 +1596,7 @@ int lockd_stop_vg(struct cmd_context *cmd, struct volume_group *vg)
"vg_name = %s", vg->name,
NULL);
if (!_lockd_result("stop_vg", reply, &result, NULL)) {
if (!_lockd_result(cmd, "stop_vg", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1612,7 +1642,7 @@ int lockd_start_wait(struct cmd_context *cmd)
"pid = " FMTd64, (int64_t) getpid(),
NULL);
if (!_lockd_result("start_wait", reply, &result, NULL)) {
if (!_lockd_result(cmd, "start_wait", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -1683,6 +1713,7 @@ int lockd_start_wait(struct cmd_context *cmd)
int lockd_global_create(struct cmd_context *cmd, const char *def_mode, const char *vg_lock_type)
{
struct owner owner = { 0 };
const char *mode = NULL;
uint32_t lockd_flags;
int retries = 0;
@ -1730,15 +1761,18 @@ int lockd_global_create(struct cmd_context *cmd, const char *def_mode, const cha
req:
if (!_lockd_request(cmd, "lock_gl",
NULL, vg_lock_type, NULL, NULL, NULL, NULL, mode, NULL,
NULL, &result, &lockd_flags)) {
NULL, &result, &lockd_flags, &owner)) {
/* No result from lvmlockd, it is probably not running. */
log_error("Global lock failed: check that lvmlockd is running.");
return 0;
}
if (result == -EAGAIN) {
if (result == -EAGAIN || result == -EIOTIMEOUT) {
if (retries < find_config_tree_int(cmd, global_lvmlockd_lock_retries_CFG, NULL)) {
log_warn("Retrying %s global lock", mode);
if (result == -EIOTIMEOUT)
log_warn("Retrying global lock: io timeout");
else
log_warn("Retrying global lock: held by other host%s", _owner_str(&owner));
sleep(1);
retries++;
goto req;
@ -1821,8 +1855,10 @@ int lockd_global_create(struct cmd_context *cmd, const char *def_mode, const cha
if (result < 0) {
if (result == -ESTARTING)
log_error("Global lock failed: lockspace is starting.");
else if (result == -EIOTIMEOUT)
log_error("Global lock failed: io timeout");
else if (result == -EAGAIN)
log_error("Global lock failed: held by other host.");
log_error("Global lock failed: held by other host%s", _owner_str(&owner));
else if (result == -EPROTONOSUPPORT)
log_error("VG create failed: lock manager %s is not supported by lvmlockd.", vg_lock_type);
else
@ -1925,6 +1961,7 @@ out:
int lockd_global(struct cmd_context *cmd, const char *def_mode)
{
struct owner owner = { 0 };
const char *mode = NULL;
const char *opts = NULL;
uint32_t lockd_flags;
@ -1976,7 +2013,7 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
if (!_lockd_request(cmd, "lock_gl",
NULL, NULL, NULL, NULL, NULL, NULL, mode, opts,
NULL, &result, &lockd_flags)) {
NULL, &result, &lockd_flags, &owner)) {
/* No result from lvmlockd, it is probably not running. */
/* We don't care if an unlock fails. */
@ -1993,9 +2030,12 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
return 0;
}
if (result == -EAGAIN) {
if (result == -EAGAIN || result == -EIOTIMEOUT) {
if (retries < find_config_tree_int(cmd, global_lvmlockd_lock_retries_CFG, NULL)) {
log_warn("Retrying %s global lock", mode);
if (result == -EIOTIMEOUT)
log_warn("Retrying global lock: io timeout");
else
log_warn("Retrying global lock: held by other host%s", _owner_str(&owner));
sleep(1);
retries++;
goto req;
@ -2038,10 +2078,12 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
result == -ESTARTING ||
result == -EVGKILLED ||
result == -ELOCKIO ||
result == -EIOTIMEOUT ||
result == -ELMERR ||
result == -EORPHAN ||
result == -EADOPT_RETRY ||
result == -EADOPT_NONE) {
result == -EADOPT_NONE ||
result == -EAGAIN) {
/*
* If an ex global lock fails, then the command fails.
*/
@ -2052,6 +2094,8 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
log_error("Global lock failed: check that global lockspace is started");
else if (result == -ELOCKIO)
log_error("Global lock failed: storage errors for sanlock leases");
else if (result == -EIOTIMEOUT)
log_error("Global lock failed: io timeout");
else if (result == -ELMERR)
log_error("Global lock failed: lock manager error");
else if (result == -EVGKILLED)
@ -2062,6 +2106,8 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
log_error("Global lock failed: adopt found no orphan");
else if (result == -EADOPT_RETRY)
log_error("Global lock failed: adopt found other mode");
else if (result == -EAGAIN)
log_error("Global lock failed: held by other host%s", _owner_str(&owner));
else
log_error("Global lock failed: error %d", result);
@ -2085,6 +2131,11 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
goto allow;
}
if (result == -EIOTIMEOUT) {
log_warn("Skipping global lock: io timeout");
goto allow;
}
if ((lockd_flags & LD_RF_NO_GL_LS) && (lockd_flags & LD_RF_WARN_GL_REMOVED)) {
log_warn("Skipping global lock: VG with global lock was removed");
goto allow;
@ -2110,12 +2161,16 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
goto allow;
}
if (result == -EAGAIN) {
log_warn("Skipping global lock: held by other host%s", _owner_str(&owner));
goto allow;
}
if ((lockd_flags & LD_RF_NO_GL_LS) || (lockd_flags & LD_RF_NO_LOCKSPACES)) {
log_debug("Skipping global lock: lockspace not found or started");
goto allow;
}
/*
* This is for completeness. If we reach here, then
* a specific check for the error should be added above
@ -2129,21 +2184,13 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
log_warn("Duplicate sanlock global locks should be corrected");
if (result < 0) {
if (result == -EAGAIN) {
/*
* Most of the time, retries should avoid this case.
*/
log_error("Global lock failed: held by other host.");
return 0;
} else {
/*
* We don't intend to reach this. We should check
* any known/possible error specifically and print
* a more helpful message. This is for completeness.
*/
log_error("Global lock failed: error %d.", result);
return 0;
}
/*
* We don't intend to reach this. We should check
* any known/possible error specifically and print
* a more helpful message. This is for completeness.
*/
log_error("Global lock failed: error %d.", result);
return 0;
}
allow:
@ -2194,6 +2241,7 @@ int lockd_global(struct cmd_context *cmd, const char *def_mode)
int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
uint32_t flags, uint32_t *lockd_state)
{
struct owner owner = { 0 };
const char *mode = NULL;
const char *opts = NULL;
uint32_t lockd_flags;
@ -2293,7 +2341,7 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
if (!_lockd_request(cmd, "lock_vg",
vg_name, NULL, NULL, NULL, NULL, NULL, mode, opts,
NULL, &result, &lockd_flags)) {
NULL, &result, &lockd_flags, &owner)) {
/*
* No result from lvmlockd, it is probably not running.
* Decide if it is ok to continue without a lock in
@ -2305,9 +2353,12 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
return 1;
}
if (result == -EAGAIN) {
if (result == -EAGAIN || result == -EIOTIMEOUT) {
if (retries < find_config_tree_int(cmd, global_lvmlockd_lock_retries_CFG, NULL)) {
log_warn("Retrying %s lock on VG %s", mode, vg_name);
if (result == -EIOTIMEOUT)
log_warn("Retrying lock on VG %s: io timeout", vg_name);
else
log_warn("Retrying lock on VG %s: held by other host%s", vg_name, _owner_str(&owner));
sleep(1);
retries++;
goto req;
@ -2381,6 +2432,19 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
}
}
if (result == -EIOTIMEOUT) {
if (!strcmp(mode, "un"))
goto out;
else if (!strcmp(mode, "sh")) {
log_warn("VG %s lock skipped: io timeout", vg_name);
goto out;
} else {
log_error("VG %s lock failed: io timeout", vg_name);
ret = 0;
goto out;
}
}
/*
* The lock is held by another host, and retries have been unsuccessful.
*/
@ -2388,10 +2452,10 @@ int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
if (!strcmp(mode, "un"))
goto out;
else if (!strcmp(mode, "sh")) {
log_warn("VG %s lock skipped: held by other host.", vg_name);
log_warn("VG %s lock skipped: held by other host%s", vg_name, _owner_str(&owner));
goto out;
} else {
log_error("VG %s lock failed: held by other host.", vg_name);
log_error("VG %s lock failed: held by other host%s", vg_name, _owner_str(&owner));
ret = 0;
goto out;
}
@ -2522,7 +2586,7 @@ int lockd_vg_update(struct volume_group *vg)
"version = " FMTd64, (int64_t) vg->seqno,
NULL);
if (!_lockd_result("vg_update", reply, &result, NULL)) {
if (!_lockd_result(vg->cmd, "vg_update", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -2554,7 +2618,7 @@ static int _query_lv(struct cmd_context *cmd, struct volume_group *vg,
"lv_lock_args = %s", lock_args ?: "none",
NULL);
if (!_lockd_result("query_lock_lv", reply, &result, NULL)) {
if (!_lockd_result(cmd, "query_lock_lv", reply, &result, NULL, NULL)) {
/* No result from lvmlockd, it is probably not running. */
log_error("Lock query failed for LV %s/%s", vg->name, lv_name);
return 0;
@ -2621,6 +2685,7 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
const char *lv_name, struct id *lv_id,
const char *lock_args, const char *def_mode, uint32_t flags)
{
struct owner owner = { 0 };
char lv_uuid[64] __attribute__((aligned(8)));
char opt_buf[64] = {};
const char *opts = NULL;
@ -2718,7 +2783,7 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
if (!_lockd_request(cmd, "lock_lv",
vg->name, vg->lock_type, vg->lock_args,
lv_name, lv_uuid, lock_args, mode, opts,
&lock_pvs, &result, &lockd_flags)) {
&lock_pvs, &result, &lockd_flags, NULL)) {
_lockd_free_pv_list(&lock_pvs);
/* No result from lvmlockd, it is probably not running. */
log_error("Locking failed for LV %s/%s", vg->name, lv_name);
@ -2729,7 +2794,7 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
if (!_lockd_request(cmd, "lock_lv",
vg->name, vg->lock_type, vg->lock_args,
lv_name, lv_uuid, lock_args, mode, opts,
NULL, &result, &lockd_flags)) {
NULL, &result, &lockd_flags, &owner)) {
/* No result from lvmlockd, it is probably not running. */
log_error("Locking failed for LV %s/%s", vg->name, lv_name);
return 0;
@ -2744,7 +2809,12 @@ int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
return 1;
if (result == -EAGAIN) {
log_error("LV locked by other host: %s/%s", vg->name, lv_name);
log_error("LV locked by other host: %s/%s%s", vg->name, lv_name, _owner_str(&owner));
return 0;
}
if (result == -EIOTIMEOUT) {
log_error("LV %s/%s lock failed: io timeout.", vg->name, lv_name);
return 0;
}
@ -3649,7 +3719,7 @@ static int _init_lv_sanlock(struct cmd_context *cmd, struct volume_group *vg,
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("init_lv", reply, &result, NULL)) {
if (!_lockd_result(cmd, "init_lv", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -3724,7 +3794,7 @@ static int _free_lv(struct cmd_context *cmd, struct volume_group *vg,
"lv_lock_args = %s", lock_args ?: "none",
NULL);
if (!_lockd_result("free_lv", reply, &result, NULL)) {
if (!_lockd_result(cmd, "free_lv", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -3989,7 +4059,7 @@ int lockd_rename_vg_before(struct cmd_context *cmd, struct volume_group *vg)
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("rename_vg_before", reply, &result, NULL)) {
if (!_lockd_result(cmd, "rename_vg_before", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -4054,7 +4124,7 @@ int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int
"vg_lock_args = %s", vg->lock_args,
NULL);
if (!_lockd_result("rename_vg_final", reply, &result, NULL)) {
if (!_lockd_result(cmd, "rename_vg_final", reply, &result, NULL, NULL)) {
ret = 0;
} else {
ret = (result < 0) ? 0 : 1;
@ -4095,7 +4165,7 @@ const char *lockd_running_lock_type(struct cmd_context *cmd, int *found_multiple
"pid = " FMTd64, (int64_t) getpid(),
NULL);
if (!_lockd_result("running_lm", reply, &result, NULL)) {
if (!_lockd_result(cmd, "running_lm", reply, &result, NULL, NULL)) {
log_error("Failed to get result from lvmlockd");
goto out;
}
@ -4216,7 +4286,7 @@ int lockd_lv_refresh(struct cmd_context *cmd, struct lvresize_params *lp)
"path = %s", path,
NULL);
if (!_lockd_result("refresh_lv", reply, &result, NULL)) {
if (!_lockd_result(cmd, "refresh_lv", reply, &result, NULL, NULL)) {
/* No result from lvmlockd, it is probably not running. */
log_error("LV refresh failed for LV %s", path);
return 0;