From 8a82dbf19129dde9e6fc9ab25a00dbc7569abe6a Mon Sep 17 00:00:00 2001
From: James Smart
Date: Mon, 9 Oct 2017 13:39:44 -0700
Subject: [PATCH 1/6] nvme-fc: fix iowait hang
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add missing iowait head initialization.

Fix irqsave vs irq: wait_event_lock_irq() doesn't do irq save/restore.

Fixes: 36715cf4b366 ("nvme_fc: replace ioabort msleep loop with completion")
Cc: # 4.13
Signed-off-by: James Smart
Reviewed-by: Sagi Grimberg
Reviewed-by: Himanshu Madhani
Tested-by: Himanshu Madhani
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/fc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index af075e998944..8182b1999f49 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2545,10 +2545,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
 	nvme_fc_abort_aen_ops(ctrl);
 
 	/* wait for all io that had to be aborted */
-	spin_lock_irqsave(&ctrl->lock, flags);
+	spin_lock_irq(&ctrl->lock);
 	wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock);
 	ctrl->flags &= ~FCCTRL_TERMIO;
-	spin_unlock_irqrestore(&ctrl->lock, flags);
+	spin_unlock_irq(&ctrl->lock);
 
 	nvme_fc_term_aen_ops(ctrl);
 
@@ -2760,6 +2760,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	ctrl->rport = rport;
 	ctrl->dev = lport->dev;
 	ctrl->cnum = idx;
+	init_waitqueue_head(&ctrl->ioabort_wait);
 
 	get_device(ctrl->dev);
 	kref_init(&ctrl->ref);
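
Illustrative sketch (not part of the patch): the corrected pattern, with a
made-up "struct toy_ctrl" standing in for the real nvme_fc_ctrl.
wait_event_lock_irq() drops the lock with spin_unlock_irq() and retakes it
with spin_lock_irq() while sleeping, so the caller must use the plain _irq
lock variants rather than irqsave/irqrestore, and the wait queue head has to
be initialized before anything can sleep on it:

#include <linux/spinlock.h>
#include <linux/wait.h>

struct toy_ctrl {				/* hypothetical stand-in */
	spinlock_t		lock;
	wait_queue_head_t	ioabort_wait;
	int			iocnt;
};

static void toy_ctrl_setup(struct toy_ctrl *ctrl)
{
	spin_lock_init(&ctrl->lock);
	init_waitqueue_head(&ctrl->ioabort_wait);	/* the init the patch adds */
	ctrl->iocnt = 0;
}

static void toy_ctrl_wait_for_aborts(struct toy_ctrl *ctrl)
{
	spin_lock_irq(&ctrl->lock);			/* not spin_lock_irqsave() */
	wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock);
	spin_unlock_irq(&ctrl->lock);
}

Completion paths would decrement iocnt under the lock and call
wake_up(&ctrl->ioabort_wait) so the waiter re-checks the condition.
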
From 17c4dc6eb7e1b2fb1ce6a52467e3be635224606e Mon Sep 17 00:00:00 2001
From: James Smart
Date: Mon, 9 Oct 2017 16:39:22 -0700
Subject: [PATCH 2/6] nvme-fc: retry initial controller connections 3 times

Currently, if a frame is lost or a command fails as part of the initial
association create for a new controller, the new controller connection
request will immediately fail.

Add an immediate 3-retry loop before giving up.

Signed-off-by: James Smart
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/fc.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 8182b1999f49..be49d0f79381 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2734,7 +2734,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 {
 	struct nvme_fc_ctrl *ctrl;
 	unsigned long flags;
-	int ret, idx;
+	int ret, idx, retry;
 
 	if (!(rport->remoteport.port_role &
 	    (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) {
@@ -2826,9 +2826,37 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
 	spin_unlock_irqrestore(&rport->lock, flags);
 
-	ret = nvme_fc_create_association(ctrl);
+	/*
+	 * It's possible that transactions used to create the association
+	 * may fail. Examples: CreateAssociation LS or CreateIOConnection
+	 * LS gets dropped/corrupted/fails; or a frame gets dropped or a
+	 * command times out for one of the actions to init the controller
+	 * (Connect, Get/Set_Property, Set_Features, etc). Many of these
+	 * transport errors (frame drop, LS failure) inherently must kill
+	 * the association. The transport is coded so that any command used
+	 * to create the association (prior to a LIVE state transition
+	 * while NEW or RECONNECTING) will fail if it completes in error or
+	 * times out.
+	 *
+	 * As such: as the connect request was most likely due to a
+	 * udev event that discovered the remote port, meaning there is
+	 * not an admin or script there to restart if the connect
+	 * request fails, retry the initial connection creation up to
+	 * three times before giving up and declaring failure.
+	 */
+	for (retry = 0; retry < 3; retry++) {
+		ret = nvme_fc_create_association(ctrl);
+		if (!ret)
+			break;
+	}
+
 	if (ret) {
+		/* couldn't schedule retry - fail out */
+		dev_err(ctrl->ctrl.device,
+			"NVME-FC{%d}: Connect retry failed\n", ctrl->cnum);
+
 		ctrl->ctrl.opts = NULL;
+
 		/* initiate nvme ctrl ref counting teardown */
 		nvme_uninit_ctrl(&ctrl->ctrl);
 		nvme_put_ctrl(&ctrl->ctrl);

From f9cf2a64912d67c9cf49c316a0a0ada0ea7ed1da Mon Sep 17 00:00:00 2001
From: James Smart
Date: Wed, 18 Oct 2017 14:33:59 -0700
Subject: [PATCH 3/6] nvmet: synchronize sqhd update

In testing target io with a read/write mix, we did indeed get into
cases where sqhd didn't update properly and slowly missed enough
updates to shut down the queue.

Protect the sqhd update by using cmpxchg, and for that turn the sqhd
field into a u32 so that cmpxchg works on it for all architectures.

Signed-off-by: James Smart
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/core.c  | 15 ++++++++++++---
 drivers/nvme/target/nvmet.h |  2 +-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 1b208beeef50..645ba7eee35d 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -387,12 +387,21 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 
 static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 {
+	u32 old_sqhd, new_sqhd;
+	u16 sqhd;
+
 	if (status)
 		nvmet_set_status(req, status);
 
-	if (req->sq->size)
-		req->sq->sqhd = (req->sq->sqhd + 1) % req->sq->size;
-	req->rsp->sq_head = cpu_to_le16(req->sq->sqhd);
+	if (req->sq->size) {
+		do {
+			old_sqhd = req->sq->sqhd;
+			new_sqhd = (old_sqhd + 1) % req->sq->size;
+		} while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
+				old_sqhd);
+	}
+	sqhd = req->sq->sqhd & 0x0000FFFF;
+	req->rsp->sq_head = cpu_to_le16(sqhd);
 	req->rsp->sq_id = cpu_to_le16(req->sq->qid);
 	req->rsp->command_id = req->cmd->common.command_id;
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 7b8e20adf760..87e429bfcd8a 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -74,7 +74,7 @@ struct nvmet_sq {
 	struct percpu_ref	ref;
 	u16			qid;
 	u16			size;
-	u16			sqhd;
+	u32			sqhd;
 	struct completion	free_done;
 	struct completion	confirm_done;
 };
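
Illustrative sketch (not part of the patch): the lock-free update pattern
patch 3 adopts, on a hypothetical queue structure rather than the real
nvmet_sq. cmpxchg() only stores the new head if nobody else advanced it in
the meantime; on a race the loop re-reads and retries, so concurrent
completions can no longer lose increments the way the plain
read-modify-write could:

#include <linux/atomic.h>
#include <linux/types.h>

struct toy_queue {
	u32	sqhd;		/* u32 so cmpxchg() is supported on all arches */
	u16	size;
};

static u16 toy_queue_advance_sqhd(struct toy_queue *sq)
{
	u32 old_sqhd, new_sqhd;

	do {
		old_sqhd = sq->sqhd;
		new_sqhd = (old_sqhd + 1) % sq->size;
	} while (cmpxchg(&sq->sqhd, old_sqhd, new_sqhd) != old_sqhd);

	return new_sqhd & 0x0000FFFF;	/* only the low 16 bits go on the wire */
}

The completion path would then place the returned value into the response
with cpu_to_le16(), as __nvmet_req_complete() does.
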
From bd9f07590a17f3158b51fb869dca723f1f606bdc Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Thu, 19 Oct 2017 16:00:30 +0300
Subject: [PATCH 4/6] nvme-rdma: Fix possible double free in reconnect flow

The fact that we free the async event buffer in
nvme_rdma_destroy_admin_queue() can cause us to free it more than once,
because this happens on every reconnect attempt since commit
31fdf1840170. We rely on the queue DELETING state flag to avoid this
for other resources.

A more complete fix is to not destroy the admin/io queues
unconditionally on every reconnect attempt, but it's a bit more
extensive and will go in the next release.

Fixes: 31fdf1840170 ("nvme-rdma: reuse configure/destroy_admin_queue")
Reported-by: Yi Zhang
Reviewed-by: Johannes Thumshirn
Signed-off-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/rdma.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 92a03ff5fb4d..4dbee893a047 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -571,6 +571,12 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
 	if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags))
 		return;
 
+	if (nvme_rdma_queue_idx(queue) == 0) {
+		nvme_rdma_free_qe(queue->device->dev,
+			&queue->ctrl->async_event_sqe,
+			sizeof(struct nvme_command), DMA_TO_DEVICE);
+	}
+
 	nvme_rdma_destroy_queue_ib(queue);
 	rdma_destroy_id(queue->cm_id);
 }
@@ -739,8 +745,6 @@ out:
 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		bool remove)
 {
-	nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe,
-			sizeof(struct nvme_command), DMA_TO_DEVICE);
 	nvme_rdma_stop_queue(&ctrl->queues[0]);
 	if (remove) {
 		blk_cleanup_queue(ctrl->ctrl.admin_q);

From f04b9cc87b5fc466b1b7231ba7b078e885956c5b Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Thu, 19 Oct 2017 18:10:53 +0300
Subject: [PATCH 5/6] nvme-rdma: Fix error status return in tagset allocation failure

We should make sure to escalate allocation failures to prevent a
use-after-free in nvmf_create_ctrl.

Fixes: b28a308ee777 ("nvme-rdma: move tagset allocation to a dedicated routine")
Signed-off-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/rdma.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 4dbee893a047..87bac27ec64b 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -769,8 +769,10 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 	if (new) {
 		ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl,
 				true);
-		if (IS_ERR(ctrl->ctrl.admin_tagset))
+		if (IS_ERR(ctrl->ctrl.admin_tagset)) {
+			error = PTR_ERR(ctrl->ctrl.admin_tagset);
 			goto out_free_queue;
+		}
 
 		ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 		if (IS_ERR(ctrl->ctrl.admin_q)) {
@@ -850,8 +852,10 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 
 	if (new) {
 		ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
-		if (IS_ERR(ctrl->ctrl.tagset))
+		if (IS_ERR(ctrl->ctrl.tagset)) {
+			ret = PTR_ERR(ctrl->ctrl.tagset);
 			goto out_free_io_queues;
+		}
 
 		ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
 		if (IS_ERR(ctrl->ctrl.connect_q)) {
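
Illustrative sketch (not part of the patch): the error-propagation rule
patch 5 enforces, using hypothetical toy_* helpers rather than the nvme-rdma
code. A helper that fails with ERR_PTR() encodes the errno inside the
returned pointer, so the caller must extract it with PTR_ERR() before
jumping to its unwind label; jumping there with the error variable unset
lets a stale (or zero) status escape and report success for a setup that
actually failed, which is how the use-after-free in nvmf_create_ctrl()
arose:

#include <linux/err.h>

struct toy_ctrl {
	void *tagset;					/* hypothetical resources */
	void *queue;
};

int toy_alloc_queue(struct toy_ctrl *ctrl);		/* returns 0 or -errno */
void *toy_alloc_tagset(struct toy_ctrl *ctrl);		/* returns ptr or ERR_PTR() */
void toy_free_queue(struct toy_ctrl *ctrl);

static int toy_configure_admin_queue(struct toy_ctrl *ctrl)
{
	int error;

	error = toy_alloc_queue(ctrl);
	if (error)
		return error;

	ctrl->tagset = toy_alloc_tagset(ctrl);
	if (IS_ERR(ctrl->tagset)) {
		error = PTR_ERR(ctrl->tagset);	/* escalate the real errno */
		goto out_free_queue;
	}
	return 0;

out_free_queue:
	toy_free_queue(ctrl);
	return error;
}
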
From 32e67a3a06b88904155170560b7a63d372b320bd Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 24 Oct 2017 15:57:18 -0400
Subject: [PATCH 6/6] nbd: handle interrupted sendmsg with a sndtimeo set

If you do not set sk_sndtimeo you will get -ERESTARTSYS if there is a
pending signal when you enter sendmsg, which we handle properly.
However, if you set a timeout for your commands we'll set sk_sndtimeo
to that timeout, which means that sendmsg will start returning -EINTR
instead of -ERESTARTSYS. Fix this by checking for either case and
doing the correct thing.

Cc: stable@vger.kernel.org
Fixes: dc88e34d69d8 ("nbd: set sk->sk_sndtimeo for our sockets")
Reported-and-tested-by: Daniel Xu
Signed-off-by: Josef Bacik
Signed-off-by: Jens Axboe
---
 drivers/block/nbd.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index baebbdfd74d5..9adfb5445f8d 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -386,6 +386,15 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
 	return result;
 }
 
+/*
+ * Different settings for sk->sk_sndtimeo can result in different return values
+ * if there is a signal pending when we enter sendmsg, because reasons?
+ */
+static inline int was_interrupted(int result)
+{
+	return result == -ERESTARTSYS || result == -EINTR;
+}
+
 /* always call with the tx_lock held */
 static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 {
@@ -458,7 +467,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 	result = sock_xmit(nbd, index, 1, &from,
 			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
 	if (result <= 0) {
-		if (result == -ERESTARTSYS) {
+		if (was_interrupted(result)) {
 			/* If we havne't sent anything we can just return BUSY,
 			 * however if we have sent something we need to make
 			 * sure we only allow this req to be sent until we are
@@ -502,7 +511,7 @@ send_pages:
 		}
 		result = sock_xmit(nbd, index, 1, &from, flags, &sent);
 		if (result <= 0) {
-			if (result == -ERESTARTSYS) {
+			if (was_interrupted(result)) {
 				/* We've already sent the header, we
 				 * have no choice but to set pending and
 				 * return BUSY.
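
Illustrative sketch (not part of the series): how a caller is meant to use
the was_interrupted()-style check the patch introduces. Names here are
hypothetical and the real nbd path carries extra requeue bookkeeping; the
point is only that a pending signal surfaces as -ERESTARTSYS when
sk_sndtimeo is unset and as -EINTR when a timeout is set, and both mean
"interrupted, retry later" rather than a dead socket:

#include <linux/errno.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/types.h>
#include <linux/uio.h>

static inline bool toy_was_interrupted(int result)
{
	return result == -ERESTARTSYS || result == -EINTR;
}

/* hypothetical transmit helper: distinguish signals from real socket errors */
static int toy_xmit(struct socket *sock, void *buf, size_t len)
{
	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
	struct kvec iov = { .iov_base = buf, .iov_len = len };
	int result;

	result = kernel_sendmsg(sock, &msg, &iov, 1, len);
	if (result <= 0) {
		if (toy_was_interrupted(result))
			return -EAGAIN;			/* signal: requeue and retry */
		return result ? result : -EPIPE;	/* real error or closed peer */
	}
	return result;
}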