From 16001c10725e11b73b8518f42e414506bf73c291 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 10 Jun 2018 10:31:10 +0000 Subject: [PATCH 01/13] nvme: fix NULL pointer dereference in nvme_init_subsystem When using nvme-pci driver the nvmf_ctrl_options is NULL. There is no need to check for discovery_nqn flag at non-fabrics controller. Fixes: 181303d0 ("nvme-fabrics: allow duplicate connections to the discovery controller") Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e2dcd14012b1..d6ca4598c027 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2208,7 +2208,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) * Verify that the subsystem actually supports multiple * controllers, else bail out. */ - if (!ctrl->opts->discovery_nqn && + if (!(ctrl->opts && ctrl->opts->discovery_nqn) && nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { dev_err(ctrl->device, "ignoring ctrl due to duplicate subnqn (%s).\n", From 2796b569591c6f0681303f148a55e50c2cc8304a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 7 Jun 2018 10:38:47 +0200 Subject: [PATCH 02/13] nvme: add bio remapping tracepoint Adding a tracepoint to trace bio remapping for native nvme multipath. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d7b664ae5923..1ffd3e8b13a1 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -12,6 +12,7 @@ */ #include +#include #include "nvme.h" static bool multipath = true; @@ -111,6 +112,9 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, if (likely(ns)) { bio->bi_disk = ns->disk; bio->bi_opf |= REQ_NVME_MPATH; + trace_block_bio_remap(bio->bi_disk->queue, bio, + disk_devt(ns->head->disk), + bio->bi_iter.bi_sector); ret = direct_make_request(bio); } else if (!list_empty_careful(&head->list)) { dev_warn_ratelimited(dev, "no path available - requeuing I/O\n"); From 94423a8f89ed7b66746cade3351a185fb6a1f38d Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 10 Jun 2018 16:58:29 +0300 Subject: [PATCH 03/13] nvme-rdma: fix error flow during mapping request data After dma mapping the sgl, we map the sgl to nvme sgl descriptor. In case of failure during the last mapping we never dma unmap the sgl. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2aba03876d84..7cd4199db225 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1189,21 +1189,38 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (unlikely(count <= 0)) { - sg_free_table_chained(&req->sg_table, true); - return -EIO; + ret = -EIO; + goto out_free_table; } if (count == 1) { if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && blk_rq_payload_bytes(rq) <= - nvme_rdma_inline_data_size(queue)) - return nvme_rdma_map_sg_inline(queue, req, c); + nvme_rdma_inline_data_size(queue)) { + ret = nvme_rdma_map_sg_inline(queue, req, c); + goto out; + } - if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) - return nvme_rdma_map_sg_single(queue, req, c); + if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + ret = nvme_rdma_map_sg_single(queue, req, c); + goto out; + } } - return nvme_rdma_map_sg_fr(queue, req, c, count); + ret = nvme_rdma_map_sg_fr(queue, req, c, count); +out: + if (unlikely(ret)) + goto out_unmap_sg; + + return 0; + +out_unmap_sg: + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, + req->nents, rq_data_dir(rq) == + WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); +out_free_table: + sg_free_table_chained(&req->sg_table, true); + return ret; } static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) From c42d7a30aba5da4593f240d70a088e3be5285744 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 11 Jun 2018 03:20:24 -0400 Subject: [PATCH 04/13] nvmet: free smart-log buffer after use Free smart-log buffer allocated in the function after use. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 962532842769..38803576d5e1 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -119,9 +119,11 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) else status = nvmet_get_smart_log_nsid(req, log); if (status) - goto out; + goto out_free_log; status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); +out_free_log: + kfree(log); out: nvmet_req_complete(req, status); } From f493af37abfb3c0ae0f62f628d2b20e9c32561c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jun 2018 13:47:33 +0200 Subject: [PATCH 05/13] nvme: don't rely on the changed namespace list log Don't optimize our namespace rescan based on the changed namespace list log page as userspace might have changed the content through reading it. Suggested-by: Keith Busch Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Hannes Reinecke --- drivers/nvme/host/core.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d6ca4598c027..dee8e71baf62 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3197,40 +3197,28 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) nvme_remove_invalid_namespaces(ctrl, nn); } -static bool nvme_scan_changed_ns_log(struct nvme_ctrl *ctrl) +static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) { size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32); __le32 *log; - int error, i; - bool ret = false; + int error; log = kzalloc(log_size, GFP_KERNEL); if (!log) - return false; + return; + /* + * We need to read the log to clear the AEN, but we don't want to rely + * on it for the changed namespace information as userspace could have + * raced with us in reading the log page, which could cause us to miss + * updates. + */ error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); - if (error) { + if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); - goto out_free_log; - } - if (log[0] == cpu_to_le32(0xffffffff)) - goto out_free_log; - - for (i = 0; i < NVME_MAX_CHANGED_NAMESPACES; i++) { - u32 nsid = le32_to_cpu(log[i]); - - if (nsid == 0) - break; - dev_info(ctrl->device, "rescanning namespace %d.\n", nsid); - nvme_validate_ns(ctrl, nsid); - } - ret = true; - -out_free_log: kfree(log); - return ret; } static void nvme_scan_work(struct work_struct *work) @@ -3246,9 +3234,8 @@ static void nvme_scan_work(struct work_struct *work) WARN_ON_ONCE(!ctrl->tagset); if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { - if (nvme_scan_changed_ns_log(ctrl)) - goto out_sort_namespaces; dev_info(ctrl->device, "rescanning namespaces.\n"); + nvme_clear_changed_ns_log(ctrl); } if (nvme_identify_ctrl(ctrl, &id)) @@ -3263,7 +3250,6 @@ static void nvme_scan_work(struct work_struct *work) nvme_scan_ns_sequential(ctrl, nn); out_free_id: kfree(id); -out_sort_namespaces: down_write(&ctrl->namespaces_rwsem); list_sort(NULL, &ctrl->namespaces, ns_cmp); up_write(&ctrl->namespaces_rwsem); From 4c984154efa13175bbb1e2aeb1de9fb2960ca28c Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Jun 2018 14:07:37 -0700 Subject: [PATCH 06/13] nvme-fc: change controllers first connect to use reconnect path Current code follows the framework that has been in the transports from the beginning where initial link-side controller connect occurs as part of "creating the controller". Thus that first connect fully talks to the controller and obtains values that can then be used in for blk-mq setup, etc. It also means that everything about the controller is fully know before the "create controller" call returns. This has several weaknesses: - The initial create_ctrl call made by the cli will block for a long time as wire transactions are performed synchronously. This delay becomes longer if errors occur or connectivity is lost and retries need to be performed. - Code wise, it means there is a separate connect path for initial controller connect vs the (same) steps used in the reconnect path. - And as there's separate paths, it means there's separate error handling and retry logic. It also plays havoc with the NEW state (should transition out of it after successful initial connect) vs the RESETTING and CONNECTING (reconnect) states that want to be transitioned to on error. - As there's separate paths, to recover from errors and disruptions, it requires separate recovery/retry paths as well and can severely convolute the controller state. This patch reworks the fc transport to use the same connect paths for the initial connection as it uses for reconnect. This makes a single path for error recovery and handling. This patch: - Removes the driving of the initial connect and replaces it with a state transition to CONNECTING and initiating the reconnect thread. A dummy state transition of RESETTING had to be traversed as a direct transtion of NEW->CONNECTING is not allowed. Given that the controller is "new", the RESETTING transition is a simple no-op. Once in the reconnecting thread, the normal behaviors of ctrl_loss_tmo (max_retries * connect_delay) and dev_loss_tmo will apply before the controller is torn down. - Only if the state transitions couldn't be traversed and the reconnect thread not scheduled, will the controller be torn down while in create_ctrl. - The prior code used the controller state of NEW to indicate whether request queues had been initialized or not. For the admin queue, the request queue is always created, so there's no need to check a state. For IO queues, change to tracking whether a successful io request queue create has occurred (e.g. 1st successful connect). - The initial controller id is initialized to the dynamic controller id used in the initial connect message. It will be overwritten by the real controller id once the controller is connected on the wire. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 108 +++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 59 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0bad65803271..9d826b726425 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -142,6 +142,7 @@ struct nvme_fc_ctrl { struct nvme_fc_rport *rport; u32 cnum; + bool ioq_live; bool assoc_active; u64 association_id; @@ -2463,6 +2464,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queues; + ctrl->ioq_live = true; + return 0; out_delete_hw_queues: @@ -2615,8 +2618,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queue; - if (ctrl->ctrl.state != NVME_CTRL_NEW) - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (ret) @@ -2689,7 +2691,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) */ if (ctrl->ctrl.queue_count > 1) { - if (ctrl->ctrl.state == NVME_CTRL_NEW) + if (!ctrl->ioq_live) ret = nvme_fc_create_io_queues(ctrl); else ret = nvme_fc_reinit_io_queues(ctrl); @@ -2776,8 +2778,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * use blk_mq_tagset_busy_itr() and the transport routine to * terminate the exchanges. */ - if (ctrl->ctrl.state != NVME_CTRL_NEW) - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2934,7 +2935,7 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) nvme_fc_reconnect_or_delete(ctrl, ret); else dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reconnect complete\n", + "NVME-FC{%d}: controller connect complete\n", ctrl->cnum); } @@ -2982,7 +2983,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, { struct nvme_fc_ctrl *ctrl; unsigned long flags; - int ret, idx, retry; + int ret, idx; if (!(rport->remoteport.port_role & (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) { @@ -3009,11 +3010,13 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, } ctrl->ctrl.opts = opts; + ctrl->ctrl.nr_reconnects = 0; INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; ctrl->dev = lport->dev; ctrl->cnum = idx; + ctrl->ioq_live = false; ctrl->assoc_active = false; init_waitqueue_head(&ctrl->ioabort_wait); @@ -3032,6 +3035,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; + ctrl->ctrl.cntlid = 0xffff; ret = -ENOMEM; ctrl->queues = kcalloc(ctrl->ctrl.queue_count, @@ -3081,69 +3085,55 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); spin_unlock_irqrestore(&rport->lock, flags); - /* - * It's possible that transactions used to create the association - * may fail. Examples: CreateAssociation LS or CreateIOConnection - * LS gets dropped/corrupted/fails; or a frame gets dropped or a - * command times out for one of the actions to init the controller - * (Connect, Get/Set_Property, Set_Features, etc). Many of these - * transport errors (frame drop, LS failure) inherently must kill - * the association. The transport is coded so that any command used - * to create the association (prior to a LIVE state transition - * while NEW or CONNECTING) will fail if it completes in error or - * times out. - * - * As such: as the connect request was mostly likely due to a - * udev event that discovered the remote port, meaning there is - * not an admin or script there to restart if the connect - * request fails, retry the initial connection creation up to - * three times before giving up and declaring failure. - */ - for (retry = 0; retry < 3; retry++) { - ret = nvme_fc_create_association(ctrl); - if (!ret) - break; - } - - if (ret) { - nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); - cancel_work_sync(&ctrl->ctrl.reset_work); - cancel_delayed_work_sync(&ctrl->connect_work); - - /* couldn't schedule retry - fail out */ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) || + !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: Connect retry failed\n", ctrl->cnum); - - ctrl->ctrl.opts = NULL; - - /* initiate nvme ctrl ref counting teardown */ - nvme_uninit_ctrl(&ctrl->ctrl); - - /* Remove core ctrl ref. */ - nvme_put_ctrl(&ctrl->ctrl); - - /* as we're past the point where we transition to the ref - * counting teardown path, if we return a bad pointer here, - * the calling routine, thinking it's prior to the - * transition, will do an rport put. Since the teardown - * path also does a rport put, we do an extra get here to - * so proper order/teardown happens. - */ - nvme_fc_rport_get(rport); - - if (ret > 0) - ret = -EIO; - return ERR_PTR(ret); + "NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum); + goto fail_ctrl; } nvme_get_ctrl(&ctrl->ctrl); + if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { + nvme_put_ctrl(&ctrl->ctrl); + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: failed to schedule initial connect\n", + ctrl->cnum); + goto fail_ctrl; + } + + flush_delayed_work(&ctrl->connect_work); + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", ctrl->cnum, ctrl->ctrl.opts->subsysnqn); return &ctrl->ctrl; +fail_ctrl: + nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); + cancel_work_sync(&ctrl->ctrl.reset_work); + cancel_delayed_work_sync(&ctrl->connect_work); + + ctrl->ctrl.opts = NULL; + + /* initiate nvme ctrl ref counting teardown */ + nvme_uninit_ctrl(&ctrl->ctrl); + + /* Remove core ctrl ref. */ + nvme_put_ctrl(&ctrl->ctrl); + + /* as we're past the point where we transition to the ref + * counting teardown path, if we return a bad pointer here, + * the calling routine, thinking it's prior to the + * transition, will do an rport put. Since the teardown + * path also does a rport put, we do an extra get here to + * so proper order/teardown happens. + */ + nvme_fc_rport_get(rport); + + return ERR_PTR(-EIO); + out_cleanup_admin_q: blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_admin_tag_set: From 587331f71e2748371526597cafc72e5732c67e88 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Jun 2018 14:07:36 -0700 Subject: [PATCH 07/13] nvme-fc: remove reinit_request routine The reinit_request routine is not necessary. Remove support for the op callback. As all that nvme_reinit_tagset() does is itterate and call the reinit routine, it too has no purpose. Remove the call. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9d826b726425..36e72e64f57d 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1471,21 +1471,6 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); -static int -nvme_fc_reinit_request(void *data, struct request *rq) -{ - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; - - memset(cmdiu, 0, sizeof(*cmdiu)); - cmdiu->scsi_id = NVME_CMD_SCSI_ID; - cmdiu->fc_id = NVME_CMD_FC_ID; - cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32)); - memset(&op->rsp_iu, 0, sizeof(op->rsp_iu)); - - return 0; -} - static void __nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) @@ -2505,10 +2490,6 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); - ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); - if (ret) - goto out_free_io_queues; - ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) goto out_free_io_queues; @@ -2918,7 +2899,6 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_delete_ctrl, .get_address = nvmf_get_address, - .reinit_request = nvme_fc_reinit_request, }; static void From 3e493c00cedb457c0731399a835f7ba1c6df172b Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Jun 2018 14:07:38 -0700 Subject: [PATCH 08/13] nvme-fc: fix nulling of queue data on reconnect The reconnect path is calling the init routines to clear a queue structure. But the queue structure has state that perhaps needs to persist as long as the controller is live. Remove the nvme_fc_init_queue() calls on reconnect. The nvme_fc_free_queue() calls will clear state bits and reset any relevant queue state for a new connection. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 36e72e64f57d..318e827e74ec 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1879,6 +1879,7 @@ nvme_fc_free_queue(struct nvme_fc_queue *queue) */ queue->connection_id = 0; + atomic_set(&queue->csn, 1); } static void @@ -2468,7 +2469,7 @@ out_free_tag_set: } static int -nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) +nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; unsigned int nr_io_queues; @@ -2488,8 +2489,6 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) if (ctrl->ctrl.queue_count == 1) return 0; - nvme_fc_init_io_queues(ctrl); - ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) goto out_free_io_queues; @@ -2587,8 +2586,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0); - ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, NVME_AQ_DEPTH); if (ret) @@ -2675,7 +2672,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (!ctrl->ioq_live) ret = nvme_fc_create_io_queues(ctrl); else - ret = nvme_fc_reinit_io_queues(ctrl); + ret = nvme_fc_recreate_io_queues(ctrl); if (ret) goto out_term_aen_ops; } @@ -3023,6 +3020,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, if (!ctrl->queues) goto out_free_ida; + nvme_fc_init_queue(ctrl, 0); + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; From 14dfa400f95b7d7960343165507125a065db84c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Jun 2018 14:25:55 +0200 Subject: [PATCH 09/13] nvme: remove nvme_reinit_tagset Unused now that all transports stopped using it. Signed-off-by: Christoph Hellwig Reviewed-by: Jens Axboe --- drivers/nvme/host/core.c | 10 ---------- drivers/nvme/host/nvme.h | 2 -- 2 files changed, 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dee8e71baf62..020a00f932a0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3627,16 +3627,6 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); -int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) -{ - if (!ctrl->ops->reinit_request) - return 0; - - return blk_mq_tagset_iter(set, set->driver_data, - ctrl->ops->reinit_request); -} -EXPORT_SYMBOL_GPL(nvme_reinit_tagset); - int __init nvme_core_init(void) { int result = -ENOMEM; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 34df07d44f80..231807cbc849 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -321,7 +321,6 @@ struct nvme_ctrl_ops { void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); - int (*reinit_request)(void *data, struct request *rq); void (*stop_ctrl)(struct nvme_ctrl *ctrl); }; @@ -416,7 +415,6 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); -int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, From e6c3456aa897c9799de5423b28550efad14a51b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Jun 2018 14:26:47 +0200 Subject: [PATCH 10/13] blk-mq: remove blk_mq_tagset_iter Unused now that nvme stopped using it. Signed-off-by: Christoph Hellwig Reviewed-by: Jens Axboe --- block/blk-mq-tag.c | 29 ----------------------------- include/linux/blk-mq.h | 2 -- 2 files changed, 31 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 70356a2a11ab..09b2ee6694fb 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -311,35 +311,6 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, - int (fn)(void *, struct request *)) -{ - int i, j, ret = 0; - - if (WARN_ON_ONCE(!fn)) - goto out; - - for (i = 0; i < set->nr_hw_queues; i++) { - struct blk_mq_tags *tags = set->tags[i]; - - if (!tags) - continue; - - for (j = 0; j < tags->nr_tags; j++) { - if (!tags->static_rqs[j]) - continue; - - ret = fn(data, tags->static_rqs[j]); - if (ret) - goto out; - } - } - -out: - return ret; -} -EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); - void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fb355173f3c7..e3147eb74222 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -281,8 +281,6 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); From 3bc32bb1186ccaf3177cbf29caa6cc14dc510b7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Jun 2018 17:34:06 +0200 Subject: [PATCH 11/13] nvme-fabrics: refactor queue ready check Move the is_connected check to the fibre channel transport, as it has no meaning for other transports. To facilitate this split out a new nvmf_fail_nonready_command helper that is called by the transport when it is asked to handle a command on a queue that is not ready. Also avoid a function call for the queue live fast path by inlining the check. Signed-off-by: Christoph Hellwig Reviewed-by: James Smart --- drivers/nvme/host/fabrics.c | 59 +++++++++++++++---------------------- drivers/nvme/host/fabrics.h | 13 ++++++-- drivers/nvme/host/fc.c | 9 +++--- drivers/nvme/host/rdma.c | 7 ++--- drivers/nvme/target/loop.c | 7 ++--- 5 files changed, 45 insertions(+), 50 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index fa32c1216409..6b4e253b9347 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -536,30 +536,32 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( return NULL; } -blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live, bool is_connected) +/* + * For something we're not in a state to send to the device the default action + * is to busy it and retry it after the controller state is recovered. However, + * anything marked for failfast or nvme multipath is immediately failed. + * + * Note: commands used to initialize the controller will be marked for failfast. + * Note: nvme cli/ioctl commands are marked for failfast. + */ +blk_status_t nvmf_fail_nonready_command(struct request *rq) +{ + if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + return BLK_STS_RESOURCE; + nvme_req(rq)->status = NVME_SC_ABORT_REQ; + return BLK_STS_IOERR; +} +EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); + +bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) { struct nvme_command *cmd = nvme_req(rq)->cmd; - if (likely(ctrl->state == NVME_CTRL_LIVE && is_connected)) - return BLK_STS_OK; - switch (ctrl->state) { case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: case NVME_CTRL_DELETING: - /* - * This is the case of starting a new or deleting an association - * but connectivity was lost before it was fully created or torn - * down. We need to error the commands used to initialize the - * controller so the reconnect can go into a retry attempt. The - * commands should all be marked REQ_FAILFAST_DRIVER, which will - * hit the reject path below. Anything else will be queued while - * the state settles. - */ - if (!is_connected) - break; - /* * If queue is live, allow only commands that are internally * generated pass through. These are commands on the admin @@ -567,7 +569,7 @@ blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, * ioctl admin cmds received while initializing. */ if (queue_live && !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) - return BLK_STS_OK; + return true; /* * If the queue is not live, allow only a connect command. This @@ -577,26 +579,13 @@ blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, if (!queue_live && blk_rq_is_passthrough(rq) && cmd->common.opcode == nvme_fabrics_command && cmd->fabrics.fctype == nvme_fabrics_type_connect) - return BLK_STS_OK; - break; + return true; + return false; default: - break; + return false; } - - /* - * Any other new io is something we're not in a state to send to the - * device. Default action is to busy it and retry it after the - * controller state is recovered. However, anything marked for failfast - * or nvme multipath is immediately failed. Note: commands used to - * initialize the controller will be marked for failfast. - * Note: nvme cli/ioctl commands are marked for failfast. - */ - if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) - return BLK_STS_RESOURCE; - nvme_req(rq)->status = NVME_SC_ABORT_REQ; - return BLK_STS_IOERR; } -EXPORT_SYMBOL_GPL(nvmf_check_if_ready); +EXPORT_SYMBOL_GPL(__nvmf_check_ready); static const match_table_t opt_tokens = { { NVMF_OPT_TRANSPORT, "transport=%s" }, diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 7491a0bbf711..2ea949a3868c 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -162,7 +162,16 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); -blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, - struct request *rq, bool queue_live, bool is_connected); +blk_status_t nvmf_fail_nonready_command(struct request *rq); +bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live); + +static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + if (likely(ctrl->state == NVME_CTRL_LIVE)) + return true; + return __nvmf_check_ready(ctrl, rq, queue_live); +} #endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 318e827e74ec..b528a2f5826c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2266,14 +2266,13 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; struct nvme_command *sqe = &cmdiu->sqe; enum nvmefc_fcp_datadir io_dir; + bool queue_ready = test_bit(NVME_FC_Q_LIVE, &queue->flags); u32 data_len; blk_status_t ret; - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, - test_bit(NVME_FC_Q_LIVE, &queue->flags), - ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE); - if (unlikely(ret)) - return ret; + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || + !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvmf_fail_nonready_command(rq); ret = nvme_setup_cmd(ns, rq, sqe); if (ret) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7cd4199db225..c9424da0d23e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1630,15 +1630,14 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_rdma_qe *sqe = &req->sqe; struct nvme_command *c = sqe->data; struct ib_device *dev; + bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); blk_status_t ret; int err; WARN_ON_ONCE(rq->tag < 0); - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, - test_bit(NVME_RDMA_Q_LIVE, &queue->flags), true); - if (unlikely(ret)) - return ret; + if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvmf_fail_nonready_command(rq); dev = queue->device->dev; ib_dma_sync_single_for_cpu(dev, sqe->dma, diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 1304ec3a7ede..d8d91f04bd7e 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -158,12 +158,11 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_loop_queue *queue = hctx->driver_data; struct request *req = bd->rq; struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + bool queue_ready = test_bit(NVME_LOOP_Q_LIVE, &queue->flags); blk_status_t ret; - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, req, - test_bit(NVME_LOOP_Q_LIVE, &queue->flags), true); - if (unlikely(ret)) - return ret; + if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready)) + return nvmf_fail_nonready_command(req); ret = nvme_setup_cmd(ns, req, &iod->cmd); if (ret) From 278ab3799a2588f97423180947f09ec5b576e79e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Jun 2018 17:37:23 +0200 Subject: [PATCH 12/13] nvme-fabrics: handle the admin-only case properly in nvmf_check_ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the ADMIN_ONLY state we don't have any I/O queues, but we should accept all admin commands without further checks. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: James Smart --- drivers/nvme/host/fabrics.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 2ea949a3868c..e1818a27aa2d 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -169,7 +169,8 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - if (likely(ctrl->state == NVME_CTRL_LIVE)) + if (likely(ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_ADMIN_ONLY)) return true; return __nvmf_check_ready(ctrl, rq, queue_live); } From 35897b920c8ab5e23331ad429e0aa235528c63ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Jun 2018 17:41:11 +0200 Subject: [PATCH 13/13] nvme-fabrics: fix and refine state checks in __nvmf_check_ready - make sure we only allow internally generates commands in any non-live state - only allow connect commands on non-live queues when actually in the new or connecting states - treat all other non-live, non-dead states the same as a default cach-all This fixes a regression where we could not shutdown a controller orderly as we didn't allow the internal generated Property Set command, and also ensures we don't accidentally let a Connect command through in the wrong state. Signed-off-by: Christoph Hellwig Reviewed-by: James Smart --- drivers/nvme/host/fabrics.c | 39 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 6b4e253b9347..903eb4545e26 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -556,34 +556,33 @@ EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - struct nvme_command *cmd = nvme_req(rq)->cmd; + struct nvme_request *req = nvme_req(rq); + /* + * If we are in some state of setup or teardown only allow + * internally generated commands. + */ + if (!blk_rq_is_passthrough(rq) || (req->flags & NVME_REQ_USERCMD)) + return false; + + /* + * Only allow commands on a live queue, except for the connect command, + * which is require to set the queue live in the appropinquate states. + */ switch (ctrl->state) { case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: - case NVME_CTRL_DELETING: - /* - * If queue is live, allow only commands that are internally - * generated pass through. These are commands on the admin - * queue to initialize the controller. This will reject any - * ioctl admin cmds received while initializing. - */ - if (queue_live && !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) + if (req->cmd->common.opcode == nvme_fabrics_command && + req->cmd->fabrics.fctype == nvme_fabrics_type_connect) return true; - - /* - * If the queue is not live, allow only a connect command. This - * will reject any ioctl admin cmd as well as initialization - * commands if the controller reverted the queue to non-live. - */ - if (!queue_live && blk_rq_is_passthrough(rq) && - cmd->common.opcode == nvme_fabrics_command && - cmd->fabrics.fctype == nvme_fabrics_type_connect) - return true; - return false; + break; default: + break; + case NVME_CTRL_DEAD: return false; } + + return queue_live; } EXPORT_SYMBOL_GPL(__nvmf_check_ready);