From a347c7ad8edf4c5685154f3fdc3c12fc1db800ba Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Sun, 10 Jun 2018 22:38:24 +0200 Subject: [PATCH 01/17] blk-mq: reinit q->tag_set_list entry only after grace period It is not allowed to reinit q->tag_set_list list entry while RCU grace period has not completed yet, otherwise the following soft lockup in blk_mq_sched_restart() happens: [ 1064.252652] watchdog: BUG: soft lockup - CPU#12 stuck for 23s! [fio:9270] [ 1064.254445] task: ffff99b912e8b900 task.stack: ffffa6d54c758000 [ 1064.254613] RIP: 0010:blk_mq_sched_restart+0x96/0x150 [ 1064.256510] Call Trace: [ 1064.256664] [ 1064.256824] blk_mq_free_request+0xea/0x100 [ 1064.256987] msg_io_conf+0x59/0xd0 [ibnbd_client] [ 1064.257175] complete_rdma_req+0xf2/0x230 [ibtrs_client] [ 1064.257340] ? ibtrs_post_recv_empty+0x4d/0x70 [ibtrs_core] [ 1064.257502] ibtrs_clt_rdma_done+0xd1/0x1e0 [ibtrs_client] [ 1064.257669] ib_create_qp+0x321/0x380 [ib_core] [ 1064.257841] ib_process_cq_direct+0xbd/0x120 [ib_core] [ 1064.258007] irq_poll_softirq+0xb7/0xe0 [ 1064.258165] __do_softirq+0x106/0x2a2 [ 1064.258328] irq_exit+0x92/0xa0 [ 1064.258509] do_IRQ+0x4a/0xd0 [ 1064.258660] common_interrupt+0x7a/0x7a [ 1064.258818] Meanwhile another context frees other queue but with the same set of shared tags: [ 1288.201183] INFO: task bash:5910 blocked for more than 180 seconds. [ 1288.201833] bash D 0 5910 5820 0x00000000 [ 1288.202016] Call Trace: [ 1288.202315] schedule+0x32/0x80 [ 1288.202462] schedule_timeout+0x1e5/0x380 [ 1288.203838] wait_for_completion+0xb0/0x120 [ 1288.204137] __wait_rcu_gp+0x125/0x160 [ 1288.204287] synchronize_sched+0x6e/0x80 [ 1288.204770] blk_mq_free_queue+0x74/0xe0 [ 1288.204922] blk_cleanup_queue+0xc7/0x110 [ 1288.205073] ibnbd_clt_unmap_device+0x1bc/0x280 [ibnbd_client] [ 1288.205389] ibnbd_clt_unmap_dev_store+0x169/0x1f0 [ibnbd_client] [ 1288.205548] kernfs_fop_write+0x109/0x180 [ 1288.206328] vfs_write+0xb3/0x1a0 [ 1288.206476] SyS_write+0x52/0xc0 [ 1288.206624] do_syscall_64+0x68/0x1d0 [ 1288.206774] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 What happened is the following: 1. There are several MQ queues with shared tags. 2. One queue is about to be freed and now task is in blk_mq_del_queue_tag_set(). 3. Other CPU is in blk_mq_sched_restart() and loops over all queues in tag list in order to find hctx to restart. Because linked list entry was modified in blk_mq_del_queue_tag_set() without proper waiting for a grace period, blk_mq_sched_restart() never ends, spining in list_for_each_entry_rcu_rr(), thus soft lockup. Fix is simple: reinit list entry after an RCU grace period elapsed. Fixes: Fixes: 705cda97ee3a ("blk-mq: Make it safe to use RCU to iterate over blk_mq_tag_set.tag_list") Cc: stable@vger.kernel.org Cc: Sagi Grimberg Cc: linux-block@vger.kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Signed-off-by: Roman Pen Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index d2de0a719ab8..2be78cc30ec5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2349,7 +2349,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) mutex_lock(&set->tag_list_lock); list_del_rcu(&q->tag_set_list); - INIT_LIST_HEAD(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_SHARED; @@ -2357,8 +2356,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) blk_mq_update_tag_set_depth(set, false); } mutex_unlock(&set->tag_list_lock); - synchronize_rcu(); + INIT_LIST_HEAD(&q->tag_set_list); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, From 16001c10725e11b73b8518f42e414506bf73c291 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 10 Jun 2018 10:31:10 +0000 Subject: [PATCH 02/17] nvme: fix NULL pointer dereference in nvme_init_subsystem When using nvme-pci driver the nvmf_ctrl_options is NULL. There is no need to check for discovery_nqn flag at non-fabrics controller. Fixes: 181303d0 ("nvme-fabrics: allow duplicate connections to the discovery controller") Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e2dcd14012b1..d6ca4598c027 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2208,7 +2208,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) * Verify that the subsystem actually supports multiple * controllers, else bail out. */ - if (!ctrl->opts->discovery_nqn && + if (!(ctrl->opts && ctrl->opts->discovery_nqn) && nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { dev_err(ctrl->device, "ignoring ctrl due to duplicate subnqn (%s).\n", From 2796b569591c6f0681303f148a55e50c2cc8304a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 7 Jun 2018 10:38:47 +0200 Subject: [PATCH 03/17] nvme: add bio remapping tracepoint Adding a tracepoint to trace bio remapping for native nvme multipath. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d7b664ae5923..1ffd3e8b13a1 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -12,6 +12,7 @@ */ #include +#include #include "nvme.h" static bool multipath = true; @@ -111,6 +112,9 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, if (likely(ns)) { bio->bi_disk = ns->disk; bio->bi_opf |= REQ_NVME_MPATH; + trace_block_bio_remap(bio->bi_disk->queue, bio, + disk_devt(ns->head->disk), + bio->bi_iter.bi_sector); ret = direct_make_request(bio); } else if (!list_empty_careful(&head->list)) { dev_warn_ratelimited(dev, "no path available - requeuing I/O\n"); From 94423a8f89ed7b66746cade3351a185fb6a1f38d Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 10 Jun 2018 16:58:29 +0300 Subject: [PATCH 04/17] nvme-rdma: fix error flow during mapping request data After dma mapping the sgl, we map the sgl to nvme sgl descriptor. In case of failure during the last mapping we never dma unmap the sgl. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2aba03876d84..7cd4199db225 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1189,21 +1189,38 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (unlikely(count <= 0)) { - sg_free_table_chained(&req->sg_table, true); - return -EIO; + ret = -EIO; + goto out_free_table; } if (count == 1) { if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && blk_rq_payload_bytes(rq) <= - nvme_rdma_inline_data_size(queue)) - return nvme_rdma_map_sg_inline(queue, req, c); + nvme_rdma_inline_data_size(queue)) { + ret = nvme_rdma_map_sg_inline(queue, req, c); + goto out; + } - if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) - return nvme_rdma_map_sg_single(queue, req, c); + if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { + ret = nvme_rdma_map_sg_single(queue, req, c); + goto out; + } } - return nvme_rdma_map_sg_fr(queue, req, c, count); + ret = nvme_rdma_map_sg_fr(queue, req, c, count); +out: + if (unlikely(ret)) + goto out_unmap_sg; + + return 0; + +out_unmap_sg: + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, + req->nents, rq_data_dir(rq) == + WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); +out_free_table: + sg_free_table_chained(&req->sg_table, true); + return ret; } static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) From c42d7a30aba5da4593f240d70a088e3be5285744 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 11 Jun 2018 03:20:24 -0400 Subject: [PATCH 05/17] nvmet: free smart-log buffer after use Free smart-log buffer allocated in the function after use. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 962532842769..38803576d5e1 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -119,9 +119,11 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) else status = nvmet_get_smart_log_nsid(req, log); if (status) - goto out; + goto out_free_log; status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); +out_free_log: + kfree(log); out: nvmet_req_complete(req, status); } From f493af37abfb3c0ae0f62f628d2b20e9c32561c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jun 2018 13:47:33 +0200 Subject: [PATCH 06/17] nvme: don't rely on the changed namespace list log Don't optimize our namespace rescan based on the changed namespace list log page as userspace might have changed the content through reading it. Suggested-by: Keith Busch Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Hannes Reinecke --- drivers/nvme/host/core.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d6ca4598c027..dee8e71baf62 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3197,40 +3197,28 @@ static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn) nvme_remove_invalid_namespaces(ctrl, nn); } -static bool nvme_scan_changed_ns_log(struct nvme_ctrl *ctrl) +static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) { size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32); __le32 *log; - int error, i; - bool ret = false; + int error; log = kzalloc(log_size, GFP_KERNEL); if (!log) - return false; + return; + /* + * We need to read the log to clear the AEN, but we don't want to rely + * on it for the changed namespace information as userspace could have + * raced with us in reading the log page, which could cause us to miss + * updates. + */ error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); - if (error) { + if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); - goto out_free_log; - } - if (log[0] == cpu_to_le32(0xffffffff)) - goto out_free_log; - - for (i = 0; i < NVME_MAX_CHANGED_NAMESPACES; i++) { - u32 nsid = le32_to_cpu(log[i]); - - if (nsid == 0) - break; - dev_info(ctrl->device, "rescanning namespace %d.\n", nsid); - nvme_validate_ns(ctrl, nsid); - } - ret = true; - -out_free_log: kfree(log); - return ret; } static void nvme_scan_work(struct work_struct *work) @@ -3246,9 +3234,8 @@ static void nvme_scan_work(struct work_struct *work) WARN_ON_ONCE(!ctrl->tagset); if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { - if (nvme_scan_changed_ns_log(ctrl)) - goto out_sort_namespaces; dev_info(ctrl->device, "rescanning namespaces.\n"); + nvme_clear_changed_ns_log(ctrl); } if (nvme_identify_ctrl(ctrl, &id)) @@ -3263,7 +3250,6 @@ static void nvme_scan_work(struct work_struct *work) nvme_scan_ns_sequential(ctrl, nn); out_free_id: kfree(id); -out_sort_namespaces: down_write(&ctrl->namespaces_rwsem); list_sort(NULL, &ctrl->namespaces, ns_cmp); up_write(&ctrl->namespaces_rwsem); From 4c984154efa13175bbb1e2aeb1de9fb2960ca28c Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Jun 2018 14:07:37 -0700 Subject: [PATCH 07/17] nvme-fc: change controllers first connect to use reconnect path Current code follows the framework that has been in the transports from the beginning where initial link-side controller connect occurs as part of "creating the controller". Thus that first connect fully talks to the controller and obtains values that can then be used in for blk-mq setup, etc. It also means that everything about the controller is fully know before the "create controller" call returns. This has several weaknesses: - The initial create_ctrl call made by the cli will block for a long time as wire transactions are performed synchronously. This delay becomes longer if errors occur or connectivity is lost and retries need to be performed. - Code wise, it means there is a separate connect path for initial controller connect vs the (same) steps used in the reconnect path. - And as there's separate paths, it means there's separate error handling and retry logic. It also plays havoc with the NEW state (should transition out of it after successful initial connect) vs the RESETTING and CONNECTING (reconnect) states that want to be transitioned to on error. - As there's separate paths, to recover from errors and disruptions, it requires separate recovery/retry paths as well and can severely convolute the controller state. This patch reworks the fc transport to use the same connect paths for the initial connection as it uses for reconnect. This makes a single path for error recovery and handling. This patch: - Removes the driving of the initial connect and replaces it with a state transition to CONNECTING and initiating the reconnect thread. A dummy state transition of RESETTING had to be traversed as a direct transtion of NEW->CONNECTING is not allowed. Given that the controller is "new", the RESETTING transition is a simple no-op. Once in the reconnecting thread, the normal behaviors of ctrl_loss_tmo (max_retries * connect_delay) and dev_loss_tmo will apply before the controller is torn down. - Only if the state transitions couldn't be traversed and the reconnect thread not scheduled, will the controller be torn down while in create_ctrl. - The prior code used the controller state of NEW to indicate whether request queues had been initialized or not. For the admin queue, the request queue is always created, so there's no need to check a state. For IO queues, change to tracking whether a successful io request queue create has occurred (e.g. 1st successful connect). - The initial controller id is initialized to the dynamic controller id used in the initial connect message. It will be overwritten by the real controller id once the controller is connected on the wire. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 108 +++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 59 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0bad65803271..9d826b726425 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -142,6 +142,7 @@ struct nvme_fc_ctrl { struct nvme_fc_rport *rport; u32 cnum; + bool ioq_live; bool assoc_active; u64 association_id; @@ -2463,6 +2464,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queues; + ctrl->ioq_live = true; + return 0; out_delete_hw_queues: @@ -2615,8 +2618,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queue; - if (ctrl->ctrl.state != NVME_CTRL_NEW) - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (ret) @@ -2689,7 +2691,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) */ if (ctrl->ctrl.queue_count > 1) { - if (ctrl->ctrl.state == NVME_CTRL_NEW) + if (!ctrl->ioq_live) ret = nvme_fc_create_io_queues(ctrl); else ret = nvme_fc_reinit_io_queues(ctrl); @@ -2776,8 +2778,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * use blk_mq_tagset_busy_itr() and the transport routine to * terminate the exchanges. */ - if (ctrl->ctrl.state != NVME_CTRL_NEW) - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2934,7 +2935,7 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) nvme_fc_reconnect_or_delete(ctrl, ret); else dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reconnect complete\n", + "NVME-FC{%d}: controller connect complete\n", ctrl->cnum); } @@ -2982,7 +2983,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, { struct nvme_fc_ctrl *ctrl; unsigned long flags; - int ret, idx, retry; + int ret, idx; if (!(rport->remoteport.port_role & (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) { @@ -3009,11 +3010,13 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, } ctrl->ctrl.opts = opts; + ctrl->ctrl.nr_reconnects = 0; INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; ctrl->dev = lport->dev; ctrl->cnum = idx; + ctrl->ioq_live = false; ctrl->assoc_active = false; init_waitqueue_head(&ctrl->ioabort_wait); @@ -3032,6 +3035,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; + ctrl->ctrl.cntlid = 0xffff; ret = -ENOMEM; ctrl->queues = kcalloc(ctrl->ctrl.queue_count, @@ -3081,69 +3085,55 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); spin_unlock_irqrestore(&rport->lock, flags); - /* - * It's possible that transactions used to create the association - * may fail. Examples: CreateAssociation LS or CreateIOConnection - * LS gets dropped/corrupted/fails; or a frame gets dropped or a - * command times out for one of the actions to init the controller - * (Connect, Get/Set_Property, Set_Features, etc). Many of these - * transport errors (frame drop, LS failure) inherently must kill - * the association. The transport is coded so that any command used - * to create the association (prior to a LIVE state transition - * while NEW or CONNECTING) will fail if it completes in error or - * times out. - * - * As such: as the connect request was mostly likely due to a - * udev event that discovered the remote port, meaning there is - * not an admin or script there to restart if the connect - * request fails, retry the initial connection creation up to - * three times before giving up and declaring failure. - */ - for (retry = 0; retry < 3; retry++) { - ret = nvme_fc_create_association(ctrl); - if (!ret) - break; - } - - if (ret) { - nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); - cancel_work_sync(&ctrl->ctrl.reset_work); - cancel_delayed_work_sync(&ctrl->connect_work); - - /* couldn't schedule retry - fail out */ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) || + !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: Connect retry failed\n", ctrl->cnum); - - ctrl->ctrl.opts = NULL; - - /* initiate nvme ctrl ref counting teardown */ - nvme_uninit_ctrl(&ctrl->ctrl); - - /* Remove core ctrl ref. */ - nvme_put_ctrl(&ctrl->ctrl); - - /* as we're past the point where we transition to the ref - * counting teardown path, if we return a bad pointer here, - * the calling routine, thinking it's prior to the - * transition, will do an rport put. Since the teardown - * path also does a rport put, we do an extra get here to - * so proper order/teardown happens. - */ - nvme_fc_rport_get(rport); - - if (ret > 0) - ret = -EIO; - return ERR_PTR(ret); + "NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum); + goto fail_ctrl; } nvme_get_ctrl(&ctrl->ctrl); + if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { + nvme_put_ctrl(&ctrl->ctrl); + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: failed to schedule initial connect\n", + ctrl->cnum); + goto fail_ctrl; + } + + flush_delayed_work(&ctrl->connect_work); + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", ctrl->cnum, ctrl->ctrl.opts->subsysnqn); return &ctrl->ctrl; +fail_ctrl: + nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); + cancel_work_sync(&ctrl->ctrl.reset_work); + cancel_delayed_work_sync(&ctrl->connect_work); + + ctrl->ctrl.opts = NULL; + + /* initiate nvme ctrl ref counting teardown */ + nvme_uninit_ctrl(&ctrl->ctrl); + + /* Remove core ctrl ref. */ + nvme_put_ctrl(&ctrl->ctrl); + + /* as we're past the point where we transition to the ref + * counting teardown path, if we return a bad pointer here, + * the calling routine, thinking it's prior to the + * transition, will do an rport put. Since the teardown + * path also does a rport put, we do an extra get here to + * so proper order/teardown happens. + */ + nvme_fc_rport_get(rport); + + return ERR_PTR(-EIO); + out_cleanup_admin_q: blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_admin_tag_set: From da661267398869a553b7f67d739d360aaa1361b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Jun 2018 13:58:45 +0200 Subject: [PATCH 08/17] blk-mq: don't time out requests again that are in the timeout handler We can currently call the timeout handler again on a request that has already been handed over to the timeout handler. Prevent that with a new flag. Fixes: 12f5b931 ("blk-mq: Remove generation seqeunce") Reported-by: Andrew Randrianasulu Tested-by: Andrew Randrianasulu Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++++ include/linux/blkdev.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 2be78cc30ec5..8e57b84e50e9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -671,6 +671,7 @@ static void __blk_mq_requeue_request(struct request *rq) if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); + rq->rq_flags &= ~RQF_TIMED_OUT; if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -770,6 +771,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq); static void blk_mq_rq_timed_out(struct request *req, bool reserved) { + req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; @@ -779,6 +781,7 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved) WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } + req->rq_flags &= ~RQF_TIMED_OUT; blk_add_timer(req); } @@ -788,6 +791,8 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) return false; + if (rq->rq_flags & RQF_TIMED_OUT) + return false; deadline = blk_rq_deadline(rq); if (time_after_eq(jiffies, deadline)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bca3a92eb55f..fa6f11751430 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -127,6 +127,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* already slept for hybrid poll */ #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) +/* ->timeout has been called, don't expire again */ +#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ From 587331f71e2748371526597cafc72e5732c67e88 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Jun 2018 14:07:36 -0700 Subject: [PATCH 09/17] nvme-fc: remove reinit_request routine The reinit_request routine is not necessary. Remove support for the op callback. As all that nvme_reinit_tagset() does is itterate and call the reinit routine, it too has no purpose. Remove the call. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9d826b726425..36e72e64f57d 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1471,21 +1471,6 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); -static int -nvme_fc_reinit_request(void *data, struct request *rq) -{ - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; - - memset(cmdiu, 0, sizeof(*cmdiu)); - cmdiu->scsi_id = NVME_CMD_SCSI_ID; - cmdiu->fc_id = NVME_CMD_FC_ID; - cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32)); - memset(&op->rsp_iu, 0, sizeof(op->rsp_iu)); - - return 0; -} - static void __nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) @@ -2505,10 +2490,6 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); - ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); - if (ret) - goto out_free_io_queues; - ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) goto out_free_io_queues; @@ -2918,7 +2899,6 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_delete_ctrl, .get_address = nvmf_get_address, - .reinit_request = nvme_fc_reinit_request, }; static void From 3e493c00cedb457c0731399a835f7ba1c6df172b Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 13 Jun 2018 14:07:38 -0700 Subject: [PATCH 10/17] nvme-fc: fix nulling of queue data on reconnect The reconnect path is calling the init routines to clear a queue structure. But the queue structure has state that perhaps needs to persist as long as the controller is live. Remove the nvme_fc_init_queue() calls on reconnect. The nvme_fc_free_queue() calls will clear state bits and reset any relevant queue state for a new connection. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 36e72e64f57d..318e827e74ec 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1879,6 +1879,7 @@ nvme_fc_free_queue(struct nvme_fc_queue *queue) */ queue->connection_id = 0; + atomic_set(&queue->csn, 1); } static void @@ -2468,7 +2469,7 @@ out_free_tag_set: } static int -nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) +nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; unsigned int nr_io_queues; @@ -2488,8 +2489,6 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) if (ctrl->ctrl.queue_count == 1) return 0; - nvme_fc_init_io_queues(ctrl); - ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) goto out_free_io_queues; @@ -2587,8 +2586,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0); - ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, NVME_AQ_DEPTH); if (ret) @@ -2675,7 +2672,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (!ctrl->ioq_live) ret = nvme_fc_create_io_queues(ctrl); else - ret = nvme_fc_reinit_io_queues(ctrl); + ret = nvme_fc_recreate_io_queues(ctrl); if (ret) goto out_term_aen_ops; } @@ -3023,6 +3020,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, if (!ctrl->queues) goto out_free_ida; + nvme_fc_init_queue(ctrl, 0); + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; From 14dfa400f95b7d7960343165507125a065db84c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Jun 2018 14:25:55 +0200 Subject: [PATCH 11/17] nvme: remove nvme_reinit_tagset Unused now that all transports stopped using it. Signed-off-by: Christoph Hellwig Reviewed-by: Jens Axboe --- drivers/nvme/host/core.c | 10 ---------- drivers/nvme/host/nvme.h | 2 -- 2 files changed, 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dee8e71baf62..020a00f932a0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3627,16 +3627,6 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); -int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) -{ - if (!ctrl->ops->reinit_request) - return 0; - - return blk_mq_tagset_iter(set, set->driver_data, - ctrl->ops->reinit_request); -} -EXPORT_SYMBOL_GPL(nvme_reinit_tagset); - int __init nvme_core_init(void) { int result = -ENOMEM; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 34df07d44f80..231807cbc849 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -321,7 +321,6 @@ struct nvme_ctrl_ops { void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); - int (*reinit_request)(void *data, struct request *rq); void (*stop_ctrl)(struct nvme_ctrl *ctrl); }; @@ -416,7 +415,6 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); -int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, From e6c3456aa897c9799de5423b28550efad14a51b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Jun 2018 14:26:47 +0200 Subject: [PATCH 12/17] blk-mq: remove blk_mq_tagset_iter Unused now that nvme stopped using it. Signed-off-by: Christoph Hellwig Reviewed-by: Jens Axboe --- block/blk-mq-tag.c | 29 ----------------------------- include/linux/blk-mq.h | 2 -- 2 files changed, 31 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 70356a2a11ab..09b2ee6694fb 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -311,35 +311,6 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, - int (fn)(void *, struct request *)) -{ - int i, j, ret = 0; - - if (WARN_ON_ONCE(!fn)) - goto out; - - for (i = 0; i < set->nr_hw_queues; i++) { - struct blk_mq_tags *tags = set->tags[i]; - - if (!tags) - continue; - - for (j = 0; j < tags->nr_tags; j++) { - if (!tags->static_rqs[j]) - continue; - - ret = fn(data, tags->static_rqs[j]); - if (ret) - goto out; - } - } - -out: - return ret; -} -EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); - void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fb355173f3c7..e3147eb74222 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -281,8 +281,6 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); From 3bc32bb1186ccaf3177cbf29caa6cc14dc510b7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Jun 2018 17:34:06 +0200 Subject: [PATCH 13/17] nvme-fabrics: refactor queue ready check Move the is_connected check to the fibre channel transport, as it has no meaning for other transports. To facilitate this split out a new nvmf_fail_nonready_command helper that is called by the transport when it is asked to handle a command on a queue that is not ready. Also avoid a function call for the queue live fast path by inlining the check. Signed-off-by: Christoph Hellwig Reviewed-by: James Smart --- drivers/nvme/host/fabrics.c | 59 +++++++++++++++---------------------- drivers/nvme/host/fabrics.h | 13 ++++++-- drivers/nvme/host/fc.c | 9 +++--- drivers/nvme/host/rdma.c | 7 ++--- drivers/nvme/target/loop.c | 7 ++--- 5 files changed, 45 insertions(+), 50 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index fa32c1216409..6b4e253b9347 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -536,30 +536,32 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( return NULL; } -blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live, bool is_connected) +/* + * For something we're not in a state to send to the device the default action + * is to busy it and retry it after the controller state is recovered. However, + * anything marked for failfast or nvme multipath is immediately failed. + * + * Note: commands used to initialize the controller will be marked for failfast. + * Note: nvme cli/ioctl commands are marked for failfast. + */ +blk_status_t nvmf_fail_nonready_command(struct request *rq) +{ + if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + return BLK_STS_RESOURCE; + nvme_req(rq)->status = NVME_SC_ABORT_REQ; + return BLK_STS_IOERR; +} +EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); + +bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) { struct nvme_command *cmd = nvme_req(rq)->cmd; - if (likely(ctrl->state == NVME_CTRL_LIVE && is_connected)) - return BLK_STS_OK; - switch (ctrl->state) { case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: case NVME_CTRL_DELETING: - /* - * This is the case of starting a new or deleting an association - * but connectivity was lost before it was fully created or torn - * down. We need to error the commands used to initialize the - * controller so the reconnect can go into a retry attempt. The - * commands should all be marked REQ_FAILFAST_DRIVER, which will - * hit the reject path below. Anything else will be queued while - * the state settles. - */ - if (!is_connected) - break; - /* * If queue is live, allow only commands that are internally * generated pass through. These are commands on the admin @@ -567,7 +569,7 @@ blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, * ioctl admin cmds received while initializing. */ if (queue_live && !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) - return BLK_STS_OK; + return true; /* * If the queue is not live, allow only a connect command. This @@ -577,26 +579,13 @@ blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, if (!queue_live && blk_rq_is_passthrough(rq) && cmd->common.opcode == nvme_fabrics_command && cmd->fabrics.fctype == nvme_fabrics_type_connect) - return BLK_STS_OK; - break; + return true; + return false; default: - break; + return false; } - - /* - * Any other new io is something we're not in a state to send to the - * device. Default action is to busy it and retry it after the - * controller state is recovered. However, anything marked for failfast - * or nvme multipath is immediately failed. Note: commands used to - * initialize the controller will be marked for failfast. - * Note: nvme cli/ioctl commands are marked for failfast. - */ - if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) - return BLK_STS_RESOURCE; - nvme_req(rq)->status = NVME_SC_ABORT_REQ; - return BLK_STS_IOERR; } -EXPORT_SYMBOL_GPL(nvmf_check_if_ready); +EXPORT_SYMBOL_GPL(__nvmf_check_ready); static const match_table_t opt_tokens = { { NVMF_OPT_TRANSPORT, "transport=%s" }, diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 7491a0bbf711..2ea949a3868c 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -162,7 +162,16 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); -blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, - struct request *rq, bool queue_live, bool is_connected); +blk_status_t nvmf_fail_nonready_command(struct request *rq); +bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live); + +static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + if (likely(ctrl->state == NVME_CTRL_LIVE)) + return true; + return __nvmf_check_ready(ctrl, rq, queue_live); +} #endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 318e827e74ec..b528a2f5826c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2266,14 +2266,13 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; struct nvme_command *sqe = &cmdiu->sqe; enum nvmefc_fcp_datadir io_dir; + bool queue_ready = test_bit(NVME_FC_Q_LIVE, &queue->flags); u32 data_len; blk_status_t ret; - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, - test_bit(NVME_FC_Q_LIVE, &queue->flags), - ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE); - if (unlikely(ret)) - return ret; + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || + !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvmf_fail_nonready_command(rq); ret = nvme_setup_cmd(ns, rq, sqe); if (ret) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7cd4199db225..c9424da0d23e 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1630,15 +1630,14 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_rdma_qe *sqe = &req->sqe; struct nvme_command *c = sqe->data; struct ib_device *dev; + bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); blk_status_t ret; int err; WARN_ON_ONCE(rq->tag < 0); - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, - test_bit(NVME_RDMA_Q_LIVE, &queue->flags), true); - if (unlikely(ret)) - return ret; + if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvmf_fail_nonready_command(rq); dev = queue->device->dev; ib_dma_sync_single_for_cpu(dev, sqe->dma, diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 1304ec3a7ede..d8d91f04bd7e 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -158,12 +158,11 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_loop_queue *queue = hctx->driver_data; struct request *req = bd->rq; struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + bool queue_ready = test_bit(NVME_LOOP_Q_LIVE, &queue->flags); blk_status_t ret; - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, req, - test_bit(NVME_LOOP_Q_LIVE, &queue->flags), true); - if (unlikely(ret)) - return ret; + if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready)) + return nvmf_fail_nonready_command(req); ret = nvme_setup_cmd(ns, req, &iod->cmd); if (ret) From 278ab3799a2588f97423180947f09ec5b576e79e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Jun 2018 17:37:23 +0200 Subject: [PATCH 14/17] nvme-fabrics: handle the admin-only case properly in nvmf_check_ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the ADMIN_ONLY state we don't have any I/O queues, but we should accept all admin commands without further checks. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: James Smart --- drivers/nvme/host/fabrics.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 2ea949a3868c..e1818a27aa2d 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -169,7 +169,8 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - if (likely(ctrl->state == NVME_CTRL_LIVE)) + if (likely(ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_ADMIN_ONLY)) return true; return __nvmf_check_ready(ctrl, rq, queue_live); } From 35897b920c8ab5e23331ad429e0aa235528c63ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Jun 2018 17:41:11 +0200 Subject: [PATCH 15/17] nvme-fabrics: fix and refine state checks in __nvmf_check_ready - make sure we only allow internally generates commands in any non-live state - only allow connect commands on non-live queues when actually in the new or connecting states - treat all other non-live, non-dead states the same as a default cach-all This fixes a regression where we could not shutdown a controller orderly as we didn't allow the internal generated Property Set command, and also ensures we don't accidentally let a Connect command through in the wrong state. Signed-off-by: Christoph Hellwig Reviewed-by: James Smart --- drivers/nvme/host/fabrics.c | 39 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 6b4e253b9347..903eb4545e26 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -556,34 +556,33 @@ EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - struct nvme_command *cmd = nvme_req(rq)->cmd; + struct nvme_request *req = nvme_req(rq); + /* + * If we are in some state of setup or teardown only allow + * internally generated commands. + */ + if (!blk_rq_is_passthrough(rq) || (req->flags & NVME_REQ_USERCMD)) + return false; + + /* + * Only allow commands on a live queue, except for the connect command, + * which is require to set the queue live in the appropinquate states. + */ switch (ctrl->state) { case NVME_CTRL_NEW: case NVME_CTRL_CONNECTING: - case NVME_CTRL_DELETING: - /* - * If queue is live, allow only commands that are internally - * generated pass through. These are commands on the admin - * queue to initialize the controller. This will reject any - * ioctl admin cmds received while initializing. - */ - if (queue_live && !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) + if (req->cmd->common.opcode == nvme_fabrics_command && + req->cmd->fabrics.fctype == nvme_fabrics_type_connect) return true; - - /* - * If the queue is not live, allow only a connect command. This - * will reject any ioctl admin cmd as well as initialization - * commands if the controller reverted the queue to non-live. - */ - if (!queue_live && blk_rq_is_passthrough(rq) && - cmd->common.opcode == nvme_fabrics_command && - cmd->fabrics.fctype == nvme_fabrics_type_connect) - return true; - return false; + break; default: + break; + case NVME_CTRL_DEAD: return false; } + + return queue_live; } EXPORT_SYMBOL_GPL(__nvmf_check_ready); From be7f99c536c5aeebad29082b7d8dce32077fea14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Jun 2018 13:55:07 +0200 Subject: [PATCH 16/17] block: remov blk_queue_invalidate_tags This function is entirely unused, so remove it and the tag_queue_busy member of struct request_queue. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 15 +-------------- block/blk-tag.c | 22 ---------------------- include/linux/blkdev.h | 2 -- 3 files changed, 1 insertion(+), 38 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 86927029a52d..207eca58efaa 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -752,18 +752,6 @@ completion of the request to the block layer. This means ending tag operations before calling end_that_request_last()! For an example of a user of these helpers, see the IDE tagged command queueing support. -Certain hardware conditions may dictate a need to invalidate the block tag -queue. For instance, on IDE any tagged request error needs to clear both -the hardware and software block queue and enable the driver to sanely restart -all the outstanding requests. There's a third helper to do that: - - blk_queue_invalidate_tags(struct request_queue *q) - - Clear the internal block tag queue and re-add all the pending requests - to the request queue. The driver will receive them again on the - next request_fn run, just like it did the first time it encountered - them. - 3.2.5.2 Tag info Some block functions exist to query current tag status or to go from a @@ -805,8 +793,7 @@ Internally, block manages tags in the blk_queue_tag structure: Most of the above is simple and straight forward, however busy_list may need a bit of explaining. Normally we don't care too much about request ordering, but in the event of any barrier requests in the tag queue we need to ensure -that requests are restarted in the order they were queue. This may happen -if the driver needs to use blk_queue_invalidate_tags(). +that requests are restarted in the order they were queue. 3.3 I/O Submission diff --git a/block/blk-tag.c b/block/blk-tag.c index 09f19c6c52ce..ada0d7cff62b 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -188,7 +188,6 @@ int blk_queue_init_tags(struct request_queue *q, int depth, */ q->queue_tags = tags; queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); - INIT_LIST_HEAD(&q->tag_busy_list); return 0; } EXPORT_SYMBOL(blk_queue_init_tags); @@ -374,27 +373,6 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) rq->tag = tag; bqt->tag_index[tag] = rq; blk_start_request(rq); - list_add(&rq->queuelist, &q->tag_busy_list); return 0; } EXPORT_SYMBOL(blk_queue_start_tag); - -/** - * blk_queue_invalidate_tags - invalidate all pending tags - * @q: the request queue for the device - * - * Description: - * Hardware conditions may dictate a need to stop all pending requests. - * In this case, we will safely clear the block side of the tag queue and - * readd all requests to the request queue in the right order. - **/ -void blk_queue_invalidate_tags(struct request_queue *q) -{ - struct list_head *tmp, *n; - - lockdep_assert_held(q->queue_lock); - - list_for_each_safe(tmp, n, &q->tag_busy_list) - blk_requeue_request(q, list_entry_rq(tmp)); -} -EXPORT_SYMBOL(blk_queue_invalidate_tags); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fa6f11751430..9154570edf29 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -562,7 +562,6 @@ struct request_queue { unsigned int dma_alignment; struct blk_queue_tag *queue_tags; - struct list_head tag_busy_list; unsigned int nr_sorted; unsigned int in_flight[2]; @@ -1375,7 +1374,6 @@ extern void blk_queue_end_tag(struct request_queue *, struct request *); extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int); extern void blk_queue_free_tags(struct request_queue *); extern int blk_queue_resize_tags(struct request_queue *, int); -extern void blk_queue_invalidate_tags(struct request_queue *); extern struct blk_queue_tag *blk_init_tags(int, int); extern void blk_free_tags(struct blk_queue_tag *); From d6c73964f1e2a07f75057fb32ae46f6599036f93 Mon Sep 17 00:00:00 2001 From: Anatoliy Glagolev Date: Wed, 13 Jun 2018 15:38:51 -0600 Subject: [PATCH 17/17] bsg: fix race of bsg_open and bsg_unregister The existing implementation allows races between bsg_unregister and bsg_open paths. bsg_unregister and request_queue cleanup and deletion may start and complete right after bsg_get_device (in bsg_open path) retrieves bsg_class_device and releases the mutex. Then bsg_open path touches freed memory of bsg_class_device and request_queue. One possible fix is to hold the mutex all the way through bsg_get_device instead of releasing it after bsg_class_device retrieval. Reviewed-by: Christoph Hellwig Signed-Off-By: Anatoliy Glagolev Signed-off-by: Jens Axboe --- block/bsg.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/block/bsg.c b/block/bsg.c index 132e657e2d91..66602c489956 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -693,6 +693,8 @@ static struct bsg_device *bsg_add_device(struct inode *inode, struct bsg_device *bd; unsigned char buf[32]; + lockdep_assert_held(&bsg_mutex); + if (!blk_get_queue(rq)) return ERR_PTR(-ENXIO); @@ -707,14 +709,12 @@ static struct bsg_device *bsg_add_device(struct inode *inode, bsg_set_block(bd, file); atomic_set(&bd->ref_count, 1); - mutex_lock(&bsg_mutex); hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); bsg_dbg(bd, "bound to <%s>, max queue %d\n", format_dev_t(buf, inode->i_rdev), bd->max_queue); - mutex_unlock(&bsg_mutex); return bd; } @@ -722,7 +722,7 @@ static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q) { struct bsg_device *bd; - mutex_lock(&bsg_mutex); + lockdep_assert_held(&bsg_mutex); hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) { if (bd->queue == q) { @@ -732,7 +732,6 @@ static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q) } bd = NULL; found: - mutex_unlock(&bsg_mutex); return bd; } @@ -746,17 +745,18 @@ static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file) */ mutex_lock(&bsg_mutex); bcd = idr_find(&bsg_minor_idr, iminor(inode)); - mutex_unlock(&bsg_mutex); - if (!bcd) - return ERR_PTR(-ENODEV); + if (!bcd) { + bd = ERR_PTR(-ENODEV); + goto out_unlock; + } bd = __bsg_get_device(iminor(inode), bcd->queue); - if (bd) - return bd; - - bd = bsg_add_device(inode, bcd->queue, file); + if (!bd) + bd = bsg_add_device(inode, bcd->queue, file); +out_unlock: + mutex_unlock(&bsg_mutex); return bd; }