From bbf5410bc69e131c82ad970ce7ee28b5906a6cc5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 20 Oct 2022 17:35:40 +0200 Subject: [PATCH 01/45] nvmet: use try_cmpxchg in nvmet_update_sq_head Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in nvmet_update_sq_head. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails. There is no need to re-read the value in the loop. Note that the value from *ptr should be read using READ_ONCE to prevent the compiler from merging, refetching or reordering the read. No functional change intended. Signed-off-by: Uros Bizjak Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index aecb5853f8da..560b6bbb6c44 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -695,11 +695,10 @@ static void nvmet_update_sq_head(struct nvmet_req *req) if (req->sq->size) { u32 old_sqhd, new_sqhd; + old_sqhd = READ_ONCE(req->sq->sqhd); do { - old_sqhd = req->sq->sqhd; new_sqhd = (old_sqhd + 1) % req->sq->size; - } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != - old_sqhd); + } while (!try_cmpxchg(&req->sq->sqhd, &old_sqhd, new_sqhd)); } req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF); } From 2be2cd5287152a6284b45244b6e5c2f7e0a218bd Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 25 Oct 2022 17:50:08 +0200 Subject: [PATCH 02/45] nvmet: force reconnect when number of queue changes In order to test queue number changes we need to make sure that the host reconnects. Because only when the host disconnects from the target the number of queues are allowed to change according the spec. 
The initial idea was to disable and re-enable the ports and have the host wait until the KATO timer expires, triggering error recovery. Though the host would see a DNR reply when trying to reconnect. Because of the DNR bit the connection is dropped completely. There is no point in trying to reconnect with the same parameters according to the spec. We can force the host to reconnect by deleting all controllers. The host will observe any newly posted request to fail and thus starts the error recovery but this time without the DNR bit set. Signed-off-by: Daniel Wagner Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Acked-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 9443ee1d4ae3..051a420d818e 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1290,6 +1290,8 @@ static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item, static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, const char *page, size_t cnt) { + struct nvmet_subsys *subsys = to_subsys(item); + struct nvmet_ctrl *ctrl; u16 qid_max; if (sscanf(page, "%hu\n", &qid_max) != 1) @@ -1299,8 +1301,13 @@ static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, return -EINVAL; down_write(&nvmet_config_sem); - to_subsys(item)->max_qid = qid_max; + subsys->max_qid = qid_max; + + /* Force reconnect */ + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + ctrl->ops->delete_ctrl(ctrl); up_write(&nvmet_config_sem); + return cnt; } CONFIGFS_ATTR(nvmet_subsys_, attr_qid_max); From fa8f9ac42350edd3ce82d0d148a60f0fa088f995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Nov 2022 14:01:24 +0100 Subject: [PATCH 03/45] nvmet: only allocate a single slab for bvecs There is no need to have a separate slab cache for each namespace, and having 
separate ones creates duplicate debugfs file names as well. Fixes: d5eff33ee6f8 ("nvmet: add simple file backed ns support") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen --- drivers/nvme/target/core.c | 22 ++++++++++++++-------- drivers/nvme/target/io-cmd-file.c | 16 +++------------- drivers/nvme/target/nvmet.h | 3 ++- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 560b6bbb6c44..171493bcf0fb 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -15,6 +15,7 @@ #include "nvmet.h" +struct kmem_cache *nvmet_bvec_cache; struct workqueue_struct *buffered_io_wq; struct workqueue_struct *zbd_wq; static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; @@ -1630,26 +1631,28 @@ void nvmet_subsys_put(struct nvmet_subsys *subsys) static int __init nvmet_init(void) { - int error; + int error = -ENOMEM; nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1; + nvmet_bvec_cache = kmem_cache_create("nvmet-bvec", + NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!nvmet_bvec_cache) + return -ENOMEM; + zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0); if (!zbd_wq) - return -ENOMEM; + goto out_destroy_bvec_cache; buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", WQ_MEM_RECLAIM, 0); - if (!buffered_io_wq) { - error = -ENOMEM; + if (!buffered_io_wq) goto out_free_zbd_work_queue; - } nvmet_wq = alloc_workqueue("nvmet-wq", WQ_MEM_RECLAIM, 0); - if (!nvmet_wq) { - error = -ENOMEM; + if (!nvmet_wq) goto out_free_buffered_work_queue; - } error = nvmet_init_discovery(); if (error) @@ -1668,6 +1671,8 @@ out_free_buffered_work_queue: destroy_workqueue(buffered_io_wq); out_free_zbd_work_queue: destroy_workqueue(zbd_wq); +out_destroy_bvec_cache: + kmem_cache_destroy(nvmet_bvec_cache); return error; } @@ -1679,6 +1684,7 @@ static 
void __exit nvmet_exit(void) destroy_workqueue(nvmet_wq); destroy_workqueue(buffered_io_wq); destroy_workqueue(zbd_wq); + kmem_cache_destroy(nvmet_bvec_cache); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 64b47e2a4633..e55ec6fefd7f 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -11,7 +11,6 @@ #include #include "nvmet.h" -#define NVMET_MAX_MPOOL_BVEC 16 #define NVMET_MIN_MPOOL_OBJ 16 void nvmet_file_ns_revalidate(struct nvmet_ns *ns) @@ -26,8 +25,6 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) flush_workqueue(buffered_io_wq); mempool_destroy(ns->bvec_pool); ns->bvec_pool = NULL; - kmem_cache_destroy(ns->bvec_cache); - ns->bvec_cache = NULL; fput(ns->file); ns->file = NULL; } @@ -59,16 +56,8 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns) ns->blksize_shift = min_t(u8, file_inode(ns->file)->i_blkbits, 12); - ns->bvec_cache = kmem_cache_create("nvmet-bvec", - NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ns->bvec_cache) { - ret = -ENOMEM; - goto err; - } - ns->bvec_pool = mempool_create(NVMET_MIN_MPOOL_OBJ, mempool_alloc_slab, - mempool_free_slab, ns->bvec_cache); + mempool_free_slab, nvmet_bvec_cache); if (!ns->bvec_pool) { ret = -ENOMEM; @@ -77,9 +66,10 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns) return ret; err: + fput(ns->file); + ns->file = NULL; ns->size = 0; ns->blksize_shift = 0; - nvmet_file_ns_disable(ns); return ret; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index dfe3894205aa..bda1c1f71f39 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -77,7 +77,6 @@ struct nvmet_ns { struct completion disable_done; mempool_t *bvec_pool; - struct kmem_cache *bvec_cache; int use_p2pmem; struct pci_dev *p2p_dev; @@ -393,6 +392,8 @@ struct nvmet_req { u64 
error_slba; }; +#define NVMET_MAX_MPOOL_BVEC 16 +extern struct kmem_cache *nvmet_bvec_cache; extern struct workqueue_struct *buffered_io_wq; extern struct workqueue_struct *zbd_wq; extern struct workqueue_struct *nvmet_wq; From cf3d00840170ebf372bcacc5d5c27f5ed9c1b976 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 2 Oct 2022 11:59:45 +0200 Subject: [PATCH 04/45] nvme-fc: improve memory usage in nvme_fc_rcv_ls_req() sizeof( struct nvmefc_ls_rcv_op ) = 64 sizeof( union nvmefc_ls_requests ) = 1024 sizeof( union nvmefc_ls_responses ) = 128 So, in nvme_fc_rcv_ls_req(), 1216 bytes of memory are requested when kzalloc() is called. Because of the way memory allocations are performed, 2048 bytes are allocated. So about 800 bytes are wasted for each request. Switch to 3 distinct memory allocations, in order to: - save these 800 bytes - avoid zeroing this extra memory - make sure that memory is properly aligned in case of DMA access ("fc_dma_map_single(lsop->rspbuf)" just a few lines below) Signed-off-by: Christophe JAILLET Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5d57a042dbca..2d3c54838496 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1475,6 +1475,8 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) fc_dma_unmap_single(lport->dev, lsop->rspdma, sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + kfree(lsop->rspbuf); + kfree(lsop->rqstbuf); kfree(lsop); nvme_fc_rport_put(rport); @@ -1751,20 +1753,17 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, goto out_put; } - lsop = kzalloc(sizeof(*lsop) + - sizeof(union nvmefc_ls_requests) + - sizeof(union nvmefc_ls_responses), - GFP_KERNEL); - if (!lsop) { + lsop = kzalloc(sizeof(*lsop), GFP_KERNEL); + lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL); + lsop->rspbuf = kzalloc(sizeof(*lsop->rspbuf), 
GFP_KERNEL); + if (!lsop || !lsop->rqstbuf || !lsop->rspbuf) { dev_info(lport->dev, "RCV %s LS failed: No memory\n", (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? nvmefc_ls_names[w0->ls_cmd] : ""); ret = -ENOMEM; - goto out_put; + goto out_free; } - lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1]; - lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1]; lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf, sizeof(*lsop->rspbuf), @@ -1801,6 +1800,8 @@ out_unmap: fc_dma_unmap_single(lport->dev, lsop->rspdma, sizeof(*lsop->rspbuf), DMA_TO_DEVICE); out_free: + kfree(lsop->rspbuf); + kfree(lsop->rqstbuf); kfree(lsop); out_put: nvme_fc_rport_put(rport); From 855b7717f44b13e0990aa5ad36bbf9aa35051516 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Mon, 31 Oct 2022 21:53:50 +0530 Subject: [PATCH 05/45] nvme: fine-granular CAP_SYS_ADMIN for nvme io commands Currently both io and admin commands are kept under a coarse-granular CAP_SYS_ADMIN check, disregarding file mode completely. $ ls -l /dev/ng* crw-rw-rw- 1 root root 242, 0 Sep 9 19:20 /dev/ng0n1 crw------- 1 root root 242, 1 Sep 9 19:20 /dev/ng0n2 In the example above, ng0n1 appears as if it may allow unprivileged read/write operation but it does not and behaves same as ng0n2. This patch implements a shift from CAP_SYS_ADMIN to more fine-granular control for io-commands. If CAP_SYS_ADMIN is present, nothing else is checked as before. Otherwise, following rules are in place - any admin-cmd is not allowed - vendor-specific and fabric commands are not allowed - io-commands that can write are allowed if matching FMODE_WRITE permission is present - io-commands that read are allowed Add a helper nvme_cmd_allowed that implements above policy. Change all the callers of CAP_SYS_ADMIN to go through nvme_cmd_allowed for any decision making. Since file open mode is counted for any approval/denial, change at various places to keep file-mode information handy. 
Signed-off-by: Kanchan Joshi Reviewed-by: Jens Axboe Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/ioctl.c | 102 ++++++++++++++++++++++++++------------ include/linux/nvme.h | 1 + 2 files changed, 70 insertions(+), 33 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 81f5550b670d..1d68f161064a 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -8,6 +8,34 @@ #include #include "nvme.h" +static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, + fmode_t mode) +{ + if (capable(CAP_SYS_ADMIN)) + return true; + + /* + * Do not allow unprivileged processes to send vendor specific or fabrics + * commands as we can't be sure about their effects. + */ + if (c->common.opcode >= nvme_cmd_vendor_start || + c->common.opcode == nvme_fabrics_command) + return false; + + /* do not allow unprivileged admin commands */ + if (!ns) + return false; + + /* + * Only allow I/O commands that transfer data to the controller if the + * special file is open for writing, but always allow I/O commands that + * transfer data from the controller. 
+ */ + if (nvme_is_write(c)) + return mode & FMODE_WRITE; + return true; +} + /* * Convert integer values from ioctl structures to user pointers, silently * ignoring the upper bits in the compat case to match behaviour of 32-bit @@ -261,7 +289,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) + struct nvme_passthru_cmd __user *ucmd, fmode_t mode) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -269,8 +297,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u64 result; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -291,6 +317,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, mode)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -308,15 +337,14 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd, bool vec) + struct nvme_passthru_cmd64 __user *ucmd, bool vec, + fmode_t mode) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; unsigned timeout = 0; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -337,6 +365,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, mode)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -483,9 +514,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, void *meta = NULL; int ret; - if 
(!capable(CAP_SYS_ADMIN)) - return -EACCES; - c.common.opcode = READ_ONCE(cmd->opcode); c.common.flags = READ_ONCE(cmd->flags); if (c.common.flags) @@ -507,6 +535,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); + if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode)) + return -EACCES; + d.metadata = READ_ONCE(cmd->metadata); d.addr = READ_ONCE(cmd->addr); d.data_len = READ_ONCE(cmd->data_len); @@ -570,13 +601,13 @@ static bool is_ctrl_ioctl(unsigned int cmd) } static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, - void __user *argp) + void __user *argp, fmode_t mode) { switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false); + return nvme_user_cmd64(ctrl, NULL, argp, false, mode); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -601,14 +632,14 @@ struct nvme_user_io32 { #endif /* COMPAT_FOR_U64_ALIGNMENT */ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) + void __user *argp, fmode_t mode) { switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); return ns->head->ns_id; case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, argp); + return nvme_user_cmd(ns->ctrl, ns, argp, mode); /* * struct nvme_user_io can have different padding on some 32-bit ABIs. 
* Just accept the compat version as all fields that are used are the @@ -620,19 +651,20 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, argp); case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp, false); + return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode); case NVME_IOCTL_IO64_CMD_VEC: - return nvme_user_cmd64(ns->ctrl, ns, argp, true); + return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode); default: return -ENOTTY; } } -static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg) +static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg, + fmode_t mode) { - if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, arg); - return nvme_ns_ioctl(ns, cmd, arg); + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode); + return nvme_ns_ioctl(ns, cmd, arg, mode); } int nvme_ioctl(struct block_device *bdev, fmode_t mode, @@ -640,7 +672,7 @@ int nvme_ioctl(struct block_device *bdev, fmode_t mode, { struct nvme_ns *ns = bdev->bd_disk->private_data; - return __nvme_ioctl(ns, cmd, (void __user *)arg); + return __nvme_ioctl(ns, cmd, (void __user *)arg, mode); } long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -648,7 +680,7 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct nvme_ns *ns = container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); - return __nvme_ioctl(ns, cmd, (void __user *)arg); + return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode); } static int nvme_uring_cmd_checks(unsigned int issue_flags) @@ -716,7 +748,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, } #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, struct nvme_ns_head *head, int srcu_idx) + void __user *argp, struct nvme_ns_head *head, int srcu_idx, + fmode_t mode) 
__releases(&head->srcu) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -724,7 +757,7 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, nvme_get_ctrl(ns->ctrl); srcu_read_unlock(&head->srcu, srcu_idx); - ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode); nvme_put_ctrl(ctrl); return ret; @@ -749,9 +782,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, * deadlock when deleting namespaces using the passthrough interface. */ if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + mode); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -773,9 +807,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, goto out_unlock; if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + file->f_mode); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -849,7 +884,8 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) return ret; } -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, + fmode_t mode) { struct nvme_ns *ns; int ret; @@ -873,7 +909,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) kref_get(&ns->kref); up_read(&ctrl->namespaces_rwsem); - ret = nvme_user_cmd(ctrl, ns, argp); + ret = nvme_user_cmd(ctrl, ns, argp, mode); nvme_put_ns(ns); return ret; @@ -890,11 +926,11 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return 
nvme_user_cmd(ctrl, NULL, argp, file->f_mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false); + return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode); case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp); + return nvme_dev_user_cmd(ctrl, argp, file->f_mode); case NVME_IOCTL_RESET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 050d7d0cd81b..1d102b662e88 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -797,6 +797,7 @@ enum nvme_opcode { nvme_cmd_zone_mgmt_send = 0x79, nvme_cmd_zone_mgmt_recv = 0x7a, nvme_cmd_zone_append = 0x7d, + nvme_cmd_vendor_start = 0x80, }; #define nvme_opcode_name(opcode) { opcode, #opcode } From e4fbcf32c860f98103ca7f1dc8c0dc69e2219ec6 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 1 Nov 2022 10:43:07 +0100 Subject: [PATCH 06/45] nvme: identify-namespace without CAP_SYS_ADMIN Allow all identify-namespace variants (CNS 00h, 05h and 08h) without requiring CAP_SYS_ADMIN. The information (retrieved using id-ns) is needed to form IO commands for passthrough interface. 
Signed-off-by: Kanchan Joshi Reviewed-by: Jens Axboe Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/ioctl.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 1d68f161064a..9550a69029b3 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -22,9 +22,23 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, c->common.opcode == nvme_fabrics_command) return false; - /* do not allow unprivileged admin commands */ - if (!ns) + /* + * Do not allow unprivileged passthrough of admin commands except + * for a subset of identify commands that contain information required + * to form proper I/O commands in userspace and do not expose any + * potentially sensitive information. + */ + if (!ns) { + if (c->common.opcode == nvme_admin_identify) { + switch (c->identify.cns) { + case NVME_ID_CNS_NS: + case NVME_ID_CNS_CS_NS: + case NVME_ID_CNS_NS_CS_INDEP: + return true; + } + } return false; + } /* * Only allow I/O commands that transfer data to the controller if the From 1b96f862ecccb3e6f950eba584bebf22955cecc5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 16:50:15 +0100 Subject: [PATCH 07/45] nvme: implement the DEAC bit for the Write Zeroes command While the specification allows devices to either deallocate data or to actually write zeroes on any Write Zeroes command, many SSDs only do the sensible thing and deallocate data when the DEAC bit is specified. Set it when it is supported and the caller doesn't explicitly opt out of deallocation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. 
Petersen --- drivers/nvme/host/core.c | 13 ++++++++++++- drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f94b05c585cb..1a87a072fbed 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -850,8 +850,11 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, cmnd->write_zeroes.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC)) + cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC); + if (nvme_ns_has_pi(ns)) { - cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT); + cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT); switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE1: @@ -2003,6 +2006,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, } } + /* + * Only set the DEAC bit if the device guarantees that reads from + * deallocated data return zeroes. While the DEAC bit does not + * require that, it must be a no-op if reads from deallocated data + * do not return zeroes. 
+ */ + if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) + ns->features |= NVME_NS_DEAC; set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f9df10653f3c..16b34a491495 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -455,6 +455,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) enum nvme_ns_features { NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ + NVME_NS_DEAC, /* DEAC bit in Write Zeores supported */ }; struct nvme_ns { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1d102b662e88..d6be2a686100 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -964,6 +964,7 @@ enum { NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, NVME_RW_DTYPE_STREAMS = 1 << 4, + NVME_WZ_DEAC = 1 << 9, }; struct nvme_dsm_cmd { From 1e37a307f1481058da852accb37e0e1a3e137e9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 15:46:45 +0100 Subject: [PATCH 08/45] nvme: don't call nvme_init_ctrl_finish from nvme_passthru_end nvme_passthrough_end can race with a reset, which can lead to racing stores to the cels xarray as well as further shenanigans with upcoming more complicated initialization. So drop the call and just log that the controller capabilities might have changed and a reset could be required to use the new controller capabilities. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1a87a072fbed..ce8314aee1dd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1123,8 +1123,10 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, mutex_unlock(&ctrl->subsys->lock); mutex_unlock(&ctrl->scan_lock); } - if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_ctrl_finish(ctrl); + if (effects & NVME_CMD_EFFECTS_CCC) { + dev_info(ctrl->device, +"controller capabilities changed, reset may be required to take effect.\n"); + } if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); flush_work(&ctrl->scan_work); From 94cc781f69f49f665383dd87aef973b7896153d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 15:48:27 +0100 Subject: [PATCH 09/45] nvme: move OPAL setup from PCIe to core Nothing about the TCG Opal support is PCIe transport specific, so move it to the core code. For this nvme_init_ctrl_finish grows a new was_suspended argument that allows the transport driver to tell the OPAL code if the controller came out of a suspend cycle. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: James Smart Tested-by Gerd Bayer --- drivers/nvme/host/apple.c | 2 +- drivers/nvme/host/core.c | 25 ++++++++++++++++++++++--- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 5 +---- drivers/nvme/host/pci.c | 14 +------------- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/nvme/target/loop.c | 2 +- 8 files changed, 29 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 24e224c279a4..a85349a7e938 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1102,7 +1102,7 @@ static void apple_nvme_reset_work(struct work_struct *work) goto out; } - ret = nvme_init_ctrl_finish(&anv->ctrl); + ret = nvme_init_ctrl_finish(&anv->ctrl, false); if (ret) goto out; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce8314aee1dd..aedacf2fba69 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2192,7 +2192,7 @@ const struct pr_ops nvme_pr_ops = { }; #ifdef CONFIG_BLK_SED_OPAL -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, +static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct nvme_ctrl *ctrl = data; @@ -2209,7 +2209,23 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, NVME_QID_ANY, 1, 0); } -EXPORT_SYMBOL_GPL(nvme_sec_submit); + +static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended) +{ + if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) { + if (!ctrl->opal_dev) + ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit); + else if (was_suspended) + opal_unlock_from_suspend(ctrl->opal_dev); + } else { + free_opal_dev(ctrl->opal_dev); + ctrl->opal_dev = NULL; + } +} +#else +static void nvme_configure_opal(struct nvme_ctrl 
*ctrl, bool was_suspended) +{ +} #endif /* CONFIG_BLK_SED_OPAL */ #ifdef CONFIG_BLK_DEV_ZONED @@ -3242,7 +3258,7 @@ out_free: * register in our nvme_ctrl structure. This should be called as soon as * the admin queue is fully up and running. */ -int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended) { int ret; @@ -3273,6 +3289,8 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) if (ret < 0) return ret; + nvme_configure_opal(ctrl, was_suspended); + if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { /* * Do not return errors unless we are in a controller reset, @@ -5007,6 +5025,7 @@ static void nvme_free_ctrl(struct device *dev) nvme_auth_stop(ctrl); nvme_auth_free(ctrl); __free_page(ctrl->discard_page); + free_opal_dev(ctrl->opal_dev); if (subsys) { mutex_lock(&nvme_subsystems_lock); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2d3c54838496..1f9f4075794b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3107,7 +3107,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) nvme_start_admin_queue(&ctrl->ctrl); - ret = nvme_init_ctrl_finish(&ctrl->ctrl); + ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) goto out_disconnect_admin_queue; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 16b34a491495..306a120d49ab 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -736,7 +736,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); -int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int flags, unsigned int cmd_size); @@ -748,9 +748,6 
@@ void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, - bool send); - void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, volatile union nvme_result *res); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 208c387f1558..e4f084e12b96 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2772,7 +2772,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) nvme_free_tagset(dev); if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); - free_opal_dev(dev->ctrl.opal_dev); mempool_destroy(dev->iod_mempool); put_device(dev->dev); kfree(dev->queues); @@ -2866,21 +2865,10 @@ static void nvme_reset_work(struct work_struct *work) */ dev->ctrl.max_integrity_segments = 1; - result = nvme_init_ctrl_finish(&dev->ctrl); + result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend); if (result) goto out; - if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) { - if (!dev->ctrl.opal_dev) - dev->ctrl.opal_dev = - init_opal_dev(&dev->ctrl, &nvme_sec_submit); - else if (was_suspend) - opal_unlock_from_suspend(dev->ctrl.opal_dev); - } else { - free_opal_dev(dev->ctrl.opal_dev); - dev->ctrl.opal_dev = NULL; - } - if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { result = nvme_dbbuf_dma_alloc(dev); if (result) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6e079abb22ee..ccd45e5b3298 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -871,7 +871,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, nvme_start_admin_queue(&ctrl->ctrl); - error = nvme_init_ctrl_finish(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) goto out_quiesce_queue; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 1eed0fc26b3a..4f8584657bb7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1949,7 +1949,7 
@@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) nvme_start_admin_queue(ctrl); - error = nvme_init_ctrl_finish(ctrl); + error = nvme_init_ctrl_finish(ctrl, false); if (error) goto out_quiesce_queue; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index b45fe3adf015..893c50f365c4 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -377,7 +377,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) nvme_start_admin_queue(&ctrl->ctrl); - error = nvme_init_ctrl_finish(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) goto out_cleanup_tagset; From 86adbf0cdb9ec6533234696c3e243184d4d0d040 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Oct 2022 02:34:13 -0700 Subject: [PATCH 10/45] nvme: simplify transport specific device attribute handling Allow the transport driver to override the attribute groups for the control device, so that the PCIe driver doesn't manually have to add a group after device creation and keep track of it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/core.c | 8 ++++++-- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/pci.c | 23 ++++++++--------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index aedacf2fba69..6040a13d3e2d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3906,10 +3906,11 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return a->mode; } -static const struct attribute_group nvme_dev_attrs_group = { +const struct attribute_group nvme_dev_attrs_group = { .attrs = nvme_dev_attrs, .is_visible = nvme_dev_attrs_are_visible, }; +EXPORT_SYMBOL_GPL(nvme_dev_attrs_group); static const struct attribute_group *nvme_dev_attr_groups[] = { &nvme_dev_attrs_group, @@ -5091,7 +5092,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->instance); ctrl->device->class = nvme_class; ctrl->device->parent = ctrl->dev; - ctrl->device->groups = nvme_dev_attr_groups; + if (ops->dev_attr_groups) + ctrl->device->groups = ops->dev_attr_groups; + else + ctrl->device->groups = nvme_dev_attr_groups; ctrl->device->release = nvme_free_ctrl; dev_set_drvdata(ctrl->device, ctrl); ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 306a120d49ab..924ff80d85f6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -508,6 +508,7 @@ struct nvme_ctrl_ops { unsigned int flags; #define NVME_F_FABRICS (1 << 0) #define NVME_F_METADATA_SUPPORTED (1 << 1) + const struct attribute_group **dev_attr_groups; int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); @@ -854,6 +855,7 @@ int nvme_dev_uring_cmd(struct io_uring_cmd 
*ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; +extern const struct attribute_group nvme_dev_attrs_group; struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); #ifdef CONFIG_NVME_MULTIPATH diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e4f084e12b96..c8f6ce5eee1c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -158,8 +158,6 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; - - bool attrs_added; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -2234,11 +2232,17 @@ static struct attribute *nvme_pci_attrs[] = { NULL, }; -static const struct attribute_group nvme_pci_attr_group = { +static const struct attribute_group nvme_pci_dev_attrs_group = { .attrs = nvme_pci_attrs, .is_visible = nvme_pci_attrs_are_visible, }; +static const struct attribute_group *nvme_pci_dev_attr_groups[] = { + &nvme_dev_attrs_group, + &nvme_pci_dev_attrs_group, + NULL, +}; + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. 
@@ -2930,10 +2934,6 @@ static void nvme_reset_work(struct work_struct *work) goto out; } - if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj, - &nvme_pci_attr_group)) - dev->attrs_added = true; - nvme_start_ctrl(&dev->ctrl); return; @@ -3006,6 +3006,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, .flags = NVME_F_METADATA_SUPPORTED, + .dev_attr_groups = nvme_pci_dev_attr_groups, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, @@ -3204,13 +3205,6 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_disable_prepare_reset(dev, true); } -static void nvme_remove_attrs(struct nvme_dev *dev) -{ - if (dev->attrs_added) - sysfs_remove_group(&dev->ctrl.device->kobj, - &nvme_pci_attr_group); -} - /* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in @@ -3232,7 +3226,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); - nvme_remove_attrs(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); From 96ef1be53663a9343dffcf106e2f1b59da4b8799 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:10:21 +0100 Subject: [PATCH 11/45] nvme-pci: put the admin queue in nvme_dev_remove_admin Once the controller is shutdown no one can access the admin queue. Tear it down in nvme_dev_remove_admin, which matches the flow in the other drivers. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c8f6ce5eee1c..f526ad578088 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1747,6 +1747,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) */ nvme_start_admin_queue(&dev->ctrl); blk_mq_destroy_queue(dev->ctrl.admin_q); + blk_put_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } } @@ -2774,8 +2775,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) nvme_dbbuf_dma_free(dev); nvme_free_tagset(dev); - if (dev->ctrl.admin_q) - blk_put_queue(dev->ctrl.admin_q); mempool_destroy(dev->iod_mempool); put_device(dev->dev); kfree(dev->queues); From c11b7716d6c96e82d2b404c4520237d968357a0d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:11:13 +0100 Subject: [PATCH 12/45] nvme-pci: move more teardown work to nvme_remove nvme_dbbuf_dma_free frees dma coherent memory, so it must not be called after ->remove has returned. Fortunately there is no way to use it after shutdown as no more I/O is possible so it can be moved. Similarly the iod_mempool can't be used for a device kept alive after shutdown, so move it next to freeing the PRP pools. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f526ad578088..b638f43f2df2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2773,9 +2773,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); - nvme_dbbuf_dma_free(dev); nvme_free_tagset(dev); - mempool_destroy(dev->iod_mempool); put_device(dev->dev); kfree(dev->queues); kfree(dev); @@ -3227,7 +3225,9 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_disable(dev, true); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); + nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); + mempool_destroy(dev->iod_mempool); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); From 081a7d958ce4b65f9aab6e70e65b0b2e0b92297c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:41:41 +0100 Subject: [PATCH 13/45] nvme-pci: factor the iod mempool creation into a helper Add a helper to create the iod mempool. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b638f43f2df2..f7dab65bf504 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -390,14 +390,6 @@ static int nvme_pci_npages_sgl(void) PAGE_SIZE); } -static size_t nvme_pci_iod_alloc_size(void) -{ - size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - - return sizeof(__le64 *) * npages + - sizeof(struct scatterlist) * NVME_MAX_SEGS; -} - static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -2762,6 +2754,22 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } +static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) +{ + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); + size_t alloc_size = sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; + + WARN_ON_ONCE(alloc_size > PAGE_SIZE); + dev->iod_mempool = mempool_create_node(1, + mempool_kmalloc, mempool_kfree, + (void *)alloc_size, GFP_KERNEL, + dev_to_node(dev->dev)); + if (!dev->iod_mempool) + return -ENOMEM; + return 0; +} + static void nvme_free_tagset(struct nvme_dev *dev) { if (dev->tagset.tags) @@ -3087,7 +3095,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) int node, result = -ENOMEM; struct nvme_dev *dev; unsigned long quirks = id->driver_data; - size_t alloc_size; node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) @@ -3132,21 +3139,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) quirks |= NVME_QUIRK_SIMPLE_SUSPEND; } - /* - * Double check that our mempool alloc size will cover the biggest - * command we support. 
- */ - alloc_size = nvme_pci_iod_alloc_size(); - WARN_ON_ONCE(alloc_size > PAGE_SIZE); - - dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, - mempool_kfree, - (void *) alloc_size, - GFP_KERNEL, node); - if (!dev->iod_mempool) { - result = -ENOMEM; + result = nvme_pci_alloc_iod_mempool(dev); + if (result) goto release_pools; - } result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, quirks); From 2e87570be9d2746e7c4e7ab1cc18fd3ca7de2768 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:44:00 +0100 Subject: [PATCH 14/45] nvme-pci: factor out a nvme_pci_alloc_dev helper Add a helper that allocates the nvme_dev structure up to the point where we can call nvme_init_ctrl. This pairs with the free_ctrl method and can thus be used to cleanup the teardown path and make it more symmetric. Note that this now calls nvme_init_ctrl a lot earlier during probing, which also means the per-controller character device shows up earlier. Due to the controller state no commands can be sent on it, but it might make sense to delay the cdev registration until nvme_init_ctrl_finish.
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 83 +++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f7dab65bf504..03c83cd724ec 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2777,6 +2777,7 @@ static void nvme_free_tagset(struct nvme_dev *dev) dev->ctrl.tagset = NULL; } +/* pairs with nvme_pci_alloc_dev */ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); @@ -3090,19 +3091,23 @@ static void nvme_async_probe(void *data, async_cookie_t cookie) nvme_put_ctrl(&dev->ctrl); } -static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, + const struct pci_device_id *id) { - int node, result = -ENOMEM; - struct nvme_dev *dev; unsigned long quirks = id->driver_data; + int node = dev_to_node(&pdev->dev); + struct nvme_dev *dev; + int ret = -ENOMEM; - node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) set_dev_node(&pdev->dev, first_memory_node); dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) - return -ENOMEM; + return NULL; + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + mutex_init(&dev->shutdown_lock); dev->nr_write_queues = write_queues; dev->nr_poll_queues = poll_queues; @@ -3110,25 +3115,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->queues = kcalloc_node(dev->nr_allocated_queues, sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) - goto free; + goto out_free_dev; dev->dev = get_device(&pdev->dev); - pci_set_drvdata(pdev, dev); - - result = nvme_dev_map(dev); - if (result) - goto put_pci; - - INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); - 
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - mutex_init(&dev->shutdown_lock); - - result = nvme_setup_prp_pools(dev); - if (result) - goto unmap; quirks |= check_vendor_combination_bug(pdev); - if (!noacpi && acpi_storage_d3(&pdev->dev)) { /* * Some systems use a bios work around to ask for D3 on @@ -3138,34 +3129,54 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) "platform quirk: setting simple suspend\n"); quirks |= NVME_QUIRK_SIMPLE_SUSPEND; } + ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + quirks); + if (ret) + goto out_put_device; + return dev; + +out_put_device: + put_device(dev->dev); + kfree(dev->queues); +out_free_dev: + kfree(dev); + return ERR_PTR(ret); +} + +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct nvme_dev *dev; + int result = -ENOMEM; + + dev = nvme_pci_alloc_dev(pdev, id); + if (!dev) + return -ENOMEM; + + result = nvme_dev_map(dev); + if (result) + goto out_uninit_ctrl; + + result = nvme_setup_prp_pools(dev); + if (result) + goto out_dev_unmap; result = nvme_pci_alloc_iod_mempool(dev); if (result) - goto release_pools; - - result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, - quirks); - if (result) - goto release_mempool; + goto out_release_prp_pools; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + pci_set_drvdata(pdev, dev); nvme_reset_ctrl(&dev->ctrl); async_schedule(nvme_async_probe, dev); - return 0; - release_mempool: - mempool_destroy(dev->iod_mempool); - release_pools: +out_release_prp_pools: nvme_release_prp_pools(dev); - unmap: +out_dev_unmap: nvme_dev_unmap(dev); - put_pci: - put_device(dev->dev); - free: - kfree(dev->queues); - kfree(dev); +out_uninit_ctrl: + nvme_uninit_ctrl(&dev->ctrl); return result; } From 3f30a79c2e2c7d5ad14dfc372adb248fc239c6c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 09:48:43 +0100 Subject: [PATCH 15/45] nvme-pci: set constant 
paramters in nvme_pci_alloc_ctrl Move setting of low-level constant parameters from nvme_reset_work to nvme_pci_alloc_ctrl. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 03c83cd724ec..9dcb35f14800 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2841,21 +2841,6 @@ static void nvme_reset_work(struct work_struct *work) nvme_start_admin_queue(&dev->ctrl); } - dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); - - /* - * Limit the max command size to prevent iod->sg allocations going - * over a single page. - */ - dev->ctrl.max_hw_sectors = min_t(u32, - NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); - dev->ctrl.max_segments = NVME_MAX_SEGS; - - /* - * Don't limit the IOMMU merged segment size. - */ - dma_set_max_seg_size(dev->dev, 0xffffffff); - mutex_unlock(&dev->shutdown_lock); /* @@ -2869,12 +2854,6 @@ static void nvme_reset_work(struct work_struct *work) goto out; } - /* - * We do not support an SGL for metadata (yet), so we are limited to a - * single integrity segment for the separate metadata pointer. - */ - dev->ctrl.max_integrity_segments = 1; - result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend); if (result) goto out; @@ -3133,6 +3112,23 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, quirks); if (ret) goto out_put_device; + + dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1); + dma_set_max_seg_size(&pdev->dev, 0xffffffff); + + /* + * Limit the max command size to prevent iod->sg allocations going + * over a single page. 
+ */ + dev->ctrl.max_hw_sectors = min_t(u32, + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9); + dev->ctrl.max_segments = NVME_MAX_SEGS; + + /* + * There is no support for SGLs for metadata (yet), so we are limited to + * a single integrity segment for the separate metadata pointer. + */ + dev->ctrl.max_integrity_segments = 1; return dev; out_put_device: From a6ee7f19ebfd158ffb3a6ebacf20ae43549bed05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Oct 2022 03:28:16 -0700 Subject: [PATCH 16/45] nvme-pci: call nvme_pci_configure_admin_queue from nvme_pci_enable nvme_pci_configure_admin_queue is called right after nvme_pci_enable, and its work is undone by nvme_dev_disable. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9dcb35f14800..c2e3a87237da 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2639,7 +2639,8 @@ static int nvme_pci_enable(struct nvme_dev *dev) pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev); - return 0; + + return nvme_pci_configure_admin_queue(dev); disable: pci_disable_device(pdev); @@ -2829,10 +2830,6 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out_unlock; - result = nvme_pci_configure_admin_queue(dev); - if (result) - goto out_unlock; - if (!dev->ctrl.admin_q) { result = nvme_pci_alloc_admin_tag_set(dev); if (result) From 65a54646420e1409760c2b9f0e1a5e5feca1364e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 10:56:49 +0100 Subject: [PATCH 17/45] nvme-pci: simplify nvme_dbbuf_dma_alloc Move the OACS check and the error checking into nvme_dbbuf_dma_alloc so that an upcoming second caller doesn't have to duplicate this boilerplate code.
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/pci.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c2e3a87237da..4da339690ec6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -239,10 +239,13 @@ static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) return dev->nr_allocated_queues * 8 * dev->db_stride; } -static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) +static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev) { unsigned int mem_size = nvme_dbbuf_size(dev); + if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP)) + return; + if (dev->dbbuf_dbs) { /* * Clear the dbbuf memory so the driver doesn't observe stale @@ -250,25 +253,27 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) */ memset(dev->dbbuf_dbs, 0, mem_size); memset(dev->dbbuf_eis, 0, mem_size); - return 0; + return; } dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, &dev->dbbuf_dbs_dma_addr, GFP_KERNEL); if (!dev->dbbuf_dbs) - return -ENOMEM; + goto fail; dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size, &dev->dbbuf_eis_dma_addr, GFP_KERNEL); - if (!dev->dbbuf_eis) { - dma_free_coherent(dev->dev, mem_size, - dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); - dev->dbbuf_dbs = NULL; - return -ENOMEM; - } + if (!dev->dbbuf_eis) + goto fail_free_dbbuf_dbs; + return; - return 0; +fail_free_dbbuf_dbs: + dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs, + dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; +fail: + dev_warn(dev->dev, "unable to allocate dma for dbbuf\n"); } static void nvme_dbbuf_dma_free(struct nvme_dev *dev) @@ -2855,12 +2860,7 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { - result = nvme_dbbuf_dma_alloc(dev); - if (result) - dev_warn(dev->dev, - "unable to allocate dma for dbbuf\n"); - } + 
nvme_dbbuf_dma_alloc(dev); if (dev->ctrl.hmpre) { result = nvme_setup_host_mem(dev); From acb71e53bb47a08731aa77497cd3f1871cba59c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 Nov 2022 12:05:58 +0100 Subject: [PATCH 18/45] nvme-pci: move the HMPRE check into nvme_setup_host_mem Check that a HMB is wanted into the allocation helper instead of the caller. This makes life simpler for an upcoming second caller. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/pci.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4da339690ec6..57fb88396fd8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2102,6 +2102,9 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) u32 enable_bits = NVME_HOST_MEM_ENABLE; int ret; + if (!dev->ctrl.hmpre) + return 0; + preferred = min(preferred, max); if (min > max) { dev_warn(dev->ctrl.device, @@ -2862,11 +2865,9 @@ static void nvme_reset_work(struct work_struct *work) nvme_dbbuf_dma_alloc(dev); - if (dev->ctrl.hmpre) { - result = nvme_setup_host_mem(dev); - if (result < 0) - goto out; - } + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out; result = nvme_setup_io_queues(dev); if (result) From eac3ef262941f62fe01bc357911fea4847183333 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 13 Nov 2022 12:07:04 +0100 Subject: [PATCH 19/45] nvme-pci: split the initial probe from the rest path nvme_reset_work is a little fragile as it needs to handle both resetting a live controller and initializing one during probe. Split out the initial probe and open code it in nvme_probe and leave nvme_reset_work to just do the live controller reset. 
This fixes a recently introduced bug where nvme_dev_disable causes a NULL pointer dereferences in blk_mq_quiesce_tagset because the tagset pointer is not set when the reset state is entered directly from the new state. The separate probe code can skip the reset state and probe directly and fixes this. To make sure the system isn't single threaded on enabling nvme controllers, set the PROBE_PREFER_ASYNCHRONOUS flag in the device_driver structure so that the driver core probes in parallel. Fixes: 98d81f0df70c ("nvme: use blk_mq_[un]quiesce_tagset") Reported-by: Gerd Bayer Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Tested-by Gerd Bayer --- drivers/nvme/host/pci.c | 133 ++++++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 53 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 57fb88396fd8..2521c00a0af6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2837,15 +2837,7 @@ static void nvme_reset_work(struct work_struct *work) result = nvme_pci_enable(dev); if (result) goto out_unlock; - - if (!dev->ctrl.admin_q) { - result = nvme_pci_alloc_admin_tag_set(dev); - if (result) - goto out_unlock; - } else { - nvme_start_admin_queue(&dev->ctrl); - } - + nvme_start_admin_queue(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); /* @@ -2873,37 +2865,23 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - if (dev->ctrl.tagset) { - /* - * This is a controller reset and we already have a tagset. - * Freeze and update the number of I/O queues as thos might have - * changed. If there are no I/O queues left after this reset, - * keep the controller around but remove all namespaces. 
- */ - if (dev->online_queues > 1) { - nvme_start_queues(&dev->ctrl); - nvme_wait_freeze(&dev->ctrl); - nvme_pci_update_nr_queues(dev); - nvme_dbbuf_set(dev); - nvme_unfreeze(&dev->ctrl); - } else { - dev_warn(dev->ctrl.device, "IO queues lost\n"); - nvme_mark_namespaces_dead(&dev->ctrl); - nvme_start_queues(&dev->ctrl); - nvme_remove_namespaces(&dev->ctrl); - nvme_free_tagset(dev); - } + /* + * Freeze and update the number of I/O queues as thos might have + * changed. If there are no I/O queues left after this reset, keep the + * controller around but remove all namespaces. + */ + if (dev->online_queues > 1) { + nvme_start_queues(&dev->ctrl); + nvme_wait_freeze(&dev->ctrl); + nvme_pci_update_nr_queues(dev); + nvme_dbbuf_set(dev); + nvme_unfreeze(&dev->ctrl); } else { - /* - * First probe. Still allow the controller to show up even if - * there are no namespaces. - */ - if (dev->online_queues > 1) { - nvme_pci_alloc_tag_set(dev); - nvme_dbbuf_set(dev); - } else { - dev_warn(dev->ctrl.device, "IO queues not created\n"); - } + dev_warn(dev->ctrl.device, "IO queues lost\n"); + nvme_mark_namespaces_dead(&dev->ctrl); + nvme_start_queues(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); + nvme_free_tagset(dev); } /* @@ -3059,15 +3037,6 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } -static void nvme_async_probe(void *data, async_cookie_t cookie) -{ - struct nvme_dev *dev = data; - - flush_work(&dev->ctrl.reset_work); - flush_work(&dev->ctrl.scan_work); - nvme_put_ctrl(&dev->ctrl); -} - static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -3159,12 +3128,69 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto out_release_prp_pools; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + + result = nvme_pci_enable(dev); + if (result) + goto out_release_iod_mempool; + + result = nvme_pci_alloc_admin_tag_set(dev); + if (result) + goto 
out_disable; + + /* + * Mark the controller as connecting before sending admin commands to + * allow the timeout handler to do the right thing. + */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller CONNECTING\n"); + result = -EBUSY; + goto out_disable; + } + + result = nvme_init_ctrl_finish(&dev->ctrl, false); + if (result) + goto out_disable; + + nvme_dbbuf_dma_alloc(dev); + + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out_disable; + + result = nvme_setup_io_queues(dev); + if (result) + goto out_disable; + + if (dev->online_queues > 1) { + nvme_pci_alloc_tag_set(dev); + nvme_dbbuf_set(dev); + } else { + dev_warn(dev->ctrl.device, "IO queues not created\n"); + } + + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { + dev_warn(dev->ctrl.device, + "failed to mark controller live state\n"); + result = -ENODEV; + goto out_disable; + } + pci_set_drvdata(pdev, dev); - nvme_reset_ctrl(&dev->ctrl); - async_schedule(nvme_async_probe, dev); + nvme_start_ctrl(&dev->ctrl); + nvme_put_ctrl(&dev->ctrl); return 0; +out_disable: + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + nvme_dev_disable(dev, true); + nvme_free_host_mem(dev); + nvme_dev_remove_admin(dev); + nvme_dbbuf_dma_free(dev); + nvme_free_queues(dev, 0); +out_release_iod_mempool: + mempool_destroy(dev->iod_mempool); out_release_prp_pools: nvme_release_prp_pools(dev); out_dev_unmap: @@ -3560,11 +3586,12 @@ static struct pci_driver nvme_driver = { .probe = nvme_probe, .remove = nvme_remove, .shutdown = nvme_shutdown, -#ifdef CONFIG_PM_SLEEP .driver = { - .pm = &nvme_dev_pm_ops, - }, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, +#ifdef CONFIG_PM_SLEEP + .pm = &nvme_dev_pm_ops, #endif + }, .sriov_configure = pci_sriov_configure_simple, .err_handler = &nvme_err_handler, }; From c7c16c5b196772aba1d99c4c0b505fe99a6fbba8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2022 15:42:57 +0100 Subject: 
[PATCH 20/45] nvme-pci: don't unbind the driver on reset failure Unbind a device driver when a reset fails is very unusual behavior. Just shut the controller down and leave it in dead state if we fail to reset it. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/pci.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2521c00a0af6..d394498d96de 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -130,7 +130,6 @@ struct nvme_dev { u32 db_stride; void __iomem *bar; unsigned long bar_mapped_size; - struct work_struct remove_work; struct mutex shutdown_lock; bool subsystem; u64 cmb_size; @@ -2797,20 +2796,6 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev) -{ - /* - * Set state to deleting now to avoid blocking nvme_wait_reset(), which - * may be holding this pci_dev's device lock. 
- */ - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - nvme_get_ctrl(&dev->ctrl); - nvme_dev_disable(dev, false); - nvme_mark_namespaces_dead(&dev->ctrl); - if (!queue_work(nvme_wq, &dev->remove_work)) - nvme_put_ctrl(&dev->ctrl); -} - static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = @@ -2901,20 +2886,16 @@ static void nvme_reset_work(struct work_struct *work) out_unlock: mutex_unlock(&dev->shutdown_lock); out: - if (result) - dev_warn(dev->ctrl.device, - "Removing after probe failure status: %d\n", result); - nvme_remove_dead_ctrl(dev); -} - -static void nvme_remove_dead_ctrl_work(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); - struct pci_dev *pdev = to_pci_dev(dev->dev); - - if (pci_get_drvdata(pdev)) - device_release_driver(&pdev->dev); - nvme_put_ctrl(&dev->ctrl); + /* + * Set state to deleting now to avoid blocking nvme_wait_reset(), which + * may be holding this pci_dev's device lock. 
+ */ + dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n", + result); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + nvme_dev_disable(dev, true); + nvme_mark_namespaces_dead(&dev->ctrl); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); } static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) @@ -3052,7 +3033,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, if (!dev) return NULL; INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); - INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); mutex_init(&dev->shutdown_lock); dev->nr_write_queues = write_queues; From 0a7ce375f83f4ade7c2a835444093b6870fb8257 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:05 +0200 Subject: [PATCH 21/45] nvme-auth: rename __nvme_auth_[reset|free] to nvme_auth[reset|free]_dhchap nvme_auth_[reset|free] operate on the controller while __nvme_auth_[reset|free] operate on a chap struct (which maps to a queue context). Rename it for clarity. 
Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index c8a6db7c4498..d45333268fcf 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -654,7 +654,7 @@ gen_sesskey: return 0; } -static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) +static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) { kfree_sensitive(chap->host_response); chap->host_response = NULL; @@ -676,9 +676,9 @@ static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) memset(chap->c2, 0, sizeof(chap->c2)); } -static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap) +static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) { - __nvme_auth_reset(chap); + nvme_auth_reset_dhchap(chap); if (chap->shash_tfm) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) @@ -868,7 +868,7 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) dev_dbg(ctrl->device, "qid %d: re-using context\n", qid); mutex_unlock(&ctrl->dhchap_auth_mutex); flush_work(&chap->auth_work); - __nvme_auth_reset(chap); + nvme_auth_reset_dhchap(chap); queue_work(nvme_wq, &chap->auth_work); return 0; } @@ -928,7 +928,7 @@ void nvme_auth_reset(struct nvme_ctrl *ctrl) list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { mutex_unlock(&ctrl->dhchap_auth_mutex); flush_work(&chap->auth_work); - __nvme_auth_reset(chap); + nvme_auth_reset_dhchap(chap); } mutex_unlock(&ctrl->dhchap_auth_mutex); } @@ -1002,7 +1002,7 @@ void nvme_auth_free(struct nvme_ctrl *ctrl) list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) { list_del_init(&chap->entry); flush_work(&chap->auth_work); - __nvme_auth_free(chap); + nvme_auth_free_dhchap(chap); } mutex_unlock(&ctrl->dhchap_auth_mutex); if (ctrl->host_key) { From 
0c999e69c40a87285f910c400b550fad866e99d0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:06 +0200 Subject: [PATCH 22/45] nvme-auth: rename authentication work elements Use nvme_ctrl_auth_work and nvme_queue_auth_work for better readability. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index d45333268fcf..e3e801e2b78d 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -691,7 +691,7 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) kfree(chap); } -static void __nvme_auth_work(struct work_struct *work) +static void nvme_queue_auth_work(struct work_struct *work) { struct nvme_dhchap_queue_context *chap = container_of(work, struct nvme_dhchap_queue_context, auth_work); @@ -893,7 +893,7 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) return -ENOMEM; } - INIT_WORK(&chap->auth_work, __nvme_auth_work); + INIT_WORK(&chap->auth_work, nvme_queue_auth_work); list_add(&chap->entry, &ctrl->dhchap_auth_list); mutex_unlock(&ctrl->dhchap_auth_mutex); queue_work(nvme_wq, &chap->auth_work); @@ -934,7 +934,7 @@ void nvme_auth_reset(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_auth_reset); -static void nvme_dhchap_auth_work(struct work_struct *work) +static void nvme_ctrl_auth_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, dhchap_auth_work); @@ -973,7 +973,7 @@ static void nvme_dhchap_auth_work(struct work_struct *work) void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { INIT_LIST_HEAD(&ctrl->dhchap_auth_list); - INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work); + INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); mutex_init(&ctrl->dhchap_auth_mutex); if (!ctrl->opts) return; From 
100b555bc204fc754108351676297805f5affa49 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:07 +0200 Subject: [PATCH 23/45] nvme-auth: remove symbol export from nvme_auth_reset Only the nvme module calls it. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index e3e801e2b78d..2f823c6b84fd 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -932,7 +932,6 @@ void nvme_auth_reset(struct nvme_ctrl *ctrl) } mutex_unlock(&ctrl->dhchap_auth_mutex); } -EXPORT_SYMBOL_GPL(nvme_auth_reset); static void nvme_ctrl_auth_work(struct work_struct *work) { From c7390f132a896ff1a3fa26ea2b0be4f9ceb9041e Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:08 +0200 Subject: [PATCH 24/45] nvme-auth: don't re-authenticate if the controller is not LIVE The connect sequence will re-authenticate. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 2f823c6b84fd..4f2c8d0567bd 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -939,6 +939,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work) container_of(work, struct nvme_ctrl, dhchap_auth_work); int ret, q; + /* + * If the ctrl is no connected, bail as reconnect will handle + * authentication. 
+ */ + if (ctrl->state != NVME_CTRL_LIVE) + return; + /* Authenticate admin queue first */ ret = nvme_auth_negotiate(ctrl, 0); if (ret) { From f6b182fbd5c608bd6cbaaaee35b1325443f48043 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:09 +0200 Subject: [PATCH 25/45] nvme-auth: remove redundant buffer deallocations host_response, host_key, ctrl_key and sess_key are freed in nvme_auth_reset_dhchap which is called from nvme_auth_free_dhchap. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 4f2c8d0567bd..0d0542e33484 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -683,10 +683,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); - kfree_sensitive(chap->ctrl_key); - kfree_sensitive(chap->host_key); - kfree_sensitive(chap->sess_key); - kfree_sensitive(chap->host_response); kfree(chap->buf); kfree(chap); } From 193a8c7e5f1a8481841636cec9c185543ec5c759 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:10 +0200 Subject: [PATCH 26/45] nvme-auth: don't ignore key generation failures when initializing ctrl keys nvme_auth_generate_key can fail, don't ignore it upon initialization. 
Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 19 +++++++++++++++---- drivers/nvme/host/core.c | 6 +++++- drivers/nvme/host/nvme.h | 7 +++++-- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 0d0542e33484..d62862ef5b3f 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -972,15 +972,26 @@ static void nvme_ctrl_auth_work(struct work_struct *work) */ } -void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) +int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { + int ret; + INIT_LIST_HEAD(&ctrl->dhchap_auth_list); INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); mutex_init(&ctrl->dhchap_auth_mutex); if (!ctrl->opts) - return; - nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key); - nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key); + return 0; + ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret, + &ctrl->host_key); + if (ret) + return ret; + ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, + &ctrl->ctrl_key); + if (ret) { + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = NULL; + } + return ret; } EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6040a13d3e2d..3d6751cbf40e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5119,9 +5119,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); nvme_mpath_init_ctrl(ctrl); - nvme_auth_init_ctrl(ctrl); + ret = nvme_auth_init_ctrl(ctrl); + if (ret) + goto out_free_cdev; return 0; +out_free_cdev: + cdev_device_del(&ctrl->cdev, ctrl->device); out_free_name: nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 924ff80d85f6..47f96ab14c6a 
100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1018,14 +1018,17 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) } #ifdef CONFIG_NVME_AUTH -void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); +int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); void nvme_auth_reset(struct nvme_ctrl *ctrl); void nvme_auth_free(struct nvme_ctrl *ctrl); #else -static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {}; +static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) +{ + return 0; +} static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {}; static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) { From 01604350e14560d4d69323eb1ba12a257a643ea8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:11 +0200 Subject: [PATCH 27/45] nvme-auth: don't override ctrl keys before validation Replace ctrl ctrl_key/host_key only after nvme_auth_generate_key is successful. Also, this fixes a bug where the keys are leaked. 
Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3d6751cbf40e..bad55fe0de7d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3776,13 +3776,17 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, memcpy(dhchap_secret, buf, count); nvme_auth_stop(ctrl); if (strcmp(dhchap_secret, opts->dhchap_secret)) { + struct nvme_dhchap_key *key, *host_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key); + ret = nvme_auth_generate_key(dhchap_secret, &key); if (ret) return ret; kfree(opts->dhchap_secret); opts->dhchap_secret = dhchap_secret; + host_key = ctrl->host_key; + ctrl->host_key = key; + nvme_auth_free_key(host_key); /* Key has changed; re-authentication with new key */ nvme_auth_reset(ctrl); } @@ -3826,13 +3830,17 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, memcpy(dhchap_secret, buf, count); nvme_auth_stop(ctrl); if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) { + struct nvme_dhchap_key *key, *ctrl_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key); + ret = nvme_auth_generate_key(dhchap_secret, &key); if (ret) return ret; kfree(opts->dhchap_ctrl_secret); opts->dhchap_ctrl_secret = dhchap_secret; + ctrl_key = ctrl->ctrl_key; + ctrl->ctrl_key = key; + nvme_auth_free_key(ctrl_key); /* Key has changed; re-authentication with new key */ nvme_auth_reset(ctrl); } From bfc4068e1e55e30a86f0e82e15163a60f99a894d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:12 +0200 Subject: [PATCH 28/45] nvme-auth: remove redundant if statement No one passes NVME_QID_ANY to nvme_auth_negotiate. 
Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index d62862ef5b3f..e7e4a00ee37e 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -874,7 +874,7 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) mutex_unlock(&ctrl->dhchap_auth_mutex); return -ENOMEM; } - chap->qid = (qid == NVME_QID_ANY) ? 0 : qid; + chap->qid = qid; chap->ctrl = ctrl; /* From b7d604cae8f6edde53ac8aa9038ee154be562eb5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:13 +0200 Subject: [PATCH 29/45] nvme-auth: don't keep long lived 4k dhchap buffer dhchap structure is per-queue, it is wasteful to keep it for the entire lifetime of the queue. Allocate it dynamically and get rid of it after authentication. We don't need kzalloc because all accessors are clearing it before writing to it. Also, remove redundant chap buf_size which is always 4096, use a define instead. 
Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 47 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index e7e4a00ee37e..0812eb9a5d0b 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -13,6 +13,8 @@ #include "fabrics.h" #include +#define CHAP_BUF_SIZE 4096 + struct nvme_dhchap_queue_context { struct list_head entry; struct work_struct auth_work; @@ -20,7 +22,6 @@ struct nvme_dhchap_queue_context { struct crypto_shash *shash_tfm; struct crypto_kpp *dh_tfm; void *buf; - size_t buf_size; int qid; int error; u32 s1; @@ -112,7 +113,7 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, struct nvmf_auth_dhchap_negotiate_data *data = chap->buf; size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol); - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return -EINVAL; } @@ -147,7 +148,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, const char *gid_name = nvme_auth_dhgroup_name(data->dhgid); const char *hmac_name, *kpp_name; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return NVME_SC_INVALID_FIELD; } @@ -302,7 +303,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, if (chap->host_key_len) size += chap->host_key_len; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return -EINVAL; } @@ -347,7 +348,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, if (ctrl->ctrl_key) size += chap->hash_len; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return NVME_SC_INVALID_FIELD; 
} @@ -674,6 +675,8 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) chap->transaction = 0; memset(chap->c1, 0, sizeof(chap->c1)); memset(chap->c2, 0, sizeof(chap->c2)); + kfree(chap->buf); + chap->buf = NULL; } static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) @@ -683,7 +686,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); - kfree(chap->buf); kfree(chap); } @@ -695,6 +697,16 @@ static void nvme_queue_auth_work(struct work_struct *work) size_t tl; int ret = 0; + /* + * Allocate a large enough buffer for the entire negotiation: + * 4k is enough to ffdhe8192. + */ + chap->buf = kmalloc(CHAP_BUF_SIZE, GFP_KERNEL); + if (!chap->buf) { + chap->error = -ENOMEM; + return; + } + chap->transaction = ctrl->transaction++; /* DH-HMAC-CHAP Step 1: send negotiate */ @@ -716,8 +728,9 @@ static void nvme_queue_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d receive challenge\n", __func__, chap->qid); - memset(chap->buf, 0, chap->buf_size); - ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + memset(chap->buf, 0, CHAP_BUF_SIZE); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE, + false); if (ret) { dev_warn(ctrl->device, "qid %d failed to receive challenge, %s %d\n", @@ -779,8 +792,9 @@ static void nvme_queue_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d receive success1\n", __func__, chap->qid); - memset(chap->buf, 0, chap->buf_size); - ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + memset(chap->buf, 0, CHAP_BUF_SIZE); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE, + false); if (ret) { dev_warn(ctrl->device, "qid %d failed to receive success1, %s %d\n", @@ -876,19 +890,6 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) } chap->qid = qid; chap->ctrl = ctrl; - - /* - * 
Allocate a large enough buffer for the entire negotiation: - * 4k should be enough to ffdhe8192. - */ - chap->buf_size = 4096; - chap->buf = kzalloc(chap->buf_size, GFP_KERNEL); - if (!chap->buf) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - kfree(chap); - return -ENOMEM; - } - INIT_WORK(&chap->auth_work, nvme_queue_auth_work); list_add(&chap->entry, &ctrl->dhchap_auth_list); mutex_unlock(&ctrl->dhchap_auth_mutex); From e481fc0a377798976d5c3044c7f10c86a8372b92 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 15 Nov 2022 17:08:06 +0100 Subject: [PATCH 30/45] nvme-auth: guarantee dhchap buffers under memory pressure We want to guarantee that we have chap buffers when a controller reconnects under memory pressure. Add a mempool specifically for that. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 30 ++++++++++++++++++++++++++++-- drivers/nvme/host/core.c | 6 ++++++ drivers/nvme/host/nvme.h | 9 +++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 0812eb9a5d0b..1b44676b6155 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -14,6 +14,8 @@ #include #define CHAP_BUF_SIZE 4096 +static struct kmem_cache *nvme_chap_buf_cache; +static mempool_t *nvme_chap_buf_pool; struct nvme_dhchap_queue_context { struct list_head entry; @@ -675,7 +677,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) chap->transaction = 0; memset(chap->c1, 0, sizeof(chap->c1)); memset(chap->c2, 0, sizeof(chap->c2)); - kfree(chap->buf); + mempool_free(chap->buf, nvme_chap_buf_pool); chap->buf = NULL; } @@ -701,7 +703,7 @@ static void nvme_queue_auth_work(struct work_struct *work) * Allocate a large enough buffer for the entire negotiation: * 4k is enough to ffdhe8192. 
*/ - chap->buf = kmalloc(CHAP_BUF_SIZE, GFP_KERNEL); + chap->buf = mempool_alloc(nvme_chap_buf_pool, GFP_KERNEL); if (!chap->buf) { chap->error = -ENOMEM; return; @@ -1029,3 +1031,27 @@ void nvme_auth_free(struct nvme_ctrl *ctrl) } } EXPORT_SYMBOL_GPL(nvme_auth_free); + +int __init nvme_init_auth(void) +{ + nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache", + CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL); + if (!nvme_chap_buf_cache) + return -ENOMEM; + + nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab, + mempool_free_slab, nvme_chap_buf_cache); + if (!nvme_chap_buf_pool) + goto err_destroy_chap_buf_cache; + + return 0; +err_destroy_chap_buf_cache: + kmem_cache_destroy(nvme_chap_buf_cache); + return -ENOMEM; +} + +void __exit nvme_exit_auth(void) +{ + mempool_destroy(nvme_chap_buf_pool); + kmem_cache_destroy(nvme_chap_buf_cache); +} diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bad55fe0de7d..cb5e6d07a5f8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5347,8 +5347,13 @@ static int __init nvme_core_init(void) goto unregister_generic_ns; } + result = nvme_init_auth(); + if (result) + goto destroy_ns_chr; return 0; +destroy_ns_chr: + class_destroy(nvme_ns_chr_class); unregister_generic_ns: unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); destroy_subsys_class: @@ -5369,6 +5374,7 @@ out: static void __exit nvme_core_exit(void) { + nvme_exit_auth(); class_destroy(nvme_ns_chr_class); class_destroy(nvme_subsys_class); class_destroy(nvme_class); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 47f96ab14c6a..ebd67e7dc11e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1018,6 +1018,8 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) } #ifdef CONFIG_NVME_AUTH +int __init nvme_init_auth(void); +void __exit nvme_exit_auth(void); int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl); int 
nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); @@ -1029,6 +1031,13 @@ static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { return 0; } +static inline int __init nvme_init_auth(void) +{ + return 0; +} +static inline void __exit nvme_exit_auth(void) +{ +} static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {}; static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) { From 8d1c1904e94757b78c28fbbef9285e4101d86ee9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:15 +0200 Subject: [PATCH 31/45] nvme-auth: clear sensitive info right after authentication completes We don't want to keep authentication sensitive info in memory for unlimited amount of time. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 1b44676b6155..04cf183d9519 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -912,6 +912,8 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) mutex_unlock(&ctrl->dhchap_auth_mutex); flush_work(&chap->auth_work); ret = chap->error; + /* clear sensitive info */ + nvme_auth_reset_dhchap(chap); return ret; } mutex_unlock(&ctrl->dhchap_auth_mutex); From 96df31839354c2bb9d2f0d51eb6c6f6b762fd150 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:16 +0200 Subject: [PATCH 32/45] nvme-auth: remove redundant deallocations These are now redundant as the dhchap context is removed after authentication completes. 
Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 04cf183d9519..3bffd22221c9 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -200,12 +200,6 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, return NVME_SC_AUTH_REQUIRED; } - /* Reset host response if the hash had been changed */ - if (chap->hash_id != data->hashid) { - kfree(chap->host_response); - chap->host_response = NULL; - } - chap->hash_id = data->hashid; chap->hash_len = data->hl; dev_dbg(ctrl->device, "qid %d: selected hash %s\n", @@ -222,14 +216,6 @@ select_kpp: return NVME_SC_AUTH_REQUIRED; } - /* Clear host and controller key to avoid accidental reuse */ - kfree_sensitive(chap->host_key); - chap->host_key = NULL; - chap->host_key_len = 0; - kfree_sensitive(chap->ctrl_key); - chap->ctrl_key = NULL; - chap->ctrl_key_len = 0; - if (chap->dhgroup_id == data->dhgid && (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) { dev_dbg(ctrl->device, @@ -624,9 +610,6 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl, if (ret) { dev_dbg(ctrl->device, "failed to generate public key, error %d\n", ret); - kfree(chap->host_key); - chap->host_key = NULL; - chap->host_key_len = 0; chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return ret; } @@ -646,9 +629,6 @@ gen_sesskey: if (ret) { dev_dbg(ctrl->device, "failed to generate shared secret, error %d\n", ret); - kfree_sensitive(chap->sess_key); - chap->sess_key = NULL; - chap->sess_key_len = 0; chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return ret; } From e8a420efb637f52c586596283d6fd96f2a7ecb5c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:17 +0200 Subject: [PATCH 33/45] nvme-auth: no need to reset chap contexts on 
re-authentication Now that the chap context is reset upon completion, this is no longer needed. Also remove nvme_auth_reset as no callers are left. Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 13 ------------- drivers/nvme/host/core.c | 4 ---- drivers/nvme/host/nvme.h | 1 - 3 files changed, 18 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 3bffd22221c9..d1d89920d03c 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -901,19 +901,6 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) } EXPORT_SYMBOL_GPL(nvme_auth_wait); -void nvme_auth_reset(struct nvme_ctrl *ctrl) -{ - struct nvme_dhchap_queue_context *chap; - - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - nvme_auth_reset_dhchap(chap); - } - mutex_unlock(&ctrl->dhchap_auth_mutex); -} - static void nvme_ctrl_auth_work(struct work_struct *work) { struct nvme_ctrl *ctrl = diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cb5e6d07a5f8..2944d9b565a2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3787,8 +3787,6 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, host_key = ctrl->host_key; ctrl->host_key = key; nvme_auth_free_key(host_key); - /* Key has changed; re-authentication with new key */ - nvme_auth_reset(ctrl); } /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); @@ -3841,8 +3839,6 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, ctrl_key = ctrl->ctrl_key; ctrl->ctrl_key = key; nvme_auth_free_key(ctrl_key); - /* Key has changed; re-authentication with new key */ - nvme_auth_reset(ctrl); } /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); diff --git 
a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ebd67e7dc11e..e431ec348d52 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1024,7 +1024,6 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); -void nvme_auth_reset(struct nvme_ctrl *ctrl); void nvme_auth_free(struct nvme_ctrl *ctrl); #else static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) From 546dea18c99928bb81392de63092da0e25d07b10 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:18 +0200 Subject: [PATCH 34/45] nvme-auth: check chap ctrl_key once constructed ctrl ctrl_key member may be overwritten from a sysfs context driven by the user. Once a queue local copy was created, use that instead to minimize checks on a shared resource. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index d1d89920d03c..781a6003109d 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -333,7 +333,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, struct nvmf_auth_dhchap_success1_data *data = chap->buf; size_t size = sizeof(*data); - if (ctrl->ctrl_key) + if (chap->ctrl_key) size += chap->hash_len; if (size > CHAP_BUF_SIZE) { @@ -811,7 +811,7 @@ static void nvme_queue_auth_work(struct work_struct *work) goto fail2; } - if (ctrl->ctrl_key) { + if (chap->ctrl_key) { /* DH-HMAC-CHAP Step 5: send success2 */ dev_dbg(ctrl->device, "%s: qid %d send success2\n", __func__, chap->qid); From aa36d711e945e65fa87410927800f01878a8faed Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:20 +0200 Subject: [PATCH 35/45] nvme-auth: convert 
dhchap_auth_list to an array We know exactly how many dhchap contexts we will need, there is no need to hold a list that we need to protect with a mutex. Convert to a dynamically allocated array. And dhchap_context access state is maintained by the chap itself. Make dhchap_auth_mutex protect only the ctrl host_key and ctrl_key in a fine-grained lock such that there is no long lasting acquisition of the lock and no need to take/release this lock when flushing authentication works. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 118 +++++++++++++++++++++------------------ drivers/nvme/host/core.c | 4 ++ drivers/nvme/host/nvme.h | 2 +- 3 files changed, 69 insertions(+), 55 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 781a6003109d..24726683d4bc 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -50,6 +50,12 @@ struct nvme_dhchap_queue_context { #define nvme_auth_queue_from_qid(ctrl, qid) \ (qid == 0) ? 
(ctrl)->fabrics_q : (ctrl)->connect_q +static inline int ctrl_max_dhchaps(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->nr_io_queues + ctrl->opts->nr_write_queues + + ctrl->opts->nr_poll_queues + 1; +} + static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid, void *data, size_t data_len, bool auth_send) { @@ -510,6 +516,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, ret = PTR_ERR(ctrl_response); return ret; } + ret = crypto_shash_setkey(chap->shash_tfm, ctrl_response, ctrl->ctrl_key->len); if (ret) { @@ -668,7 +675,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); - kfree(chap); } static void nvme_queue_auth_work(struct work_struct *work) @@ -748,11 +754,14 @@ static void nvme_queue_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d host response\n", __func__, chap->qid); + mutex_lock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_dhchap_setup_host_response(ctrl, chap); if (ret) { + mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } + mutex_unlock(&ctrl->dhchap_auth_mutex); /* DH-HMAC-CHAP Step 3: send reply */ dev_dbg(ctrl->device, "%s: qid %d send reply\n", @@ -793,16 +802,19 @@ static void nvme_queue_auth_work(struct work_struct *work) return; } + mutex_lock(&ctrl->dhchap_auth_mutex); if (ctrl->ctrl_key) { dev_dbg(ctrl->device, "%s: qid %d controller response\n", __func__, chap->qid); ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap); if (ret) { + mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } } + mutex_unlock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_process_dhchap_success1(ctrl, chap); if (ret) { @@ -852,29 +864,8 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) return -ENOKEY; } - mutex_lock(&ctrl->dhchap_auth_mutex); - /* Check if the context is already queued */ - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - 
WARN_ON(!chap->buf); - if (chap->qid == qid) { - dev_dbg(ctrl->device, "qid %d: re-using context\n", qid); - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - nvme_auth_reset_dhchap(chap); - queue_work(nvme_wq, &chap->auth_work); - return 0; - } - } - chap = kzalloc(sizeof(*chap), GFP_KERNEL); - if (!chap) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - return -ENOMEM; - } - chap->qid = qid; - chap->ctrl = ctrl; - INIT_WORK(&chap->auth_work, nvme_queue_auth_work); - list_add(&chap->entry, &ctrl->dhchap_auth_list); - mutex_unlock(&ctrl->dhchap_auth_mutex); + chap = &ctrl->dhchap_ctxs[qid]; + cancel_work_sync(&chap->auth_work); queue_work(nvme_wq, &chap->auth_work); return 0; } @@ -885,19 +876,12 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) struct nvme_dhchap_queue_context *chap; int ret; - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - if (chap->qid != qid) - continue; - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - ret = chap->error; - /* clear sensitive info */ - nvme_auth_reset_dhchap(chap); - return ret; - } - mutex_unlock(&ctrl->dhchap_auth_mutex); - return -ENXIO; + chap = &ctrl->dhchap_ctxs[qid]; + flush_work(&chap->auth_work); + ret = chap->error; + /* clear sensitive info */ + nvme_auth_reset_dhchap(chap); + return ret; } EXPORT_SYMBOL_GPL(nvme_auth_wait); @@ -946,11 +930,11 @@ static void nvme_ctrl_auth_work(struct work_struct *work) int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { - int ret; + struct nvme_dhchap_queue_context *chap; + int i, ret; - INIT_LIST_HEAD(&ctrl->dhchap_auth_list); - INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); mutex_init(&ctrl->dhchap_auth_mutex); + INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); if (!ctrl->opts) return 0; ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret, @@ -959,37 +943,63 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) return ret; ret = 
nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key); - if (ret) { - nvme_auth_free_key(ctrl->host_key); - ctrl->host_key = NULL; + if (ret) + goto err_free_dhchap_secret; + + if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret) + return ret; + + ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl), + sizeof(*chap), GFP_KERNEL); + if (!ctrl->dhchap_ctxs) { + ret = -ENOMEM; + goto err_free_dhchap_ctrl_secret; } + + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { + chap = &ctrl->dhchap_ctxs[i]; + chap->qid = i; + chap->ctrl = ctrl; + INIT_WORK(&chap->auth_work, nvme_queue_auth_work); + } + + return 0; +err_free_dhchap_ctrl_secret: + nvme_auth_free_key(ctrl->ctrl_key); + ctrl->ctrl_key = NULL; +err_free_dhchap_secret: + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = NULL; return ret; } EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap = NULL, *tmp; + struct nvme_dhchap_queue_context *chap; + int i; cancel_work_sync(&ctrl->dhchap_auth_work); - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { + chap = &ctrl->dhchap_ctxs[i]; cancel_work_sync(&chap->auth_work); - mutex_unlock(&ctrl->dhchap_auth_mutex); + } } EXPORT_SYMBOL_GPL(nvme_auth_stop); void nvme_auth_free(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap = NULL, *tmp; + struct nvme_dhchap_queue_context *chap; + int i; - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) { - list_del_init(&chap->entry); - flush_work(&chap->auth_work); - nvme_auth_free_dhchap(chap); + if (ctrl->dhchap_ctxs) { + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { + chap = &ctrl->dhchap_ctxs[i]; + flush_work(&chap->auth_work); + nvme_auth_free_dhchap(chap); + } + kfree(ctrl->dhchap_ctxs); } - mutex_unlock(&ctrl->dhchap_auth_mutex); if 
(ctrl->host_key) { nvme_auth_free_key(ctrl->host_key); ctrl->host_key = NULL; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2944d9b565a2..bc0eede419cb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3785,7 +3785,9 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, kfree(opts->dhchap_secret); opts->dhchap_secret = dhchap_secret; host_key = ctrl->host_key; + mutex_lock(&ctrl->dhchap_auth_mutex); ctrl->host_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); nvme_auth_free_key(host_key); } /* Start re-authentication */ @@ -3837,7 +3839,9 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, kfree(opts->dhchap_ctrl_secret); opts->dhchap_ctrl_secret = dhchap_secret; ctrl_key = ctrl->ctrl_key; + mutex_lock(&ctrl->dhchap_auth_mutex); ctrl->ctrl_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); nvme_auth_free_key(ctrl_key); } /* Start re-authentication */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e431ec348d52..ef23c6c6e2a3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -337,8 +337,8 @@ struct nvme_ctrl { #ifdef CONFIG_NVME_AUTH struct work_struct dhchap_auth_work; - struct list_head dhchap_auth_list; struct mutex dhchap_auth_mutex; + struct nvme_dhchap_queue_context *dhchap_ctxs; struct nvme_dhchap_key *host_key; struct nvme_dhchap_key *ctrl_key; u16 transaction; From a2a00d2a66e480c8b225012db538dca6e389a92d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:21 +0200 Subject: [PATCH 36/45] nvme-auth: remove redundant auth_work flush only ctrl deletion calls nvme_auth_free, which was stopped prior in the teardown stage, so there is no possibility that it should ever run when nvme_auth_free is called. As a result, we can remove a local chap pointer variable. 
Reviewed-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 24726683d4bc..c9b3f0056afc 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -989,15 +989,11 @@ EXPORT_SYMBOL_GPL(nvme_auth_stop); void nvme_auth_free(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap; int i; if (ctrl->dhchap_ctxs) { - for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { - chap = &ctrl->dhchap_ctxs[i]; - flush_work(&chap->auth_work); - nvme_auth_free_dhchap(chap); - } + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) + nvme_auth_free_dhchap(&ctrl->dhchap_ctxs[i]); kfree(ctrl->dhchap_ctxs); } if (ctrl->host_key) { From d061a1bd1fff5332ee48601947abb414007a9610 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:22 +0200 Subject: [PATCH 37/45] nvme-auth: have dhchap_auth_work wait for queues auth to complete It triggered the queue authentication work elements in parallel, but the ctrl authentication work itself completes when all of them completes. Hence wait for queues auth completions. This also makes nvme_auth_stop simply a sync cancel of ctrl dhchap_auth_work. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index c9b3f0056afc..bb0abbe4491c 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -926,6 +926,12 @@ static void nvme_ctrl_auth_work(struct work_struct *work) * Failure is a soft-state; credentials remain valid until * the controller terminates the connection. 
*/ + for (q = 1; q < ctrl->queue_count; q++) { + ret = nvme_auth_wait(ctrl, q); + if (ret) + dev_warn(ctrl->device, + "qid %d: authentication failed\n", q); + } } int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) @@ -976,14 +982,7 @@ EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap; - int i; - cancel_work_sync(&ctrl->dhchap_auth_work); - for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { - chap = &ctrl->dhchap_ctxs[i]; - cancel_work_sync(&chap->auth_work); - } } EXPORT_SYMBOL_GPL(nvme_auth_stop); From 1f1a4f89562d3b33b6ca4fc8a4f3bd4cd35ab4ea Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:23 +0200 Subject: [PATCH 38/45] nvme-tcp: stop auth work after tearing down queues in error recovery when starting error recovery there might be an authentication work running, and it involves I/O commands. Given the controller is tearing down there is no chance for the I/O to complete other than timing out which may unnecessarily take a full io timeout. So first tear down the queues, fail/cancel all inflight I/O (including potentially authentication) and only then stop authentication. This ensures that failover is not stalled due to blocked authentication I/O. 
Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4f8584657bb7..3fedddf0aedc 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2119,7 +2119,6 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; - nvme_auth_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); @@ -2127,6 +2126,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) nvme_start_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); nvme_start_admin_queue(ctrl); + nvme_auth_stop(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ From 91c11d5f32547a08d462934246488fe72f3d44c3 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 13 Nov 2022 13:24:24 +0200 Subject: [PATCH 39/45] nvme-rdma: stop auth work after tearing down queues in error recovery when starting error recovery there might be an authentication work running, and it involves I/O commands. Given the controller is tearing down there is no chance for the I/O to complete other than timing out which may unnecessarily take a full io timeout. So first tear down the queues, fail/cancel all inflight I/O (including potentially authentication) and only then stop authentication. This ensures that failover is not stalled due to blocked authentication I/O. 
Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ccd45e5b3298..3d78278e47c5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1153,13 +1153,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); - nvme_auth_stop(&ctrl->ctrl); nvme_stop_keep_alive(&ctrl->ctrl); flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); nvme_start_admin_queue(&ctrl->ctrl); + nvme_auth_stop(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ From 811f4de0344d48a33a94d5c7d31a0ef0490f5701 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Mon, 14 Nov 2022 17:23:59 -0700 Subject: [PATCH 40/45] nvme: avoid fallback to sequential scan due to transient issues Currently, if nvme_scan_ns_list fails, nvme_scan_work will fall back to a sequential scan. nvme_scan_ns_list can fail for a variety of reasons, e.g. a transient transport issue, and the resulting sequential scan can be extremely expensive on controllers reporting an NN value close to the maximum allowed (> 4 billion). Avoid sequential scans wherever possible by only falling back to them in two cases: - When the NVMe version supported (VS) value reported by the device is older than NVME_VS(1, 1, 0), before which support of Identify NS List is not required. - When the Identify NS List command fails with the DNR bit set in the status. This is to accommodate (non-compliant) devices which report a VS value which implies support for Identify NS List, but nevertheless do not support the command. 
Such devices will most likely fail the command with the DNR bit set. The third case is when the device claims support for Identify NS List but the command fails with DNR not set. In such cases, fallback to sequential scan is potentially expensive and likely unnecessary, as a retry of the list scan should succeed. So this change skips the fallback in this third case. Signed-off-by: Uday Shankar Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bc0eede419cb..cba45bad689b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4460,9 +4460,6 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) u32 prev = 0; int ret = 0, i; - if (nvme_ctrl_limited_cns(ctrl)) - return -EOPNOTSUPP; - ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); if (!ns_list) return -ENOMEM; @@ -4570,8 +4567,18 @@ static void nvme_scan_work(struct work_struct *work) } mutex_lock(&ctrl->scan_lock); - if (nvme_scan_ns_list(ctrl) != 0) + if (nvme_ctrl_limited_cns(ctrl)) { nvme_scan_ns_sequential(ctrl); + } else { + /* + * Fall back to sequential scan if DNR is set to handle broken + * devices which should support Identify NS List (as per the VS + * they report) but don't actually support it. + */ + ret = nvme_scan_ns_list(ctrl); + if (ret > 0 && ret & NVME_SC_DNR) + nvme_scan_ns_sequential(ctrl); + } mutex_unlock(&ctrl->scan_lock); } From bcaf434b8f04e1ee82a8b1e1bce0de99fbff67fa Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 2 Nov 2022 18:17:08 +0100 Subject: [PATCH 41/45] nvme: return err on nvme_init_non_mdts_limits fail In nvme_init_non_mdts_limits function we were returning 0 when kzalloc failed; it now returns -ENOMEM. 
Fixes: 5befc7c26e5a ("nvme: implement non-mdts command limits") Signed-off-by: Joel Granados Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cba45bad689b..ca4d40996ac1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3078,7 +3078,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) id = kzalloc(sizeof(*id), GFP_KERNEL); if (!id) - return 0; + return -ENOMEM; c.identify.opcode = nvme_admin_identify; c.identify.cns = NVME_ID_CNS_CS_CTRL; From c58e28afb11f5cd3c7f8a27b3abb045d848467ac Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Nov 2022 08:45:02 +0200 Subject: [PATCH 42/45] nvmet: fix a memory leak in nvmet_auth_set_key When changing dhchap secrets we need to release the old secrets as well. kmemleak complaint: -- unreferenced object 0xffff8c7f44ed8180 (size 64): comm "check", pid 7304, jiffies 4295686133 (age 72034.246s) hex dump (first 32 bytes): 44 48 48 43 2d 31 3a 30 30 3a 4c 64 4c 4f 64 71 DHHC-1:00:LdLOdq 79 56 69 67 77 48 55 32 6d 5a 59 4c 7a 35 59 38 yVigwHU2mZYLz5Y8 backtrace: [<00000000b6fc5071>] kstrdup+0x2e/0x60 [<00000000f0f4633f>] 0xffffffffc0e07ee6 [<0000000053006c05>] 0xffffffffc0dff783 [<00000000419ae922>] configfs_write_iter+0xb1/0x120 [<000000008183c424>] vfs_write+0x2be/0x3c0 [<000000009005a2a5>] ksys_write+0x5f/0xe0 [<00000000cd495c89>] do_syscall_64+0x38/0x90 [<00000000f2a84ac5>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: db1312dd9548 ("nvmet: implement basic In-Band Authentication") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/auth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index c4113b43dbfe..4dcddcf95279 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -45,9 +45,11 @@ int 
nvmet_auth_set_key(struct nvmet_host *host, const char *secret, if (!dhchap_secret) return -ENOMEM; if (set_ctrl) { + kfree(host->dhchap_ctrl_secret); host->dhchap_ctrl_secret = strim(dhchap_secret); host->dhchap_ctrl_key_hash = key_hash; } else { + kfree(host->dhchap_secret); host->dhchap_secret = strim(dhchap_secret); host->dhchap_key_hash = key_hash; } From 9f27bd701d18f012646a06bc6c1b35d81f30359b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 11:22:14 +0100 Subject: [PATCH 43/45] nvme: rename the queue quiescing helpers Naming the nvme helpers that wrap the block quiesce functionality _start/_stop is rather confusing. Switch to using the quiesce naming used by the block layer instead. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/apple.c | 12 ++++++------ drivers/nvme/host/core.c | 24 ++++++++++++------------ drivers/nvme/host/fc.c | 16 ++++++++-------- drivers/nvme/host/nvme.h | 8 ++++---- drivers/nvme/host/pci.c | 16 ++++++++-------- drivers/nvme/host/rdma.c | 26 +++++++++++++------------- drivers/nvme/host/tcp.c | 28 ++++++++++++++-------------- drivers/nvme/target/loop.c | 6 +++--- 8 files changed, 68 insertions(+), 68 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index a85349a7e938..cab69516af5b 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -821,7 +821,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) if (!dead && shutdown && freeze) nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT); - nvme_stop_queues(&anv->ctrl); + nvme_quiesce_io_queues(&anv->ctrl); if (!dead) { if (READ_ONCE(anv->ioq.enabled)) { @@ -837,7 +837,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) WRITE_ONCE(anv->ioq.enabled, false); WRITE_ONCE(anv->adminq.enabled, false); mb(); /* ensure that nvme_queue_rq() sees that enabled is cleared */ - nvme_stop_admin_queue(&anv->ctrl); + 
nvme_quiesce_admin_queue(&anv->ctrl); /* last chance to complete any requests before nvme_cancel_request */ spin_lock_irqsave(&anv->lock, flags); @@ -854,8 +854,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) * deadlocking blk-mq hot-cpu notifier. */ if (shutdown) { - nvme_start_queues(&anv->ctrl); - nvme_start_admin_queue(&anv->ctrl); + nvme_unquiesce_io_queues(&anv->ctrl); + nvme_unquiesce_admin_queue(&anv->ctrl); } } @@ -1093,7 +1093,7 @@ static void apple_nvme_reset_work(struct work_struct *work) dev_dbg(anv->dev, "Starting admin queue"); apple_nvme_init_queue(&anv->adminq); - nvme_start_admin_queue(&anv->ctrl); + nvme_unquiesce_admin_queue(&anv->ctrl); if (!nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_CONNECTING)) { dev_warn(anv->ctrl.device, @@ -1127,7 +1127,7 @@ static void apple_nvme_reset_work(struct work_struct *work) anv->ctrl.queue_count = nr_io_queues + 1; - nvme_start_queues(&anv->ctrl); + nvme_unquiesce_io_queues(&anv->ctrl); nvme_wait_freeze(&anv->ctrl); blk_mq_update_nr_hw_queues(&anv->tagset, 1); nvme_unfreeze(&anv->ctrl); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ca4d40996ac1..3195ae17df30 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4610,7 +4610,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) */ if (ctrl->state == NVME_CTRL_DEAD) { nvme_mark_namespaces_dead(ctrl); - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); } /* this is a no-op when called from the controller reset handler */ @@ -4737,7 +4737,7 @@ static void nvme_fw_act_work(struct work_struct *work) fw_act_timeout = jiffies + msecs_to_jiffies(admin_timeout * 1000); - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); while (nvme_ctrl_pp_status(ctrl)) { if (time_after(jiffies, fw_act_timeout)) { dev_warn(ctrl->device, @@ -4751,7 +4751,7 @@ static void nvme_fw_act_work(struct work_struct *work) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) return; - nvme_start_queues(ctrl); + 
nvme_unquiesce_io_queues(ctrl); /* read FW slot information to clear the AER */ nvme_get_fw_slot_info(ctrl); @@ -4996,7 +4996,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_mpath_update(ctrl); } @@ -5213,37 +5213,37 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_freeze); -void nvme_stop_queues(struct nvme_ctrl *ctrl) +void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl) { if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags)) blk_mq_quiesce_tagset(ctrl->tagset); else blk_mq_wait_quiesce_done(ctrl->tagset); } -EXPORT_SYMBOL_GPL(nvme_stop_queues); +EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues); -void nvme_start_queues(struct nvme_ctrl *ctrl) +void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl) { if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags)) blk_mq_unquiesce_tagset(ctrl->tagset); } -EXPORT_SYMBOL_GPL(nvme_start_queues); +EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues); -void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) +void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl) { if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_quiesce_queue(ctrl->admin_q); else blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set); } -EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); +EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue); -void nvme_start_admin_queue(struct nvme_ctrl *ctrl) +void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl) { if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_unquiesce_queue(ctrl->admin_q); } -EXPORT_SYMBOL_GPL(nvme_start_admin_queue); +EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 1f9f4075794b..aa5fb56c07d9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2392,7 +2392,7 @@ nvme_fc_ctrl_free(struct kref *ref) 
list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_remove_admin_tag_set(&ctrl->ctrl); kfree(ctrl->queues); @@ -2493,13 +2493,13 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * (but with error status). */ if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->tag_set); if (start_queues) - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); } /* @@ -2517,13 +2517,13 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) /* * clean up the admin queue. Same thing as above. */ - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); if (start_queues) - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); } static void @@ -3105,7 +3105,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << (ilog2(SZ_4K) - 9); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) @@ -3251,10 +3251,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_fc_free_queue(&ctrl->queues[0]); /* re-enable the admin_q so anything new can fast fail */ - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); /* resume the io queues so that things will fast fail */ - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); 
nvme_fc_ctlr_inactive_on_rport(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ef23c6c6e2a3..b3a1c595d144 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -752,10 +752,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, volatile union nvme_result *res); -void nvme_stop_queues(struct nvme_ctrl *ctrl); -void nvme_start_queues(struct nvme_ctrl *ctrl); -void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); -void nvme_start_admin_queue(struct nvme_ctrl *ctrl); +void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl); +void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl); +void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl); +void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl); void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d394498d96de..bd5fcdc9211c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1481,7 +1481,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - nvme_stop_admin_queue(&nvmeq->dev->ctrl); + nvme_quiesce_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); return 0; @@ -1741,7 +1741,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. 
*/ - nvme_start_admin_queue(&dev->ctrl); + nvme_unquiesce_admin_queue(&dev->ctrl); blk_mq_destroy_queue(dev->ctrl.admin_q); blk_put_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); @@ -2703,7 +2703,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) if (!dead && shutdown && freeze) nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - nvme_stop_queues(&dev->ctrl); + nvme_quiesce_io_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { nvme_disable_io_queues(dev); @@ -2723,9 +2723,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * deadlocking blk-mq hot-cpu notifier. */ if (shutdown) { - nvme_start_queues(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) - nvme_start_admin_queue(&dev->ctrl); + nvme_unquiesce_admin_queue(&dev->ctrl); } mutex_unlock(&dev->shutdown_lock); } @@ -2822,7 +2822,7 @@ static void nvme_reset_work(struct work_struct *work) result = nvme_pci_enable(dev); if (result) goto out_unlock; - nvme_start_admin_queue(&dev->ctrl); + nvme_unquiesce_admin_queue(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); /* @@ -2856,7 +2856,7 @@ static void nvme_reset_work(struct work_struct *work) * controller around but remove all namespaces. 
*/ if (dev->online_queues > 1) { - nvme_start_queues(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); nvme_pci_update_nr_queues(dev); nvme_dbbuf_set(dev); @@ -2864,7 +2864,7 @@ static void nvme_reset_work(struct work_struct *work) } else { dev_warn(dev->ctrl.device, "IO queues lost\n"); nvme_mark_namespaces_dead(&dev->ctrl); - nvme_start_queues(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_free_tagset(dev); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3d78278e47c5..de591cdf78f3 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -869,7 +869,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, else ctrl->ctrl.max_integrity_segments = 0; - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) @@ -878,7 +878,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, return 0; out_quiesce_queue: - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); @@ -922,7 +922,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_cleanup_tagset; if (!new) { - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to @@ -949,7 +949,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) return 0; out_wait_freeze_timed_out: - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); out_cleanup_tagset: @@ -964,12 +964,12 @@ out_free_io_queues: static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - 
nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); if (remove) { - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_remove_admin_tag_set(&ctrl->ctrl); } nvme_rdma_destroy_admin_queue(ctrl); @@ -980,12 +980,12 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, { if (ctrl->ctrl.queue_count > 1) { nvme_start_freeze(&ctrl->ctrl); - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); if (remove) { - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_remove_io_tag_set(&ctrl->ctrl); } nvme_rdma_free_io_queues(ctrl); @@ -1106,7 +1106,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) destroy_io: if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); @@ -1115,7 +1115,7 @@ destroy_io: nvme_rdma_free_io_queues(ctrl); } destroy_admin: - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); @@ -1156,9 +1156,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_stop_keep_alive(&ctrl->ctrl); flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_auth_stop(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { @@ -2207,7 +2207,7 @@ static const struct 
blk_mq_ops nvme_rdma_admin_mq_ops = { static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { nvme_rdma_teardown_io_queues(ctrl, shutdown); - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 3fedddf0aedc..776b8d9dfca7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1875,7 +1875,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) goto out_cleanup_connect_q; if (!new) { - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to @@ -1902,7 +1902,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) return 0; out_wait_freeze_timed_out: - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); out_cleanup_connect_q: @@ -1947,7 +1947,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_stop_queue; - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); error = nvme_init_ctrl_finish(ctrl, false); if (error) @@ -1956,7 +1956,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) return 0; out_quiesce_queue: - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); @@ -1972,12 +1972,12 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); if (remove) - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1986,14 +1986,14 @@ static 
void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); nvme_start_freeze(ctrl); - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); if (remove) - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); } @@ -2074,14 +2074,14 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) destroy_io: if (ctrl->queue_count > 1) { - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); @@ -2123,9 +2123,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); /* unquiesce to fail fast pending requests */ - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); nvme_auth_stop(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { @@ -2141,7 +2141,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { nvme_tcp_teardown_io_queues(ctrl, shutdown); - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); if (shutdown) nvme_shutdown_ctrl(ctrl); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 893c50f365c4..4173099ef9a4 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -375,7 +375,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) 
ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) @@ -394,12 +394,12 @@ out_free_sq: static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) { if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_cancel_tagset(&ctrl->ctrl); nvme_loop_destroy_io_queues(ctrl); } - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); From 23855abdc4be03e17564f665b5d0029ef27abf7b Mon Sep 17 00:00:00 2001 From: Aleksandr Miloserdov Date: Tue, 15 Nov 2022 14:58:09 +0300 Subject: [PATCH 44/45] nvmet: expose IEEE OUI to configfs Allow user to set OUI for the controller vendor. Reviewed-by: Konstantin Shelekhin Reviewed-by: Dmitriy Bogdanov Signed-off-by: Aleksandr Miloserdov Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 7 ++--- drivers/nvme/target/configfs.c | 49 +++++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 2 ++ drivers/nvme/target/nvmet.h | 1 + 4 files changed, 54 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c8a061ce3ee5..48a2f587f38a 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -372,6 +372,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) memcpy_and_pad(id->fr, sizeof(id->fr), UTS_RELEASE, strlen(UTS_RELEASE), ' '); + put_unaligned_le24(subsys->ieee_oui, id->ieee); + id->rab = 6; if (nvmet_is_disc_subsys(ctrl->subsys)) @@ -379,11 +381,6 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) else id->cntrltype = NVME_CTRL_IO; - /* - * XXX: figure out how we can assign a IEEE OUI, but until then - * the safest is to leave it as zeroes. 
- */ - /* we support multiple ports, multiples hosts and ANA: */ id->cmic = NVME_CTRL_CMIC_MULTI_PORT | NVME_CTRL_CMIC_MULTI_CTRL | NVME_CTRL_CMIC_ANA; diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 051a420d818e..02797170dd91 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1259,6 +1259,54 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_subsys_, attr_model); +static ssize_t nvmet_subsys_attr_ieee_oui_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + return sysfs_emit(page, "0x%06x\n", subsys->ieee_oui); +} + +static ssize_t nvmet_subsys_attr_ieee_oui_store_locked(struct nvmet_subsys *subsys, + const char *page, size_t count) +{ + uint32_t val = 0; + int ret; + + if (subsys->subsys_discovered) { + pr_err("Can't set IEEE OUI. 0x%06x is already assigned\n", + subsys->ieee_oui); + return -EINVAL; + } + + ret = kstrtou32(page, 0, &val); + if (ret < 0) + return ret; + + if (val >= 0x1000000) + return -EINVAL; + + subsys->ieee_oui = val; + + return count; +} + +static ssize_t nvmet_subsys_attr_ieee_oui_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + ssize_t ret; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + ret = nvmet_subsys_attr_ieee_oui_store_locked(subsys, page, count); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + return ret; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_ieee_oui); + #ifdef CONFIG_BLK_DEV_INTEGRITY static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item, char *page) @@ -1320,6 +1368,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_cntlid_max, &nvmet_subsys_attr_attr_model, &nvmet_subsys_attr_attr_qid_max, + &nvmet_subsys_attr_attr_ieee_oui, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_subsys_attr_attr_pi_enable, #endif diff --git 
a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 171493bcf0fb..006d8c94f5e1 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1561,6 +1561,8 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, goto free_subsys; } + subsys->ieee_oui = 0; + switch (type) { case NVME_NQN_NVME: subsys->max_qid = NVMET_NR_QUEUES; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index bda1c1f71f39..976e11cd8c01 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -263,6 +263,7 @@ struct nvmet_subsys { struct config_group allowed_hosts_group; char *model_number; + u32 ieee_oui; #ifdef CONFIG_NVME_TARGET_PASSTHRU struct nvme_ctrl *passthru_ctrl; From 68c5444c317208f5a114f671140373f47f0a2cf6 Mon Sep 17 00:00:00 2001 From: Aleksandr Miloserdov Date: Tue, 15 Nov 2022 14:58:10 +0300 Subject: [PATCH 45/45] nvmet: expose firmware revision to configfs Allow user to set currently active firmware revision Reviewed-by: Konstantin Shelekhin Reviewed-by: Dmitriy Bogdanov Signed-off-by: Aleksandr Miloserdov Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/configfs.c | 63 +++++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 15 ++++++-- drivers/nvme/target/nvmet.h | 2 ++ 4 files changed, 79 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 48a2f587f38a..6b46f90a63cf 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -370,7 +370,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) memcpy_and_pad(id->mn, sizeof(id->mn), subsys->model_number, strlen(subsys->model_number), ' '); memcpy_and_pad(id->fr, sizeof(id->fr), - UTS_RELEASE, strlen(UTS_RELEASE), ' '); + subsys->firmware_rev, strlen(subsys->firmware_rev), ' '); put_unaligned_le24(subsys->ieee_oui, id->ieee); diff --git a/drivers/nvme/target/configfs.c 
b/drivers/nvme/target/configfs.c index 02797170dd91..d48deb9bdb27 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1307,6 +1307,68 @@ static ssize_t nvmet_subsys_attr_ieee_oui_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_subsys_, attr_ieee_oui); +static ssize_t nvmet_subsys_attr_firmware_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + return sysfs_emit(page, "%s\n", subsys->firmware_rev); +} + +static ssize_t nvmet_subsys_attr_firmware_store_locked(struct nvmet_subsys *subsys, + const char *page, size_t count) +{ + int pos = 0, len; + char *val; + + if (subsys->subsys_discovered) { + pr_err("Can't set firmware revision. %s is already assigned\n", + subsys->firmware_rev); + return -EINVAL; + } + + len = strcspn(page, "\n"); + if (!len) + return -EINVAL; + + if (len > NVMET_FR_MAX_SIZE) { + pr_err("Firmware revision size can not exceed %d Bytes\n", + NVMET_FR_MAX_SIZE); + return -EINVAL; + } + + for (pos = 0; pos < len; pos++) { + if (!nvmet_is_ascii(page[pos])) + return -EINVAL; + } + + val = kmemdup_nul(page, len, GFP_KERNEL); + if (!val) + return -ENOMEM; + + kfree(subsys->firmware_rev); + + subsys->firmware_rev = val; + + return count; +} + +static ssize_t nvmet_subsys_attr_firmware_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + ssize_t ret; + + down_write(&nvmet_config_sem); + mutex_lock(&subsys->lock); + ret = nvmet_subsys_attr_firmware_store_locked(subsys, page, count); + mutex_unlock(&subsys->lock); + up_write(&nvmet_config_sem); + + return ret; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_firmware); + #ifdef CONFIG_BLK_DEV_INTEGRITY static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item, char *page) @@ -1369,6 +1431,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_model, &nvmet_subsys_attr_attr_qid_max, &nvmet_subsys_attr_attr_ieee_oui, + 
&nvmet_subsys_attr_attr_firmware, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_subsys_attr_attr_pi_enable, #endif diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 006d8c94f5e1..f66ed13d7c11 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -10,6 +10,8 @@ #include #include +#include + #define CREATE_TRACE_POINTS #include "trace.h" @@ -1563,6 +1565,12 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, subsys->ieee_oui = 0; + subsys->firmware_rev = kstrndup(UTS_RELEASE, NVMET_FR_MAX_SIZE, GFP_KERNEL); + if (!subsys->firmware_rev) { + ret = -ENOMEM; + goto free_mn; + } + switch (type) { case NVME_NQN_NVME: subsys->max_qid = NVMET_NR_QUEUES; @@ -1574,14 +1582,14 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, default: pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); ret = -EINVAL; - goto free_mn; + goto free_fr; } subsys->type = type; subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, GFP_KERNEL); if (!subsys->subsysnqn) { ret = -ENOMEM; - goto free_mn; + goto free_fr; } subsys->cntlid_min = NVME_CNTLID_MIN; subsys->cntlid_max = NVME_CNTLID_MAX; @@ -1594,6 +1602,8 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, return subsys; +free_fr: + kfree(subsys->firmware_rev); free_mn: kfree(subsys->model_number); free_subsys: @@ -1613,6 +1623,7 @@ static void nvmet_subsys_free(struct kref *ref) kfree(subsys->subsysnqn); kfree(subsys->model_number); + kfree(subsys->firmware_rev); kfree(subsys); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 976e11cd8c01..89bedfcd974c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -29,6 +29,7 @@ #define NVMET_DEFAULT_CTRL_MODEL "Linux" #define NVMET_MN_MAX_SIZE 40 #define NVMET_SN_MAX_SIZE 20 +#define NVMET_FR_MAX_SIZE 8 /* * Supported optional AENs: @@ -264,6 +265,7 @@ struct nvmet_subsys { char *model_number; u32 ieee_oui; + char *firmware_rev; #ifdef 
CONFIG_NVME_TARGET_PASSTHRU struct nvme_ctrl *passthru_ctrl;