From 4a09a845c1773fe3cd96c2afe737b58a88e3d30b Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Fri, 14 Jan 2022 16:58:54 +0100 Subject: [PATCH 01/76] block/rnbd-clt: fix CHECK:BRACES warning This patch fix the "CHECK:BRACES: braces {} should be used on all arms of this statement" warning from checkpatch Signed-off-by: Gioh Kim Signed-off-by: Md Haris Iqbal Link: https://lore.kernel.org/r/20220114155855.984144-2-haris.iqbal@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index c08971de369f..f0370a345c3e 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1262,9 +1262,9 @@ find_and_get_or_create_sess(const char *sessname, struct rtrs_clt_ops rtrs_ops; sess = find_or_create_sess(sessname, &first); - if (sess == ERR_PTR(-ENOMEM)) + if (sess == ERR_PTR(-ENOMEM)) { return ERR_PTR(-ENOMEM); - else if ((nr_poll_queues && !first) || (!nr_poll_queues && sess->nr_poll_queues)) { + } else if ((nr_poll_queues && !first) || (!nr_poll_queues && sess->nr_poll_queues)) { /* * A device MUST have its own session to use the polling-mode. * It must fail to map new device with the same session. From 030ce8ba97d2e85e5310b1ae8236cd640bd384d0 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Fri, 14 Jan 2022 16:58:55 +0100 Subject: [PATCH 02/76] block/rnbd: client device does not care queue/rotational On client side, the device is a network device. There is no reason to set rotational even-if the target device on server is rotational. Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Signed-off-by: Md Haris Iqbal Link: https://lore.kernel.org/r/20220114155855.984144-3-haris.iqbal@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 11 ++++++----- drivers/block/rnbd/rnbd-clt.h | 1 - drivers/block/rnbd/rnbd-proto.h | 4 ++-- drivers/block/rnbd/rnbd-srv.c | 1 - 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index f0370a345c3e..9a880d559ab8 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -87,7 +87,6 @@ static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, dev->discard_granularity = le32_to_cpu(rsp->discard_granularity); dev->discard_alignment = le32_to_cpu(rsp->discard_alignment); dev->secure_discard = le16_to_cpu(rsp->secure_discard); - dev->rotational = rsp->rotational; dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK); dev->fua = !!(rsp->cache_policy & RNBD_FUA); @@ -1410,8 +1409,10 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) dev->read_only = false; } - if (!dev->rotational) - blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); + /* + * Network device does not need rotational + */ + blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); err = add_disk(dev->gd); if (err) blk_cleanup_disk(dev->gd); @@ -1610,13 +1611,13 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, } rnbd_clt_info(dev, - "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d, wc: %d, fua: %d)\n", + "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n", dev->gd->disk_name, dev->nsectors, dev->logical_block_size, dev->physical_block_size, dev->max_write_same_sectors, dev->max_discard_sectors, dev->discard_granularity, dev->discard_alignment, dev->secure_discard, dev->max_segments, - dev->max_hw_sectors, dev->rotational, dev->wc, dev->fua); + dev->max_hw_sectors, dev->wc, dev->fua); mutex_unlock(&dev->lock); rnbd_clt_put_sess(sess); diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index 0c2cae7f39b9..62bf7c3fa63c 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -118,7 +118,6 @@ struct rnbd_clt_dev { enum rnbd_access_mode access_mode; u32 nr_poll_queues; bool read_only; - bool rotational; bool wc; bool fua; u32 max_hw_sectors; diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index de5d5a8df81d..c4a68b3a1cbe 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -128,7 +128,7 @@ enum rnbd_cache_policy { * @logical_block_size: logical block size device supports in bytes * @max_segments: max segments hardware support in one transfer * @secure_discard: supports secure discard - * @rotation: is a rotational disc? + * @obsolete_rotational: obsolete, not in used. * @cache_policy: support write-back caching or FUA? */ struct rnbd_msg_open_rsp { @@ -144,7 +144,7 @@ struct rnbd_msg_open_rsp { __le16 logical_block_size; __le16 max_segments; __le16 secure_discard; - u8 rotational; + u8 obsolete_rotational; u8 cache_policy; u8 reserved[10]; }; diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 132e950685d5..6499efae5c43 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -558,7 +558,6 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev)); rsp->secure_discard = cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); - rsp->rotational = !blk_queue_nonrot(q); rsp->cache_policy = 0; if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) rsp->cache_policy |= RNBD_WRITEBACK; From 24afc15dbe218f860994f627b4ba1fb09225a298 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 7 Feb 2022 21:48:19 +0100 Subject: [PATCH 03/76] block/rnbd: Remove a useless mutex According to lib/idr.c, The IDA handles its own locking. It is safe to call any of the IDA functions without synchronisation in your code. so the 'ida_lock' mutex can just be removed. It is here only to protect some ida_simple_get()/ida_simple_remove() calls. While at it, switch to ida_alloc_XXX()/ida_free() instead to ida_simple_get()/ida_simple_remove(). The latter is deprecated and more verbose. Signed-off-by: Christophe JAILLET Acked-by: Jack Wang Link: https://lore.kernel.org/r/7f9eccd8b1fce1bac45ac9b01a78cf72f54c0a61.1644266862.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 9a880d559ab8..1f63f308eb39 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -23,7 +23,6 @@ MODULE_LICENSE("GPL"); static int rnbd_client_major; static DEFINE_IDA(index_ida); -static DEFINE_MUTEX(ida_lock); static DEFINE_MUTEX(sess_lock); static LIST_HEAD(sess_list); @@ -55,9 +54,7 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) if (!refcount_dec_and_test(&dev->refcount)) return; - mutex_lock(&ida_lock); - ida_simple_remove(&index_ida, dev->clt_device_id); - mutex_unlock(&ida_lock); + ida_free(&index_ida, dev->clt_device_id); kfree(dev->hw_queues); kfree(dev->pathname); rnbd_clt_put_sess(dev->sess); @@ -1460,10 +1457,8 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, goto out_alloc; } - mutex_lock(&ida_lock); - ret = ida_simple_get(&index_ida, 0, 1 << (MINORBITS - RNBD_PART_BITS), - GFP_KERNEL); - mutex_unlock(&ida_lock); + ret = ida_alloc_max(&index_ida, 1 << (MINORBITS - RNBD_PART_BITS), + GFP_KERNEL); if (ret < 0) { pr_err("Failed to initialize device '%s' from session %s, allocating idr failed, err: %d\n", pathname, sess->sessname, ret); From d9a74051a73c4fbd065ae806b8da151cbded84f1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 8 Feb 2022 11:46:56 +0000 Subject: [PATCH 04/76] loop: clean up grammar in warning message The phrase "has still" should be "still has" to clean up the grammar. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20220208114656.61629-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 01cbbfc4e9e2..bdea448d2419 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1262,7 +1262,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) { /* If any pages were dirtied after invalidate_bdev(), try again */ err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", + pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", __func__, lo->lo_number, lo->lo_file_name, lo->lo_device->bd_inode->i_mapping->nrpages); goto out_unfreeze; @@ -1482,7 +1482,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) /* invalidate_bdev should have truncated all the pages */ if (lo->lo_device->bd_inode->i_mapping->nrpages) { err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", + pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", __func__, lo->lo_number, lo->lo_file_name, lo->lo_device->bd_inode->i_mapping->nrpages); goto out_unfreeze; From a75110c3b36959d04d3a586ba43510ddf9b410b5 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 03:59:51 -0800 Subject: [PATCH 05/76] null_blk: fix return value from null_add_dev() The function nullb_device_power_store() returns -ENOMEM when null_add_dev() fails. null_add_dev() can fail with return value other than -ENOMEM such as -EINVAL when Zoned Block Device option is used, see : nullb_device_power_store() null_add_dev() null_init_zoned_dev() return -EINVAL; When trying to load the module having -ENOMEM value returned on the command line creates confusion when pleanty of memory is free on the machine. Instead of hardcoding -ENOMEM return the value of null_add_dev() function. Signed-off-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220215115951.15945-1-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 13004beb48ca..90b6bd2a114b 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -431,9 +431,10 @@ static ssize_t nullb_device_power_store(struct config_item *item, if (!dev->power && newp) { if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) return count; - if (null_add_dev(dev)) { + ret = null_add_dev(dev); + if (ret) { clear_bit(NULLB_DEV_FL_UP, &dev->flags); - return -ENOMEM; + return ret; } set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); From b27824d31f09ea7b4a6ba2c1b18bd328df3e8bed Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 13:33:07 -0800 Subject: [PATCH 06/76] loop: use sysfs_emit() in the sysfs xxx show() sprintf does not know the PAGE_SIZE maximum of the temporary buffer used for outputting sysfs content and it's possible to overrun the PAGE_SIZE buffer length. Use a generic sysfs_emit function that knows the size of the temporary buffer and ensures that no overrun is done for offset attribute in loop_attr_[offset|sizelimit|autoclear|partscan|dio]_show() callbacks. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20220215213310.7264-2-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bdea448d2419..a55e5eda1d17 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -680,33 +680,33 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); } static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); } static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) { int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); - return sprintf(buf, "%s\n", autoclear ? "1" : "0"); + return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); } static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) { int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); - return sprintf(buf, "%s\n", partscan ? "1" : "0"); + return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); } static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) { int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); - return sprintf(buf, "%s\n", dio ? "1" : "0"); + return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } LOOP_ATTR_RO(backing_file); From 0aab29b85478b994422f1551ce36e5640b09db2b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 13:33:08 -0800 Subject: [PATCH 07/76] loop: remove extra variable in lo_fallocate() The local variable q is used to pass it to the blk_queue_discard(). We can get away with using lo->lo_queue instead of storing in a local variable which is not used anywhere else. No functional change in this patch. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20220215213310.7264-3-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a55e5eda1d17..77c61eaaa6e4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -308,12 +308,11 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, * a.k.a. discard/zerorange. */ struct file *file = lo->lo_backing_file; - struct request_queue *q = lo->lo_queue; int ret; mode |= FALLOC_FL_KEEP_SIZE; - if (!blk_queue_discard(q)) { + if (!blk_queue_discard(lo->lo_queue)) { ret = -EOPNOTSUPP; goto out; } From 9c64e38cc639537ad9b542635af1733e8cb5e19b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 13:33:09 -0800 Subject: [PATCH 08/76] loop: remove extra variable in lo_req_flush The local variable file is used to pass it to the vfs_fsync(). We can get away with using lo->lo_backing_file instead of storing in a local variable which is not used anywhere else. No functional change in this patch. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20220215213310.7264-4-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 77c61eaaa6e4..18b30a56bfc4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -326,8 +326,7 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, static int lo_req_flush(struct loop_device *lo, struct request *rq) { - struct file *file = lo->lo_backing_file; - int ret = vfs_fsync(file, 0); + int ret = vfs_fsync(lo->lo_backing_file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; From ef44c50837ab7818920bd9994b36d6e573312abc Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 13:33:10 -0800 Subject: [PATCH 09/76] loop: allow user to set the queue depth Instead of hardcoding queue depth allow user to set the hw queue depth using module parameter. Set default value to 128 to retain the existing behavior. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20220215213310.7264-5-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 18b30a56bfc4..8fb89d0624fc 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -85,6 +85,7 @@ #include #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) +#define LOOP_DEFAULT_HW_Q_DEPTH (128) static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); @@ -1785,6 +1786,24 @@ module_param(max_loop, int, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); + +static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; + +static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) +{ + int ret = kstrtoint(s, 10, &hw_queue_depth); + + return (ret || (hw_queue_depth < 1)) ? -EINVAL : 0; +} + +static const struct kernel_param_ops loop_hw_qdepth_param_ops = { + .set = loop_set_hw_queue_depth, + .get = param_get_int, +}; + +device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 128"); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1979,7 +1998,7 @@ static int loop_add(int i) lo->tag_set.ops = &loop_mq_ops; lo->tag_set.nr_hw_queues = 1; - lo->tag_set.queue_depth = 128; + lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | From 3d3472f3ed419fe1a9d3f881a4905fa8e03d750c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 16 Feb 2022 09:29:45 -0800 Subject: [PATCH 10/76] null_blk: remove hardcoded alloc_cmd() parameter Only caller of alloc_cmd() is null_submit_bio() unconditionally sets second parameter to true and that is statically hard-coded in null_blk. There is no point in having statically hardcoded function parameter. Remove the unnecessary parameter can_wait and adjust the code so it can retain existing behavior of waiting when we don't get valid nullb_cmd from __alloc_cmd() in alloc_cmd(). The restructured code avoids multiple return statements, multiple calls to __alloc_cmd() and resulting a fast path call to prepare_to_wait() due to removal of first alloc_cmd() call. Follow the pattern that we have in bio_alloc() to set the structure members in the structure allocation function in alloc_cmd() and pass bio to initialize newly allocated cmd->bio member. Follow the pattern in copy_to_nullb() to use result of one function call (null_cache_active()) to be used as a parameter to another function call (null_insert_page()), use result of alloc_cmd() as a first parameter to the null_handle_cmd() in null_submit_bio() function. This allow us to remove the local variable cmd on stack in null_submit_bio() that is in fast path. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20220216172945.31124-2-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 90b6bd2a114b..29e183719e77 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -720,26 +720,25 @@ static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) return NULL; } -static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) +static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio) { struct nullb_cmd *cmd; DEFINE_WAIT(wait); - cmd = __alloc_cmd(nq); - if (cmd || !can_wait) - return cmd; - do { - prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); + /* + * This avoids multiple return statements, multiple calls to + * __alloc_cmd() and a fast path call to prepare_to_wait(). + */ cmd = __alloc_cmd(nq); - if (cmd) - break; - + if (cmd) { + cmd->bio = bio; + return cmd; + } + prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); io_schedule(); + finish_wait(&nq->wait, &wait); } while (1); - - finish_wait(&nq->wait, &wait); - return cmd; } static void end_cmd(struct nullb_cmd *cmd) @@ -1477,12 +1476,8 @@ static void null_submit_bio(struct bio *bio) sector_t nr_sectors = bio_sectors(bio); struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; struct nullb_queue *nq = nullb_to_queue(nullb); - struct nullb_cmd *cmd; - cmd = alloc_cmd(nq, 1); - cmd->bio = bio; - - null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); + null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio)); } static bool should_timeout_request(struct request *rq) From c90b6b50b42dfdfeb27b02e670e812b69ff364f5 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 22 Feb 2022 07:28:51 -0800 Subject: [PATCH 11/76] null_blk: remove hardcoded null_alloc_page() param Only caller of null_alloc_page() is null_insert_page() unconditionally sets only parameter to GFP_NOIO and that is statically hard-coded in null_blk. There is no point in having statically hardcoded function parameter. Remove the unnecessary parameter gfp_flags and adjust the code, so it can retain existing behavior null_alloc_page() with GFP_NOIO. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220222152852.26043-2-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 29e183719e77..80f9a6ba376d 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -777,15 +777,15 @@ static void null_complete_rq(struct request *rq) end_cmd(blk_mq_rq_to_pdu(rq)); } -static struct nullb_page *null_alloc_page(gfp_t gfp_flags) +static struct nullb_page *null_alloc_page(void) { struct nullb_page *t_page; - t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); + t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO); if (!t_page) goto out; - t_page->page = alloc_pages(gfp_flags, 0); + t_page->page = alloc_pages(GFP_NOIO, 0); if (!t_page->page) goto out_freepage; @@ -932,7 +932,7 @@ static struct nullb_page *null_insert_page(struct nullb *nullb, spin_unlock_irq(&nullb->lock); - t_page = null_alloc_page(GFP_NOIO); + t_page = null_alloc_page(); if (!t_page) goto out_lock; From df00b1d26c3c3ff9dae4b572a6ad878ab65334e1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 22 Feb 2022 07:28:52 -0800 Subject: [PATCH 12/76] null_blk: null_alloc_page() cleanup Remove goto labels and use direct returns as error unwinding code only needs to free t_page variable if we alloc_pages() call fails as having two labels for one kfree() can be avoided easily. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220222152852.26043-3-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 80f9a6ba376d..05b1120e6623 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -783,18 +783,16 @@ static struct nullb_page *null_alloc_page(void) t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO); if (!t_page) - goto out; + return NULL; t_page->page = alloc_pages(GFP_NOIO, 0); - if (!t_page->page) - goto out_freepage; + if (!t_page->page) { + kfree(t_page); + return NULL; + } memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); return t_page; -out_freepage: - kfree(t_page); -out: - return NULL; } static void null_free_page(struct nullb_page *t_page) From e65b831a1e191caff3fc0d06bc7019cdaf8f868e Mon Sep 17 00:00:00 2001 From: Qinghua Jin Date: Fri, 7 Jan 2022 10:22:58 +0800 Subject: [PATCH 13/76] nvme-fc: fix a typo subsytem -> subsystem Signed-off-by: Qinghua Jin Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index cb909edb76c4..5358a5facdee 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -721,7 +721,7 @@ enum { * * Fields with static values for the port. Initialized by the * port_info struct supplied to the registration call. - * @port_num: NVME-FC transport subsytem port number + * @port_num: NVME-FC transport subsystem port number * @node_name: FC WWNN for the port * @port_name: FC WWPN for the port * @private: pointer to memory allocated alongside the local port From 50ab19d89feaf4ebeca6872b46da4b503eee20c1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 18 Jan 2022 23:49:54 -0800 Subject: [PATCH 14/76] nvme-core: remove unnecessary semicolon It is not a good practice to have a semicolon at the end of the function definition. Remove it from nvme_pr_type(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5e0bfda04bd7..a1d793e79982 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1979,7 +1979,7 @@ static char nvme_pr_type(enum pr_type type) default: return 0; } -}; +} static int nvme_send_ns_head_pr_command(struct block_device *bdev, struct nvme_command *c, u8 data[16]) From ba3266434d6615ff8015b01846d0e7756c9ad936 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 21 Jan 2022 21:05:39 -0800 Subject: [PATCH 15/76] nvme-core: remove unnecessary function parameter In function nvme_execute_rq() we don't use gendisk parameter at all. Remove the unsed parameter and adjust the calls. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a1d793e79982..dd18861f77c0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1049,8 +1049,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); * >0: nvme controller's cqe status response * <0: kernel error in lieu of controller response */ -static int nvme_execute_rq(struct gendisk *disk, struct request *rq, - bool at_head) +static int nvme_execute_rq(struct request *rq, bool at_head) { blk_status_t status; @@ -1090,7 +1089,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, goto out; } - ret = nvme_execute_rq(NULL, req, at_head); + ret = nvme_execute_rq(req, at_head); if (result && ret >= 0) *result = nvme_req(req)->result; out: @@ -1206,12 +1205,11 @@ int nvme_execute_passthru_rq(struct request *rq) struct nvme_command *cmd = nvme_req(rq)->cmd; struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; struct nvme_ns *ns = rq->q->queuedata; - struct gendisk *disk = ns ? ns->disk : NULL; u32 effects; int ret; effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - ret = nvme_execute_rq(disk, rq, false); + ret = nvme_execute_rq(rq, false); if (effects) /* nothing to be done for zero cmd effects */ nvme_passthru_end(ctrl, effects, cmd, ret); From 572c97355bdc0e7cdd5979304b8554712e26ceb8 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 11 Jan 2022 22:20:59 -0800 Subject: [PATCH 16/76] nvme-fabrics: use unsigned int type Loop variable i will never have a negative value, so use unsigned int type instaed of int. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index f79a66d4e22c..9f81a0562e3e 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -873,7 +873,7 @@ static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, unsigned int required_opts) { if ((opts->mask & required_opts) != required_opts) { - int i; + unsigned int i; for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { if ((opt_tokens[i].token & required_opts) && From 0801a4b630ab0949ddf0fc3f26cb17976e2d3afb Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 11 Jan 2022 22:21:00 -0800 Subject: [PATCH 17/76] nvme-fabrics: use unsigned int type Loop variable i will never have a negative value, so use unsigned int type instaed of int. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 9f81a0562e3e..131e78e62d00 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -923,7 +923,7 @@ static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts, unsigned int allowed_opts) { if (opts->mask & ~allowed_opts) { - int i; + unsigned int i; for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { if ((opt_tokens[i].token & opts->mask) && From 72b3eab456ba57bf933b25e1b2adea3e3eafd153 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 11 Jan 2022 22:20:57 -0800 Subject: [PATCH 18/76] nvme-fabrics: use consistent zeroout pattern Remove zeroout memeset call & zeroout local variable cmd at the time of declaration in nvmf_ref_read32() similar to what we have done in nvmf_reg_read64(), nvmf_reg_write32(), nvmf_connect_admin_queue(), and nvmf_connect_io_queue(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 131e78e62d00..f4aa40124720 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -144,11 +144,10 @@ EXPORT_SYMBOL_GPL(nvmf_get_address); */ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { - struct nvme_command cmd; + struct nvme_command cmd = { }; union nvme_result res; int ret; - memset(&cmd, 0, sizeof(cmd)); cmd.prop_get.opcode = nvme_fabrics_command; cmd.prop_get.fctype = nvme_fabrics_type_property_get; cmd.prop_get.offset = cpu_to_le32(off); From 581f19dd72b9d148ce667b6dcb20d3ef16b9a1cf Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 11 Jan 2022 22:20:58 -0800 Subject: [PATCH 19/76] nvme-fabrics: remove unnecessary braces for case Braces are not required for enum value NVME_SC_CONNECT_INVALID_PARAM when used on the switch-case statement, remove the braces. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index f4aa40124720..ee79a6d639b4 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -271,7 +271,7 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, int err_sctype = errval & ~NVME_SC_DNR; switch (err_sctype) { - case (NVME_SC_CONNECT_INVALID_PARAM): + case NVME_SC_CONNECT_INVALID_PARAM: if (offset >> 16) { char *inv_data = "Connect Invalid Data Parameter"; From 2caecd62ea5160803b25d96cb1a14ce755c2c259 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 2 Feb 2022 01:04:44 -0800 Subject: [PATCH 20/76] nvmet: use i_size_read() to set size for file-ns Instead of calling vfs_getattr() use i_size_read() to read the size of file so we can read the size of not only file type but also block type with one call. This is needed to implement buffered_io support for the NVMeOF block device backend. We also change return type of function nvmet_file_ns_revalidate() from int to void, since this function does not return any meaning value. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-file.c | 17 ++++------------- drivers/nvme/target/nvmet.h | 2 +- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 6be6e59d273b..6485dc8eb974 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -14,16 +14,9 @@ #define NVMET_MAX_MPOOL_BVEC 16 #define NVMET_MIN_MPOOL_OBJ 16 -int nvmet_file_ns_revalidate(struct nvmet_ns *ns) +void nvmet_file_ns_revalidate(struct nvmet_ns *ns) { - struct kstat stat; - int ret; - - ret = vfs_getattr(&ns->file->f_path, &stat, STATX_SIZE, - AT_STATX_FORCE_SYNC); - if (!ret) - ns->size = stat.size; - return ret; + ns->size = i_size_read(ns->file->f_mapping->host); } void nvmet_file_ns_disable(struct nvmet_ns *ns) @@ -43,7 +36,7 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) int nvmet_file_ns_enable(struct nvmet_ns *ns) { int flags = O_RDWR | O_LARGEFILE; - int ret; + int ret = 0; if (!ns->buffered_io) flags |= O_DIRECT; @@ -57,9 +50,7 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns) return ret; } - ret = nvmet_file_ns_revalidate(ns); - if (ret) - goto err; + nvmet_file_ns_revalidate(ns); /* * i_blkbits can be greater than the universally accepted upper bound, diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index af193423c10b..69637bf8f8e1 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -541,7 +541,7 @@ u16 nvmet_bdev_flush(struct nvmet_req *req); u16 nvmet_file_flush(struct nvmet_req *req); void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid); void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns); -int nvmet_file_ns_revalidate(struct nvmet_ns *ns); +void nvmet_file_ns_revalidate(struct nvmet_ns *ns); void nvmet_ns_revalidate(struct nvmet_ns *ns); u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts); From 6f6d604b4ef896793901f213db3b45eefa7288fa Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 2 Feb 2022 01:04:45 -0800 Subject: [PATCH 21/76] nvmet: allow bdev in buffered_io mode Allow block device to be configured in the buffered I/O mode by using the file backend. In this way now we can use cache for the block device namespace which shows significant performance improvement. We update the block device ns enable function and return early when buffered_io flag is set. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-bdev.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 95c2bbb0b2f5..e9194804ddee 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -76,6 +76,14 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) { int ret; + /* + * When buffered_io namespace attribute is enabled that means user want + * this block device to be used as a file, so block device can take + * an advantage of cache. + */ + if (ns->buffered_io) + return -ENOTBLK; + ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE, NULL); if (IS_ERR(ns->bdev)) { From 8b850475c08caa9545c460d7dc823d0a8c7eafd3 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:27 +0200 Subject: [PATCH 22/76] nvme: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Also, use ida_alloc_min with the ns_ida as namespace enumeration starts with 1. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dd18861f77c0..da0e19148177 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -561,7 +561,7 @@ static void nvme_free_ns_head(struct kref *ref) container_of(ref, struct nvme_ns_head, ref); nvme_mpath_remove_disk(head); - ida_simple_remove(&head->subsys->ns_ida, head->instance); + ida_free(&head->subsys->ns_ida, head->instance); cleanup_srcu_struct(&head->srcu); nvme_put_subsystem(head->subsys); kfree(head); @@ -2565,7 +2565,7 @@ static void nvme_release_subsystem(struct device *dev) container_of(dev, struct nvme_subsystem, dev); if (subsys->instance >= 0) - ida_simple_remove(&nvme_instance_ida, subsys->instance); + ida_free(&nvme_instance_ida, subsys->instance); kfree(subsys); } @@ -3616,7 +3616,7 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, static void nvme_cdev_rel(struct device *dev) { - ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(dev->devt)); + ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt)); } void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) @@ -3630,7 +3630,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, { int minor, ret; - minor = ida_simple_get(&nvme_ns_chr_minor_ida, 0, 0, GFP_KERNEL); + minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL); if (minor < 0) return minor; cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); @@ -3693,7 +3693,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head = kzalloc(size, GFP_KERNEL); if (!head) goto out; - ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); + ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL); if (ret < 0) goto out_free_head; head->instance = ret; @@ -3732,7 +3732,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, out_cleanup_srcu: cleanup_srcu_struct(&head->srcu); out_ida_remove: - ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); + ida_free(&ctrl->subsys->ns_ida, head->instance); out_free_head: kfree(head); out: @@ -4429,7 +4429,7 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_subsystem *subsys = ctrl->subsys; if (!subsys || ctrl->instance != subsys->instance) - ida_simple_remove(&nvme_instance_ida, ctrl->instance); + ida_free(&nvme_instance_ida, ctrl->instance); nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); @@ -4488,7 +4488,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, goto out; } - ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); + ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL); if (ret < 0) goto out; ctrl->instance = ret; @@ -4529,7 +4529,7 @@ out_free_name: nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); out_release_instance: - ida_simple_remove(&nvme_instance_ida, ctrl->instance); + ida_free(&nvme_instance_ida, ctrl->instance); out: if (ctrl->discard_page) __free_page(ctrl->discard_page); From 3dd83f4013f0e85d37c059c91015a2721209fe86 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:28 +0200 Subject: [PATCH 23/76] nvme-fc: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 71b3108c22f0..96a5d7fc36f7 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -259,7 +259,7 @@ nvme_fc_free_lport(struct kref *ref) complete(&nvme_fc_unload_proceed); spin_unlock_irqrestore(&nvme_fc_lock, flags); - ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); + ida_free(&nvme_fc_local_port_cnt, lport->localport.port_num); ida_destroy(&lport->endp_cnt); put_device(lport->dev); @@ -399,7 +399,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, goto out_reghost_failed; } - idx = ida_simple_get(&nvme_fc_local_port_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&nvme_fc_local_port_cnt, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; goto out_fail_kfree; @@ -439,7 +439,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, return 0; out_ida_put: - ida_simple_remove(&nvme_fc_local_port_cnt, idx); + ida_free(&nvme_fc_local_port_cnt, idx); out_fail_kfree: kfree(newrec); out_reghost_failed: @@ -535,7 +535,7 @@ nvme_fc_free_rport(struct kref *ref) spin_unlock_irqrestore(&nvme_fc_lock, flags); WARN_ON(!list_empty(&rport->disc_list)); - ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); + ida_free(&lport->endp_cnt, rport->remoteport.port_num); kfree(rport); @@ -713,7 +713,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, goto out_lport_put; } - idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&lport->endp_cnt, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; goto out_kfree_rport; @@ -2393,7 +2393,7 @@ nvme_fc_ctrl_free(struct kref *ref) put_device(ctrl->dev); nvme_fc_rport_put(ctrl->rport); - ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); + ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum); if (ctrl->ctrl.opts) nvmf_free_options(ctrl->ctrl.opts); kfree(ctrl); @@ -3472,7 +3472,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_fail; } - idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&nvme_fc_ctrl_cnt, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; goto out_free_ctrl; @@ -3635,7 +3635,7 @@ out_free_queues: kfree(ctrl->queues); out_free_ida: put_device(ctrl->dev); - ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); + ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum); out_free_ctrl: kfree(ctrl); out_fail: From 22027a9811349de28f81e13e20e83299099acd3a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:29 +0200 Subject: [PATCH 24/76] nvmet: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 5119c687de68..724a6d373340 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1400,7 +1400,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, if (subsys->cntlid_min > subsys->cntlid_max) goto out_free_sqs; - ret = ida_simple_get(&cntlid_ida, + ret = ida_alloc_range(&cntlid_ida, subsys->cntlid_min, subsys->cntlid_max, GFP_KERNEL); if (ret < 0) { @@ -1459,7 +1459,7 @@ static void nvmet_ctrl_free(struct kref *ref) flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fatal_err_work); - ida_simple_remove(&cntlid_ida, ctrl->cntlid); + ida_free(&cntlid_ida, ctrl->cntlid); nvmet_async_events_free(ctrl); kfree(ctrl->sqs); From 6dd0f465d57c4522294d0fd42fde7603264d0871 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:30 +0200 Subject: [PATCH 25/76] nvmet-fc: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 22b5108168a6..cb013516784c 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1115,7 +1115,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) if (!assoc) return NULL; - idx = ida_simple_get(&tgtport->assoc_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&tgtport->assoc_cnt, GFP_KERNEL); if (idx < 0) goto out_free_assoc; @@ -1157,7 +1157,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) out_put: nvmet_fc_tgtport_put(tgtport); out_ida: - ida_simple_remove(&tgtport->assoc_cnt, idx); + ida_free(&tgtport->assoc_cnt, idx); out_free_assoc: kfree(assoc); return NULL; @@ -1183,7 +1183,7 @@ nvmet_fc_target_assoc_free(struct kref *ref) /* if pending Rcv Disconnect Association LS, send rsp now */ if (oldls) nvmet_fc_xmt_ls_rsp(tgtport, oldls); - ida_simple_remove(&tgtport->assoc_cnt, assoc->a_id); + ida_free(&tgtport->assoc_cnt, assoc->a_id); dev_info(tgtport->dev, "{%d:%d} Association freed\n", tgtport->fc_target_port.port_num, assoc->a_id); @@ -1383,7 +1383,7 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, goto out_regtgt_failed; } - idx = ida_simple_get(&nvmet_fc_tgtport_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&nvmet_fc_tgtport_cnt, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; goto out_fail_kfree; @@ -1433,7 +1433,7 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, out_free_newrec: put_device(dev); out_ida_put: - ida_simple_remove(&nvmet_fc_tgtport_cnt, idx); + ida_free(&nvmet_fc_tgtport_cnt, idx); out_fail_kfree: kfree(newrec); out_regtgt_failed: @@ -1460,7 +1460,7 @@ nvmet_fc_free_tgtport(struct kref *ref) /* let the LLDD know we've finished tearing it down */ tgtport->ops->targetport_delete(&tgtport->fc_target_port); - ida_simple_remove(&nvmet_fc_tgtport_cnt, + ida_free(&nvmet_fc_tgtport_cnt, tgtport->fc_target_port.port_num); ida_destroy(&tgtport->assoc_cnt); From 7c2566394f8ceba4b99a373c1bc2411750301587 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:31 +0200 Subject: [PATCH 26/76] nvmet-rdma: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 1deb4043e242..00656754e331 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1356,7 +1356,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) !queue->host_qid); } nvmet_rdma_free_rsps(queue); - ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + ida_free(&nvmet_rdma_queue_ida, queue->idx); kfree(queue); } @@ -1459,7 +1459,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, spin_lock_init(&queue->rsps_lock); INIT_LIST_HEAD(&queue->queue_list); - queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + queue->idx = ida_alloc(&nvmet_rdma_queue_ida, GFP_KERNEL); if (queue->idx < 0) { ret = NVME_RDMA_CM_NO_RSC; goto out_destroy_sq; @@ -1510,7 +1510,7 @@ out_free_cmds: out_free_responses: nvmet_rdma_free_rsps(queue); out_ida_remove: - ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + ida_free(&nvmet_rdma_queue_ida, queue->idx); out_destroy_sq: nvmet_sq_destroy(&queue->nvme_sq); out_free_queue: From 44f331a630bdc7c61de9c6760c4eec0133ee9f04 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:32 +0200 Subject: [PATCH 27/76] nvmet-tcp: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 7c1c43ce466b..83ca577f72be 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1473,7 +1473,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) nvmet_tcp_free_cmds(queue); if (queue->hdr_digest || queue->data_digest) nvmet_tcp_free_crypto(queue); - ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + ida_free(&nvmet_tcp_queue_ida, queue->idx); page = virt_to_head_page(queue->pf_cache.va); __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); @@ -1613,7 +1613,7 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, init_llist_head(&queue->resp_list); INIT_LIST_HEAD(&queue->resp_send_list); - queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL); + queue->idx = ida_alloc(&nvmet_tcp_queue_ida, GFP_KERNEL); if (queue->idx < 0) { ret = queue->idx; goto out_free_queue; @@ -1646,7 +1646,7 @@ out_destroy_sq: out_free_connect: nvmet_tcp_free_cmd(&queue->connect); out_ida_remove: - ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + ida_free(&nvmet_tcp_queue_ida, queue->idx); out_free_queue: kfree(queue); return ret; From 4686af885a9168f9ec70c4063616640911c48b03 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 9 Feb 2022 10:54:49 +0200 Subject: [PATCH 28/76] nvme-rdma: add helpers for mapping/unmapping request Introduce nvme_rdma_dma_map_req/nvme_rdma_dma_unmap_req helper functions to improve code readability and ease on the error flow. Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 111 +++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 46 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 850f84d204d0..14ec2c85db06 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1282,6 +1282,22 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, return ib_post_send(queue->qp, &wr, NULL); } +static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + + if (blk_integrity_rq(rq)) { + ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, rq_dma_dir(rq)); + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); + } + + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); +} + static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, struct request *rq) { @@ -1293,13 +1309,6 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, if (!blk_rq_nr_phys_segments(rq)) return; - if (blk_integrity_rq(rq)) { - ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, - req->metadata_sgl->nents, rq_dma_dir(rq)); - sg_free_table_chained(&req->metadata_sgl->sg_table, - NVME_INLINE_METADATA_SG_CNT); - } - if (req->use_sig_mr) pool = &queue->qp->sig_mrs; @@ -1308,9 +1317,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, req->mr = NULL; } - ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, - rq_dma_dir(rq)); - sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); + nvme_rdma_dma_unmap_req(ibdev, rq); } static int nvme_rdma_set_sg_null(struct nvme_command *c) @@ -1521,22 +1528,11 @@ mr_put: return -EINVAL; } -static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, - struct request *rq, struct nvme_command *c) +static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, + int *count, int *pi_count) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_rdma_device *dev = queue->device; - struct ib_device *ibdev = dev->dev; - int pi_count = 0; - int count, ret; - - req->num_sge = 1; - refcount_set(&req->ref, 2); /* send and recv completions */ - - c->common.flags |= NVME_CMD_SGL_METABUF; - - if (!blk_rq_nr_phys_segments(rq)) - return nvme_rdma_set_sg_null(c); + int ret; req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1); ret = sg_alloc_table_chained(&req->data_sgl.sg_table, @@ -1548,9 +1544,9 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->data_sgl.nents = blk_rq_map_sg(rq->q, rq, req->data_sgl.sg_table.sgl); - count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, - req->data_sgl.nents, rq_dma_dir(rq)); - if (unlikely(count <= 0)) { + *count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, + req->data_sgl.nents, rq_dma_dir(rq)); + if (unlikely(*count <= 0)) { ret = -EIO; goto out_free_table; } @@ -1569,16 +1565,50 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q, rq->bio, req->metadata_sgl->sg_table.sgl); - pi_count = ib_dma_map_sg(ibdev, - req->metadata_sgl->sg_table.sgl, - req->metadata_sgl->nents, - rq_dma_dir(rq)); - if (unlikely(pi_count <= 0)) { + *pi_count = ib_dma_map_sg(ibdev, + req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, + rq_dma_dir(rq)); + if (unlikely(*pi_count <= 0)) { ret = -EIO; goto out_free_pi_table; } } + return 0; + +out_free_pi_table: + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); +out_unmap_sg: + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); +out_free_table: + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); + return ret; +} + +static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, + struct request *rq, struct nvme_command *c) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int pi_count = 0; + int count, ret; + + req->num_sge = 1; + refcount_set(&req->ref, 2); /* send and recv completions */ + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (!blk_rq_nr_phys_segments(rq)) + return nvme_rdma_set_sg_null(c); + + ret = nvme_rdma_dma_map_req(ibdev, rq, &count, &pi_count); + if (unlikely(ret)) + return ret; + if (req->use_sig_mr) { ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count); goto out; @@ -1602,23 +1632,12 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, ret = nvme_rdma_map_sg_fr(queue, req, c, count); out: if (unlikely(ret)) - goto out_unmap_pi_sg; + goto out_dma_unmap_req; return 0; -out_unmap_pi_sg: - if (blk_integrity_rq(rq)) - ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, - req->metadata_sgl->nents, rq_dma_dir(rq)); -out_free_pi_table: - if (blk_integrity_rq(rq)) - sg_free_table_chained(&req->metadata_sgl->sg_table, - NVME_INLINE_METADATA_SG_CNT); -out_unmap_sg: - ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, - rq_dma_dir(rq)); -out_free_table: - sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); +out_dma_unmap_req: + nvme_rdma_dma_unmap_req(ibdev, rq); return ret; } From 72e8b5cd7dd387ba3eee81b0a59746ad8ccdcb5f Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 10 Feb 2022 11:12:36 -0800 Subject: [PATCH 29/76] nvme: add a helper to initialize connect_q Add and use helper to remove duplicate code for fabrics connect_q initialization and error handling for all the transports. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 6 ++---- drivers/nvme/host/nvme.h | 8 ++++++++ drivers/nvme/host/rdma.c | 6 ++---- drivers/nvme/host/tcp.c | 6 ++---- drivers/nvme/target/loop.c | 6 ++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 96a5d7fc36f7..080f85f4105f 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2916,11 +2916,9 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.tagset = &ctrl->tag_set; - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); - if (IS_ERR(ctrl->ctrl.connect_q)) { - ret = PTR_ERR(ctrl->ctrl.connect_q); + ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); + if (ret) goto out_free_tag_set; - } ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a162f6c6da6e..f8658f984d64 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -894,6 +894,14 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) } #endif +static inline int nvme_ctrl_init_connect_q(struct nvme_ctrl *ctrl) +{ + ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ctrl->connect_q)) + return PTR_ERR(ctrl->connect_q); + return 0; +} + static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) { return dev_to_disk(dev)->private_data; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 14ec2c85db06..c49b9c3c46f2 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -978,11 +978,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_free_io_queues; } - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); - if (IS_ERR(ctrl->ctrl.connect_q)) { - ret = PTR_ERR(ctrl->ctrl.connect_q); + ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); + if (ret) goto out_free_tag_set; - } } ret = nvme_rdma_start_io_queues(ctrl); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4ceb28675fdf..6cbcc8b4daaf 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1825,11 +1825,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) goto out_free_io_queues; } - ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ctrl->connect_q)) { - ret = PTR_ERR(ctrl->connect_q); + ret = nvme_ctrl_init_connect_q(ctrl); + if (ret) goto out_free_tag_set; - } } ret = nvme_tcp_start_io_queues(ctrl); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index eb1094254c82..23f9d6f88804 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -543,11 +543,9 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) if (ret) goto out_destroy_queues; - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); - if (IS_ERR(ctrl->ctrl.connect_q)) { - ret = PTR_ERR(ctrl->ctrl.connect_q); + ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); + if (ret) goto out_free_tagset; - } ret = nvme_loop_connect_io_queues(ctrl); if (ret) From bd83fe6f2cd2133beaac7c423fd36c3515048fc8 Mon Sep 17 00:00:00 2001 From: Alan Adamson Date: Thu, 3 Feb 2022 00:11:53 -0800 Subject: [PATCH 30/76] nvme: add verbose error logging Improves logging of NVMe errors. If NVME_VERBOSE_ERRORS is configured, a verbose description of the error is logged, otherwise only status codes/bits is logged. Signed-off-by: Chaitanya Kulkarni [kch]: fix several nits, cosmetics, and trim down code. Signed-off-by: Martin K. Petersen Signed-off-by: Alan Adamson Reviewed-by: Himanshu Madhani Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/Kconfig | 8 ++ drivers/nvme/host/Makefile | 2 +- drivers/nvme/host/constants.c | 185 ++++++++++++++++++++++++++++++++++ drivers/nvme/host/core.c | 33 ++++++ drivers/nvme/host/nvme.h | 19 ++++ include/linux/nvme.h | 1 + 6 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 drivers/nvme/host/constants.c diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index dc0450ca23a3..d6d056963c06 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -24,6 +24,14 @@ config NVME_MULTIPATH /dev/nvmeXnY device will show up for each NVMe namespace, even if it is accessible through multiple controllers. +config NVME_VERBOSE_ERRORS + bool "NVMe verbose error reporting" + depends on NVME_CORE + help + This option enables verbose reporting for NVMe errors. The + error translation table will grow the kernel image size by + about 4 KB. + config NVME_HWMON bool "NVMe hardware monitoring" depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON) diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index dfaacd472e5d..476c5c988496 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o obj-$(CONFIG_NVME_TCP) += nvme-tcp.o -nvme-core-y := core.o ioctl.o +nvme-core-y := core.o ioctl.o constants.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c new file mode 100644 index 000000000000..7d49eb34b348 --- /dev/null +++ b/drivers/nvme/host/constants.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVM Express device driver verbose errors + * Copyright (c) 2022, Oracle and/or its affiliates + */ + +#include +#include "nvme.h" + +#ifdef CONFIG_NVME_VERBOSE_ERRORS +static const char * const nvme_ops[] = { + [nvme_cmd_flush] = "Flush", + [nvme_cmd_write] = "Write", + [nvme_cmd_read] = "Read", + [nvme_cmd_write_uncor] = "Write Uncorrectable", + [nvme_cmd_compare] = "Compare", + [nvme_cmd_write_zeroes] = "Write Zeros", + [nvme_cmd_dsm] = "Dataset Management", + [nvme_cmd_verify] = "Verify", + [nvme_cmd_resv_register] = "Reservation Register", + [nvme_cmd_resv_report] = "Reservation Report", + [nvme_cmd_resv_acquire] = "Reservation Acquire", + [nvme_cmd_resv_release] = "Reservation Release", + [nvme_cmd_zone_mgmt_send] = "Zone Management Send", + [nvme_cmd_zone_mgmt_recv] = "Zone Management Receive", + [nvme_cmd_zone_append] = "Zone Management Append", +}; + +static const char * const nvme_admin_ops[] = { + [nvme_admin_delete_sq] = "Delete SQ", + [nvme_admin_create_sq] = "Create SQ", + [nvme_admin_get_log_page] = "Get Log Page", + [nvme_admin_delete_cq] = "Delete CQ", + [nvme_admin_create_cq] = "Create CQ", + [nvme_admin_identify] = "Identify", + [nvme_admin_abort_cmd] = "Abort Command", + [nvme_admin_set_features] = "Set Features", + [nvme_admin_get_features] = "Get Features", + [nvme_admin_async_event] = "Async Event", + [nvme_admin_ns_mgmt] = "Namespace Management", + [nvme_admin_activate_fw] = "Activate Firmware", + [nvme_admin_download_fw] = "Download Firmware", + [nvme_admin_dev_self_test] = "Device Self Test", + [nvme_admin_ns_attach] = "Namespace Attach", + [nvme_admin_keep_alive] = "Keep Alive", + [nvme_admin_directive_send] = "Directive Send", + [nvme_admin_directive_recv] = "Directive Receive", + [nvme_admin_virtual_mgmt] = "Virtual Management", + [nvme_admin_nvme_mi_send] = "NVMe Send MI", + [nvme_admin_nvme_mi_recv] = "NVMe Receive MI", + [nvme_admin_dbbuf] = "Doorbell Buffer Config", + [nvme_admin_format_nvm] = "Format NVM", + [nvme_admin_security_send] = "Security Send", + [nvme_admin_security_recv] = "Security Receive", + [nvme_admin_sanitize_nvm] = "Sanitize NVM", + [nvme_admin_get_lba_status] = "Get LBA Status", +}; + +static const char * const nvme_statuses[] = { + [NVME_SC_SUCCESS] = "Success", + [NVME_SC_INVALID_OPCODE] = "Invalid Command Opcode", + [NVME_SC_INVALID_FIELD] = "Invalid Field in Command", + [NVME_SC_CMDID_CONFLICT] = "Command ID Conflict", + [NVME_SC_DATA_XFER_ERROR] = "Data Transfer Error", + [NVME_SC_POWER_LOSS] = "Commands Aborted due to Power Loss Notification", + [NVME_SC_INTERNAL] = "Internal Error", + [NVME_SC_ABORT_REQ] = "Command Abort Requested", + [NVME_SC_ABORT_QUEUE] = "Command Aborted due to SQ Deletion", + [NVME_SC_FUSED_FAIL] = "Command Aborted due to Failed Fused Command", + [NVME_SC_FUSED_MISSING] = "Command Aborted due to Missing Fused Command", + [NVME_SC_INVALID_NS] = "Invalid Namespace or Format", + [NVME_SC_CMD_SEQ_ERROR] = "Command Sequence Error", + [NVME_SC_SGL_INVALID_LAST] = "Invalid SGL Segment Descriptor", + [NVME_SC_SGL_INVALID_COUNT] = "Invalid Number of SGL Descriptors", + [NVME_SC_SGL_INVALID_DATA] = "Data SGL Length Invalid", + [NVME_SC_SGL_INVALID_METADATA] = "Metadata SGL Length Invalid", + [NVME_SC_SGL_INVALID_TYPE] = "SGL Descriptor Type Invalid", + [NVME_SC_CMB_INVALID_USE] = "Invalid Use of Controller Memory Buffer", + [NVME_SC_PRP_INVALID_OFFSET] = "PRP Offset Invalid", + [NVME_SC_ATOMIC_WU_EXCEEDED] = "Atomic Write Unit Exceeded", + [NVME_SC_OP_DENIED] = "Operation Denied", + [NVME_SC_SGL_INVALID_OFFSET] = "SGL Offset Invalid", + [NVME_SC_RESERVED] = "Reserved", + [NVME_SC_HOST_ID_INCONSIST] = "Host Identifier Inconsistent Format", + [NVME_SC_KA_TIMEOUT_EXPIRED] = "Keep Alive Timeout Expired", + [NVME_SC_KA_TIMEOUT_INVALID] = "Keep Alive Timeout Invalid", + [NVME_SC_ABORTED_PREEMPT_ABORT] = "Command Aborted due to Preempt and Abort", + [NVME_SC_SANITIZE_FAILED] = "Sanitize Failed", + [NVME_SC_SANITIZE_IN_PROGRESS] = "Sanitize In Progress", + [NVME_SC_SGL_INVALID_GRANULARITY] = "SGL Data Block Granularity Invalid", + [NVME_SC_CMD_NOT_SUP_CMB_QUEUE] = "Command Not Supported for Queue in CMB", + [NVME_SC_NS_WRITE_PROTECTED] = "Namespace is Write Protected", + [NVME_SC_CMD_INTERRUPTED] = "Command Interrupted", + [NVME_SC_TRANSIENT_TR_ERR] = "Transient Transport Error", + [NVME_SC_INVALID_IO_CMD_SET] = "Invalid IO Command Set", + [NVME_SC_LBA_RANGE] = "LBA Out of Range", + [NVME_SC_CAP_EXCEEDED] = "Capacity Exceeded", + [NVME_SC_NS_NOT_READY] = "Namespace Not Ready", + [NVME_SC_RESERVATION_CONFLICT] = "Reservation Conflict", + [NVME_SC_FORMAT_IN_PROGRESS] = "Format In Progress", + [NVME_SC_CQ_INVALID] = "Completion Queue Invalid", + [NVME_SC_QID_INVALID] = "Invalid Queue Identifier", + [NVME_SC_QUEUE_SIZE] = "Invalid Queue Size", + [NVME_SC_ABORT_LIMIT] = "Abort Command Limit Exceeded", + [NVME_SC_ABORT_MISSING] = "Reserved", /* XXX */ + [NVME_SC_ASYNC_LIMIT] = "Asynchronous Event Request Limit Exceeded", + [NVME_SC_FIRMWARE_SLOT] = "Invalid Firmware Slot", + [NVME_SC_FIRMWARE_IMAGE] = "Invalid Firmware Image", + [NVME_SC_INVALID_VECTOR] = "Invalid Interrupt Vector", + [NVME_SC_INVALID_LOG_PAGE] = "Invalid Log Page", + [NVME_SC_INVALID_FORMAT] = "Invalid Format", + [NVME_SC_FW_NEEDS_CONV_RESET] = "Firmware Activation Requires Conventional Reset", + [NVME_SC_INVALID_QUEUE] = "Invalid Queue Deletion", + [NVME_SC_FEATURE_NOT_SAVEABLE] = "Feature Identifier Not Saveable", + [NVME_SC_FEATURE_NOT_CHANGEABLE] = "Feature Not Changeable", + [NVME_SC_FEATURE_NOT_PER_NS] = "Feature Not Namespace Specific", + [NVME_SC_FW_NEEDS_SUBSYS_RESET] = "Firmware Activation Requires NVM Subsystem Reset", + [NVME_SC_FW_NEEDS_RESET] = "Firmware Activation Requires Reset", + [NVME_SC_FW_NEEDS_MAX_TIME] = "Firmware Activation Requires Maximum Time Violation", + [NVME_SC_FW_ACTIVATE_PROHIBITED] = "Firmware Activation Prohibited", + [NVME_SC_OVERLAPPING_RANGE] = "Overlapping Range", + [NVME_SC_NS_INSUFFICIENT_CAP] = "Namespace Insufficient Capacity", + [NVME_SC_NS_ID_UNAVAILABLE] = "Namespace Identifier Unavailable", + [NVME_SC_NS_ALREADY_ATTACHED] = "Namespace Already Attached", + [NVME_SC_NS_IS_PRIVATE] = "Namespace Is Private", + [NVME_SC_NS_NOT_ATTACHED] = "Namespace Not Attached", + [NVME_SC_THIN_PROV_NOT_SUPP] = "Thin Provisioning Not Supported", + [NVME_SC_CTRL_LIST_INVALID] = "Controller List Invalid", + [NVME_SC_SELT_TEST_IN_PROGRESS] = "Device Self-test In Progress", + [NVME_SC_BP_WRITE_PROHIBITED] = "Boot Partition Write Prohibited", + [NVME_SC_CTRL_ID_INVALID] = "Invalid Controller Identifier", + [NVME_SC_SEC_CTRL_STATE_INVALID] = "Invalid Secondary Controller State", + [NVME_SC_CTRL_RES_NUM_INVALID] = "Invalid Number of Controller Resources", + [NVME_SC_RES_ID_INVALID] = "Invalid Resource Identifier", + [NVME_SC_PMR_SAN_PROHIBITED] = "Sanitize Prohibited", + [NVME_SC_ANA_GROUP_ID_INVALID] = "ANA Group Identifier Invalid", + [NVME_SC_ANA_ATTACH_FAILED] = "ANA Attach Failed", + [NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes", + [NVME_SC_INVALID_PI] = "Invalid Protection Information", + [NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range", + [NVME_SC_ONCS_NOT_SUPPORTED] = "ONCS Not Supported", + [NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error", + [NVME_SC_ZONE_FULL] = "Zone Is Full", + [NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only", + [NVME_SC_ZONE_OFFLINE] = "Zone Is Offline", + [NVME_SC_ZONE_INVALID_WRITE] = "Zone Invalid Write", + [NVME_SC_ZONE_TOO_MANY_ACTIVE] = "Too Many Active Zones", + [NVME_SC_ZONE_TOO_MANY_OPEN] = "Too Many Open Zones", + [NVME_SC_ZONE_INVALID_TRANSITION] = "Invalid Zone State Transition", + [NVME_SC_WRITE_FAULT] = "Write Fault", + [NVME_SC_READ_ERROR] = "Unrecovered Read Error", + [NVME_SC_GUARD_CHECK] = "End-to-end Guard Check Error", + [NVME_SC_APPTAG_CHECK] = "End-to-end Application Tag Check Error", + [NVME_SC_REFTAG_CHECK] = "End-to-end Reference Tag Check Error", + [NVME_SC_COMPARE_FAILED] = "Compare Failure", + [NVME_SC_ACCESS_DENIED] = "Access Denied", + [NVME_SC_UNWRITTEN_BLOCK] = "Deallocated or Unwritten Logical Block", + [NVME_SC_ANA_PERSISTENT_LOSS] = "Asymmetric Access Persistent Loss", + [NVME_SC_ANA_INACCESSIBLE] = "Asymmetric Access Inaccessible", + [NVME_SC_ANA_TRANSITION] = "Asymmetric Access Transition", + [NVME_SC_HOST_PATH_ERROR] = "Host Pathing Error", +}; + +const unsigned char *nvme_get_error_status_str(u16 status) +{ + status &= 0x7ff; + if (status < ARRAY_SIZE(nvme_statuses) && nvme_statuses[status]) + return nvme_statuses[status & 0x7ff]; + return "Unknown"; +} + +const unsigned char *nvme_get_opcode_str(u8 opcode) +{ + if (opcode < ARRAY_SIZE(nvme_ops) && nvme_ops[opcode]) + return nvme_ops[opcode]; + return "Unknown"; +} + +const unsigned char *nvme_get_admin_opcode_str(u8 opcode) +{ + if (opcode < ARRAY_SIZE(nvme_admin_ops) && nvme_admin_ops[opcode]) + return nvme_admin_ops[opcode]; + return "Unknown"; +} +#endif /* CONFIG_NVME_VERBOSE_ERRORS */ diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index da0e19148177..5e55732f3ea1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -299,6 +299,37 @@ static void nvme_retry_req(struct request *req) blk_mq_delay_kick_requeue_list(req->q, delay); } +static void nvme_log_error(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + struct nvme_request *nr = nvme_req(req); + + if (ns) { + pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n", + ns->disk ? ns->disk->disk_name : "?", + nvme_get_opcode_str(nr->cmd->common.opcode), + nr->cmd->common.opcode, + (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)), + (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift, + nvme_get_error_status_str(nr->status), + nr->status >> 8 & 7, /* Status Code Type */ + nr->status & 0xff, /* Status Code */ + nr->status & NVME_SC_MORE ? "MORE " : "", + nr->status & NVME_SC_DNR ? "DNR " : ""); + return; + } + + pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n", + dev_name(nr->ctrl->device), + nvme_get_admin_opcode_str(nr->cmd->common.opcode), + nr->cmd->common.opcode, + nvme_get_error_status_str(nr->status), + nr->status >> 8 & 7, /* Status Code Type */ + nr->status & 0xff, /* Status Code */ + nr->status & NVME_SC_MORE ? "MORE " : "", + nr->status & NVME_SC_DNR ? "DNR " : ""); +} + enum nvme_disposition { COMPLETE, RETRY, @@ -339,6 +370,8 @@ static inline void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); + if (unlikely(nvme_req(req)->status != NVME_SC_SUCCESS)) + nvme_log_error(req); nvme_end_req_zoned(req); nvme_trace_bio_complete(req); blk_mq_end_request(req, status); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f8658f984d64..08893c04ca89 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -938,4 +938,23 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; } +#ifdef CONFIG_NVME_VERBOSE_ERRORS +const unsigned char *nvme_get_error_status_str(u16 status); +const unsigned char *nvme_get_opcode_str(u8 opcode); +const unsigned char *nvme_get_admin_opcode_str(u8 opcode); +#else /* CONFIG_NVME_VERBOSE_ERRORS */ +static inline const unsigned char *nvme_get_error_status_str(u16 status) +{ + return "I/O Error"; +} +static inline const unsigned char *nvme_get_opcode_str(u8 opcode) +{ + return "I/O Cmd"; +} +static inline const unsigned char *nvme_get_admin_opcode_str(u8 opcode) +{ + return "Admin Cmd"; +} +#endif /* CONFIG_NVME_VERBOSE_ERRORS */ + #endif /* _NVME_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 855dd9b3e84b..1f946e5bf7c1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1636,6 +1636,7 @@ enum { NVME_SC_HOST_ABORTED_CMD = 0x371, NVME_SC_CRD = 0x1800, + NVME_SC_MORE = 0x2000, NVME_SC_DNR = 0x4000, }; From 89377bc1975c2993bde4a498a3a4e5817ac0ae2c Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 10 Feb 2022 11:07:55 +0530 Subject: [PATCH 31/76] nvme: add vectored-io support for user-passthrough Add a new NVME_IOCTL_IO64_CMD_VEC ioctl that works like the existing NVME_IOCTL_IO64_CMD ioctl except that it takes and array of iovecs and thus supports vectored I/O. - cmd.addr is base address of user iovec array - cmd.vec_cnt is count of iovec array elements This patch does not include vectored-variant for admin-commands as most of them are light on buffers and likely to have low invocation frequency. Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/ioctl.c | 35 ++++++++++++++++++++++++--------- include/uapi/linux/nvme_ioctl.h | 6 +++++- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 22314962842d..aaf3dfad2657 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -56,7 +56,7 @@ out: static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) + u32 meta_seed, u64 *result, unsigned timeout, bool vec) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -75,8 +75,22 @@ static int nvme_submit_user_cmd(struct request_queue *q, nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + if (!vec) + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, GFP_KERNEL); + else { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov = fast_iov; + struct iov_iter iter; + + ret = import_iovec(rq_data_dir(req), ubuffer, bufflen, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + goto out; + ret = blk_rq_map_user_iov(q, req, NULL, &iter, + GFP_KERNEL); + kfree(iov); + } if (ret) goto out; bio = req->bio; @@ -170,7 +184,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return nvme_submit_user_cmd(ns->queue, &c, nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); + metadata, meta_len, lower_32_bits(io.slba), NULL, 0, + false); } static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, @@ -224,7 +239,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); + 0, &result, timeout, false); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -235,7 +250,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) + struct nvme_passthru_cmd64 __user *ucmd, bool vec) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; @@ -270,7 +285,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); + 0, &cmd.result, timeout, vec); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -296,7 +311,7 @@ static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); + return nvme_user_cmd64(ctrl, NULL, argp, false); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -340,7 +355,9 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, argp); case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp); + return nvme_user_cmd64(ns->ctrl, ns, argp, false); + case NVME_IOCTL_IO64_CMD_VEC: + return nvme_user_cmd64(ns->ctrl, ns, argp, true); default: return -ENOTTY; } @@ -480,7 +497,7 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); + return nvme_user_cmd64(ctrl, NULL, argp, false); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index d99b5a772698..b2e43185e3b5 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -55,7 +55,10 @@ struct nvme_passthru_cmd64 { __u64 metadata; __u64 addr; __u32 metadata_len; - __u32 data_len; + union { + __u32 data_len; /* for non-vectored io */ + __u32 vec_cnt; /* for vectored io */ + }; __u32 cdw10; __u32 cdw11; __u32 cdw12; @@ -78,5 +81,6 @@ struct nvme_passthru_cmd64 { #define NVME_IOCTL_RESCAN _IO('N', 0x46) #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) +#define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ From 20d64911e7580f7e29c0086d67860c18307377d7 Mon Sep 17 00:00:00 2001 From: Martin Belanger Date: Tue, 8 Feb 2022 14:33:45 -0500 Subject: [PATCH 32/76] nvme: send uevent on connection up When connectivity with a controller is lost, the driver will keep trying to reconnect once every 10 sec. When connection is restored, user-space apps need to be informed so that they can take proper action. For example, TP8010 introduces the DIM PDU, which is used to register with a discovery controller (DC). The DIM PDU is sent from user-space. The DIM PDU must be sent every time a connection is established with a DC. Therefore, the kernel must tell user-space apps when connection is restored so that registration can happen. The uevent sent is a "change" uevent with environmental data set to: "NVME_EVENT=connected". Signed-off-by: Martin Belanger Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: John Meneghini Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5e55732f3ea1..b3e70bb2e6dc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4262,6 +4262,13 @@ static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) return ret; } +static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata) +{ + char *envp[2] = { envdata, NULL }; + + kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); +} + static void nvme_aen_uevent(struct nvme_ctrl *ctrl) { char *envp[2] = { NULL, NULL }; @@ -4429,6 +4436,8 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_queue_scan(ctrl); nvme_start_queues(ctrl); } + + nvme_change_uevent(ctrl, "NVME_EVENT=connected"); } EXPORT_SYMBOL_GPL(nvme_start_ctrl); From 86c2457a8e8112f16af8fd10a3e1dd7a302c3c3e Mon Sep 17 00:00:00 2001 From: Martin Belanger Date: Tue, 8 Feb 2022 14:33:46 -0500 Subject: [PATCH 33/76] nvme: expose cntrltype and dctype through sysfs TP8010 introduces the Discovery Controller Type attribute (dctype). The dctype is returned in the response to the Identify command. This patch exposes the dctype through the sysfs. Since the dctype depends on the Controller Type (cntrltype), another attribute of the Identify response, the patch also exposes the cntrltype as well. The dctype will only be displayed for discovery controllers. A note about the naming of this attribute: Although TP8010 calls this attribute the Discovery Controller Type, note that the dctype is now part of the response to the Identify command for all controller types. I/O, Discovery, and Admin controllers all share the same Identify response PDU structure. Non-discovery controllers as well as pre-TP8010 discovery controllers will continue to set this field to 0 (which has always been the default for reserved bytes). Per TP8010, the value 0 now means "Discovery controller type is not reported" instead of "Reserved". One could argue that this definition is correct even for non-discovery controllers, and by extension, exposing it in the sysfs for non-discovery controllers is appropriate. Signed-off-by: Martin Belanger Reviewed-by: Chaitanya Kulkarni Reviewed-by: John Meneghini Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 3 +++ include/linux/nvme.h | 10 +++++++++- 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b3e70bb2e6dc..52a49ae5b673 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3023,6 +3023,9 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->max_namespaces = le32_to_cpu(id->mnan); ctrl->ctratt = le32_to_cpu(id->ctratt); + ctrl->cntrltype = id->cntrltype; + ctrl->dctype = id->dctype; + if (id->rtd3e) { /* us -> s */ u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; @@ -3556,6 +3559,40 @@ static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev, static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store); +static ssize_t cntrltype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + static const char * const type[] = { + [NVME_CTRL_IO] = "io\n", + [NVME_CTRL_DISC] = "discovery\n", + [NVME_CTRL_ADMIN] = "admin\n", + }; + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype]) + return sysfs_emit(buf, "reserved\n"); + + return sysfs_emit(buf, type[ctrl->cntrltype]); +} +static DEVICE_ATTR_RO(cntrltype); + +static ssize_t dctype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + static const char * const type[] = { + [NVME_DCTYPE_NOT_REPORTED] = "none\n", + [NVME_DCTYPE_DDC] = "ddc\n", + [NVME_DCTYPE_CDC] = "cdc\n", + }; + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype]) + return sysfs_emit(buf, "reserved\n"); + + return sysfs_emit(buf, type[ctrl->dctype]); +} +static DEVICE_ATTR_RO(dctype); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3577,6 +3614,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_reconnect_delay.attr, &dev_attr_fast_io_fail_tmo.attr, &dev_attr_kato.attr, + &dev_attr_cntrltype.attr, + &dev_attr_dctype.attr, NULL }; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 08893c04ca89..4ce48203e63f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -349,6 +349,9 @@ struct nvme_ctrl { unsigned long discard_page_busy; struct nvme_fault_inject fault_inject; + + enum nvme_ctrl_type cntrltype; + enum nvme_dctype dctype; }; enum nvme_iopolicy { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1f946e5bf7c1..9dbc3ef4daf7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -43,6 +43,12 @@ enum nvme_ctrl_type { NVME_CTRL_ADMIN = 3, /* Administrative controller */ }; +enum nvme_dctype { + NVME_DCTYPE_NOT_REPORTED = 0, + NVME_DCTYPE_DDC = 1, /* Direct Discovery Controller */ + NVME_DCTYPE_CDC = 2, /* Central Discovery Controller */ +}; + /* Address Family codes for Discovery Log Page entry ADRFAM field */ enum { NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ @@ -320,7 +326,9 @@ struct nvme_id_ctrl { __le16 icdoff; __u8 ctrattr; __u8 msdbd; - __u8 rsvd1804[244]; + __u8 rsvd1804[2]; + __u8 dctype; + __u8 rsvd1807[241]; struct nvme_id_power_state psd[32]; __u8 vs[1024]; }; From 1c3adf0de1db86cf354dcb1a2dd1184e5e63a50a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 15 Feb 2022 07:03:08 -0800 Subject: [PATCH 34/76] nvme: explicitly set non-error for directives Stream directives is an optional feature. It is not an error if a controller doesn't support as many as the kernel can optionally use. Explicitly set the non-error return value on this condition with a comment explaining why. Note, the return value was already 0 in this condition, so the setting is redundant. This patch should just silence bots that falsely believe the condition contains an error omission. Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 52a49ae5b673..1ce579d583c0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -809,6 +809,8 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { dev_info(ctrl->device, "too few streams (%u) available\n", ctrl->nssa); + /* this condition is not an error: streams are optional */ + ret = 0; goto out_disable_stream; } From 0a9f850061d9126b9a4aaf56ae1810138cac0f51 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 15 Feb 2022 07:37:19 -0800 Subject: [PATCH 35/76] nvme: remove nssa from struct nvme_ctrl The reported number of streams is not used outside the function that gets it, so no need to stash it in the controller structure. Use a local variable instead. Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 9 +++++---- drivers/nvme/host/nvme.h | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1ce579d583c0..9cffc4770e73 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -790,6 +790,7 @@ static int nvme_get_stream_params(struct nvme_ctrl *ctrl, static int nvme_configure_directives(struct nvme_ctrl *ctrl) { struct streams_directive_params s; + u16 nssa; int ret; if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) @@ -805,16 +806,16 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) if (ret) goto out_disable_stream; - ctrl->nssa = le16_to_cpu(s.nssa); - if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { + nssa = le16_to_cpu(s.nssa); + if (nssa < BLK_MAX_WRITE_HINTS - 1) { dev_info(ctrl->device, "too few streams (%u) available\n", - ctrl->nssa); + nssa); /* this condition is not an error: streams are optional */ ret = 0; goto out_disable_stream; } - ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + ctrl->nr_streams = min_t(u16, nssa, BLK_MAX_WRITE_HINTS - 1); dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4ce48203e63f..587d92df118b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -280,7 +280,6 @@ struct nvme_ctrl { u16 crdt[3]; u16 oncs; u16 oacs; - u16 nssa; u16 nr_streams; u16 sqsize; u32 max_namespaces; From fd8099e7918cd2df39ef306dd1d1af7178a15b81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Feb 2022 10:57:15 +0100 Subject: [PATCH 36/76] nvme: cleanup __nvme_check_ids Pass the actual nvme_ns_ids used for the comparison instead of the ns_head that isn't needed and use a more descriptive function name. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9cffc4770e73..076a03b801b7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3673,16 +3673,15 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, return NULL; } -static int __nvme_check_ids(struct nvme_subsystem *subsys, - struct nvme_ns_head *new) +static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys, + struct nvme_ns_ids *ids) { struct nvme_ns_head *h; lockdep_assert_held(&subsys->lock); list_for_each_entry(h, &subsys->nsheads, entry) { - if (nvme_ns_ids_valid(&new->ids) && - nvme_ns_ids_equal(&new->ids, &h->ids)) + if (nvme_ns_ids_valid(ids) && nvme_ns_ids_equal(ids, &h->ids)) return -EINVAL; } @@ -3781,7 +3780,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->ids = *ids; kref_init(&head->ref); - ret = __nvme_check_ids(ctrl->subsys, head); + ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids); if (ret) { dev_err(ctrl->device, "duplicate IDs for nsid %d\n", nsid); From e2724cb9f0c406b8fb66efd3aa9e8b3edfd8d5c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Feb 2022 11:32:58 +0100 Subject: [PATCH 37/76] nvme: fix the check for duplicate unique identifiers nvme_subsys_check_duplicate_ids should needs to return an error if any of the identifiers matches, not just if all of them match. But it does not need to and should not look at the CSI value for this sanity check. Rewrite the logic to be separate from nvme_ns_ids_equal and optimize it by reducing duplicate checks for non-present identifiers. Fixes: ed754e5deeb1 ("nvme: track shared namespaces") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 076a03b801b7..3e6ac985b24f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1716,13 +1716,6 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } -static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) -{ - return !uuid_is_null(&ids->uuid) || - memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || - memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); -} - static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && @@ -3676,12 +3669,21 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys, struct nvme_ns_ids *ids) { + bool has_uuid = !uuid_is_null(&ids->uuid); + bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid)); + bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); struct nvme_ns_head *h; lockdep_assert_held(&subsys->lock); list_for_each_entry(h, &subsys->nsheads, entry) { - if (nvme_ns_ids_valid(ids) && nvme_ns_ids_equal(ids, &h->ids)) + if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid)) + return -EINVAL; + if (has_nguid && + memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0) + return -EINVAL; + if (has_eui64 && + memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0) return -EINVAL; } From e2d77d2e11c4f1e70a1a24cc8fe63ff3dc9b53ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Feb 2022 17:46:50 +0100 Subject: [PATCH 38/76] nvme: check for duplicate identifiers earlier Lift the check for duplicate identifiers into nvme_init_ns_head, which avoids pointless error unwinding in case they don't match, and also matches where we check identifier validity for the multipath case. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3e6ac985b24f..cece987ba169 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3782,13 +3782,6 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->ids = *ids; kref_init(&head->ref); - ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids); - if (ret) { - dev_err(ctrl->device, - "duplicate IDs for nsid %d\n", nsid); - goto out_cleanup_srcu; - } - if (head->ids.csi) { ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); if (ret) @@ -3827,6 +3820,12 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, mutex_lock(&ctrl->subsys->lock); head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { + ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids); + if (ret) { + dev_err(ctrl->device, + "duplicate IDs for nsid %d\n", nsid); + goto out_unlock; + } head = nvme_alloc_ns_head(ctrl, nsid, ids); if (IS_ERR(head)) { ret = PTR_ERR(head); From 2079f41ec6ffaad9aa51ca550105b2228467aec7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Feb 2022 17:48:32 +0100 Subject: [PATCH 39/76] nvme: check that EUI/GUID/UUID are globally unique Add a check to verify that the unique identifiers are unique globally in addition to the existing check that verifies that they are unique inside a single subsystem. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cece987ba169..f8084ded69e5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3810,12 +3810,45 @@ out: return ERR_PTR(ret); } +static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this, + struct nvme_ns_ids *ids) +{ + struct nvme_subsystem *s; + int ret = 0; + + /* + * Note that this check is racy as we try to avoid holding the global + * lock over the whole ns_head creation. But it is only intended as + * a sanity check anyway. + */ + mutex_lock(&nvme_subsystems_lock); + list_for_each_entry(s, &nvme_subsystems, entry) { + if (s == this) + continue; + mutex_lock(&s->lock); + ret = nvme_subsys_check_duplicate_ids(s, ids); + mutex_unlock(&s->lock); + if (ret) + break; + } + mutex_unlock(&nvme_subsystems_lock); + + return ret; +} + static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, struct nvme_ns_ids *ids, bool is_shared) { struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_ns_head *head = NULL; - int ret = 0; + int ret; + + ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids); + if (ret) { + dev_err(ctrl->device, + "globally duplicate IDs for nsid %d\n", nsid); + return ret; + } mutex_lock(&ctrl->subsys->lock); head = nvme_find_ns_head(ctrl->subsys, nsid); @@ -3823,7 +3856,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids); if (ret) { dev_err(ctrl->device, - "duplicate IDs for nsid %d\n", nsid); + "duplicate IDs in subsystem for nsid %d\n", + nsid); goto out_unlock; } head = nvme_alloc_ns_head(ctrl, nsid, ids); From 143a70b8b4300faa92ad82468f65dccd440e7957 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:18:56 +0300 Subject: [PATCH 40/76] iss-simdisk: use bvec_kmap_local in simdisk_submit_bio Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Acked-by: Max Filippov Link: https://lore.kernel.org/r/20220303111905.321089-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/xtensa/platforms/iss/simdisk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 8eb6ad1a3a1d..0f0e0724397f 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -108,13 +108,13 @@ static void simdisk_submit_bio(struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; bio_for_each_segment(bvec, bio, iter) { - char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset; + char *buffer = bvec_kmap_local(&bvec); unsigned len = bvec.bv_len >> SECTOR_SHIFT; simdisk_transfer(dev, sector, len, buffer, bio_data_dir(bio) == WRITE); sector += len; - kunmap_atomic(buffer); + kunmap_local(buffer); } bio_endio(bio); From b7ab4611b6c793100197abc93e069d6f9aab7960 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:18:57 +0300 Subject: [PATCH 41/76] aoe: use bvec_kmap_local in bvcpy Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220303111905.321089-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/aoe/aoecmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index cc11f89a0928..384073ef2323 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1018,9 +1018,9 @@ bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt) iter.bi_size = cnt; __bio_for_each_segment(bv, bio, iter, iter) { - char *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + char *p = bvec_kmap_local(&bv); skb_copy_bits(skb, soff, p, bv.bv_len); - kunmap_atomic(p); + kunmap_local(p); soff += bv.bv_len; } } From b3bd0a8a74ab970cc1cf0849e66bd0906741105b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:18:58 +0300 Subject: [PATCH 42/76] zram: use memcpy_to_bvec in zram_bvec_read Use the proper helper instead of open coding the copy. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a3a5e1e71326..14becdf2815d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1331,12 +1331,10 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, goto out; if (is_partial_io(bvec)) { - void *dst = kmap_atomic(bvec->bv_page); void *src = kmap_atomic(page); - memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len); + memcpy_to_bvec(bvec, src + offset); kunmap_atomic(src); - kunmap_atomic(dst); } out: if (is_partial_io(bvec)) From bd3d3203eb84d08a6daef805efe9316b79d3bf3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:18:59 +0300 Subject: [PATCH 43/76] zram: use memcpy_from_bvec in zram_bvec_write Use memcpy_from_bvec instead of open coding the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 14becdf2815d..e9474b02012d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1465,7 +1465,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, { int ret; struct page *page = NULL; - void *src; struct bio_vec vec; vec = *bvec; @@ -1483,11 +1482,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, if (ret) goto out; - src = kmap_atomic(bvec->bv_page); dst = kmap_atomic(page); - memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len); + memcpy_from_bvec(dst + offset, bvec); kunmap_atomic(dst); - kunmap_atomic(src); vec.bv_page = page; vec.bv_len = PAGE_SIZE; From 20072ec828640b7d23a0cfdbccf0dea48e77ba3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:00 +0300 Subject: [PATCH 44/76] nvdimm-blk: use bvec_kmap_local in nd_blk_rw_integrity Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvdimm/blk.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index c1db43524d75..0a3873833594 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -88,10 +88,9 @@ static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk, */ cur_len = min(len, bv.bv_len); - iobuf = kmap_atomic(bv.bv_page); - err = ndbr->do_io(ndbr, dev_offset, iobuf + bv.bv_offset, - cur_len, rw); - kunmap_atomic(iobuf); + iobuf = bvec_kmap_local(&bv); + err = ndbr->do_io(ndbr, dev_offset, iobuf, cur_len, rw); + kunmap_local(iobuf); if (err) return err; From 3205190655ea56ea5e00815eeff4dab2bde0af80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:01 +0300 Subject: [PATCH 45/76] nvdimm-btt: use bvec_kmap_local in btt_rw_integrity Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvdimm/btt.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index cbd994f7f1fe..9613e54c7a67 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1163,17 +1163,15 @@ static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, */ cur_len = min(len, bv.bv_len); - mem = kmap_atomic(bv.bv_page); + mem = bvec_kmap_local(&bv); if (rw) - ret = arena_write_bytes(arena, meta_nsoff, - mem + bv.bv_offset, cur_len, + ret = arena_write_bytes(arena, meta_nsoff, mem, cur_len, NVDIMM_IO_ATOMIC); else - ret = arena_read_bytes(arena, meta_nsoff, - mem + bv.bv_offset, cur_len, + ret = arena_read_bytes(arena, meta_nsoff, mem, cur_len, NVDIMM_IO_ATOMIC); - kunmap_atomic(mem); + kunmap_local(mem); if (ret) return ret; From 07fee7aba5472d0e65345146a68b4bd1a8b656c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:02 +0300 Subject: [PATCH 46/76] bcache: use bvec_kmap_local in bio_csum Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 6869e010475a..fdd0194f84dd 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -44,10 +44,10 @@ static void bio_csum(struct bio *bio, struct bkey *k) uint64_t csum = 0; bio_for_each_segment(bv, bio, iter) { - void *d = kmap(bv.bv_page) + bv.bv_offset; + void *d = bvec_kmap_local(&bv); csum = crc64_be(csum, d, bv.bv_len); - kunmap(bv.bv_page); + kunmap_local(d); } k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); From 472278508dce25316e806e45778658c3e4b353b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:03 +0300 Subject: [PATCH 47/76] drbd: use bvec_kmap_local in drbd_csum_bio Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_worker.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index a5e04b38006b..1b48c8172a07 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -326,9 +326,9 @@ void drbd_csum_bio(struct crypto_shash *tfm, struct bio *bio, void *digest) bio_for_each_segment(bvec, bio, iter) { u8 *src; - src = kmap_atomic(bvec.bv_page); - crypto_shash_update(desc, src + bvec.bv_offset, bvec.bv_len); - kunmap_atomic(src); + src = bvec_kmap_local(&bvec); + crypto_shash_update(desc, src, bvec.bv_len); + kunmap_local(src); /* REQ_OP_WRITE_SAME has only one segment, * checksum the payload only once. */ From 3eddaa60b8411c135d1c71090dea9b59ff3f2e26 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:04 +0300 Subject: [PATCH 48/76] drbd: use bvec_kmap_local in recv_dless_read Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 04e3ec12d8b4..fa00cf2ea952 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2017,10 +2017,10 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector); bio_for_each_segment(bvec, bio, iter) { - void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; + void *mapped = bvec_kmap_local(&bvec); expect = min_t(int, data_size, bvec.bv_len); err = drbd_recv_all_warn(peer_device->connection, mapped, expect); - kunmap(bvec.bv_page); + kunmap_local(mapped); if (err) return err; data_size -= expect; From 13d4ef0f66b7ee9415e101e213acaf94a0cb28ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:05 +0300 Subject: [PATCH 49/76] floppy: use memcpy_{to,from}_bvec Use the helpers instead of open coding them. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 19c2d0327e15..8c647532e3ce 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2485,11 +2485,9 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) } if (CT(raw_cmd->cmd[COMMAND]) == FD_READ) - memcpy_to_page(bv.bv_page, bv.bv_offset, dma_buffer, - size); + memcpy_to_bvec(&bv, dma_buffer); else - memcpy_from_page(dma_buffer, bv.bv_page, bv.bv_offset, - size); + memcpy_from_bvec(dma_buffer, &bv); remaining -= size; dma_buffer += size; From 7b1002f7cfe581930f63787a0b3de0144e61ed55 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Fri, 7 Jan 2022 16:21:13 +0800 Subject: [PATCH 50/76] bcache: fixup bcache_dev_sectors_dirty_add() multithreaded CPU false sharing When attaching a cached device (a.k.a backing device) to a cache device, bch_sectors_dirty_init() is called to count dirty sectors and stripes (see what bcache_dev_sectors_dirty_add() does) on the cache device. When bcache_dev_sectors_dirty_add() is called, set_bit(stripe, d->full_dirty_stripes) or clear_bit(stripe, d->full_dirty_stripes) operation will always be performed. In full_dirty_stripes, each 1bit represents stripe_size (8192) sectors (512B), so 1bit=4MB (8192*512), and each CPU cache line=64B=512bit=2048MB. When 20 threads process a cached disk with 100G dirty data, a single thread processes about 23M at a time, and 20 threads total 460M. These full_dirty_stripes bits corresponding to the 460M data is likely to fall in the same CPU cache line. When one of these threads performs a set_bit or clear_bit operation, the same CPU cache line of other threads will become invalid and must read the full_dirty_stripes from the main memory again. Compared with single thread, the time of a bcache_dev_sectors_dirty_add() call is increased by about 50 times in our test (100G dirty data, 20 threads, bcache_dev_sectors_dirty_add() is called more than 20 million times). This patch tries to test_bit before set_bit or clear_bit operation. Therefore, a lot of force set and clear operations will be avoided, and most of bcache_dev_sectors_dirty_add() calls will only read CPU cache line. Signed-off-by: Mingzhe Zou Signed-off-by: Coly Li --- drivers/md/bcache/writeback.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d42301e6309d..176461f89f46 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -585,10 +585,13 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, sectors_dirty = atomic_add_return(s, d->stripe_sectors_dirty + stripe); - if (sectors_dirty == d->stripe_size) - set_bit(stripe, d->full_dirty_stripes); - else - clear_bit(stripe, d->full_dirty_stripes); + if (sectors_dirty == d->stripe_size) { + if (!test_bit(stripe, d->full_dirty_stripes)) + set_bit(stripe, d->full_dirty_stripes); + } else { + if (test_bit(stripe, d->full_dirty_stripes)) + clear_bit(stripe, d->full_dirty_stripes); + } nr_sectors -= s; stripe_offset = 0; From 887554ab96588de2917b6c8c73e552da082e5368 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Fri, 11 Feb 2022 14:39:15 +0800 Subject: [PATCH 51/76] bcache: fixup multiple threads crash When multiple threads to check btree nodes in parallel, the main thread wait for all threads to stop or CACHE_SET_IO_DISABLE flag: wait_event_interruptible(check_state->wait, atomic_read(&check_state->started) == 0 || test_bit(CACHE_SET_IO_DISABLE, &c->flags)); However, the bch_btree_node_read and bch_btree_node_read_done maybe call bch_cache_set_error, then the CACHE_SET_IO_DISABLE will be set. If the flag already set, the main thread return error. At the same time, maybe some threads still running and read NULL pointer, the kernel will crash. This patch change the event wait condition, the main thread must wait for all threads to stop. Fixes: 8e7102273f597 ("bcache: make bch_btree_check() to be multithreaded") Signed-off-by: Mingzhe Zou Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Coly Li --- drivers/md/bcache/btree.c | 6 ++++-- drivers/md/bcache/writeback.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 88c573eeb598..ad9f16689419 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2060,9 +2060,11 @@ int bch_btree_check(struct cache_set *c) } } + /* + * Must wait for all threads to stop. + */ wait_event_interruptible(check_state->wait, - atomic_read(&check_state->started) == 0 || - test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + atomic_read(&check_state->started) == 0); for (i = 0; i < check_state->total_threads; i++) { if (check_state->infos[i].result) { diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 176461f89f46..9ee0005874cd 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1001,9 +1001,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) } } + /* + * Must wait for all threads to stop. + */ wait_event_interruptible(state->wait, - atomic_read(&state->started) == 0 || - test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + atomic_read(&state->started) == 0); out: kfree(state); From daae161fd2e568b4f481b177b8be34374df98b68 Mon Sep 17 00:00:00 2001 From: Mariusz Tkaczyk Date: Mon, 17 Jan 2022 12:38:47 +0100 Subject: [PATCH 52/76] md: raid1/raid10: drop pending_cnt Those counters are not necessary after commit 11bb45e8aaf6 ("md: drop queue limitation for RAID1 and RAID10"). Remove them from all code (conf and plug structs). raid1_plug_cb and raid10_plug_cb are identical, so move definition of raid1_plug_cb to common raid1-10 definitions and use it for RAID10 too. Signed-off-by: Mariusz Tkaczyk Signed-off-by: Song Liu --- drivers/md/raid1-10.c | 5 +++++ drivers/md/raid1.c | 11 ----------- drivers/md/raid1.h | 1 - drivers/md/raid10.c | 17 +++-------------- drivers/md/raid10.h | 1 - 5 files changed, 8 insertions(+), 27 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 83f9a4f3d82e..e61f6cad4e08 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -28,6 +28,11 @@ struct resync_pages { struct page *pages[RESYNC_PAGES]; }; +struct raid1_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; +}; + static void rbio_pool_free(void *rbio, void *data) { kfree(rbio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c3288d46948d..0fb4d252034b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -824,7 +824,6 @@ static void flush_pending_writes(struct r1conf *conf) struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); - conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); /* @@ -1167,12 +1166,6 @@ free_pages: bio_put(behind_bio); } -struct raid1_plug_cb { - struct blk_plug_cb cb; - struct bio_list pending; - int pending_cnt; -}; - static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) { struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, @@ -1184,7 +1177,6 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) if (from_schedule || current->bio_list) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); - conf->pending_count += plug->pending_cnt; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_barrier); md_wakeup_thread(mddev->thread); @@ -1588,11 +1580,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, plug = NULL; if (plug) { bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; } else { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); md_wakeup_thread(mddev->thread); } @@ -3058,7 +3048,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_barrier); bio_list_init(&conf->pending_bio_list); - conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; err = -EIO; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index ccf10e59b116..ebb6788820e7 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -87,7 +87,6 @@ struct r1conf { /* queue pending writes to be submitted on unplug */ struct bio_list pending_bio_list; - int pending_count; /* for use when syncing mirrors: * We don't allow both normal IO and resync/recovery IO at diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5dd2e17e1d0e..b369ebb965a9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -861,7 +861,6 @@ static void flush_pending_writes(struct r10conf *conf) struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); - conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); /* @@ -1054,16 +1053,9 @@ static sector_t choose_data_offset(struct r10bio *r10_bio, return rdev->new_data_offset; } -struct raid10_plug_cb { - struct blk_plug_cb cb; - struct bio_list pending; - int pending_cnt; -}; - static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) { - struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, - cb); + struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb); struct mddev *mddev = plug->cb.data; struct r10conf *conf = mddev->private; struct bio *bio; @@ -1071,7 +1063,6 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) if (from_schedule || current->bio_list) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); - conf->pending_count += plug->pending_cnt; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_barrier); md_wakeup_thread(mddev->thread); @@ -1238,7 +1229,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, const unsigned long do_fua = (bio->bi_opf & REQ_FUA); unsigned long flags; struct blk_plug_cb *cb; - struct raid10_plug_cb *plug = NULL; + struct raid1_plug_cb *plug = NULL; struct r10conf *conf = mddev->private; struct md_rdev *rdev; int devnum = r10_bio->devs[n_copy].devnum; @@ -1280,16 +1271,14 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); if (cb) - plug = container_of(cb, struct raid10_plug_cb, cb); + plug = container_of(cb, struct raid1_plug_cb, cb); else plug = NULL; if (plug) { bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; } else { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; spin_unlock_irqrestore(&conf->device_lock, flags); md_wakeup_thread(mddev->thread); } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index c34bb196790e..5c0804d8bb1f 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -75,7 +75,6 @@ struct r10conf { /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; - int pending_count; spinlock_t resync_lock; atomic_t nr_pending; From a5359ddd052860bacf957e65fe819c63e974b3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dirk=20M=C3=BCller?= Date: Tue, 8 Feb 2022 17:50:50 +0100 Subject: [PATCH 53/76] lib/raid6/test: fix multiple definition linking error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC 10+ defaults to -fno-common, which enforces proper declaration of external references using "extern". without this change a link would fail with: lib/raid6/test/algos.c:28: multiple definition of `raid6_call'; lib/raid6/test/test.c:22: first defined here the pq.h header that is included already includes an extern declaration so we can just remove the redundant one here. Cc: Signed-off-by: Dirk Müller Reviewed-by: Paul Menzel Signed-off-by: Song Liu --- lib/raid6/test/test.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index a3cf071941ab..841a55242aba 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -19,7 +19,6 @@ #define NDISKS 16 /* Including P and Q */ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); -struct raid6_calls raid6_call; char *dataptrs[NDISKS]; char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); From 633174a7046ec3b4572bec24ef98e6ee89bce14b Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Tue, 8 Feb 2022 16:21:48 +0100 Subject: [PATCH 54/76] lib/raid6/test/Makefile: Use $(pound) instead of \# for Make 4.3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Buidling raid6test on Ubuntu 21.10 (ppc64le) with GNU Make 4.3 shows the errors below: $ cd lib/raid6/test/ $ make :1:1: error: stray ‘\’ in program :1:2: error: stray ‘#’ in program :1:11: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ \ before ‘<’ token [...] The errors come from the HAS_ALTIVEC test, which fails, and the POWER optimized versions are not built. That’s also reason nobody noticed on the other architectures. GNU Make 4.3 does not remove the backslash anymore. From the 4.3 release announcment: > * WARNING: Backward-incompatibility! > Number signs (#) appearing inside a macro reference or function invocation > no longer introduce comments and should not be escaped with backslashes: > thus a call such as: > foo := $(shell echo '#') > is legal. Previously the number sign needed to be escaped, for example: > foo := $(shell echo '\#') > Now this latter will resolve to "\#". If you want to write makefiles > portable to both versions, assign the number sign to a variable: > H := \# > foo := $(shell echo '$H') > This was claimed to be fixed in 3.81, but wasn't, for some reason. > To detect this change search for 'nocomment' in the .FEATURES variable. So, do the same as commit 9564a8cf422d ("Kbuild: fix # escaping in .cmd files for future Make") and commit 929bef467771 ("bpf: Use $(pound) instead of \# in Makefiles") and define and use a $(pound) variable. Reference for the change in make: https://git.savannah.gnu.org/cgit/make.git/commit/?id=c6966b323811c37acedff05b57 Cc: Matt Brown Signed-off-by: Paul Menzel Signed-off-by: Song Liu --- lib/raid6/test/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index a4c7cd74cff5..4fb7700a741b 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -4,6 +4,8 @@ # from userspace. # +pound := \# + CC = gcc OPTFLAGS = -O2 # Adjust as desired CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) @@ -42,7 +44,7 @@ else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 else - HAS_ALTIVEC := $(shell printf '\#include \nvector int a;\n' |\ + HAS_ALTIVEC := $(shell printf '$(pound)include \nvector int a;\n' |\ gcc -c -x c - >/dev/null && rm ./-.o && echo yes) ifeq ($(HAS_ALTIVEC),yes) CFLAGS += -I../../../arch/powerpc/include From 5b401e4e9ac4c6a62b9f6562d0250c1a56477ff2 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Tue, 8 Feb 2022 16:21:50 +0100 Subject: [PATCH 55/76] lib/raid6: Include for VPERMXOR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Ubuntu 21.10 (ppc64le) building raid6test with gcc (Ubuntu 11.2.0-7ubuntu2) 11.2.0 fails with the error below. gcc -I.. -I ../../../include -g -O2 \ -I../../../arch/powerpc/include -DCONFIG_ALTIVEC \ -c -o vpermxor1.o vpermxor1.c vpermxor1.c: In function ‘raid6_vpermxor1_gen_syndrome_real’: vpermxor1.c:64:29: error: expected string literal before ‘VPERMXOR’ 64 | asm(VPERMXOR(%0,%1,%2,%3):"=v"(wq0):"v"(gf_high), "v"(gf_low), "v"(wq0)); | ^~~~~~~~ make: *** [Makefile:58: vpermxor1.o] Error 1 So, include the header asm/ppc-opcode.h defining this macro also when not building the Linux kernel but only this too. Cc: Matt Brown Signed-off-by: Paul Menzel Signed-off-by: Song Liu --- lib/raid6/vpermxor.uc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/raid6/vpermxor.uc b/lib/raid6/vpermxor.uc index 10475dc423c1..1bfb127fbfe8 100644 --- a/lib/raid6/vpermxor.uc +++ b/lib/raid6/vpermxor.uc @@ -24,9 +24,9 @@ #ifdef CONFIG_ALTIVEC #include +#include #ifdef __KERNEL__ #include -#include #include #endif From 7d959f6e978cbbca90e26a192cc39480e977182f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Mar 2022 15:19:33 -0800 Subject: [PATCH 56/76] md: use msleep() in md_notify_reboot() Calling mdelay(1000) from process context, even while a reboot is in progress, does not make sense. Using msleep() allows other threads to make progress. Signed-off-by: Eric Dumazet Cc: linux-raid@vger.kernel.org Signed-off-by: Song Liu --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f88a9e948f3e..d059e21f3c35 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9582,7 +9582,7 @@ static int md_notify_reboot(struct notifier_block *this, * driver, we do want to have a safe RAID driver ... */ if (need_delay) - mdelay(1000*1); + msleep(1000); return NOTIFY_DONE; } From 9f7c3f837a3768ce8d5a5c34070a47608a4a0209 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 13:25:00 +0200 Subject: [PATCH 57/76] raid5-ppl: fully initialize the bio in ppl_new_iounit We have all the information to pass the bdev and op directly to bio_init, so do that. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-ppl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 3446797fa0ac..f7fdd8292485 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -250,7 +250,8 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, INIT_LIST_HEAD(&io->stripe_list); atomic_set(&io->pending_stripes, 0); atomic_set(&io->pending_flushes, 0); - bio_init(&io->bio, NULL, io->biovec, PPL_IO_INLINE_BVECS, 0); + bio_init(&io->bio, log->rdev->bdev, io->biovec, PPL_IO_INLINE_BVECS, + REQ_OP_WRITE | REQ_FUA); pplhdr = page_address(io->header_page); clear_page(pplhdr); @@ -465,8 +466,6 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio->bi_end_io = ppl_log_endio; - bio->bi_opf = REQ_OP_WRITE | REQ_FUA; - bio_set_dev(bio, log->rdev->bdev); bio->bi_iter.bi_sector = log->next_io_sector; bio_add_page(bio, io->header_page, PAGE_SIZE, 0); bio->bi_write_hint = ppl_conf->write_hint; From 0dd00cba99c352dc9afd62979f350d808c215cb9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 13:25:01 +0200 Subject: [PATCH 58/76] raid5-cache: fully initialize flush_bio when needed Stop using bio_reset and just initialize the bio fully when needed. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 86e2bb89d9c7..8907d18c225a 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1266,6 +1266,8 @@ static void r5l_log_flush_endio(struct bio *bio) r5l_io_run_stripes(io); list_splice_tail_init(&log->flushing_ios, &log->finished_ios); spin_unlock_irqrestore(&log->io_list_lock, flags); + + bio_uninit(bio); } /* @@ -1301,7 +1303,7 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) if (!do_flush) return; - bio_reset(&log->flush_bio, log->rdev->bdev, + bio_init(&log->flush_bio, log->rdev->bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH); log->flush_bio.bi_end_io = r5l_log_flush_endio; submit_bio(&log->flush_bio); @@ -3105,7 +3107,6 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) INIT_LIST_HEAD(&log->io_end_ios); INIT_LIST_HEAD(&log->flushing_ios); INIT_LIST_HEAD(&log->finished_ios); - bio_init(&log->flush_bio, NULL, NULL, 0, 0); log->io_kc = KMEM_CACHE(r5l_io_unit, 0); if (!log->io_kc) From 89f94b64408b576c88e407d87061bb7b9bc2111d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 13:25:02 +0200 Subject: [PATCH 59/76] raid5-cache: statically allocate the recovery ra bio There is no need to preallocate the bio and reset it when use. Just allocate it on-stack and use a bvec places next to the pages used for it. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 8907d18c225a..a7d50ff9020a 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1623,10 +1623,10 @@ struct r5l_recovery_ctx { * just copy data from the pool. */ struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE]; + struct bio_vec ra_bvec[R5L_RECOVERY_PAGE_POOL_SIZE]; sector_t pool_offset; /* offset of first page in the pool */ int total_pages; /* total allocated pages */ int valid_pages; /* pages with valid data */ - struct bio *ra_bio; /* bio to do the read ahead */ }; static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, @@ -1634,11 +1634,6 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, { struct page *page; - ctx->ra_bio = bio_alloc_bioset(NULL, BIO_MAX_VECS, 0, GFP_KERNEL, - &log->bs); - if (!ctx->ra_bio) - return -ENOMEM; - ctx->valid_pages = 0; ctx->total_pages = 0; while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) { @@ -1650,10 +1645,8 @@ static int r5l_recovery_allocate_ra_pool(struct r5l_log *log, ctx->total_pages += 1; } - if (ctx->total_pages == 0) { - bio_put(ctx->ra_bio); + if (ctx->total_pages == 0) return -ENOMEM; - } ctx->pool_offset = 0; return 0; @@ -1666,7 +1659,6 @@ static void r5l_recovery_free_ra_pool(struct r5l_log *log, for (i = 0; i < ctx->total_pages; ++i) put_page(ctx->ra_pool[i]); - bio_put(ctx->ra_bio); } /* @@ -1679,15 +1671,19 @@ static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, struct r5l_recovery_ctx *ctx, sector_t offset) { - bio_reset(ctx->ra_bio, log->rdev->bdev, REQ_OP_READ); - ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset; + struct bio bio; + int ret; + + bio_init(&bio, log->rdev->bdev, ctx->ra_bvec, + R5L_RECOVERY_PAGE_POOL_SIZE, REQ_OP_READ); + bio.bi_iter.bi_sector = log->rdev->data_offset + offset; ctx->valid_pages = 0; ctx->pool_offset = offset; while (ctx->valid_pages < ctx->total_pages) { - bio_add_page(ctx->ra_bio, - ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0); + __bio_add_page(&bio, ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, + 0); ctx->valid_pages += 1; offset = r5l_ring_add(log, offset, BLOCK_SECTORS); @@ -1696,7 +1692,9 @@ static int r5l_recovery_fetch_ra_pool(struct r5l_log *log, break; } - return submit_bio_wait(ctx->ra_bio); + ret = submit_bio_wait(&bio); + bio_uninit(&bio); + return ret; } /* From 03a6b195e8e846f7373bcbeb3ea2a756dfb9ae61 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Feb 2022 13:25:03 +0200 Subject: [PATCH 60/76] raid5: initialize the stripe_head embeeded bios as needed Use bio_init to initialize the bios when needed to the full state instead of a partial initialization plus later setting of dev and op and bio_reset. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8891aaba6596..8bd5f06390ea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1060,6 +1060,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) int i, disks = sh->disks; struct stripe_head *head_sh = sh; struct bio_list pending_bios = BIO_EMPTY_LIST; + struct r5dev *dev; bool should_defer; might_sleep(); @@ -1094,8 +1095,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) op_flags |= REQ_SYNC; again: - bi = &sh->dev[i].req; - rbi = &sh->dev[i].rreq; /* For writing to replacement */ + dev = &sh->dev[i]; + bi = &dev->req; + rbi = &dev->rreq; /* For writing to replacement */ rcu_read_lock(); rrdev = rcu_dereference(conf->disks[i].replacement); @@ -1171,8 +1173,7 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - bio_set_dev(bi, rdev->bdev); - bio_set_op_attrs(bi, op, op_flags); + bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags); bi->bi_end_io = op_is_write(op) ? raid5_end_write_request : raid5_end_read_request; @@ -1238,8 +1239,7 @@ again: set_bit(STRIPE_IO_STARTED, &sh->state); - bio_set_dev(rbi, rrdev->bdev); - bio_set_op_attrs(rbi, op, op_flags); + bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags); BUG_ON(!op_is_write(op)); rbi->bi_end_io = raid5_end_write_request; rbi->bi_private = sh; @@ -2294,7 +2294,6 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, int disks, struct r5conf *conf) { struct stripe_head *sh; - int i; sh = kmem_cache_zalloc(sc, gfp); if (sh) { @@ -2307,12 +2306,6 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, atomic_set(&sh->count, 1); sh->raid_conf = conf; sh->log_start = MaxSector; - for (i = 0; i < disks; i++) { - struct r5dev *dev = &sh->dev[i]; - - bio_init(&dev->req, NULL, &dev->vec, 1, 0); - bio_init(&dev->rreq, NULL, &dev->rvec, 1, 0); - } if (raid5_has_ppl(conf)) { sh->ppl_page = alloc_page(gfp); @@ -2677,7 +2670,6 @@ static void raid5_end_read_request(struct bio * bi) (unsigned long long)sh->sector, i, atomic_read(&sh->count), bi->bi_status); if (i == disks) { - bio_reset(bi, NULL, 0); BUG(); return; } @@ -2785,7 +2777,7 @@ static void raid5_end_read_request(struct bio * bi) } } rdev_dec_pending(rdev, conf->mddev); - bio_reset(bi, NULL, 0); + bio_uninit(bi); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); @@ -2823,7 +2815,6 @@ static void raid5_end_write_request(struct bio *bi) (unsigned long long)sh->sector, i, atomic_read(&sh->count), bi->bi_status); if (i == disks) { - bio_reset(bi, NULL, 0); BUG(); return; } @@ -2860,7 +2851,7 @@ static void raid5_end_write_request(struct bio *bi) if (sh->batch_head && bi->bi_status && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); - bio_reset(bi, NULL, 0); + bio_uninit(bi); if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); From 85d9abcd7331566781b93ff46e4bccd4806ef2b2 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 11 Mar 2022 11:35:27 +0100 Subject: [PATCH 61/76] xen/blkfront: speed up purge_persistent_grants() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit purge_persistent_grants() is scanning the grants list for persistent grants being no longer in use by the backend. When having found such a grant, it will be set to "invalid" and pushed to the tail of the list. Instead of pushing it directly to the end of the list, add it first to a temporary list, avoiding to scan those entries again in the main list traversal. After having finished the scan, append the temporary list to the grant list. Suggested-by: Jan Beulich Signed-off-by: Juergen Gross Reviewed-by: Roger Pau Monné Reviewed-by: Jan Beulich Link: https://lore.kernel.org/r/20220311103527.12931-1-jgross@suse.com Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ccd0dd0c6b83..5d04e480f36a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2521,6 +2521,7 @@ static void purge_persistent_grants(struct blkfront_info *info) for_each_rinfo(info, rinfo, i) { struct grant *gnt_list_entry, *tmp; + LIST_HEAD(grants); spin_lock_irqsave(&rinfo->ring_lock, flags); @@ -2539,9 +2540,11 @@ static void purge_persistent_grants(struct blkfront_info *info) gnttab_end_foreign_access(gnt_list_entry->gref, 0, 0UL); rinfo->persistent_gnts_c--; gnt_list_entry->gref = GRANT_INVALID_REF; - list_add_tail(&gnt_list_entry->node, &rinfo->grants); + list_add_tail(&gnt_list_entry->node, &grants); } + list_splice_tail(&grants, &rinfo->grants); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); } } From 5e6a7d1eb834be23243c48a964ced67ab0b6e9ce Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 9 Mar 2022 14:29:00 +0100 Subject: [PATCH 62/76] nvme-multipath: use vmalloc for ANA log buffer The ANA log buffer can get really large, as it depends on the controller configuration. So to avoid an out-of-memory issue during scanning use kvmalloc() instead of the kmalloc(). Signed-off-by: Hannes Reinecke Tested-by: Daniel Wagner Signed-off-by: Daniel Wagner Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f8bf6606eb2f..3ddc1ba66f0d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -5,6 +5,7 @@ #include #include +#include #include #include "nvme.h" @@ -898,7 +899,7 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) if (ana_log_size > ctrl->ana_log_size) { nvme_mpath_stop(ctrl); nvme_mpath_uninit(ctrl); - ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL); + ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL); if (!ctrl->ana_log_buf) return -ENOMEM; } @@ -915,7 +916,7 @@ out_uninit: void nvme_mpath_uninit(struct nvme_ctrl *ctrl) { - kfree(ctrl->ana_log_buf); + kvfree(ctrl->ana_log_buf); ctrl->ana_log_buf = NULL; ctrl->ana_log_size = 0; } From 8f31dded50d927974a5a696196052ed53f1baf66 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 9 Mar 2022 14:02:28 +0800 Subject: [PATCH 63/76] nvme-multipath: call bio_io_error in nvme_ns_head_submit_bio Use bio_io_error() here since bio_io_error does the same thing. Signed-off-by: Guoqing Jiang Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 3ddc1ba66f0d..d13b81cd6225 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -387,8 +387,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio) } else { dev_warn_ratelimited(dev, "no available path - failing I/O\n"); - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); + bio_io_error(bio); } srcu_read_unlock(&head->srcu, srcu_idx); From 462b8b2d84975758e5b74fe832bfe8145eef403f Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 22 Feb 2022 19:36:56 -0800 Subject: [PATCH 64/76] nvme-tcp: don't initialize ret variable No point in initializing ret variable to 0 in nvme_tcp_start_io_queue() since it gets overwritten by a call to nvme_tcp_start_queue(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 6cbcc8b4daaf..bb03ae5ef11f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1674,7 +1674,7 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) { - int i, ret = 0; + int i, ret; for (i = 1; i < ctrl->queue_count; i++) { ret = nvme_tcp_start_queue(ctrl, i); From a387935c241d2517c22546f1206a77a856a40541 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 22 Feb 2022 19:36:57 -0800 Subject: [PATCH 65/76] nvme-tcp: don't fold the line The call to nvme_tcp_alloc_queue() fits perfectly in one line without exceeding 80 char limit for the line. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index bb03ae5ef11f..babbc14a4b76 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1714,8 +1714,7 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) int i, ret; for (i = 1; i < ctrl->queue_count; i++) { - ret = nvme_tcp_alloc_queue(ctrl, i, - ctrl->sqsize + 1); + ret = nvme_tcp_alloc_queue(ctrl, i, ctrl->sqsize + 1); if (ret) goto out_free_queues; } From 841aee4d75f18fdfb53935080b03de0c65e9b92c Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 15 Feb 2022 18:22:49 -0800 Subject: [PATCH 66/76] nvme-tcp: lockdep: annotate in-kernel sockets Put NVMe/TCP sockets in their own class to avoid some lockdep warnings. Sockets created by nvme-tcp are not exposed to user-space, and will not trigger certain code paths that the general socket API exposes. Lockdep complains about a circular dependency between the socket and filesystem locks, because setsockopt can trigger a page fault with a socket lock held, but nvme-tcp sends requests on the socket while file system locks are held. ====================================================== WARNING: possible circular locking dependency detected 5.15.0-rc3 #1 Not tainted ------------------------------------------------------ fio/1496 is trying to acquire lock: (sk_lock-AF_INET){+.+.}-{0:0}, at: tcp_sendpage+0x23/0x80 but task is already holding lock: (&xfs_dir_ilock_class/5){+.+.}-{3:3}, at: xfs_ilock+0xcf/0x290 [xfs] which lock already depends on the new lock. other info that might help us debug this: chain exists of: sk_lock-AF_INET --> sb_internal --> &xfs_dir_ilock_class/5 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&xfs_dir_ilock_class/5); lock(sb_internal); lock(&xfs_dir_ilock_class/5); lock(sk_lock-AF_INET); *** DEADLOCK *** 6 locks held by fio/1496: #0: (sb_writers#13){.+.+}-{0:0}, at: path_openat+0x9fc/0xa20 #1: (&inode->i_sb->s_type->i_mutex_dir_key){++++}-{3:3}, at: path_openat+0x296/0xa20 #2: (sb_internal){.+.+}-{0:0}, at: xfs_trans_alloc_icreate+0x41/0xd0 [xfs] #3: (&xfs_dir_ilock_class/5){+.+.}-{3:3}, at: xfs_ilock+0xcf/0x290 [xfs] #4: (hctx->srcu){....}-{0:0}, at: hctx_lock+0x51/0xd0 #5: (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0x33e/0x380 [nvme_tcp] This annotation lets lockdep analyze nvme-tcp controlled sockets independently of what the user-space sockets API does. Link: https://lore.kernel.org/linux-nvme/CAHj4cs9MDYLJ+q+2_GXUK9HxFizv2pxUryUR0toX974M040z7g@mail.gmail.com/ Signed-off-by: Chris Leech Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index babbc14a4b76..10fc45d95b86 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -30,6 +30,44 @@ static int so_priority; module_param(so_priority, int, 0644); MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* lockdep can detect a circular dependency of the form + * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock + * because dependencies are tracked for both nvme-tcp and user contexts. Using + * a separate class prevents lockdep from conflating nvme-tcp socket use with + * user-space socket API use. + */ +static struct lock_class_key nvme_tcp_sk_key[2]; +static struct lock_class_key nvme_tcp_slock_key[2]; + +static void nvme_tcp_reclassify_socket(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) + return; + + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME", + &nvme_tcp_slock_key[0], + "sk_lock-AF_INET-NVME", + &nvme_tcp_sk_key[0]); + break; + case AF_INET6: + sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME", + &nvme_tcp_slock_key[1], + "sk_lock-AF_INET6-NVME", + &nvme_tcp_sk_key[1]); + break; + default: + WARN_ON_ONCE(1); + } +} +#else +static void nvme_tcp_reclassify_socket(struct socket *sock) { } +#endif + enum nvme_tcp_send_state { NVME_TCP_SEND_CMD_PDU = 0, NVME_TCP_SEND_H2C_PDU, @@ -1427,6 +1465,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, goto err_destroy_mutex; } + nvme_tcp_reclassify_socket(queue->sock); + /* Single syn retry */ tcp_sock_set_syncnt(queue->sock->sk, 1); From b2fb99e42598ba5d1ac045fb39a2b38fd80dd8d2 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 8 Mar 2022 21:59:25 -0800 Subject: [PATCH 67/76] nvmet-fc: fix kernel-doc warning for nvmet_fc_register_targetport This fixes following kernel-doc warning :- drivers/nvme/target/fc.c:1365: warning: expecting prototype for nvme_fc_register_targetport(). Prototype was for nvmet_fc_register_targetport() instead Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index cb013516784c..56c2bdfd8e3b 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1341,7 +1341,7 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport) } /** - * nvme_fc_register_targetport - transport entry point called by an + * nvmet_fc_register_targetport - transport entry point called by an * LLDD to register the existence of a local * NVME subystem FC port. * @pinfo: pointer to information about the port to be registered From 0acb8231ebae5175260a7da853f525f2730cc095 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 8 Mar 2022 21:59:26 -0800 Subject: [PATCH 68/76] nvmet-fc: fix kernel-doc warning for nvmet_fc_unregister_targetport This fixes following kernel-doc warning:- drivers/nvme/target/fc.c:1619: warning: expecting prototype for nvme_fc_unregister_targetport(). Prototype was for nvmet_fc_unregister_targetport() instead Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 56c2bdfd8e3b..de90001fc5c4 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1604,7 +1604,7 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) } /** - * nvme_fc_unregister_targetport - transport entry point called by an + * nvmet_fc_unregister_targetport - transport entry point called by an * LLDD to deregister/remove a previously * registered a local NVME subsystem FC port. * @target_port: pointer to the (registered) target port that is to be From a8adf0cddc230be98da0e68bea846e28686dcb60 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 8 Mar 2022 21:59:27 -0800 Subject: [PATCH 69/76] nvmet-rdma: fix kernel-doc warning for nvmet_rdma_device_removal This fixes following kernel-doc warning:- drivers/nvme/target/rdma.c:1722: warning: expecting prototype for nvme_rdma_device_removal(). Prototype was for nvmet_rdma_device_removal() instead Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 00656754e331..2446d0918a41 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1703,7 +1703,7 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, } /** - * nvme_rdma_device_removal() - Handle RDMA device removal + * nvmet_rdma_device_removal() - Handle RDMA device removal * @cm_id: rdma_cm id, used for nvmet port * @queue: nvmet rdma queue (cm id qp_context) * From 73d77c53ff342bfa23daedf4475cdd06618e447b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 8 Mar 2022 14:56:57 -0800 Subject: [PATCH 70/76] nvmet: don't fold lines Don't fold line that can fit into 80 char limit. No functional change in this patch. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 091a0ca16361..0f5c680c851a 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -93,10 +93,9 @@ CONFIGFS_ATTR(nvmet_, addr_adrfam); static ssize_t nvmet_addr_portid_show(struct config_item *item, char *page) { - struct nvmet_port *port = to_nvmet_port(item); + __le16 portid = to_nvmet_port(item)->disc_addr.portid; - return snprintf(page, PAGE_SIZE, "%d\n", - le16_to_cpu(port->disc_addr.portid)); + return snprintf(page, PAGE_SIZE, "%d\n", le16_to_cpu(portid)); } static ssize_t nvmet_addr_portid_store(struct config_item *item, @@ -124,8 +123,7 @@ static ssize_t nvmet_addr_traddr_show(struct config_item *item, { struct nvmet_port *port = to_nvmet_port(item); - return snprintf(page, PAGE_SIZE, "%s\n", - port->disc_addr.traddr); + return snprintf(page, PAGE_SIZE, "%s\n", port->disc_addr.traddr); } static ssize_t nvmet_addr_traddr_store(struct config_item *item, @@ -199,8 +197,7 @@ static ssize_t nvmet_addr_trsvcid_show(struct config_item *item, { struct nvmet_port *port = to_nvmet_port(item); - return snprintf(page, PAGE_SIZE, "%s\n", - port->disc_addr.trsvcid); + return snprintf(page, PAGE_SIZE, "%s\n", port->disc_addr.trsvcid); } static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, @@ -1236,8 +1233,7 @@ CONFIGFS_ATTR(nvmet_subsys_, attr_model); static ssize_t nvmet_subsys_attr_discovery_nqn_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "%s\n", - nvmet_disc_subsys->subsysnqn); + return snprintf(page, PAGE_SIZE, "%s\n", nvmet_disc_subsys->subsysnqn); } static ssize_t nvmet_subsys_attr_discovery_nqn_store(struct config_item *item, From 98152eb70fffad910ee7c511b110afe98f162fc5 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 8 Mar 2022 14:56:58 -0800 Subject: [PATCH 71/76] nvmet: use snprintf() with PAGE_SIZE in configfs Instead of using sprintf, use snprintf with buffer size limited to PAGE_SIZE just like what we have for the rest of the file. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 0f5c680c851a..ac801fa1641b 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -60,10 +60,11 @@ static ssize_t nvmet_addr_adrfam_show(struct config_item *item, char *page) for (i = 1; i < ARRAY_SIZE(nvmet_addr_family); i++) { if (nvmet_addr_family[i].type == adrfam) - return sprintf(page, "%s\n", nvmet_addr_family[i].name); + return snprintf(page, PAGE_SIZE, "%s\n", + nvmet_addr_family[i].name); } - return sprintf(page, "\n"); + return snprintf(page, PAGE_SIZE, "\n"); } static ssize_t nvmet_addr_adrfam_store(struct config_item *item, @@ -160,10 +161,11 @@ static ssize_t nvmet_addr_treq_show(struct config_item *item, char *page) for (i = 0; i < ARRAY_SIZE(nvmet_addr_treq); i++) { if (treq == nvmet_addr_treq[i].type) - return sprintf(page, "%s\n", nvmet_addr_treq[i].name); + return snprintf(page, PAGE_SIZE, "%s\n", + nvmet_addr_treq[i].name); } - return sprintf(page, "\n"); + return snprintf(page, PAGE_SIZE, "\n"); } static ssize_t nvmet_addr_treq_store(struct config_item *item, @@ -281,7 +283,8 @@ static ssize_t nvmet_addr_trtype_show(struct config_item *item, for (i = 0; i < ARRAY_SIZE(nvmet_transport); i++) { if (port->disc_addr.trtype == nvmet_transport[i].type) - return sprintf(page, "%s\n", nvmet_transport[i].name); + return snprintf(page, PAGE_SIZE, + "%s\n", nvmet_transport[i].name); } return sprintf(page, "\n"); From da7837339641601f202f27515771dc0646083938 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Mar 2022 08:13:04 +0100 Subject: [PATCH 72/76] nvmet: move the call to nvmet_ns_changed out of nvmet_ns_revalidate nvmet_ns_changed states via lockdep that the ns->subsys->lock must be held. The only caller of nvmet_ns_changed which does not acquire that lock is nvmet_ns_revalidate. nvmet_ns_revalidate has 3 callers, of which 2 do not acquire that lock: nvmet_execute_identify_cns_cs_ns and nvmet_execute_identify_ns. The other caller nvmet_ns_revalidate_size_store does acquire the lock. Move the call to nvmet_ns_changed from nvmet_ns_revalidate to the callers so that they can perform the correct locking as needed. This issue was found using a static type-based analyser and manually verified. Reported-by: Niels Dossche Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/target/admin-cmd.c | 6 +++++- drivers/nvme/target/configfs.c | 3 ++- drivers/nvme/target/core.c | 5 ++--- drivers/nvme/target/nvmet.h | 2 +- drivers/nvme/target/zns.c | 6 +++++- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 6fb24746de06..46d0dab686dd 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -511,7 +511,11 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) goto done; } - nvmet_ns_revalidate(req->ns); + if (nvmet_ns_revalidate(req->ns)) { + mutex_lock(&req->ns->subsys->lock); + nvmet_ns_changed(req->ns->subsys, req->ns->nsid); + mutex_unlock(&req->ns->subsys->lock); + } /* * nuse = ncap = nsze isn't always true, but we have no way to find diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index ac801fa1641b..b650312dd4ae 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -586,7 +586,8 @@ static ssize_t nvmet_ns_revalidate_size_store(struct config_item *item, mutex_unlock(&ns->subsys->lock); return -EINVAL; } - nvmet_ns_revalidate(ns); + if (nvmet_ns_revalidate(ns)) + nvmet_ns_changed(ns->subsys, ns->nsid); mutex_unlock(&ns->subsys->lock); return count; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 724a6d373340..92af37d33ec3 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -531,7 +531,7 @@ static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl, ns->nsid); } -void nvmet_ns_revalidate(struct nvmet_ns *ns) +bool nvmet_ns_revalidate(struct nvmet_ns *ns) { loff_t oldsize = ns->size; @@ -540,8 +540,7 @@ void nvmet_ns_revalidate(struct nvmet_ns *ns) else nvmet_file_ns_revalidate(ns); - if (oldsize != ns->size) - nvmet_ns_changed(ns->subsys, ns->nsid); + return oldsize != ns->size; } int nvmet_ns_enable(struct nvmet_ns *ns) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 69637bf8f8e1..d910c6aad4b6 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -542,7 +542,7 @@ u16 nvmet_file_flush(struct nvmet_req *req); void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid); void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns); void nvmet_file_ns_revalidate(struct nvmet_ns *ns); -void nvmet_ns_revalidate(struct nvmet_ns *ns); +bool nvmet_ns_revalidate(struct nvmet_ns *ns); u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts); bool nvmet_bdev_zns_enable(struct nvmet_ns *ns); diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 3e421217a7ad..e34718b09550 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -123,7 +123,11 @@ void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) goto done; } - nvmet_ns_revalidate(req->ns); + if (nvmet_ns_revalidate(req->ns)) { + mutex_lock(&req->ns->subsys->lock); + nvmet_ns_changed(req->ns->subsys, req->ns->nsid); + mutex_unlock(&req->ns->subsys->lock); + } zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >> req->ns->blksize_shift; id_zns->lbafe[0].zsze = cpu_to_le64(zsze); From b739e137052069c996ab8b3bc9c25be501ecc63b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Mar 2022 12:58:06 +0100 Subject: [PATCH 73/76] nvme: cleanup how disk->disk_name is assigned They way how assigning the disk name and commenting on why it is done is split over core.c and multipath.c seems to be rather confusing. Now that ns_head->disk always exists we can do all the work in core.c and have a single big comment explaining the issues. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 22 ++++++++++++++++++---- drivers/nvme/host/multipath.c | 24 +----------------------- drivers/nvme/host/nvme.h | 8 ++------ 3 files changed, 21 insertions(+), 33 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f8084ded69e5..fd9878671d73 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3967,13 +3967,27 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, goto out_cleanup_disk; /* - * Without the multipath code enabled, multiple controller per - * subsystems are visible as devices and thus we cannot use the - * subsystem instance. + * If multipathing is enabled, the device name for all disks and not + * just those that represent shared namespaces needs to be based on the + * subsystem instance. Using the controller instance for private + * namespaces could lead to naming collisions between shared and private + * namespaces if they don't use a common numbering scheme. + * + * If multipathing is not enabled, disk names must use the controller + * instance as shared namespaces will show up as multiple block + * devices. */ - if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) + if (ns->head->disk) { + sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, + ctrl->instance, ns->head->instance); + disk->flags |= GENHD_FL_HIDDEN; + } else if (multipath) { + sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, + ns->head->instance); + } else { sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); + } if (nvme_update_ns_info(ns, id)) goto out_unlink_ns; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d13b81cd6225..c97d7f843977 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -9,7 +9,7 @@ #include #include "nvme.h" -static bool multipath = true; +bool multipath = true; module_param(multipath, bool, 0444); MODULE_PARM_DESC(multipath, "turn on native support for multiple controllers per subsystem"); @@ -80,28 +80,6 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) blk_freeze_queue_start(h->disk->queue); } -/* - * If multipathing is enabled we need to always use the subsystem instance - * number for numbering our devices to avoid conflicts between subsystems that - * have multiple controllers and thus use the multipath-aware subsystem node - * and those that have a single controller and use the controller node - * directly. - */ -bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags) -{ - if (!multipath) - return false; - if (!ns->head->disk) { - sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance, - ns->head->instance); - return true; - } - sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance, - ns->ctrl->instance, ns->head->instance); - *flags = GENHD_FL_HIDDEN; - return true; -} - void nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 587d92df118b..47badb76c654 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -770,7 +770,6 @@ void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); -bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); @@ -793,20 +792,17 @@ static inline void nvme_trace_bio_complete(struct request *req) trace_block_bio_complete(ns->head->disk->queue, req->bio); } +extern bool multipath; extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute subsys_attr_iopolicy; #else +#define multipath false static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) { return false; } -static inline bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, - int *flags) -{ - return false; -} static inline void nvme_failover_req(struct request *req) { } From e559398f47e090394bbbd9006349c858e1ba80da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Mar 2022 15:53:59 +0100 Subject: [PATCH 74/76] nvme: remove nvme_alloc_request and nvme_alloc_request_qid Just open code the allocation + initialization in the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 46 ++++++++-------------------------- drivers/nvme/host/ioctl.c | 3 ++- drivers/nvme/host/nvme.h | 8 ++++-- drivers/nvme/host/pci.c | 17 +++++++------ drivers/nvme/target/passthru.c | 3 ++- 5 files changed, 31 insertions(+), 46 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fd9878671d73..c868015c1c96 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -639,13 +639,8 @@ static inline void nvme_clear_nvme_request(struct request *req) req->rq_flags |= RQF_DONTPREP; } -static inline unsigned int nvme_req_op(struct nvme_command *cmd) -{ - return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; -} - -static inline void nvme_init_request(struct request *req, - struct nvme_command *cmd) +/* initialize a passthrough request */ +void nvme_init_request(struct request *req, struct nvme_command *cmd) { if (req->q->queuedata) req->timeout = NVME_IO_TIMEOUT; @@ -661,30 +656,7 @@ static inline void nvme_init_request(struct request *req, nvme_clear_nvme_request(req); memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd)); } - -struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags) -{ - struct request *req; - - req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); - if (!IS_ERR(req)) - nvme_init_request(req, cmd); - return req; -} -EXPORT_SYMBOL_GPL(nvme_alloc_request); - -static struct request *nvme_alloc_request_qid(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) -{ - struct request *req; - - req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, - qid ? qid - 1 : 0); - if (!IS_ERR(req)) - nvme_init_request(req, cmd); - return req; -} +EXPORT_SYMBOL_GPL(nvme_init_request); /* * For something we're not in a state to send to the device the default action @@ -1110,11 +1082,14 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int ret; if (qid == NVME_QID_ANY) - req = nvme_alloc_request(q, cmd, flags); + req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); else - req = nvme_alloc_request_qid(q, cmd, flags, qid); + req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, + qid ? qid - 1 : 0); + if (IS_ERR(req)) return PTR_ERR(req); + nvme_init_request(req, cmd); if (timeout) req->timeout = timeout; @@ -1304,14 +1279,15 @@ static void nvme_keep_alive_work(struct work_struct *work) return; } - rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, - BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); + rq = blk_mq_alloc_request(ctrl->admin_q, nvme_req_op(&ctrl->ka_cmd), + BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); if (IS_ERR(rq)) { /* allocation failure, reset the controller */ dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq)); nvme_reset_ctrl(ctrl); return; } + nvme_init_request(rq, &ctrl->ka_cmd); rq->timeout = ctrl->kato * HZ; rq->end_io_data = ctrl; diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index aaf3dfad2657..554566371ffa 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -66,9 +66,10 @@ static int nvme_submit_user_cmd(struct request_queue *q, void *meta = NULL; int ret; - req = nvme_alloc_request(q, cmd, 0); + req = blk_mq_alloc_request(q, nvme_req_op(cmd), 0); if (IS_ERR(req)) return PTR_ERR(req); + nvme_init_request(req, cmd); if (timeout) req->timeout = timeout; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 47badb76c654..1ea908d43e17 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -698,9 +698,13 @@ void nvme_wait_freeze(struct nvme_ctrl *ctrl); int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); +static inline unsigned int nvme_req_op(struct nvme_command *cmd) +{ + return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; +} + #define NVME_QID_ANY -1 -struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags); +void nvme_init_request(struct request *req, struct nvme_command *cmd); void nvme_cleanup_cmd(struct request *req); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req); blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6a99ed680915..9f4f3884fefe 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -424,8 +424,9 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, return 0; } -static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, - unsigned int hctx_idx, unsigned int numa_node) +static int nvme_pci_init_request(struct blk_mq_tag_set *set, + struct request *req, unsigned int hctx_idx, + unsigned int numa_node) { struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1428,12 +1429,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) "I/O %d QID %d timeout, aborting\n", req->tag, nvmeq->qid); - abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, - BLK_MQ_REQ_NOWAIT); + abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd), + BLK_MQ_REQ_NOWAIT); if (IS_ERR(abort_req)) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } + nvme_init_request(abort_req, &cmd); abort_req->end_io_data = NULL; blk_execute_rq_nowait(abort_req, false, abort_endio); @@ -1722,7 +1724,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, .complete = nvme_pci_complete_rq, .init_hctx = nvme_admin_init_hctx, - .init_request = nvme_init_request, + .init_request = nvme_pci_init_request, .timeout = nvme_timeout, }; @@ -1732,7 +1734,7 @@ static const struct blk_mq_ops nvme_mq_ops = { .complete = nvme_pci_complete_rq, .commit_rqs = nvme_commit_rqs, .init_hctx = nvme_init_hctx, - .init_request = nvme_init_request, + .init_request = nvme_pci_init_request, .map_queues = nvme_pci_map_queues, .timeout = nvme_timeout, .poll = nvme_poll, @@ -2475,9 +2477,10 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) cmd.delete_queue.opcode = opcode; cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); + req = blk_mq_alloc_request(q, nvme_req_op(&cmd), BLK_MQ_REQ_NOWAIT); if (IS_ERR(req)) return PTR_ERR(req); + nvme_init_request(req, &cmd); req->end_io_data = nvmeq; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index a810bf569fff..a4de1e0d518b 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -254,11 +254,12 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) timeout = nvmet_req_subsys(req)->admin_timeout; } - rq = nvme_alloc_request(q, req->cmd, 0); + rq = blk_mq_alloc_request(q, nvme_req_op(req->cmd), 0); if (IS_ERR(rq)) { status = NVME_SC_INTERNAL; goto out_put_ns; } + nvme_init_request(rq, req->cmd); if (timeout) rq->timeout = timeout; From ce8d78616a6b637d1b763eb18e32045687a84305 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Mar 2022 13:27:07 +0100 Subject: [PATCH 75/76] nvme: warn about shared namespaces without CONFIG_NVME_MULTIPATH Start warning about exposing a namespace as multiple block devices, and set a fixed deprecation release. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/block/loop.c | 1 + drivers/nvme/host/core.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 8fb89d0624fc..e733c48de2e9 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2092,6 +2092,7 @@ static void loop_remove(struct loop_device *lo) del_gendisk(lo->lo_disk); blk_cleanup_disk(lo->lo_disk); blk_mq_free_tag_set(&lo->tag_set); + mutex_lock(&loop_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); mutex_unlock(&loop_ctl_mutex); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c868015c1c96..3c0461129bca 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3855,6 +3855,14 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, nsid); goto out_put_ns_head; } + + if (!multipath && !list_empty(&head->list)) { + dev_warn(ctrl->device, + "Found shared namespace %d, but multipathing not supported.\n", + nsid); + dev_warn_once(ctrl->device, + "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0\n."); + } } list_add_tail_rcu(&ns->siblings, &head->list); From bcfe9b6cbb4438b8c1cc4bd475221652c8f9301b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 16 Mar 2022 12:20:02 -0700 Subject: [PATCH 76/76] virtio_blk: eliminate anonymous module_init & module_exit Eliminate anonymous module_init() and module_exit(), which can lead to confusion or ambiguity when reading System.map, crashes/oops/bugs, or an initcall_debug log. Give each of these init and exit functions unique driver-specific names to eliminate the anonymous names. Example 1: (System.map) ffffffff832fc78c t init ffffffff832fc79e t init ffffffff832fc8f8 t init Example 2: (initcall_debug log) calling init+0x0/0x12 @ 1 initcall init+0x0/0x12 returned 0 after 15 usecs calling init+0x0/0x60 @ 1 initcall init+0x0/0x60 returned 0 after 2 usecs calling init+0x0/0x9a @ 1 initcall init+0x0/0x9a returned 0 after 74 usecs Fixes: e467cde23818 ("Block driver using virtio.") Signed-off-by: Randy Dunlap Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Paolo Bonzini Cc: Stefan Hajnoczi Cc: virtualization@lists.linux-foundation.org Cc: Jens Axboe Cc: linux-block@vger.kernel.org Reviewed-by: Stefan Hajnoczi Acked-by: Jason Wang Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220316192010.19001-2-rdunlap@infradead.org Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5c636ca7f1a7..fa8abf982c64 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1022,7 +1022,7 @@ static struct virtio_driver virtio_blk = { #endif }; -static int __init init(void) +static int __init virtio_blk_init(void) { int error; @@ -1048,14 +1048,14 @@ out_destroy_workqueue: return error; } -static void __exit fini(void) +static void __exit virtio_blk_fini(void) { unregister_virtio_driver(&virtio_blk); unregister_blkdev(major, "virtblk"); destroy_workqueue(virtblk_wq); } -module_init(init); -module_exit(fini); +module_init(virtio_blk_init); +module_exit(virtio_blk_fini); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio block driver");