From 72c215ed8731c88b2d7e09afc51fffc207ae47b8 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Wed, 22 Feb 2023 12:09:59 +0800 Subject: [PATCH 01/15] md/raid10: fix task hung in raid10d commit fe630de009d0 ("md/raid10: avoid deadlock on recovery.") allowed normal io and sync io to exist at the same time. Task hung will occur as below: T1 T2 T3 T4 raid10d handle_read_error allow_barrier conf->nr_pending-- -> 0 //submit sync io raid10_sync_request raise_barrier ->will not be blocked ... //submit to drivers raid10_read_request wait_barrier conf->nr_pending++ -> 1 //retry read fail raid10_end_read_request reschedule_retry add to retry_list conf->nr_queued++ -> 1 //sync io fail end_sync_read __end_sync_read reschedule_retry add to retry_list conf->nr_queued++ -> 2 ... handle_read_error get form retry_list conf->nr_queued-- freeze_array wait nr_pending == nr_queued+1 ->1 ->2 //task hung retry read and sync io will be added to retry_list(nr_queued->2) if they fails. raid10d() called handle_read_error() and hung in freeze_array(). nr_queued will not decrease because raid10d is blocked, nr_pending will not increase because conf->barrier is not released. Fix it by moving allow_barrier() after raid10_read_request(). raise_barrier() will wait for nr_waiting to become 0. Therefore, sync io and regular io will not be issued at the same time. Also remove the check of nr_queued in stop_waiting_barrier. It can be 0 but don't need to be blocking. Remove the check for MD_RECOVERY_RUNNING as the check is redundent. Fixes: fe630de009d0 ("md/raid10: avoid deadlock on recovery.") Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230222041000.3341651-2-linan666@huaweicloud.com --- drivers/md/raid10.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6c66357f92f5..db9ee3b637d6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -995,11 +995,15 @@ static bool stop_waiting_barrier(struct r10conf *conf) (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) return true; - /* move on if recovery thread is blocked by us */ - if (conf->mddev->thread->tsk == current && - test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) && - conf->nr_queued > 0) + /* + * move on if io is issued from raid10d(), nr_pending is not released + * from original io(see handle_read_error()). All raise barrier is + * blocked until this io is done. + */ + if (conf->mddev->thread->tsk == current) { + WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0); return true; + } return false; } @@ -2978,9 +2982,13 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) md_error(mddev, rdev); rdev_dec_pending(rdev, mddev); - allow_barrier(conf); r10_bio->state = 0; raid10_read_request(mddev, r10_bio->master_bio, r10_bio); + /* + * allow_barrier after re-submit to ensure no sync io + * can be issued while regular io pending. + */ + allow_barrier(conf); } static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) From a405c6f0229526160aa3f177f65e20c86fce84c5 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Wed, 22 Feb 2023 12:10:00 +0800 Subject: [PATCH 02/15] md/raid10: fix null-ptr-deref in raid10_sync_request init_resync() inits mempool and sets conf->have_replacemnt at the beginning of sync, close_sync() frees the mempool when sync is completed. After [1] recovery might be skipped and init_resync() is called but close_sync() is not. null-ptr-deref occurs with r10bio->dev[i].repl_bio. The following is one way to reproduce the issue. 1) create a array, wait for resync to complete, mddev->recovery_cp is set to MaxSector. 2) recovery is woken and it is skipped. conf->have_replacement is set to 0 in init_resync(). close_sync() not called. 3) some io errors and rdev A is set to WantReplacement. 4) a new device is added and set to A's replacement. 5) recovery is woken, A have replacement, but conf->have_replacemnt is 0. r10bio->dev[i].repl_bio will not be alloced and null-ptr-deref occurs. Fix it by not calling init_resync() if recovery skipped. [1] commit 7e83ccbecd60 ("md/raid10: Allow skipping recovery when clean arrays are assembled") Fixes: 7e83ccbecd60 ("md/raid10: Allow skipping recovery when clean arrays are assembled") Cc: stable@vger.kernel.org Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230222041000.3341651-3-linan666@huaweicloud.com --- drivers/md/raid10.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index db9ee3b637d6..9e0e7bf524aa 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3297,10 +3297,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t chunk_mask = conf->geo.chunk_mask; int page_idx = 0; - if (!mempool_initialized(&conf->r10buf_pool)) - if (init_resync(conf)) - return 0; - /* * Allow skipping a full rebuild for incremental assembly * of a clean array, like RAID1 does. @@ -3316,6 +3312,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, return mddev->dev_sectors - sector_nr; } + if (!mempool_initialized(&conf->r10buf_pool)) + if (init_resync(conf)) + return 0; + skipped: max_sector = mddev->dev_sectors; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || From 4d72a9de2f002c77267c720ebc5595a17f0d66e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 14 Feb 2023 03:19:22 +0000 Subject: [PATCH 03/15] md: make kobj_type structures constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit ee6d3dd4ed48 ("driver core: make kobj_type constant.") the driver core allows the usage of const struct kobj_type. Take advantage of this to constify the structure definitions to prevent modification at runtime. Signed-off-by: Thomas Weißschuh Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230214-kobj_type-md-v1-1-d6853f707f11@weissschuh.net --- drivers/md/md.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 927a43db5dfb..40dba7c1bdf0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -78,7 +78,7 @@ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); -static struct kobj_type md_ktype; +static const struct kobj_type md_ktype; struct md_cluster_operations *md_cluster_ops; EXPORT_SYMBOL(md_cluster_ops); @@ -3597,7 +3597,7 @@ static const struct sysfs_ops rdev_sysfs_ops = { .show = rdev_attr_show, .store = rdev_attr_store, }; -static struct kobj_type rdev_ktype = { +static const struct kobj_type rdev_ktype = { .release = rdev_free, .sysfs_ops = &rdev_sysfs_ops, .default_groups = rdev_default_groups, @@ -5555,7 +5555,7 @@ static const struct sysfs_ops md_sysfs_ops = { .show = md_attr_show, .store = md_attr_store, }; -static struct kobj_type md_ktype = { +static const struct kobj_type md_ktype = { .release = md_kobj_release, .sysfs_ops = &md_sysfs_ops, .default_groups = md_attr_groups, From dccb8ad615bf042a318d66d284151d4783cd56d6 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Tue, 14 Feb 2023 14:40:13 +0800 Subject: [PATCH 04/15] md/raid10: Fix typo in comment (replacment -> replacement) Replace replacment with replacement. Signed-off-by: Jiangshan Yi Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230214064013.2373851-1-yijiangshan@kylinos.cn --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9e0e7bf524aa..6b39e6c7ada3 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1630,7 +1630,7 @@ static void raid10_end_discard_request(struct bio *bio) /* * raid10_remove_disk uses smp_mb to make sure rdev is set to * replacement before setting replacement to NULL. It can read - * rdev first without barrier protect even replacment is NULL + * rdev first without barrier protect even replacement is NULL */ smp_rmb(); rdev = conf->mirrors[dev].rdev; From 328e17d8d9428b390ab48bdedcaf5106e4b511fe Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Fri, 24 Feb 2023 11:33:21 -0700 Subject: [PATCH 05/15] md: Move sb writer loop to its own function Preparatory patch for optimal I/O size calculation. Move the sb writer loop routine into its own function for clarity. Reviewed-by: Christoph Hellwig Signed-off-by: Jon Derrick Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230224183323.638-2-jonathan.derrick@linux.dev --- drivers/md/md-bitmap.c | 127 +++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 61 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index e7cc6ba1b657..cdc61e65abae 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -209,76 +209,81 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde return NULL; } -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, + struct page *page) { - struct md_rdev *rdev; struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; + loff_t offset = mddev->bitmap_info.offset; + int size = PAGE_SIZE; -restart: - rdev = NULL; - while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - int size = PAGE_SIZE; - loff_t offset = mddev->bitmap_info.offset; + bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + if (page->index == store->file_pages - 1) { + int last_page_size = store->bytes & (PAGE_SIZE - 1); - bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - - if (page->index == store->file_pages-1) { - int last_page_size = store->bytes & (PAGE_SIZE-1); - if (last_page_size == 0) - last_page_size = PAGE_SIZE; - size = roundup(last_page_size, - bdev_logical_block_size(bdev)); - } - /* Just make sure we aren't corrupting data or - * metadata - */ - if (mddev->external) { - /* Bitmap could be anywhere. */ - if (rdev->sb_start + offset + (page->index - * (PAGE_SIZE/512)) - > rdev->data_offset - && - rdev->sb_start + offset - < (rdev->data_offset + mddev->dev_sectors - + (PAGE_SIZE/512))) - goto bad_alignment; - } else if (offset < 0) { - /* DATA BITMAP METADATA */ - if (offset - + (long)(page->index * (PAGE_SIZE/512)) - + size/512 > 0) - /* bitmap runs in to metadata */ - goto bad_alignment; - if (rdev->data_offset + mddev->dev_sectors - > rdev->sb_start + offset) - /* data runs in to bitmap */ - goto bad_alignment; - } else if (rdev->sb_start < rdev->data_offset) { - /* METADATA BITMAP DATA */ - if (rdev->sb_start - + offset - + page->index*(PAGE_SIZE/512) + size/512 - > rdev->data_offset) - /* bitmap runs in to data */ - goto bad_alignment; - } else { - /* DATA METADATA BITMAP - no problems */ - } - md_super_write(mddev, rdev, - rdev->sb_start + offset - + page->index * (PAGE_SIZE/512), - size, - page); + if (last_page_size == 0) + last_page_size = PAGE_SIZE; + size = roundup(last_page_size, + bdev_logical_block_size(bdev)); } - if (wait && md_super_wait(mddev) < 0) - goto restart; - return 0; + /* Just make sure we aren't corrupting data or metadata */ + if (mddev->external) { + /* Bitmap could be anywhere. */ + if (rdev->sb_start + offset + + (page->index * (PAGE_SIZE / SECTOR_SIZE)) + > rdev->data_offset && + rdev->sb_start + offset + < (rdev->data_offset + mddev->dev_sectors + + (PAGE_SIZE / SECTOR_SIZE))) + return -EINVAL; + } else if (offset < 0) { + /* DATA BITMAP METADATA */ + if (offset + + (long)(page->index * (PAGE_SIZE / SECTOR_SIZE)) + + size / SECTOR_SIZE > 0) + /* bitmap runs in to metadata */ + return -EINVAL; - bad_alignment: - return -EINVAL; + if (rdev->data_offset + mddev->dev_sectors + > rdev->sb_start + offset) + /* data runs in to bitmap */ + return -EINVAL; + } else if (rdev->sb_start < rdev->data_offset) { + /* METADATA BITMAP DATA */ + if (rdev->sb_start + offset + + page->index * (PAGE_SIZE / SECTOR_SIZE) + + size / SECTOR_SIZE > rdev->data_offset) + /* bitmap runs in to data */ + return -EINVAL; + } else { + /* DATA METADATA BITMAP - no problems */ + } + + md_super_write(mddev, rdev, + rdev->sb_start + offset + + page->index * (PAGE_SIZE / SECTOR_SIZE), + size, page); + return 0; +} + +static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct md_rdev *rdev; + struct mddev *mddev = bitmap->mddev; + int ret; + + do { + rdev = NULL; + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { + ret = __write_sb_page(rdev, bitmap, page); + if (ret) + return ret; + } + } while (wait && md_super_wait(mddev) < 0); + + return 0; } static void md_bitmap_file_kick(struct bitmap *bitmap); From 10172f200b67aafcb9b8ce7d0575d713c1aafbb7 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Fri, 24 Feb 2023 11:33:22 -0700 Subject: [PATCH 06/15] md: Fix types in sb writer Page->index is a pgoff_t and multiplying could cause overflows on a 32-bit architecture. In the sb writer, this is used to calculate and verify the sector being used, and is multiplied by a sector value. Using sector_t will cast it to a u64 type and is the more appropriate type for the unit. Additionally, the integer size unit is converted to a sector unit in later calculations, and is now corrected to be an unsigned type. Finally, clean up the calculations using variable aliases to improve readabiliy. Reviewed-by: Christoph Hellwig Signed-off-by: Jon Derrick Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230224183323.638-3-jonathan.derrick@linux.dev --- drivers/md/md-bitmap.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index cdc61e65abae..bf250a5e3a86 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -215,12 +215,13 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; - loff_t offset = mddev->bitmap_info.offset; - int size = PAGE_SIZE; + sector_t offset = mddev->bitmap_info.offset; + sector_t ps, sboff, doff; + unsigned int size = PAGE_SIZE; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; if (page->index == store->file_pages - 1) { - int last_page_size = store->bytes & (PAGE_SIZE - 1); + unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) last_page_size = PAGE_SIZE; @@ -228,43 +229,35 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, bdev_logical_block_size(bdev)); } + ps = page->index * PAGE_SIZE / SECTOR_SIZE; + sboff = rdev->sb_start + offset; + doff = rdev->data_offset; + /* Just make sure we aren't corrupting data or metadata */ if (mddev->external) { /* Bitmap could be anywhere. */ - if (rdev->sb_start + offset - + (page->index * (PAGE_SIZE / SECTOR_SIZE)) - > rdev->data_offset && - rdev->sb_start + offset - < (rdev->data_offset + mddev->dev_sectors - + (PAGE_SIZE / SECTOR_SIZE))) + if (sboff + ps > doff && + sboff < (doff + mddev->dev_sectors + PAGE_SIZE / SECTOR_SIZE)) return -EINVAL; } else if (offset < 0) { /* DATA BITMAP METADATA */ - if (offset - + (long)(page->index * (PAGE_SIZE / SECTOR_SIZE)) - + size / SECTOR_SIZE > 0) + if (offset + ps + size / SECTOR_SIZE > 0) /* bitmap runs in to metadata */ return -EINVAL; - if (rdev->data_offset + mddev->dev_sectors - > rdev->sb_start + offset) + if (doff + mddev->dev_sectors > sboff) /* data runs in to bitmap */ return -EINVAL; } else if (rdev->sb_start < rdev->data_offset) { /* METADATA BITMAP DATA */ - if (rdev->sb_start + offset - + page->index * (PAGE_SIZE / SECTOR_SIZE) - + size / SECTOR_SIZE > rdev->data_offset) + if (sboff + ps + size / SECTOR_SIZE > doff) /* bitmap runs in to data */ return -EINVAL; } else { /* DATA METADATA BITMAP - no problems */ } - md_super_write(mddev, rdev, - rdev->sb_start + offset - + page->index * (PAGE_SIZE / SECTOR_SIZE), - size, page); + md_super_write(mddev, rdev, sboff + ps, (int) size, page); return 0; } From 8745faa95611f6331331284925012c0310ae3e2c Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Fri, 24 Feb 2023 11:33:23 -0700 Subject: [PATCH 07/15] md: Use optimal I/O size for last bitmap page If the bitmap space has enough room, size the I/O for the last bitmap page write to the optimal I/O size for the storage device. The expanded write is checked that it won't overrun the data or metadata. The drive this was tested against has higher latencies when there are sub-4k writes due to device-side read-mod-writes of its atomic 4k write unit. This change helps increase performance by sizing the last bitmap page I/O for the device's preferred write unit, if it is given. Example Intel/Solidigm P5520 Raid10, Chunk-size 64M, bitmap-size 57228 bits $ mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/nvme{0,1,2,3}n1 --assume-clean --bitmap=internal --bitmap-chunk=64M $ fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --bs=4k --runtime=60 Without patch: write: IOPS=1676, BW=6708KiB/s (6869kB/s)(393MiB/60001msec); 0 zone resets With patch: write: IOPS=15.7k, BW=61.4MiB/s (64.4MB/s)(3683MiB/60001msec); 0 zone resets Biosnoop: Without patch: Time Process PID Device LBA Size Lat 1.410377 md0_raid10 6900 nvme0n1 W 16 4096 0.02 1.410387 md0_raid10 6900 nvme2n1 W 16 4096 0.02 1.410374 md0_raid10 6900 nvme3n1 W 16 4096 0.01 1.410381 md0_raid10 6900 nvme1n1 W 16 4096 0.02 1.410411 md0_raid10 6900 nvme1n1 W 115346512 4096 0.01 1.410418 md0_raid10 6900 nvme0n1 W 115346512 4096 0.02 1.410915 md0_raid10 6900 nvme2n1 W 24 3584 0.43 <-- 1.410935 md0_raid10 6900 nvme3n1 W 24 3584 0.45 <-- 1.411124 md0_raid10 6900 nvme1n1 W 24 3584 0.64 <-- 1.411147 md0_raid10 6900 nvme0n1 W 24 3584 0.66 <-- 1.411176 md0_raid10 6900 nvme3n1 W 2019022184 4096 0.01 1.411189 md0_raid10 6900 nvme2n1 W 2019022184 4096 0.02 With patch: Time Process PID Device LBA Size Lat 5.747193 md0_raid10 727 nvme0n1 W 16 4096 0.01 5.747192 md0_raid10 727 nvme1n1 W 16 4096 0.02 5.747195 md0_raid10 727 nvme3n1 W 16 4096 0.01 5.747202 md0_raid10 727 nvme2n1 W 16 4096 0.02 5.747229 md0_raid10 727 nvme3n1 W 1196223704 4096 0.02 5.747224 md0_raid10 727 nvme0n1 W 1196223704 4096 0.01 5.747279 md0_raid10 727 nvme0n1 W 24 4096 0.01 <-- 5.747279 md0_raid10 727 nvme1n1 W 24 4096 0.02 <-- 5.747284 md0_raid10 727 nvme3n1 W 24 4096 0.02 <-- 5.747291 md0_raid10 727 nvme2n1 W 24 4096 0.02 <-- 5.747314 md0_raid10 727 nvme2n1 W 2234636712 4096 0.01 5.747317 md0_raid10 727 nvme1n1 W 2234636712 4096 0.02 Reviewed-by: Christoph Hellwig Signed-off-by: Jon Derrick Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230224183323.638-4-jonathan.derrick@linux.dev --- drivers/md/md-bitmap.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bf250a5e3a86..920bb68156d2 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -209,6 +209,28 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde return NULL; } +static unsigned int optimal_io_size(struct block_device *bdev, + unsigned int last_page_size, + unsigned int io_size) +{ + if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev)) + return roundup(last_page_size, bdev_io_opt(bdev)); + return io_size; +} + +static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size, + sector_t start, sector_t boundary) +{ + if (io_size != opt_size && + start + opt_size / SECTOR_SIZE <= boundary) + return opt_size; + if (start + io_size / SECTOR_SIZE <= boundary) + return io_size; + + /* Overflows boundary */ + return 0; +} + static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, struct page *page) { @@ -218,6 +240,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, sector_t offset = mddev->bitmap_info.offset; sector_t ps, sboff, doff; unsigned int size = PAGE_SIZE; + unsigned int opt_size = PAGE_SIZE; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; if (page->index == store->file_pages - 1) { @@ -225,8 +248,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, if (last_page_size == 0) last_page_size = PAGE_SIZE; - size = roundup(last_page_size, - bdev_logical_block_size(bdev)); + size = roundup(last_page_size, bdev_logical_block_size(bdev)); + opt_size = optimal_io_size(bdev, last_page_size, size); } ps = page->index * PAGE_SIZE / SECTOR_SIZE; @@ -241,7 +264,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return -EINVAL; } else if (offset < 0) { /* DATA BITMAP METADATA */ - if (offset + ps + size / SECTOR_SIZE > 0) + size = bitmap_io_size(size, opt_size, offset + ps, 0); + if (size == 0) /* bitmap runs in to metadata */ return -EINVAL; @@ -250,7 +274,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return -EINVAL; } else if (rdev->sb_start < rdev->data_offset) { /* METADATA BITMAP DATA */ - if (sboff + ps + size / SECTOR_SIZE > doff) + size = bitmap_io_size(size, opt_size, sboff + ps, doff); + if (size == 0) /* bitmap runs in to data */ return -EINVAL; } else { From c31fea2f8e2a72c817f318016bbc327095175a9f Mon Sep 17 00:00:00 2001 From: Mariusz Tkaczyk Date: Mon, 6 Mar 2023 14:03:17 +0100 Subject: [PATCH 08/15] md: add error_handlers for raid0 and linear After the commit 9631abdbf406c("md: Set MD_BROKEN for RAID1 and RAID10") MD_BROKEN must be set if array is failed because state_store() checks it. If it is set then -EBUSY is returned to userspace. For raid0 and linear MD_BROKEN is not set by error_handler(). As a result mdadm is unable to trigger clean-up actions. It is a regression. This patch adds appropriate error_handler for raid0 and linear. The error handler sets MD_BROKEN for this device. Reviewed-by: Xiao Ni Signed-off-by: Mariusz Tkaczyk Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230306130317.3418-1-mariusz.tkaczyk@linux.intel.com --- drivers/md/md-linear.c | 14 +++++++++++++- drivers/md/md.c | 3 +++ drivers/md/md.h | 10 ++-------- drivers/md/raid0.c | 14 +++++++++++++- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 6e7797b4e738..4eb72b9dd933 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -223,7 +223,8 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio_sector < start_sector)) goto out_of_bounds; - if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) { + if (unlikely(is_rdev_broken(tmp_dev->rdev))) { + md_error(mddev, tmp_dev->rdev); bio_io_error(bio); return true; } @@ -270,6 +271,16 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev) seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } +static void linear_error(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) { + char *md_name = mdname(mddev); + + pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n", + md_name, rdev->bdev); + } +} + static void linear_quiesce(struct mddev *mddev, int state) { } @@ -286,6 +297,7 @@ static struct md_personality linear_personality = .hot_add_disk = linear_add, .size = linear_size, .quiesce = linear_quiesce, + .error_handler = linear_error, }; static int __init linear_init (void) diff --git a/drivers/md/md.c b/drivers/md/md.c index 40dba7c1bdf0..63722035983d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7974,6 +7974,9 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); + if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) + return; + if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); sysfs_notify_dirent_safe(rdev->sysfs_state); diff --git a/drivers/md/md.h b/drivers/md/md.h index e148e3c83b0d..fd8f260ed5f8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -790,15 +790,9 @@ extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev, struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); -static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type) +static inline bool is_rdev_broken(struct md_rdev *rdev) { - if (!disk_live(rdev->bdev->bd_disk)) { - if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags)) - pr_warn("md: %s: %s array has a missing/failed member\n", - mdname(rdev->mddev), md_type); - return true; - } - return false; + return !disk_live(rdev->bdev->bd_disk); } static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index b536befd8898..f8ee9a95e25d 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -569,8 +569,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) return true; } - if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) { + if (unlikely(is_rdev_broken(tmp_dev))) { bio_io_error(bio); + md_error(mddev, tmp_dev); return true; } @@ -592,6 +593,16 @@ static void raid0_status(struct seq_file *seq, struct mddev *mddev) return; } +static void raid0_error(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) { + char *md_name = mdname(mddev); + + pr_crit("md/raid0%s: Disk failure on %pg detected, failing array.\n", + md_name, rdev->bdev); + } +} + static void *raid0_takeover_raid45(struct mddev *mddev) { struct md_rdev *rdev; @@ -767,6 +778,7 @@ static struct md_personality raid0_personality= .size = raid0_size, .takeover = raid0_takeover, .quiesce = raid0_quiesce, + .error_handler = raid0_error, }; static int __init raid0_init (void) From 6efddf1e32e2a264694766ca485a4f5e04ee82a7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 10 Mar 2023 15:38:51 +0800 Subject: [PATCH 09/15] md: fix soft lockup in status_resync status_resync() will calculate 'curr_resync - recovery_active' to show user a progress bar like following: [============>........] resync = 61.4% 'curr_resync' and 'recovery_active' is updated in md_do_sync(), and status_resync() can read them concurrently, hence it's possible that 'curr_resync - recovery_active' can overflow to a huge number. In this case status_resync() will be stuck in the loop to print a large amount of '=', which will end up soft lockup. Fix the problem by setting 'resync' to MD_RESYNC_ACTIVE in this case, this way resync in progress will be reported to user. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230310073855.1337560-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 63722035983d..122ae28e785c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8032,16 +8032,16 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } else if (resync > max_sectors) { resync = max_sectors; } else { - resync -= atomic_read(&mddev->recovery_active); - if (resync < MD_RESYNC_ACTIVE) { - /* - * Resync has started, but the subtraction has - * yielded one of the special values. Force it - * to active to ensure the status reports an - * active resync. - */ + res = atomic_read(&mddev->recovery_active); + /* + * Resync has started, but the subtraction has overflowed or + * yielded one of the special values. Force it to active to + * ensure the status reports an active resync. + */ + if (resync < res || resync - res < MD_RESYNC_ACTIVE) resync = MD_RESYNC_ACTIVE; - } + else + resync -= res; } if (resync == MD_RESYNC_NONE) { From 9fdfe6d45be2fe1bd07678bfc453e6a627c08223 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 10 Mar 2023 15:38:52 +0800 Subject: [PATCH 10/15] md/raid10: don't BUG_ON() in raise_barrier() If raise_barrier() is called the first time in raid10_sync_request(), which means the first non-normal io is handled, raise_barrier() should wait for all dispatched normal io to be done. This ensures that normal io won't starve. However, BUG_ON() if this is broken is too aggressive. This patch replace BUG_ON() with WARN and fall back to not force. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230310073855.1337560-4-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6b39e6c7ada3..f82a930f7be0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -952,7 +952,9 @@ static void flush_pending_writes(struct r10conf *conf) static void raise_barrier(struct r10conf *conf, int force) { write_seqlock_irq(&conf->resync_lock); - BUG_ON(force && !conf->barrier); + + if (WARN_ON_ONCE(force && !conf->barrier)) + force = false; /* Wait until no block IO is waiting (unless 'force') */ wait_event_barrier(conf, force || !conf->nr_waiting); From 26208a7cffd0c7cbf14237ccd20c7270b3ffeb7e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 10 Mar 2023 15:38:53 +0800 Subject: [PATCH 11/15] md/raid10: fix leak of 'r10bio->remaining' for recovery raid10_sync_request() will add 'r10bio->remaining' for both rdev and replacement rdev. However, if the read io fails, recovery_request_write() returns without issuing the write io, in this case, end_sync_request() is only called once and 'remaining' is leaked, cause an io hang. Fix the problem by decreasing 'remaining' according to if 'bio' and 'repl_bio' is valid. Fixes: 24afd80d99f8 ("md/raid10: handle recovery of replacement devices.") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230310073855.1337560-5-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f82a930f7be0..82f2277d7bcf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2615,11 +2615,22 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int d; - struct bio *wbio, *wbio2; + struct bio *wbio = r10_bio->devs[1].bio; + struct bio *wbio2 = r10_bio->devs[1].repl_bio; + + /* Need to test wbio2->bi_end_io before we call + * submit_bio_noacct as if the former is NULL, + * the latter is free to free wbio2. + */ + if (wbio2 && !wbio2->bi_end_io) + wbio2 = NULL; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { fix_recovery_read_error(r10_bio); - end_sync_request(r10_bio); + if (wbio->bi_end_io) + end_sync_request(r10_bio); + if (wbio2) + end_sync_request(r10_bio); return; } @@ -2628,14 +2639,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) * and submit the write request */ d = r10_bio->devs[1].devnum; - wbio = r10_bio->devs[1].bio; - wbio2 = r10_bio->devs[1].repl_bio; - /* Need to test wbio2->bi_end_io before we call - * submit_bio_noacct as if the former is NULL, - * the latter is free to free wbio2. - */ - if (wbio2 && !wbio2->bi_end_io) - wbio2 = NULL; if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); From c9ac2acde53f5385de185bccf6aaa91cf9ac1541 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 10 Mar 2023 15:38:54 +0800 Subject: [PATCH 12/15] md/raid10: fix memleak for 'conf->bio_split' In the error path of raid10_run(), 'conf' need be freed, however, 'conf->bio_split' is missed and memory will be leaked. Since there are 3 places to free 'conf', factor out a helper to fix the problem. Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230310073855.1337560-6-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 82f2277d7bcf..befa4775c78b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4017,6 +4017,20 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) return nc*fc; } +static void raid10_free_conf(struct r10conf *conf) +{ + if (!conf) + return; + + mempool_exit(&conf->r10bio_pool); + kfree(conf->mirrors); + kfree(conf->mirrors_old); + kfree(conf->mirrors_new); + safe_put_page(conf->tmppage); + bioset_exit(&conf->bio_split); + kfree(conf); +} + static struct r10conf *setup_conf(struct mddev *mddev) { struct r10conf *conf = NULL; @@ -4099,13 +4113,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) return conf; out: - if (conf) { - mempool_exit(&conf->r10bio_pool); - kfree(conf->mirrors); - safe_put_page(conf->tmppage); - bioset_exit(&conf->bio_split); - kfree(conf); - } + raid10_free_conf(conf); return ERR_PTR(err); } @@ -4296,10 +4304,7 @@ static int raid10_run(struct mddev *mddev) out_free_conf: md_unregister_thread(&mddev->thread); - mempool_exit(&conf->r10bio_pool); - safe_put_page(conf->tmppage); - kfree(conf->mirrors); - kfree(conf); + raid10_free_conf(conf); mddev->private = NULL; out: return -EIO; @@ -4307,15 +4312,7 @@ out: static void raid10_free(struct mddev *mddev, void *priv) { - struct r10conf *conf = priv; - - mempool_exit(&conf->r10bio_pool); - safe_put_page(conf->tmppage); - kfree(conf->mirrors); - kfree(conf->mirrors_old); - kfree(conf->mirrors_new); - bioset_exit(&conf->bio_split); - kfree(conf); + raid10_free_conf(priv); } static void raid10_quiesce(struct mddev *mddev, int quiesce) From f0ddb83da3cbbf8a1f9087a642c448ff52ee9abd Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 10 Mar 2023 15:38:55 +0800 Subject: [PATCH 13/15] md/raid10: fix memleak of md thread In raid10_run(), if setup_conf() succeed and raid10_run() failed before setting 'mddev->thread', then in the error path 'conf->thread' is not freed. Fix the problem by setting 'mddev->thread' right after setup_conf(). Fixes: 43a521238aca ("md-cluster: choose correct label when clustered layout is not supported") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230310073855.1337560-7-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index befa4775c78b..375ed73f6c9e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4150,6 +4150,9 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; + mddev->thread = conf->thread; + conf->thread = NULL; + if (mddev_is_clustered(conf->mddev)) { int fc, fo; @@ -4162,9 +4165,6 @@ static int raid10_run(struct mddev *mddev) } } - mddev->thread = conf->thread; - conf->thread = NULL; - if (mddev->queue) { blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); From 7cddb055bfda5f7b0be931e8ea750fc28bc18a27 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 14 Mar 2023 09:22:58 +0800 Subject: [PATCH 14/15] md/raid10: don't call bio_start_io_acct twice for bio which experienced read error handle_read_error() will resumit r10_bio by raid10_read_request(), which will call bio_start_io_acct() again, while bio_end_io_acct() will only be called once. Fix the problem by don't account io again from handle_read_error(). Fixes: 528bc2cf2fcc ("md/raid10: enable io accounting") Suggested-by: Song Liu Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230314012258.2395894-1-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 375ed73f6c9e..4fcfcb350d2b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1250,7 +1250,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + if (!r10_bio->start_time && + blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) r10_bio->start_time = bio_start_io_acct(bio); read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); @@ -1580,6 +1581,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; r10_bio->read_slot = -1; + r10_bio->start_time = 0; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); From 7bc436121e557b1f5bebf5ad67e7ed3614d6df92 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 27 Mar 2023 09:23:24 -0400 Subject: [PATCH 15/15] md/raid5: remove unused working_disks variable clang with W=1 reports drivers/md/raid5.c:7719:6: error: variable 'working_disks' set but not used [-Werror,-Wunused-but-set-variable] int working_disks = 0; ^ This variable is not used so remove it. Signed-off-by: Tom Rix Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230327132324.1769595-1-trix@redhat.com --- drivers/md/raid5.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7b820b81d8c2..812a12e3e41a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7716,7 +7716,6 @@ static void raid5_set_io_opt(struct r5conf *conf) static int raid5_run(struct mddev *mddev) { struct r5conf *conf; - int working_disks = 0; int dirty_parity_disks = 0; struct md_rdev *rdev; struct md_rdev *journal_dev = NULL; @@ -7912,10 +7911,8 @@ static int raid5_run(struct mddev *mddev) pr_warn("md: cannot handle concurrent replacement and reshape.\n"); goto abort; } - if (test_bit(In_sync, &rdev->flags)) { - working_disks++; + if (test_bit(In_sync, &rdev->flags)) continue; - } /* This disc is not fully in-sync. However if it * just stored parity (beyond the recovery_offset), * when we don't need to be concerned about the