From 3bd548e5b819b8c0f2c9085de775c5c7bff9052f Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 16 Sep 2022 16:33:05 -0700 Subject: [PATCH 01/10] drivers/md/md-bitmap: check the return value of md_bitmap_get_counter() Check the return value of md_bitmap_get_counter() in case it returns NULL pointer, which will result in a null pointer dereference. v2: update the check to include other dereference Signed-off-by: Li Zhong Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bf6dffadbe6f..63ece30114e5 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2195,20 +2195,23 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (set) { bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1); - if (*bmc_new == 0) { - /* need to set on-disk bits too. */ - sector_t end = block + new_blocks; - sector_t start = block >> chunkshift; - start <<= chunkshift; - while (start < end) { - md_bitmap_file_set_bit(bitmap, block); - start += 1 << chunkshift; + if (bmc_new) { + if (*bmc_new == 0) { + /* need to set on-disk bits too. */ + sector_t end = block + new_blocks; + sector_t start = block >> chunkshift; + + start <<= chunkshift; + while (start < end) { + md_bitmap_file_set_bit(bitmap, block); + start += 1 << chunkshift; + } + *bmc_new = 2; + md_bitmap_count_page(&bitmap->counts, block, 1); + md_bitmap_set_pending(&bitmap->counts, block); } - *bmc_new = 2; - md_bitmap_count_page(&bitmap->counts, block, 1); - md_bitmap_set_pending(&bitmap->counts, block); + *bmc_new |= NEEDED_MASK; } - *bmc_new |= NEEDED_MASK; if (new_blocks < old_blocks) old_blocks = new_blocks; } From 9487a0f6855c1a28e4e39d41545cd19ed417c015 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 20 Oct 2022 17:51:04 +0200 Subject: [PATCH 02/10] raid5-cache: use try_cmpxchg in r5l_wake_reclaim Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in r5l_wake_reclaim. 86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails. There is no need to re-read the value in the loop. Note that the value from *ptr should be read using READ_ONCE to prevent the compiler from merging, refetching or reordering the read. No functional change intended. Cc: Song Liu Signed-off-by: Uros Bizjak Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 832d8566e165..a63023aae21e 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1565,11 +1565,12 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) if (!log) return; + + target = READ_ONCE(log->reclaim_target); do { - target = log->reclaim_target; if (new < target) return; - } while (cmpxchg(&log->reclaim_target, target, new) != target); + } while (!try_cmpxchg(&log->reclaim_target, &target, new)); md_wakeup_thread(log->reclaim_thread); } From 42271ca389edb0446b9e492858b4c38083b0b9f8 Mon Sep 17 00:00:00 2001 From: Giulio Benetti Date: Wed, 19 Oct 2022 18:04:07 +0200 Subject: [PATCH 03/10] lib/raid6: drop RAID6_USE_EMPTY_ZERO_PAGE RAID6_USE_EMPTY_ZERO_PAGE is unused and hardcoded to 0, so let's drop it. Signed-off-by: Giulio Benetti Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- include/linux/raid/pq.h | 8 -------- lib/raid6/algos.c | 2 -- 2 files changed, 10 deletions(-) diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index d6e5a1feb947..f29aaaf2eb21 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -10,17 +10,9 @@ #ifdef __KERNEL__ -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 #include -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif #else /* ! __KERNEL__ */ /* Used for testing in user space */ diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 39b74221f4a7..a22a05c9af8a 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -18,12 +18,10 @@ #else #include #include -#if !RAID6_USE_EMPTY_ZERO_PAGE /* In .bss so it's zeroed */ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); EXPORT_SYMBOL(raid6_empty_zero_page); #endif -#endif struct raid6_calls raid6_call; EXPORT_SYMBOL_GPL(raid6_call); From 2f6d261e15e8d09a22192aa943c5ed8a792f04ef Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 20 Sep 2022 10:39:37 +0800 Subject: [PATCH 04/10] md: factor out __md_set_array_info() Factor out __md_set_array_info(). No functional change. Signed-off-by: Ye Bin Signed-off-by: Song Liu --- drivers/md/md.c | 65 ++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 30 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a467b492d4ad..b0c3e7d3adbc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7464,6 +7464,40 @@ static inline bool md_ioctl_valid(unsigned int cmd) } } +static int __md_set_array_info(struct mddev *mddev, void __user *argp) +{ + mdu_array_info_t info; + int err; + + if (!argp) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) + return -EFAULT; + + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) + pr_warn("md: couldn't update array info. %d\n", err); + return err; + } + + if (!list_empty(&mddev->disks)) { + pr_warn("md: array %s already has disks!\n", mdname(mddev)); + return -EBUSY; + } + + if (mddev->raid_disks) { + pr_warn("md: array %s already initialised!\n", mdname(mddev)); + return -EBUSY; + } + + err = md_set_array_info(mddev, &info); + if (err) + pr_warn("md: couldn't set array info. %d\n", err); + + return err; +} + static int md_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -7569,36 +7603,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } if (cmd == SET_ARRAY_INFO) { - mdu_array_info_t info; - if (!arg) - memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, argp, sizeof(info))) { - err = -EFAULT; - goto unlock; - } - if (mddev->pers) { - err = update_array_info(mddev, &info); - if (err) { - pr_warn("md: couldn't update array info. %d\n", err); - goto unlock; - } - goto unlock; - } - if (!list_empty(&mddev->disks)) { - pr_warn("md: array %s already has disks!\n", mdname(mddev)); - err = -EBUSY; - goto unlock; - } - if (mddev->raid_disks) { - pr_warn("md: array %s already initialised!\n", mdname(mddev)); - err = -EBUSY; - goto unlock; - } - err = md_set_array_info(mddev, &info); - if (err) { - pr_warn("md: couldn't set array info. %d\n", err); - goto unlock; - } + err = __md_set_array_info(mddev, argp); goto unlock; } From f97a5528b21eb175d90dce2df9960c8d08e1be82 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 20 Sep 2022 10:39:38 +0800 Subject: [PATCH 05/10] md: introduce md_ro_state Introduce md_ro_state for mddev->ro, so it is easy to understand. Signed-off-by: Ye Bin Signed-off-by: Song Liu --- drivers/md/md.c | 152 ++++++++++++++++++++++++++---------------------- 1 file changed, 82 insertions(+), 70 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b0c3e7d3adbc..6f3b2c1cb6cd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -93,6 +93,18 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -444,7 +456,7 @@ static void md_submit_bio(struct bio *bio) bio = bio_split_to_limits(bio); - if (mddev->ro == 1 && unlikely(rw == WRITE)) { + if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; bio_endio(bio); @@ -2639,7 +2651,7 @@ void md_update_sb(struct mddev *mddev, int force_change) int any_badblocks_changed = 0; int ret = -1; - if (mddev->ro) { + if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return; @@ -3901,7 +3913,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) goto out_unlock; } rv = -EROFS; - if (mddev->ro) + if (!md_is_rdwr(mddev)) goto out_unlock; /* request to change the personality. Need to ensure: @@ -4107,7 +4119,7 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; - else if (mddev->ro) + else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_layout = n; @@ -4216,7 +4228,7 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) { if (mddev->pers->check_reshape == NULL) err = -EBUSY; - else if (mddev->ro) + else if (!md_is_rdwr(mddev)) err = -EROFS; else { mddev->new_chunk_sectors = n >> 9; @@ -4339,13 +4351,13 @@ array_state_show(struct mddev *mddev, char *page) if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { switch(mddev->ro) { - case 1: + case MD_RDONLY: st = readonly; break; - case 2: + case MD_AUTO_READ: st = read_auto; break; - case 0: + case MD_RDWR: spin_lock(&mddev->lock); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) st = write_pending; @@ -4381,7 +4393,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) int err = 0; enum array_state st = match_word(buf, array_states); - if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { + if (mddev->pers && (st == active || st == clean) && + mddev->ro != MD_RDONLY) { /* don't take reconfig_mutex when toggling between * clean and active */ @@ -4425,23 +4438,23 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers) err = md_set_readonly(mddev, NULL); else { - mddev->ro = 1; + mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); err = do_md_run(mddev); } break; case read_auto: if (mddev->pers) { - if (mddev->ro == 0) + if (md_is_rdwr(mddev)) err = md_set_readonly(mddev, NULL); - else if (mddev->ro == 1) + else if (mddev->ro == MD_RDONLY) err = restart_array(mddev); if (err == 0) { - mddev->ro = 2; + mddev->ro = MD_AUTO_READ; set_disk_ro(mddev->gendisk, 0); } } else { - mddev->ro = 2; + mddev->ro = MD_AUTO_READ; err = do_md_run(mddev); } break; @@ -4466,7 +4479,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) wake_up(&mddev->sb_wait); err = 0; } else { - mddev->ro = 0; + mddev->ro = MD_RDWR; set_disk_ro(mddev->gendisk, 0); err = do_md_run(mddev); } @@ -4765,7 +4778,7 @@ action_show(struct mddev *mddev, char *page) if (test_bit(MD_RECOVERY_FROZEN, &recovery)) type = "frozen"; else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) type = "reshape"; else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { @@ -4851,11 +4864,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } - if (mddev->ro == 2) { + if (mddev->ro == MD_AUTO_READ) { /* A write to sync_action is enough to justify * canceling read-auto mode */ - mddev->ro = 0; + mddev->ro = MD_RDWR; md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -5083,8 +5096,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len) goto out_unlock; err = -EBUSY; - if (max < mddev->resync_max && - mddev->ro == 0 && + if (max < mddev->resync_max && md_is_rdwr(mddev) && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) goto out_unlock; @@ -5813,8 +5825,8 @@ int md_run(struct mddev *mddev) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev); - if (mddev->ro != 1 && rdev_read_only(rdev)) { - mddev->ro = 1; + if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { + mddev->ro = MD_RDONLY; if (mddev->gendisk) set_disk_ro(mddev->gendisk, 1); } @@ -5917,8 +5929,8 @@ int md_run(struct mddev *mddev) mddev->ok_start_degraded = start_dirty_degraded; - if (start_readonly && mddev->ro == 0) - mddev->ro = 2; /* read-only, but switch on first write */ + if (start_readonly && md_is_rdwr(mddev)) + mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ err = pers->run(mddev); if (err) @@ -5996,8 +6008,8 @@ int md_run(struct mddev *mddev) mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); - } else if (mddev->ro == 2) /* auto-readonly not meaningful */ - mddev->ro = 0; + } else if (mddev->ro == MD_AUTO_READ) + mddev->ro = MD_RDWR; atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); @@ -6015,7 +6027,7 @@ int md_run(struct mddev *mddev) if (rdev->raid_disk >= 0) sysfs_link_rdev(mddev, rdev); /* failure here is OK */ - if (mddev->degraded && !mddev->ro) + if (mddev->degraded && md_is_rdwr(mddev)) /* This ensures that recovering status is reported immediately * via sysfs - until a lack of spares is confirmed. */ @@ -6105,7 +6117,7 @@ static int restart_array(struct mddev *mddev) return -ENXIO; if (!mddev->pers) return -EINVAL; - if (!mddev->ro) + if (md_is_rdwr(mddev)) return -EBUSY; rcu_read_lock(); @@ -6124,7 +6136,7 @@ static int restart_array(struct mddev *mddev) return -EROFS; mddev->safemode = 0; - mddev->ro = 0; + mddev->ro = MD_RDWR; set_disk_ro(disk, 0); pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); /* Kick recovery or resync if necessary */ @@ -6151,7 +6163,7 @@ static void md_clean(struct mddev *mddev) mddev->clevel[0] = 0; mddev->flags = 0; mddev->sb_flags = 0; - mddev->ro = 0; + mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; @@ -6203,7 +6215,7 @@ static void __md_stop_writes(struct mddev *mddev) } md_bitmap_flush(mddev); - if (mddev->ro == 0 && + if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || mddev->sb_flags)) { /* mark array as shutdown cleanly */ @@ -6312,9 +6324,9 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) __md_stop_writes(mddev); err = -ENXIO; - if (mddev->ro==1) + if (mddev->ro == MD_RDONLY) goto out; - mddev->ro = 1; + mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -6371,7 +6383,7 @@ static int do_md_stop(struct mddev *mddev, int mode, return -EBUSY; } if (mddev->pers) { - if (mddev->ro) + if (!md_is_rdwr(mddev)) set_disk_ro(disk, 0); __md_stop_writes(mddev); @@ -6388,8 +6400,8 @@ static int do_md_stop(struct mddev *mddev, int mode, mutex_unlock(&mddev->open_mutex); mddev->changed = 1; - if (mddev->ro) - mddev->ro = 0; + if (!md_is_rdwr(mddev)) + mddev->ro = MD_RDWR; } else mutex_unlock(&mddev->open_mutex); /* @@ -7204,7 +7216,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->sync_thread) return -EBUSY; - if (mddev->ro) + if (!md_is_rdwr(mddev)) return -EROFS; rdev_for_each(rdev, mddev) { @@ -7234,7 +7246,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; - if (mddev->ro) + if (!md_is_rdwr(mddev)) return -EROFS; if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) @@ -7663,26 +7675,25 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * The remaining ioctls are changing the state of the * superblock, so we do not allow them on read-only arrays. */ - if (mddev->ro && mddev->pers) { - if (mddev->ro == 2) { - mddev->ro = 0; - sysfs_notify_dirent_safe(mddev->sysfs_state); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - /* mddev_unlock will wake thread */ - /* If a device failed while we were read-only, we - * need to make sure the metadata is updated now. - */ - if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { - mddev_unlock(mddev); - wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); - mddev_lock_nointr(mddev); - } - } else { + if (!md_is_rdwr(mddev) && mddev->pers) { + if (mddev->ro != MD_AUTO_READ) { err = -EROFS; goto unlock; } + mddev->ro = MD_RDWR; + sysfs_notify_dirent_safe(mddev->sysfs_state); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + /* mddev_unlock will wake thread */ + /* If a device failed while we were read-only, we + * need to make sure the metadata is updated now. + */ + if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { + mddev_unlock(mddev); + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + mddev_lock_nointr(mddev); + } } switch (cmd) { @@ -7768,11 +7779,11 @@ static int md_set_read_only(struct block_device *bdev, bool ro) * Transitioning to read-auto need only happen for arrays that call * md_write_start and which are not ready for writes yet. */ - if (!ro && mddev->ro == 1 && mddev->pers) { + if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { err = restart_array(mddev); if (err) goto out_unlock; - mddev->ro = 2; + mddev->ro = MD_AUTO_READ; } out_unlock: @@ -8246,9 +8257,9 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? "" : "in"); if (mddev->pers) { - if (mddev->ro==1) + if (mddev->ro == MD_RDONLY) seq_printf(seq, " (read-only)"); - if (mddev->ro==2) + if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); seq_printf(seq, " %s", mddev->pers->name); } @@ -8507,10 +8518,10 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (bio_data_dir(bi) != WRITE) return true; - BUG_ON(mddev->ro == 1); - if (mddev->ro == 2) { + BUG_ON(mddev->ro == MD_RDONLY); + if (mddev->ro == MD_AUTO_READ) { /* need to switch to read/write */ - mddev->ro = 0; + mddev->ro = MD_RDWR; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); @@ -8561,7 +8572,7 @@ void md_write_inc(struct mddev *mddev, struct bio *bi) { if (bio_data_dir(bi) != WRITE) return; - WARN_ON_ONCE(mddev->in_sync || mddev->ro); + WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); percpu_ref_get(&mddev->writes_pending); } EXPORT_SYMBOL(md_write_inc); @@ -8666,7 +8677,7 @@ void md_allow_write(struct mddev *mddev) { if (!mddev->pers) return; - if (mddev->ro) + if (!md_is_rdwr(mddev)) return; if (!mddev->pers->sync_request) return; @@ -8714,7 +8725,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) return; - if (mddev->ro) {/* never try to sync a read-only array */ + if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); return; } @@ -9183,9 +9194,9 @@ static int remove_and_add_spares(struct mddev *mddev, if (test_bit(Faulty, &rdev->flags)) continue; if (!test_bit(Journal, &rdev->flags)) { - if (mddev->ro && - ! (rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) + if (!md_is_rdwr(mddev) && + !(rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) continue; rdev->recovery_offset = 0; @@ -9283,7 +9294,8 @@ void md_check_recovery(struct mddev *mddev) flush_signals(current); } - if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + if (!md_is_rdwr(mddev) && + !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( (mddev->sb_flags & ~ (1<external && mddev->safemode == 1) mddev->safemode = 0; - if (mddev->ro) { + if (!md_is_rdwr(mddev)) { struct md_rdev *rdev; if (!mddev->external && mddev->in_sync) /* 'Blocked' flag not needed as failed devices From 4555211190798b6b6fa2c37667d175bf67945c78 Mon Sep 17 00:00:00 2001 From: Florian-Ewald Mueller Date: Tue, 25 Oct 2022 09:37:05 +0200 Subject: [PATCH 06/10] md/bitmap: Fix bitmap chunk size overflow issues - limit bitmap chunk size internal u64 variable to values not overflowing the u32 bitmap superblock structure variable stored on persistent media - assign bitmap chunk size internal u64 variable from unsigned values to avoid possible sign extension artifacts when assigning from a s32 value The bug has been there since at least kernel 4.0. Steps to reproduce it: 1: mdadm -C /dev/mdx -l 1 --bitmap=internal --bitmap-chunk=256M -e 1.2 -n2 /dev/rnbd1 /dev/rnbd2 2 resize member device rnbd1 and rnbd2 to 8 TB 3 mdadm --grow /dev/mdx --size=max The bitmap_chunksize will overflow without patch. Cc: stable@vger.kernel.org Signed-off-by: Florian-Ewald Mueller Signed-off-by: Jack Wang Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 63ece30114e5..e7cc6ba1b657 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -486,7 +486,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap) sb = kmap_atomic(bitmap->storage.sb_page); pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); - pr_debug(" version: %d\n", le32_to_cpu(sb->version)); + pr_debug(" version: %u\n", le32_to_cpu(sb->version)); pr_debug(" uuid: %08x.%08x.%08x.%08x\n", le32_to_cpu(*(__le32 *)(sb->uuid+0)), le32_to_cpu(*(__le32 *)(sb->uuid+4)), @@ -497,11 +497,11 @@ void md_bitmap_print_sb(struct bitmap *bitmap) pr_debug("events cleared: %llu\n", (unsigned long long) le64_to_cpu(sb->events_cleared)); pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); - pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize)); - pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); + pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize)); + pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep)); pr_debug(" sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); - pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind)); + pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind)); kunmap_atomic(sb); } @@ -2105,7 +2105,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, bytes = DIV_ROUND_UP(chunks, 8); if (!bitmap->mddev->bitmap_info.external) bytes += sizeof(bitmap_super_t); - } while (bytes > (space << 9)); + } while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) < + (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1)); } else chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; @@ -2150,7 +2151,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, bitmap->counts.missing_pages = pages; bitmap->counts.chunkshift = chunkshift; bitmap->counts.chunks = chunks; - bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + + bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift + BITMAP_BLOCK_SHIFT); blocks = min(old_counts.chunks << old_counts.chunkshift, @@ -2176,8 +2177,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, bitmap->counts.missing_pages = old_counts.pages; bitmap->counts.chunkshift = old_counts.chunkshift; bitmap->counts.chunks = old_counts.chunks; - bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + - BITMAP_BLOCK_SHIFT); + bitmap->mddev->bitmap_info.chunksize = + 1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT); blocks = old_counts.chunks << old_counts.chunkshift; pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); break; @@ -2537,6 +2538,9 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len) if (csize < 512 || !is_power_of_2(csize)) return -EINVAL; + if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE * + sizeof(((bitmap_super_t *)0)->chunksize)))) + return -EOVERFLOW; mddev->bitmap_info.chunksize = csize; return len; } From 8e1a2279ca2b0485cc379a153d02a9793f74a48f Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Wed, 2 Nov 2022 10:07:30 +0800 Subject: [PATCH 07/10] md/raid0, raid10: Don't set discard sectors for request queue It should use disk_stack_limits to get a proper max_discard_sectors rather than setting a value by stack drivers. And there is a bug. If all member disks are rotational devices, raid0/raid10 set max_discard_sectors. So the member devices are not ssd/nvme, but raid0/raid10 export the wrong value. It reports warning messages in function __blkdev_issue_discard when mkfs.xfs like this: [ 4616.022599] ------------[ cut here ]------------ [ 4616.027779] WARNING: CPU: 4 PID: 99634 at block/blk-lib.c:50 __blkdev_issue_discard+0x16a/0x1a0 [ 4616.140663] RIP: 0010:__blkdev_issue_discard+0x16a/0x1a0 [ 4616.146601] Code: 24 4c 89 20 31 c0 e9 fe fe ff ff c1 e8 09 8d 48 ff 4c 89 f0 4c 09 e8 48 85 c1 0f 84 55 ff ff ff b8 ea ff ff ff e9 df fe ff ff <0f> 0b 48 8d 74 24 08 e8 ea d6 00 00 48 c7 c6 20 1e 89 ab 48 c7 c7 [ 4616.167567] RSP: 0018:ffffaab88cbffca8 EFLAGS: 00010246 [ 4616.173406] RAX: ffff9ba1f9e44678 RBX: 0000000000000000 RCX: ffff9ba1c9792080 [ 4616.181376] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff9ba1c9792080 [ 4616.189345] RBP: 0000000000000cc0 R08: ffffaab88cbffd10 R09: 0000000000000000 [ 4616.197317] R10: 0000000000000012 R11: 0000000000000000 R12: 0000000000000000 [ 4616.205288] R13: 0000000000400000 R14: 0000000000000cc0 R15: ffff9ba1c9792080 [ 4616.213259] FS: 00007f9a5534e980(0000) GS:ffff9ba1b7c80000(0000) knlGS:0000000000000000 [ 4616.222298] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4616.228719] CR2: 000055a390a4c518 CR3: 0000000123e40006 CR4: 00000000001706e0 [ 4616.236689] Call Trace: [ 4616.239428] blkdev_issue_discard+0x52/0xb0 [ 4616.244108] blkdev_common_ioctl+0x43c/0xa00 [ 4616.248883] blkdev_ioctl+0x116/0x280 [ 4616.252977] __x64_sys_ioctl+0x8a/0xc0 [ 4616.257163] do_syscall_64+0x5c/0x90 [ 4616.261164] ? handle_mm_fault+0xc5/0x2a0 [ 4616.265652] ? do_user_addr_fault+0x1d8/0x690 [ 4616.270527] ? do_syscall_64+0x69/0x90 [ 4616.274717] ? exc_page_fault+0x62/0x150 [ 4616.279097] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 4616.284748] RIP: 0033:0x7f9a55398c6b Signed-off-by: Xiao Ni Reported-by: Yi Zhang Reviewed-by: Ming Lei Signed-off-by: Song Liu --- drivers/md/raid0.c | 1 - drivers/md/raid10.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 857c49399c28..b536befd8898 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -398,7 +398,6 @@ static int raid0_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_discard_sectors(mddev->queue, UINT_MAX); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); blk_queue_io_opt(mddev->queue, diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3aa8b6e11d58..9a6503f5cb98 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4145,8 +4145,6 @@ static int raid10_run(struct mddev *mddev) conf->thread = NULL; if (mddev->queue) { - blk_queue_max_discard_sectors(mddev->queue, - UINT_MAX); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); raid10_set_io_opt(conf); From 341097ee53573e06ab9fc675d96a052385b851fa Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 4 Nov 2022 09:53:38 -0400 Subject: [PATCH 08/10] md: fix a crash in mempool_free There's a crash in mempool_free when running the lvm test shell/lvchange-rebuild-raid.sh. The reason for the crash is this: * super_written calls atomic_dec_and_test(&mddev->pending_writes) and wake_up(&mddev->sb_wait). Then it calls rdev_dec_pending(rdev, mddev) and bio_put(bio). * so, the process that waited on sb_wait and that is woken up is racing with bio_put(bio). * if the process wins the race, it calls bioset_exit before bio_put(bio) is executed. * bio_put(bio) attempts to free a bio into a destroyed bio set - causing a crash in mempool_free. We fix this bug by moving bio_put before atomic_dec_and_test. We also move rdev_dec_pending before atomic_dec_and_test as suggested by Neil Brown. The function md_end_flush has a similar bug - we must call bio_put before we decrement the number of in-progress bios. BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 11557f0067 P4D 11557f0067 PUD 0 Oops: 0002 [#1] PREEMPT SMP CPU: 0 PID: 73 Comm: kworker/0:1 Not tainted 6.1.0-rc3 #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Workqueue: kdelayd flush_expired_bios [dm_delay] RIP: 0010:mempool_free+0x47/0x80 Code: 48 89 ef 5b 5d ff e0 f3 c3 48 89 f7 e8 32 45 3f 00 48 63 53 08 48 89 c6 3b 53 04 7d 2d 48 8b 43 10 8d 4a 01 48 89 df 89 4b 08 <48> 89 2c d0 e8 b0 45 3f 00 48 8d 7b 30 5b 5d 31 c9 ba 01 00 00 00 RSP: 0018:ffff88910036bda8 EFLAGS: 00010093 RAX: 0000000000000000 RBX: ffff8891037b65d8 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000202 RDI: ffff8891037b65d8 RBP: ffff8891447ba240 R08: 0000000000012908 R09: 00000000003d0900 R10: 0000000000000000 R11: 0000000000173544 R12: ffff889101a14000 R13: ffff8891562ac300 R14: ffff889102b41440 R15: ffffe8ffffa00d05 FS: 0000000000000000(0000) GS:ffff88942fa00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000001102e99000 CR4: 00000000000006b0 Call Trace: clone_endio+0xf4/0x1c0 [dm_mod] clone_endio+0xf4/0x1c0 [dm_mod] __submit_bio+0x76/0x120 submit_bio_noacct_nocheck+0xb6/0x2a0 flush_expired_bios+0x28/0x2f [dm_delay] process_one_work+0x1b4/0x300 worker_thread+0x45/0x3e0 ? rescuer_thread+0x380/0x380 kthread+0xc2/0x100 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 Modules linked in: brd dm_delay dm_raid dm_mod af_packet uvesafb cfbfillrect cfbimgblt cn cfbcopyarea fb font fbdev tun autofs4 binfmt_misc configfs ipv6 virtio_rng virtio_balloon rng_core virtio_net pcspkr net_failover failover qemu_fw_cfg button mousedev raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq raid6_pq async_xor xor async_tx raid1 raid0 md_mod sd_mod t10_pi crc64_rocksoft crc64 virtio_scsi scsi_mod evdev psmouse bsg scsi_common [last unloaded: brd] CR2: 0000000000000000 ---[ end trace 0000000000000000 ]--- Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Song Liu --- drivers/md/md.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 6f3b2c1cb6cd..f95b86440485 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -521,13 +521,14 @@ static void md_end_flush(struct bio *bio) struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; + bio_put(bio); + rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { /* The pre-request flush has finished */ queue_work(md_wq, &mddev->flush_work); } - bio_put(bio); } static void md_submit_flush_data(struct work_struct *ws); @@ -925,10 +926,12 @@ static void super_written(struct bio *bio) } else clear_bit(LastDev, &rdev->flags); + bio_put(bio); + + rdev_dec_pending(rdev, mddev); + if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); - rdev_dec_pending(rdev, mddev); - bio_put(bio); } void md_super_write(struct mddev *mddev, struct md_rdev *rdev, From ad831a16b08c3f1a1f28a56d2054313d7d521da9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Nov 2022 11:10:37 +0100 Subject: [PATCH 09/10] md/raid5: use bdev_write_cache instead of open coding it Use the bdev_write_cache instead of two equivalent open coded checks. Signed-off-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 5 +---- drivers/md/raid5-ppl.c | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a63023aae21e..46182b955aef 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -3062,7 +3062,6 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { - struct request_queue *q = bdev_get_queue(rdev->bdev); struct r5l_log *log; int ret; @@ -3091,9 +3090,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log) return -ENOMEM; log->rdev = rdev; - - log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; - + log->need_cache_flush = bdev_write_cache(rdev->bdev); log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, sizeof(rdev->mddev->uuid)); diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 31b9157bc9ae..e495939bb3e0 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1301,8 +1301,6 @@ static int ppl_validate_rdev(struct md_rdev *rdev) static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) { - struct request_queue *q; - if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + PPL_HEADER_SIZE) * 2) { log->use_multippl = true; @@ -1316,8 +1314,7 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) } log->next_io_sector = rdev->ppl.sector; - q = bdev_get_queue(rdev->bdev); - if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (bdev_write_cache(rdev->bdev)) log->wb_cache_on = true; } From b611ad14006e5be2170d9e8e611bf49dff288911 Mon Sep 17 00:00:00 2001 From: Jiang Li Date: Mon, 7 Nov 2022 22:16:59 +0800 Subject: [PATCH 10/10] md/raid1: stop mdx_raid1 thread when raid1 array run failed fail run raid1 array when we assemble array with the inactive disk only, but the mdx_raid1 thread were not stop, Even if the associated resources have been released. it will caused a NULL dereference when we do poweroff. This causes the following Oops: [ 287.587787] BUG: kernel NULL pointer dereference, address: 0000000000000070 [ 287.594762] #PF: supervisor read access in kernel mode [ 287.599912] #PF: error_code(0x0000) - not-present page [ 287.605061] PGD 0 P4D 0 [ 287.607612] Oops: 0000 [#1] SMP NOPTI [ 287.611287] CPU: 3 PID: 5265 Comm: md0_raid1 Tainted: G U 5.10.146 #0 [ 287.619029] Hardware name: xxxxxxx/To be filled by O.E.M, BIOS 5.19 06/16/2022 [ 287.626775] RIP: 0010:md_check_recovery+0x57/0x500 [md_mod] [ 287.632357] Code: fe 01 00 00 48 83 bb 10 03 00 00 00 74 08 48 89 ...... [ 287.651118] RSP: 0018:ffffc90000433d78 EFLAGS: 00010202 [ 287.656347] RAX: 0000000000000000 RBX: ffff888105986800 RCX: 0000000000000000 [ 287.663491] RDX: ffffc90000433bb0 RSI: 00000000ffffefff RDI: ffff888105986800 [ 287.670634] RBP: ffffc90000433da0 R08: 0000000000000000 R09: c0000000ffffefff [ 287.677771] R10: 0000000000000001 R11: ffffc90000433ba8 R12: ffff888105986800 [ 287.684907] R13: 0000000000000000 R14: fffffffffffffe00 R15: ffff888100b6b500 [ 287.692052] FS: 0000000000000000(0000) GS:ffff888277f80000(0000) knlGS:0000000000000000 [ 287.700149] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 287.705897] CR2: 0000000000000070 CR3: 000000000320a000 CR4: 0000000000350ee0 [ 287.713033] Call Trace: [ 287.715498] raid1d+0x6c/0xbbb [raid1] [ 287.719256] ? __schedule+0x1ff/0x760 [ 287.722930] ? schedule+0x3b/0xb0 [ 287.726260] ? schedule_timeout+0x1ed/0x290 [ 287.730456] ? __switch_to+0x11f/0x400 [ 287.734219] md_thread+0xe9/0x140 [md_mod] [ 287.738328] ? md_thread+0xe9/0x140 [md_mod] [ 287.742601] ? wait_woken+0x80/0x80 [ 287.746097] ? md_register_thread+0xe0/0xe0 [md_mod] [ 287.751064] kthread+0x11a/0x140 [ 287.754300] ? kthread_park+0x90/0x90 [ 287.757974] ret_from_fork+0x1f/0x30 In fact, when raid1 array run fail, we need to do md_unregister_thread() before raid1_free(). Signed-off-by: Jiang Li Signed-off-by: Song Liu --- drivers/md/raid1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 05d8438cfec8..58f705f42948 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3159,6 +3159,7 @@ static int raid1_run(struct mddev *mddev) * RAID1 needs at least one disk in active */ if (conf->raid_disks - mddev->degraded < 1) { + md_unregister_thread(&conf->thread); ret = -EINVAL; goto abort; }