From d6e035aad6c09991da1c667fb83419329a3baed8 Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Wed, 8 Nov 2023 10:22:15 -0800
Subject: [PATCH 1/9] md: bypass block throttle for superblock update

commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
introduced a hang and will be reverted in the next patch. The issue that
commit was fixing is caused by the md superblock write being throttled by
wbt; fix it instead by having the superblock write bypass the block layer
throttle.

Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
Cc: stable@vger.kernel.org # v5.19+
Suggested-by: Yu Kuai
Signed-off-by: Junxiao Bi
Reviewed-by: Logan Gunthorpe
Reviewed-by: Yu Kuai
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20231108182216.73611-1-junxiao.bi@oracle.com
---
 drivers/md/md.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c94373d64f2c..466bbcb4e230 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1013,9 +1013,10 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 		return;
 
 	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
-			       1,
-			       REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
-			       GFP_NOIO, &mddev->sync_set);
+			       1,
+			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
+			       | REQ_PREFLUSH | REQ_FUA,
+			       GFP_NOIO, &mddev->sync_set);
 
 	atomic_inc(&rdev->nr_pending);

From bed9e27baf52a09b7ba2a3714f1e24e17ced386d Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Wed, 8 Nov 2023 10:22:16 -0800
Subject: [PATCH 2/9] Revert "md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d"

This reverts commit 5e2cf333b7bd5d3e62595a44d598a254c697cd74.

That commit introduced the following race, which can cause a system hang.

 md_write_start:             raid5d:
 // mddev->in_sync == 1
 set "MD_SB_CHANGE_PENDING"
                             // running before md_write_start wakes it up
                             waiting "MD_SB_CHANGE_PENDING" cleared
                             >>>>>>>>> hung
 wakeup mddev->thread
 ...
 waiting "MD_SB_CHANGE_PENDING" cleared
 >>>> hung, raid5d should clear this flag but gets hung on the same flag.

The issue that the reverted commit was fixing is addressed in a new way by
the previous patch.

Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
Cc: stable@vger.kernel.org # v5.19+
Signed-off-by: Junxiao Bi
Reviewed-by: Yu Kuai
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20231108182216.73611-2-junxiao.bi@oracle.com
---
 drivers/md/raid5.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc031d42f53b..fcc8a44dd4fd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -36,7 +36,6 @@
  */
 
 #include
-#include
 #include
 #include
 #include
@@ -6820,18 +6819,7 @@ static void raid5d(struct md_thread *thread)
 			spin_unlock_irq(&conf->device_lock);
 			md_check_recovery(mddev);
 			spin_lock_irq(&conf->device_lock);
-
-			/*
-			 * Waiting on MD_SB_CHANGE_PENDING below may deadlock
-			 * seeing md_check_recovery() is needed to clear
-			 * the flag when using mdmon.
- */ - continue; } - - wait_event_lock_irq(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), - conf->device_lock); } pr_debug("%d stripes handled\n", handled); From c891f1fd90e66e584bb1353e1859cef7c9eb36f8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 25 Nov 2023 16:16:00 +0800 Subject: [PATCH 3/9] md: remove flag RemoveSynchronized rcu is not used correctly here, because synchronize_rcu() is called before replacing old value, for example: remove_and_add_spares // other path synchronize_rcu // called before replacing old value set_bit(RemoveSynchronized) rcu_read_lock() rdev = conf->mirros[].rdev pers->hot_remove_disk conf->mirros[].rdev = NULL; if (!test_bit(RemoveSynchronized)) synchronize_rcu /* * won't be called, and won't wait * for concurrent readers to be done. */ // access rdev after remove_and_add_spares() rcu_read_unlock() Fortunately, there is a separate rcu protection to prevent such rdev to be freed: md_kick_rdev_from_array //other path rcu_read_lock() rdev = conf->mirros[].rdev list_del_rcu(&rdev->same_set) rcu_read_unlock() /* * rdev can be removed from conf, but * rdev won't be freed. */ synchronize_rcu() free rdev Hence remove this useless flag and prepare to remove rcu protection to access rdev from 'conf'. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231125081604.3939938-2-yukuai1@huaweicloud.com --- drivers/md/md-multipath.c | 9 --------- drivers/md/md.c | 37 ++++++------------------------------- drivers/md/md.h | 5 ----- drivers/md/raid1.c | 9 --------- drivers/md/raid10.c | 9 --------- drivers/md/raid5.c | 9 --------- 6 files changed, 6 insertions(+), 72 deletions(-) diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index d22276870283..aa77133f3188 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -258,15 +258,6 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } - } err = md_integrity_register(mddev); } abort: diff --git a/drivers/md/md.c b/drivers/md/md.c index 466bbcb4e230..71b3397dea47 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9244,46 +9244,21 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *rdev; int spares = 0; int removed = 0; - bool remove_some = false; if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) /* Mustn't remove devices when resync thread is running */ return 0; rdev_for_each(rdev, mddev) { - if ((this == NULL || rdev == this) && - rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - test_bit(Faulty, &rdev->flags) && - atomic_read(&rdev->nr_pending)==0) { - /* Faulty non-Blocked devices with nr_pending == 0 - * never get nr_pending incremented, - * never get Faulty cleared, and never get Blocked set. 
- * So we can synchronize_rcu now rather than once per device - */ - remove_some = true; - set_bit(RemoveSynchronized, &rdev->flags); + if ((this == NULL || rdev == this) && rdev_removeable(rdev) && + !mddev->pers->hot_remove_disk(mddev, rdev)) { + sysfs_unlink_rdev(mddev, rdev); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + removed++; } } - if (remove_some) - synchronize_rcu(); - rdev_for_each(rdev, mddev) { - if ((this == NULL || rdev == this) && - (test_bit(RemoveSynchronized, &rdev->flags) || - rdev_removeable(rdev))) { - if (mddev->pers->hot_remove_disk( - mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->saved_raid_disk = rdev->raid_disk; - rdev->raid_disk = -1; - removed++; - } - } - if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) - clear_bit(RemoveSynchronized, &rdev->flags); - } - if (removed && mddev->kobj.sd) sysfs_notify_dirent_safe(mddev->sysfs_degraded); diff --git a/drivers/md/md.h b/drivers/md/md.h index ade83af123a2..8d881cc59799 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -190,11 +190,6 @@ enum flag_bits { * than other devices in the array */ ClusterRemove, - RemoveSynchronized, /* synchronize_rcu() was called after - * this device was known to be faulty, - * so it is safe to remove without - * another synchronize_rcu() call. - */ ExternalBbl, /* External metadata provides bad * block management for a disk */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 35d12948e0a9..a678e0e6e102 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1863,15 +1863,6 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } - } if (conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. 
We drain all IO before diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a5927e98dc67..132a79523338 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2247,15 +2247,6 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - goto abort; - } - } if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index fcc8a44dd4fd..d431e4625cc5 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8233,15 +8233,6 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - lockdep_assert_held(&mddev->reconfig_mutex); - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - rcu_assign_pointer(*rdevp, rdev); - } - } if (!err) { err = log_modify(conf, rdev, false); if (err) From a448af25becf4b555660b5ba2618c7ed3c4de6da Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 25 Nov 2023 16:16:01 +0800 Subject: [PATCH 4/9] md/raid10: remove rcu protection to access rdev from conf Because it's safe to accees rdev from conf: - If any spinlock is held, because synchronize_rcu() from md_kick_rdev_from_array() will prevent 'rdev' to be freed until spinlock is released; - If 'reconfig_lock' is held, because rdev can't be added or removed from array; - If there is normal IO inflight, because mddev_suspend() will prevent rdev to be added or removed from array; - If there is sync IO inflight, because 'MD_RECOVERY_RUNNING' is checked in remove_and_add_spares(). And these will cover all the scenarios in raid10. This patch also cleanup the code to handle the case that replacement replace rdev while IO is still inflight. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231125081604.3939938-3-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 213 ++++++++++++-------------------------------- 1 file changed, 58 insertions(+), 155 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 132a79523338..375c11d6159f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -743,7 +743,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, struct geom *geo = &conf->geo; raid10_find_phys(conf, r10_bio); - rcu_read_lock(); best_dist_slot = -1; min_pending = UINT_MAX; best_dist_rdev = NULL; @@ -775,18 +774,11 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (r10_bio->devs[slot].bio == IO_BLOCKED) continue; disk = r10_bio->devs[slot].devnum; - rdev = rcu_dereference(conf->mirrors[disk].replacement); + rdev = conf->mirrors[disk].replacement; if (rdev == NULL || test_bit(Faulty, &rdev->flags) || r10_bio->devs[slot].addr + sectors > - rdev->recovery_offset) { - /* - * Read replacement first to prevent reading both rdev - * and replacement as NULL during replacement replace - * rdev. 
- */ - smp_mb(); - rdev = rcu_dereference(conf->mirrors[disk].rdev); - } + rdev->recovery_offset) + rdev = conf->mirrors[disk].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; @@ -876,7 +868,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, r10_bio->read_slot = slot; } else rdev = NULL; - rcu_read_unlock(); *max_sectors = best_good_sectors; return rdev; @@ -1198,9 +1189,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, */ gfp = GFP_NOIO | __GFP_HIGH; - rcu_read_lock(); disk = r10_bio->devs[slot].devnum; - err_rdev = rcu_dereference(conf->mirrors[disk].rdev); + err_rdev = conf->mirrors[disk].rdev; if (err_rdev) snprintf(b, sizeof(b), "%pg", err_rdev->bdev); else { @@ -1208,7 +1198,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, /* This never gets dereferenced */ err_rdev = r10_bio->devs[slot].rdev; } - rcu_read_unlock(); } if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) @@ -1279,15 +1268,8 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, int devnum = r10_bio->devs[n_copy].devnum; struct bio *mbio; - if (replacement) { - rdev = conf->mirrors[devnum].replacement; - if (rdev == NULL) { - /* Replacement just got moved to main 'rdev' */ - smp_mb(); - rdev = conf->mirrors[devnum].rdev; - } - } else - rdev = conf->mirrors[devnum].rdev; + rdev = replacement ? conf->mirrors[devnum].replacement : + conf->mirrors[devnum].rdev; mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); if (replacement) @@ -1321,25 +1303,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } -static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, - struct md_rdev **prrdev) -{ - struct md_rdev *rdev, *rrdev; - - rrdev = rcu_dereference(mirror->replacement); - /* - * Read replacement first to prevent reading both rdev and - * replacement as NULL during replacement replace rdev. 
- */ - smp_mb(); - rdev = rcu_dereference(mirror->rdev); - if (rdev == rrdev) - rrdev = NULL; - - *prrdev = rrdev; - return rdev; -} - static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) { int i; @@ -1348,11 +1311,11 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) retry_wait: blocked_rdev = NULL; - rcu_read_lock(); for (i = 0; i < conf->copies; i++) { struct md_rdev *rdev, *rrdev; - rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); + rdev = conf->mirrors[i].rdev; + rrdev = conf->mirrors[i].replacement; if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; @@ -1391,7 +1354,6 @@ retry_wait: } } } - rcu_read_unlock(); if (unlikely(blocked_rdev)) { /* Have to wait for this device to get unblocked, then retry */ @@ -1474,14 +1436,14 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, wait_blocked_dev(mddev, r10_bio); - rcu_read_lock(); max_sectors = r10_bio->sectors; for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; struct md_rdev *rdev, *rrdev; - rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); + rdev = conf->mirrors[d].rdev; + rrdev = conf->mirrors[d].replacement; if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1535,7 +1497,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&rrdev->nr_pending); } } - rcu_read_unlock(); if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; @@ -1625,17 +1586,8 @@ static void raid10_end_discard_request(struct bio *bio) set_bit(R10BIO_Uptodate, &r10_bio->state); dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); - if (repl) - rdev = conf->mirrors[dev].replacement; - if (!rdev) { - /* - * raid10_remove_disk uses smp_mb to make sure rdev is set to - * replacement before setting replacement to NULL. It can read - * rdev first without barrier protect even replacement is NULL - */ - smp_rmb(); - rdev = conf->mirrors[dev].rdev; - } + rdev = repl ? conf->mirrors[dev].replacement : + conf->mirrors[dev].rdev; raid_end_discard_bio(r10_bio); rdev_dec_pending(rdev, conf->mddev); @@ -1785,11 +1737,11 @@ retry_discard: * inc refcount on their rdev. 
Record them by setting * bios[x] to bio */ - rcu_read_lock(); for (disk = 0; disk < geo->raid_disks; disk++) { struct md_rdev *rdev, *rrdev; - rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); + rdev = conf->mirrors[disk].rdev; + rrdev = conf->mirrors[disk].replacement; r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].repl_bio = NULL; @@ -1809,7 +1761,6 @@ retry_discard: atomic_inc(&rrdev->nr_pending); } } - rcu_read_unlock(); atomic_set(&r10_bio->remaining, 1); for (disk = 0; disk < geo->raid_disks; disk++) { @@ -1939,6 +1890,8 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev) struct r10conf *conf = mddev->private; int i; + lockdep_assert_held(&mddev->lock); + if (conf->geo.near_copies < conf->geo.raid_disks) seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); if (conf->geo.near_copies > 1) @@ -1953,12 +1906,11 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev) } seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, conf->geo.raid_disks - mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->geo.raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev); + seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); } - rcu_read_unlock(); seq_printf(seq, "]"); } @@ -1980,7 +1932,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore) ncopies = conf->geo.near_copies; } - rcu_read_lock(); do { int n = conf->copies; int cnt = 0; @@ -1988,7 +1939,7 @@ static int _enough(struct r10conf *conf, int previous, int ignore) while (n--) { struct md_rdev *rdev; if (this != ignore && - (rdev = rcu_dereference(conf->mirrors[this].rdev)) && + (rdev = conf->mirrors[this].rdev) && test_bit(In_sync, &rdev->flags)) cnt++; this = (this+1) % disks; @@ -1999,7 +1950,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore) } while (first != 0); has_enough = 1; out: - rcu_read_unlock(); return has_enough; } @@ -2072,8 +2022,7 @@ static void print_conf(struct r10conf *conf) pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, conf->geo.raid_disks); - /* This is only called with ->reconfix_mutex held, so - * rcu protection of rdev is not needed */ + lockdep_assert_held(&conf->mddev->reconfig_mutex); for (i = 0; i < conf->geo.raid_disks; i++) { rdev = conf->mirrors[i].rdev; if (rdev) @@ -2190,7 +2139,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) err = 0; if (rdev->saved_raid_disk != mirror) conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); + WRITE_ONCE(p->rdev, rdev); break; } @@ -2204,7 +2153,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); conf->fullsync = 1; - rcu_assign_pointer(p->replacement, rdev); + WRITE_ONCE(p->replacement, rdev); } print_conf(conf); @@ -2246,15 +2195,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - *rdevp = NULL; + WRITE_ONCE(*rdevp, NULL); if (p->replacement) { /* We must have just cleared 'rdev' */ - p->rdev = p->replacement; + WRITE_ONCE(p->rdev, p->replacement); clear_bit(Replacement, &p->replacement->flags); - smp_mb(); /* Make sure other CPUs may see both as identical - * but will never see neither -- if they are careful. 
- */ - p->replacement = NULL; + WRITE_ONCE(p->replacement, NULL); } clear_bit(WantReplacement, &rdev->flags); @@ -2754,20 +2700,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 if (s > (PAGE_SIZE>>9)) s = PAGE_SIZE >> 9; - rcu_read_lock(); do { sector_t first_bad; int bad_sectors; d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); success = sync_page_io(rdev, r10_bio->devs[sl].addr + sect, @@ -2775,7 +2719,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 conf->tmppage, REQ_OP_READ, false); rdev_dec_pending(rdev, mddev); - rcu_read_lock(); if (success) break; } @@ -2783,7 +2726,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 if (sl == conf->copies) sl = 0; } while (sl != slot); - rcu_read_unlock(); if (!success) { /* Cannot read from anywhere, just mark the block @@ -2807,20 +2749,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 start = sl; /* write it back and re-read */ - rcu_read_lock(); while (sl != slot) { if (sl==0) sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (!rdev || test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, @@ -2839,7 +2779,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 rdev->bdev); } rdev_dec_pending(rdev, mddev); - rcu_read_lock(); } sl = start; while (sl != slot) { @@ -2847,14 +2786,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (!rdev || test_bit(Faulty, &rdev->flags) || !test_bit(In_sync, &rdev->flags)) continue; atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); switch (r10_sync_page_io(rdev, r10_bio->devs[sl].addr + sect, @@ -2882,9 +2820,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } rdev_dec_pending(rdev, mddev); - rcu_read_lock(); } - rcu_read_unlock(); sectors -= s; sect += s; @@ -3358,14 +3294,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Completed a full sync so the replacements * are now fully recovered. 
*/ - rcu_read_lock(); for (i = 0; i < conf->geo.raid_disks; i++) { struct md_rdev *rdev = - rcu_dereference(conf->mirrors[i].replacement); + conf->mirrors[i].replacement; + if (rdev) rdev->recovery_offset = MaxSector; } - rcu_read_unlock(); } conf->fullsync = 0; } @@ -3446,9 +3381,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; - rcu_read_lock(); - mrdev = rcu_dereference(mirror->rdev); - mreplace = rcu_dereference(mirror->replacement); + mrdev = mirror->rdev; + mreplace = mirror->replacement; if (mrdev && (test_bit(Faulty, &mrdev->flags) || test_bit(In_sync, &mrdev->flags))) @@ -3456,22 +3390,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mreplace && test_bit(Faulty, &mreplace->flags)) mreplace = NULL; - if (!mrdev && !mreplace) { - rcu_read_unlock(); + if (!mrdev && !mreplace) continue; - } still_degraded = 0; /* want to reconstruct this device */ rb2 = r10_bio; sect = raid10_find_virt(conf, sector_nr, i); - if (sect >= mddev->resync_max_sectors) { + if (sect >= mddev->resync_max_sectors) /* last stripe is not complete - don't * try to recover this sector. */ - rcu_read_unlock(); continue; - } /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap @@ -3487,14 +3417,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * that there will never be anything to do here */ chunks_skipped = -1; - rcu_read_unlock(); continue; } if (mrdev) atomic_inc(&mrdev->nr_pending); if (mreplace) atomic_inc(&mreplace->nr_pending); - rcu_read_unlock(); r10_bio = raid10_alloc_init_r10buf(conf); r10_bio->state = 0; @@ -3513,10 +3441,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to check if the array will still be * degraded */ - rcu_read_lock(); for (j = 0; j < conf->geo.raid_disks; j++) { - struct md_rdev *rdev = rcu_dereference( - conf->mirrors[j].rdev); + struct md_rdev *rdev = conf->mirrors[j].rdev; + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { still_degraded = 1; break; @@ -3531,8 +3458,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int k; int d = r10_bio->devs[j].devnum; sector_t from_addr, to_addr; - struct md_rdev *rdev = - rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t sector, first_bad; int bad_sectors; if (!rdev || @@ -3611,7 +3537,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_inc(&r10_bio->remaining); break; } - rcu_read_unlock(); if (j == conf->copies) { /* Cannot recover, so abort the recovery or * record a bad block */ @@ -3738,12 +3663,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[i].bio; bio->bi_status = BLK_STS_IOERR; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { - rcu_read_unlock(); + rdev = conf->mirrors[d].rdev; + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; - } + sector = r10_bio->devs[i].addr; if (is_badblock(rdev, sector, max_sync, &first_bad, &bad_sectors)) { @@ -3753,7 +3676,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bad_sectors -= (sector - first_bad); if (max_sync > bad_sectors) max_sync = bad_sectors; - rcu_read_unlock(); continue; } } @@ -3769,11 +3691,10 @@ static 
sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio_set_dev(bio, rdev->bdev); count++; - rdev = rcu_dereference(conf->mirrors[d].replacement); - if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { - rcu_read_unlock(); + rdev = conf->mirrors[d].replacement; + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; - } + atomic_inc(&rdev->nr_pending); /* Need to set up for writing to the replacement */ @@ -3790,7 +3711,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_iter.bi_sector = sector + rdev->data_offset; bio_set_dev(bio, rdev->bdev); count++; - rcu_read_unlock(); } if (count < 2) { @@ -4500,11 +4420,11 @@ static int calc_degraded(struct r10conf *conf) int degraded, degraded2; int i; - rcu_read_lock(); degraded = 0; /* 'prev' section first */ for (i = 0; i < conf->prev.raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) degraded++; else if (!test_bit(In_sync, &rdev->flags)) @@ -4514,13 +4434,12 @@ static int calc_degraded(struct r10conf *conf) */ degraded++; } - rcu_read_unlock(); if (conf->geo.raid_disks == conf->prev.raid_disks) return degraded; - rcu_read_lock(); degraded2 = 0; for (i = 0; i < conf->geo.raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) degraded2++; else if (!test_bit(In_sync, &rdev->flags)) { @@ -4533,7 +4452,6 @@ static int calc_degraded(struct r10conf *conf) degraded2++; } } - rcu_read_unlock(); if (degraded2 > degraded) return degraded2; return degraded; @@ -4965,16 +4883,15 @@ read_more: blist = read_bio; read_bio->bi_next = NULL; - rcu_read_lock(); for (s = 0; s < conf->copies*2; s++) { struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev2; if (s&1) { - rdev2 = rcu_dereference(conf->mirrors[d].replacement); + rdev2 = conf->mirrors[d].replacement; b = r10_bio->devs[s/2].repl_bio; } else { - rdev2 = rcu_dereference(conf->mirrors[d].rdev); + rdev2 = conf->mirrors[d].rdev; b = r10_bio->devs[s/2].bio; } if (!rdev2 || test_bit(Faulty, &rdev2->flags)) @@ -5008,7 +4925,6 @@ read_more: sector_nr += len >> 9; nr_sectors += len >> 9; } - rcu_read_unlock(); r10_bio->sectors = nr_sectors; /* Now submit the read */ @@ -5061,20 +4977,17 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) struct bio *b; int d = r10_bio->devs[s/2].devnum; struct md_rdev *rdev; - rcu_read_lock(); if (s&1) { - rdev = rcu_dereference(conf->mirrors[d].replacement); + rdev = conf->mirrors[d].replacement; b = r10_bio->devs[s/2].repl_bio; } else { - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; b = r10_bio->devs[s/2].bio; } - if (!rdev || test_bit(Faulty, &rdev->flags)) { - rcu_read_unlock(); + if (!rdev || test_bit(Faulty, &rdev->flags)) continue; - } + atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; @@ -5145,10 +5058,9 @@ static int handle_reshape_read_error(struct mddev *mddev, if (s > (PAGE_SIZE >> 9)) s = PAGE_SIZE >> 9; - rcu_read_lock(); while (!success) { int d = r10b->devs[slot].devnum; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t addr; if (rdev == NULL || test_bit(Faulty, &rdev->flags) || @@ -5157,14 +5069,12 @@ static int 
handle_reshape_read_error(struct mddev *mddev, addr = r10b->devs[slot].addr + idx * PAGE_SIZE; atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); success = sync_page_io(rdev, addr, s << 9, pages[idx], REQ_OP_READ, false); rdev_dec_pending(rdev, mddev); - rcu_read_lock(); if (success) break; failed: @@ -5174,7 +5084,6 @@ static int handle_reshape_read_error(struct mddev *mddev, if (slot == first_slot) break; } - rcu_read_unlock(); if (!success) { /* couldn't read this block, must give up */ set_bit(MD_RECOVERY_INTR, @@ -5200,12 +5109,8 @@ static void end_reshape_write(struct bio *bio) struct md_rdev *rdev = NULL; d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); - if (repl) - rdev = conf->mirrors[d].replacement; - if (!rdev) { - smp_mb(); - rdev = conf->mirrors[d].rdev; - } + rdev = repl ? conf->mirrors[d].replacement : + conf->mirrors[d].rdev; if (bio->bi_status) { /* FIXME should record badblock */ @@ -5240,18 +5145,16 @@ static void raid10_finish_reshape(struct mddev *mddev) mddev->resync_max_sectors = mddev->array_sectors; } else { int d; - rcu_read_lock(); for (d = conf->geo.raid_disks ; d < conf->geo.raid_disks - mddev->delta_disks; d++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rdev = conf->mirrors[d].rdev; if (rdev) clear_bit(In_sync, &rdev->flags); - rdev = rcu_dereference(conf->mirrors[d].replacement); + rdev = conf->mirrors[d].replacement; if (rdev) clear_bit(In_sync, &rdev->flags); } - rcu_read_unlock(); } mddev->layout = mddev->new_layout; mddev->chunk_sectors = 1 << conf->geo.chunk_shift; From 2d32777d60de81aa020a2431567020af26564c71 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 25 Nov 2023 16:16:02 +0800 Subject: [PATCH 5/9] md/raid1: remove rcu protection to access rdev from conf Because it's safe to accees rdev from conf: - If any spinlock is held, because synchronize_rcu() from md_kick_rdev_from_array() will prevent 'rdev' to be freed until spinlock is released; - If 'reconfig_lock' is held, because rdev can't be added or removed from array; - If there is normal IO inflight, because mddev_suspend() will prevent rdev to be added or removed from array; - If there is sync IO inflight, because 'MD_RECOVERY_RUNNING' is checked in remove_and_add_spares(). And these will cover all the scenarios in raid1. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231125081604.3939938-4-yukuai1@huaweicloud.com --- drivers/md/raid1.c | 62 +++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a678e0e6e102..9348f1709512 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -609,7 +609,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect int choose_first; int choose_next_idle; - rcu_read_lock(); /* * Check if we can balance. We can balance on the whole * device if no resync is going on, or below the resync window. 
@@ -642,7 +641,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect unsigned int pending; bool nonrot; - rdev = rcu_dereference(conf->mirrors[disk].rdev); + rdev = conf->mirrors[disk].rdev; if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL || test_bit(Faulty, &rdev->flags)) @@ -773,7 +772,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect } if (best_disk >= 0) { - rdev = rcu_dereference(conf->mirrors[best_disk].rdev); + rdev = conf->mirrors[best_disk].rdev; if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); @@ -784,7 +783,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; } - rcu_read_unlock(); *max_sectors = sectors; return best_disk; @@ -1235,14 +1233,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (r1bio_existed) { /* Need to get the block device name carefully */ - struct md_rdev *rdev; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev); + struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; + if (rdev) snprintf(b, sizeof(b), "%pg", rdev->bdev); else strcpy(b, "???"); - rcu_read_unlock(); } /* @@ -1396,10 +1392,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, disks = conf->raid_disks * 2; blocked_rdev = NULL; - rcu_read_lock(); max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; /* * The write-behind io is only attempted on drives marked as @@ -1465,7 +1460,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } r1_bio->bios[i] = bio; } - rcu_read_unlock(); if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ @@ -1617,15 +1611,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) struct r1conf *conf = mddev->private; int i; + lockdep_assert_held(&mddev->lock); + seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev); + seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? 
"U" : "_"); } - rcu_read_unlock(); seq_printf(seq, "]"); } @@ -1691,16 +1686,15 @@ static void print_conf(struct r1conf *conf) pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); - rcu_read_lock(); + lockdep_assert_held(&conf->mddev->reconfig_mutex); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; if (rdev) pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), rdev->bdev); } - rcu_read_unlock(); } static void close_sync(struct r1conf *conf) @@ -1810,7 +1804,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) */ if (rdev->saved_raid_disk < 0) conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); + WRITE_ONCE(p->rdev, rdev); break; } if (test_bit(WantReplacement, &p->rdev->flags) && @@ -1826,7 +1820,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = repl_slot; err = 0; conf->fullsync = 1; - rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + WRITE_ONCE(p[conf->raid_disks].rdev, rdev); } print_conf(conf); @@ -1862,7 +1856,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - p->rdev = NULL; + WRITE_ONCE(p->rdev, NULL); if (conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before @@ -1883,7 +1877,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } clear_bit(Replacement, &repl->flags); - p->rdev = repl; + WRITE_ONCE(p->rdev, repl); conf->mirrors[conf->raid_disks + number].rdev = NULL; unfreeze_array(conf); } @@ -2281,8 +2275,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, sector_t first_bad; int bad_sectors; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && @@ -2290,15 +2283,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk, is_badblock(rdev, sect, s, &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (sync_page_io(rdev, sect, s<<9, conf->tmppage, REQ_OP_READ, false)) success = 1; rdev_dec_pending(rdev, mddev); if (success) break; - } else - rcu_read_unlock(); + } + d++; if (d == conf->raid_disks * 2) d = 0; @@ -2317,29 +2309,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (d==0) d = conf->raid_disks * 2; d--; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); r1_sync_page_io(rdev, sect, s, conf->tmppage, WRITE); rdev_dec_pending(rdev, mddev); - } else - rcu_read_unlock(); + } } d = start; while (d != read_disk) { if (d==0) d = conf->raid_disks * 2; d--; - rcu_read_lock(); - rdev = rcu_dereference(conf->mirrors[d].rdev); + rdev = conf->mirrors[d].rdev; if (rdev && !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (r1_sync_page_io(rdev, sect, s, conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); @@ -2350,8 +2337,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, rdev->bdev); } rdev_dec_pending(rdev, mddev); - } else - rcu_read_unlock(); + } } sectors -= s; sect += s; @@ -2732,7 +2718,6 @@ static 
sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, r1_bio = raid1_alloc_init_r1buf(conf); - rcu_read_lock(); /* * If we get a correctably read error during resync or recovery, * we might want to read from a different device. So we @@ -2753,7 +2738,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, struct md_rdev *rdev; bio = r1_bio->bios[i]; - rdev = rcu_dereference(conf->mirrors[i].rdev); + rdev = conf->mirrors[i].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) @@ -2811,7 +2796,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_opf |= MD_FAILFAST; } } - rcu_read_unlock(); if (disk < 0) disk = wonly; r1_bio->read_disk = disk; From ad8606702f268903b26795e6b93605646fd1a6a8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 25 Nov 2023 16:16:03 +0800 Subject: [PATCH 6/9] md/raid5: remove rcu protection to access rdev from conf Because it's safe to accees rdev from conf: - If any spinlock is held, because synchronize_rcu() from md_kick_rdev_from_array() will prevent 'rdev' to be freed until spinlock is released; - If 'reconfig_lock' is held, because rdev can't be added or removed from array; - If there is normal IO inflight, because mddev_suspend() will prevent rdev to be added or removed from array; - If there is sync IO inflight, because 'MD_RECOVERY_RUNNING' is checked in remove_and_add_spares(). And these will cover all the scenarios in raid456. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231125081604.3939938-5-yukuai1@huaweicloud.com --- drivers/md/raid5-cache.c | 11 +-- drivers/md/raid5-ppl.c | 16 +--- drivers/md/raid5.c | 182 +++++++++++++-------------------------- drivers/md/raid5.h | 4 +- 4 files changed, 69 insertions(+), 144 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 6157f5beb9fe..874874fe4fa1 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1890,28 +1890,22 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf, continue; /* in case device is broken */ - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[disk_index].rdev); + rdev = conf->disks[disk_index].rdev; if (rdev) { atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); sync_page_io(rdev, sh->sector, PAGE_SIZE, sh->dev[disk_index].page, REQ_OP_WRITE, false); rdev_dec_pending(rdev, rdev->mddev); - rcu_read_lock(); } - rrdev = rcu_dereference(conf->disks[disk_index].replacement); + rrdev = conf->disks[disk_index].replacement; if (rrdev) { atomic_inc(&rrdev->nr_pending); - rcu_read_unlock(); sync_page_io(rrdev, sh->sector, PAGE_SIZE, sh->dev[disk_index].page, REQ_OP_WRITE, false); rdev_dec_pending(rrdev, rrdev->mddev); - rcu_read_lock(); } - rcu_read_unlock(); } ctx->data_parity_stripes++; out: @@ -2948,7 +2942,6 @@ bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) if (!log) return false; - WARN_ON_ONCE(!rcu_read_lock_held()); tree_index = r5c_tree_index(conf, sect); slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); return slot != NULL; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index eaea57aee602..da4ba736c4f0 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -620,11 +620,9 @@ static void ppl_do_flush(struct ppl_io_unit *io) struct md_rdev *rdev; struct block_device *bdev = NULL; - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; if (rdev && !test_bit(Faulty, &rdev->flags)) bdev = rdev->bdev; - 
rcu_read_unlock(); if (bdev) { struct bio *bio; @@ -882,9 +880,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)r_sector, dd_idx, (unsigned long long)sector); - /* Array has not started so rcu dereference is safe */ - rdev = rcu_dereference_protected( - conf->disks[dd_idx].rdev, 1); + rdev = conf->disks[dd_idx].rdev; if (!rdev || (!test_bit(In_sync, &rdev->flags) && sector >= rdev->recovery_offset)) { pr_debug("%s:%*s data member disk %d missing\n", @@ -936,9 +932,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, 0, &disk, &sh); BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk)); - /* Array has not started so rcu dereference is safe */ - parity_rdev = rcu_dereference_protected( - conf->disks[sh.pd_idx].rdev, 1); + parity_rdev = conf->disks[sh.pd_idx].rdev; BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); pr_debug("%s:%*s write parity at sector %llu, disk %pg\n", @@ -1404,9 +1398,7 @@ int ppl_init_log(struct r5conf *conf) for (i = 0; i < ppl_conf->count; i++) { struct ppl_log *log = &ppl_conf->child_logs[i]; - /* Array has not started so rcu dereference is safe */ - struct md_rdev *rdev = - rcu_dereference_protected(conf->disks[i].rdev, 1); + struct md_rdev *rdev = conf->disks[i].rdev; mutex_init(&log->io_mutex); spin_lock_init(&log->io_list_lock); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d431e4625cc5..e57deb1c6138 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -693,12 +693,12 @@ int raid5_calc_degraded(struct r5conf *conf) int degraded, degraded2; int i; - rcu_read_lock(); degraded = 0; for (i = 0; i < conf->previous_raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = rcu_dereference(conf->disks[i].replacement); + rdev = READ_ONCE(conf->disks[i].replacement); if (!rdev || test_bit(Faulty, &rdev->flags)) degraded++; else if (test_bit(In_sync, &rdev->flags)) @@ -716,15 +716,14 @@ int raid5_calc_degraded(struct r5conf *conf) if (conf->raid_disks >= conf->previous_raid_disks) degraded++; } - rcu_read_unlock(); if (conf->raid_disks == conf->previous_raid_disks) return degraded; - rcu_read_lock(); degraded2 = 0; for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = rcu_dereference(conf->disks[i].replacement); + rdev = READ_ONCE(conf->disks[i].replacement); if (!rdev || test_bit(Faulty, &rdev->flags)) degraded2++; else if (test_bit(In_sync, &rdev->flags)) @@ -738,7 +737,6 @@ int raid5_calc_degraded(struct r5conf *conf) if (conf->raid_disks <= conf->previous_raid_disks) degraded2++; } - rcu_read_unlock(); if (degraded2 > degraded) return degraded2; return degraded; @@ -1183,14 +1181,8 @@ again: bi = &dev->req; rbi = &dev->rreq; /* For writing to replacement */ - rcu_read_lock(); - rrdev = rcu_dereference(conf->disks[i].replacement); - smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ - rdev = rcu_dereference(conf->disks[i].rdev); - if (!rdev) { - rdev = rrdev; - rrdev = NULL; - } + rdev = conf->disks[i].rdev; + rrdev = conf->disks[i].replacement; if (op_is_write(op)) { if (replace_only) rdev = NULL; @@ -1211,7 +1203,6 @@ again: rrdev = NULL; if (rrdev) atomic_inc(&rrdev->nr_pending); - rcu_read_unlock(); /* We have already checked bad blocks for reads. 
Now * need to check for writes. We never accept write errors @@ -2730,28 +2721,6 @@ static void shrink_stripes(struct r5conf *conf) conf->slab_cache = NULL; } -/* - * This helper wraps rcu_dereference_protected() and can be used when - * it is known that the nr_pending of the rdev is elevated. - */ -static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev) -{ - return rcu_dereference_protected(rdev, - atomic_read(&rcu_access_pointer(rdev)->nr_pending)); -} - -/* - * This helper wraps rcu_dereference_protected() and should be used - * when it is known that the mddev_lock() is held. This is safe - * seeing raid5_remove_disk() has the same lock held. - */ -static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev, - struct md_rdev __rcu *rdev) -{ - return rcu_dereference_protected(rdev, - lockdep_is_held(&mddev->reconfig_mutex)); -} - static void raid5_end_read_request(struct bio * bi) { struct stripe_head *sh = bi->bi_private; @@ -2777,9 +2746,9 @@ static void raid5_end_read_request(struct bio * bi) * In that case it moved down to 'rdev'. * rdev is not removed until all requests are finished. */ - rdev = rdev_pend_deref(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (!rdev) - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; if (use_new_offset(conf, sh)) s = sh->sector + rdev->new_data_offset; @@ -2892,11 +2861,11 @@ static void raid5_end_write_request(struct bio *bi) for (i = 0 ; i < disks; i++) { if (bi == &sh->dev[i].req) { - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; break; } if (bi == &sh->dev[i].rreq) { - rdev = rdev_pend_deref(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (rdev) replacement = 1; else @@ -2904,7 +2873,7 @@ static void raid5_end_write_request(struct bio *bi) * replaced it. rdev is not removed * until all requests are finished. 
*/ - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; break; } } @@ -3666,15 +3635,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, int bitmap_end = 0; if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - struct md_rdev *rdev; - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = conf->disks[i].rdev; + if (rdev && test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) atomic_inc(&rdev->nr_pending); else rdev = NULL; - rcu_read_unlock(); if (rdev) { if (!rdev_set_badblocks( rdev, @@ -3792,16 +3759,17 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, /* During recovery devices cannot be removed, so * locking and refcounting of rdevs is not needed */ - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = conf->disks[i].rdev; + if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; - rdev = rcu_dereference(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; + if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) @@ -3809,7 +3777,6 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; } - rcu_read_unlock(); if (abort) conf->recovery_disabled = conf->mddev->recovery_disabled; @@ -3822,15 +3789,13 @@ static int want_replace(struct stripe_head *sh, int disk_idx) struct md_rdev *rdev; int rv = 0; - rcu_read_lock(); - rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement); + rdev = sh->raid_conf->disks[disk_idx].replacement; if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && (rdev->recovery_offset <= sh->sector || rdev->mddev->recovery_cp <= sh->sector)) rv = 1; - rcu_read_unlock(); return rv; } @@ -4707,7 +4672,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) s->log_failed = r5l_log_disk_error(conf); /* Now to look around and see what can be done */ - rcu_read_lock(); for (i=disks; i--; ) { struct md_rdev *rdev; sector_t first_bad; @@ -4752,7 +4716,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) /* Prefer to use the replacement for reads, but only * if it is recovered enough and has no bad blocks. 
*/ - rdev = rcu_dereference(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (rdev && !test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), @@ -4763,7 +4727,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) set_bit(R5_NeedReplace, &dev->flags); else clear_bit(R5_NeedReplace, &dev->flags); - rdev = rcu_dereference(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; clear_bit(R5_ReadRepl, &dev->flags); } if (rdev && test_bit(Faulty, &rdev->flags)) @@ -4810,8 +4774,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_WriteError, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ - struct md_rdev *rdev2 = rcu_dereference( - conf->disks[i].rdev); + struct md_rdev *rdev2 = conf->disks[i].rdev; + if (rdev2 == rdev) clear_bit(R5_Insync, &dev->flags); if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { @@ -4823,8 +4787,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_MadeGood, &dev->flags)) { /* This flag does not apply to '.replacement' * only to .rdev, so make sure to check that*/ - struct md_rdev *rdev2 = rcu_dereference( - conf->disks[i].rdev); + struct md_rdev *rdev2 = conf->disks[i].rdev; + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { s->handle_bad_blocks = 1; atomic_inc(&rdev2->nr_pending); @@ -4832,8 +4796,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) clear_bit(R5_MadeGood, &dev->flags); } if (test_bit(R5_MadeGoodRepl, &dev->flags)) { - struct md_rdev *rdev2 = rcu_dereference( - conf->disks[i].replacement); + struct md_rdev *rdev2 = conf->disks[i].replacement; + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { s->handle_bad_blocks = 1; atomic_inc(&rdev2->nr_pending); @@ -4854,8 +4818,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev && !test_bit(Faulty, &rdev->flags)) do_recovery = 1; else if (!rdev) { - rdev = rcu_dereference( - conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (rdev && !test_bit(Faulty, &rdev->flags)) do_recovery = 1; } @@ -4882,7 +4845,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) else s->replacing = 1; } - rcu_read_unlock(); } /* @@ -5339,23 +5301,23 @@ finish: struct r5dev *dev = &sh->dev[i]; if (test_and_clear_bit(R5_WriteError, &dev->flags)) { /* We own a safe reference to the rdev */ - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; if (!rdev_set_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { - rdev = rdev_pend_deref(conf->disks[i].replacement); + rdev = conf->disks[i].replacement; if (!rdev) /* rdev have been moved down */ - rdev = rdev_pend_deref(conf->disks[i].rdev); + rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); @@ -5514,24 +5476,22 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) &dd_idx, NULL); 
end_sector = sector + bio_sectors(raid_bio); - rcu_read_lock(); if (r5c_big_stripe_cached(conf, sector)) - goto out_rcu_unlock; + return 0; - rdev = rcu_dereference(conf->disks[dd_idx].replacement); + rdev = conf->disks[dd_idx].replacement; if (!rdev || test_bit(Faulty, &rdev->flags) || rdev->recovery_offset < end_sector) { - rdev = rcu_dereference(conf->disks[dd_idx].rdev); + rdev = conf->disks[dd_idx].rdev; if (!rdev) - goto out_rcu_unlock; + return 0; if (test_bit(Faulty, &rdev->flags) || !(test_bit(In_sync, &rdev->flags) || rdev->recovery_offset >= end_sector)) - goto out_rcu_unlock; + return 0; } atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, &bad_sectors)) { @@ -5575,10 +5535,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) raid_bio->bi_iter.bi_sector); submit_bio_noacct(align_bio); return 1; - -out_rcu_unlock: - rcu_read_unlock(); - return 0; } static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) @@ -6581,14 +6537,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n * Note in case of > 1 drive failures it's possible we're rebuilding * one drive while leaving another faulty drive in array. */ - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = conf->disks[i].rdev; if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; } - rcu_read_unlock(); md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); @@ -7899,18 +7853,10 @@ static int raid5_run(struct mddev *mddev) for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; i++) { - rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); - if (!rdev && conf->disks[i].replacement) { - /* The replacement is all we have yet */ - rdev = rdev_mdlock_deref(mddev, - conf->disks[i].replacement); - conf->disks[i].replacement = NULL; - clear_bit(Replacement, &rdev->flags); - rcu_assign_pointer(conf->disks[i].rdev, rdev); - } + rdev = conf->disks[i].rdev; if (!rdev) continue; - if (rcu_access_pointer(conf->disks[i].replacement) && + if (conf->disks[i].replacement && conf->reshape_progress != MaxSector) { /* replacements and reshape simply do not mix. */ pr_warn("md: cannot handle concurrent replacement and reshape.\n"); @@ -8094,15 +8040,16 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) struct r5conf *conf = mddev->private; int i; + lockdep_assert_held(&mddev->lock); + seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, conf->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); - rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? 
"U" : "_"); } - rcu_read_unlock(); seq_printf (seq, "]"); } @@ -8140,9 +8087,8 @@ static int raid5_spare_active(struct mddev *mddev) unsigned long flags; for (i = 0; i < conf->raid_disks; i++) { - rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); - replacement = rdev_mdlock_deref(mddev, - conf->disks[i].replacement); + rdev = conf->disks[i].rdev; + replacement = conf->disks[i].replacement; if (replacement && replacement->recovery_offset == MaxSector && !test_bit(Faulty, &replacement->flags) @@ -8181,7 +8127,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct r5conf *conf = mddev->private; int err = 0; int number = rdev->raid_disk; - struct md_rdev __rcu **rdevp; + struct md_rdev **rdevp; struct disk_info *p; struct md_rdev *tmp; @@ -8204,9 +8150,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (unlikely(number >= conf->pool_size)) return 0; p = conf->disks + number; - if (rdev == rcu_access_pointer(p->rdev)) + if (rdev == p->rdev) rdevp = &p->rdev; - else if (rdev == rcu_access_pointer(p->replacement)) + else if (rdev == p->replacement) rdevp = &p->replacement; else return 0; @@ -8226,28 +8172,24 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (!test_bit(Faulty, &rdev->flags) && mddev->recovery_disabled != conf->recovery_disabled && !has_failed(conf) && - (!rcu_access_pointer(p->replacement) || - rcu_access_pointer(p->replacement) == rdev) && + (!p->replacement || p->replacement == rdev) && number < conf->raid_disks) { err = -EBUSY; goto abort; } - *rdevp = NULL; + WRITE_ONCE(*rdevp, NULL); if (!err) { err = log_modify(conf, rdev, false); if (err) goto abort; } - tmp = rcu_access_pointer(p->replacement); + tmp = p->replacement; if (tmp) { /* We must have just cleared 'rdev' */ - rcu_assign_pointer(p->rdev, tmp); + WRITE_ONCE(p->rdev, tmp); clear_bit(Replacement, &tmp->flags); - smp_mb(); /* Make sure other CPUs may see both as identical - * but will never see neither - if they are careful - */ - rcu_assign_pointer(p->replacement, NULL); + WRITE_ONCE(p->replacement, NULL); if (!err) err = log_modify(conf, tmp, true); @@ -8315,7 +8257,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = disk; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); + WRITE_ONCE(p->rdev, rdev); err = log_modify(conf, rdev, true); @@ -8324,7 +8266,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) } for (disk = first; disk <= last; disk++) { p = conf->disks + disk; - tmp = rdev_mdlock_deref(mddev, p->rdev); + tmp = p->rdev; if (test_bit(WantReplacement, &tmp->flags) && mddev->reshape_position == MaxSector && p->replacement == NULL) { @@ -8333,7 +8275,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) rdev->raid_disk = disk; err = 0; conf->fullsync = 1; - rcu_assign_pointer(p->replacement, rdev); + WRITE_ONCE(p->replacement, rdev); break; } } @@ -8466,7 +8408,7 @@ static int raid5_start_reshape(struct mddev *mddev) if (mddev->recovery_cp < MaxSector) return -EBUSY; for (i = 0; i < conf->raid_disks; i++) - if (rdev_mdlock_deref(mddev, conf->disks[i].replacement)) + if (conf->disks[i].replacement) return -EBUSY; rdev_for_each(rdev, mddev) { @@ -8637,12 +8579,10 @@ static void raid5_finish_reshape(struct mddev *mddev) for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; d++) { - rdev = rdev_mdlock_deref(mddev, - conf->disks[d].rdev); + rdev = conf->disks[d].rdev; if (rdev) 
 					clear_bit(In_sync, &rdev->flags);
-				rdev = rdev_mdlock_deref(mddev,
-							 conf->disks[d].replacement);
+				rdev = conf->disks[d].replacement;
 				if (rdev)
 					clear_bit(In_sync, &rdev->flags);
 			}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 22bea20eccbd..9b5a7dc3f2a0 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -473,8 +473,8 @@ enum {
  */
 
 struct disk_info {
-	struct md_rdev __rcu	*rdev;
-	struct md_rdev __rcu	*replacement;
+	struct md_rdev	*rdev;
+	struct md_rdev	*replacement;
 	struct page	*extra_page; /* extra page to use in prexor */
 };
 
From 7ecab28c3b2c26c1a51154d997433e8432eb49ba Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Sat, 25 Nov 2023 16:16:04 +0800
Subject: [PATCH 7/9] md/md-multipath: remove rcu protection to access rdev from conf

It is safe to access rdev from conf because:

- If any spinlock is held, synchronize_rcu() from md_kick_rdev_from_array()
  will prevent 'rdev' from being freed until the spinlock is released;

- If there is normal IO inflight, mddev_suspend() will prevent rdev from
  being added to or removed from the array.

These cover all the scenarios in md-multipath.

Signed-off-by: Yu Kuai
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20231125081604.3939938-6-yukuai1@huaweicloud.com
---
 drivers/md/md-multipath.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index aa77133f3188..19c8625ea642 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -32,17 +32,15 @@ static int multipath_map (struct mpconf *conf)
 	 * now we use the first available disk.
 	 */
 
-	rcu_read_lock();
 	for (i = 0; i < disks; i++) {
-		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
+		struct md_rdev *rdev = conf->multipaths[i].rdev;
+
 		if (rdev && test_bit(In_sync, &rdev->flags) &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			atomic_inc(&rdev->nr_pending);
-			rcu_read_unlock();
 			return i;
 		}
 	}
-	rcu_read_unlock();
 
 	pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
 	return (-1);
@@ -137,14 +135,16 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev)
 	struct mpconf *conf = mddev->private;
 	int i;
 
+	lockdep_assert_held(&mddev->lock);
+
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
 		    conf->raid_disks - mddev->degraded);
-	rcu_read_lock();
 	for (i = 0; i < conf->raid_disks; i++) {
-		struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
-		seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+		struct md_rdev *rdev = READ_ONCE(conf->multipaths[i].rdev);
+
+		seq_printf(seq, "%s",
+			   rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
 	}
-	rcu_read_unlock();
 	seq_putc(seq, ']');
 }
 
@@ -182,7 +182,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
 		conf->raid_disks - mddev->degraded);
 }
 
-static void print_multipath_conf (struct mpconf *conf)
+static void print_multipath_conf(struct mpconf *conf)
 {
 	int i;
 	struct multipath_info *tmp;
@@ -195,6 +195,7 @@ static void print_multipath_conf (struct mpconf *conf)
 	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
 		 conf->raid_disks);
 
+	lockdep_assert_held(&conf->mddev->reconfig_mutex);
 	for (i = 0; i < conf->raid_disks; i++) {
 		tmp = conf->multipaths + i;
 		if (tmp->rdev)
@@ -231,7 +232,7 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			rdev->raid_disk = path;
 			set_bit(In_sync, &rdev->flags);
 			spin_unlock_irq(&conf->device_lock);
-			rcu_assign_pointer(p->rdev, rdev);
+			WRITE_ONCE(p->rdev, rdev);
 			err = 0;
 			break;
 		}
@@ -257,7 +258,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			err = -EBUSY;
 			goto abort;
 		}
-		p->rdev = NULL;
+		WRITE_ONCE(p->rdev, NULL);
 		err = md_integrity_register(mddev);
 	}
 abort:
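
Both the raid5 and the md-multipath conversions above follow the same pattern:
the rdev pointer in conf is published and cleared with plain WRITE_ONCE()
stores, readers sample it once with READ_ONCE() (or under a lock asserted by
lockdep), pin the device through nr_pending before issuing IO, and removal
refuses to clear a slot whose nr_pending is not zero. The sketch below is only
a rough, self-contained userspace model of that handshake; the type, field and
function names are simplified stand-ins rather than the kernel's, and object
lifetime (handled in the kernel by synchronize_rcu() in
md_kick_rdev_from_array() and by mddev_suspend()) is assumed rather than
modelled.

/* Illustrative userspace model only -- names are simplified stand-ins. */
#include <stdatomic.h>
#include <stdio.h>

#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

struct rdev {
	atomic_int nr_pending;	/* IO currently holding a reference */
	int in_sync;
};

struct disk_info {
	struct rdev *rdev;	/* was "struct md_rdev __rcu *rdev" */
};

/* Reader (a map or status path): sample the slot once, then pin before use. */
static struct rdev *get_rdev(struct disk_info *p)
{
	struct rdev *rdev = READ_ONCE(p->rdev);

	if (!rdev || !rdev->in_sync)
		return NULL;
	atomic_fetch_add(&rdev->nr_pending, 1);
	return rdev;
}

static void put_rdev(struct rdev *rdev)
{
	atomic_fetch_sub(&rdev->nr_pending, 1);
}

/* Writer (add/remove disk): publish with a plain store and refuse to clear
 * a slot that still has pending IO (the -EBUSY case in the kernel).
 */
static int remove_rdev(struct disk_info *p)
{
	struct rdev *rdev = p->rdev;

	if (rdev && atomic_load(&rdev->nr_pending))
		return -1;		/* busy */
	WRITE_ONCE(p->rdev, NULL);
	return 0;
}

int main(void)
{
	struct rdev r = { .in_sync = 1 };
	struct disk_info slot = { .rdev = &r };
	struct rdev *held = get_rdev(&slot);

	printf("remove while pinned: %d\n", remove_rdev(&slot));	/* -1 */
	put_rdev(held);
	printf("remove after put:    %d\n", remove_rdev(&slot));	/* 0 */
	return 0;
}

In this model the -1 return plays the role that -EBUSY plays in
multipath_remove_disk() and raid5_remove_disk() above.
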
From 15da990f8dd7e9d0e1fd0275730f6fed6f6a8a57 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Mon, 27 Nov 2023 19:58:07 -0800
Subject: [PATCH 8/9] MAINTAINERS: SOFTWARE RAID: Add Yu Kuai as Reviewer

Add Yu Kuai as reviewer for md/raid subsystem.

Signed-off-by: Song Liu
Acked-by: Yu Kuai
Link: https://lore.kernel.org/r/20231128035807.3191738-1-song@kernel.org
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 012df8ccf34e..a800acf46e6a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20106,6 +20106,7 @@ F:	include/linux/property.h
 
 SOFTWARE RAID (Multiple Disks) SUPPORT
 M:	Song Liu
+R:	Yu Kuai
 L:	linux-raid@vger.kernel.org
 S:	Supported
 Q:	https://patchwork.kernel.org/project/linux-raid/list/

From fa2bbff7b0b4e211fec5e5686ef96350690597b5 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Wed, 29 Nov 2023 10:02:34 +0800
Subject: [PATCH 9/9] md: synchronize flush io with array reconfiguration

Currently rcu is used to protect iterating rdev from submit_flushes():

submit_flushes                  remove_and_add_spares
                                synchronize_rcu
                                pers->hot_remove_disk()
rcu_read_lock()
rdev_for_each_rcu
 if (rdev->raid_disk >= 0)
                                rdev->raid_disk = -1;
 atomic_inc(&rdev->nr_pending)
 rcu_read_unlock()
 bi = bio_alloc_bioset()
 bi->bi_end_io = md_end_flush
 bi->private = rdev
 submit_bio
 // issue io for removed rdev

Fix this problem by grabbing 'active_io' before iterating rdev, making sure
that remove_and_add_spares() cannot run concurrently with submit_flushes().

Fixes: a2826aa92e2e ("md: support barrier requests on all personalities.")
Signed-off-by: Yu Kuai
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20231129020234.1586910-1-yukuai1@huaweicloud.com
---
 drivers/md/md.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 71b3397dea47..4e9fe5cbeedc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -529,6 +529,9 @@ static void md_end_flush(struct bio *bio)
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		/* The pair is percpu_ref_get() from md_flush_request() */
+		percpu_ref_put(&mddev->active_io);
+
 		/* The pre-request flush has finished */
 		queue_work(md_wq, &mddev->flush_work);
 	}
@@ -548,12 +551,8 @@ static void submit_flushes(struct work_struct *ws)
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
-			/* Take two references, one is dropped
-			 * when request finishes, one after
-			 * we reclaim rcu_read_lock
-			 */
 			struct bio *bi;
-			atomic_inc(&rdev->nr_pending);
+
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			bi = bio_alloc_bioset(rdev->bdev, 0,
@@ -564,7 +563,6 @@ static void submit_flushes(struct work_struct *ws)
 			atomic_inc(&mddev->flush_pending);
 			submit_bio(bi);
 			rcu_read_lock();
-			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 	if (atomic_dec_and_test(&mddev->flush_pending))
@@ -617,6 +615,18 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
 	/* new request after previous flush is completed */
 	if (ktime_after(req_start, mddev->prev_flush_start)) {
 		WARN_ON(mddev->flush_bio);
+		/*
+		 * Grab a reference to make sure mddev_suspend() will wait for
+		 * this flush to be done.
+		 *
+		 * md_flush_request() is called under md_handle_request() and
+		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
+		 * won't pass, percpu_ref_tryget_live() can't be used because
+		 * percpu_ref_kill() can be called by mddev_suspend()
+		 * concurrently.
+		 */
+		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
+		percpu_ref_get(&mddev->active_io);
 		mddev->flush_bio = bio;
 		bio = NULL;
 	}
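
The fix works because of a reference pairing: md_flush_request() takes one
extra 'active_io' reference that is only dropped by md_end_flush(), so
mddev_suspend(), which waits for 'active_io' to drain before reconfiguring the
array, now also waits for the asynchronous pre-flush. Below is a minimal
userspace model of that pairing, assuming a plain atomic counter in place of
the percpu_ref; the function names are borrowed from the patch for readability
only and the bodies are not the kernel's.

/* Illustrative model: a plain counter stands in for the active_io percpu_ref. */
#include <stdatomic.h>
#include <assert.h>
#include <stdio.h>

static atomic_int active_io;

static void md_handle_request_enter(void) { atomic_fetch_add(&active_io, 1); }
static void md_handle_request_exit(void)  { atomic_fetch_sub(&active_io, 1); }

/* Flush path: pin active_io across the asynchronous pre-flush. */
static void md_flush_request(void)
{
	/* The caller already holds an active_io reference, so it cannot be zero. */
	assert(atomic_load(&active_io) > 0);
	atomic_fetch_add(&active_io, 1);	/* paired with md_end_flush() */
}

static void md_end_flush(void)
{
	atomic_fetch_sub(&active_io, 1);
}

static int suspend_would_wait(void)
{
	return atomic_load(&active_io) != 0;
}

int main(void)
{
	md_handle_request_enter();
	md_flush_request();
	md_handle_request_exit();	/* the original request completes... */
	printf("flush still pins suspend: %d\n", suspend_would_wait());
	md_end_flush();			/* ...until the flush reference is dropped */
	printf("flush still pins suspend: %d\n", suspend_would_wait());
	return 0;
}

As the commit message notes, percpu_ref_tryget_live() would not work in the
real code because mddev_suspend() may already have killed the reference; the
assert() here only mirrors the WARN_ON(percpu_ref_is_zero()) guard.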