From 61c90765e131e63ead773b9b99167415e246a945 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 28 Dec 2023 20:55:51 +0800 Subject: [PATCH 1/7] md: remove redundant check of 'mddev->sync_thread' The lifetime of sync_thread: 1) Set MD_RECOVERY_NEEDED and wake up daemon thread (by ioctl/sysfs or other events); 2) Daemon thread woke up, md_check_recovery() found that MD_RECOVERY_NEEDED is set: a) try to grab reconfig_mutex; b) set MD_RECOVERY_RUNNING; c) clear MD_RECOVERY_NEEDED, and then queue sync_work; 3) md_start_sync() choose sync_action, then register sync_thread; 4) md_do_sync() is done, set MD_RECOVERY_DONE and wake up daemon thread; 5) Daemon thread woke up, md_check_recovery() found that MD_RECOVERY_DONE is set: a) try to grab reconfig_mutex; b) unregister sync_thread; c) clear MD_RECOVERY_RUNNING and MD_RECOVERY_DONE; Hence there is no such case that MD_RECOVERY_RUNNING is not set, while sync_thread is registered. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231228125553.2697765-2-yukuai1@huaweicloud.com --- drivers/md/md.c | 14 ++++---------- drivers/md/raid5.c | 6 ++---- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2266358d8074..9b6b34ab9d1d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3339,8 +3339,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev, if (kstrtoull(buf, 10, &new_offset) < 0) return -EINVAL; - if (mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (new_offset == rdev->data_offset) /* reset is always permitted */ @@ -4013,8 +4012,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) */ rv = -EBUSY; - if (mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) goto out_unlock; @@ -6408,7 +6406,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) mutex_lock(&mddev->open_mutex); if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || - mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { pr_warn("md: %s still in use.\n",mdname(mddev)); err = -EBUSY; @@ -6461,7 +6458,6 @@ static int do_md_stop(struct mddev *mddev, int mode, mutex_lock(&mddev->open_mutex); if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sysfs_active || - mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { pr_warn("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); @@ -7307,8 +7303,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) * of each device. If num_sectors is zero, we find the largest size * that fits. */ - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - mddev->sync_thread) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (!md_is_rdwr(mddev)) return -EROFS; @@ -7345,8 +7340,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; - if (mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || mddev->reshape_position != MaxSector) return -EBUSY; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8497880135ee..14f2cf75abbd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6967,10 +6967,8 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) pr_debug("md/raid: change stripe_size from %lu to %lu\n", conf->stripe_size, new); - if (mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - mddev->reshape_position != MaxSector || - mddev->sysfs_active) { + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + mddev->reshape_position != MaxSector || mddev->sysfs_active) { err = -EBUSY; goto out_unlock; } From faeaf210a559eb05bc1a294082d100d01c49a1e9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 28 Dec 2023 20:55:52 +0800 Subject: [PATCH 2/7] md: remove redundant md_wakeup_thread() On the one hand, mddev_unlock() will call md_wakeup_thread() unconditionally; on the other hand, md_check_recovery() can't make progress if 'reconfig_mutex' can't be grabbed. Hence, it really doesn't make sense to wake up daemon thread while 'reconfig_mutex' is still grabbed. Remove all the md_wakup_thread() for 'mddev->thread' while 'reconfig_mtuex' is still grabbed. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231228125553.2697765-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9b6b34ab9d1d..0b132ee2672e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2929,7 +2929,6 @@ static int add_bound_rdev(struct md_rdev *rdev) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_new_event(); - md_wakeup_thread(mddev->thread); return 0; } @@ -3044,10 +3043,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) if (err == 0) { md_kick_rdev_from_array(rdev); - if (mddev->pers) { + if (mddev->pers) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - md_wakeup_thread(mddev->thread); - } md_new_event(); } } @@ -3077,7 +3074,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); err = 0; } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { @@ -3115,7 +3111,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); err = 0; } else if (cmd_match(buf, "-want_replacement")) { /* Clearing 'want_replacement' is always allowed. @@ -3245,7 +3240,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) if (rdev->raid_disk >= 0) return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. @@ -6186,7 +6180,6 @@ int do_md_run(struct mddev *mddev) /* run start up tasks that require md_thread */ md_start(mddev); - md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); @@ -6207,7 +6200,6 @@ int md_start(struct mddev *mddev) if (mddev->pers->start) { set_bit(MD_RECOVERY_WAIT, &mddev->recovery); - md_wakeup_thread(mddev->thread); ret = mddev->pers->start(mddev); clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); md_wakeup_thread(mddev->sync_thread); @@ -6252,7 +6244,6 @@ static int restart_array(struct mddev *mddev) pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); /* Kick recovery or resync if necessary */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; @@ -6396,7 +6387,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { did_freeze = 1; set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - md_wakeup_thread(mddev->thread); } stop_sync_thread(mddev, false, false); @@ -6428,7 +6418,6 @@ out: if ((mddev->pers && !err) || did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_state); } @@ -6450,7 +6439,6 @@ static int do_md_stop(struct mddev *mddev, int mode, if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { did_freeze = 1; set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - md_wakeup_thread(mddev->thread); } stop_sync_thread(mddev, true, false); @@ -6464,7 +6452,6 @@ static int do_md_stop(struct mddev *mddev, int mode, if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); } return -EBUSY; } @@ -7005,9 +6992,7 @@ kick_rdev: md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - if (mddev->thread) - md_wakeup_thread(mddev->thread); - else + if (!mddev->thread) md_update_sb(mddev, 1); md_new_event(); @@ -7089,7 +7074,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) * array immediately. */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); md_new_event(); return 0; From 9cfcf99e7ed613e6b3697e1c1034a24487ec3154 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Fri, 29 Dec 2023 15:05:00 +0800 Subject: [PATCH 3/7] md: get rdev->mddev with READ_ONCE() Users may get rdev->mddev by sysfs while rdev is releasing. So use both READ_ONCE() and WRITE_ONCE() to prevent load/store tearing and to read/write mddev atomically. Signed-off-by: Li Lingfeng Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231229070500.3602712-1-lilingfeng@huaweicloud.com --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0b132ee2672e..5eff8e84cddf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2591,7 +2591,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); mddev_destroy_serial_pool(rdev->mddev, rdev); - rdev->mddev = NULL; + WRITE_ONCE(rdev->mddev, NULL); sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); sysfs_put(rdev->sysfs_unack_badblocks); @@ -3664,7 +3664,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, struct kernfs_node *kn = NULL; bool suspend = false; ssize_t rv; - struct mddev *mddev = rdev->mddev; + struct mddev *mddev = READ_ONCE(rdev->mddev); if (!entry->store) return -EIO; From 570b9147deb6b07b955b55e06c714ca12a5f3e16 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 4 Jan 2024 21:36:29 +0800 Subject: [PATCH 4/7] md: use RCU lock to protect traversal in md_spares_need_change() Since md_start_sync() will be called without the protect of mddev_lock, and it can run concurrently with array reconfiguration, traversal of rdev in it should be protected by RCU lock. Commit bc08041b32ab ("md: suspend array in md_start_sync() if array need reconfiguration") added md_spares_need_change() to md_start_sync(), casusing use of rdev without any protection. Fix this by adding RCU lock in md_spares_need_change(). Fixes: bc08041b32ab ("md: suspend array in md_start_sync() if array need reconfiguration") Cc: stable@vger.kernel.org # 6.7+ Signed-off-by: Li Lingfeng Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240104133629.1277517-1-lilingfeng@huaweicloud.com --- drivers/md/md.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5eff8e84cddf..45bb387d69c4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9240,9 +9240,14 @@ static bool md_spares_need_change(struct mddev *mddev) { struct md_rdev *rdev; - rdev_for_each(rdev, mddev) - if (rdev_removeable(rdev) || rdev_addable(rdev)) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev_removeable(rdev) || rdev_addable(rdev)) { + rcu_read_unlock(); return true; + } + } + rcu_read_unlock(); return false; } From 95b77082b790633129e318efde85a51571570395 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 1 Feb 2024 22:45:49 +0000 Subject: [PATCH 5/7] md/linear: Get rid of md-linear.h Given that 849d18e27be9 ("md: Remove deprecated CONFIG_MD_LINEAR") killed the linear flavour of MD, it seems only logical to drop the leftover include file that used to come with it. I also feel that it should be my own privilege to remove my 30 year old attempt at writing kernel code ;-). RIP! Cc: Song Liu Cc: Yu Kuai Signed-off-by: Marc Zyngier Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240201224549.750644-1-maz@kernel.org --- drivers/md/md-linear.h | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 drivers/md/md-linear.h diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h deleted file mode 100644 index 5587eeedb882..000000000000 --- a/drivers/md/md-linear.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINEAR_H -#define _LINEAR_H - -struct dev_info { - struct md_rdev *rdev; - sector_t end_sector; -}; - -struct linear_conf -{ - struct rcu_head rcu; - sector_t array_sectors; - int raid_disks; /* a copy of mddev->raid_disks */ - struct dev_info disks[] __counted_by(raid_disks); -}; -#endif From 83cbdaf61b1ab9cdaa0321eeea734bc70ca069c8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 5 Feb 2024 15:34:39 -0800 Subject: [PATCH 6/7] md/multipath: Remove md-multipath.h md-multipath is already deprecated. Remove the header file. Signed-off-by: Song Liu --- drivers/md/md-multipath.h | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 drivers/md/md-multipath.h diff --git a/drivers/md/md-multipath.h b/drivers/md/md-multipath.h deleted file mode 100644 index b3099e5fc4d7..000000000000 --- a/drivers/md/md-multipath.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _MULTIPATH_H -#define _MULTIPATH_H - -struct multipath_info { - struct md_rdev *rdev; -}; - -struct mpconf { - struct mddev *mddev; - struct multipath_info *multipaths; - int raid_disks; - spinlock_t device_lock; - struct list_head retry_list; - - mempool_t pool; -}; - -/* - * this is our 'private' 'collective' MULTIPATH buffer head. - * it contains information about what kind of IO operations were started - * for this MULTIPATH operation, and about their status: - */ - -struct multipath_bh { - struct mddev *mddev; - struct bio *master_bio; - struct bio bio; - int path; - struct list_head retry_list; -}; -#endif From 6cf350658736681b9d6b0b6e58c5c76b235bb4c4 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 8 Feb 2024 16:55:56 +0800 Subject: [PATCH 7/7] md: fix kmemleak of rdev->serial If kobject_add() is fail in bind_rdev_to_array(), 'rdev->serial' will be alloc not be freed, and kmemleak occurs. unreferenced object 0xffff88815a350000 (size 49152): comm "mdadm", pid 789, jiffies 4294716910 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc f773277a): [<0000000058b0a453>] kmemleak_alloc+0x61/0xe0 [<00000000366adf14>] __kmalloc_large_node+0x15e/0x270 [<000000002e82961b>] __kmalloc_node.cold+0x11/0x7f [<00000000f206d60a>] kvmalloc_node+0x74/0x150 [<0000000034bf3363>] rdev_init_serial+0x67/0x170 [<0000000010e08fe9>] mddev_create_serial_pool+0x62/0x220 [<00000000c3837bf0>] bind_rdev_to_array+0x2af/0x630 [<0000000073c28560>] md_add_new_disk+0x400/0x9f0 [<00000000770e30ff>] md_ioctl+0x15bf/0x1c10 [<000000006cfab718>] blkdev_ioctl+0x191/0x3f0 [<0000000085086a11>] vfs_ioctl+0x22/0x60 [<0000000018b656fe>] __x64_sys_ioctl+0xba/0xe0 [<00000000e54e675e>] do_syscall_64+0x71/0x150 [<000000008b0ad622>] entry_SYSCALL_64_after_hwframe+0x6c/0x74 Fixes: 963c555e75b0 ("md: introduce mddev_create/destroy_wb_pool for the change of member device") Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240208085556.2412922-1-linan666@huaweicloud.com --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 45bb387d69c4..e2a5f513dbb7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2562,6 +2562,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) fail: pr_warn("md: failed to register dev-%s for %s\n", b, mdname(mddev)); + mddev_destroy_serial_pool(mddev, rdev); return err; }