md: fix duplicate filename for rdev

Commit 5792a2856a ("[PATCH] md: avoid a deadlock when removing a device
from an md array via sysfs") delays the deletion of rdev, however, this
introduces a window that rdev can be added again while the deletion is
not done yet, and sysfs will complain about duplicate filename.

Follow up patches try to fix this problem by flushing workqueue, however,
flush_rdev_wq() is just dead code, the progress in
md_kick_rdev_from_array():

1) list_del_rcu(&rdev->same_set);
2) synchronize_rcu();
3) queue_work(md_rdev_misc_wq, &rdev->del_work);

So in flush_rdev_wq(), if rdev is found in the list, work_pending() can
never pass, in the meantime, if work is queued, then rdev can never be
found in the list.

flush_rdev_wq() can be replaced by flush_workqueue() directly, however,
this approach is not good:
- the workqueue is global, this synchronization for all raid disks is
  not necessary.
- flush_workqueue can't be called under 'reconfig_mutex', there is still
  a small window between flush_workqueue() and mddev_lock() that other
  contexts can queue new work, hence the problem is not solved completely.

sysfs already has apis to support delete itself through writer, and
these apis, specifically sysfs_break/unbreak_active_protection(), is used
to support deleting rdev synchronously. Therefore, the above commit can be
reverted, and sysfs duplicate filename can be avoided.

A new mdadm regression test is proposed as well([1]).

[1] https://lore.kernel.org/linux-raid/20230428062845.1975462-1-yukuai1@huaweicloud.com/

Fixes: 5792a2856a ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230523012727.3042247-1-yukuai1@huaweicloud.com
This commit is contained in:
Yu Kuai 2023-05-23 09:27:27 +08:00 committed by Song Liu
parent f8b20a4054
commit 3ce94ce5d0
2 changed files with 52 additions and 44 deletions

View File

@ -87,11 +87,11 @@ static struct module *md_cluster_mod;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq; static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq; static struct workqueue_struct *md_misc_wq;
static struct workqueue_struct *md_rdev_misc_wq;
static int remove_and_add_spares(struct mddev *mddev, static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this); struct md_rdev *this);
static void mddev_detach(struct mddev *mddev); static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
/* /*
* Default number of read corrections we'll attempt on an rdev * Default number of read corrections we'll attempt on an rdev
@ -643,9 +643,11 @@ void mddev_init(struct mddev *mddev)
{ {
mutex_init(&mddev->open_mutex); mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->delete_mutex);
mutex_init(&mddev->bitmap_info.mutex); mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs); INIT_LIST_HEAD(&mddev->all_mddevs);
INIT_LIST_HEAD(&mddev->deleting);
timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
atomic_set(&mddev->active, 1); atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0); atomic_set(&mddev->openers, 0);
@ -747,6 +749,24 @@ static void mddev_free(struct mddev *mddev)
static const struct attribute_group md_redundancy_group; static const struct attribute_group md_redundancy_group;
static void md_free_rdev(struct mddev *mddev)
{
struct md_rdev *rdev;
struct md_rdev *tmp;
mutex_lock(&mddev->delete_mutex);
if (list_empty(&mddev->deleting))
goto out;
list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
list_del_init(&rdev->same_set);
kobject_del(&rdev->kobj);
export_rdev(rdev, mddev);
}
out:
mutex_unlock(&mddev->delete_mutex);
}
void mddev_unlock(struct mddev *mddev) void mddev_unlock(struct mddev *mddev)
{ {
if (mddev->to_remove) { if (mddev->to_remove) {
@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev)
} else } else
mutex_unlock(&mddev->reconfig_mutex); mutex_unlock(&mddev->reconfig_mutex);
md_free_rdev(mddev);
/* As we've dropped the mutex we need a spinlock to /* As we've dropped the mutex we need a spinlock to
* make sure the thread doesn't disappear * make sure the thread doesn't disappear
*/ */
@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return err; return err;
} }
static void rdev_delayed_delete(struct work_struct *ws)
{
struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
kobject_del(&rdev->kobj);
kobject_put(&rdev->kobj);
}
void md_autodetect_dev(dev_t dev); void md_autodetect_dev(dev_t dev);
/* just for claiming the bdev */ /* just for claiming the bdev */
@ -2455,6 +2470,8 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
static void md_kick_rdev_from_array(struct md_rdev *rdev) static void md_kick_rdev_from_array(struct md_rdev *rdev)
{ {
struct mddev *mddev = rdev->mddev;
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set); list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev); pr_debug("md: unbind<%pg>\n", rdev->bdev);
@ -2468,15 +2485,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_unack_badblocks = NULL;
rdev->sysfs_badblocks = NULL; rdev->sysfs_badblocks = NULL;
rdev->badblocks.count = 0; rdev->badblocks.count = 0;
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state". We also need
* to delay it due to rcu usage.
*/
synchronize_rcu(); synchronize_rcu();
INIT_WORK(&rdev->del_work, rdev_delayed_delete);
kobject_get(&rdev->kobj); /*
queue_work(md_rdev_misc_wq, &rdev->del_work); * kobject_del() will wait for all in progress writers to be done, where
export_rdev(rdev, rdev->mddev); * reconfig_mutex is held, hence it can't be called under
* reconfig_mutex and it's delayed to mddev_unlock().
*/
mutex_lock(&mddev->delete_mutex);
list_add(&rdev->same_set, &mddev->deleting);
mutex_unlock(&mddev->delete_mutex);
} }
static void export_array(struct mddev *mddev) static void export_array(struct mddev *mddev)
@ -3544,6 +3563,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
{ {
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
struct kernfs_node *kn = NULL;
ssize_t rv; ssize_t rv;
struct mddev *mddev = rdev->mddev; struct mddev *mddev = rdev->mddev;
@ -3551,6 +3571,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
return -EIO; return -EIO;
if (!capable(CAP_SYS_ADMIN)) if (!capable(CAP_SYS_ADMIN))
return -EACCES; return -EACCES;
if (entry->store == state_store && cmd_match(page, "remove"))
kn = sysfs_break_active_protection(kobj, attr);
rv = mddev ? mddev_lock(mddev) : -ENODEV; rv = mddev ? mddev_lock(mddev) : -ENODEV;
if (!rv) { if (!rv) {
if (rdev->mddev == NULL) if (rdev->mddev == NULL)
@ -3559,6 +3583,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
rv = entry->store(rdev, page, length); rv = entry->store(rdev, page, length);
mddev_unlock(mddev); mddev_unlock(mddev);
} }
if (kn)
sysfs_unbreak_active_protection(kn);
return rv; return rv;
} }
@ -4484,20 +4512,6 @@ null_show(struct mddev *mddev, char *page)
return -EINVAL; return -EINVAL;
} }
/* need to ensure rdev_delayed_delete() has completed */
static void flush_rdev_wq(struct mddev *mddev)
{
struct md_rdev *rdev;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (work_pending(&rdev->del_work)) {
flush_workqueue(md_rdev_misc_wq);
break;
}
rcu_read_unlock();
}
static ssize_t static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len) new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{ {
@ -4525,7 +4539,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev)) minor != MINOR(dev))
return -EOVERFLOW; return -EOVERFLOW;
flush_rdev_wq(mddev);
err = mddev_lock(mddev); err = mddev_lock(mddev);
if (err) if (err)
return err; return err;
@ -5595,7 +5608,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
* removed (mddev_delayed_delete). * removed (mddev_delayed_delete).
*/ */
flush_workqueue(md_misc_wq); flush_workqueue(md_misc_wq);
flush_workqueue(md_rdev_misc_wq);
mutex_lock(&disks_mutex); mutex_lock(&disks_mutex);
mddev = mddev_alloc(dev); mddev = mddev_alloc(dev);
@ -7558,9 +7570,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
} }
if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
flush_rdev_wq(mddev);
if (cmd == HOT_REMOVE_DISK) if (cmd == HOT_REMOVE_DISK)
/* need to ensure recovery thread has run */ /* need to ensure recovery thread has run */
wait_event_interruptible_timeout(mddev->sb_wait, wait_event_interruptible_timeout(mddev->sb_wait,
@ -9623,10 +9632,6 @@ static int __init md_init(void)
if (!md_misc_wq) if (!md_misc_wq)
goto err_misc_wq; goto err_misc_wq;
md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
if (!md_rdev_misc_wq)
goto err_rdev_misc_wq;
ret = __register_blkdev(MD_MAJOR, "md", md_probe); ret = __register_blkdev(MD_MAJOR, "md", md_probe);
if (ret < 0) if (ret < 0)
goto err_md; goto err_md;
@ -9645,8 +9650,6 @@ static int __init md_init(void)
err_mdp: err_mdp:
unregister_blkdev(MD_MAJOR, "md"); unregister_blkdev(MD_MAJOR, "md");
err_md: err_md:
destroy_workqueue(md_rdev_misc_wq);
err_rdev_misc_wq:
destroy_workqueue(md_misc_wq); destroy_workqueue(md_misc_wq);
err_misc_wq: err_misc_wq:
destroy_workqueue(md_wq); destroy_workqueue(md_wq);
@ -9942,7 +9945,6 @@ static __exit void md_exit(void)
} }
spin_unlock(&all_mddevs_lock); spin_unlock(&all_mddevs_lock);
destroy_workqueue(md_rdev_misc_wq);
destroy_workqueue(md_misc_wq); destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq); destroy_workqueue(md_wq);
} }

View File

@ -122,8 +122,6 @@ struct md_rdev {
struct serial_in_rdev *serial; /* used for raid1 io serialization */ struct serial_in_rdev *serial; /* used for raid1 io serialization */
struct work_struct del_work; /* used for delayed sysfs removal */
struct kernfs_node *sysfs_state; /* handle for 'state' struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */ * sysfs entry */
/* handle for 'unacknowledged_bad_blocks' sysfs dentry */ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */
@ -531,6 +529,14 @@ struct mddev {
unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int good_device_nr; /* good device num within cluster raid */
unsigned int noio_flag; /* for memalloc scope API */ unsigned int noio_flag; /* for memalloc scope API */
/*
* Temporarily store rdev that will be finally removed when
* reconfig_mutex is unlocked.
*/
struct list_head deleting;
/* Protect the deleting list */
struct mutex delete_mutex;
bool has_superblocks:1; bool has_superblocks:1;
bool fail_last_dev:1; bool fail_last_dev:1;
bool serialize_policy:1; bool serialize_policy:1;