md: Don't register sync_thread for reshape directly
Currently, if a reshape is interrupted, reassembling the array registers sync_thread directly from pers->run(). In this case 'MD_RECOVERY_RUNNING' is set directly; however, there is no guarantee that md_do_sync() will be executed, hence stop_sync_thread() will hang because 'MD_RECOVERY_RUNNING' can't be cleared. The previous patch makes sure that md_do_sync() will set MD_RECOVERY_DONE; however, the following hang can still occasionally be triggered by the dm-raid test shell/lvconvert-raid-reshape.sh: [root@fedora ~]# cat /proc/1982/stack [<0>] stop_sync_thread+0x1ab/0x270 [md_mod] [<0>] md_frozen_sync_thread+0x5c/0xa0 [md_mod] [<0>] raid_presuspend+0x1e/0x70 [dm_raid] [<0>] dm_table_presuspend_targets+0x40/0xb0 [dm_mod] [<0>] __dm_destroy+0x2a5/0x310 [dm_mod] [<0>] dm_destroy+0x16/0x30 [dm_mod] [<0>] dev_remove+0x165/0x290 [dm_mod] [<0>] ctl_ioctl+0x4bb/0x7b0 [dm_mod] [<0>] dm_ctl_ioctl+0x11/0x20 [dm_mod] [<0>] vfs_ioctl+0x21/0x60 [<0>] __x64_sys_ioctl+0xb9/0xe0 [<0>] do_syscall_64+0xc6/0x230 [<0>] entry_SYSCALL_64_after_hwframe+0x6c/0x74 Meanwhile mddev->recovery is: MD_RECOVERY_RUNNING | MD_RECOVERY_INTR | MD_RECOVERY_RESHAPE | MD_RECOVERY_FROZEN Fix this problem by removing the code that registers sync_thread directly from raid10 and raid5, and let md_check_recovery() register sync_thread instead. Fixes: f67055780caa ("[PATCH] md: Checkpoint and allow restart of raid5 reshape") Fixes: f52f5c71f3d4 ("md: fix stopping sync thread") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Yu Kuai <yukuai3@huawei.com> Signed-off-by: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20240201092559.910982-5-yukuai1@huaweicloud.com
This commit is contained in:
parent
82ec0ae59d
commit
ad39c08186
@ -9376,6 +9376,7 @@ static void md_start_sync(struct work_struct *ws)
|
||||
struct mddev *mddev = container_of(ws, struct mddev, sync_work);
|
||||
int spares = 0;
|
||||
bool suspend = false;
|
||||
char *name;
|
||||
|
||||
if (md_spares_need_change(mddev))
|
||||
suspend = true;
|
||||
@ -9408,8 +9409,10 @@ static void md_start_sync(struct work_struct *ws)
|
||||
if (spares)
|
||||
md_bitmap_write_all(mddev->bitmap);
|
||||
|
||||
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
|
||||
"reshape" : "resync";
|
||||
rcu_assign_pointer(mddev->sync_thread,
|
||||
md_register_thread(md_do_sync, mddev, "resync"));
|
||||
md_register_thread(md_do_sync, mddev, name));
|
||||
if (!mddev->sync_thread) {
|
||||
pr_warn("%s: could not start resync thread...\n",
|
||||
mdname(mddev));
|
||||
|
@ -4175,11 +4175,7 @@ static int raid10_run(struct mddev *mddev)
|
||||
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
|
||||
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
|
||||
rcu_assign_pointer(mddev->sync_thread,
|
||||
md_register_thread(md_do_sync, mddev, "reshape"));
|
||||
if (!mddev->sync_thread)
|
||||
goto out_free_conf;
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -4573,16 +4569,8 @@ out:
|
||||
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
|
||||
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
|
||||
|
||||
rcu_assign_pointer(mddev->sync_thread,
|
||||
md_register_thread(md_do_sync, mddev, "reshape"));
|
||||
if (!mddev->sync_thread) {
|
||||
ret = -EAGAIN;
|
||||
goto abort;
|
||||
}
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
conf->reshape_checkpoint = jiffies;
|
||||
md_wakeup_thread(mddev->sync_thread);
|
||||
md_new_event();
|
||||
return 0;
|
||||
|
||||
|
@ -7936,11 +7936,7 @@ static int raid5_run(struct mddev *mddev)
|
||||
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
|
||||
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
|
||||
rcu_assign_pointer(mddev->sync_thread,
|
||||
md_register_thread(md_do_sync, mddev, "reshape"));
|
||||
if (!mddev->sync_thread)
|
||||
goto abort;
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
}
|
||||
|
||||
/* Ok, everything is just fine now */
|
||||
@ -8506,29 +8502,8 @@ static int raid5_start_reshape(struct mddev *mddev)
|
||||
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
|
||||
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
|
||||
rcu_assign_pointer(mddev->sync_thread,
|
||||
md_register_thread(md_do_sync, mddev, "reshape"));
|
||||
if (!mddev->sync_thread) {
|
||||
mddev->recovery = 0;
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
write_seqcount_begin(&conf->gen_lock);
|
||||
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
|
||||
mddev->new_chunk_sectors =
|
||||
conf->chunk_sectors = conf->prev_chunk_sectors;
|
||||
mddev->new_layout = conf->algorithm = conf->prev_algo;
|
||||
rdev_for_each(rdev, mddev)
|
||||
rdev->new_data_offset = rdev->data_offset;
|
||||
smp_wmb();
|
||||
conf->generation --;
|
||||
conf->reshape_progress = MaxSector;
|
||||
mddev->reshape_position = MaxSector;
|
||||
write_seqcount_end(&conf->gen_lock);
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
return -EAGAIN;
|
||||
}
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
conf->reshape_checkpoint = jiffies;
|
||||
md_wakeup_thread(mddev->sync_thread);
|
||||
md_new_event();
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user