From 5268de78e1e61c828174cb2ac7d9c566a97cce69 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 8 Apr 2024 21:36:56 +0200 Subject: [PATCH 01/10] dm-crypt: add the optional "high_priority" flag When WQ_HIGHPRI was used for the dm-crypt kcryptd workqueue it was reported that dm-crypt performs badly when the system is loaded[1]. Because of reports of audio skipping, dm-crypt stopped using WQ_HIGHPRI with commit f612b2132db5 (Revert "dm crypt: use WQ_HIGHPRI for the IO and crypt workqueues"). But it has since been determined that WQ_HIGHPRI provides improved performance (with reduced latency) for highend systems with much more resources than those laptop/desktop users which suffered from the use of WQ_HIGHPRI. As such, add an option "high_priority" that allows the use of WQ_HIGHPRI for dm-crypt's workqueues and also sets the write_thread to nice level MIN_NICE (-20). This commit makes it optional, so that normal users won't be harmed by it. [1] https://listman.redhat.com/archives/dm-devel/2023-February/053410.html Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-crypt.rst | 5 +++ drivers/md/dm-crypt.c | 37 +++++++++++++------ 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst index aa2d04d95df6..41f5f57f00eb 100644 --- a/Documentation/admin-guide/device-mapper/dm-crypt.rst +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -113,6 +113,11 @@ same_cpu_crypt The default is to use an unbound workqueue so that encryption work is automatically balanced between available CPUs. +high_priority + Set dm-crypt workqueues and the writer thread to high priority. This + improves throughput and latency of dm-crypt while degrading general + responsiveness of the system. + submit_from_crypt_cpus Disable offloading writes to a separate thread after encryption. There are some situations where offloading write bios from the diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9a74c6316c5d..eabc576d8d28 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -137,9 +137,9 @@ struct iv_elephant_private { * and encrypts / decrypts at the same time. */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, - DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD, - DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE, - DM_CRYPT_WRITE_INLINE }; + DM_CRYPT_SAME_CPU, DM_CRYPT_HIGH_PRIORITY, + DM_CRYPT_NO_OFFLOAD, DM_CRYPT_NO_READ_WORKQUEUE, + DM_CRYPT_NO_WRITE_WORKQUEUE, DM_CRYPT_WRITE_INLINE }; enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */ @@ -3134,7 +3134,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar struct crypt_config *cc = ti->private; struct dm_arg_set as; static const struct dm_arg _args[] = { - {0, 8, "Invalid number of feature args"}, + {0, 9, "Invalid number of feature args"}, }; unsigned int opt_params, val; const char *opt_string, *sval; @@ -3161,6 +3161,8 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar else if (!strcasecmp(opt_string, "same_cpu_crypt")) set_bit(DM_CRYPT_SAME_CPU, &cc->flags); + else if (!strcasecmp(opt_string, "high_priority")) + set_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags); else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); @@ -3232,6 +3234,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) const char *devname = dm_table_device_name(ti->table); int key_size; unsigned int align_mask; + unsigned int common_wq_flags; unsigned long long tmpll; int ret; size_t iv_size_padding, additional_req_size; @@ -3399,19 +3402,25 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname); + common_wq_flags = WQ_MEM_RECLAIM; + if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags)) + common_wq_flags |= WQ_HIGHPRI; + + cc->io_queue = alloc_workqueue("kcryptd_io/%s", common_wq_flags, 1, devname); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } - if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, - 1, devname); - else + if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) { cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + common_wq_flags | WQ_CPU_INTENSIVE, + 1, devname); + } else { + cc->crypt_queue = alloc_workqueue("kcryptd/%s", + common_wq_flags | WQ_CPU_INTENSIVE | WQ_UNBOUND, num_online_cpus(), devname); + } if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; @@ -3427,6 +3436,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Couldn't spawn write thread"; goto bad; } + if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags)) + set_user_nice(cc->write_thread, MIN_NICE); ti->num_flush_bios = 1; ti->limit_swap_bios = true; @@ -3547,6 +3558,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += !!ti->num_discard_bios; num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); @@ -3560,6 +3572,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" allow_discards"); if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) DMEMIT(" same_cpu_crypt"); + if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags)) + DMEMIT(" high_priority"); if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) DMEMIT(" submit_from_crypt_cpus"); if (test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) @@ -3579,6 +3593,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT_TARGET_NAME_VERSION(ti->type); DMEMIT(",allow_discards=%c", ti->num_discard_bios ? 'y' : 'n'); DMEMIT(",same_cpu_crypt=%c", test_bit(DM_CRYPT_SAME_CPU, &cc->flags) ? 'y' : 'n'); + DMEMIT(",high_priority=%c", test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags) ? 'y' : 'n'); DMEMIT(",submit_from_crypt_cpus=%c", test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags) ? 'y' : 'n'); DMEMIT(",no_read_workqueue=%c", test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags) ? @@ -3706,7 +3721,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 25, 0}, + .version = {1, 26, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, From 2285e1496dc68787a626145cfb0904a1b6cc5501 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Tue, 11 Jul 2023 14:11:29 +0800 Subject: [PATCH 02/10] dm-crypt: export sysfs of all workqueues Once there is a heavy IO load, so many encrypt/decrypt work will occupy all of the cpu, which may lead to the poor performance for other service. So the improved visibility and controls over dm-crypt workqueues, as was offered with commit a2b8b2d97567 ("dm crypt: export sysfs of kcryptd workqueue"), seems necessary. By exporting dm-crypt's workqueues in sysfs, the entry like cpumask/max_active and so on can help us to limit the CPU usage for encrypt/decrypt work. However, commit a2b8b2d97567 did not consider that DM table reload will call .ctr before .dtr, so the reload for dm-crypt failed because the same sysfs name was present. This was the original need for commit 48b0777cd93d ("Revert "dm crypt: export sysfs of kcryptd workqueue""). Reintroduce the use of WQ_SYSFS, and use it for both the IO and crypt workqueues, but make the workqueue names include a unique id (via ida) to allow both old and new sysfs entries to coexist. Signed-off-by: yangerkun Acked-by: Tejun Heo Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index eabc576d8d28..5bfa35760167 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -47,6 +47,8 @@ #define DM_MSG_PREFIX "crypt" +static DEFINE_IDA(workqueue_ida); + /* * context holding the current state of a multi-part conversion */ @@ -184,6 +186,7 @@ struct crypt_config { struct crypto_aead **tfms_aead; } cipher_tfm; unsigned int tfms_count; + int workqueue_id; unsigned long cipher_flags; /* @@ -2771,6 +2774,9 @@ static void crypt_dtr(struct dm_target *ti) if (cc->crypt_queue) destroy_workqueue(cc->crypt_queue); + if (cc->workqueue_id) + ida_free(&workqueue_ida, cc->workqueue_id); + crypt_free_tfms(cc); bioset_exit(&cc->bs); @@ -3232,7 +3238,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; const char *devname = dm_table_device_name(ti->table); - int key_size; + int key_size, wq_id; unsigned int align_mask; unsigned int common_wq_flags; unsigned long long tmpll; @@ -3401,25 +3407,33 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->tag_pool_max_sectors <<= cc->sector_shift; } + wq_id = ida_alloc_min(&workqueue_ida, 1, GFP_KERNEL); + if (wq_id < 0) { + ti->error = "Couldn't get workqueue id"; + ret = wq_id; + goto bad; + } + cc->workqueue_id = wq_id; + ret = -ENOMEM; - common_wq_flags = WQ_MEM_RECLAIM; + common_wq_flags = WQ_MEM_RECLAIM | WQ_SYSFS; if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags)) common_wq_flags |= WQ_HIGHPRI; - cc->io_queue = alloc_workqueue("kcryptd_io/%s", common_wq_flags, 1, devname); + cc->io_queue = alloc_workqueue("kcryptd_io-%s-%d", common_wq_flags, 1, devname, wq_id); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) { - cc->crypt_queue = alloc_workqueue("kcryptd/%s", + cc->crypt_queue = alloc_workqueue("kcryptd-%s-%d", common_wq_flags | WQ_CPU_INTENSIVE, - 1, devname); + 1, devname, wq_id); } else { - cc->crypt_queue = alloc_workqueue("kcryptd/%s", + cc->crypt_queue = alloc_workqueue("kcryptd-%s-%d", common_wq_flags | WQ_CPU_INTENSIVE | WQ_UNBOUND, - num_online_cpus(), devname); + num_online_cpus(), devname, wq_id); } if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; From 7560680c8d1e810826d048a61d35668541dce038 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 11 Apr 2024 15:08:05 -0400 Subject: [PATCH 03/10] dm-crypt: stop constraining max_segment_size to PAGE_SIZE This change effectively reverts commit 586b286b110e ("dm crypt: constrain crypt device's max_segment_size to PAGE_SIZE") and relies on block core's late bio-splitting to ensure that dm-crypt's encryption bios are split accordingly if they exceed the underlying device's limits (e.g. max_segment_size). Commit 586b286b110e was applied as a 4.3 fix for the benefit of stable@ kernels 4.0+ just after block core's late bio-splitting was introduced in 4.3 with commit 54efd50bfd873 ("block: make generic_make_request handle arbitrarily sized bios"). Given block core's late bio-splitting it is past time that dm-crypt make use of it. Also, given the recent need to revert meaningful progress that was attempted during the 6.9 merge window (see commit bff4b74625fe Revert "dm: use queue_limits_set") this change allows DM core to safely make use of queue_limits_set() without risk of breaking dm-crypt on NVMe. Though it should be noted this commit isn't a prereq for reinstating DM core's use of queue_limits_set() because blk_validate_limits() was made less strict with commit b561ea56a264 ("block: allow device to have both virt_boundary_mask and max segment size"). Reviewed-by: Christoph Hellwig Reviewed-by: Mikulas Patocka Reviewed-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 5bfa35760167..f43a2c0b3d77 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1656,8 +1656,8 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone); /* * Generate a new unfragmented bio with the given size - * This should never violate the device limitations (but only because - * max_segment_size is being constrained to PAGE_SIZE). + * This should never violate the device limitations (but if it did then block + * core should split the bio as needed). * * This function may be called concurrently. If we allocate from the mempool * concurrently, there is a possibility of deadlock. For example, if we have @@ -3717,14 +3717,6 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct crypt_config *cc = ti->private; - /* - * Unfortunate constraint that is required to avoid the potential - * for exceeding underlying device's max_segments limits -- due to - * crypt_alloc_buffer() possibly allocating pages for the encryption - * bio that are not as physically contiguous as the original bio. - */ - limits->max_segment_size = PAGE_SIZE; - limits->logical_block_size = max_t(unsigned int, limits->logical_block_size, cc->sector_size); limits->physical_block_size = From 1c0e720228ad1c63bb487cdcead2558353b5a067 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:42 -0800 Subject: [PATCH 04/10] dm: use queue_limits_set Use queue_limits_set which validates the limits and takes care of updating the readahead settings instead of directly assigning them to the queue. For that make sure all limits are actually updated before the assignment. Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 41f1d731ae5a..88114719fe18 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1963,26 +1963,27 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, bool wc = false, fua = false; int r; - /* - * Copy table's limits to the DM device's request_queue - */ - q->limits = *limits; - if (dm_table_supports_nowait(t)) blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); else blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); if (!dm_table_supports_discards(t)) { - q->limits.max_discard_sectors = 0; - q->limits.max_hw_discard_sectors = 0; - q->limits.discard_granularity = 0; - q->limits.discard_alignment = 0; - q->limits.discard_misaligned = 0; + limits->max_hw_discard_sectors = 0; + limits->discard_granularity = 0; + limits->discard_alignment = 0; + limits->discard_misaligned = 0; } + if (!dm_table_supports_write_zeroes(t)) + limits->max_write_zeroes_sectors = 0; + if (!dm_table_supports_secure_erase(t)) - q->limits.max_secure_erase_sectors = 0; + limits->max_secure_erase_sectors = 0; + + r = queue_limits_set(q, limits); + if (r) + return r; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; @@ -2007,9 +2008,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - if (!dm_table_supports_write_zeroes(t)) - q->limits.max_write_zeroes_sectors = 0; - dm_table_verify_integrity(t); /* @@ -2047,7 +2045,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } dm_update_crypto_profile(q, t); - disk_update_readahead(t->md->disk); /* * Check for request-based device is left to From 83637d9017b22a5e11ada9f44ba776beb807222b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 12 Apr 2024 10:58:05 -0400 Subject: [PATCH 05/10] dm-crypt: don't set WQ_CPU_INTENSIVE for WQ_UNBOUND crypt_queue Fix crypt_queue's use of WQ_UNBOUND to _not_ use WQ_CPU_INTENSIVE because it is meaningless with WQ_UNBOUND. Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f43a2c0b3d77..1b7a97cc3779 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3431,8 +3431,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) common_wq_flags | WQ_CPU_INTENSIVE, 1, devname, wq_id); } else { + /* + * While crypt_queue is certainly CPU intensive, the use of + * WQ_CPU_INTENSIVE is meaningless with WQ_UNBOUND. + */ cc->crypt_queue = alloc_workqueue("kcryptd-%s-%d", - common_wq_flags | WQ_CPU_INTENSIVE | WQ_UNBOUND, + common_wq_flags | WQ_UNBOUND, num_online_cpus(), devname, wq_id); } if (!cc->crypt_queue) { From 8d24790ed08ab4e619ce58ed4a1b353ab77ffdc5 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 7 May 2024 17:16:23 -0400 Subject: [PATCH 06/10] dm-delay: fix workqueue delay_timer race delay_timer could be pending when delay_dtr() is called. It needs to be shut down before kdelayd_wq is destroyed, so it won't try queueing more work to kdelayd_wq while that's getting destroyed. Also the del_timer_sync() call in delay_presuspend() doesn't protect against the timer getting immediately rearmed by the queued call to flush_delayed_bios(), but there's no real harm if that does happen. timer_delete() is less work, and is basically just as likely to stop a pointless call to flush_delayed_bios(). Fixes: 26b9f228703f ("dm: delay target") Signed-off-by: Benjamin Marzinski Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 5eabdb06c649..eec0daa4b227 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -154,8 +154,10 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - if (dc->kdelayd_wq) + if (dc->kdelayd_wq) { + timer_shutdown_sync(&dc->delay_timer); destroy_workqueue(dc->kdelayd_wq); + } if (dc->read.dev) dm_put_device(ti, dc->read.dev); @@ -335,7 +337,7 @@ static void delay_presuspend(struct dm_target *ti) mutex_unlock(&delayed_bios_lock); if (!delay_is_fast(dc)) - del_timer_sync(&dc->delay_timer); + timer_delete(&dc->delay_timer); flush_delayed_bios(dc, true); } From d14646f23300a5fc85be867bafdc0702c2002789 Mon Sep 17 00:00:00 2001 From: Joel Colledge Date: Mon, 6 May 2024 09:25:23 +0200 Subject: [PATCH 07/10] dm-delay: fix hung task introduced by kthread mode If the worker thread is not woken due to a bio, then it is not woken at all. This causes the hung task check to trigger. This occurs, for instance, when no bios are submitted. Also when a delay of 0 is configured, delay_bio() returns without waking the worker. Prevent the hung task check from triggering by creating the thread with kthread_run() instead of using kthread_create() directly. Fixes: 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq") Signed-off-by: Joel Colledge Reviewed-by: Benjamin Marzinski Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index eec0daa4b227..4ba12d536994 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -269,8 +269,7 @@ out: * In case of small requested delays, use kthread instead of * timers and workqueue to achieve better latency. */ - dc->worker = kthread_create(&flush_worker_fn, dc, - "dm-delay-flush-worker"); + dc->worker = kthread_run(&flush_worker_fn, dc, "dm-delay-flush-worker"); if (IS_ERR(dc->worker)) { ret = PTR_ERR(dc->worker); dc->worker = NULL; From 64eb88d6caee2c8eb806a68dab3f184f14f818a4 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Mon, 6 May 2024 17:55:44 -0400 Subject: [PATCH 08/10] dm-delay: fix max_delay calculations delay_ctr() pointlessly compared max_delay in cases where multiple delay classes were initialized identically. Also, when write delays were configured different than read delays, delay_ctr() never compared their value against max_delay. Fix these issues. Fixes: 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq") Signed-off-by: Benjamin Marzinski Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 4ba12d536994..2ac43d1f1b92 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -242,19 +242,18 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ret = delay_class_ctr(ti, &dc->flush, argv); if (ret) goto bad; - max_delay = max(max_delay, dc->write.delay); - max_delay = max(max_delay, dc->flush.delay); goto out; } ret = delay_class_ctr(ti, &dc->write, argv + 3); if (ret) goto bad; + max_delay = max(max_delay, dc->write.delay); + if (argc == 6) { ret = delay_class_ctr(ti, &dc->flush, argv + 3); if (ret) goto bad; - max_delay = max(max_delay, dc->flush.delay); goto out; } From c542ee149230c4c3fc086feae608230e7aa97fcf Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 7 May 2024 17:16:24 -0400 Subject: [PATCH 09/10] dm-delay: change locking to avoid contention The delayed_bios list is protected by one mutex shared by all dm-delay devices. This mutex must be held whenever a bio is added or expired bios are removed from the list. Since a large number of expired bios could be on the list, flush_delayed_bios() can schedule while holding the mutex. This means a flush_delayed_bios() call on any dm-delay device can slow down delay_map() calls on any other dm-delay device. To keep dm-delay devices from slowing each other down and keep processing delay bios from slowing adding delayed bios, the global mutex has been removed, and each dm-delay device now has two locks. delayed_bios_lock is a spinlock that must be held whenever the delayed_bios list is accessed. process_bios_lock is a mutex that must be held whenever a process has temporarily pulled bios off the delayed_bios list to check which ones should be processed. It must be held until all the bios that won't be processed are returned to the list. This is what flush_delayed_bios() now does. The mutex is necessary to guarantee that delay_presuspend() sees the entire list of delayed bios when it calls flush_delayed_bios(). Signed-off-by: Benjamin Marzinski Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2ac43d1f1b92..da3f8131d52c 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -29,6 +29,8 @@ struct delay_class { struct delay_c { struct timer_list delay_timer; struct mutex timer_lock; + struct mutex process_bios_lock; /* hold while removing bios to be processed from list */ + spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */ struct workqueue_struct *kdelayd_wq; struct work_struct flush_expired_bios; struct list_head delayed_bios; @@ -49,8 +51,6 @@ struct dm_delay_info { unsigned long expires; }; -static DEFINE_MUTEX(delayed_bios_lock); - static void handle_delayed_timer(struct timer_list *t) { struct delay_c *dc = from_timer(dc, t, delay_timer); @@ -89,12 +89,16 @@ static void flush_delayed_bios(struct delay_c *dc, bool flush_all) { struct dm_delay_info *delayed, *next; struct bio_list flush_bio_list; + LIST_HEAD(local_list); unsigned long next_expires = 0; bool start_timer = false; bio_list_init(&flush_bio_list); - mutex_lock(&delayed_bios_lock); - list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { + mutex_lock(&dc->process_bios_lock); + spin_lock(&dc->delayed_bios_lock); + list_replace_init(&dc->delayed_bios, &local_list); + spin_unlock(&dc->delayed_bios_lock); + list_for_each_entry_safe(delayed, next, &local_list, list) { cond_resched(); if (flush_all || time_after_eq(jiffies, delayed->expires)) { struct bio *bio = dm_bio_from_per_bio_data(delayed, @@ -114,7 +118,10 @@ static void flush_delayed_bios(struct delay_c *dc, bool flush_all) } } } - mutex_unlock(&delayed_bios_lock); + spin_lock(&dc->delayed_bios_lock); + list_splice(&local_list, &dc->delayed_bios); + spin_unlock(&dc->delayed_bios_lock); + mutex_unlock(&dc->process_bios_lock); if (start_timer) queue_timeout(dc, next_expires); @@ -128,13 +135,13 @@ static int flush_worker_fn(void *data) while (!kthread_should_stop()) { flush_delayed_bios(dc, false); - mutex_lock(&delayed_bios_lock); + spin_lock(&dc->delayed_bios_lock); if (unlikely(list_empty(&dc->delayed_bios))) { set_current_state(TASK_INTERRUPTIBLE); - mutex_unlock(&delayed_bios_lock); + spin_unlock(&dc->delayed_bios_lock); schedule(); } else { - mutex_unlock(&delayed_bios_lock); + spin_unlock(&dc->delayed_bios_lock); cond_resched(); } } @@ -168,6 +175,7 @@ static void delay_dtr(struct dm_target *ti) if (dc->worker) kthread_stop(dc->worker); + mutex_destroy(&dc->process_bios_lock); mutex_destroy(&dc->timer_lock); kfree(dc); @@ -227,6 +235,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = dc; INIT_LIST_HEAD(&dc->delayed_bios); mutex_init(&dc->timer_lock); + mutex_init(&dc->process_bios_lock); + spin_lock_init(&dc->delayed_bios_lock); dc->may_delay = true; dc->argc = argc; @@ -309,14 +319,14 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) delayed->context = dc; delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay); - mutex_lock(&delayed_bios_lock); + spin_lock(&dc->delayed_bios_lock); if (unlikely(!dc->may_delay)) { - mutex_unlock(&delayed_bios_lock); + spin_unlock(&dc->delayed_bios_lock); return DM_MAPIO_REMAPPED; } c->ops++; list_add_tail(&delayed->list, &dc->delayed_bios); - mutex_unlock(&delayed_bios_lock); + spin_unlock(&dc->delayed_bios_lock); if (delay_is_fast(dc)) wake_up_process(dc->worker); @@ -330,9 +340,9 @@ static void delay_presuspend(struct dm_target *ti) { struct delay_c *dc = ti->private; - mutex_lock(&delayed_bios_lock); + spin_lock(&dc->delayed_bios_lock); dc->may_delay = false; - mutex_unlock(&delayed_bios_lock); + spin_unlock(&dc->delayed_bios_lock); if (!delay_is_fast(dc)) timer_delete(&dc->delay_timer); From 8b21ac87d550acc4f6207764fed0cf6f0e3966cd Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 7 May 2024 17:16:25 -0400 Subject: [PATCH 10/10] dm-delay: remove timer_lock Instead of manually checking the timer details in queue_timeout(), call timer_reduce() to start the timer or reduce the expiration time. This avoids needing a lock. Signed-off-by: Benjamin Marzinski Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index da3f8131d52c..08f6387620c1 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -28,7 +28,6 @@ struct delay_class { struct delay_c { struct timer_list delay_timer; - struct mutex timer_lock; struct mutex process_bios_lock; /* hold while removing bios to be processed from list */ spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */ struct workqueue_struct *kdelayd_wq; @@ -60,12 +59,7 @@ static void handle_delayed_timer(struct timer_list *t) static void queue_timeout(struct delay_c *dc, unsigned long expires) { - mutex_lock(&dc->timer_lock); - - if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires) - mod_timer(&dc->delay_timer, expires); - - mutex_unlock(&dc->timer_lock); + timer_reduce(&dc->delay_timer, expires); } static inline bool delay_is_fast(struct delay_c *dc) @@ -176,7 +170,6 @@ static void delay_dtr(struct dm_target *ti) kthread_stop(dc->worker); mutex_destroy(&dc->process_bios_lock); - mutex_destroy(&dc->timer_lock); kfree(dc); } @@ -234,7 +227,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = dc; INIT_LIST_HEAD(&dc->delayed_bios); - mutex_init(&dc->timer_lock); mutex_init(&dc->process_bios_lock); spin_lock_init(&dc->delayed_bios_lock); dc->may_delay = true;