From 5268de78e1e61c828174cb2ac7d9c566a97cce69 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 8 Apr 2024 21:36:56 +0200
Subject: [PATCH 01/10] dm-crypt: add the optional "high_priority" flag

When WQ_HIGHPRI was used for the dm-crypt kcryptd workqueue it was
reported that dm-crypt performs badly when the system is loaded[1].
Because of reports of audio skipping, dm-crypt stopped using
WQ_HIGHPRI with commit f612b2132db5 (Revert "dm crypt: use WQ_HIGHPRI
for the IO and crypt workqueues").

But it has since been determined that WQ_HIGHPRI provides improved
performance (with reduced latency) for highend systems with much more
resources than those laptop/desktop users which suffered from the use
of WQ_HIGHPRI.

As such, add an option "high_priority" that allows the use of
WQ_HIGHPRI for dm-crypt's workqueues and also sets the write_thread to
nice level MIN_NICE (-20). This commit makes it optional, so that
normal users won't be harmed by it.

[1] https://listman.redhat.com/archives/dm-devel/2023-February/053410.html

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 .../admin-guide/device-mapper/dm-crypt.rst    |  5 +++
 drivers/md/dm-crypt.c                         | 37 +++++++++++++------
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst
index aa2d04d95df6..41f5f57f00eb 100644
--- a/Documentation/admin-guide/device-mapper/dm-crypt.rst
+++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst
@@ -113,6 +113,11 @@ same_cpu_crypt
     The default is to use an unbound workqueue so that encryption work
     is automatically balanced between available CPUs.
 
+high_priority
+    Set dm-crypt workqueues and the writer thread to high priority. This
+    improves throughput and latency of dm-crypt while degrading general
+    responsiveness of the system.
+
 submit_from_crypt_cpus
     Disable offloading writes to a separate thread after encryption.
     There are some situations where offloading write bios from the
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9a74c6316c5d..eabc576d8d28 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -137,9 +137,9 @@ struct iv_elephant_private {
  * and encrypts / decrypts at the same time.
  */
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
-	     DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD,
-	     DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE,
-	     DM_CRYPT_WRITE_INLINE };
+	     DM_CRYPT_SAME_CPU, DM_CRYPT_HIGH_PRIORITY,
+	     DM_CRYPT_NO_OFFLOAD, DM_CRYPT_NO_READ_WORKQUEUE,
+	     DM_CRYPT_NO_WRITE_WORKQUEUE, DM_CRYPT_WRITE_INLINE };
 
 enum cipher_flags {
 	CRYPT_MODE_INTEGRITY_AEAD,	/* Use authenticated mode for cipher */
@@ -3134,7 +3134,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
 	struct crypt_config *cc = ti->private;
 	struct dm_arg_set as;
 	static const struct dm_arg _args[] = {
-		{0, 8, "Invalid number of feature args"},
+		{0, 9, "Invalid number of feature args"},
 	};
 	unsigned int opt_params, val;
 	const char *opt_string, *sval;
@@ -3161,6 +3161,8 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
 
 		else if (!strcasecmp(opt_string, "same_cpu_crypt"))
 			set_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+		else if (!strcasecmp(opt_string, "high_priority"))
+			set_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags);
 
 		else if (!strcasecmp(opt_string, "submit_from_crypt_cpus"))
 			set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
@@ -3232,6 +3234,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	const char *devname = dm_table_device_name(ti->table);
 	int key_size;
 	unsigned int align_mask;
+	unsigned int common_wq_flags;
 	unsigned long long tmpll;
 	int ret;
 	size_t iv_size_padding, additional_req_size;
@@ -3399,19 +3402,25 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ret = -ENOMEM;
-	cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname);
+	common_wq_flags = WQ_MEM_RECLAIM;
+	if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags))
+		common_wq_flags |= WQ_HIGHPRI;
+
+	cc->io_queue = alloc_workqueue("kcryptd_io/%s", common_wq_flags, 1, devname);
 	if (!cc->io_queue) {
 		ti->error = "Couldn't create kcryptd io queue";
 		goto bad;
 	}
 
-	if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
-		cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
-						  1, devname);
-	else
+	if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) {
 		cc->crypt_queue = alloc_workqueue("kcryptd/%s",
-						  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
+						  common_wq_flags | WQ_CPU_INTENSIVE,
+						  1, devname);
+	} else {
+		cc->crypt_queue = alloc_workqueue("kcryptd/%s",
+						  common_wq_flags | WQ_CPU_INTENSIVE | WQ_UNBOUND,
 						  num_online_cpus(), devname);
+	}
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;
@@ -3427,6 +3436,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ti->error = "Couldn't spawn write thread";
 		goto bad;
 	}
+	if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags))
+		set_user_nice(cc->write_thread, MIN_NICE);
 
 	ti->num_flush_bios = 1;
 	ti->limit_swap_bios = true;
@@ -3547,6 +3558,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
 
 		num_feature_args += !!ti->num_discard_bios;
 		num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags);
+		num_feature_args += test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags);
 		num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags);
 		num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
 		num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
@@ -3560,6 +3572,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
 				DMEMIT(" allow_discards");
 			if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
 				DMEMIT(" same_cpu_crypt");
+			if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags))
+				DMEMIT(" high_priority");
 			if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags))
 				DMEMIT(" submit_from_crypt_cpus");
 			if (test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags))
@@ -3579,6 +3593,7 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
 		DMEMIT_TARGET_NAME_VERSION(ti->type);
 		DMEMIT(",allow_discards=%c", ti->num_discard_bios ? 'y' : 'n');
 		DMEMIT(",same_cpu_crypt=%c", test_bit(DM_CRYPT_SAME_CPU, &cc->flags) ? 'y' : 'n');
+		DMEMIT(",high_priority=%c", test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags) ? 'y' : 'n');
 		DMEMIT(",submit_from_crypt_cpus=%c", test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags) ?
 		       'y' : 'n');
 		DMEMIT(",no_read_workqueue=%c", test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags) ?
@@ -3706,7 +3721,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 25, 0},
+	.version = {1, 26, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,

From 2285e1496dc68787a626145cfb0904a1b6cc5501 Mon Sep 17 00:00:00 2001
From: yangerkun <yangerkun@huawei.com>
Date: Tue, 11 Jul 2023 14:11:29 +0800
Subject: [PATCH 02/10] dm-crypt: export sysfs of all workqueues

Once there is a heavy IO load, so many encrypt/decrypt work will occupy
all of the cpu, which may lead to the poor performance for other service.
So the improved visibility and controls over dm-crypt workqueues, as
was offered with commit a2b8b2d97567 ("dm crypt: export sysfs of
kcryptd workqueue"), seems necessary. By exporting dm-crypt's
workqueues in sysfs, the entry like cpumask/max_active and so on can
help us to limit the CPU usage for encrypt/decrypt work.

However, commit a2b8b2d97567 did not consider that DM table reload
will call .ctr before .dtr, so the reload for dm-crypt failed because
the same sysfs name was present. This was the original need for commit
48b0777cd93d ("Revert "dm crypt: export sysfs of kcryptd workqueue"").

Reintroduce the use of WQ_SYSFS, and use it for both the IO and crypt
workqueues, but make the workqueue names include a unique id (via ida)
to allow both old and new sysfs entries to coexist.

Signed-off-by: yangerkun <yangerkun@huawei.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-crypt.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index eabc576d8d28..5bfa35760167 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -47,6 +47,8 @@
 
 #define DM_MSG_PREFIX "crypt"
 
+static DEFINE_IDA(workqueue_ida);
+
 /*
  * context holding the current state of a multi-part conversion
  */
@@ -184,6 +186,7 @@ struct crypt_config {
 		struct crypto_aead **tfms_aead;
 	} cipher_tfm;
 	unsigned int tfms_count;
+	int workqueue_id;
 	unsigned long cipher_flags;
 
 	/*
@@ -2771,6 +2774,9 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->crypt_queue)
 		destroy_workqueue(cc->crypt_queue);
 
+	if (cc->workqueue_id)
+		ida_free(&workqueue_ida, cc->workqueue_id);
+
 	crypt_free_tfms(cc);
 
 	bioset_exit(&cc->bs);
@@ -3232,7 +3238,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
 	const char *devname = dm_table_device_name(ti->table);
-	int key_size;
+	int key_size, wq_id;
 	unsigned int align_mask;
 	unsigned int common_wq_flags;
 	unsigned long long tmpll;
@@ -3401,25 +3407,33 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		cc->tag_pool_max_sectors <<= cc->sector_shift;
 	}
 
+	wq_id = ida_alloc_min(&workqueue_ida, 1, GFP_KERNEL);
+	if (wq_id < 0) {
+		ti->error = "Couldn't get workqueue id";
+		ret = wq_id;
+		goto bad;
+	}
+	cc->workqueue_id = wq_id;
+
 	ret = -ENOMEM;
-	common_wq_flags = WQ_MEM_RECLAIM;
+	common_wq_flags = WQ_MEM_RECLAIM | WQ_SYSFS;
 	if (test_bit(DM_CRYPT_HIGH_PRIORITY, &cc->flags))
 		common_wq_flags |= WQ_HIGHPRI;
 
-	cc->io_queue = alloc_workqueue("kcryptd_io/%s", common_wq_flags, 1, devname);
+	cc->io_queue = alloc_workqueue("kcryptd_io-%s-%d", common_wq_flags, 1, devname, wq_id);
 	if (!cc->io_queue) {
 		ti->error = "Couldn't create kcryptd io queue";
 		goto bad;
 	}
 
 	if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) {
-		cc->crypt_queue = alloc_workqueue("kcryptd/%s",
+		cc->crypt_queue = alloc_workqueue("kcryptd-%s-%d",
 						  common_wq_flags | WQ_CPU_INTENSIVE,
-						  1, devname);
+						  1, devname, wq_id);
 	} else {
-		cc->crypt_queue = alloc_workqueue("kcryptd/%s",
+		cc->crypt_queue = alloc_workqueue("kcryptd-%s-%d",
 						  common_wq_flags | WQ_CPU_INTENSIVE | WQ_UNBOUND,
-						  num_online_cpus(), devname);
+						  num_online_cpus(), devname, wq_id);
 	}
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";

From 7560680c8d1e810826d048a61d35668541dce038 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Thu, 11 Apr 2024 15:08:05 -0400
Subject: [PATCH 03/10] dm-crypt: stop constraining max_segment_size to
 PAGE_SIZE

This change effectively reverts commit 586b286b110e ("dm crypt:
constrain crypt device's max_segment_size to PAGE_SIZE") and relies on
block core's late bio-splitting to ensure that dm-crypt's encryption
bios are split accordingly if they exceed the underlying device's
limits (e.g. max_segment_size).

Commit 586b286b110e was applied as a 4.3 fix for the benefit of
stable@ kernels 4.0+ just after block core's late bio-splitting was
introduced in 4.3 with commit 54efd50bfd873 ("block: make
generic_make_request handle arbitrarily sized bios"). Given block
core's late bio-splitting it is past time that dm-crypt make use of
it.

Also, given the recent need to revert meaningful progress that was
attempted during the 6.9 merge window (see commit bff4b74625fe Revert
"dm: use queue_limits_set") this change allows DM core to safely make
use of queue_limits_set() without risk of breaking dm-crypt on NVMe.
Though it should be noted this commit isn't a prereq for reinstating
DM core's use of queue_limits_set() because blk_validate_limits() was
made less strict with commit b561ea56a264 ("block: allow device to
have both virt_boundary_mask and max segment size").

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-crypt.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 5bfa35760167..f43a2c0b3d77 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1656,8 +1656,8 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone);
 
 /*
  * Generate a new unfragmented bio with the given size
- * This should never violate the device limitations (but only because
- * max_segment_size is being constrained to PAGE_SIZE).
+ * This should never violate the device limitations (but if it did then block
+ * core should split the bio as needed).
  *
  * This function may be called concurrently. If we allocate from the mempool
  * concurrently, there is a possibility of deadlock. For example, if we have
@@ -3717,14 +3717,6 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct crypt_config *cc = ti->private;
 
-	/*
-	 * Unfortunate constraint that is required to avoid the potential
-	 * for exceeding underlying device's max_segments limits -- due to
-	 * crypt_alloc_buffer() possibly allocating pages for the encryption
-	 * bio that are not as physically contiguous as the original bio.
-	 */
-	limits->max_segment_size = PAGE_SIZE;
-
 	limits->logical_block_size =
 		max_t(unsigned int, limits->logical_block_size, cc->sector_size);
 	limits->physical_block_size =

From 1c0e720228ad1c63bb487cdcead2558353b5a067 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 28 Feb 2024 14:56:42 -0800
Subject: [PATCH 04/10] dm: use queue_limits_set

Use queue_limits_set which validates the limits and takes care of
updating the readahead settings instead of directly assigning them to
the queue.  For that make sure all limits are actually updated before
the assignment.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-table.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5a..88114719fe18 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1963,26 +1963,27 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	bool wc = false, fua = false;
 	int r;
 
-	/*
-	 * Copy table's limits to the DM device's request_queue
-	 */
-	q->limits = *limits;
-
 	if (dm_table_supports_nowait(t))
 		blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
 	else
 		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
 
 	if (!dm_table_supports_discards(t)) {
-		q->limits.max_discard_sectors = 0;
-		q->limits.max_hw_discard_sectors = 0;
-		q->limits.discard_granularity = 0;
-		q->limits.discard_alignment = 0;
-		q->limits.discard_misaligned = 0;
+		limits->max_hw_discard_sectors = 0;
+		limits->discard_granularity = 0;
+		limits->discard_alignment = 0;
+		limits->discard_misaligned = 0;
 	}
 
+	if (!dm_table_supports_write_zeroes(t))
+		limits->max_write_zeroes_sectors = 0;
+
 	if (!dm_table_supports_secure_erase(t))
-		q->limits.max_secure_erase_sectors = 0;
+		limits->max_secure_erase_sectors = 0;
+
+	r = queue_limits_set(q, limits);
+	if (r)
+		return r;
 
 	if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
 		wc = true;
@@ -2007,9 +2008,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	else
 		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 
-	if (!dm_table_supports_write_zeroes(t))
-		q->limits.max_write_zeroes_sectors = 0;
-
 	dm_table_verify_integrity(t);
 
 	/*
@@ -2047,7 +2045,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	}
 
 	dm_update_crypto_profile(q, t);
-	disk_update_readahead(t->md->disk);
 
 	/*
 	 * Check for request-based device is left to

From 83637d9017b22a5e11ada9f44ba776beb807222b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@kernel.org>
Date: Fri, 12 Apr 2024 10:58:05 -0400
Subject: [PATCH 05/10] dm-crypt: don't set WQ_CPU_INTENSIVE for WQ_UNBOUND
 crypt_queue

Fix crypt_queue's use of WQ_UNBOUND to _not_ use WQ_CPU_INTENSIVE
because it is meaningless with WQ_UNBOUND.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-crypt.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index f43a2c0b3d77..1b7a97cc3779 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3431,8 +3431,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 						  common_wq_flags | WQ_CPU_INTENSIVE,
 						  1, devname, wq_id);
 	} else {
+		/*
+		 * While crypt_queue is certainly CPU intensive, the use of
+		 * WQ_CPU_INTENSIVE is meaningless with WQ_UNBOUND.
+		 */
 		cc->crypt_queue = alloc_workqueue("kcryptd-%s-%d",
-						  common_wq_flags | WQ_CPU_INTENSIVE | WQ_UNBOUND,
+						  common_wq_flags | WQ_UNBOUND,
 						  num_online_cpus(), devname, wq_id);
 	}
 	if (!cc->crypt_queue) {

From 8d24790ed08ab4e619ce58ed4a1b353ab77ffdc5 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Tue, 7 May 2024 17:16:23 -0400
Subject: [PATCH 06/10] dm-delay: fix workqueue delay_timer race

delay_timer could be pending when delay_dtr() is called. It needs to be
shut down before kdelayd_wq is destroyed, so it won't try queueing more
work to kdelayd_wq while that's getting destroyed.

Also the del_timer_sync() call in delay_presuspend() doesn't protect
against the timer getting immediately rearmed by the queued call to
flush_delayed_bios(), but there's no real harm if that does happen.
timer_delete() is less work, and is basically just as likely to stop a
pointless call to flush_delayed_bios().

Fixes: 26b9f228703f ("dm: delay target")
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-delay.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 5eabdb06c649..eec0daa4b227 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -154,8 +154,10 @@ static void delay_dtr(struct dm_target *ti)
 {
 	struct delay_c *dc = ti->private;
 
-	if (dc->kdelayd_wq)
+	if (dc->kdelayd_wq) {
+		timer_shutdown_sync(&dc->delay_timer);
 		destroy_workqueue(dc->kdelayd_wq);
+	}
 
 	if (dc->read.dev)
 		dm_put_device(ti, dc->read.dev);
@@ -335,7 +337,7 @@ static void delay_presuspend(struct dm_target *ti)
 	mutex_unlock(&delayed_bios_lock);
 
 	if (!delay_is_fast(dc))
-		del_timer_sync(&dc->delay_timer);
+		timer_delete(&dc->delay_timer);
 	flush_delayed_bios(dc, true);
 }
 

From d14646f23300a5fc85be867bafdc0702c2002789 Mon Sep 17 00:00:00 2001
From: Joel Colledge <joel.colledge@linbit.com>
Date: Mon, 6 May 2024 09:25:23 +0200
Subject: [PATCH 07/10] dm-delay: fix hung task introduced by kthread mode

If the worker thread is not woken due to a bio, then it is not woken at
all. This causes the hung task check to trigger. This occurs, for
instance, when no bios are submitted. Also when a delay of 0 is
configured, delay_bio() returns without waking the worker.

Prevent the hung task check from triggering by creating the thread with
kthread_run() instead of using kthread_create() directly.

Fixes: 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq")
Signed-off-by: Joel Colledge <joel.colledge@linbit.com>
Reviewed-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-delay.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index eec0daa4b227..4ba12d536994 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -269,8 +269,7 @@ out:
 		 * In case of small requested delays, use kthread instead of
 		 * timers and workqueue to achieve better latency.
 		 */
-		dc->worker = kthread_create(&flush_worker_fn, dc,
-					    "dm-delay-flush-worker");
+		dc->worker = kthread_run(&flush_worker_fn, dc, "dm-delay-flush-worker");
 		if (IS_ERR(dc->worker)) {
 			ret = PTR_ERR(dc->worker);
 			dc->worker = NULL;

From 64eb88d6caee2c8eb806a68dab3f184f14f818a4 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Mon, 6 May 2024 17:55:44 -0400
Subject: [PATCH 08/10] dm-delay: fix max_delay calculations

delay_ctr() pointlessly compared max_delay in cases where multiple delay
classes were initialized identically. Also, when write delays were
configured different than read delays, delay_ctr() never compared their
value against max_delay. Fix these issues.

Fixes: 70bbeb29fab0 ("dm delay: for short delays, use kthread instead of timers and wq")
Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-delay.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 4ba12d536994..2ac43d1f1b92 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -242,19 +242,18 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ret = delay_class_ctr(ti, &dc->flush, argv);
 		if (ret)
 			goto bad;
-		max_delay = max(max_delay, dc->write.delay);
-		max_delay = max(max_delay, dc->flush.delay);
 		goto out;
 	}
 
 	ret = delay_class_ctr(ti, &dc->write, argv + 3);
 	if (ret)
 		goto bad;
+	max_delay = max(max_delay, dc->write.delay);
+
 	if (argc == 6) {
 		ret = delay_class_ctr(ti, &dc->flush, argv + 3);
 		if (ret)
 			goto bad;
-		max_delay = max(max_delay, dc->flush.delay);
 		goto out;
 	}
 

From c542ee149230c4c3fc086feae608230e7aa97fcf Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Tue, 7 May 2024 17:16:24 -0400
Subject: [PATCH 09/10] dm-delay: change locking to avoid contention

The delayed_bios list is protected by one mutex shared by all dm-delay
devices. This mutex must be held whenever a bio is added or expired bios
are removed from the list.  Since a large number of expired bios could
be on the list, flush_delayed_bios() can schedule while holding the
mutex. This means a flush_delayed_bios() call on any dm-delay device can
slow down delay_map() calls on any other dm-delay device.

To keep dm-delay devices from slowing each other down and keep
processing delay bios from slowing adding delayed bios, the global mutex
has been removed, and each dm-delay device now has two locks.
delayed_bios_lock is a spinlock that must be held whenever the
delayed_bios list is accessed. process_bios_lock is a mutex that must be
held whenever a process has temporarily pulled bios off the delayed_bios
list to check which ones should be processed. It must be held until all
the bios that won't be processed are returned to the list. This is what
flush_delayed_bios() now does. The mutex is necessary to guarantee that
delay_presuspend() sees the entire list of delayed bios when it calls
flush_delayed_bios().

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-delay.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 2ac43d1f1b92..da3f8131d52c 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -29,6 +29,8 @@ struct delay_class {
 struct delay_c {
 	struct timer_list delay_timer;
 	struct mutex timer_lock;
+	struct mutex process_bios_lock; /* hold while removing bios to be processed from list */
+	spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */
 	struct workqueue_struct *kdelayd_wq;
 	struct work_struct flush_expired_bios;
 	struct list_head delayed_bios;
@@ -49,8 +51,6 @@ struct dm_delay_info {
 	unsigned long expires;
 };
 
-static DEFINE_MUTEX(delayed_bios_lock);
-
 static void handle_delayed_timer(struct timer_list *t)
 {
 	struct delay_c *dc = from_timer(dc, t, delay_timer);
@@ -89,12 +89,16 @@ static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
 {
 	struct dm_delay_info *delayed, *next;
 	struct bio_list flush_bio_list;
+	LIST_HEAD(local_list);
 	unsigned long next_expires = 0;
 	bool start_timer = false;
 	bio_list_init(&flush_bio_list);
 
-	mutex_lock(&delayed_bios_lock);
-	list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
+	mutex_lock(&dc->process_bios_lock);
+	spin_lock(&dc->delayed_bios_lock);
+	list_replace_init(&dc->delayed_bios, &local_list);
+	spin_unlock(&dc->delayed_bios_lock);
+	list_for_each_entry_safe(delayed, next, &local_list, list) {
 		cond_resched();
 		if (flush_all || time_after_eq(jiffies, delayed->expires)) {
 			struct bio *bio = dm_bio_from_per_bio_data(delayed,
@@ -114,7 +118,10 @@ static void flush_delayed_bios(struct delay_c *dc, bool flush_all)
 			}
 		}
 	}
-	mutex_unlock(&delayed_bios_lock);
+	spin_lock(&dc->delayed_bios_lock);
+	list_splice(&local_list, &dc->delayed_bios);
+	spin_unlock(&dc->delayed_bios_lock);
+	mutex_unlock(&dc->process_bios_lock);
 
 	if (start_timer)
 		queue_timeout(dc, next_expires);
@@ -128,13 +135,13 @@ static int flush_worker_fn(void *data)
 
 	while (!kthread_should_stop()) {
 		flush_delayed_bios(dc, false);
-		mutex_lock(&delayed_bios_lock);
+		spin_lock(&dc->delayed_bios_lock);
 		if (unlikely(list_empty(&dc->delayed_bios))) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			mutex_unlock(&delayed_bios_lock);
+			spin_unlock(&dc->delayed_bios_lock);
 			schedule();
 		} else {
-			mutex_unlock(&delayed_bios_lock);
+			spin_unlock(&dc->delayed_bios_lock);
 			cond_resched();
 		}
 	}
@@ -168,6 +175,7 @@ static void delay_dtr(struct dm_target *ti)
 	if (dc->worker)
 		kthread_stop(dc->worker);
 
+	mutex_destroy(&dc->process_bios_lock);
 	mutex_destroy(&dc->timer_lock);
 
 	kfree(dc);
@@ -227,6 +235,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->private = dc;
 	INIT_LIST_HEAD(&dc->delayed_bios);
 	mutex_init(&dc->timer_lock);
+	mutex_init(&dc->process_bios_lock);
+	spin_lock_init(&dc->delayed_bios_lock);
 	dc->may_delay = true;
 	dc->argc = argc;
 
@@ -309,14 +319,14 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
 	delayed->context = dc;
 	delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);
 
-	mutex_lock(&delayed_bios_lock);
+	spin_lock(&dc->delayed_bios_lock);
 	if (unlikely(!dc->may_delay)) {
-		mutex_unlock(&delayed_bios_lock);
+		spin_unlock(&dc->delayed_bios_lock);
 		return DM_MAPIO_REMAPPED;
 	}
 	c->ops++;
 	list_add_tail(&delayed->list, &dc->delayed_bios);
-	mutex_unlock(&delayed_bios_lock);
+	spin_unlock(&dc->delayed_bios_lock);
 
 	if (delay_is_fast(dc))
 		wake_up_process(dc->worker);
@@ -330,9 +340,9 @@ static void delay_presuspend(struct dm_target *ti)
 {
 	struct delay_c *dc = ti->private;
 
-	mutex_lock(&delayed_bios_lock);
+	spin_lock(&dc->delayed_bios_lock);
 	dc->may_delay = false;
-	mutex_unlock(&delayed_bios_lock);
+	spin_unlock(&dc->delayed_bios_lock);
 
 	if (!delay_is_fast(dc))
 		timer_delete(&dc->delay_timer);

From 8b21ac87d550acc4f6207764fed0cf6f0e3966cd Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Tue, 7 May 2024 17:16:25 -0400
Subject: [PATCH 10/10] dm-delay: remove timer_lock

Instead of manually checking the timer details in queue_timeout(), call
timer_reduce() to start the timer or reduce the expiration time. This
avoids needing a lock.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
---
 drivers/md/dm-delay.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index da3f8131d52c..08f6387620c1 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -28,7 +28,6 @@ struct delay_class {
 
 struct delay_c {
 	struct timer_list delay_timer;
-	struct mutex timer_lock;
 	struct mutex process_bios_lock; /* hold while removing bios to be processed from list */
 	spinlock_t delayed_bios_lock; /* hold on all accesses to delayed_bios list */
 	struct workqueue_struct *kdelayd_wq;
@@ -60,12 +59,7 @@ static void handle_delayed_timer(struct timer_list *t)
 
 static void queue_timeout(struct delay_c *dc, unsigned long expires)
 {
-	mutex_lock(&dc->timer_lock);
-
-	if (!timer_pending(&dc->delay_timer) || expires < dc->delay_timer.expires)
-		mod_timer(&dc->delay_timer, expires);
-
-	mutex_unlock(&dc->timer_lock);
+	timer_reduce(&dc->delay_timer, expires);
 }
 
 static inline bool delay_is_fast(struct delay_c *dc)
@@ -176,7 +170,6 @@ static void delay_dtr(struct dm_target *ti)
 		kthread_stop(dc->worker);
 
 	mutex_destroy(&dc->process_bios_lock);
-	mutex_destroy(&dc->timer_lock);
 
 	kfree(dc);
 }
@@ -234,7 +227,6 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->private = dc;
 	INIT_LIST_HEAD(&dc->delayed_bios);
-	mutex_init(&dc->timer_lock);
 	mutex_init(&dc->process_bios_lock);
 	spin_lock_init(&dc->delayed_bios_lock);
 	dc->may_delay = true;