From 0c4fe008c9cb2215b3f838769886857ae986014b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:20 +0000 Subject: [PATCH 01/51] io_uring: rearrange defer list checks There should be nothing in the ->work_llist for non DEFER_TASKRUN rings, so we can skip flag checks and test the list emptiness directly. Also move it out of io_run_local_work() for inlining. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/331d63fd15ca79b35b95c82a82d9246110686392.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 --- io_uring/io_uring.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index db623b3185c8..da9b0397f9a1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1343,9 +1343,6 @@ int io_run_local_work(struct io_ring_ctx *ctx) bool locked; int ret; - if (llist_empty(&ctx->work_llist)) - return 0; - __set_current_state(TASK_RUNNING); locked = mutex_trylock(&ctx->uring_lock); ret = __io_run_local_work(ctx, &locked); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ab4b2a1c3b7e..7607a4992a94 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -289,7 +289,7 @@ static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) int ret = 0; int ret2; - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + if (!llist_empty(&ctx->work_llist)) ret = io_run_local_work(ctx); /* want to run this after in case more is added */ From f36ba6cf1ab6b05a538aae9cca896917db14ba27 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:21 +0000 Subject: [PATCH 02/51] io_uring: don't iterate cq wait fast path Task work runners keep running until all queued tw items are exhausted. It's also rare for defer tw to queue normal tw and vice versa. Taking that into account, there is only a slim chance that further iterating the io_cqring_wait() fast path will get us anything, so we can remove the loop there. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1f9565726661266abaa5d921e97433c831759ecf.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index da9b0397f9a1..dfc94654e082 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2510,18 +2510,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (!io_allowed_run_tw(ctx)) return -EEXIST; - - do { - /* always run at least 1 task work to process local work */ - ret = io_run_task_work_ctx(ctx); + if (!llist_empty(&ctx->work_llist)) { + ret = io_run_local_work(ctx); if (ret < 0) return ret; - io_cqring_overflow_flush(ctx); - - /* if user messes with these they will just get an early return */ - if (__io_cqring_events_user(ctx) >= min_events) - return 0; - } while (ret > 0); + } + io_run_task_work(); + io_cqring_overflow_flush(ctx); + /* if user messes with these they will just get an early return */ + if (__io_cqring_events_user(ctx) >= min_events) + return 0; if (sig) { #ifdef CONFIG_COMPAT From 1414d62985848d095af5a400a4ca074a4888b77f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:22 +0000 Subject: [PATCH 03/51] io_uring: kill io_run_task_work_ctx There is only one user of io_run_task_work_ctx(), inline it.
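The pattern the first three patches converge on, gating the expensive work-run call on a cheap emptiness test at the caller, can be sketched in userspace C (names like work_llist and run_work are illustrative stand-ins, not the kernel's implementation):

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    struct work_node {
        struct work_node *next;
        int payload;
    };

    static _Atomic(struct work_node *) work_llist;

    /* lock-free push, analogous to llist_add() */
    static void work_add(struct work_node *node)
    {
        struct work_node *head = atomic_load(&work_llist);
        do {
            node->next = head;
        } while (!atomic_compare_exchange_weak(&work_llist, &head, node));
    }

    /* the cheap fast-path test, analogous to llist_empty() */
    static int work_empty(void)
    {
        return atomic_load(&work_llist) == NULL;
    }

    static int run_work(void)
    {
        /* grab the whole list at once, analogous to llist_del_all() */
        struct work_node *node = atomic_exchange(&work_llist, NULL);
        int ret = 0;

        while (node) {
            struct work_node *next = node->next;
            printf("processing %d\n", node->payload);
            ret++;
            node = next;
        }
        return ret;
    }

    int main(void)
    {
        struct work_node a = { .payload = 1 }, b = { .payload = 2 };

        work_add(&a);
        work_add(&b);
        /* caller-side fast path: skip the call entirely when empty */
        if (!work_empty())
            run_work();
        return 0;
    }

Since only DEFER_TASKRUN rings ever populate the list, testing emptiness directly subsumes the flag check and keeps the common no-work path to a single load.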
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/40953c65f7c88fb00cdc4d870ca5d5319fb3d7ea.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- io_uring/io_uring.h | 20 -------------------- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index dfc94654e082..5326e2d94055 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2455,7 +2455,11 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (io_run_task_work_ctx(ctx) > 0) + if (!llist_empty(&ctx->work_llist)) { + if (io_run_local_work(ctx) > 0) + return 1; + } + if (io_run_task_work() > 0) return 1; if (task_sigpending(current)) return -EINTR; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 7607a4992a94..9d36271eb6d3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -284,26 +284,6 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } -static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) -{ - int ret = 0; - int ret2; - - if (!llist_empty(&ctx->work_llist)) - ret = io_run_local_work(ctx); - - /* want to run this after in case more is added */ - ret2 = io_run_task_work(); - - /* Try propagate error in favour of if tasks were run, * but still make sure to run them if requested */ - if (ret >= 0) - ret += ret2; - - return ret; -} - static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { bool locked; From 140102ae9a9f2f83f0592b98b3c5c6119d9a9b32 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:23 +0000 Subject: [PATCH 04/51] io_uring: move defer tw task checks Most places that want to run local tw explicitly check in advance whether they are allowed to do so. Don't rely on a similar check in __io_run_local_work(); leave it as a just-in-case warning and make sure callers check capabilities themselves.
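The division of labour this describes, a cheap capability check at every caller with only a just-in-case guard left in the callee, looks roughly like this in userspace C (allowed_to_run() and run_local_work() are illustrative stand-ins):

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_t submitter;

    /* analogous to io_allowed_defer_tw_run(): only the submitter may run tw */
    static int allowed_to_run(void)
    {
        return pthread_equal(pthread_self(), submitter);
    }

    static int run_local_work(void)
    {
        /* just-in-case guard, analogous to WARN_ON_ONCE(...) */
        assert(allowed_to_run());
        puts("running deferred work");
        return 1;
    }

    int main(void)
    {
        submitter = pthread_self();
        /* callers are expected to check capabilities themselves */
        if (allowed_to_run())
            run_local_work();
        return 0;
    }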
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/990fe0e8e70fd4d57e43625e5ce8fba584821d1a.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 ++++++--------- io_uring/io_uring.h | 5 +++++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5326e2d94055..97b749203ba8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1301,14 +1301,13 @@ int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) struct llist_node *node; struct llist_node fake; struct llist_node *current_final = NULL; - int ret; + int ret = 0; unsigned int loops = 1; - if (unlikely(ctx->submitter_task != current)) + if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; node = io_llist_xchg(&ctx->work_llist, &fake); - ret = 0; again: while (node != current_final) { struct llist_node *next = node->next; @@ -2514,11 +2513,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (!io_allowed_run_tw(ctx)) return -EEXIST; - if (!llist_empty(&ctx->work_llist)) { - ret = io_run_local_work(ctx); - if (ret < 0) - return ret; - } + if (!llist_empty(&ctx->work_llist)) + io_run_local_work(ctx); io_run_task_work(); io_cqring_overflow_flush(ctx); /* if user messes with these they will just get an early return */ @@ -3055,7 +3051,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && + io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx) > 0; ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 9d36271eb6d3..011932935c36 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -367,6 +367,11 @@ static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) return container_of(node, struct io_kiocb, comp_list); } +static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) +{ + return likely(ctx->submitter_task == current); +} + static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) { return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || From 3fcf19d592d5cb63eb209400b22055651e3c27d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:24 +0000 Subject: [PATCH 05/51] io_uring: parse check_cq out of wq waiting We already avoid flushing overflows in io_cqring_wait_schedule() but only return an error for the outer loop to handle it. Minimise it even further by moving all ->check_cq parsing there. 
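The ->check_cq parsing that moves into the outer loop boils down to testing independent condition bits on one unsigned long. A self-contained illustration (the IO_CHECK_CQ_* names mirror the kernel's bits; the rest is a sketch):

    #include <stdio.h>

    #define BIT(nr) (1UL << (nr))

    enum {
        IO_CHECK_CQ_OVERFLOW_BIT,
        IO_CHECK_CQ_DROPPED_BIT,
    };

    static int parse_check_cq(unsigned long check_cq)
    {
        if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
            printf("flush overflows, then retry\n");
        if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
            printf("CQEs were dropped: bail out with -EBADR\n");
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        parse_check_cq(BIT(IO_CHECK_CQ_OVERFLOW_BIT));
        parse_check_cq(BIT(IO_CHECK_CQ_DROPPED_BIT));
        return 0;
    }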
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9dfcec3121013f98208dbf79368d636d74e1231a.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 97b749203ba8..524ef5a2bb9c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2471,21 +2471,13 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, ktime_t *timeout) { int ret; - unsigned long check_cq; + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; /* make sure we run task_work before checking for signals */ ret = io_run_task_work_sig(ctx); if (ret || io_should_wake(iowq)) return ret; - - check_cq = READ_ONCE(ctx->check_cq); - if (unlikely(check_cq)) { - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - return 1; - if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) - return -EBADR; - } if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) return -ETIME; @@ -2551,13 +2543,25 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { - finish_wait(&ctx->cq_wait, &iowq.wq); - io_cqring_do_overflow_flush(ctx); - } + unsigned long check_cq; + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); + + check_cq = READ_ONCE(ctx->check_cq); + if (unlikely(check_cq)) { + /* let the caller flush overflows, retry */ + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) { + finish_wait(&ctx->cq_wait, &iowq.wq); + io_cqring_do_overflow_flush(ctx); + } + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { + ret = -EBADR; + break; + } + } + if (__io_cqring_events_user(ctx) >= min_events) break; cond_resched(); From 846072f16eed3b3fb4e59b677f3ed8afb8509b89 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:25 +0000 Subject: [PATCH 06/51] io_uring: minimise io_cqring_wait_schedule io_cqring_wait_schedule() is called after we have started waiting on the cq wq and set the state to TASK_INTERRUPTIBLE; for that reason we have to constantly worry whether we have returned the state back to running or not. Leave only quick checks in io_cqring_wait_schedule() and move the rest, including running task work, to the callers. Note, we run tw in the loop after the sched checks because of the fast path in the beginning of the function.
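The discipline this enforces, only quick checks between declaring the intention to sleep and actually sleeping, with heavier work done after waking, is the same one a classic condition-variable loop follows. A rough userspace analogue, purely illustrative:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int events;

    static void *producer(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        events++;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, producer, NULL);
        pthread_mutex_lock(&lock);
        /* quick check only; nothing heavy between the check and the sleep */
        while (!events)
            pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        /* the heavier follow-up (task_work in the kernel) runs after waking */
        printf("woken, events=%d\n", events);
        pthread_join(t, NULL);
        return 0;
    }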
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2814fabe75e2e019e7ca43ea07daa94564349805.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 524ef5a2bb9c..2cbf33996c3b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2470,24 +2470,19 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, ktime_t *timeout) { - int ret; - if (unlikely(READ_ONCE(ctx->check_cq))) return 1; - /* make sure we run task_work before checking for signals */ - ret = io_run_task_work_sig(ctx); - if (ret || io_should_wake(iowq)) - return ret; + if (unlikely(!llist_empty(&ctx->work_llist))) + return 1; + if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) return -ETIME; - - /* - * Run task_work after scheduling. If we got woken because of - * task_work being processed, run it now rather than let the caller - * do another wait loop. - */ - ret = io_run_task_work_sig(ctx); - return ret < 0 ? ret : 1; + return 0; } /* @@ -2548,6 +2543,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); + if (ret < 0) + break; + /* + * Run task_work after scheduling and before io_should_wake(). + * If we got woken because of task_work being processed, run it + * now rather than let the caller do another wait loop. + */ + io_run_task_work(); + if (!llist_empty(&ctx->work_llist)) + io_run_local_work(ctx); check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { @@ -2562,10 +2567,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, } } - if (__io_cqring_events_user(ctx) >= min_events) + if (io_should_wake(&iowq)) { + ret = 0; break; + } cond_resched(); - } while (ret > 0); + } while (1); finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); From 490c00eb4fa5e5e25e0127240f6d6c1b499da95b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:26 +0000 Subject: [PATCH 07/51] io_uring: simplify io_has_work ->work_llist should never be non-empty for a non DEFER_TASKRUN ring, so we can safely skip checking the flag. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/26af9f73c09a56c9a035f94db56127358688f3aa.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2cbf33996c3b..4cb9cce23c90 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2419,8 +2419,7 @@ struct io_wait_queue { static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && - !llist_empty(&ctx->work_llist)); + !llist_empty(&ctx->work_llist); } static inline bool io_should_wake(struct io_wait_queue *iowq) From 326a9e482e2134d7a44b7f8f9a721b38c6bbb146 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:27 +0000 Subject: [PATCH 08/51] io_uring: set TASK_RUNNING right after schedule Instead of constantly watching that the state of the task is running before executing tw or taking locks in io_cqring_wait(), switch it back to TASK_RUNNING immediately. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/246dddee247d89fd52023f785ed17cc34962a008.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4cb9cce23c90..2ec011f0ba7d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2544,6 +2544,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); if (ret < 0) break; + __set_current_state(TASK_RUNNING); /* * Run task_work after scheduling and before io_should_wake(). * If we got woken because of task_work being processed, run it @@ -2556,10 +2557,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) { - finish_wait(&ctx->cq_wait, &iowq.wq); + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) io_cqring_do_overflow_flush(ctx); - } if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { ret = -EBADR; break; From 46ae7eef44f6dfd825a3bcfa43392d3ad9836ada Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:28 +0000 Subject: [PATCH 09/51] io_uring: optimise non-timeout waiting Unlike the jiffy scheduling version, schedule_hrtimeout() jumps a few functions before getting into schedule() even if there is no actual timeout needed. Some tests showed that it takes up to 1% of CPU cycles. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/89f880574eceee6f4899783377ead234df7b3d04.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2ec011f0ba7d..6229a49c0c33 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2479,7 +2479,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, return -EINTR; if (unlikely(io_should_wake(iowq))) return 0; - if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) + if (*timeout == KTIME_MAX) + schedule(); + else if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) return -ETIME; return 0; } From d33a39e577687e12d4468e9dd999375b9973d700 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 5 Jan 2023 11:22:29 +0000 Subject: [PATCH 10/51] io_uring: keep timeout in io_wait_queue Move waiting timeout into io_wait_queue Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e4b48a9e26a3b1cf97c80121e62d4b5ab873d28d.1672916894.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6229a49c0c33..fdea6fbc3fad 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2414,6 +2414,7 @@ struct io_wait_queue { struct io_ring_ctx *ctx; unsigned cq_tail; unsigned nr_timeouts; + ktime_t timeout; }; static inline bool io_has_work(struct io_ring_ctx *ctx) @@ -2466,8 +2467,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t *timeout) + struct io_wait_queue *iowq) { if (unlikely(READ_ONCE(ctx->check_cq))) return 1; @@ -2479,9 +2479,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, return -EINTR; if (unlikely(io_should_wake(iowq))) return 0; - if (*timeout == KTIME_MAX) + if (iowq->timeout == KTIME_MAX) schedule(); - else if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) + else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) return -ETIME; return 0; } @@ -2496,7 +2496,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; - ktime_t timeout = KTIME_MAX; int ret; if (!io_allowed_run_tw(ctx)) @@ -2522,20 +2521,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - } - init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.timeout = KTIME_MAX; + + if (uts) { + struct timespec64 ts; + + if (get_timespec64(&ts, uts)) + return -EFAULT; + iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + } trace_io_uring_cqring_wait(ctx, min_events); do { @@ -2543,7 +2543,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); + ret = io_cqring_wait_schedule(ctx, &iowq); if (ret < 0) break; 
__set_current_state(TASK_RUNNING); From cbeb47a7b5f003429ded32b1fb3a7108ce5c1b54 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 3 Jan 2023 08:05:07 -0800 Subject: [PATCH 11/51] io_uring/msg_ring: Pass custom flags to the cqe This patch adds a new flag (IORING_MSG_RING_FLAGS_PASS) in the message ring operations (IORING_OP_MSG_RING). This new flag enables the sender to specify custom flags, which will be copied over to cqe->flags in the receiving ring. These custom flags should be specified using the sqe->file_index field. This mechanism provides additional flexibility when sending messages between rings. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20230103160507.617416-1-leitao@debian.org Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 ++ io_uring/msg_ring.c | 24 +++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2780bce62faf..636a4c2c1294 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -347,6 +347,8 @@ enum { * applicable for IORING_MSG_DATA, obviously. */ #define IORING_MSG_RING_CQE_SKIP (1U << 0) +/* Pass through the flags from sqe->file_index to cqe->flags */ +#define IORING_MSG_RING_FLAGS_PASS (1U << 1) /* * IO completion data structure (Completion Queue Entry) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 15602a136821..27992bda1ffa 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -13,6 +13,11 @@ #include "filetable.h" #include "msg_ring.h" + +/* All valid masks for MSG_RING */ +#define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ + IORING_MSG_RING_FLAGS_PASS) + struct io_msg { struct file *file; struct file *src_file; @@ -21,7 +26,10 @@ struct io_msg { u32 len; u32 cmd; u32 src_fd; - u32 dst_fd; + union { + u32 dst_fd; + u32 cqe_flags; + }; u32 flags; }; @@ -114,9 +122,12 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + u32 flags = 0; int ret; - if (msg->src_fd || msg->dst_fd || msg->flags) + if (msg->src_fd || msg->flags & ~IORING_MSG_RING_FLAGS_PASS) + return -EINVAL; + if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; @@ -124,15 +135,18 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) if (io_msg_need_remote(target_ctx)) return io_msg_exec_remote(req, io_msg_tw_complete); + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; io_double_unlock_ctx(target_ctx); } else { - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; } return ret; @@ -241,7 +255,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) msg->src_fd = READ_ONCE(sqe->addr3); msg->dst_fd = READ_ONCE(sqe->file_index); msg->flags = READ_ONCE(sqe->msg_ring_flags); - if (msg->flags & ~IORING_MSG_RING_CQE_SKIP) + if (msg->flags & ~IORING_MSG_RING_MASK) return -EINVAL; return 0; From 81594e7e7a146888f0bac4fa782b0b5d3c37fdff Mon Sep 17 
00:00:00 2001 From: Dmitrii Bundin Date: Mon, 9 Jan 2023 21:58:54 +0300 Subject: [PATCH 12/51] io_uring: remove excessive unlikely on IS_ERR The IS_ERR function uses the IS_ERR_VALUE macro under the hood, which already wraps the condition into unlikely(). Signed-off-by: Dmitrii Bundin Link: https://lore.kernel.org/r/20230109185854.25698-1-dmitrii.bundin.a@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 9c3ddd46a1ad..3a3aef7e7c33 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -410,7 +410,7 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req, unsigned int issue_flags) { *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (unlikely(IS_ERR(*iovec))) + if (IS_ERR(*iovec)) return PTR_ERR(*iovec); iov_iter_save_state(&s->iter, &s->iter_state); From dde40322ae20f2d6b0bcb781a9eedcfc7ca3aa73 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:03 +0000 Subject: [PATCH 13/51] io_uring: move submitter_task out of cold cacheline ->submitter_task is used somewhat more frequently now than before, i.e. for local tw enqueue and run; let's move it from the end of ctx, which is full of cold data, to the first cacheline with mostly constants. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/415ca91dc5ad1dec612b892e489cda98e1069542.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 128a67a40065..8dfb6c4a35d9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -195,11 +195,7 @@ struct io_alloc_cache { struct io_ring_ctx { /* const or read-mostly hot data */ struct { - struct percpu_ref refs; - - struct io_rings *rings; unsigned int flags; - enum task_work_notify_mode notify_method; unsigned int compat: 1; unsigned int drain_next: 1; unsigned int restricted: 1; @@ -210,6 +206,11 @@ struct io_ring_ctx { unsigned int syscall_iopoll: 1; /* all CQEs should be posted only by the submitter task */ unsigned int task_complete: 1; + + enum task_work_notify_mode notify_method; + struct io_rings *rings; + struct task_struct *submitter_task; + struct percpu_ref refs; } ____cacheline_aligned_in_smp; /* submission data */ @@ -320,7 +321,6 @@ struct io_ring_ctx { /* Keep this last, we don't need it for the fast path */ struct io_restriction restrictions; - struct task_struct *submitter_task; /* slow path rsrc auxilary data, used by update/register */ struct io_rsrc_node *rsrc_backup_node; From bd550173acc2dc782d9c57852f6b6e71f5d9a159 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:04 +0000 Subject: [PATCH 14/51] io_uring: refactor io_wake_function Remove the local variable ctx in io_wake_function(); we don't need it when io_should_wake() alone triggers the wake up.
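The refactor leans on container_of(), which recovers the enclosing structure from a pointer to an embedded member. A freestanding definition and demo (the kernel provides its own container_of; the struct names here are illustrative):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct wait_entry { int dummy; };

    struct waiter {
        struct wait_entry wq;   /* embedded, like io_wait_queue's wq */
        int cq_tail;
    };

    int main(void)
    {
        struct waiter iowq = { .cq_tail = 42 };
        struct wait_entry *curr = &iowq.wq;
        /* recover the outer structure from the embedded entry */
        struct waiter *w = container_of(curr, struct waiter, wq);

        printf("cq_tail=%d\n", w->cq_tail);
        return 0;
    }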
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e60eb1008aebe286aab7d34c772ed01c447bddb1.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fdea6fbc3fad..5e1a0845e4b6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2439,15 +2439,13 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, - wq); - struct io_ring_ctx *ctx = iowq->ctx; + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); /* * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ - if (io_should_wake(iowq) || io_has_work(ctx)) + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } From 2f413956cc8a72fc11c2779228112559bbca8279 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:05 +0000 Subject: [PATCH 15/51] io_uring: don't set TASK_RUNNING in local tw runner The CQ waiting loop sets TASK_RUNNING before trying to execute task_work, no need to repeat it in io_run_local_work(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9d9422c429ef3f9457b4f4b8288bf4789564f33b.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5e1a0845e4b6..e61e2a0f23b1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1339,11 +1339,9 @@ again: int io_run_local_work(struct io_ring_ctx *ctx) { - bool locked; + bool locked = mutex_trylock(&ctx->uring_lock); int ret; - __set_current_state(TASK_RUNNING); - locked = mutex_trylock(&ctx->uring_lock); ret = __io_run_local_work(ctx, &locked); if (locked) mutex_unlock(&ctx->uring_lock); @@ -2453,6 +2451,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int io_run_task_work_sig(struct io_ring_ctx *ctx) { if (!llist_empty(&ctx->work_llist)) { + __set_current_state(TASK_RUNNING); if (io_run_local_work(ctx) > 0) return 1; } From 3e5655552a8299492d54117a15b6ddf5a2e7512c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:06 +0000 Subject: [PATCH 16/51] io_uring: mark io_run_local_work static io_run_local_work is enclosed in io_uring.c, we don't need to export it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b477fb81f5e77044f724a06fe245d5c078659364.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e61e2a0f23b1..79d94381d967 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1337,7 +1337,7 @@ again: } -int io_run_local_work(struct io_ring_ctx *ctx) +static int io_run_local_work(struct io_ring_ctx *ctx) { bool locked = mutex_trylock(&ctx->uring_lock); int ret; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 011932935c36..78896d1f7916 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -29,7 +29,6 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); -int io_run_local_work(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); From 360173ab9e1a8a50bc9092ae8c741f0a05d499b7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:07 +0000 Subject: [PATCH 17/51] io_uring: move io_run_local_work_locked io_run_local_work_locked() is only used in io_uring.c, move it there. With that we can also make __io_run_local_work() static. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/91757bcb33e5774e49fed6f2b6e058630608119b.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 +++++++++++++++++- io_uring/io_uring.h | 17 ----------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 79d94381d967..1e79f37b071b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1296,7 +1296,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) } } -int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) +static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) { struct llist_node *node; struct llist_node fake; @@ -1337,6 +1337,22 @@ again: } +static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) +{ + bool locked; + int ret; + + if (llist_empty(&ctx->work_llist)) + return 0; + + locked = true; + ret = __io_run_local_work(ctx, &locked); + /* shouldn't happen! 
*/ + if (WARN_ON_ONCE(!locked)) + mutex_lock(&ctx->uring_lock); + return ret; +} + static int io_run_local_work(struct io_ring_ctx *ctx) { bool locked = mutex_trylock(&ctx->uring_lock); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 78896d1f7916..b5975e353aa1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -28,7 +28,6 @@ enum { struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); -int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); @@ -283,22 +282,6 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } -static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) -{ - bool locked; - int ret; - - if (llist_empty(&ctx->work_llist)) - return 0; - - locked = true; - ret = __io_run_local_work(ctx, &locked); - /* shouldn't happen! */ - if (WARN_ON_ONCE(!locked)) - mutex_lock(&ctx->uring_lock); - return ret; -} - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { From 7b235dd82ad32c1626e51303d94ec5ef4d7bc994 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:08 +0000 Subject: [PATCH 18/51] io_uring: separate wq for ring polling Don't use ->cq_wait for ring polling but add a separate wait queue for it. We need it for following patches. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dea0be0bf990503443c5c6c337fc66824af7d590.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 3 ++- io_uring/io_uring.h | 9 +++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 8dfb6c4a35d9..0d94ee191c15 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -319,7 +319,7 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* Keep this last, we don't need it for the fast path */ - + struct wait_queue_head poll_wq; struct io_restriction restrictions; /* slow path rsrc auxilary data, used by update/register */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1e79f37b071b..2ee2bcaeadfd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -316,6 +316,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); + init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); @@ -2786,7 +2787,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->cq_wait, wait); + poll_wait(file, &ctx->poll_wq, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b5975e353aa1..c75bbb94703c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -220,9 +220,18 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, 
ctx->cached_cq_tail); } +static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) +{ + if (waitqueue_active(&ctx->poll_wq)) + __wake_up(&ctx->poll_wq, TASK_NORMAL, 0, + poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); +} + /* requires smb_mb() prior, see wq_has_sleeper() */ static inline void __io_cqring_wake(struct io_ring_ctx *ctx) { + io_poll_wq_wake(ctx); + /* * Trigger waitqueue handler on all waiters on our waitqueue. This * won't necessarily wake up all the tasks, io_should_wake() will make From bca39f39058567643487cd654970717705784ba3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:09 +0000 Subject: [PATCH 19/51] io_uring: add lazy poll_wq activation Even though io_poll_wq_wake()'s waitqueue_active reuses a barrier we do for another waitqueue, it's not going to be the case in the future and so we want to have a fast path for it when the ring has never been polled. Move poll_wq wake ups into __io_commit_cqring_flush() using a new flag called ->poll_activated. The idea behind the flag is to set it when the ring was polled for the first time. This requires additional sync to not miss events, which is done here by using task_work for ->task_complete rings, and by default enabling the flag for all other types of rings. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/060785e8e9137a920b232c0c7f575b131af19cac.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ io_uring/io_uring.c | 60 +++++++++++++++++++++++++++++++++- io_uring/io_uring.h | 7 ++-- 3 files changed, 64 insertions(+), 5 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0d94ee191c15..7b5e90520278 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -206,6 +206,7 @@ struct io_ring_ctx { unsigned int syscall_iopoll: 1; /* all CQEs should be posted only by the submitter task */ unsigned int task_complete: 1; + unsigned int poll_activated: 1; enum task_work_notify_mode notify_method; struct io_rings *rings; @@ -357,6 +358,7 @@ struct io_ring_ctx { u32 iowq_limits[2]; bool iowq_limits_set; + struct callback_head poll_wq_task_work; struct list_head defer_list; unsigned sq_thread_idle; /* protected by ->completion_lock */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2ee2bcaeadfd..de71730d9051 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -573,6 +573,8 @@ static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { + if (ctx->poll_activated) + io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); if (ctx->drain_active) { @@ -2782,11 +2784,53 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) kfree(ctx); } +static __cold void io_activate_pollwq_cb(struct callback_head *cb) +{ + struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, + poll_wq_task_work); + + mutex_lock(&ctx->uring_lock); + ctx->poll_activated = true; + mutex_unlock(&ctx->uring_lock); + + /* + * Wake ups for some events between start of polling and activation + * might've been lost due to loose synchronisation. 
+ */ + wake_up_all(&ctx->poll_wq); + percpu_ref_put(&ctx->refs); +} + +static __cold void io_activate_pollwq(struct io_ring_ctx *ctx) +{ + spin_lock(&ctx->completion_lock); + /* already activated or in progress */ + if (ctx->poll_activated || ctx->poll_wq_task_work.func) + goto out; + if (WARN_ON_ONCE(!ctx->task_complete)) + goto out; + if (!ctx->submitter_task) + goto out; + /* + * with ->submitter_task only the submitter task completes requests, we + * only need to sync with it, which is done by injecting a tw + */ + init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb); + percpu_ref_get(&ctx->refs); + if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL)) + percpu_ref_put(&ctx->refs); +out: + spin_unlock(&ctx->completion_lock); +} + static __poll_t io_uring_poll(struct file *file, poll_table *wait) { struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; + if (unlikely(!ctx->poll_activated)) + io_activate_pollwq(ctx); + poll_wait(file, &ctx->poll_wq, wait); /* * synchronizes with barrier from wq_has_sleeper call in @@ -3593,6 +3637,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->task_complete = true; + /* + * lazy poll_wq activation relies on ->task_complete for synchronisation + * purposes, see io_activate_pollwq() + */ + if (!ctx->task_complete) + ctx->poll_activated = true; + /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user * space applications don't need to do io completion events @@ -3886,8 +3937,15 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + /* + * Lazy activation attempts would fail if it was polled before + * submitter_task is set. + */ + if (wq_has_sleeper(&ctx->poll_wq)) + io_activate_pollwq(ctx); + } if (ctx->restrictions.registered) ctx->restricted = 1; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index c75bbb94703c..5113e0ddb01d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -222,7 +222,7 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) { - if (waitqueue_active(&ctx->poll_wq)) + if (wq_has_sleeper(&ctx->poll_wq)) __wake_up(&ctx->poll_wq, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } @@ -230,8 +230,6 @@ static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) /* requires smb_mb() prior, see wq_has_sleeper() */ static inline void __io_cqring_wake(struct io_ring_ctx *ctx) { - io_poll_wq_wake(ctx); - /* * Trigger waitqueue handler on all waiters on our waitqueue. 
This * won't necessarily wake up all the tasks, io_should_wake() will make @@ -316,7 +314,8 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } From 3181e22fb79910c7071e84a43af93ac89e8a7106 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:10 +0000 Subject: [PATCH 20/51] io_uring: wake up optimisations Flushing completions is done either from the submit syscall or by task_work; both run in the context of the submitter task, and for single-threaded rings, as implied by ->task_complete, there won't be any waiters on ->cq_wait but the master task. That means that there can be no tasks sleeping on cq_wait while we run __io_submit_flush_completions(), and so waking up can be skipped. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/60ad9768ec74435a0ddaa6eec0ffa7729474f69f.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index de71730d9051..f2e1dd076d98 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -621,6 +621,25 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } +static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_commit_cqring(ctx); + __io_cq_unlock(ctx); + io_commit_cqring_flush(ctx); + + /* + * As ->task_complete implies that the ring is single tasked, cq_wait + * may only be waited on by the current in io_cqring_wait(), but since + * it will re-check the wakeup conditions once we return we can safely + * skip waking it up. + */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { + smp_mb(); + __io_cqring_wake(ctx); + } +} + void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { @@ -1480,7 +1499,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } } - __io_cq_unlock_post(ctx); + __io_cq_unlock_post_flush(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); From 130bd686d9be918e4cc8c03abf5794ba2d860502 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:11 +0000 Subject: [PATCH 21/51] io_uring: waitqueue-less cq waiting With DEFER_TASKRUN only ctx->submitter_task might be waiting for CQEs; we can use this to optimise io_cqring_wait(). Replace the ->cq_wait waitqueue with waking the task directly. It works, but it misses an important optimisation covered by the following patch, so this patch without its follow-ups might hurt performance.
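With exactly one possible waiter, a directed wake-up can replace a shared wait queue. A userspace analogue of that idea (illustrative only; the kernel side uses wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE)):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    static sem_t waiter_sem;    /* stands in for the submitter task's sleep */

    static void *completer(void *arg)
    {
        (void)arg;
        puts("posting CQE");
        sem_post(&waiter_sem);  /* wake the one known waiter directly */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        sem_init(&waiter_sem, 0, 0);
        pthread_create(&t, NULL, completer, NULL);
        sem_wait(&waiter_sem);  /* no shared waitqueue involved */
        puts("submitter woken");
        pthread_join(t, NULL);
        sem_destroy(&waiter_sem);
        return 0;
    }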
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/103d174d35d919d4cb0922d8a9c93a8f0c35f74a.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f2e1dd076d98..be26829f7d20 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1263,7 +1263,7 @@ static void io_req_local_work_add(struct io_kiocb *req) percpu_ref_put(&ctx->refs); return; } - /* need it for the following io_cqring_wake() */ + /* needed for the following wake up */ smp_mb__after_atomic(); if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { @@ -1274,10 +1274,9 @@ static void io_req_local_work_add(struct io_kiocb *req) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ctx->has_evfd) io_eventfd_signal(ctx); - __io_cqring_wake(ctx); + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); percpu_ref_put(&ctx->refs); } @@ -2576,12 +2575,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { unsigned long check_cq; - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, - TASK_INTERRUPTIBLE); + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + set_current_state(TASK_INTERRUPTIBLE); + } else { + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, + TASK_INTERRUPTIBLE); + } + ret = io_cqring_wait_schedule(ctx, &iowq); + __set_current_state(TASK_RUNNING); if (ret < 0) break; - __set_current_state(TASK_RUNNING); /* * Run task_work after scheduling and before io_should_wake(). * If we got woken because of task_work being processed, run it @@ -2609,7 +2613,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, cond_resched(); } while (1); - finish_wait(&ctx->cq_wait, &iowq.wq); + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + finish_wait(&ctx->cq_wait, &iowq.wq); restore_saved_sigmask_unless(ret == -EINTR); return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; From d80c0f00d04766972c95e72b7535a842d6f4680d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:12 +0000 Subject: [PATCH 22/51] io_uring: add io_req_local_work_add wake fast path Don't wake the master task after queueing a deferred tw unless it's currently waiting in io_cqring_wait. 
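The fast path amounts to a waiter-announces-itself flag: the producer pays for a wake-up only if the consumer has declared it is about to sleep, and a stray extra wake-up is harmless while a missed one is not. A C11 sketch of the idea (names mirror the patch; the memory-ordering details of the real code are not reproduced here):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int cq_waiting;
    static atomic_int work_pending;

    static void queue_work_and_maybe_wake(void)
    {
        atomic_store(&work_pending, 1);
        if (atomic_load(&cq_waiting))        /* READ_ONCE(ctx->cq_waiting) */
            printf("wake the submitter\n");  /* wake_up_state(...) */
        else
            printf("waiter not sleeping, skip the wake\n");
    }

    int main(void)
    {
        queue_work_and_maybe_wake();         /* wake skipped */

        atomic_store(&cq_waiting, 1);        /* announce before sleeping */
        if (atomic_load(&work_pending))      /* re-check before schedule() */
            printf("work already queued, don't sleep\n");
        queue_work_and_maybe_wake();         /* now the wake is issued */
        atomic_store(&cq_waiting, 0);
        return 0;
    }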
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/717702d772825a6647e6c315b4690277ba84c3fc.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7b5e90520278..cc0cf0705b8f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -295,6 +295,7 @@ struct io_ring_ctx { spinlock_t completion_lock; bool poll_multi_queue; + bool cq_waiting; /* * ->iopoll_list is protected by the ctx->uring_lock for diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index be26829f7d20..3f92028a7ee8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1276,7 +1276,9 @@ static void io_req_local_work_add(struct io_kiocb *req) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) io_eventfd_signal(ctx); - wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); + + if (READ_ONCE(ctx->cq_waiting)) + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); percpu_ref_put(&ctx->refs); } @@ -2576,6 +2578,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, unsigned long check_cq; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + WRITE_ONCE(ctx->cq_waiting, 1); set_current_state(TASK_INTERRUPTIBLE); } else { prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, @@ -2584,6 +2587,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = io_cqring_wait_schedule(ctx, &iowq); __set_current_state(TASK_RUNNING); + WRITE_ONCE(ctx->cq_waiting, 0); + if (ret < 0) break; /* From c3f4d39ee4bc06f975ff45012398342ff276eb69 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:13 +0000 Subject: [PATCH 23/51] io_uring: optimise deferred tw execution We needed fake nodes in __io_run_local_work() to avoid unnecessary wake ups while the task was already running task_works, but we don't need them anymore since wake ups are protected by cq_waiting, which is always cleared by the time we're executing deferred task_work items. Note that because of the loose sync around cq_waiting clearing, io_req_local_work_add() may wake the task more than once, but that's fine and should be rare enough not to hurt performance.
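Reduced to its control flow, the simplified runner takes the whole list with one exchange and loops until nothing new has been queued. A userspace C11 sketch (take_all() stands in for io_llist_xchg(&ctx->work_llist, NULL)):

    #include <stdatomic.h>
    #include <stdio.h>

    struct node { struct node *next; };

    static _Atomic(struct node *) work_llist;

    static struct node *take_all(void)
    {
        return atomic_exchange(&work_llist, NULL);
    }

    static int run_local_work(void)
    {
        unsigned int loops = 0;
        int ret = 0;
        struct node *item;
    again:
        item = take_all();
        while (item) {
            struct node *next = item->next;
            ret++;              /* req->io_task_work.func(...) in the kernel */
            item = next;
        }
        loops++;
        /* anything queued while we ran? process it before returning */
        if (atomic_load(&work_llist))
            goto again;
        printf("ran %d items in %u loops\n", ret, loops);
        return ret;
    }

    int main(void)
    {
        struct node a = { NULL }, b = { &a };

        atomic_store(&work_llist, &b);
        run_local_work();
        return 0;
    }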
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8839534891f0a2f1076e78554a31ea7e099f7de5.1673274244.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3f92028a7ee8..653361746203 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1322,17 +1322,16 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) { struct llist_node *node; - struct llist_node fake; - struct llist_node *current_final = NULL; + unsigned int loops = 0; int ret = 0; - unsigned int loops = 1; if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; - - node = io_llist_xchg(&ctx->work_llist, &fake); + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: - while (node != current_final) { + node = io_llist_xchg(&ctx->work_llist, NULL); + while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1341,23 +1340,14 @@ again: ret++; node = next; } + loops++; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - node = io_llist_cmpxchg(&ctx->work_llist, &fake, NULL); - if (node != &fake) { - loops++; - current_final = &fake; - node = io_llist_xchg(&ctx->work_llist, &fake); + if (!llist_empty(&ctx->work_llist)) goto again; - } - if (*locked) io_submit_flush_completions(ctx); trace_io_uring_local_work_run(ctx, ret, loops); return ret; - } static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) From 88b80534f60f5ddf5f42218b280e0370f95eae78 Mon Sep 17 00:00:00 2001 From: Quanfa Fu Date: Sun, 15 Jan 2023 15:15:19 +0800 Subject: [PATCH 24/51] io_uring: make io_sqpoll_wait_sq return void Change the return type to void since it always returns 0; there is then no need to check the return value in the io_uring_enter syscall.
Signed-off-by: Quanfa Fu Link: https://lore.kernel.org/r/20230115071519.554282-1-quanfafu@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++----- io_uring/sqpoll.c | 3 +-- io_uring/sqpoll.h | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 653361746203..8fddc010ffe2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3412,11 +3412,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); - if (flags & IORING_ENTER_SQ_WAIT) { - ret = io_sqpoll_wait_sq(ctx); - if (ret) - goto out; - } + if (flags & IORING_ENTER_SQ_WAIT) + io_sqpoll_wait_sq(ctx); + ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 559652380672..0119d3f1a556 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -312,7 +312,7 @@ static int io_sq_thread(void *data) do_exit(0); } -int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) { DEFINE_WAIT(wait); @@ -327,7 +327,6 @@ int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) } while (!signal_pending(current)); finish_wait(&ctx->sqo_sq_wait, &wait); - return 0; } __cold int io_sq_offload_create(struct io_ring_ctx *ctx, diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index 0c3fbcd1f583..e1b8d508d22d 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -26,4 +26,4 @@ void io_sq_thread_stop(struct io_sq_data *sqd); void io_sq_thread_park(struct io_sq_data *sqd); void io_sq_thread_unpark(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd); -int io_sqpoll_wait_sq(struct io_ring_ctx *ctx); +void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); From b0b7a7d24b66109a940d09d8d2dcf513e4eaf3a1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jan 2023 16:48:57 +0000 Subject: [PATCH 25/51] io_uring: return back links tw run optimisation io_submit_flush_completions() may queue new requests for tw execution; this is especially true for linked requests. Recheck the tw list for emptiness after flushing completions. Note that this doesn't really fix the commit referenced below, but it does reinstate an optimisation that existed before that commit was merged. Fixes: f88262e60bb9 ("io_uring: lockless task list") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6328acdbb5e60efc762b18003382de077e6e1367.1673887636.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8fddc010ffe2..c47af3a3dacf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1344,8 +1344,11 @@ again: if (!llist_empty(&ctx->work_llist)) goto again; - if (*locked) + if (*locked) { io_submit_flush_completions(ctx); + if (!llist_empty(&ctx->work_llist)) + goto again; + } trace_io_uring_local_work_run(ctx, ret, loops); return ret; } From 89800a2dd570919bfe01ced90c80e3b472d1c723 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jan 2023 16:48:58 +0000 Subject: [PATCH 26/51] io_uring: don't export io_put_task() io_put_task() is only used in io_uring.c, so enclose it there together with __io_put_task().
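The shape of the helper being moved is worth spelling out: the owner task folds reference puts into a plain cached counter, and only foreign contexts pay for the atomic. A userspace sketch under that assumption (the kernel variant also refills the cache and handles exit paths):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct task_demo {
        pthread_t thread;
        atomic_int refs;     /* shared, atomic: the slow path */
        int cached_refs;     /* owner-only, plain int: the fast path */
    };

    static void put_task(struct task_demo *task, int nr)
    {
        if (pthread_equal(pthread_self(), task->thread))
            task->cached_refs += nr;            /* no atomic op needed */
        else
            atomic_fetch_sub(&task->refs, nr);  /* cross-thread put */
    }

    int main(void)
    {
        struct task_demo task = { .thread = pthread_self() };

        atomic_store(&task.refs, 8);
        put_task(&task, 3);                     /* hits the cached path */
        printf("refs=%d cached=%d\n",
               atomic_load(&task.refs), task.cached_refs);
        return 0;
    }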
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/43c7f9227e2ab215f1a6069dadbc5382bed346fe.1673887636.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++++++++++- io_uring/io_uring.h | 10 ---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c47af3a3dacf..f49d0036657f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -715,7 +715,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cqring_do_overflow_flush(ctx); } -void __io_put_task(struct task_struct *task, int nr) +static void __io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; @@ -725,6 +725,15 @@ void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + task->io_uring->cached_refs += nr; + else + __io_put_task(task, nr); +} + void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 5113e0ddb01d..c68edf9872a5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -70,7 +70,6 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); -void __io_put_task(struct task_struct *task, int nr); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); @@ -319,15 +318,6 @@ static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) __io_commit_cqring_flush(ctx); } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - if (likely(task == current)) - task->io_uring->cached_refs += nr; - else - __io_put_task(task, nr); -} - static inline void io_get_task_refs(int nr) { struct io_uring_task *tctx = current->io_uring; From 31f084b7b0288fd51740b1e1efdb0ff61fb81e48 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jan 2023 16:48:59 +0000 Subject: [PATCH 27/51] io_uring: simplify fallback execution Lock the ring with uring_lock in io_fallback_req_func(), which should make it a bit safer and easier. With that we also don't need refs pinning as io_ring_exit_work() will wait until uring_lock is freed. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/56170e6a0cbfc8edee2794c6613e8f6f1d76d276.1673887636.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f49d0036657f..c314dc111d5d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -245,17 +245,15 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - bool locked = false; + bool locked = true; - percpu_ref_get(&ctx->refs); + mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &locked); - - if (locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - } - percpu_ref_put(&ctx->refs); + if (WARN_ON_ONCE(!locked)) + return; + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); } static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) From 632ffe0956740ba56ae3f83778f3bf97edd57c69 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jan 2023 16:49:00 +0000 Subject: [PATCH 28/51] io_uring: optimise ctx flags layout Reading just one byte may cost differently from reading more, so it's beneficial to keep the ctx flag bits that we access together in a single byte. That change affected the code generation of __io_cq_unlock_post_flush(), removing one memory load. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bbe8ca4705704690319d65e45845f9fc9d35f420.1673887636.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index cc0cf0705b8f..0efe4d784358 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -196,17 +196,17 @@ struct io_ring_ctx { /* const or read-mostly hot data */ struct { unsigned int flags; - unsigned int compat: 1; unsigned int drain_next: 1; unsigned int restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; - unsigned int drain_disabled: 1; unsigned int has_evfd: 1; - unsigned int syscall_iopoll: 1; /* all CQEs should be posted only by the submitter task */ unsigned int task_complete: 1; + unsigned int syscall_iopoll: 1; unsigned int poll_activated: 1; + unsigned int drain_disabled: 1; + unsigned int compat: 1; enum task_work_notify_mode notify_method; struct io_rings *rings; From 68a2cc1bba98144340f6ed66f3aed57e914766d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jan 2023 16:49:01 +0000 Subject: [PATCH 29/51] io_uring: refactor __io_req_complete_post Keep parts of __io_req_complete_post() relying on req->flags together so the value can be cached.
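Both of these patches are about helping the compiler: grouping co-accessed bits so one narrow load covers them, and grouping req->flags users so the value stays cached in a register. The bit-packing half can be demonstrated standalone (exact layout is ABI-specific, but on the usual Linux ABIs consecutive 1-bit fields share the leading bytes of the unit):

    #include <stdio.h>
    #include <string.h>

    struct flags_demo {
        unsigned int task_complete : 1;   /* hot: tested on completion */
        unsigned int syscall_iopoll : 1;  /* hot: tested alongside it */
        unsigned int drain_active : 1;
        unsigned int compat : 1;          /* cold: set once at setup */
    };

    int main(void)
    {
        struct flags_demo d = { .task_complete = 1, .syscall_iopoll = 1 };
        unsigned char bytes[sizeof(d)];

        memcpy(bytes, &d, sizeof(d));
        /* both hot bits live in the same byte: one narrow load covers them */
        printf("byte0=0x%02x size=%zu\n", bytes[0], sizeof(d));
        return 0;
    }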
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2b4fbb42f404a0e75c4d9f0a5b16f314a839d0a9.1673887636.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c314dc111d5d..597b757e791e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -974,14 +974,14 @@ static void __io_req_complete_post(struct io_kiocb *req) req->link = NULL; } } + io_put_kbuf_comp(req); + io_dismantle_req(req); io_req_put_rsrc(req); /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid * deadlocks. */ - io_put_kbuf_comp(req); - io_dismantle_req(req); io_put_task(req->task, 1); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; From a7dd27828b00be8c0c7520c53baf0b360f4d8bea Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 12 Jan 2023 06:44:10 -0800 Subject: [PATCH 30/51] io_uring: Rename struct io_op_def The current io_op_def struct is becoming huge and the name is a bit generic. The goal of this patch is to rename this struct to `io_issue_def`. This struct will contain the hot functions associated with the issue code path. For now, this patch only renames the structure, and an upcoming patch will break up the structure in two, moving the non-issue fields to a secondary struct. Signed-off-by: Breno Leitao Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20230112144411.2624698-1-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 26 +++++++++++++------------- io_uring/opdef.c | 16 ++++++++-------- io_uring/opdef.h | 4 ++-- io_uring/poll.c | 2 +- io_uring/rw.c | 2 +- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 597b757e791e..e5b51abe0060 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -406,7 +406,7 @@ static inline void io_arm_ltimeout(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { @@ -1009,7 +1009,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; lockdep_assert_held(&req->ctx->uring_lock); @@ -1741,8 +1741,8 @@ unsigned int io_file_get_flags(struct file *file) bool io_alloc_async_data(struct io_kiocb *req) { - WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); - req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); + WARN_ON_ONCE(!io_issue_defs[req->opcode].async_size); + req->async_data = kmalloc(io_issue_defs[req->opcode].async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; @@ -1752,7 +1752,7 @@ bool io_alloc_async_data(struct io_kiocb *req) int io_req_prep_async(struct io_kiocb *req) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) @@ -1761,7 +1761,7 @@ int io_req_prep_async(struct io_kiocb *req) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) 
return -EFAULT; - if (!io_op_defs[req->opcode].manual_alloc) { + if (!io_issue_defs[req->opcode].manual_alloc) { if (io_alloc_async_data(req)) return -EAGAIN; } @@ -1829,7 +1829,7 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_NEED_CLEANUP) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; if (def->cleanup) def->cleanup(req); @@ -1855,7 +1855,7 @@ static void io_clean_op(struct io_kiocb *req) static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) { - if (req->file || !io_op_defs[req->opcode].needs_file) + if (req->file || !io_issue_defs[req->opcode].needs_file) return true; if (req->flags & REQ_F_FIXED_FILE) @@ -1868,7 +1868,7 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; int ret; @@ -1922,7 +1922,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; bool needs_poll = false; int ret = 0, err = -ECANCELED; @@ -2137,7 +2137,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def; + const struct io_issue_def *def; unsigned int sqe_flags; int personality; u8 opcode; @@ -2155,7 +2155,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = 0; return -EINVAL; } - def = &io_op_defs[opcode]; + def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) @@ -3859,7 +3859,7 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (!io_op_defs[i].not_supported) + if (!io_issue_defs[i].not_supported) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3aa0d65c50e3..3c95e70a625e 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -46,7 +46,7 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -const struct io_op_def io_op_defs[] = { +const struct io_issue_def io_issue_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, @@ -536,7 +536,7 @@ const struct io_op_def io_op_defs[] = { const char *io_uring_get_opcode(u8 opcode) { if (opcode < IORING_OP_LAST) - return io_op_defs[opcode].name; + return io_issue_defs[opcode].name; return "INVALID"; } @@ -544,12 +544,12 @@ void __init io_uring_optable_init(void) { int i; - BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); + BUILD_BUG_ON(ARRAY_SIZE(io_issue_defs) != IORING_OP_LAST); - for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { - BUG_ON(!io_op_defs[i].prep); - if (io_op_defs[i].prep != io_eopnotsupp_prep) - BUG_ON(!io_op_defs[i].issue); - WARN_ON_ONCE(!io_op_defs[i].name); + for (i = 0; i < ARRAY_SIZE(io_issue_defs); i++) { + BUG_ON(!io_issue_defs[i].prep); + if (io_issue_defs[i].prep != io_eopnotsupp_prep) + BUG_ON(!io_issue_defs[i].issue); + 
WARN_ON_ONCE(!io_issue_defs[i].name); } } diff --git a/io_uring/opdef.h b/io_uring/opdef.h index df7e13d9bfba..d718e2ab1ff7 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -2,7 +2,7 @@ #ifndef IOU_OP_DEF_H #define IOU_OP_DEF_H -struct io_op_def { +struct io_issue_def { /* needs req->file assigned */ unsigned needs_file : 1; /* should block plug */ @@ -41,7 +41,7 @@ struct io_op_def { void (*fail)(struct io_kiocb *); }; -extern const struct io_op_def io_op_defs[]; +extern const struct io_issue_def io_issue_defs[]; void io_uring_optable_init(void); #endif diff --git a/io_uring/poll.c b/io_uring/poll.c index 2ac1366adbd7..8339a92b4510 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -678,7 +678,7 @@ alloc_apoll: int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask = POLLPRI | POLLERR | EPOLLET; diff --git a/io_uring/rw.c b/io_uring/rw.c index 3a3aef7e7c33..02e710e77145 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -516,7 +516,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_rw_state *s, bool force) { - if (!force && !io_op_defs[req->opcode].prep_async) + if (!force && !io_issue_defs[req->opcode].prep_async) return 0; if (!req_has_async_data(req)) { struct io_async_rw *iorw; From f30bd4d03824fb437bf080c2b2f926cfee3f09d0 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 12 Jan 2023 06:44:11 -0800 Subject: [PATCH 31/51] io_uring: Split io_issue_def struct This patch removes some "cold" fields from `struct io_issue_def`. The plan is to keep only the most heavily used fields in `struct io_issue_def`, so it stays hot in the cache. The hot fields are basically all the bitfields and the callback functions for .issue and .prep. The less frequently used fields now live in a secondary, cold struct called `io_cold_def`.
This is the size for the structs: Before: io_issue_def = 56 bytes After: io_issue_def = 24 bytes; io_cold_def = 40 bytes Signed-off-by: Breno Leitao Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20230112144411.2624698-2-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 +- io_uring/opdef.c | 327 ++++++++++++++++++++++++++++++-------------- io_uring/opdef.h | 9 +- io_uring/rw.c | 2 +- 4 files changed, 238 insertions(+), 115 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e5b51abe0060..c84dce6944db 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1009,7 +1009,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; lockdep_assert_held(&req->ctx->uring_lock); @@ -1741,8 +1741,8 @@ unsigned int io_file_get_flags(struct file *file) bool io_alloc_async_data(struct io_kiocb *req) { - WARN_ON_ONCE(!io_issue_defs[req->opcode].async_size); - req->async_data = kmalloc(io_issue_defs[req->opcode].async_size, GFP_KERNEL); + WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); + req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; @@ -1752,20 +1752,21 @@ bool io_alloc_async_data(struct io_kiocb *req) int io_req_prep_async(struct io_kiocb *req) { + const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; const struct io_issue_def *def = &io_issue_defs[req->opcode]; /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->prep_async) + if (!cdef->prep_async) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; - if (!io_issue_defs[req->opcode].manual_alloc) { + if (!def->manual_alloc) { if (io_alloc_async_data(req)) return -EAGAIN; } - return def->prep_async(req); + return cdef->prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -1829,7 +1830,7 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_NEED_CLEANUP) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; if (def->cleanup) def->cleanup(req); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 3c95e70a625e..5238ecd7af6a 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -50,7 +50,6 @@ const struct io_issue_def io_issue_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, - .name = "NOP", .prep = io_nop_prep, .issue = io_nop, }, @@ -64,13 +63,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READV", .prep = io_prep_rw, .issue = io_read, - .prep_async = io_readv_prep_async, - .cleanup = io_readv_writev_cleanup, - .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -82,18 +76,12 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITEV", .prep = io_prep_rw, .issue = io_write, - .prep_async = io_writev_prep_async, - .cleanup = io_readv_writev_cleanup, - .fail = io_rw_fail, }, [IORING_OP_FSYNC] = { .needs_file = 1, .audit_skip = 1, - .name = "FSYNC", .prep = io_fsync_prep, .issue = 
io_fsync, }, @@ -106,11 +94,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ_FIXED", .prep = io_prep_rw, .issue = io_read, - .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -122,30 +107,24 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE_FIXED", .prep = io_prep_rw, .issue = io_write, - .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "POLL_ADD", .prep = io_poll_add_prep, .issue = io_poll_add, }, [IORING_OP_POLL_REMOVE] = { .audit_skip = 1, - .name = "POLL_REMOVE", .prep = io_poll_remove_prep, .issue = io_poll_remove, }, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, .audit_skip = 1, - .name = "SYNC_FILE_RANGE", .prep = io_sfr_prep, .issue = io_sync_file_range, }, @@ -155,14 +134,9 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .ioprio = 1, .manual_alloc = 1, - .name = "SENDMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, - .prep_async = io_sendmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif @@ -174,29 +148,21 @@ const struct io_issue_def io_issue_defs[] = { .buffer_select = 1, .ioprio = 1, .manual_alloc = 1, - .name = "RECVMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, - .prep_async = io_recvmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "TIMEOUT", .prep = io_timeout_prep, .issue = io_timeout, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ .audit_skip = 1, - .name = "TIMEOUT_REMOVE", .prep = io_timeout_remove_prep, .issue = io_timeout_remove, }, @@ -206,7 +172,6 @@ const struct io_issue_def io_issue_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ - .name = "ACCEPT", #if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, @@ -216,14 +181,11 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, - .name = "ASYNC_CANCEL", .prep = io_async_cancel_prep, .issue = io_async_cancel, }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "LINK_TIMEOUT", .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -231,46 +193,36 @@ const struct io_issue_def io_issue_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .name = "CONNECT", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, - .prep_async = io_connect_prep_async, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FALLOCATE] = { .needs_file = 1, - .name = "FALLOCATE", .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { - .name = "OPENAT", .prep = io_openat_prep, .issue = io_openat, - .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { - .name = "CLOSE", .prep = io_close_prep, .issue = io_close, }, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, - .name = "FILES_UPDATE", .prep = io_files_update_prep, .issue = io_files_update, }, 
[IORING_OP_STATX] = { .audit_skip = 1, - .name = "STATX", .prep = io_statx_prep, .issue = io_statx, - .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { .needs_file = 1, @@ -282,11 +234,8 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ", .prep = io_prep_rw, .issue = io_read, - .fail = io_rw_fail, }, [IORING_OP_WRITE] = { .needs_file = 1, @@ -298,21 +247,16 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE", .prep = io_prep_rw, .issue = io_write, - .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { .needs_file = 1, .audit_skip = 1, - .name = "FADVISE", .prep = io_fadvise_prep, .issue = io_fadvise, }, [IORING_OP_MADVISE] = { - .name = "MADVISE", .prep = io_madvise_prep, .issue = io_madvise, }, @@ -323,13 +267,9 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, .manual_alloc = 1, - .name = "SEND", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_send, - .fail = io_sendrecv_fail, - .prep_async = io_send_prep_async, #else .prep = io_eopnotsupp_prep, #endif @@ -341,25 +281,20 @@ const struct io_issue_def io_issue_defs[] = { .buffer_select = 1, .audit_skip = 1, .ioprio = 1, - .name = "RECV", #if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_OPENAT2] = { - .name = "OPENAT2", .prep = io_openat2_prep, .issue = io_openat2, - .cleanup = io_open_cleanup, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "EPOLL", #if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, @@ -372,21 +307,18 @@ const struct io_issue_def io_issue_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "SPLICE", .prep = io_splice_prep, .issue = io_splice, }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, - .name = "PROVIDE_BUFFERS", .prep = io_provide_buffers_prep, .issue = io_provide_buffers, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, - .name = "REMOVE_BUFFERS", .prep = io_remove_buffers_prep, .issue = io_remove_buffers, }, @@ -395,13 +327,11 @@ const struct io_issue_def io_issue_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "TEE", .prep = io_tee_prep, .issue = io_tee, }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, - .name = "SHUTDOWN", #if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, @@ -410,72 +340,51 @@ const struct io_issue_def io_issue_defs[] = { #endif }, [IORING_OP_RENAMEAT] = { - .name = "RENAMEAT", .prep = io_renameat_prep, .issue = io_renameat, - .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { - .name = "UNLINKAT", .prep = io_unlinkat_prep, .issue = io_unlinkat, - .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { - .name = "MKDIRAT", .prep = io_mkdirat_prep, .issue = io_mkdirat, - .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { - .name = "SYMLINKAT", .prep = io_symlinkat_prep, .issue = io_symlinkat, - .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { - .name = "LINKAT", .prep = io_linkat_prep, .issue = io_linkat, - .cleanup = io_link_cleanup, }, [IORING_OP_MSG_RING] = { .needs_file = 1, .iopoll = 1, - .name = "MSG_RING", .prep = io_msg_ring_prep, .issue = io_msg_ring, - .cleanup = io_msg_ring_cleanup, }, 
[IORING_OP_FSETXATTR] = { .needs_file = 1, - .name = "FSETXATTR", .prep = io_fsetxattr_prep, .issue = io_fsetxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { - .name = "SETXATTR", .prep = io_setxattr_prep, .issue = io_setxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, - .name = "FGETXATTR", .prep = io_fgetxattr_prep, .issue = io_fgetxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { - .name = "GETXATTR", .prep = io_getxattr_prep, .issue = io_getxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .audit_skip = 1, - .name = "SOCKET", #if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, @@ -486,16 +395,12 @@ const struct io_issue_def io_issue_defs[] = { [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, - .name = "URING_CMD", .iopoll = 1, .iopoll_queue = 1, - .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, - .prep_async = io_uring_cmd_prep_async, }, [IORING_OP_SEND_ZC] = { - .name = "SEND_ZC", .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, @@ -503,32 +408,243 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_send_zc, - .prep_async = io_send_prep_async, - .cleanup = io_send_zc_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_SENDMSG_ZC] = { - .name = "SENDMSG_ZC", .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_sendmsg_zc, +#else + .prep = io_eopnotsupp_prep, +#endif + }, +}; + + +const struct io_cold_def io_cold_defs[] = { + [IORING_OP_NOP] = { + .name = "NOP", + }, + [IORING_OP_READV] = { + .async_size = sizeof(struct io_async_rw), + .name = "READV", + .prep_async = io_readv_prep_async, + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_WRITEV] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITEV", + .prep_async = io_writev_prep_async, + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_FSYNC] = { + .name = "FSYNC", + }, + [IORING_OP_READ_FIXED] = { + .async_size = sizeof(struct io_async_rw), + .name = "READ_FIXED", + .fail = io_rw_fail, + }, + [IORING_OP_WRITE_FIXED] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITE_FIXED", + .fail = io_rw_fail, + }, + [IORING_OP_POLL_ADD] = { + .name = "POLL_ADD", + }, + [IORING_OP_POLL_REMOVE] = { + .name = "POLL_REMOVE", + }, + [IORING_OP_SYNC_FILE_RANGE] = { + .name = "SYNC_FILE_RANGE", + }, + [IORING_OP_SENDMSG] = { + .name = "SENDMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep_async = io_sendmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_RECVMSG] = { + .name = "RECVMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep_async = io_recvmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_TIMEOUT] = { + .async_size = sizeof(struct io_timeout_data), + .name = "TIMEOUT", + }, + [IORING_OP_TIMEOUT_REMOVE] = { + .name = "TIMEOUT_REMOVE", + }, + [IORING_OP_ACCEPT] = { + .name = "ACCEPT", + }, + [IORING_OP_ASYNC_CANCEL] = { + .name = "ASYNC_CANCEL", + }, + [IORING_OP_LINK_TIMEOUT] = { + .async_size = sizeof(struct 
io_timeout_data), + .name = "LINK_TIMEOUT", + }, + [IORING_OP_CONNECT] = { + .name = "CONNECT", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_connect), + .prep_async = io_connect_prep_async, +#endif + }, + [IORING_OP_FALLOCATE] = { + .name = "FALLOCATE", + }, + [IORING_OP_OPENAT] = { + .name = "OPENAT", + .cleanup = io_open_cleanup, + }, + [IORING_OP_CLOSE] = { + .name = "CLOSE", + }, + [IORING_OP_FILES_UPDATE] = { + .name = "FILES_UPDATE", + }, + [IORING_OP_STATX] = { + .name = "STATX", + .cleanup = io_statx_cleanup, + }, + [IORING_OP_READ] = { + .async_size = sizeof(struct io_async_rw), + .name = "READ", + .fail = io_rw_fail, + }, + [IORING_OP_WRITE] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITE", + .fail = io_rw_fail, + }, + [IORING_OP_FADVISE] = { + .name = "FADVISE", + }, + [IORING_OP_MADVISE] = { + .name = "MADVISE", + }, + [IORING_OP_SEND] = { + .name = "SEND", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .fail = io_sendrecv_fail, + .prep_async = io_send_prep_async, +#endif + }, + [IORING_OP_RECV] = { + .name = "RECV", +#if defined(CONFIG_NET) + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_OPENAT2] = { + .name = "OPENAT2", + .cleanup = io_open_cleanup, + }, + [IORING_OP_EPOLL_CTL] = { + .name = "EPOLL", + }, + [IORING_OP_SPLICE] = { + .name = "SPLICE", + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .name = "PROVIDE_BUFFERS", + }, + [IORING_OP_REMOVE_BUFFERS] = { + .name = "REMOVE_BUFFERS", + }, + [IORING_OP_TEE] = { + .name = "TEE", + }, + [IORING_OP_SHUTDOWN] = { + .name = "SHUTDOWN", + }, + [IORING_OP_RENAMEAT] = { + .name = "RENAMEAT", + .cleanup = io_renameat_cleanup, + }, + [IORING_OP_UNLINKAT] = { + .name = "UNLINKAT", + .cleanup = io_unlinkat_cleanup, + }, + [IORING_OP_MKDIRAT] = { + .name = "MKDIRAT", + .cleanup = io_mkdirat_cleanup, + }, + [IORING_OP_SYMLINKAT] = { + .name = "SYMLINKAT", + .cleanup = io_link_cleanup, + }, + [IORING_OP_LINKAT] = { + .name = "LINKAT", + .cleanup = io_link_cleanup, + }, + [IORING_OP_MSG_RING] = { + .name = "MSG_RING", + .cleanup = io_msg_ring_cleanup, + }, + [IORING_OP_FSETXATTR] = { + .name = "FSETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SETXATTR] = { + .name = "SETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_FGETXATTR] = { + .name = "FGETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_GETXATTR] = { + .name = "GETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SOCKET] = { + .name = "SOCKET", + }, + [IORING_OP_URING_CMD] = { + .name = "URING_CMD", + .async_size = uring_cmd_pdu_size(1), + .prep_async = io_uring_cmd_prep_async, + }, + [IORING_OP_SEND_ZC] = { + .name = "SEND_ZC", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep_async = io_send_prep_async, + .cleanup = io_send_zc_cleanup, + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_SENDMSG_ZC] = { + .name = "SENDMSG_ZC", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep_async = io_sendmsg_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, -#else - .prep = io_eopnotsupp_prep, #endif }, }; @@ -536,7 +652,7 @@ const struct io_issue_def io_issue_defs[] = { const char *io_uring_get_opcode(u8 opcode) { if (opcode < IORING_OP_LAST) - return io_issue_defs[opcode].name; + return io_cold_defs[opcode].name; return "INVALID"; } @@ -544,12 +660,13 @@ void __init io_uring_optable_init(void) { int i; + BUILD_BUG_ON(ARRAY_SIZE(io_cold_defs) != IORING_OP_LAST); BUILD_BUG_ON(ARRAY_SIZE(io_issue_defs) != 
IORING_OP_LAST); for (i = 0; i < ARRAY_SIZE(io_issue_defs); i++) { BUG_ON(!io_issue_defs[i].prep); if (io_issue_defs[i].prep != io_eopnotsupp_prep) BUG_ON(!io_issue_defs[i].issue); - WARN_ON_ONCE(!io_issue_defs[i].name); + WARN_ON_ONCE(!io_cold_defs[i].name); } } diff --git a/io_uring/opdef.h b/io_uring/opdef.h index d718e2ab1ff7..c22c8696e749 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -29,19 +29,24 @@ struct io_issue_def { unsigned iopoll_queue : 1; /* opcode specific path will handle ->async_data allocation if needed */ unsigned manual_alloc : 1; + + int (*issue)(struct io_kiocb *, unsigned int); + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); +}; + +struct io_cold_def { /* size of async data needed, if any */ unsigned short async_size; const char *name; - int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); - int (*issue)(struct io_kiocb *, unsigned int); int (*prep_async)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; extern const struct io_issue_def io_issue_defs[]; +extern const struct io_cold_def io_cold_defs[]; void io_uring_optable_init(void); #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index 02e710e77145..efe6bfda9ca9 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -516,7 +516,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_rw_state *s, bool force) { - if (!force && !io_issue_defs[req->opcode].prep_async) + if (!force && !io_cold_defs[req->opcode].prep_async) return 0; if (!req_has_async_data(req)) { struct io_async_rw *iorw; From 8572df941cbef2b295282535b013828e7df39471 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 21 Jan 2023 19:53:41 -0700 Subject: [PATCH 32/51] io_uring/msg-ring: ensure flags passing works for task_work completions If the target ring is using IORING_SETUP_SINGLE_ISSUER and we're posting a message from a different thread, then we need to ensure that the fallback task_work that posts the CQE knows about the flags being passed as well. If not, we'll always post 0 as the flags. Fixes: 3563d7ed58a5 ("io_uring/msg_ring: Pass custom flags to the cqe") Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 27992bda1ffa..8803c0979e2a 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -99,6 +99,11 @@ static void io_msg_tw_complete(struct callback_head *head) if (current->flags & PF_EXITING) { ret = -EOWNERDEAD; } else { + u32 flags = 0; + + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + /* * If the target ring is using IOPOLL mode, then we need to be * holding the uring_lock for posting completions. Other ring */ if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&target_ctx->uring_lock); - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&target_ctx->uring_lock); From b5d3ae202fbfe055aa2a8ae8524531ee1dcab717 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Jan 2023 08:24:25 -0700 Subject: [PATCH 33/51] io_uring: handle TIF_NOTIFY_RESUME when checking for task_work If TIF_NOTIFY_RESUME is set, then we need to call resume_user_mode_work() for PF_IO_WORKER threads.
They never return to usermode, hence never get a chance to process any items that are marked by this flag. Most notably this includes the final put of files, but also any throttling markers set by block cgroups. Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index c68edf9872a5..d58cfe062da9 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -3,6 +3,7 @@ #include #include +#include <linux/resume_user_mode.h> #include #include #include "io-wq.h" @@ -274,6 +275,13 @@ static inline int io_run_task_work(void) */ if (test_thread_flag(TIF_NOTIFY_SIGNAL)) clear_notify_signal(); + /* + * PF_IO_WORKER never returns to userspace, so check here if we have + * notify work that needs processing. + */ + if (current->flags & PF_IO_WORKER && + test_thread_flag(TIF_NOTIFY_RESUME)) + resume_user_mode_work(NULL); if (task_work_pending(current)) { __set_current_state(TASK_RUNNING); task_work_run(); From c1755c25a7190494b45861284b4a30bd9cd813ff Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 18 Jan 2023 07:56:30 -0800 Subject: [PATCH 34/51] io_uring: Enable KASAN for request cache Every io_uring request is represented by struct io_kiocb, which is cached locally by io_uring (not SLAB/SLUB) in the list called submit_state.free_list. This patch simply enables KASAN for this free list. The list is initially created by KMEM_CACHE, but is later managed by io_uring. This patch poisons the objects that are not in use (i.e., they sit on the free list), and unpoisons them when they are allocated/removed from the list. Touching these poisoned objects while they are on the free list will cause a KASAN warning. Suggested-by: Jens Axboe Signed-off-by: Breno Leitao Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- io_uring/io_uring.h | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c84dce6944db..407834255a59 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -151,7 +151,7 @@ static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static __cold void io_fallback_tw(struct io_uring_task *tctx); -static struct kmem_cache *req_cachep; +struct kmem_cache *req_cachep; struct sock *io_uring_get_socket(struct file *file) { @@ -230,6 +230,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + kasan_poison_object_data(req_cachep, req); } static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index d58cfe062da9..9270156288aa 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -4,6 +4,7 @@ #include #include #include +#include <linux/kasan.h> #include #include #include "io-wq.h" @@ -347,12 +348,16 @@ static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) return true; } +extern struct kmem_cache *req_cachep; + static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) { - struct io_wq_work_node *node; + struct io_kiocb *req; - node = wq_stack_extract(&ctx->submit_state.free_list); - return container_of(node, struct io_kiocb, comp_list); + req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); + kasan_unpoison_object_data(req_cachep, req); +
wq_stack_extract(&ctx->submit_state.free_list); + return req; } static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) From f499254474a83bf60191c86de82c1fec1d8eb9f9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Jan 2023 09:10:30 -0700 Subject: [PATCH 35/51] io_uring: pass in io_issue_def to io_assign_file() This generates better code for me, avoiding an extra load on arm64, and both call sites already have this variable available for easy passing. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 407834255a59..4958725e0b97 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1855,9 +1855,10 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } -static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, + unsigned int issue_flags) { - if (req->file || !io_issue_defs[req->opcode].needs_file) + if (req->file || !def->needs_file) return true; if (req->flags & REQ_F_FIXED_FILE) @@ -1874,7 +1875,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) const struct cred *creds = NULL; int ret; - if (unlikely(!io_assign_file(req, issue_flags))) + if (unlikely(!io_assign_file(req, def, issue_flags))) return -EBADF; if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) @@ -1943,7 +1944,7 @@ fail: io_req_task_queue_fail(req, err); return; } - if (!io_assign_file(req, issue_flags)) { + if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; work->flags |= IO_WQ_WORK_CANCEL; goto fail; From c10bb64684813a326174c3eebcafb3ee5af52ca3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Jan 2023 14:37:13 +0000 Subject: [PATCH 36/51] io_uring: use user visible tail in io_uring_poll() We return POLLIN from io_uring_poll() depending on whether there are CQEs for userspace, and so we should use the user visible tail pointer instead of a transient cached value. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/228ffcbf30ba98856f66ffdb9a6a60ead1dd96c0.1674484266.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4958725e0b97..1681be25cf58 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2886,7 +2886,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * pushes them to do the flush. */ - if (io_cqring_events(ctx) || io_has_work(ctx)) + if (__io_cqring_events_user(ctx) || io_has_work(ctx)) mask |= EPOLLIN | EPOLLRDNORM; return mask; From b2aa66aff60c841b2c93242752c25abf4c82a28c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Jan 2023 14:37:14 +0000 Subject: [PATCH 37/51] io_uring: kill outdated comment about overflow flush __io_cqring_overflow_flush() doesn't return anything anymore; remove the outdated comment.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4ce2bcbb17eac80cdf883fd1459d5ee6586e238c.1674484266.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1681be25cf58..b865275898b1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -666,7 +666,6 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) } } -/* Returns true if there are no backlogged entries after the flush */ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) { size_t cqe_size = sizeof(struct io_uring_cqe); From b5083dfa36676e7b5d72bf3d70f429a0d08c5075 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Jan 2023 14:37:15 +0000 Subject: [PATCH 38/51] io_uring: improve io_get_sqe Return the SQE from io_get_sqe() via a parameter and use the return value to indicate whether it failed. This enables the compiler to compile out the sqe NULL check when we know that the returned SQE is valid. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9cceb11329240ea097dffef6bf0a675bca14cf42.1674484266.git.asml.silence@gmail.com [axboe: remove bogus const modifier on return value] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b865275898b1..c1f27626c205 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2368,7 +2368,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) +static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { unsigned head, mask = ctx->sq_entries - 1; unsigned sq_idx = ctx->cached_sq_head++ & mask; @@ -2386,14 +2386,15 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) /* double index for 128-byte SQEs, twice as long */ if (ctx->flags & IORING_SETUP_SQE128) head <<= 1; - return &ctx->sq_sqes[head]; + *sqe = &ctx->sq_sqes[head]; + return true; } /* drop invalid entries */ ctx->cq_extra--; WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); - return NULL; + return false; } int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) @@ -2417,8 +2418,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (unlikely(!io_alloc_req_refill(ctx))) break; req = io_alloc_req(ctx); - sqe = io_get_sqe(ctx); - if (unlikely(!sqe)) { + if (unlikely(!io_get_sqe(ctx, &sqe))) { io_req_add_to_cache(req, ctx); break; } From c8576f3e612d3c5b01d434ae296b9b76d7907708 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Jan 2023 14:37:16 +0000 Subject: [PATCH 39/51] io_uring: refactor req allocation Follow the io_get_sqe() pattern of returning the result via a pointer, and hide the request cache refill inside io_alloc_req().
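The shape of the refactored allocator, reduced to a userspace sketch with hypothetical names and a trivial free list for illustration: the fast path stays inline, the slow refill hides behind one branch, and callers test a single boolean. This is not the kernel implementation, just the pattern:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct req {
            struct req *next;
    };

    struct ctx {
            struct req *free_list;
    };

    /* slow path: refill the cache (here, one allocation at a time) */
    static bool refill(struct ctx *c)
    {
            struct req *r = malloc(sizeof(*r));

            if (!r)
                    return false;
            r->next = c->free_list;
            c->free_list = r;
            return true;
    }

    /* assumes the cache is non-empty, mirroring an "extract" helper */
    static struct req *extract_req(struct ctx *c)
    {
            struct req *r = c->free_list;

            c->free_list = r->next;
            return r;
    }

    /* result via out pointer, success via return value */
    static bool alloc_req(struct ctx *c, struct req **req)
    {
            if (!c->free_list && !refill(c))
                    return false;
            *req = extract_req(c);
            return true;
    }

    int main(void)
    {
            struct ctx c = { .free_list = NULL };
            struct req *r;

            if (alloc_req(&c, &r)) {
                    printf("got %p\n", (void *)r);
                    free(r);
            }
            return 0;
    }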
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8c37c2e8a3cb5e4cd6a8ae3b91371227a92708a6.1674484266.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 +++---- io_uring/io_uring.h | 19 +++++++++++-------- io_uring/notif.c | 3 +-- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c1f27626c205..2b046a80f75b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2415,9 +2415,8 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) const struct io_uring_sqe *sqe; struct io_kiocb *req; - if (unlikely(!io_alloc_req_refill(ctx))) + if (unlikely(!io_alloc_req(ctx, &req))) break; - req = io_alloc_req(ctx); if (unlikely(!io_get_sqe(ctx, &sqe))) { io_req_add_to_cache(req, ctx); break; @@ -2736,14 +2735,14 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_req_caches_free(struct io_ring_ctx *ctx) { + struct io_kiocb *req; int nr = 0; mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { - struct io_kiocb *req = io_alloc_req(ctx); - + req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); nr++; } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 9270156288aa..f90816aac896 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -341,16 +341,9 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) return !ctx->submit_state.free_list.next; } -static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) -{ - if (unlikely(io_req_cache_empty(ctx))) - return __io_alloc_req_refill(ctx); - return true; -} - extern struct kmem_cache *req_cachep; -static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { struct io_kiocb *req; @@ -360,6 +353,16 @@ static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) return req; } +static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req) +{ + if (unlikely(io_req_cache_empty(ctx))) { + if (!__io_alloc_req_refill(ctx)) + return false; + } + *req = io_extract_req(ctx); + return true; +} + static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) { return likely(ctx->submitter_task == current); } diff --git a/io_uring/notif.c b/io_uring/notif.c index c4bb793ebf0e..09dfd0832d19 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -68,9 +68,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) struct io_kiocb *notif; struct io_notif_data *nd; - if (unlikely(!io_alloc_req_refill(ctx))) + if (unlikely(!io_alloc_req(ctx, &notif))) return NULL; - notif = io_alloc_req(ctx); notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; From 5afa4650713918e60865ed42d9439e82f6d24773 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Jan 2023 14:37:17 +0000 Subject: [PATCH 40/51] io_uring: refactor io_put_task helpers Add a helper for putting refs from the target task context, rename __io_put_task() and add a couple of comments. Use the remote version for __io_req_complete_post(); the local one is only needed for __io_submit_flush_completions().
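The split makes the ownership rule explicit: only the task that owns the refs may use the cheap cached path; anyone else pays for an atomic operation. A userspace analogue with hypothetical names, using C11 atomics for the remote case:

    #include <stdatomic.h>
    #include <stdio.h>

    struct task {
            atomic_int refs;        /* shared, touched by any thread */
            int cached_refs;        /* private to the owning thread */
    };

    /* can be called by any thread: needs the atomic */
    static void put_refs_remote(struct task *t, int nr)
    {
            atomic_fetch_sub(&t->refs, nr);
    }

    /* only the owning thread: batch into the private counter,
     * to be folded back into t->refs at some later point */
    static void put_refs_local(struct task *t, int nr)
    {
            t->cached_refs += nr;
    }

    int main(void)
    {
            struct task t = { .refs = 8, .cached_refs = 0 };

            put_refs_local(&t, 1);  /* owner: no atomic op */
            put_refs_remote(&t, 1); /* anyone else: atomic sub */
            printf("refs=%d cached=%d\n", atomic_load(&t.refs), t.cached_refs);
            return 0;
    }

Separating the two helpers also lets a caller that knows it may run in any task context (like a completion posted from a remote ring) call the remote variant directly, skipping the task == current check.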
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3bf92ebd594769d8a5d648472a8e335f2031d542.1674484266.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2b046a80f75b..106cba919023 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -713,7 +713,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cqring_do_overflow_flush(ctx); } -static void __io_put_task(struct task_struct *task, int nr) +/* can be called by any task */ +static void io_put_task_remote(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; @@ -723,13 +724,19 @@ static void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } +/* used by a task to put its own references */ +static void io_put_task_local(struct task_struct *task, int nr) +{ + task->io_uring->cached_refs += nr; +} + /* must be called somewhat shortly after putting a request */ static inline void io_put_task(struct task_struct *task, int nr) { if (likely(task == current)) - task->io_uring->cached_refs += nr; + io_put_task_local(task, nr); else - __io_put_task(task, nr); + io_put_task_remote(task, nr); } void io_task_refs_refill(struct io_uring_task *tctx) @@ -982,7 +989,7 @@ static void __io_req_complete_post(struct io_kiocb *req) * we don't hold ->completion_lock. Clean them here to avoid * deadlocks. */ - io_put_task(req->task, 1); + io_put_task_remote(req->task, 1); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } @@ -1105,7 +1112,7 @@ __cold void io_free_req(struct io_kiocb *req) io_req_put_rsrc(req); io_dismantle_req(req); - io_put_task(req->task, 1); + io_put_task_remote(req->task, 1); spin_lock(&ctx->completion_lock); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); From cb6bf7f285c270d7808807128b5bce414e3f254a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Jan 2023 14:37:18 +0000 Subject: [PATCH 41/51] io_uring: refactor tctx_task_work Merge the almost identical sections of tctx_task_work(); this will make later code modifications easier and also inlines handle_tw_list().
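The loop being merged relies on a sentinel ("fake") node: swap the list head for the sentinel, walk the detached chain, then try to cmpxchg the sentinel away; if that fails, new work arrived on top of the sentinel and another pass is needed. A single-threaded C11 sketch of the mechanism, with hypothetical names (the real list is the kernel's lock-free llist):

    #include <stdatomic.h>
    #include <stdio.h>

    struct node {
            struct node *next;
    };

    static _Atomic(struct node *) head;
    static struct node fake;        /* sentinel, never processed */

    static void push(struct node *n)
    {
            n->next = atomic_load(&head);
            while (!atomic_compare_exchange_weak(&head, &n->next, n))
                    ;
    }

    static void drain(void)
    {
            for (;;) {
                    /* detach the current chain, leave the sentinel behind */
                    struct node *n = atomic_exchange(&head, &fake);

                    /* stop at NULL (first pass) or the sentinel (later passes) */
                    while (n && n != &fake) {
                            struct node *next = n->next;

                            printf("handle %p\n", (void *)n);
                            n = next;
                    }

                    /* if nothing was pushed meanwhile, clear the sentinel */
                    struct node *expected = &fake;
                    if (atomic_compare_exchange_strong(&head, &expected, NULL))
                            break;
            }
    }

    int main(void)
    {
            struct node a, b;

            push(&a);
            push(&b);
            drain();
            return 0;
    }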
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d06592d91e3e7559e7a4dbb8907d110863008dc7.1674484266.git.asml.silence@gmail.com [axboe: fold in setting count to zero patch from Tom Rix] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 106cba919023..55101013f3ee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1166,7 +1166,7 @@ static unsigned int handle_tw_list(struct llist_node *node, { unsigned int count = 0; - while (node != last) { + while (node && node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1226,23 +1226,20 @@ void tctx_task_work(struct callback_head *cb) task_work); struct llist_node fake = {}; struct llist_node *node; - unsigned int loops = 1; - unsigned int count; + unsigned int loops = 0; + unsigned int count = 0; if (unlikely(current->flags & PF_EXITING)) { io_fallback_tw(tctx); return; } - node = io_llist_xchg(&tctx->task_list, &fake); - count = handle_tw_list(node, &ctx, &uring_locked, NULL); - node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - while (node != &fake) { + do { loops++; node = io_llist_xchg(&tctx->task_list, &fake); count += handle_tw_list(node, &ctx, &uring_locked, &fake); node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - } + } while (node != &fake); ctx_flush_and_put(ctx, &uring_locked); From 50470fc5723ae0adb2f429a8b27ff6bf1a41913e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 23 Jan 2023 14:37:19 +0000 Subject: [PATCH 42/51] io_uring: return normal tw run linking optimisation io_submit_flush_completions() may produce new task_work items, so it's a good idea to recheck the task_work list after flushing completions. The optimisation is not new; it was accidentally removed by f88262e60bb9 ("io_uring: lockless task list") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a7ed5ede84de190832cc33ebbcdd6e91cd90f5b6.1674484266.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 55101013f3ee..9c92ca081c11 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1238,6 +1238,15 @@ void tctx_task_work(struct callback_head *cb) loops++; node = io_llist_xchg(&tctx->task_list, &fake); count += handle_tw_list(node, &ctx, &uring_locked, &fake); + + /* skip expensive cmpxchg if there are items in the list */ + if (READ_ONCE(tctx->task_list.first) != &fake) + continue; + if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { + io_submit_flush_completions(ctx); + if (READ_ONCE(tctx->task_list.first) != &fake) + continue; + } node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } while (node != &fake); From fcc926bb857949dbfa51a7d95f3f5ebc657f198c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Jan 2023 09:28:13 -0700 Subject: [PATCH 43/51] io_uring: add a conditional reschedule to the IOPOLL cancelation loop If the kernel is configured with CONFIG_PREEMPT_NONE, we could be sitting in a tight loop reaping events but not giving them a chance to finish. This results in a trace like: rcu: INFO: rcu_sched self-detected stall on CPU rcu: 2-...!: (5249 ticks this GP) idle=935c/1/0x4000000000000000 softirq=4265/4274 fqs=1 (t=5251 jiffies g=465 q=4135 ncpus=4) rcu: rcu_sched kthread starved for 5249 jiffies!
g465 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0 rcu: Unless rcu_sched kthread gets sufficient CPU time, OOM is now expected behavior. rcu: RCU grace-period kthread stack dump: task:rcu_sched state:R running task stack:0 pid:12 ppid:2 flags:0x00000008 Call trace: __switch_to+0xb0/0xc8 __schedule+0x43c/0x520 schedule+0x4c/0x98 schedule_timeout+0xbc/0xdc rcu_gp_fqs_loop+0x308/0x344 rcu_gp_kthread+0xd8/0xf0 kthread+0xb8/0xc8 ret_from_fork+0x10/0x20 rcu: Stack dump where RCU GP kthread last ran: Task dump for CPU 0: task:kworker/u8:10 state:R running task stack:0 pid:89 ppid:2 flags:0x0000000a Workqueue: events_unbound io_ring_exit_work Call trace: __switch_to+0xb0/0xc8 0xffff0000c8fefd28 CPU: 2 PID: 95 Comm: kworker/u8:13 Not tainted 6.2.0-rc5-00042-g40316e337c80-dirty #2759 Hardware name: linux,dummy-virt (DT) Workqueue: events_unbound io_ring_exit_work pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : io_do_iopoll+0x344/0x360 lr : io_do_iopoll+0xb8/0x360 sp : ffff800009bebc60 x29: ffff800009bebc60 x28: 0000000000000000 x27: 0000000000000000 x26: ffff0000c0f67d48 x25: ffff0000c0f67840 x24: ffff800008950024 x23: 0000000000000001 x22: 0000000000000000 x21: ffff0000c27d3200 x20: ffff0000c0f67840 x19: ffff0000c0f67800 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000001 x13: 0000000000000001 x12: 0000000000000000 x11: 0000000000000179 x10: 0000000000000870 x9 : ffff800009bebd60 x8 : ffff0000c27d3ad0 x7 : fefefefefefefeff x6 : 0000646e756f626e x5 : ffff0000c0f67840 x4 : 0000000000000000 x3 : ffff0000c2398000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 Call trace: io_do_iopoll+0x344/0x360 io_uring_try_cancel_requests+0x21c/0x334 io_ring_exit_work+0x90/0x40c process_one_work+0x1a4/0x254 worker_thread+0x1ec/0x258 kthread+0xb8/0xc8 ret_from_fork+0x10/0x20 Add a cond_resched() in the cancelation IOPOLL loop to fix this. Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9c92ca081c11..fab581a31dc1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3160,6 +3160,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; + cond_resched(); } } From f58680085478dd292435727210122960d38e8014 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Jan 2023 09:50:31 -0700 Subject: [PATCH 44/51] io_uring: add reschedule point to handle_tw_list() If CONFIG_PREEMPT_NONE is set and the task_work chains are long, we could be running into issues blocking others for too long. Add a reschedule check in handle_tw_list(), and flush the ctx if we need to reschedule. 
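The same structure reduces to a compact userspace analogue, with hypothetical names: inside a long drain loop, when a "should yield" condition is seen, first drop any held lock, then yield, then continue. sched_yield() stands in for cond_resched() here, and a simple counter stands in for need_resched():

    #include <pthread.h>
    #include <sched.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

    /* stand-in for need_resched(): pretend every 64th item asks us to yield */
    static bool should_yield(unsigned int count)
    {
            return (count % 64) == 0;
    }

    static void handle_items(unsigned int nr)
    {
            bool locked = (pthread_mutex_trylock(&ctx_lock) == 0);

            for (unsigned int count = 1; count <= nr; count++) {
                    /* ... process one item here ... */
                    if (should_yield(count)) {
                            /* drop the lock before yielding the CPU */
                            if (locked) {
                                    pthread_mutex_unlock(&ctx_lock);
                                    locked = false;
                            }
                            sched_yield();  /* analogue of cond_resched() */
                            locked = (pthread_mutex_trylock(&ctx_lock) == 0);
                    }
            }
            if (locked)
                    pthread_mutex_unlock(&ctx_lock);
            printf("processed %u items\n", nr);
    }

    int main(void)
    {
            handle_items(200);
            return 0;
    }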
Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fab581a31dc1..acf6d9680d76 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1179,10 +1179,16 @@ static unsigned int handle_tw_list(struct llist_node *node, /* if not contended, grab and improve batching */ *locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); - } + } else if (!*locked) + *locked = mutex_trylock(&(*ctx)->uring_lock); req->io_task_work.func(req, locked); node = next; count++; + if (unlikely(need_resched())) { + ctx_flush_and_put(*ctx, locked); + *ctx = NULL; + cond_resched(); + } } return count; From 6bb30855560e6343e7b88595d7c3159d0f848a04 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 27 Jan 2023 05:52:24 -0800 Subject: [PATCH 45/51] io_uring: if a linked request has REQ_F_FORCE_ASYNC then run it async REQ_F_FORCE_ASYNC was being ignored when re-queueing linked requests. Instead, obey that flag. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20230127135227.3646353-2-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index acf6d9680d76..0e42160a3051 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1417,10 +1417,12 @@ void io_req_task_submit(struct io_kiocb *req, bool *locked) { io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ - if (likely(!(req->task->flags & PF_EXITING))) - io_queue_sqe(req); - else + if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); + else if (req->flags & REQ_F_FORCE_ASYNC) + io_queue_iowq(req, locked); + else + io_queue_sqe(req); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) From aebb224fd4fc7352cd839ad90414c548387142fd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 27 Jan 2023 05:52:25 -0800 Subject: [PATCH 46/51] io_uring: for requests that require async, force it Some requests require being run async as they do not support non-blocking issue. Instead of trying to issue these requests, getting -EAGAIN, and then queueing them for async issue, just force async upfront. Add a WARN_ON_ONCE to make sure surprising code paths do not come up; even then, the bug would end up being a blocking io_uring_enter(2), which should not be critical.
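The pattern reduces to a two-phase dispatcher: prep marks operations that can never complete non-blocking, and the issue path routes marked requests straight to the blocking/worker context instead of attempting them, failing with -EAGAIN, and retrying. A userspace sketch with hypothetical names (not the kernel code):

    #include <assert.h>
    #include <stdio.h>

    #define REQ_F_FORCE_ASYNC       (1u << 0)
    #define ISSUE_F_NONBLOCK        (1u << 1)

    struct req {
            unsigned int flags;
    };

    /* prep: this op type always blocks, so force the async path upfront */
    static void prep_blocking_op(struct req *r)
    {
            r->flags |= REQ_F_FORCE_ASYNC;
    }

    /* issue handler: may now assert it is never called non-blocking */
    static int issue_blocking_op(struct req *r, unsigned int issue_flags)
    {
            assert(!(issue_flags & ISSUE_F_NONBLOCK)); /* WARN_ON_ONCE analogue */
            return 0;       /* do the blocking work */
    }

    static void dispatch(struct req *r)
    {
            if (r->flags & REQ_F_FORCE_ASYNC) {
                    /* hand straight to a worker context, no -EAGAIN round trip */
                    issue_blocking_op(r, 0);
                    puts("queued to worker, issued blocking");
            } else {
                    issue_blocking_op(r, ISSUE_F_NONBLOCK);
            }
    }

    int main(void)
    {
            struct req r = { 0 };

            prep_blocking_op(&r);
            dispatch(&r);
            return 0;
    }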
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20230127135227.3646353-3-dylany@meta.com Signed-off-by: Jens Axboe --- io_uring/advise.c | 4 ++-- io_uring/fs.c | 20 ++++++++++---------- io_uring/net.c | 4 ++-- io_uring/splice.c | 7 +++---- io_uring/statx.c | 4 ++-- io_uring/sync.c | 14 ++++++++------ io_uring/xattr.c | 14 ++++++-------- 7 files changed, 33 insertions(+), 34 deletions(-) diff --git a/io_uring/advise.c b/io_uring/advise.c index 449c6f14649f..cf600579bffe 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -39,6 +39,7 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ma->addr = READ_ONCE(sqe->addr); ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); + req->flags |= REQ_F_FORCE_ASYNC; return 0; #else return -EOPNOTSUPP; @@ -51,8 +52,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); diff --git a/io_uring/fs.c b/io_uring/fs.c index 7100c293c13a..f6a69a549fd4 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -74,6 +74,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -82,8 +83,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags) struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, ren->newpath, ren->flags); @@ -123,6 +123,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return PTR_ERR(un->filename); req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -131,8 +132,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (un->flags & AT_REMOVEDIR) ret = do_rmdir(un->dfd, un->filename); @@ -170,6 +170,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return PTR_ERR(mkd->filename); req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -178,8 +179,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); @@ -220,6 +220,7 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -228,8 +229,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); @@ -265,6 +265,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; 
 	return 0;
 }
 
@@ -273,8 +274,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
 	int ret;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 	ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
 				lnk->newpath, lnk->flags);
diff --git a/io_uring/net.c b/io_uring/net.c
index 90326b279965..aeb1d016e2e9 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -90,6 +90,7 @@ int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 
 	shutdown->how = READ_ONCE(sqe->len);
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
 
@@ -99,8 +100,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
 	struct socket *sock;
 	int ret;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 53e4232d0866..2a4bbb719531 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -34,6 +34,7 @@ static int __io_splice_prep(struct io_kiocb *req,
 	if (unlikely(sp->flags & ~valid_flags))
 		return -EINVAL;
 	sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
 
@@ -52,8 +53,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *in;
 	long ret = 0;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 	if (sp->flags & SPLICE_F_FD_IN_FIXED)
 		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
@@ -94,8 +94,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *in;
 	long ret = 0;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 	if (sp->flags & SPLICE_F_FD_IN_FIXED)
 		in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
diff --git a/io_uring/statx.c b/io_uring/statx.c
index d8fc933d3f59..abb874209caa 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -48,6 +48,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 
 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
 
@@ -56,8 +57,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
 	int ret;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 	ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
 	io_req_set_res(req, ret, 0);
diff --git a/io_uring/sync.c b/io_uring/sync.c
index 64e87ea2b8fb..255f68c37e55 100644
--- a/io_uring/sync.c
+++ b/io_uring/sync.c
@@ -32,6 +32,8 @@ int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sync->off = READ_ONCE(sqe->off);
 	sync->len = READ_ONCE(sqe->len);
 	sync->flags = READ_ONCE(sqe->sync_range_flags);
+	req->flags |= REQ_F_FORCE_ASYNC;
+
 	return 0;
 }
 
@@ -41,8 +43,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 
 	/* sync_file_range always requires a blocking context */
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 	ret = sync_file_range(req->file, sync->off, sync->len, sync->flags);
 	io_req_set_res(req, ret, 0);
@@ -62,6 +63,7 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	sync->off = READ_ONCE(sqe->off);
 	sync->len = READ_ONCE(sqe->len);
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
 
@@ -72,8 +74,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 
 	/* fsync always requires a blocking context */
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 	ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX,
 				sync->flags & IORING_FSYNC_DATASYNC);
@@ -91,6 +92,7 @@ int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sync->off = READ_ONCE(sqe->off);
 	sync->len = READ_ONCE(sqe->addr);
 	sync->mode = READ_ONCE(sqe->len);
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
 
@@ -100,8 +102,8 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 
 	/* fallocate always requiring blocking context */
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
+
 	ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len);
 	if (ret >= 0)
 		fsnotify_modify(req->file);
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index 6201a9f442c6..e1c810e0b85a 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -75,6 +75,7 @@ static int __io_getxattr_prep(struct io_kiocb *req,
 	}
 
 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
 
@@ -109,8 +110,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
 	int ret;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 	ret = do_getxattr(mnt_idmap(req->file->f_path.mnt),
 			req->file->f_path.dentry,
@@ -127,8 +127,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
 	struct path path;
 	int ret;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 retry:
 	ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
@@ -174,6 +173,7 @@ static int __io_setxattr_prep(struct io_kiocb *req,
 	}
 
 	req->flags |= REQ_F_NEED_CLEANUP;
+	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
 
@@ -222,8 +222,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
 {
 	int ret;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 	ret = __io_setxattr(req, issue_flags, &req->file->f_path);
 	io_xattr_finish(req, ret);
@@ -237,8 +236,7 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
 	struct path path;
 	int ret;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		return -EAGAIN;
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
 
 retry:
 	ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);

From c31cc60fddd11134031e7f9eb76812353cfaac84 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken
Date: Fri, 27 Jan 2023 05:52:26 -0800
Subject: [PATCH 47/51] io_uring: always go async for unsupported fadvise
 flags

No point in issuing -> return -EAGAIN -> go async when it can be done
upfront.
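[ed: context for the split in the helper below: POSIX_FADV_NORMAL,
POSIX_FADV_RANDOM and POSIX_FADV_SEQUENTIAL only adjust the file's
readahead state (file->f_ra) and never block, while other advice values
(e.g. POSIX_FADV_WILLNEED and POSIX_FADV_DONTNEED) may start readahead
or invalidate pages. That rationale comes from generic_fadvise() in
mm/fadvise.c and is not spelled out in the patch itself; the helper is
the patch's own, shown here with annotations added:

	static bool io_fadvise_force_async(struct io_fadvise *fa)
	{
		switch (fa->advice) {
		case POSIX_FADV_NORMAL:
		case POSIX_FADV_RANDOM:
		case POSIX_FADV_SEQUENTIAL:
			/* pure file->f_ra bookkeeping, safe to run inline */
			return false;
		default:
			/* may block on the page cache, go async */
			return true;
		}
	}
]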
Signed-off-by: Dylan Yudaken
Link: https://lore.kernel.org/r/20230127135227.3646353-4-dylany@meta.com
Signed-off-by: Jens Axboe
---
 io_uring/advise.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/io_uring/advise.c b/io_uring/advise.c
index cf600579bffe..7085804c513c 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -62,6 +62,18 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 #endif
 }
 
+static bool io_fadvise_force_async(struct io_fadvise *fa)
+{
+	switch (fa->advice) {
+	case POSIX_FADV_NORMAL:
+	case POSIX_FADV_RANDOM:
+	case POSIX_FADV_SEQUENTIAL:
+		return false;
+	default:
+		return true;
+	}
+}
+
 int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
@@ -72,6 +84,8 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	fa->offset = READ_ONCE(sqe->off);
 	fa->len = READ_ONCE(sqe->len);
 	fa->advice = READ_ONCE(sqe->fadvise_advice);
+	if (io_fadvise_force_async(fa))
+		req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
 
@@ -80,16 +94,7 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
 	int ret;
 
-	if (issue_flags & IO_URING_F_NONBLOCK) {
-		switch (fa->advice) {
-		case POSIX_FADV_NORMAL:
-		case POSIX_FADV_RANDOM:
-		case POSIX_FADV_SEQUENTIAL:
-			break;
-		default:
-			return -EAGAIN;
-		}
-	}
+	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa));
 
 	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
 	if (ret < 0)

From 0ffae640ad83de46865c6b8dc3fda370823e4f1d Mon Sep 17 00:00:00 2001
From: Dylan Yudaken
Date: Fri, 27 Jan 2023 05:52:27 -0800
Subject: [PATCH 48/51] io_uring: always go async for unsupported open flags

No point in issuing -> return -EAGAIN -> go async when it can be done
upfront.
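[ed: per the comment this patch moves into the new helper, an open with
O_TRUNC, O_CREAT or O_TMPFILE can never complete via the LOOKUP_CACHED
non-blocking path, so under IO_URING_F_NONBLOCK it always came back
with -EAGAIN; hoisting the test lets prep set REQ_F_FORCE_ASYNC and
lets issue merely assert. The helper below is the patch's own, with an
explanatory comment added by the editor:

	/* creation or truncation work can't be done from the cached-lookup
	 * fast path, so a non-blocking attempt with these flags is pointless
	 */
	static bool io_openat_force_async(struct io_open *open)
	{
		return open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE);
	}
]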
Signed-off-by: Dylan Yudaken
Link: https://lore.kernel.org/r/20230127135227.3646353-5-dylany@meta.com
Signed-off-by: Jens Axboe
---
 io_uring/openclose.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index 67178e4bb282..a1b98c81a52d 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -31,6 +31,15 @@ struct io_close {
 	u32				file_slot;
 };
 
+static bool io_openat_force_async(struct io_open *open)
+{
+	/*
+	 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
+	 * it'll always -EAGAIN
+	 */
+	return open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE);
+}
+
 static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
@@ -61,6 +70,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	open->nofile = rlimit(RLIMIT_NOFILE);
 
 	req->flags |= REQ_F_NEED_CLEANUP;
+	if (io_openat_force_async(open))
+		req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
 }
 
@@ -108,12 +119,7 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
 	nonblock_set = op.open_flag & O_NONBLOCK;
 	resolve_nonblock = open->how.resolve & RESOLVE_CACHED;
 	if (issue_flags & IO_URING_F_NONBLOCK) {
-		/*
-		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
-		 * it'll always -EAGAIN
-		 */
-		if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
-			return -EAGAIN;
+		WARN_ON_ONCE(io_openat_force_async(open));
 		op.lookup_flags |= LOOKUP_CACHED;
 		op.open_flag |= O_NONBLOCK;
 	}

From 2f2bb1ffc9983e227424d0787289da5483b0c74f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 6 Feb 2023 08:20:46 -0700
Subject: [PATCH 49/51] io_uring: mark task TASK_RUNNING before handling
 resume/task work

Just like for task_work, set the task state to TASK_RUNNING before
doing any potential resume work. We're not holding any locks at this
point, but we may have already set the task state to TASK_INTERRUPTIBLE
in preparation for going to sleep waiting for events. Ensure that we
set it back to TASK_RUNNING if we have work to process, to avoid
warnings on calling blocking operations with !TASK_RUNNING.

Fixes: b5d3ae202fbf ("io_uring: handle TIF_NOTIFY_RESUME when checking for task_work")
Reported-by: kernel test robot
Link: https://lore.kernel.org/oe-lkp/202302062208.24d3e563-oliver.sang@intel.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index f90816aac896..2711865f1e19 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -281,8 +281,10 @@ static inline int io_run_task_work(void)
 	 * notify work that needs processing.
 	 */
 	if (current->flags & PF_IO_WORKER &&
-	    test_thread_flag(TIF_NOTIFY_RESUME))
+	    test_thread_flag(TIF_NOTIFY_RESUME)) {
+		__set_current_state(TASK_RUNNING);
 		resume_user_mode_work(NULL);
+	}
 	if (task_work_pending(current)) {
 		__set_current_state(TASK_RUNNING);
 		task_work_run();

From fbe870a72fd1ddc5e08c23764e23e5766f54aa87 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs
Date: Wed, 1 Feb 2023 15:33:33 -0500
Subject: [PATCH 50/51] io_uring,audit: don't log IORING_OP_MADVISE

fadvise and madvise both provide hints for caching or access patterns
for file and memory respectively. Skip audit logging for them.
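[ed: for readers unfamiliar with the opdef flag: audit_skip gates the
audit hooks wrapped around request issue, so setting it stops io_uring
from emitting a per-request audit record for madvise. A sketch of the
pre-existing core logic the flag feeds, paraphrased from io_issue_sqe()
as introduced by commit 5bd2182d58e9 (named in the Fixes tag below),
not part of this patch:

	const struct io_issue_def *def = &io_issue_defs[req->opcode];

	if (!def->audit_skip)
		audit_uring_entry(req->opcode);
	ret = def->issue(req, issue_flags);
	if (!def->audit_skip)
		audit_uring_exit(!ret, ret);
]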
Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring")
Signed-off-by: Richard Guy Briggs
Link: https://lore.kernel.org/r/b5dfdcd541115c86dbc774aa9dd502c964849c5f.1675282642.git.rgb@redhat.com
Acked-by: Paul Moore
Signed-off-by: Jens Axboe
---
 io_uring/opdef.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 5238ecd7af6a..cca7c5b55208 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -257,6 +257,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.issue			= io_fadvise,
 	},
 	[IORING_OP_MADVISE] = {
+		.audit_skip		= 1,
 		.prep			= io_madvise_prep,
 		.issue			= io_madvise,
 	},

From 7d3fd88d61a41016da01889f076fd1c60c7298fc Mon Sep 17 00:00:00 2001
From: Josh Triplett
Date: Tue, 14 Feb 2023 16:42:22 -0800
Subject: [PATCH 51/51] io_uring: Support calling io_uring_register with a
 registered ring fd

Add a new flag IORING_REGISTER_USE_REGISTERED_RING (set via the high
bit of the opcode) to treat the fd as a registered index rather than a
file descriptor.

This makes it possible for a library to open an io_uring, register the
ring fd, close the ring fd, and subsequently use the ring entirely via
registered index.

Signed-off-by: Josh Triplett
Link: https://lore.kernel.org/r/f2396369e638284586b069dbddffb8c992afba95.1676419314.git.josh@joshtriplett.org
[axboe: remove extra high bit clear]
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h |  6 +++++-
 io_uring/io_uring.c           | 33 ++++++++++++++++++++++++++-------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 636a4c2c1294..97661a60b28c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -472,6 +472,7 @@ struct io_uring_params {
 #define IORING_FEAT_RSRC_TAGS		(1U << 10)
 #define IORING_FEAT_CQE_SKIP		(1U << 11)
 #define IORING_FEAT_LINKED_FILE		(1U << 12)
+#define IORING_FEAT_REG_REG_RING	(1U << 13)
 
 /*
  * io_uring_register(2) opcodes and arguments
@@ -519,7 +520,10 @@ enum {
 	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,
 
 	/* this goes last */
-	IORING_REGISTER_LAST
+	IORING_REGISTER_LAST,
+
+	/* flag added to the opcode to use a registered ring fd */
+	IORING_REGISTER_USE_REGISTERED_RING	= 1U << 31
 };
 
 /* io-wq worker categories */
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0e42160a3051..3b915deb4d08 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3785,7 +3785,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
 			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
 			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
-			IORING_FEAT_LINKED_FILE;
+			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
@@ -4306,17 +4306,36 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	struct io_ring_ctx *ctx;
 	long ret = -EBADF;
 	struct fd f;
+	bool use_registered_ring;
+
+	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
+	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
 
 	if (opcode >= IORING_REGISTER_LAST)
 		return -EINVAL;
 
-	f = fdget(fd);
-	if (!f.file)
-		return -EBADF;
+	if (use_registered_ring) {
+		/*
+		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
+		 * need only dereference our task private array to find it.
+		 */
+		struct io_uring_task *tctx = current->io_uring;
 
-	ret = -EOPNOTSUPP;
-	if (!io_is_uring_fops(f.file))
-		goto out_fput;
+		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
+			return -EINVAL;
+		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
+		f.file = tctx->registered_rings[fd];
+		f.flags = 0;
+		if (unlikely(!f.file))
+			return -EBADF;
+	} else {
+		f = fdget(fd);
+		if (unlikely(!f.file))
+			return -EBADF;
+		ret = -EOPNOTSUPP;
+		if (!io_is_uring_fops(f.file))
+			goto out_fput;
+	}
 
 	ctx = f.file->private_data;