From 95a5bbae05ef1ec1cceb8c1b04a482aa0b7c177c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 14 Nov 2019 22:40:44 -0700
Subject: [PATCH 01/42] io_uring: io_async_cancel() should pass in 'nxt'
 request pointer

If we have a linked request, this enables us to pass it back directly
without having to go through async context.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4c030a92de79..011281856ff7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2480,7 +2480,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	    sqe->cancel_flags)
 		return -EINVAL;
 
-	io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL);
+	io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt);
 	return 0;
 }
 

From 0e0702dac26b282603261f04a62711a2d9aac17b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 14 Nov 2019 21:42:10 -0700
Subject: [PATCH 02/42] io_uring: cleanup return values from the queueing
 functions

__io_queue_sqe(), io_queue_sqe(), io_queue_link_head() all return 0/err,
but the caller doesn't care since the errors are handled inline. Clean
these up and just make them void.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 011281856ff7..d877c7f6368e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2840,7 +2840,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
 	return nxt;
 }
 
-static int __io_queue_sqe(struct io_kiocb *req)
+static void __io_queue_sqe(struct io_kiocb *req)
 {
 	enum hrtimer_mode mode;
 	struct io_kiocb *nxt;
@@ -2885,7 +2885,7 @@ static int __io_queue_sqe(struct io_kiocb *req)
 			if (nxt)
 				io_queue_linked_timeout(nxt, &ts, &mode);
 
-			return 0;
+			return;
 		}
 	}
 
@@ -2907,11 +2907,9 @@ err:
 			req->flags |= REQ_F_FAIL_LINK;
 		io_put_req(req);
 	}
-
-	return ret;
 }
 
-static int io_queue_sqe(struct io_kiocb *req)
+static void io_queue_sqe(struct io_kiocb *req)
 {
 	int ret;
 
@@ -2921,20 +2919,20 @@ static int io_queue_sqe(struct io_kiocb *req)
 			io_cqring_add_event(req, ret);
 			io_double_put_req(req);
 		}
-		return 0;
-	}
-
-	return __io_queue_sqe(req);
+	} else
+		__io_queue_sqe(req);
 }
 
-static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
+static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 {
 	int ret;
 	int need_submit = false;
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!shadow)
-		return io_queue_sqe(req);
+	if (!shadow) {
+		io_queue_sqe(req);
+		return;
+	}
 
 	/*
 	 * Mark the first IO in link list as DRAIN, let all the following
@@ -2948,7 +2946,7 @@ static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 			io_cqring_add_event(req, ret);
 			io_double_put_req(req);
 			__io_free_req(shadow);
-			return 0;
+			return;
 		}
 	} else {
 		/*
@@ -2965,9 +2963,7 @@ static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 	spin_unlock_irq(&ctx->completion_lock);
 
 	if (need_submit)
-		return __io_queue_sqe(req);
-
-	return 0;
+		__io_queue_sqe(req);
 }
 
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)

From 978db57e2c329fc612ff669cab9bf0007efd3ca3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 14 Nov 2019 22:39:04 -0700
Subject: [PATCH 03/42] io_uring: make io_double_put_req() use normal
 completion path

If we don't use the normal completion path, we may skip killing links
that should be errored and freed. Add __io_double_put_req() for use
within the completion path itself, other calls should just use
io_double_put_req().

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d877c7f6368e..1da103df2bfa 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -383,6 +383,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void __io_free_req(struct io_kiocb *req);
 static void io_put_req(struct io_kiocb *req);
 static void io_double_put_req(struct io_kiocb *req);
+static void __io_double_put_req(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
 
@@ -916,7 +917,7 @@ static void io_fail_links(struct io_kiocb *req)
 			io_link_cancel_timeout(link);
 		} else {
 			io_cqring_fill_event(link, -ECANCELED);
-			io_double_put_req(link);
+			__io_double_put_req(link);
 		}
 	}
 
@@ -990,13 +991,24 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static void io_double_put_req(struct io_kiocb *req)
+/*
+ * Must only be used if we don't need to care about links, usually from
+ * within the completion handling itself.
+ */
+static void __io_double_put_req(struct io_kiocb *req)
 {
 	/* drop both submit and complete references */
 	if (refcount_sub_and_test(2, &req->refs))
 		__io_free_req(req);
 }
 
+static void io_double_put_req(struct io_kiocb *req)
+{
+	/* drop both submit and complete references */
+	if (refcount_sub_and_test(2, &req->refs))
+		io_free_req(req);
+}
+
 static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
 {
 	struct io_rings *rings = ctx->rings;

From ad8a48acc23cb13cbf4332ebabb867b1baa81842 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Nov 2019 08:49:11 -0700
Subject: [PATCH 04/42] io_uring: make req->timeout be dynamically allocated

There are a few reasons for this:

- As a prep to improving the linked timeout logic
- io_timeout is the biggest member in the io_kiocb opcode union

This also enables a few cleanups, like unifying the timer setup between
IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT, and not needing multiple
arguments to the link/prep helpers.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 129 +++++++++++++++++++++++++++-----------------------
 1 file changed, 70 insertions(+), 59 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1da103df2bfa..8cd3505d167b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -301,9 +301,16 @@ struct io_poll_iocb {
 	struct wait_queue_entry		wait;
 };
 
+struct io_timeout_data {
+	struct io_kiocb			*req;
+	struct hrtimer			timer;
+	struct timespec64		ts;
+	enum hrtimer_mode		mode;
+};
+
 struct io_timeout {
 	struct file			*file;
-	struct hrtimer			timer;
+	struct io_timeout_data		*data;
 };
 
 /*
@@ -573,7 +580,7 @@ static void io_kill_timeout(struct io_kiocb *req)
 {
 	int ret;
 
-	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
 	if (ret != -1) {
 		atomic_inc(&req->ctx->cq_timeouts);
 		list_del_init(&req->list);
@@ -828,6 +835,8 @@ static void __io_free_req(struct io_kiocb *req)
 			wake_up(&ctx->inflight_wait);
 		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 	}
+	if (req->flags & REQ_F_TIMEOUT)
+		kfree(req->timeout.data);
 	percpu_ref_put(&ctx->refs);
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
@@ -840,7 +849,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
-	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
 	if (ret != -1) {
 		io_cqring_fill_event(req, -ECANCELED);
 		io_commit_cqring(ctx);
@@ -2236,12 +2245,12 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 {
-	struct io_ring_ctx *ctx;
-	struct io_kiocb *req;
+	struct io_timeout_data *data = container_of(timer,
+						struct io_timeout_data, timer);
+	struct io_kiocb *req = data->req;
+	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
 
-	req = container_of(timer, struct io_kiocb, timeout.timer);
-	ctx = req->ctx;
 	atomic_inc(&ctx->cq_timeouts);
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -2291,7 +2300,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 	if (ret == -ENOENT)
 		return ret;
 
-	ret = hrtimer_try_to_cancel(&req->timeout.timer);
+	ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
 	if (ret == -1)
 		return -EALREADY;
 
@@ -2331,34 +2340,54 @@ static int io_timeout_remove(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_timeout_setup(struct io_kiocb *req)
 {
-	unsigned count;
-	struct io_ring_ctx *ctx = req->ctx;
-	struct list_head *entry;
-	enum hrtimer_mode mode;
-	struct timespec64 ts;
-	unsigned span = 0;
+	const struct io_uring_sqe *sqe = req->submit.sqe;
+	struct io_timeout_data *data;
 	unsigned flags;
 
-	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
-	if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1)
+	if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
 		return -EINVAL;
 	flags = READ_ONCE(sqe->timeout_flags);
 	if (flags & ~IORING_TIMEOUT_ABS)
 		return -EINVAL;
 
-	if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr)))
+	data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	data->req = req;
+	req->timeout.data = data;
+	req->flags |= REQ_F_TIMEOUT;
+
+	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
 		return -EFAULT;
 
 	if (flags & IORING_TIMEOUT_ABS)
-		mode = HRTIMER_MODE_ABS;
+		data->mode = HRTIMER_MODE_ABS;
 	else
-		mode = HRTIMER_MODE_REL;
+		data->mode = HRTIMER_MODE_REL;
 
-	hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode);
-	req->flags |= REQ_F_TIMEOUT;
+	hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+	return 0;
+}
+
+static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	unsigned count;
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_timeout_data *data;
+	struct list_head *entry;
+	unsigned span = 0;
+	int ret;
+
+	ret = io_timeout_setup(req);
+	/* common setup allows flags (like links) set, we don't */
+	if (!ret && sqe->flags)
+		ret = -EINVAL;
+	if (ret)
+		return ret;
 
 	/*
 	 * sqe->off holds how many events that need to occur for this
@@ -2418,8 +2447,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	req->sequence -= span;
 add:
 	list_add(&req->list, entry);
-	req->timeout.timer.function = io_timeout_fn;
-	hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode);
+	data = req->timeout.data;
+	data->timer.function = io_timeout_fn;
+	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
 	spin_unlock_irq(&ctx->completion_lock);
 	return 0;
 }
@@ -2754,8 +2784,9 @@ static int io_grab_files(struct io_kiocb *req)
 
 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 {
-	struct io_kiocb *req = container_of(timer, struct io_kiocb,
-						timeout.timer);
+	struct io_timeout_data *data = container_of(timer,
+						struct io_timeout_data, timer);
+	struct io_kiocb *req = data->req;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *prev = NULL;
 	unsigned long flags;
@@ -2786,9 +2817,9 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
-				    enum hrtimer_mode *mode)
+static void io_queue_linked_timeout(struct io_kiocb *req)
 {
+	struct io_timeout_data *data = req->timeout.data;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	/*
@@ -2797,9 +2828,9 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
 	 */
 	spin_lock_irq(&ctx->completion_lock);
 	if (!list_empty(&req->list)) {
-		req->timeout.timer.function = io_link_timeout_fn;
-		hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts),
-				*mode);
+		data->timer.function = io_link_timeout_fn;
+		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
+				data->mode);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
@@ -2807,22 +2838,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts,
 	io_put_req(req);
 }
 
-static int io_validate_link_timeout(const struct io_uring_sqe *sqe,
-				    struct timespec64 *ts)
-{
-	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off)
-		return -EINVAL;
-	if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS)
-		return -EINVAL;
-	if (get_timespec64(ts, u64_to_user_ptr(sqe->addr)))
-		return -EFAULT;
-
-	return 0;
-}
-
-static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
-					       struct timespec64 *ts,
-					       enum hrtimer_mode *mode)
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
 	int ret;
@@ -2834,7 +2850,10 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
 	if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT)
 		return NULL;
 
-	ret = io_validate_link_timeout(nxt->submit.sqe, ts);
+	ret = io_timeout_setup(nxt);
+	/* common setup allows offset being set, we don't */
+	if (!ret && nxt->submit.sqe->off)
+		ret = -EINVAL;
 	if (ret) {
 		list_del_init(&nxt->list);
 		io_cqring_add_event(nxt, ret);
@@ -2842,24 +2861,16 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req,
 		return ERR_PTR(-ECANCELED);
 	}
 
-	if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS)
-		*mode = HRTIMER_MODE_ABS;
-	else
-		*mode = HRTIMER_MODE_REL;
-
 	req->flags |= REQ_F_LINK_TIMEOUT;
-	hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode);
 	return nxt;
 }
 
 static void __io_queue_sqe(struct io_kiocb *req)
 {
-	enum hrtimer_mode mode;
 	struct io_kiocb *nxt;
-	struct timespec64 ts;
 	int ret;
 
-	nxt = io_prep_linked_timeout(req, &ts, &mode);
+	nxt = io_prep_linked_timeout(req);
 	if (IS_ERR(nxt)) {
 		ret = PTR_ERR(nxt);
 		nxt = NULL;
@@ -2895,7 +2906,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
 			io_queue_async_work(req);
 
 			if (nxt)
-				io_queue_linked_timeout(nxt, &ts, &mode);
+				io_queue_linked_timeout(nxt);
 
 			return;
 		}
@@ -2907,7 +2918,7 @@ err:
 
 	if (nxt) {
 		if (!ret)
-			io_queue_linked_timeout(nxt, &ts, &mode);
+			io_queue_linked_timeout(nxt);
 		else
 			io_put_req(nxt);
 	}

From 94ae5e77a9150a8c6c57432e2db290c6868ddfad Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 14 Nov 2019 19:39:52 -0700
Subject: [PATCH 05/42] io_uring: fix sequencing issues with linked timeouts

We have an issue with timeout links that are deeper in the submit chain,
because we only handle it upfront, not from later submissions. Move the
prep + issue of the timeout link to the async work prep handler, and do
it normally for non-async queue. If we validate and prepare the timeout
links upfront when we first see them, there's nothing stopping us from
supporting any sort of nesting.

Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts")
Reported-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 102 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 61 insertions(+), 41 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8cd3505d167b..759a30e52fda 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -353,6 +353,7 @@ struct io_kiocb {
 #define REQ_F_TIMEOUT_NOSEQ	8192	/* no timeout sequence */
 #define REQ_F_INFLIGHT		16384	/* on inflight list */
 #define REQ_F_COMP_LOCKED	32768	/* completion under lock */
+#define REQ_F_FREE_SQE		65536	/* free sqe if not async queued */
 	u64			user_data;
 	u32			result;
 	u32			sequence;
@@ -391,6 +392,8 @@ static void __io_free_req(struct io_kiocb *req);
 static void io_put_req(struct io_kiocb *req);
 static void io_double_put_req(struct io_kiocb *req);
 static void __io_double_put_req(struct io_kiocb *req);
+static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
+static void io_queue_linked_timeout(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
 
@@ -529,7 +532,8 @@ static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe)
 		 opcode == IORING_OP_WRITE_FIXED);
 }
 
-static inline bool io_prep_async_work(struct io_kiocb *req)
+static inline bool io_prep_async_work(struct io_kiocb *req,
+				      struct io_kiocb **link)
 {
 	bool do_hashed = false;
 
@@ -558,13 +562,17 @@ static inline bool io_prep_async_work(struct io_kiocb *req)
 			req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 	}
 
+	*link = io_prep_linked_timeout(req);
 	return do_hashed;
 }
 
 static inline void io_queue_async_work(struct io_kiocb *req)
 {
-	bool do_hashed = io_prep_async_work(req);
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *link;
+	bool do_hashed;
+
+	do_hashed = io_prep_async_work(req, &link);
 
 	trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work,
 					req->flags);
@@ -574,6 +582,9 @@ static inline void io_queue_async_work(struct io_kiocb *req)
 		io_wq_enqueue_hashed(ctx->io_wq, &req->work,
 					file_inode(req->file));
 	}
+
+	if (link)
+		io_queue_linked_timeout(link);
 }
 
 static void io_kill_timeout(struct io_kiocb *req)
@@ -875,6 +886,15 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
 	while (nxt) {
 		list_del_init(&nxt->list);
+
+		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
+		    (nxt->flags & REQ_F_TIMEOUT)) {
+			wake_ev |= io_link_cancel_timeout(nxt);
+			nxt = list_first_entry_or_null(&req->link_list,
+							struct io_kiocb, list);
+			req->flags &= ~REQ_F_LINK_TIMEOUT;
+			continue;
+		}
 		if (!list_empty(&req->link_list)) {
 			INIT_LIST_HEAD(&nxt->link_list);
 			list_splice(&req->link_list, &nxt->link_list);
@@ -885,19 +905,13 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 		 * If we're in async work, we can continue processing the chain
 		 * in this context instead of having to queue up new async work.
 		 */
-		if (req->flags & REQ_F_LINK_TIMEOUT) {
-			wake_ev = io_link_cancel_timeout(nxt);
-
-			/* we dropped this link, get next */
-			nxt = list_first_entry_or_null(&req->link_list,
-							struct io_kiocb, list);
-		} else if (nxtptr && io_wq_current_is_worker()) {
-			*nxtptr = nxt;
-			break;
-		} else {
-			io_queue_async_work(nxt);
-			break;
+		if (nxt) {
+			if (nxtptr && io_wq_current_is_worker())
+				*nxtptr = nxt;
+			else
+				io_queue_async_work(nxt);
 		}
+		break;
 	}
 
 	if (wake_ev)
@@ -916,11 +930,16 @@ static void io_fail_links(struct io_kiocb *req)
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 
 	while (!list_empty(&req->link_list)) {
+		const struct io_uring_sqe *sqe_to_free = NULL;
+
 		link = list_first_entry(&req->link_list, struct io_kiocb, list);
 		list_del_init(&link->list);
 
 		trace_io_uring_fail_link(req, link);
 
+		if (link->flags & REQ_F_FREE_SQE)
+			sqe_to_free = link->submit.sqe;
+
 		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
 		    link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_link_cancel_timeout(link);
@@ -928,6 +947,7 @@ static void io_fail_links(struct io_kiocb *req)
 			io_cqring_fill_event(link, -ECANCELED);
 			__io_double_put_req(link);
 		}
+		kfree(sqe_to_free);
 	}
 
 	io_commit_cqring(ctx);
@@ -2683,8 +2703,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 
 	/* if a dependent link is ready, pass it back */
 	if (!ret && nxt) {
-		io_prep_async_work(nxt);
+		struct io_kiocb *link;
+
+		io_prep_async_work(nxt, &link);
 		*workptr = &nxt->work;
+		if (link)
+			io_queue_linked_timeout(link);
 	}
 }
 
@@ -2819,7 +2843,6 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 
 static void io_queue_linked_timeout(struct io_kiocb *req)
 {
-	struct io_timeout_data *data = req->timeout.data;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	/*
@@ -2828,6 +2851,8 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
 	 */
 	spin_lock_irq(&ctx->completion_lock);
 	if (!list_empty(&req->list)) {
+		struct io_timeout_data *data = req->timeout.data;
+
 		data->timer.function = io_link_timeout_fn;
 		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
 				data->mode);
@@ -2841,7 +2866,6 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
-	int ret;
 
 	if (!(req->flags & REQ_F_LINK))
 		return NULL;
@@ -2850,33 +2874,15 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 	if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT)
 		return NULL;
 
-	ret = io_timeout_setup(nxt);
-	/* common setup allows offset being set, we don't */
-	if (!ret && nxt->submit.sqe->off)
-		ret = -EINVAL;
-	if (ret) {
-		list_del_init(&nxt->list);
-		io_cqring_add_event(nxt, ret);
-		io_double_put_req(nxt);
-		return ERR_PTR(-ECANCELED);
-	}
-
 	req->flags |= REQ_F_LINK_TIMEOUT;
 	return nxt;
 }
 
 static void __io_queue_sqe(struct io_kiocb *req)
 {
-	struct io_kiocb *nxt;
+	struct io_kiocb *nxt = io_prep_linked_timeout(req);
 	int ret;
 
-	nxt = io_prep_linked_timeout(req);
-	if (IS_ERR(nxt)) {
-		ret = PTR_ERR(nxt);
-		nxt = NULL;
-		goto err;
-	}
-
 	ret = __io_submit_sqe(req, NULL, true);
 
 	/*
@@ -2904,10 +2910,6 @@ static void __io_queue_sqe(struct io_kiocb *req)
 			 * submit reference when the iocb is actually submitted.
 			 */
 			io_queue_async_work(req);
-
-			if (nxt)
-				io_queue_linked_timeout(nxt);
-
 			return;
 		}
 	}
@@ -2952,6 +2954,10 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 	int need_submit = false;
 	struct io_ring_ctx *ctx = req->ctx;
 
+	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
+		ret = -ECANCELED;
+		goto err;
+	}
 	if (!shadow) {
 		io_queue_sqe(req);
 		return;
@@ -2966,9 +2972,11 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 	ret = io_req_defer(req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
+err:
 			io_cqring_add_event(req, ret);
 			io_double_put_req(req);
-			__io_free_req(shadow);
+			if (shadow)
+				__io_free_req(shadow);
 			return;
 		}
 	} else {
@@ -3025,6 +3033,17 @@ err_req:
 	if (*link) {
 		struct io_kiocb *prev = *link;
 
+		if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
+			ret = io_timeout_setup(req);
+			/* common setup allows offset being set, we don't */
+			if (!ret && s->sqe->off)
+				ret = -EINVAL;
+			if (ret) {
+				prev->flags |= REQ_F_FAIL_LINK;
+				goto err_req;
+			}
+		}
+
 		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
 		if (!sqe_copy) {
 			ret = -EAGAIN;
@@ -3032,6 +3051,7 @@ err_req:
 		}
 
 		s->sqe = sqe_copy;
+		req->flags |= REQ_F_FREE_SQE;
 		trace_io_uring_link(ctx, req, prev);
 		list_add_tail(&req->list, &prev->link_list);
 	} else if (s->sqe->flags & IOSQE_IO_LINK) {

From e0e328c4b330712e45ba799dc589bda751323110 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 15 Nov 2019 17:59:46 -0700
Subject: [PATCH 06/42] io_uring: remove dead REQ_F_SEQ_PREV flag

With the conversion to io-wq, we no longer use that flag. Kill it.

Fixes: 561fb04a6a22 ("io_uring: replace workqueue usage with io-wq")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 759a30e52fda..942c04d5e63d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -340,7 +340,6 @@ struct io_kiocb {
 #define REQ_F_NOWAIT		1	/* must not punt to workers */
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
-#define REQ_F_SEQ_PREV		8	/* sequential with previous */
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */

From b0dd8a412699afe3420a08f841333f3474ad45c5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 18 Nov 2019 12:14:54 -0700
Subject: [PATCH 07/42] io_uring: correct poll cancel and linked timeout
 expiration completion

Currently a poll request fills a completion entry of 0, even if it got
cancelled. This is odd, and it makes it harder to support with chains.
Ensure that it returns -ECANCELED in the completions events if it got
cancelled, and furthermore ensure that the linked timeout that triggered
it completes with -ETIME if we did indeed trigger the completions
through a timeout.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 942c04d5e63d..40c351a9ed26 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2066,12 +2066,15 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static void io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
 	req->poll.done = true;
-	io_cqring_fill_event(req, mangle_poll(mask));
+	if (error)
+		io_cqring_fill_event(req, error);
+	else
+		io_cqring_fill_event(req, mangle_poll(mask));
 	io_commit_cqring(ctx);
 }
 
@@ -2084,11 +2087,16 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *nxt = NULL;
 	__poll_t mask = 0;
+	int ret = 0;
 
-	if (work->flags & IO_WQ_WORK_CANCEL)
+	if (work->flags & IO_WQ_WORK_CANCEL) {
 		WRITE_ONCE(poll->canceled, true);
+		ret = -ECANCELED;
+	} else if (READ_ONCE(poll->canceled)) {
+		ret = -ECANCELED;
+	}
 
-	if (!READ_ONCE(poll->canceled))
+	if (ret != -ECANCELED)
 		mask = vfs_poll(poll->file, &pt) & poll->events;
 
 	/*
@@ -2099,13 +2107,13 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 	 * avoid further branches in the fast path.
 	 */
 	spin_lock_irq(&ctx->completion_lock);
-	if (!mask && !READ_ONCE(poll->canceled)) {
+	if (!mask && ret != -ECANCELED) {
 		add_wait_queue(poll->head, &poll->wait);
 		spin_unlock_irq(&ctx->completion_lock);
 		return;
 	}
 	io_poll_remove_req(req);
-	io_poll_complete(req, mask);
+	io_poll_complete(req, mask, ret);
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_ev_posted(ctx);
@@ -2139,7 +2147,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	 */
 	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
 		io_poll_remove_req(req);
-		io_poll_complete(req, mask);
+		io_poll_complete(req, mask, 0);
 		req->flags |= REQ_F_COMP_LOCKED;
 		io_put_req(req);
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -2251,7 +2259,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 	if (mask) { /* no async, we'd stolen it */
 		ipt.error = 0;
-		io_poll_complete(req, mask);
+		io_poll_complete(req, mask, 0);
 	}
 	spin_unlock_irq(&ctx->completion_lock);
 
@@ -2503,7 +2511,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
 
 static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
 				     struct io_kiocb *req, __u64 sqe_addr,
-				     struct io_kiocb **nxt)
+				     struct io_kiocb **nxt, int success_ret)
 {
 	unsigned long flags;
 	int ret;
@@ -2520,6 +2528,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
 		goto done;
 	ret = io_poll_cancel(ctx, sqe_addr);
 done:
+	if (!ret)
+		ret = success_ret;
 	io_cqring_fill_event(req, ret);
 	io_commit_cqring(ctx);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
@@ -2541,7 +2551,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	    sqe->cancel_flags)
 		return -EINVAL;
 
-	io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt);
+	io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0);
 	return 0;
 }
 
@@ -2831,7 +2841,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	if (prev) {
-		io_async_find_and_cancel(ctx, req, prev->user_data, NULL);
+		io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
+						-ETIME);
 		io_put_req(prev);
 	} else {
 		io_cqring_add_event(req, -ETIME);

From fba38c272a0385148935d6443cb9dc68cf1f37a7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 18 Nov 2019 12:27:57 -0700
Subject: [PATCH 08/42] io_uring: request cancellations should break links

We currently don't explicitly break links if a request is cancelled, but
we should. Add explicitly link breakage for all types of request
cancellations that we support.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 40c351a9ed26..6550b7eab7d2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2118,6 +2118,8 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 
 	io_cqring_ev_posted(ctx);
 
+	if (ret < 0 && req->flags & REQ_F_LINK)
+		req->flags |= REQ_F_FAIL_LINK;
 	io_put_req_find_next(req, &nxt);
 	if (nxt)
 		*workptr = &nxt->work;
@@ -2331,6 +2333,8 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 	if (ret == -1)
 		return -EALREADY;
 
+	if (req->flags & REQ_F_LINK)
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_fill_event(req, -ECANCELED);
 	io_put_req(req);
 	return 0;
@@ -2841,6 +2845,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	if (prev) {
+		if (prev->flags & REQ_F_LINK)
+			prev->flags |= REQ_F_FAIL_LINK;
 		io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
 						-ETIME);
 		io_put_req(prev);

From b60fda6000a99a7ccac36005ab78b14b47c06de3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Nov 2019 08:37:07 -0700
Subject: [PATCH 09/42] io-wq: wait for io_wq_create() to setup necessary
 workers

We currently have a race where if setup is really slow, we can be
calling io_wq_destroy() before we're done setting up. This will cause
the caller to get stuck waiting for the manager to set things up, but
the manager already exited.

Fix this by doing a sync setup of the manager. This also fixes the case
where if we failed creating workers, we'd also get stuck.

In practice this race window was really small, as we already wait for
the manager to start. Hence someone would have to call io_wq_destroy()
after the task has started, but before it started the first loop. The
reported test case forked tons of these, which is why it became an
issue.

Reported-by: syzbot+0f1cc17f85154f400465@syzkaller.appspotmail.com
Fixes: 771b53d033e8 ("io-wq: small threadpool implementation for io_uring")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 50 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 9174007ce107..243eb1d4d2bd 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -33,6 +33,7 @@ enum {
 enum {
 	IO_WQ_BIT_EXIT		= 0,	/* wq exiting */
 	IO_WQ_BIT_CANCEL	= 1,	/* cancel work on list */
+	IO_WQ_BIT_ERROR		= 2,	/* error on setup */
 };
 
 enum {
@@ -562,14 +563,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
 	spin_unlock_irq(&wqe->lock);
 }
 
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 {
 	struct io_wqe_acct *acct =&wqe->acct[index];
 	struct io_worker *worker;
 
 	worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node);
 	if (!worker)
-		return;
+		return false;
 
 	refcount_set(&worker->ref, 1);
 	worker->nulls_node.pprev = NULL;
@@ -581,7 +582,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 				"io_wqe_worker-%d/%d", index, wqe->node);
 	if (IS_ERR(worker->task)) {
 		kfree(worker);
-		return;
+		return false;
 	}
 
 	spin_lock_irq(&wqe->lock);
@@ -599,6 +600,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 		atomic_inc(&wq->user->processes);
 
 	wake_up_process(worker->task);
+	return true;
 }
 
 static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
@@ -606,9 +608,6 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
 {
 	struct io_wqe_acct *acct = &wqe->acct[index];
 
-	/* always ensure we have one bounded worker */
-	if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers)
-		return true;
 	/* if we have available workers or no work, no need */
 	if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe))
 		return false;
@@ -621,10 +620,19 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
 static int io_wq_manager(void *data)
 {
 	struct io_wq *wq = data;
+	int i;
+
+	/* create fixed workers */
+	refcount_set(&wq->refs, wq->nr_wqes);
+	for (i = 0; i < wq->nr_wqes; i++) {
+		if (create_io_worker(wq, wq->wqes[i], IO_WQ_ACCT_BOUND))
+			continue;
+		goto err;
+	}
+
+	complete(&wq->done);
 
 	while (!kthread_should_stop()) {
-		int i;
-
 		for (i = 0; i < wq->nr_wqes; i++) {
 			struct io_wqe *wqe = wq->wqes[i];
 			bool fork_worker[2] = { false, false };
@@ -645,6 +653,12 @@ static int io_wq_manager(void *data)
 	}
 
 	return 0;
+err:
+	set_bit(IO_WQ_BIT_ERROR, &wq->state);
+	set_bit(IO_WQ_BIT_EXIT, &wq->state);
+	if (refcount_sub_and_test(wq->nr_wqes - i, &wq->refs))
+		complete(&wq->done);
+	return 0;
 }
 
 static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
@@ -982,7 +996,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
 	wq->user = user;
 
 	i = 0;
-	refcount_set(&wq->refs, wq->nr_wqes);
 	for_each_online_node(node) {
 		struct io_wqe *wqe;
 
@@ -1020,14 +1033,22 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
 	wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
 	if (!IS_ERR(wq->manager)) {
 		wake_up_process(wq->manager);
+		wait_for_completion(&wq->done);
+		if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		reinit_completion(&wq->done);
 		return wq;
 	}
 
 	ret = PTR_ERR(wq->manager);
-	wq->manager = NULL;
-err:
 	complete(&wq->done);
-	io_wq_destroy(wq);
+err:
+	for (i = 0; i < wq->nr_wqes; i++)
+		kfree(wq->wqes[i]);
+	kfree(wq->wqes);
+	kfree(wq);
 	return ERR_PTR(ret);
 }
 
@@ -1041,10 +1062,9 @@ void io_wq_destroy(struct io_wq *wq)
 {
 	int i;
 
-	if (wq->manager) {
-		set_bit(IO_WQ_BIT_EXIT, &wq->state);
+	set_bit(IO_WQ_BIT_EXIT, &wq->state);
+	if (wq->manager)
 		kthread_stop(wq->manager);
-	}
 
 	rcu_read_lock();
 	for (i = 0; i < wq->nr_wqes; i++) {

From b2e9c7d64b7ecacc1d0f15a6af88a73cab7d8db9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 19 Nov 2019 09:22:16 +0300
Subject: [PATCH 10/42] io-wq: remove extra space characters

These lines are indented an extra space character.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 243eb1d4d2bd..b4bc377dda61 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -328,9 +328,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 	 * If worker is moving from bound to unbound (or vice versa), then
 	 * ensure we update the running accounting.
 	 */
-	 worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
-	 work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
-	 if (worker_bound != work_bound) {
+	worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
+	work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
+	if (worker_bound != work_bound) {
 		io_wqe_dec_running(wqe, worker);
 		if (work_bound) {
 			worker->flags |= IO_WORKER_F_BOUND;

From d3b35796b1e3f118017491d621f624e0de7ff9fb Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Nov 2019 23:32:48 +0300
Subject: [PATCH 11/42] io_uring: break links for failed defer

If io_req_defer() failed, it needs to cancel a dependant link.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6550b7eab7d2..c1226f609e18 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2958,6 +2958,8 @@ static void io_queue_sqe(struct io_kiocb *req)
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
 			io_cqring_add_event(req, ret);
+			if (req->flags & REQ_F_LINK)
+				req->flags |= REQ_F_FAIL_LINK;
 			io_double_put_req(req);
 		}
 	} else
@@ -2990,6 +2992,8 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
 		if (ret != -EIOCBQUEUED) {
 err:
 			io_cqring_add_event(req, ret);
+			if (req->flags & REQ_F_LINK)
+				req->flags |= REQ_F_FAIL_LINK;
 			io_double_put_req(req);
 			if (shadow)
 				__io_free_req(shadow);

From f70193d6d8cad4cc614223fef349e6ea9d48c61f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Nov 2019 23:32:49 +0300
Subject: [PATCH 12/42] io_uring: remove redundant check

Pass any IORING_OP_LINK_TIMEOUT request further, where it will
eventually fail in io_issue_sqe().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c1226f609e18..fd60939a8a59 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3079,10 +3079,6 @@ err_req:
 
 		INIT_LIST_HEAD(&req->link_list);
 		*link = req;
-	} else if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
-		/* Only valid as a linked SQE */
-		ret = -EINVAL;
-		goto err_req;
 	} else {
 		io_queue_sqe(req);
 	}

From 09fbb0a83ec6ab5a4037766261c031151985fff6 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Nov 2019 23:32:50 +0300
Subject: [PATCH 13/42] io_uring: Fix leaking linked timeouts

let have a dependant link: REQ -> LINK_TIMEOUT -> LINK_TIMEOUT

1. submission stage: submission references for REQ and LINK_TIMEOUT
are dropped. So, references respectively (1,1,2)

2. io_put(REQ) + FAIL_LINKS stage: calls io_fail_links(), which for all
linked timeouts will call cancel_timeout() and drop 1 reference.
So, references after: (0,0,1). That's a leak.

Make it treat only the first linked timeout as such, and pass others
through __io_double_put_req().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fd60939a8a59..f88e12b971c7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -942,6 +942,7 @@ static void io_fail_links(struct io_kiocb *req)
 		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
 		    link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_link_cancel_timeout(link);
+			req->flags &= ~REQ_F_LINK_TIMEOUT;
 		} else {
 			io_cqring_fill_event(link, -ECANCELED);
 			__io_double_put_req(link);

From 5d960724b0cb0d12469d1c62912e4a8c09c9fd92 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Nov 2019 15:31:28 -0700
Subject: [PATCH 14/42] io_uring: io_fail_links() should only consider first
 linked timeout

We currently clear the linked timeout field if we cancel such a timeout,
but we should only attempt to cancel if it's the first one we see.
Others should simply be freed like other requests, as they haven't
been started yet.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f88e12b971c7..09fc29599e96 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -942,12 +942,12 @@ static void io_fail_links(struct io_kiocb *req)
 		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
 		    link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_link_cancel_timeout(link);
-			req->flags &= ~REQ_F_LINK_TIMEOUT;
 		} else {
 			io_cqring_fill_event(link, -ECANCELED);
 			__io_double_put_req(link);
 		}
 		kfree(sqe_to_free);
+		req->flags &= ~REQ_F_LINK_TIMEOUT;
 	}
 
 	io_commit_cqring(ctx);
@@ -2837,9 +2837,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	 */
 	if (!list_empty(&req->list)) {
 		prev = list_entry(req->list.prev, struct io_kiocb, link_list);
-		if (refcount_inc_not_zero(&prev->refs))
+		if (refcount_inc_not_zero(&prev->refs)) {
 			list_del_init(&req->list);
-		else
+			prev->flags &= ~REQ_F_LINK_TIMEOUT;
+		} else
 			prev = NULL;
 	}
 

From bbad27b2f622fa26d107f8a72c0cd5cc102dc56e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Nov 2019 23:32:47 +0300
Subject: [PATCH 15/42] io_uring: Always REQ_F_FREE_SQE for allocated sqe

Always mark requests with allocated sqe and deallocate it in
__io_free_req(). It's easier to follow and doesn't add edge cases.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 49 ++++++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 09fc29599e96..6d52a4d643e6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -834,6 +834,8 @@ static void __io_free_req(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
+	if (req->flags & REQ_F_FREE_SQE)
+		kfree(req->submit.sqe);
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
 	if (req->flags & REQ_F_INFLIGHT) {
@@ -929,16 +931,11 @@ static void io_fail_links(struct io_kiocb *req)
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 
 	while (!list_empty(&req->link_list)) {
-		const struct io_uring_sqe *sqe_to_free = NULL;
-
 		link = list_first_entry(&req->link_list, struct io_kiocb, list);
 		list_del_init(&link->list);
 
 		trace_io_uring_fail_link(req, link);
 
-		if (link->flags & REQ_F_FREE_SQE)
-			sqe_to_free = link->submit.sqe;
-
 		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
 		    link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_link_cancel_timeout(link);
@@ -946,7 +943,6 @@ static void io_fail_links(struct io_kiocb *req)
 			io_cqring_fill_event(link, -ECANCELED);
 			__io_double_put_req(link);
 		}
-		kfree(sqe_to_free);
 		req->flags &= ~REQ_F_LINK_TIMEOUT;
 	}
 
@@ -1089,7 +1085,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			 * completions for those, only batch free for fixed
 			 * file and non-linked commands.
 			 */
-			if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+			if (((req->flags &
+				(REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) ==
 			    REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
 				reqs[to_free++] = req;
 				if (to_free == ARRAY_SIZE(reqs))
@@ -2582,6 +2579,7 @@ static int io_req_defer(struct io_kiocb *req)
 	}
 
 	memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
+	req->flags |= REQ_F_FREE_SQE;
 	req->submit.sqe = sqe_copy;
 
 	trace_io_uring_defer(ctx, req, false);
@@ -2676,7 +2674,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 	struct io_wq_work *work = *workptr;
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct sqe_submit *s = &req->submit;
-	const struct io_uring_sqe *sqe = s->sqe;
 	struct io_kiocb *nxt = NULL;
 	int ret = 0;
 
@@ -2712,9 +2709,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 		io_put_req(req);
 	}
 
-	/* async context always use a copy of the sqe */
-	kfree(sqe);
-
 	/* if a dependent link is ready, pass it back */
 	if (!ret && nxt) {
 		struct io_kiocb *link;
@@ -2913,23 +2907,24 @@ static void __io_queue_sqe(struct io_kiocb *req)
 		struct io_uring_sqe *sqe_copy;
 
 		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
-		if (sqe_copy) {
-			s->sqe = sqe_copy;
-			if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
-				ret = io_grab_files(req);
-				if (ret) {
-					kfree(sqe_copy);
-					goto err;
-				}
-			}
+		if (!sqe_copy)
+			goto err;
 
-			/*
-			 * Queued up for async execution, worker will release
-			 * submit reference when the iocb is actually submitted.
-			 */
-			io_queue_async_work(req);
-			return;
+		s->sqe = sqe_copy;
+		req->flags |= REQ_F_FREE_SQE;
+
+		if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
+			ret = io_grab_files(req);
+			if (ret)
+				goto err;
 		}
+
+		/*
+		 * Queued up for async execution, worker will release
+		 * submit reference when the iocb is actually submitted.
+		 */
+		io_queue_async_work(req);
+		return;
 	}
 
 err:
@@ -3024,7 +3019,6 @@ err:
 static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 			  struct io_kiocb **link)
 {
-	struct io_uring_sqe *sqe_copy;
 	struct sqe_submit *s = &req->submit;
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
@@ -3054,6 +3048,7 @@ err_req:
 	 */
 	if (*link) {
 		struct io_kiocb *prev = *link;
+		struct io_uring_sqe *sqe_copy;
 
 		if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
 			ret = io_timeout_setup(req);

From eb065d301e8c83643367bdb0898becc364046bda Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 20 Nov 2019 09:26:29 -0700
Subject: [PATCH 16/42] io_uring: io_allocate_scq_urings() should return a sane
 state

We currently rely on the ring destroy on cleaning things up in case of
failure, but io_allocate_scq_urings() can leave things half initialized
if only parts of it fails.

Be nice and return with either everything setup in success, or return an
error with things nicely cleaned up.

Reported-by: syzbot+0d818c0d39399188f393@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6d52a4d643e6..55bc890f863d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4583,12 +4583,18 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	ctx->cq_entries = rings->cq_ring_entries;
 
 	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
-	if (size == SIZE_MAX)
+	if (size == SIZE_MAX) {
+		io_mem_free(ctx->rings);
+		ctx->rings = NULL;
 		return -EOVERFLOW;
+	}
 
 	ctx->sq_sqes = io_mem_alloc(size);
-	if (!ctx->sq_sqes)
+	if (!ctx->sq_sqes) {
+		io_mem_free(ctx->rings);
+		ctx->rings = NULL;
 		return -ENOMEM;
+	}
 
 	return 0;
 }

From 4d7dd462971405c65bfb3821dbb6b9ce13b5e8d6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 20 Nov 2019 13:03:52 -0700
Subject: [PATCH 17/42] io_uring: allow finding next link independent of req
 reference count

We currently try and start the next link when we put the request, and
only if we were going to free it. This means that the optimization to
continue executing requests from the same context often fails, as we're
not putting the final reference.

Add REQ_F_LINK_NEXT to keep track of this, and allow io_uring to find the
next request more efficiently.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 55bc890f863d..e9980c584120 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -340,6 +340,7 @@ struct io_kiocb {
 #define REQ_F_NOWAIT		1	/* must not punt to workers */
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
 #define REQ_F_FIXED_FILE	4	/* ctx owns file */
+#define REQ_F_LINK_NEXT		8	/* already grabbed next link */
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
 #define REQ_F_LINK		64	/* linked sqes */
@@ -879,6 +880,10 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	struct io_kiocb *nxt;
 	bool wake_ev = false;
 
+	/* Already got next link */
+	if (req->flags & REQ_F_LINK_NEXT)
+		return;
+
 	/*
 	 * The list should never be empty when we are called here. But could
 	 * potentially happen if the chain is messed up, check to be on the
@@ -915,6 +920,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 		break;
 	}
 
+	req->flags |= REQ_F_LINK_NEXT;
 	if (wake_ev)
 		io_cqring_ev_posted(ctx);
 }
@@ -951,12 +957,10 @@ static void io_fail_links(struct io_kiocb *req)
 	io_cqring_ev_posted(ctx);
 }
 
-static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
+static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 {
-	if (likely(!(req->flags & REQ_F_LINK))) {
-		__io_free_req(req);
+	if (likely(!(req->flags & REQ_F_LINK)))
 		return;
-	}
 
 	/*
 	 * If LINK is set, we have dependent requests in this chain. If we
@@ -982,7 +986,11 @@ static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 	} else {
 		io_req_link_next(req, nxt);
 	}
+}
 
+static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
+{
+	io_req_find_next(req, nxt);
 	__io_free_req(req);
 }
 
@@ -999,8 +1007,10 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
 	struct io_kiocb *nxt = NULL;
 
+	io_req_find_next(req, &nxt);
+
 	if (refcount_dec_and_test(&req->refs))
-		io_free_req_find_next(req, &nxt);
+		__io_free_req(req);
 
 	if (nxt) {
 		if (nxtptr)

From b76da70fc3759df13e0991706451f1a2e06ba19e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 20 Nov 2019 13:05:32 -0700
Subject: [PATCH 18/42] io_uring: close lookup gap for dependent next work

When we find new work to process within the work handler, we queue the
linked timeout before we have issued the new work. This can be
problematic for very short timeouts, as we have a window where the new
work isn't visible.

Allow the work handler to store a callback function for this in the work
item, and flag it with IO_WQ_WORK_CB if the caller has done so. If that
is set, then io-wq will call the callback when it has setup the new work
item.

Reported-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    |  3 +++
 fs/io-wq.h    |  6 +++++-
 fs/io_uring.c | 16 ++++++++++++++--
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index b4bc377dda61..9b32b3c811f5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -427,6 +427,9 @@ next:
 		worker->cur_work = work;
 		spin_unlock_irq(&worker->lock);
 
+		if (work->flags & IO_WQ_WORK_CB)
+			work->func(&work);
+
 		if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
 		    current->files != work->files) {
 			task_lock(current);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 4b29f922f80c..b68b11bf3633 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -11,6 +11,7 @@ enum {
 	IO_WQ_WORK_NEEDS_FILES	= 16,
 	IO_WQ_WORK_UNBOUND	= 32,
 	IO_WQ_WORK_INTERNAL	= 64,
+	IO_WQ_WORK_CB		= 128,
 
 	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
 };
@@ -22,7 +23,10 @@ enum io_wq_cancel {
 };
 
 struct io_wq_work {
-	struct list_head list;
+	union {
+		struct list_head list;
+		void *data;
+	};
 	void (*func)(struct io_wq_work **);
 	unsigned flags;
 	struct files_struct *files;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e9980c584120..27fefb52910c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2679,6 +2679,15 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 	return 0;
 }
 
+static void io_link_work_cb(struct io_wq_work **workptr)
+{
+	struct io_wq_work *work = *workptr;
+	struct io_kiocb *link = work->data;
+
+	io_queue_linked_timeout(link);
+	work->func = io_wq_submit_work;
+}
+
 static void io_wq_submit_work(struct io_wq_work **workptr)
 {
 	struct io_wq_work *work = *workptr;
@@ -2725,8 +2734,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 
 		io_prep_async_work(nxt, &link);
 		*workptr = &nxt->work;
-		if (link)
-			io_queue_linked_timeout(link);
+		if (link) {
+			nxt->work.flags |= IO_WQ_WORK_CB;
+			nxt->work.func = io_link_work_cb;
+			nxt->work.data = link;
+		}
 	}
 }
 

From 1b4a51b6d03d21f55effbcf609ba5526d87d9e9d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 21 Nov 2019 11:54:28 +0300
Subject: [PATCH 19/42] io_uring: drain next sqe instead of shadowing

There's an issue with the shadow drain logic in that we drop the
completion lock after deciding to defer a request, then re-grab it later
and assume that the state is still the same. In the mean time, someone
else completing a request could have found and issued it. This can cause
a stall in the queue, by having a shadow request inserted that nobody is
going to drain.

Additionally, if we fail allocating the shadow request, we simply ignore
the drain.

Instead of using a shadow request, defer the next request/link instead.
This also has the following advantages:

- removes semi-duplicated code
- doesn't allocate memory for shadows
- works better if only the head marked for drain
- doesn't need complex synchronisation

On the flip side, it removes the shadow->seq ==
last_drain_in_in_link->seq optimization. That shouldn't be a common
case, and can always be added back, if needed.

Fixes: 4fe2c963154c ("io_uring: add support for link with drain")
Cc: Jackie Liu <liuyun01@kylinos.cn>
Reported-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 86 +++++++++++----------------------------------------
 1 file changed, 18 insertions(+), 68 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 27fefb52910c..ca980c5878e9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -186,6 +186,7 @@ struct io_ring_ctx {
 		bool			compat;
 		bool			account_mem;
 		bool			cq_overflow_flushed;
+		bool			drain_next;
 
 		/*
 		 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -346,7 +347,7 @@ struct io_kiocb {
 #define REQ_F_LINK		64	/* linked sqes */
 #define REQ_F_LINK_TIMEOUT	128	/* has linked timeout */
 #define REQ_F_FAIL_LINK		256	/* fail rest of links */
-#define REQ_F_SHADOW_DRAIN	512	/* link-drain shadow req */
+#define REQ_F_DRAIN_LINK	512	/* link should be fully drained */
 #define REQ_F_TIMEOUT		1024	/* timeout request */
 #define REQ_F_ISREG		2048	/* regular file */
 #define REQ_F_MUST_PUNT		4096	/* must be punted even for NONBLOCK */
@@ -620,11 +621,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
 	__io_commit_cqring(ctx);
 
 	while ((req = io_get_deferred_req(ctx)) != NULL) {
-		if (req->flags & REQ_F_SHADOW_DRAIN) {
-			/* Just for drain, free it. */
-			__io_free_req(req);
-			continue;
-		}
 		req->flags |= REQ_F_IO_DRAINED;
 		io_queue_async_work(req);
 	}
@@ -2973,6 +2969,12 @@ static void io_queue_sqe(struct io_kiocb *req)
 {
 	int ret;
 
+	if (unlikely(req->ctx->drain_next)) {
+		req->flags |= REQ_F_IO_DRAIN;
+		req->ctx->drain_next = false;
+	}
+	req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
+
 	ret = io_req_defer(req);
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
@@ -2985,57 +2987,16 @@ static void io_queue_sqe(struct io_kiocb *req)
 		__io_queue_sqe(req);
 }
 
-static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow)
+static inline void io_queue_link_head(struct io_kiocb *req)
 {
-	int ret;
-	int need_submit = false;
-	struct io_ring_ctx *ctx = req->ctx;
-
 	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
-		ret = -ECANCELED;
-		goto err;
-	}
-	if (!shadow) {
+		io_cqring_add_event(req, -ECANCELED);
+		io_double_put_req(req);
+	} else
 		io_queue_sqe(req);
-		return;
-	}
-
-	/*
-	 * Mark the first IO in link list as DRAIN, let all the following
-	 * IOs enter the defer list. all IO needs to be completed before link
-	 * list.
-	 */
-	req->flags |= REQ_F_IO_DRAIN;
-	ret = io_req_defer(req);
-	if (ret) {
-		if (ret != -EIOCBQUEUED) {
-err:
-			io_cqring_add_event(req, ret);
-			if (req->flags & REQ_F_LINK)
-				req->flags |= REQ_F_FAIL_LINK;
-			io_double_put_req(req);
-			if (shadow)
-				__io_free_req(shadow);
-			return;
-		}
-	} else {
-		/*
-		 * If ret == 0 means that all IOs in front of link io are
-		 * running done. let's queue link head.
-		 */
-		need_submit = true;
-	}
-
-	/* Insert shadow req to defer_list, blocking next IOs */
-	spin_lock_irq(&ctx->completion_lock);
-	trace_io_uring_defer(ctx, shadow, true);
-	list_add_tail(&shadow->list, &ctx->defer_list);
-	spin_unlock_irq(&ctx->completion_lock);
-
-	if (need_submit)
-		__io_queue_sqe(req);
 }
 
+
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
 
 static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
@@ -3072,6 +3033,9 @@ err_req:
 		struct io_kiocb *prev = *link;
 		struct io_uring_sqe *sqe_copy;
 
+		if (s->sqe->flags & IOSQE_IO_DRAIN)
+			(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
+
 		if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
 			ret = io_timeout_setup(req);
 			/* common setup allows offset being set, we don't */
@@ -3190,7 +3154,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 {
 	struct io_submit_state state, *statep = NULL;
 	struct io_kiocb *link = NULL;
-	struct io_kiocb *shadow_req = NULL;
 	int i, submitted = 0;
 	bool mm_fault = false;
 
@@ -3229,18 +3192,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 
 		sqe_flags = req->submit.sqe->flags;
 
-		if (link && (sqe_flags & IOSQE_IO_DRAIN)) {
-			if (!shadow_req) {
-				shadow_req = io_get_req(ctx, NULL);
-				if (unlikely(!shadow_req))
-					goto out;
-				shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
-				refcount_dec(&shadow_req->refs);
-			}
-			shadow_req->sequence = req->submit.sequence;
-		}
-
-out:
 		req->submit.ring_file = ring_file;
 		req->submit.ring_fd = ring_fd;
 		req->submit.has_user = *mm != NULL;
@@ -3256,14 +3207,13 @@ out:
 		 * that's the end of the chain. Submit the previous link.
 		 */
 		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
-			io_queue_link_head(link, shadow_req);
+			io_queue_link_head(link);
 			link = NULL;
-			shadow_req = NULL;
 		}
 	}
 
 	if (link)
-		io_queue_link_head(link, shadow_req);
+		io_queue_link_head(link);
 	if (statep)
 		io_submit_state_end(&state);
 

From 915967f69c591b34c5a18d6618af021a81ffd700 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 21 Nov 2019 09:01:20 -0700
Subject: [PATCH 20/42] io_uring: improve trace_io_uring_defer() trace point

We don't have shadow requests anymore, so get rid of the shadow
argument. Add the user_data argument, as that's often useful to easily
match up requests, instead of having to look at request pointers.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                   |  2 +-
 include/trace/events/io_uring.h | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ca980c5878e9..736f27808f99 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2588,7 +2588,7 @@ static int io_req_defer(struct io_kiocb *req)
 	req->flags |= REQ_F_FREE_SQE;
 	req->submit.sqe = sqe_copy;
 
-	trace_io_uring_defer(ctx, req, false);
+	trace_io_uring_defer(ctx, req, req->user_data);
 	list_add_tail(&req->list, &ctx->defer_list);
 	spin_unlock_irq(&ctx->completion_lock);
 	return -EIOCBQUEUED;
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 72a4d0174b02..b352d66b5d51 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -163,35 +163,35 @@ TRACE_EVENT(io_uring_queue_async_work,
 );
 
 /**
- * io_uring_defer_list - called before the io_uring work added into defer_list
+ * io_uring_defer - called when an io_uring request is deferred
  *
  * @ctx:	pointer to a ring context structure
  * @req:	pointer to a deferred request
- * @shadow: whether request is shadow or not
+ * @user_data:	user data associated with the request
  *
  * Allows to track deferred requests, to get an insight about what requests are
  * not started immediately.
  */
 TRACE_EVENT(io_uring_defer,
 
-	TP_PROTO(void *ctx, void *req, bool shadow),
+	TP_PROTO(void *ctx, void *req, unsigned long long user_data),
 
-	TP_ARGS(ctx, req, shadow),
+	TP_ARGS(ctx, req, user_data),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
 		__field(  void *,	req		)
-		__field(  bool,		shadow	)
+		__field(  unsigned long long, data	)
 	),
 
 	TP_fast_assign(
 		__entry->ctx	= ctx;
 		__entry->req	= req;
-		__entry->shadow	= shadow;
+		__entry->data	= user_data;
 	),
 
-	TP_printk("ring %p, request %p%s", __entry->ctx, __entry->req,
-			  __entry->shadow ? ", shadow": "")
+	TP_printk("ring %p, request %p user_data %llu", __entry->ctx,
+			__entry->req, __entry->data)
 );
 
 /**

From d732447fed7d6b4c22907f630cd25d574bae5276 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 21 Nov 2019 21:24:36 +0300
Subject: [PATCH 21/42] io_uring: rename __io_submit_sqe()

__io_submit_sqe() is issuing requests, so call it as
such. Moreover, it ends by calling io_iopoll_req_issued().

Rename it and make terminology clearer.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 736f27808f99..956a61cb0b13 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2594,8 +2594,8 @@ static int io_req_defer(struct io_kiocb *req)
 	return -EIOCBQUEUED;
 }
 
-static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
-			   bool force_nonblock)
+static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
+			bool force_nonblock)
 {
 	int ret, opcode;
 	struct sqe_submit *s = &req->submit;
@@ -2702,7 +2702,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 		s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
 		s->in_async = true;
 		do {
-			ret = __io_submit_sqe(req, &nxt, false);
+			ret = io_issue_sqe(req, &nxt, false);
 			/*
 			 * We can get EAGAIN for polled IO even though we're
 			 * forcing a sync submission from here, since we can't
@@ -2913,7 +2913,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
 	struct io_kiocb *nxt = io_prep_linked_timeout(req);
 	int ret;
 
-	ret = __io_submit_sqe(req, NULL, true);
+	ret = io_issue_sqe(req, NULL, true);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file

From 9835d6fafba58e6d9386a6d5af800789bdb52e5b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 21 Nov 2019 21:24:56 +0300
Subject: [PATCH 22/42] io_uring: add likely/unlikely in io_get_sqring()

The number of SQEs to submit is specified by a user, so io_get_sqring()
in most of the cases succeeds. Hint compilers about that.

Checking ASM genereted by gcc 9.2.0 for x64, there is one branch
misprediction.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 956a61cb0b13..63e0448f3f8d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3129,11 +3129,11 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 	 */
 	head = ctx->cached_sq_head;
 	/* make sure SQ entry isn't read before tail */
-	if (head == smp_load_acquire(&rings->sq.tail))
+	if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
 		return false;
 
 	head = READ_ONCE(sq_array[head & ctx->sq_mask]);
-	if (head < ctx->sq_entries) {
+	if (likely(head < ctx->sq_entries)) {
 		s->ring_file = NULL;
 		s->sqe = &ctx->sq_sqes[head];
 		s->sequence = ctx->cached_sq_head;

From 70cf9f3270a5c5148e93a526dc1e51965259e70c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 21 Nov 2019 23:21:00 +0300
Subject: [PATCH 23/42] io_uring: remove io_free_req_find_next()

There is only one one-liner user of io_free_req_find_next(). Inline it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 63e0448f3f8d..6d8665e8f0e8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -984,15 +984,10 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 	}
 }
 
-static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
-{
-	io_req_find_next(req, nxt);
-	__io_free_req(req);
-}
-
 static void io_free_req(struct io_kiocb *req)
 {
-	io_free_req_find_next(req, NULL);
+	io_req_find_next(req, NULL);
+	__io_free_req(req);
 }
 
 /*

From 944e58bfeda0e9b97cd611adafc823c78e0bc464 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 21 Nov 2019 23:21:01 +0300
Subject: [PATCH 24/42] io_uring: pass only !null to io_req_find_next()

Make io_req_find_next() and io_req_link_next() to accept only non-null
nxt, and handle it in callers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6d8665e8f0e8..95deb45e89cf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -908,7 +908,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 		 * in this context instead of having to queue up new async work.
 		 */
 		if (nxt) {
-			if (nxtptr && io_wq_current_is_worker())
+			if (io_wq_current_is_worker())
 				*nxtptr = nxt;
 			else
 				io_queue_async_work(nxt);
@@ -986,8 +986,13 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 
 static void io_free_req(struct io_kiocb *req)
 {
-	io_req_find_next(req, NULL);
+	struct io_kiocb *nxt = NULL;
+
+	io_req_find_next(req, &nxt);
 	__io_free_req(req);
+
+	if (nxt)
+		io_queue_async_work(nxt);
 }
 
 /*

From b18fdf71e01fba29a804d63f8c1e2ed61011170d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 21 Nov 2019 23:21:02 +0300
Subject: [PATCH 25/42] io_uring: simplify io_req_link_next()

"if (nxt)" is always true, as it was checked in the while's condition.
io_wq_current_is_worker() is unnecessary, as non-async callers don't
pass nxt, so io_queue_async_work() will be called for them anyway.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 95deb45e89cf..f6f5871bd7d2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -903,16 +903,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 			nxt->flags |= REQ_F_LINK;
 		}
 
-		/*
-		 * If we're in async work, we can continue processing the chain
-		 * in this context instead of having to queue up new async work.
-		 */
-		if (nxt) {
-			if (io_wq_current_is_worker())
-				*nxtptr = nxt;
-			else
-				io_queue_async_work(nxt);
-		}
+		*nxtptr = nxt;
 		break;
 	}
 

From f9bd67f69af56d712bfd498f5ad9cf7bb177d600 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 21 Nov 2019 23:21:03 +0300
Subject: [PATCH 26/42] io_uring: only !null ptr to io_issue_sqe()

Pass only non-null @nxt to io_issue_sqe() and handle it at the caller's
side. And propagate it.

- kiocb_done() is only called from io_read() and io_write(), which are
only called from io_issue_sqe(), so it's @nxt != NULL

- io_put_req_find_next() is called either with explicitly non-null local
nxt, or from one of the functions in io_issue_sqe() switch (or their
callees).

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f6f5871bd7d2..553fa23120e6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -990,21 +990,13 @@ static void io_free_req(struct io_kiocb *req)
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.
  */
+__attribute__((nonnull))
 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
-	struct io_kiocb *nxt = NULL;
-
-	io_req_find_next(req, &nxt);
+	io_req_find_next(req, nxtptr);
 
 	if (refcount_dec_and_test(&req->refs))
 		__io_free_req(req);
-
-	if (nxt) {
-		if (nxtptr)
-			*nxtptr = nxt;
-		else
-			io_queue_async_work(nxt);
-	}
 }
 
 static void io_put_req(struct io_kiocb *req)
@@ -1488,7 +1480,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
 		       bool in_async)
 {
-	if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw)
+	if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
 		*nxt = __io_complete_rw(kiocb, ret);
 	else
 		io_rw_done(kiocb, ret);
@@ -2585,6 +2577,7 @@ static int io_req_defer(struct io_kiocb *req)
 	return -EIOCBQUEUED;
 }
 
+__attribute__((nonnull))
 static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 			bool force_nonblock)
 {
@@ -2901,10 +2894,13 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 
 static void __io_queue_sqe(struct io_kiocb *req)
 {
-	struct io_kiocb *nxt = io_prep_linked_timeout(req);
+	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+	struct io_kiocb *nxt = NULL;
 	int ret;
 
-	ret = io_issue_sqe(req, NULL, true);
+	ret = io_issue_sqe(req, &nxt, true);
+	if (nxt)
+		io_queue_async_work(nxt);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2940,11 +2936,11 @@ err:
 	/* drop submission reference */
 	io_put_req(req);
 
-	if (nxt) {
+	if (linked_timeout) {
 		if (!ret)
-			io_queue_linked_timeout(nxt);
+			io_queue_linked_timeout(linked_timeout);
 		else
-			io_put_req(nxt);
+			io_put_req(linked_timeout);
 	}
 
 	/* and drop final reference, if we failed */

From c4a2ed72c9a61594b6afc23e1fbc78878d32b5a3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 21 Nov 2019 21:01:26 -0700
Subject: [PATCH 27/42] io_uring: only return -EBUSY for submit on non-flushed
 backlog

We return -EBUSY on submit when we have a CQ ring overflow backlog, but
that can be a bit problematic if the application is using pure userspace
poll of the CQ ring. For that case, if the ring briefly overflowed and
we have pending entries in the backlog, the submit flushes the backlog
successfully but still returns -EBUSY. If we're able to fully flush the
CQ ring backlog, let the submission proceed.

Reported-by: Dan Melnic <dmm@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 553fa23120e6..129723087bad 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -654,7 +654,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 		eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
+/* Returns true if there are no backlogged entries after the flush */
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
 	struct io_rings *rings = ctx->rings;
 	struct io_uring_cqe *cqe;
@@ -664,10 +665,10 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 
 	if (!force) {
 		if (list_empty_careful(&ctx->cq_overflow_list))
-			return;
+			return true;
 		if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
 		    rings->cq_ring_entries))
-			return;
+			return false;
 	}
 
 	spin_lock_irqsave(&ctx->completion_lock, flags);
@@ -676,6 +677,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	if (force)
 		ctx->cq_overflow_flushed = true;
 
+	cqe = NULL;
 	while (!list_empty(&ctx->cq_overflow_list)) {
 		cqe = io_get_cqring(ctx);
 		if (!cqe && !force)
@@ -703,6 +705,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		list_del(&req->list);
 		io_put_req(req);
 	}
+
+	return cqe != NULL;
 }
 
 static void io_cqring_fill_event(struct io_kiocb *req, long res)
@@ -3144,10 +3148,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	int i, submitted = 0;
 	bool mm_fault = false;
 
-	if (!list_empty(&ctx->cq_overflow_list)) {
-		io_cqring_overflow_flush(ctx, false);
+	/* if we have a backlog and couldn't flush it all, return BUSY */
+	if (!list_empty(&ctx->cq_overflow_list) &&
+	    !io_cqring_overflow_flush(ctx, false))
 		return -EBUSY;
-	}
 
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, ctx, nr);

From bd3ded3146daa2cbb57ed353749ef99cf75371b0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 23 Nov 2019 14:17:16 -0700
Subject: [PATCH 28/42] net: add __sys_connect_file() helper

This is identical to __sys_connect(), except it takes a struct file
instead of an fd, and it also allows passing in extra file->f_flags
flags. The latter is done to support masking in O_NONBLOCK without
manipulating the original file flags.

No functional changes in this patch.

Cc: netdev@vger.kernel.org
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/socket.h |  3 +++
 net/socket.c           | 30 ++++++++++++++++++++++--------
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 09c32a21555b..4bde63021c09 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -399,6 +399,9 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 			 int __user *upeer_addrlen, int flags);
 extern int __sys_socket(int family, int type, int protocol);
 extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
+extern int __sys_connect_file(struct file *file,
+			struct sockaddr __user *uservaddr, int addrlen,
+			int file_flags);
 extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
 			 int addrlen);
 extern int __sys_listen(int fd, int backlog);
diff --git a/net/socket.c b/net/socket.c
index 17bc1eee198a..274df4ddfc2c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1825,32 +1825,46 @@ SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
  *	include the -EINPROGRESS status for such sockets.
  */
 
-int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
+int __sys_connect_file(struct file *file, struct sockaddr __user *uservaddr,
+		       int addrlen, int file_flags)
 {
 	struct socket *sock;
 	struct sockaddr_storage address;
-	int err, fput_needed;
+	int err;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	sock = sock_from_file(file, &err);
 	if (!sock)
 		goto out;
 	err = move_addr_to_kernel(uservaddr, addrlen, &address);
 	if (err < 0)
-		goto out_put;
+		goto out;
 
 	err =
 	    security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
 	if (err)
-		goto out_put;
+		goto out;
 
 	err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
-				 sock->file->f_flags);
-out_put:
-	fput_light(sock->file, fput_needed);
+				 sock->file->f_flags | file_flags);
 out:
 	return err;
 }
 
+int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
+{
+	int ret = -EBADF;
+	struct fd f;
+
+	f = fdget(fd);
+	if (f.file) {
+		ret = __sys_connect_file(f.file, uservaddr, addrlen, 0);
+		if (f.flags)
+			fput(f.file);
+	}
+
+	return ret;
+}
+
 SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
 		int, addrlen)
 {

From f8e85cf255ad57d65eeb9a9d0e59e3dec55bdd9e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 23 Nov 2019 14:24:24 -0700
Subject: [PATCH 29/42] io_uring: add support for IORING_OP_CONNECT

This allows an application to call connect() in an async fashion. Like
other opcodes, we first try a non-blocking connect, then punt to async
context if we have to.

Note that we can still return -EINPROGRESS, and in that case the caller
should use IORING_OP_POLL_ADD to do an async wait for completion of the
connect request (just like for regular connect(2), except we can do it
async here too).

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 36 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 37 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 129723087bad..02254929231b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -550,6 +550,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
 		case IORING_OP_RECVMSG:
 		case IORING_OP_ACCEPT:
 		case IORING_OP_POLL_ADD:
+		case IORING_OP_CONNECT:
 			/*
 			 * We know REQ_F_ISREG is not set on some of these
 			 * opcodes, but this enables us to keep the check in
@@ -1974,6 +1975,38 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 #endif
 }
 
+static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      struct io_kiocb **nxt, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	struct sockaddr __user *addr;
+	unsigned file_flags;
+	int addr_len, ret;
+
+	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+		return -EINVAL;
+	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
+		return -EINVAL;
+
+	addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
+	addr_len = READ_ONCE(sqe->addr2);
+	file_flags = force_nonblock ? O_NONBLOCK : 0;
+
+	ret = __sys_connect_file(req->file, addr, addr_len, file_flags);
+	if (ret == -EAGAIN && force_nonblock)
+		return -EAGAIN;
+	if (ret == -ERESTARTSYS)
+		ret = -EINTR;
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
+	io_cqring_add_event(req, ret);
+	io_put_req_find_next(req, nxt);
+	return 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 static inline void io_poll_remove_req(struct io_kiocb *req)
 {
 	if (!RB_EMPTY_NODE(&req->rb_node)) {
@@ -2637,6 +2670,9 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 	case IORING_OP_ACCEPT:
 		ret = io_accept(req, s->sqe, nxt, force_nonblock);
 		break;
+	case IORING_OP_CONNECT:
+		ret = io_connect(req, s->sqe, nxt, force_nonblock);
+		break;
 	case IORING_OP_ASYNC_CANCEL:
 		ret = io_async_cancel(req, s->sqe, nxt);
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2a1569211d87..4637ed1d9949 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -73,6 +73,7 @@ struct io_uring_sqe {
 #define IORING_OP_ACCEPT	13
 #define IORING_OP_ASYNC_CANCEL	14
 #define IORING_OP_LINK_TIMEOUT	15
+#define IORING_OP_CONNECT	16
 
 /*
  * sqe->fsync_flags

From 311ae9e159d81a1ec1cf645daf40b39ae5a0bd84 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 24 Nov 2019 11:58:24 +0300
Subject: [PATCH 30/42] io_uring: fix dead-hung for non-iter fixed rw

Read/write requests to devices without implemented read/write_iter
using fixed buffers can cause general protection fault, which totally
hangs a machine.

io_import_fixed() initialises iov_iter with bvec, but loop_rw_iter()
accesses it as iovec, dereferencing random address.

kmap() page by page in this case

Cc: stable@vger.kernel.org
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 02254929231b..fc832164bbb8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1622,9 +1622,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 		return -EAGAIN;
 
 	while (iov_iter_count(iter)) {
-		struct iovec iovec = iov_iter_iovec(iter);
+		struct iovec iovec;
 		ssize_t nr;
 
+		if (!iov_iter_is_bvec(iter)) {
+			iovec = iov_iter_iovec(iter);
+		} else {
+			/* fixed buffers import bvec */
+			iovec.iov_base = kmap(iter->bvec->bv_page)
+						+ iter->iov_offset;
+			iovec.iov_len = min(iter->count,
+					iter->bvec->bv_len - iter->iov_offset);
+		}
+
 		if (rw == READ) {
 			nr = file->f_op->read(file, iovec.iov_base,
 					      iovec.iov_len, &kiocb->ki_pos);
@@ -1633,6 +1643,9 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 					       iovec.iov_len, &kiocb->ki_pos);
 		}
 
+		if (iov_iter_is_bvec(iter))
+			kunmap(iter->bvec->bv_page);
+
 		if (nr < 0) {
 			if (!ret)
 				ret = nr;

From 576a347b7af8abfbddc80783fb6629c2894d036e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 25 Nov 2019 08:49:20 -0700
Subject: [PATCH 31/42] io-wq: have io_wq_create() take a 'data' argument

We currently pass in 4 arguments outside of the bounded size. In
preparation for adding one more argument, let's bundle them up in
a struct to make it more readable.

No functional changes in this patch.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 14 ++++++--------
 fs/io-wq.h    | 12 +++++++++---
 fs/io_uring.c |  9 +++++++--
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 9b32b3c811f5..2b4276990571 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -974,9 +974,7 @@ void io_wq_flush(struct io_wq *wq)
 	}
 }
 
-struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
-			   struct user_struct *user, get_work_fn *get_work,
-			   put_work_fn *put_work)
+struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 {
 	int ret = -ENOMEM, i, node;
 	struct io_wq *wq;
@@ -992,11 +990,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	wq->get_work = get_work;
-	wq->put_work = put_work;
+	wq->get_work = data->get_work;
+	wq->put_work = data->put_work;
 
 	/* caller must already hold a reference to this */
-	wq->user = user;
+	wq->user = data->user;
 
 	i = 0;
 	for_each_online_node(node) {
@@ -1009,7 +1007,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
 		wqe->node = node;
 		wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
 		atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
-		if (user) {
+		if (wq->user) {
 			wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
 					task_rlimit(current, RLIMIT_NPROC);
 		}
@@ -1031,7 +1029,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
 		goto err;
 
 	/* caller must have already done mmgrab() on this mm */
-	wq->mm = mm;
+	wq->mm = data->mm;
 
 	wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
 	if (!IS_ERR(wq->manager)) {
diff --git a/fs/io-wq.h b/fs/io-wq.h
index b68b11bf3633..bb8f1c8f8e24 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -42,9 +42,15 @@ struct io_wq_work {
 typedef void (get_work_fn)(struct io_wq_work *);
 typedef void (put_work_fn)(struct io_wq_work *);
 
-struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
-				struct user_struct *user,
-				get_work_fn *get_work, put_work_fn *put_work);
+struct io_wq_data {
+	struct mm_struct *mm;
+	struct user_struct *user;
+
+	get_work_fn *get_work;
+	put_work_fn *put_work;
+};
+
+struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
 void io_wq_destroy(struct io_wq *wq);
 
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fc832164bbb8..fabae84396bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3962,6 +3962,7 @@ static void io_get_work(struct io_wq_work *work)
 static int io_sq_offload_start(struct io_ring_ctx *ctx,
 			       struct io_uring_params *p)
 {
+	struct io_wq_data data;
 	unsigned concurrency;
 	int ret;
 
@@ -4006,10 +4007,14 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 		goto err;
 	}
 
+	data.mm = ctx->sqo_mm;
+	data.user = ctx->user;
+	data.get_work = io_get_work;
+	data.put_work = io_put_work;
+
 	/* Do QD, or 4 * CPUS, whatever is smallest */
 	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
-	ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user,
-					io_get_work, io_put_work);
+	ctx->io_wq = io_wq_create(concurrency, &data);
 	if (IS_ERR(ctx->io_wq)) {
 		ret = PTR_ERR(ctx->io_wq);
 		ctx->io_wq = NULL;

From 181e448d8709e517c9c7b523fcd209f24eb38ca7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 25 Nov 2019 08:52:30 -0700
Subject: [PATCH 32/42] io_uring: async workers should inherit the user creds

If we don't inherit the original task creds, then we can confuse users
like fuse that pass creds in the request header. See link below on
identical aio issue.

Link: https://lore.kernel.org/linux-fsdevel/26f0d78e-99ca-2f1b-78b9-433088053a61@scylladb.com/T/#u
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 10 ++++++++++
 fs/io-wq.h    |  1 +
 fs/io_uring.c |  8 ++++++++
 3 files changed, 19 insertions(+)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 2b4276990571..31c5a10b0825 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -57,6 +57,7 @@ struct io_worker {
 
 	struct rcu_head rcu;
 	struct mm_struct *mm;
+	const struct cred *creds;
 	struct files_struct *restore_files;
 };
 
@@ -111,6 +112,7 @@ struct io_wq {
 
 	struct task_struct *manager;
 	struct user_struct *user;
+	struct cred *creds;
 	struct mm_struct *mm;
 	refcount_t refs;
 	struct completion done;
@@ -136,6 +138,11 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
 {
 	bool dropped_lock = false;
 
+	if (worker->creds) {
+		revert_creds(worker->creds);
+		worker->creds = NULL;
+	}
+
 	if (current->files != worker->restore_files) {
 		__acquire(&wqe->lock);
 		spin_unlock_irq(&wqe->lock);
@@ -442,6 +449,8 @@ next:
 			set_fs(USER_DS);
 			worker->mm = wq->mm;
 		}
+		if (!worker->creds)
+			worker->creds = override_creds(wq->creds);
 		if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
 			work->flags |= IO_WQ_WORK_CANCEL;
 		if (worker->mm)
@@ -995,6 +1004,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 	/* caller must already hold a reference to this */
 	wq->user = data->user;
+	wq->creds = data->creds;
 
 	i = 0;
 	for_each_online_node(node) {
diff --git a/fs/io-wq.h b/fs/io-wq.h
index bb8f1c8f8e24..5cd8c7697e88 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -45,6 +45,7 @@ typedef void (put_work_fn)(struct io_wq_work *);
 struct io_wq_data {
 	struct mm_struct *mm;
 	struct user_struct *user;
+	struct cred *creds;
 
 	get_work_fn *get_work;
 	put_work_fn *put_work;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fabae84396bc..b6c6fdc12de7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -237,6 +237,8 @@ struct io_ring_ctx {
 
 	struct user_struct	*user;
 
+	struct cred		*creds;
+
 	/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
 	struct completion	*completions;
 
@@ -3267,6 +3269,7 @@ static int io_sq_thread(void *data)
 {
 	struct io_ring_ctx *ctx = data;
 	struct mm_struct *cur_mm = NULL;
+	const struct cred *old_cred;
 	mm_segment_t old_fs;
 	DEFINE_WAIT(wait);
 	unsigned inflight;
@@ -3277,6 +3280,7 @@ static int io_sq_thread(void *data)
 
 	old_fs = get_fs();
 	set_fs(USER_DS);
+	old_cred = override_creds(ctx->creds);
 
 	ret = timeout = inflight = 0;
 	while (!kthread_should_park()) {
@@ -3383,6 +3387,7 @@ static int io_sq_thread(void *data)
 		unuse_mm(cur_mm);
 		mmput(cur_mm);
 	}
+	revert_creds(old_cred);
 
 	kthread_parkme();
 
@@ -4009,6 +4014,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 
 	data.mm = ctx->sqo_mm;
 	data.user = ctx->user;
+	data.creds = ctx->creds;
 	data.get_work = io_get_work;
 	data.put_work = io_put_work;
 
@@ -4363,6 +4369,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		io_unaccount_mem(ctx->user,
 				ring_pages(ctx->sq_entries, ctx->cq_entries));
 	free_uid(ctx->user);
+	put_cred(ctx->creds);
 	kfree(ctx->completions);
 	kmem_cache_free(req_cachep, ctx->fallback_req);
 	kfree(ctx);
@@ -4715,6 +4722,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	ctx->compat = in_compat_syscall();
 	ctx->account_mem = account_mem;
 	ctx->user = user;
+	ctx->creds = prepare_creds();
 
 	ret = io_allocate_scq_urings(ctx, p);
 	if (ret)

From 8042d6ce8c40df0abb0d91662a754d074a3d3f16 Mon Sep 17 00:00:00 2001
From: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Date: Mon, 25 Nov 2019 14:40:22 -0500
Subject: [PATCH 33/42] io_uring: remove superfluous check for sqe->off in
 io_accept()

This field contains a pointer to addrlen and checking to see if it's set
returns -EINVAL if the caller sets addr & addrlen pointers.

Fixes: 17f2fe35d080 ("io_uring: add support for IORING_OP_ACCEPT")
Signed-off-by: Hrvoje Zeba <zeba.hrvoje@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b6c6fdc12de7..7412fdefa35a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1965,7 +1965,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
 		return -EINVAL;
-	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
+	if (sqe->ioprio || sqe->len || sqe->buf_index)
 		return -EINVAL;
 
 	addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);

From 4257c8ca13b084550574b8c9a667d9c90ff746eb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 25 Nov 2019 14:27:34 -0700
Subject: [PATCH 34/42] net: separate out the msghdr copy from
 ___sys_{send,recv}msg()

This is in preparation for enabling the io_uring helpers for sendmsg
and recvmsg to first copy the header for validation before continuing
with the operation.

There should be no functional changes in this patch.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 net/socket.c | 141 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 95 insertions(+), 46 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index 274df4ddfc2c..98568ae58bad 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2264,15 +2264,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
 	return err < 0 ? err : 0;
 }
 
-static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
-			 struct msghdr *msg_sys, unsigned int flags,
-			 struct used_address *used_address,
-			 unsigned int allowed_msghdr_flags)
+static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
+			   unsigned int flags, struct used_address *used_address,
+			   unsigned int allowed_msghdr_flags)
 {
-	struct compat_msghdr __user *msg_compat =
-	    (struct compat_msghdr __user *)msg;
-	struct sockaddr_storage address;
-	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 	unsigned char ctl[sizeof(struct cmsghdr) + 20]
 				__aligned(sizeof(__kernel_size_t));
 	/* 20 is size of ipv6_pktinfo */
@@ -2280,19 +2275,10 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 	int ctl_len;
 	ssize_t err;
 
-	msg_sys->msg_name = &address;
-
-	if (MSG_CMSG_COMPAT & flags)
-		err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov);
-	else
-		err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov);
-	if (err < 0)
-		return err;
-
 	err = -ENOBUFS;
 
 	if (msg_sys->msg_controllen > INT_MAX)
-		goto out_freeiov;
+		goto out;
 	flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
 	ctl_len = msg_sys->msg_controllen;
 	if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
@@ -2300,7 +2286,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 		    cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl,
 						     sizeof(ctl));
 		if (err)
-			goto out_freeiov;
+			goto out;
 		ctl_buf = msg_sys->msg_control;
 		ctl_len = msg_sys->msg_controllen;
 	} else if (ctl_len) {
@@ -2309,7 +2295,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 		if (ctl_len > sizeof(ctl)) {
 			ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL);
 			if (ctl_buf == NULL)
-				goto out_freeiov;
+				goto out;
 		}
 		err = -EFAULT;
 		/*
@@ -2355,7 +2341,47 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 out_freectl:
 	if (ctl_buf != ctl)
 		sock_kfree_s(sock->sk, ctl_buf, ctl_len);
-out_freeiov:
+out:
+	return err;
+}
+
+static int sendmsg_copy_msghdr(struct msghdr *msg,
+			       struct user_msghdr __user *umsg, unsigned flags,
+			       struct iovec **iov)
+{
+	int err;
+
+	if (flags & MSG_CMSG_COMPAT) {
+		struct compat_msghdr __user *msg_compat;
+
+		msg_compat = (struct compat_msghdr __user *) umsg;
+		err = get_compat_msghdr(msg, msg_compat, NULL, iov);
+	} else {
+		err = copy_msghdr_from_user(msg, umsg, NULL, iov);
+	}
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
+			 struct msghdr *msg_sys, unsigned int flags,
+			 struct used_address *used_address,
+			 unsigned int allowed_msghdr_flags)
+{
+	struct sockaddr_storage address;
+	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
+	ssize_t err;
+
+	msg_sys->msg_name = &address;
+
+	err = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov);
+	if (err < 0)
+		return err;
+
+	err = ____sys_sendmsg(sock, msg_sys, flags, used_address,
+				allowed_msghdr_flags);
 	kfree(iov);
 	return err;
 }
@@ -2474,33 +2500,41 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
 	return __sys_sendmmsg(fd, mmsg, vlen, flags, true);
 }
 
-static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
-			 struct msghdr *msg_sys, unsigned int flags, int nosec)
+static int recvmsg_copy_msghdr(struct msghdr *msg,
+			       struct user_msghdr __user *umsg, unsigned flags,
+			       struct sockaddr __user **uaddr,
+			       struct iovec **iov)
+{
+	ssize_t err;
+
+	if (MSG_CMSG_COMPAT & flags) {
+		struct compat_msghdr __user *msg_compat;
+
+		msg_compat = (struct compat_msghdr __user *) umsg;
+		err = get_compat_msghdr(msg, msg_compat, uaddr, iov);
+	} else {
+		err = copy_msghdr_from_user(msg, umsg, uaddr, iov);
+	}
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys,
+			   struct user_msghdr __user *msg,
+			   struct sockaddr __user *uaddr,
+			   unsigned int flags, int nosec)
 {
 	struct compat_msghdr __user *msg_compat =
-	    (struct compat_msghdr __user *)msg;
-	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov = iovstack;
+					(struct compat_msghdr __user *) msg;
+	int __user *uaddr_len = COMPAT_NAMELEN(msg);
+	struct sockaddr_storage addr;
 	unsigned long cmsg_ptr;
 	int len;
 	ssize_t err;
 
-	/* kernel mode address */
-	struct sockaddr_storage addr;
-
-	/* user mode address pointers */
-	struct sockaddr __user *uaddr;
-	int __user *uaddr_len = COMPAT_NAMELEN(msg);
-
 	msg_sys->msg_name = &addr;
-
-	if (MSG_CMSG_COMPAT & flags)
-		err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov);
-	else
-		err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov);
-	if (err < 0)
-		return err;
-
 	cmsg_ptr = (unsigned long)msg_sys->msg_control;
 	msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
 
@@ -2511,7 +2545,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 		flags |= MSG_DONTWAIT;
 	err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, flags);
 	if (err < 0)
-		goto out_freeiov;
+		goto out;
 	len = err;
 
 	if (uaddr != NULL) {
@@ -2519,12 +2553,12 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 					msg_sys->msg_namelen, uaddr,
 					uaddr_len);
 		if (err < 0)
-			goto out_freeiov;
+			goto out;
 	}
 	err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
 			 COMPAT_FLAGS(msg));
 	if (err)
-		goto out_freeiov;
+		goto out;
 	if (MSG_CMSG_COMPAT & flags)
 		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg_compat->msg_controllen);
@@ -2532,10 +2566,25 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
 		err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
 				 &msg->msg_controllen);
 	if (err)
-		goto out_freeiov;
+		goto out;
 	err = len;
+out:
+	return err;
+}
 
-out_freeiov:
+static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
+			 struct msghdr *msg_sys, unsigned int flags, int nosec)
+{
+	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
+	/* user mode address pointers */
+	struct sockaddr __user *uaddr;
+	ssize_t err;
+
+	err = recvmsg_copy_msghdr(msg_sys, msg, flags, &uaddr, &iov);
+	if (err < 0)
+		return err;
+
+	err = ____sys_recvmsg(sock, msg_sys, msg, uaddr, flags, nosec);
 	kfree(iov);
 	return err;
 }

From d69e07793f891524c6bbf1e75b9ae69db4450953 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 25 Nov 2019 17:04:13 -0700
Subject: [PATCH 35/42] net: disallow ancillary data for
 __sys_{send,recv}msg_file()

Only io_uring uses (and added) these, and we want to disallow the
use of sendmsg/recvmsg for anything but regular data transfers.
Use the newly added prep helper to split the msghdr copy out from
the core function, to check for msg_control and msg_controllen
settings. If either is set, we return -EINVAL.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 net/socket.c | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/net/socket.c b/net/socket.c
index 98568ae58bad..c78c3d37c884 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2389,12 +2389,27 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 /*
  *	BSD sendmsg interface
  */
-long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *umsg,
 			unsigned int flags)
 {
-	struct msghdr msg_sys;
+	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
+	struct sockaddr_storage address;
+	struct msghdr msg = { .msg_name = &address };
+	ssize_t err;
 
-	return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
+	err = sendmsg_copy_msghdr(&msg, umsg, flags, &iov);
+	if (err)
+		return err;
+	/* disallow ancillary data requests from this path */
+	if (msg.msg_control || msg.msg_controllen) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = ____sys_sendmsg(sock, &msg, flags, NULL, 0);
+out:
+	kfree(iov);
+	return err;
 }
 
 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
@@ -2593,12 +2608,28 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
  *	BSD recvmsg interface
  */
 
-long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *umsg,
 			unsigned int flags)
 {
-	struct msghdr msg_sys;
+	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
+	struct sockaddr_storage address;
+	struct msghdr msg = { .msg_name = &address };
+	struct sockaddr __user *uaddr;
+	ssize_t err;
 
-	return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+	err = recvmsg_copy_msghdr(&msg, umsg, flags, &uaddr, &iov);
+	if (err)
+		return err;
+	/* disallow ancillary data requests from this path */
+	if (msg.msg_control || msg.msg_controllen) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = ____sys_recvmsg(sock, &msg, umsg, uaddr, flags, 0);
+out:
+	kfree(iov);
+	return err;
 }
 
 long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,

From cc42e0ac17d3664a70e020dfe7897f14e7aa7453 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 25 Nov 2019 23:14:38 +0300
Subject: [PATCH 36/42] io_uring: store timeout's sqe->off in proper place

Timeouts' sequence offset (i.e. sqe->off) is stored in
req->submit.sequence under a false name. Keep it in timeout.data
instead. The unused space for sequence will be reclaimed in the
following patches.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7412fdefa35a..39409934e9e6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -309,6 +309,7 @@ struct io_timeout_data {
 	struct hrtimer			timer;
 	struct timespec64		ts;
 	enum hrtimer_mode		mode;
+	u32				seq_offset;
 };
 
 struct io_timeout {
@@ -2473,8 +2474,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 
 	req->sequence = ctx->cached_sq_head + count - 1;
-	/* reuse it to store the count */
-	req->submit.sequence = count;
+	req->timeout.data->seq_offset = count;
 
 	/*
 	 * Insertion sort, ensuring the first entry in the list is always
@@ -2485,6 +2485,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
 		unsigned nxt_sq_head;
 		long long tmp, tmp_nxt;
+		u32 nxt_offset = nxt->timeout.data->seq_offset;
 
 		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
 			continue;
@@ -2494,8 +2495,8 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * long to store it.
 		 */
 		tmp = (long long)ctx->cached_sq_head + count - 1;
-		nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1;
-		tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1;
+		nxt_sq_head = nxt->sequence - nxt_offset + 1;
+		tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
 
 		/*
 		 * cached_sq_head may overflow, and it will never overflow twice

From cf6fd4bd559ee61a4454b161863c8de6f30f8dca Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 25 Nov 2019 23:14:39 +0300
Subject: [PATCH 37/42] io_uring: inline struct sqe_submit

There is no point left in keeping struct sqe_submit. Inline it
into struct io_kiocb, so any req->submit.field is now just req->field

- moves initialisation of ring_file into io_get_req()
- removes duplicated req->sequence.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 169 +++++++++++++++++++++++---------------------------
 1 file changed, 78 insertions(+), 91 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 39409934e9e6..c846605b8361 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -281,16 +281,6 @@ struct io_ring_ctx {
 	} ____cacheline_aligned_in_smp;
 };
 
-struct sqe_submit {
-	const struct io_uring_sqe	*sqe;
-	struct file			*ring_file;
-	int				ring_fd;
-	u32				sequence;
-	bool				has_user;
-	bool				in_async;
-	bool				needs_fixed_file;
-};
-
 /*
  * First field must be the file pointer in all the
  * iocb unions! See also 'struct kiocb' in <linux/fs.h>
@@ -331,7 +321,12 @@ struct io_kiocb {
 		struct io_timeout	timeout;
 	};
 
-	struct sqe_submit	submit;
+	const struct io_uring_sqe	*sqe;
+	struct file			*ring_file;
+	int				ring_fd;
+	bool				has_user;
+	bool				in_async;
+	bool				needs_fixed_file;
 
 	struct io_ring_ctx	*ctx;
 	union {
@@ -541,8 +536,8 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
 {
 	bool do_hashed = false;
 
-	if (req->submit.sqe) {
-		switch (req->submit.sqe->opcode) {
+	if (req->sqe) {
+		switch (req->sqe->opcode) {
 		case IORING_OP_WRITEV:
 		case IORING_OP_WRITE_FIXED:
 			do_hashed = true;
@@ -563,7 +558,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
 				req->work.flags |= IO_WQ_WORK_UNBOUND;
 			break;
 		}
-		if (io_sqe_needs_user(req->submit.sqe))
+		if (io_sqe_needs_user(req->sqe))
 			req->work.flags |= IO_WQ_WORK_NEEDS_USER;
 	}
 
@@ -810,6 +805,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	}
 
 got_it:
+	req->ring_file = NULL;
 	req->file = NULL;
 	req->ctx = ctx;
 	req->flags = 0;
@@ -840,7 +836,7 @@ static void __io_free_req(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 
 	if (req->flags & REQ_F_FREE_SQE)
-		kfree(req->submit.sqe);
+		kfree(req->sqe);
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
 	if (req->flags & REQ_F_INFLIGHT) {
@@ -938,7 +934,7 @@ static void io_fail_links(struct io_kiocb *req)
 		trace_io_uring_fail_link(req, link);
 
 		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
-		    link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) {
+		    link->sqe->opcode == IORING_OP_LINK_TIMEOUT) {
 			io_link_cancel_timeout(link);
 		} else {
 			io_cqring_fill_event(link, -ECANCELED);
@@ -1401,7 +1397,7 @@ static bool io_file_supports_async(struct file *file)
 
 static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
 {
-	const struct io_uring_sqe *sqe = req->submit.sqe;
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
 	unsigned ioprio;
@@ -1568,11 +1564,10 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	return len;
 }
 
-static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
-			       const struct sqe_submit *s, struct iovec **iovec,
-			       struct iov_iter *iter)
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+			       struct iovec **iovec, struct iov_iter *iter)
 {
-	const struct io_uring_sqe *sqe = s->sqe;
+	const struct io_uring_sqe *sqe = req->sqe;
 	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	size_t sqe_len = READ_ONCE(sqe->len);
 	u8 opcode;
@@ -1588,16 +1583,16 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	opcode = READ_ONCE(sqe->opcode);
 	if (opcode == IORING_OP_READ_FIXED ||
 	    opcode == IORING_OP_WRITE_FIXED) {
-		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
+		ssize_t ret = io_import_fixed(req->ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
 	}
 
-	if (!s->has_user)
+	if (!req->has_user)
 		return -EFAULT;
 
 #ifdef CONFIG_COMPAT
-	if (ctx->compat)
+	if (req->ctx->compat)
 		return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
 						iovec, iter);
 #endif
@@ -1681,7 +1676,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
 	if (unlikely(!(file->f_mode & FMODE_READ)))
 		return -EBADF;
 
-	ret = io_import_iovec(req->ctx, READ, &req->submit, &iovec, &iter);
+	ret = io_import_iovec(READ, req, &iovec, &iter);
 	if (ret < 0)
 		return ret;
 
@@ -1713,7 +1708,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
 			ret2 = -EAGAIN;
 		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || ret2 != -EAGAIN)
-			kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
+			kiocb_done(kiocb, ret2, nxt, req->in_async);
 		else
 			ret = -EAGAIN;
 	}
@@ -1739,7 +1734,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
-	ret = io_import_iovec(req->ctx, WRITE, &req->submit, &iovec, &iter);
+	ret = io_import_iovec(WRITE, req, &iovec, &iter);
 	if (ret < 0)
 		return ret;
 
@@ -1776,7 +1771,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 		else
 			ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
 		if (!force_nonblock || ret2 != -EAGAIN)
-			kiocb_done(kiocb, ret2, nxt, req->submit.in_async);
+			kiocb_done(kiocb, ret2, nxt, req->in_async);
 		else
 			ret = -EAGAIN;
 	}
@@ -2259,7 +2254,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (!poll->file)
 		return -EBADF;
 
-	req->submit.sqe = NULL;
+	req->sqe = NULL;
 	INIT_IO_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
@@ -2413,7 +2408,7 @@ static int io_timeout_remove(struct io_kiocb *req,
 
 static int io_timeout_setup(struct io_kiocb *req)
 {
-	const struct io_uring_sqe *sqe = req->submit.sqe;
+	const struct io_uring_sqe *sqe = req->sqe;
 	struct io_timeout_data *data;
 	unsigned flags;
 
@@ -2601,7 +2596,6 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 static int io_req_defer(struct io_kiocb *req)
 {
-	const struct io_uring_sqe *sqe = req->submit.sqe;
 	struct io_uring_sqe *sqe_copy;
 	struct io_ring_ctx *ctx = req->ctx;
 
@@ -2620,9 +2614,9 @@ static int io_req_defer(struct io_kiocb *req)
 		return 0;
 	}
 
-	memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
+	memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy));
 	req->flags |= REQ_F_FREE_SQE;
-	req->submit.sqe = sqe_copy;
+	req->sqe = sqe_copy;
 
 	trace_io_uring_defer(ctx, req, req->user_data);
 	list_add_tail(&req->list, &ctx->defer_list);
@@ -2635,21 +2629,20 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 			bool force_nonblock)
 {
 	int ret, opcode;
-	struct sqe_submit *s = &req->submit;
 	struct io_ring_ctx *ctx = req->ctx;
 
-	opcode = READ_ONCE(s->sqe->opcode);
+	opcode = READ_ONCE(req->sqe->opcode);
 	switch (opcode) {
 	case IORING_OP_NOP:
 		ret = io_nop(req);
 		break;
 	case IORING_OP_READV:
-		if (unlikely(s->sqe->buf_index))
+		if (unlikely(req->sqe->buf_index))
 			return -EINVAL;
 		ret = io_read(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_WRITEV:
-		if (unlikely(s->sqe->buf_index))
+		if (unlikely(req->sqe->buf_index))
 			return -EINVAL;
 		ret = io_write(req, nxt, force_nonblock);
 		break;
@@ -2660,37 +2653,37 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 		ret = io_write(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_FSYNC:
-		ret = io_fsync(req, s->sqe, nxt, force_nonblock);
+		ret = io_fsync(req, req->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_POLL_ADD:
-		ret = io_poll_add(req, s->sqe, nxt);
+		ret = io_poll_add(req, req->sqe, nxt);
 		break;
 	case IORING_OP_POLL_REMOVE:
-		ret = io_poll_remove(req, s->sqe);
+		ret = io_poll_remove(req, req->sqe);
 		break;
 	case IORING_OP_SYNC_FILE_RANGE:
-		ret = io_sync_file_range(req, s->sqe, nxt, force_nonblock);
+		ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_SENDMSG:
-		ret = io_sendmsg(req, s->sqe, nxt, force_nonblock);
+		ret = io_sendmsg(req, req->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_RECVMSG:
-		ret = io_recvmsg(req, s->sqe, nxt, force_nonblock);
+		ret = io_recvmsg(req, req->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_TIMEOUT:
-		ret = io_timeout(req, s->sqe);
+		ret = io_timeout(req, req->sqe);
 		break;
 	case IORING_OP_TIMEOUT_REMOVE:
-		ret = io_timeout_remove(req, s->sqe);
+		ret = io_timeout_remove(req, req->sqe);
 		break;
 	case IORING_OP_ACCEPT:
-		ret = io_accept(req, s->sqe, nxt, force_nonblock);
+		ret = io_accept(req, req->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_CONNECT:
-		ret = io_connect(req, s->sqe, nxt, force_nonblock);
+		ret = io_connect(req, req->sqe, nxt, force_nonblock);
 		break;
 	case IORING_OP_ASYNC_CANCEL:
-		ret = io_async_cancel(req, s->sqe, nxt);
+		ret = io_async_cancel(req, req->sqe, nxt);
 		break;
 	default:
 		ret = -EINVAL;
@@ -2705,10 +2698,10 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 			return -EAGAIN;
 
 		/* workqueue context doesn't hold uring_lock, grab it now */
-		if (s->in_async)
+		if (req->in_async)
 			mutex_lock(&ctx->uring_lock);
 		io_iopoll_req_issued(req);
-		if (s->in_async)
+		if (req->in_async)
 			mutex_unlock(&ctx->uring_lock);
 	}
 
@@ -2728,7 +2721,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 {
 	struct io_wq_work *work = *workptr;
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
-	struct sqe_submit *s = &req->submit;
 	struct io_kiocb *nxt = NULL;
 	int ret = 0;
 
@@ -2739,8 +2731,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 		ret = -ECANCELED;
 
 	if (!ret) {
-		s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
-		s->in_async = true;
+		req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
+		req->in_async = true;
 		do {
 			ret = io_issue_sqe(req, &nxt, false);
 			/*
@@ -2806,24 +2798,17 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
 
 static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
 {
-	struct sqe_submit *s = &req->submit;
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned flags;
 	int fd;
 
-	flags = READ_ONCE(s->sqe->flags);
-	fd = READ_ONCE(s->sqe->fd);
+	flags = READ_ONCE(req->sqe->flags);
+	fd = READ_ONCE(req->sqe->fd);
 
 	if (flags & IOSQE_IO_DRAIN)
 		req->flags |= REQ_F_IO_DRAIN;
-	/*
-	 * All io need record the previous position, if LINK vs DARIN,
-	 * it can be used to mark the position of the first IO in the
-	 * link list.
-	 */
-	req->sequence = s->sequence;
 
-	if (!io_op_needs_file(s->sqe))
+	if (!io_op_needs_file(req->sqe))
 		return 0;
 
 	if (flags & IOSQE_FIXED_FILE) {
@@ -2836,7 +2821,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
 			return -EBADF;
 		req->flags |= REQ_F_FIXED_FILE;
 	} else {
-		if (s->needs_fixed_file)
+		if (req->needs_fixed_file)
 			return -EBADF;
 		trace_io_uring_file_get(ctx, fd);
 		req->file = io_file_get(state, fd);
@@ -2860,7 +2845,7 @@ static int io_grab_files(struct io_kiocb *req)
 	 * the fd has changed since we started down this path, and disallow
 	 * this operation if it has.
 	 */
-	if (fcheck(req->submit.ring_fd) == req->submit.ring_file) {
+	if (fcheck(req->ring_fd) == req->ring_file) {
 		list_add(&req->inflight_entry, &ctx->inflight_list);
 		req->flags |= REQ_F_INFLIGHT;
 		req->work.files = current->files;
@@ -2941,7 +2926,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 		return NULL;
 
 	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
-	if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT)
+	if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT)
 		return NULL;
 
 	req->flags |= REQ_F_LINK_TIMEOUT;
@@ -2964,14 +2949,13 @@ static void __io_queue_sqe(struct io_kiocb *req)
 	 */
 	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
 	    (req->flags & REQ_F_MUST_PUNT))) {
-		struct sqe_submit *s = &req->submit;
 		struct io_uring_sqe *sqe_copy;
 
-		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+		sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
 		if (!sqe_copy)
 			goto err;
 
-		s->sqe = sqe_copy;
+		req->sqe = sqe_copy;
 		req->flags |= REQ_F_FREE_SQE;
 
 		if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
@@ -3045,14 +3029,13 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 			  struct io_kiocb **link)
 {
-	struct sqe_submit *s = &req->submit;
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
-	req->user_data = s->sqe->user_data;
+	req->user_data = req->sqe->user_data;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
+	if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) {
 		ret = -EINVAL;
 		goto err_req;
 	}
@@ -3076,13 +3059,13 @@ err_req:
 		struct io_kiocb *prev = *link;
 		struct io_uring_sqe *sqe_copy;
 
-		if (s->sqe->flags & IOSQE_IO_DRAIN)
+		if (req->sqe->flags & IOSQE_IO_DRAIN)
 			(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
 
-		if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
+		if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
 			ret = io_timeout_setup(req);
 			/* common setup allows offset being set, we don't */
-			if (!ret && s->sqe->off)
+			if (!ret && req->sqe->off)
 				ret = -EINVAL;
 			if (ret) {
 				prev->flags |= REQ_F_FAIL_LINK;
@@ -3090,17 +3073,17 @@ err_req:
 			}
 		}
 
-		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+		sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
 		if (!sqe_copy) {
 			ret = -EAGAIN;
 			goto err_req;
 		}
 
-		s->sqe = sqe_copy;
+		req->sqe = sqe_copy;
 		req->flags |= REQ_F_FREE_SQE;
 		trace_io_uring_link(ctx, req, prev);
 		list_add_tail(&req->list, &prev->link_list);
-	} else if (s->sqe->flags & IOSQE_IO_LINK) {
+	} else if (req->sqe->flags & IOSQE_IO_LINK) {
 		req->flags |= REQ_F_LINK;
 
 		INIT_LIST_HEAD(&req->link_list);
@@ -3156,7 +3139,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
  * used, it's important that those reads are done through READ_ONCE() to
  * prevent a re-load down the line.
  */
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
+static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req)
 {
 	struct io_rings *rings = ctx->rings;
 	u32 *sq_array = ctx->sq_array;
@@ -3177,9 +3160,13 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
 
 	head = READ_ONCE(sq_array[head & ctx->sq_mask]);
 	if (likely(head < ctx->sq_entries)) {
-		s->ring_file = NULL;
-		s->sqe = &ctx->sq_sqes[head];
-		s->sequence = ctx->cached_sq_head;
+		/*
+		 * All io need record the previous position, if LINK vs DARIN,
+		 * it can be used to mark the position of the first IO in the
+		 * link list.
+		 */
+		req->sequence = ctx->cached_sq_head;
+		req->sqe = &ctx->sq_sqes[head];
 		ctx->cached_sq_head++;
 		return true;
 	}
@@ -3220,12 +3207,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 				submitted = -EAGAIN;
 			break;
 		}
-		if (!io_get_sqring(ctx, &req->submit)) {
+		if (!io_get_sqring(ctx, req)) {
 			__io_free_req(req);
 			break;
 		}
 
-		if (io_sqe_needs_user(req->submit.sqe) && !*mm) {
+		if (io_sqe_needs_user(req->sqe) && !*mm) {
 			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
 			if (!mm_fault) {
 				use_mm(ctx->sqo_mm);
@@ -3233,14 +3220,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			}
 		}
 
-		sqe_flags = req->submit.sqe->flags;
+		sqe_flags = req->sqe->flags;
 
-		req->submit.ring_file = ring_file;
-		req->submit.ring_fd = ring_fd;
-		req->submit.has_user = *mm != NULL;
-		req->submit.in_async = async;
-		req->submit.needs_fixed_file = async;
-		trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data,
+		req->ring_file = ring_file;
+		req->ring_fd = ring_fd;
+		req->has_user = *mm != NULL;
+		req->in_async = async;
+		req->needs_fixed_file = async;
+		trace_io_uring_submit_sqe(ctx, req->sqe->user_data,
 					  true, async);
 		io_submit_sqe(req, statep, &link);
 		submitted++;

From 7d009165550adc64e3561c65ecce564125052e00 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 25 Nov 2019 23:14:40 +0300
Subject: [PATCH 38/42] io_uring: cleanup io_import_fixed()

Clean io_import_fixed() call site and make it return proper type.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c846605b8361..e44b0e01f1b5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1490,9 +1490,9 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
 		io_rw_done(kiocb, ret);
 }
 
-static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
-			   const struct io_uring_sqe *sqe,
-			   struct iov_iter *iter)
+static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw,
+			       const struct io_uring_sqe *sqe,
+			       struct iov_iter *iter)
 {
 	size_t len = READ_ONCE(sqe->len);
 	struct io_mapped_ubuf *imu;
@@ -1581,11 +1581,9 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	 * flag.
 	 */
 	opcode = READ_ONCE(sqe->opcode);
-	if (opcode == IORING_OP_READ_FIXED ||
-	    opcode == IORING_OP_WRITE_FIXED) {
-		ssize_t ret = io_import_fixed(req->ctx, rw, sqe, iter);
+	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
 		*iovec = NULL;
-		return ret;
+		return io_import_fixed(req->ctx, rw, sqe, iter);
 	}
 
 	if (!req->has_user)

From ad6e005ca68de7af76f9ed3e4c9b6f0aa2f842e3 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 26 Nov 2019 17:39:45 +0100
Subject: [PATCH 39/42] io_uring: use kzalloc instead of kcalloc for
 single-element allocations

These allocations are single-element allocations, so don't use the array
allocation wrapper for them.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 31c5a10b0825..dffea3c25a6a 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -580,7 +580,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	struct io_wqe_acct *acct =&wqe->acct[index];
 	struct io_worker *worker;
 
-	worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node);
+	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
 	if (!worker)
 		return false;
 
@@ -988,7 +988,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	int ret = -ENOMEM, i, node;
 	struct io_wq *wq;
 
-	wq = kcalloc(1, sizeof(*wq), GFP_KERNEL);
+	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
 		return ERR_PTR(-ENOMEM);
 
@@ -1010,7 +1010,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	for_each_online_node(node) {
 		struct io_wqe *wqe;
 
-		wqe = kcalloc_node(1, sizeof(struct io_wqe), GFP_KERNEL, node);
+		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
 		if (!wqe)
 			break;
 		wq->wqes[i] = wqe;

From 3fc50ab559f5ae400aa33bd0836b3602da7fa51b Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 26 Nov 2019 19:10:20 +0100
Subject: [PATCH 40/42] io-wq: fix handling of NUMA node IDs

There are several things that can go wrong in the current code on NUMA
systems, especially if not all nodes are online all the time:

 - If the identifiers of the online nodes do not form a single contiguous
   block starting at zero, wq->wqes will be too small, and OOB memory
   accesses will occur e.g. in the loop in io_wq_create().
 - If a node comes online between the call to num_online_nodes() and the
   for_each_node() loop in io_wq_create(), an OOB write will occur.
 - If a node comes online between io_wq_create() and io_wq_enqueue(), a
   lookup is performed for an element that doesn't exist, and an OOB read
   will probably occur.

Fix it by:

 - using nr_node_ids instead of num_online_nodes() for the allocation size;
   nr_node_ids is calculated by setup_nr_node_ids() to be bigger than the
   highest node ID that could possibly come online at some point, even if
   those nodes' identifiers are not a contiguous block
 - creating workers for all possible CPUs, not just all online ones

This is basically what the normal workqueue code also does, as far as I can
tell.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 80 +++++++++++++++++++++++-------------------------------
 1 file changed, 34 insertions(+), 46 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index dffea3c25a6a..d3e8907cc182 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -105,7 +105,6 @@ struct io_wqe {
 struct io_wq {
 	struct io_wqe **wqes;
 	unsigned long state;
-	unsigned nr_wqes;
 
 	get_work_fn *get_work;
 	put_work_fn *put_work;
@@ -632,21 +631,22 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index)
 static int io_wq_manager(void *data)
 {
 	struct io_wq *wq = data;
-	int i;
+	int workers_to_create = num_possible_nodes();
+	int node;
 
 	/* create fixed workers */
-	refcount_set(&wq->refs, wq->nr_wqes);
-	for (i = 0; i < wq->nr_wqes; i++) {
-		if (create_io_worker(wq, wq->wqes[i], IO_WQ_ACCT_BOUND))
-			continue;
-		goto err;
+	refcount_set(&wq->refs, workers_to_create);
+	for_each_node(node) {
+		if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND))
+			goto err;
+		workers_to_create--;
 	}
 
 	complete(&wq->done);
 
 	while (!kthread_should_stop()) {
-		for (i = 0; i < wq->nr_wqes; i++) {
-			struct io_wqe *wqe = wq->wqes[i];
+		for_each_node(node) {
+			struct io_wqe *wqe = wq->wqes[node];
 			bool fork_worker[2] = { false, false };
 
 			spin_lock_irq(&wqe->lock);
@@ -668,7 +668,7 @@ static int io_wq_manager(void *data)
 err:
 	set_bit(IO_WQ_BIT_ERROR, &wq->state);
 	set_bit(IO_WQ_BIT_EXIT, &wq->state);
-	if (refcount_sub_and_test(wq->nr_wqes - i, &wq->refs))
+	if (refcount_sub_and_test(workers_to_create, &wq->refs))
 		complete(&wq->done);
 	return 0;
 }
@@ -776,7 +776,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
 
 void io_wq_cancel_all(struct io_wq *wq)
 {
-	int i;
+	int node;
 
 	set_bit(IO_WQ_BIT_CANCEL, &wq->state);
 
@@ -785,8 +785,8 @@ void io_wq_cancel_all(struct io_wq *wq)
 	 * to a worker and the worker putting itself on the busy_list
 	 */
 	rcu_read_lock();
-	for (i = 0; i < wq->nr_wqes; i++) {
-		struct io_wqe *wqe = wq->wqes[i];
+	for_each_node(node) {
+		struct io_wqe *wqe = wq->wqes[node];
 
 		io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL);
 	}
@@ -859,10 +859,10 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 				  void *data)
 {
 	enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
-	int i;
+	int node;
 
-	for (i = 0; i < wq->nr_wqes; i++) {
-		struct io_wqe *wqe = wq->wqes[i];
+	for_each_node(node) {
+		struct io_wqe *wqe = wq->wqes[node];
 
 		ret = io_wqe_cancel_cb_work(wqe, cancel, data);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
@@ -936,10 +936,10 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork)
 {
 	enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND;
-	int i;
+	int node;
 
-	for (i = 0; i < wq->nr_wqes; i++) {
-		struct io_wqe *wqe = wq->wqes[i];
+	for_each_node(node) {
+		struct io_wqe *wqe = wq->wqes[node];
 
 		ret = io_wqe_cancel_work(wqe, cwork);
 		if (ret != IO_WQ_CANCEL_NOTFOUND)
@@ -970,10 +970,10 @@ static void io_wq_flush_func(struct io_wq_work **workptr)
 void io_wq_flush(struct io_wq *wq)
 {
 	struct io_wq_flush_data data;
-	int i;
+	int node;
 
-	for (i = 0; i < wq->nr_wqes; i++) {
-		struct io_wqe *wqe = wq->wqes[i];
+	for_each_node(node) {
+		struct io_wqe *wqe = wq->wqes[node];
 
 		init_completion(&data.done);
 		INIT_IO_WORK(&data.work, io_wq_flush_func);
@@ -985,15 +985,14 @@ void io_wq_flush(struct io_wq *wq)
 
 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 {
-	int ret = -ENOMEM, i, node;
+	int ret = -ENOMEM, node;
 	struct io_wq *wq;
 
 	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
 	if (!wq)
 		return ERR_PTR(-ENOMEM);
 
-	wq->nr_wqes = num_online_nodes();
-	wq->wqes = kcalloc(wq->nr_wqes, sizeof(struct io_wqe *), GFP_KERNEL);
+	wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL);
 	if (!wq->wqes) {
 		kfree(wq);
 		return ERR_PTR(-ENOMEM);
@@ -1006,14 +1005,13 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	wq->user = data->user;
 	wq->creds = data->creds;
 
-	i = 0;
-	for_each_online_node(node) {
+	for_each_node(node) {
 		struct io_wqe *wqe;
 
 		wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node);
 		if (!wqe)
-			break;
-		wq->wqes[i] = wqe;
+			goto err;
+		wq->wqes[node] = wqe;
 		wqe->node = node;
 		wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
 		atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
@@ -1029,15 +1027,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 		INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
 		INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1);
 		INIT_LIST_HEAD(&wqe->all_list);
-
-		i++;
 	}
 
 	init_completion(&wq->done);
 
-	if (i != wq->nr_wqes)
-		goto err;
-
 	/* caller must have already done mmgrab() on this mm */
 	wq->mm = data->mm;
 
@@ -1056,8 +1049,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	ret = PTR_ERR(wq->manager);
 	complete(&wq->done);
 err:
-	for (i = 0; i < wq->nr_wqes; i++)
-		kfree(wq->wqes[i]);
+	for_each_node(node)
+		kfree(wq->wqes[node]);
 	kfree(wq->wqes);
 	kfree(wq);
 	return ERR_PTR(ret);
@@ -1071,26 +1064,21 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 
 void io_wq_destroy(struct io_wq *wq)
 {
-	int i;
+	int node;
 
 	set_bit(IO_WQ_BIT_EXIT, &wq->state);
 	if (wq->manager)
 		kthread_stop(wq->manager);
 
 	rcu_read_lock();
-	for (i = 0; i < wq->nr_wqes; i++) {
-		struct io_wqe *wqe = wq->wqes[i];
-
-		if (!wqe)
-			continue;
-		io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL);
-	}
+	for_each_node(node)
+		io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
 	rcu_read_unlock();
 
 	wait_for_completion(&wq->done);
 
-	for (i = 0; i < wq->nr_wqes; i++)
-		kfree(wq->wqes[i]);
+	for_each_node(node)
+		kfree(wq->wqes[node]);
 	kfree(wq->wqes);
 	kfree(wq);
 }

From 6206f0e180d4eddc0a178f57120ab1b913701f6e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 26 Nov 2019 11:59:32 -0700
Subject: [PATCH 41/42] io-wq: shrink io_wq_work a bit

Currently we're using 40 bytes for the io_wq_work structure, and 16 of
those is the doubly link list node. We don't need doubly linked lists,
we always add to tail to keep things ordered, and any other use case
is list traversal with deletion. For the deletion case, we can easily
support any node deletion by keeping track of the previous entry.

This shrinks io_wq_work to 32 bytes, and subsequently io_kiock from
io_uring to 216 to 208 bytes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 36 +++++++++++++++++++++++-------------
 fs/io-wq.h | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index d3e8907cc182..91b85df0861e 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -84,7 +84,7 @@ enum {
 struct io_wqe {
 	struct {
 		spinlock_t lock;
-		struct list_head work_list;
+		struct io_wq_work_list work_list;
 		unsigned long hash_map;
 		unsigned flags;
 	} ____cacheline_aligned_in_smp;
@@ -236,7 +236,8 @@ static void io_worker_exit(struct io_worker *worker)
 static inline bool io_wqe_run_queue(struct io_wqe *wqe)
 	__must_hold(wqe->lock)
 {
-	if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED))
+	if (!wq_list_empty(&wqe->work_list) &&
+	    !(wqe->flags & IO_WQE_FLAG_STALLED))
 		return true;
 	return false;
 }
@@ -375,12 +376,15 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
 static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
 	__must_hold(wqe->lock)
 {
+	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work;
 
-	list_for_each_entry(work, &wqe->work_list, list) {
+	wq_list_for_each(node, prev, &wqe->work_list) {
+		work = container_of(node, struct io_wq_work, list);
+
 		/* not hashed, can run anytime */
 		if (!(work->flags & IO_WQ_WORK_HASHED)) {
-			list_del(&work->list);
+			wq_node_del(&wqe->work_list, node, prev);
 			return work;
 		}
 
@@ -388,7 +392,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
 		*hash = work->flags >> IO_WQ_HASH_SHIFT;
 		if (!(wqe->hash_map & BIT_ULL(*hash))) {
 			wqe->hash_map |= BIT_ULL(*hash);
-			list_del(&work->list);
+			wq_node_del(&wqe->work_list, node, prev);
 			return work;
 		}
 	}
@@ -416,7 +420,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 		work = io_get_next_work(wqe, &hash);
 		if (work)
 			__io_worker_busy(wqe, worker, work);
-		else if (!list_empty(&wqe->work_list))
+		else if (!wq_list_empty(&wqe->work_list))
 			wqe->flags |= IO_WQE_FLAG_STALLED;
 
 		spin_unlock_irq(&wqe->lock);
@@ -526,7 +530,7 @@ static int io_wqe_worker(void *data)
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		spin_lock_irq(&wqe->lock);
-		if (!list_empty(&wqe->work_list))
+		if (!wq_list_empty(&wqe->work_list))
 			io_worker_handle_work(worker);
 		else
 			spin_unlock_irq(&wqe->lock);
@@ -714,7 +718,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 	}
 
 	spin_lock_irqsave(&wqe->lock, flags);
-	list_add_tail(&work->list, &wqe->work_list);
+	wq_list_add_tail(&work->list, &wqe->work_list);
 	wqe->flags &= ~IO_WQE_FLAG_STALLED;
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
@@ -829,14 +833,17 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
 		.cancel = cancel,
 		.caller_data = cancel_data,
 	};
+	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work;
 	unsigned long flags;
 	bool found = false;
 
 	spin_lock_irqsave(&wqe->lock, flags);
-	list_for_each_entry(work, &wqe->work_list, list) {
+	wq_list_for_each(node, prev, &wqe->work_list) {
+		work = container_of(node, struct io_wq_work, list);
+
 		if (cancel(work, cancel_data)) {
-			list_del(&work->list);
+			wq_node_del(&wqe->work_list, node, prev);
 			found = true;
 			break;
 		}
@@ -894,6 +901,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 					    struct io_wq_work *cwork)
 {
+	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work;
 	unsigned long flags;
 	bool found = false;
@@ -906,9 +914,11 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
 	 * no completion will be posted for it.
 	 */
 	spin_lock_irqsave(&wqe->lock, flags);
-	list_for_each_entry(work, &wqe->work_list, list) {
+	wq_list_for_each(node, prev, &wqe->work_list) {
+		work = container_of(node, struct io_wq_work, list);
+
 		if (work == cwork) {
-			list_del(&work->list);
+			wq_node_del(&wqe->work_list, node, prev);
 			found = true;
 			break;
 		}
@@ -1023,7 +1033,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 		wqe->node = node;
 		wqe->wq = wq;
 		spin_lock_init(&wqe->lock);
-		INIT_LIST_HEAD(&wqe->work_list);
+		INIT_WQ_LIST(&wqe->work_list);
 		INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
 		INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1);
 		INIT_LIST_HEAD(&wqe->all_list);
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 5cd8c7697e88..600e0158cba7 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -22,18 +22,60 @@ enum io_wq_cancel {
 	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
 };
 
+struct io_wq_work_node {
+	struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+	struct io_wq_work_node *first;
+	struct io_wq_work_node *last;
+};
+
+static inline void wq_list_add_tail(struct io_wq_work_node *node,
+				    struct io_wq_work_list *list)
+{
+	if (!list->first) {
+		list->first = list->last = node;
+	} else {
+		list->last->next = node;
+		list->last = node;
+	}
+}
+
+static inline void wq_node_del(struct io_wq_work_list *list,
+			       struct io_wq_work_node *node,
+			       struct io_wq_work_node *prev)
+{
+	if (node == list->first)
+		list->first = node->next;
+	if (node == list->last)
+		list->last = prev;
+	if (prev)
+		prev->next = node->next;
+}
+
+#define wq_list_for_each(pos, prv, head)			\
+	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
+
+#define wq_list_empty(list)	((list)->first == NULL)
+#define INIT_WQ_LIST(list)	do {				\
+	(list)->first = NULL;					\
+	(list)->last = NULL;					\
+} while (0)
+
 struct io_wq_work {
 	union {
-		struct list_head list;
+		struct io_wq_work_node list;
 		void *data;
 	};
 	void (*func)(struct io_wq_work **);
-	unsigned flags;
 	struct files_struct *files;
+	unsigned flags;
 };
 
 #define INIT_IO_WORK(work, _func)			\
 	do {						\
+		(work)->list.next = NULL;		\
 		(work)->func = _func;			\
 		(work)->flags = 0;			\
 		(work)->files = NULL;			\

From e944475e69849273ca8f1fe04a3ce81b5901d165 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 26 Nov 2019 15:02:04 -0700
Subject: [PATCH 42/42] io_uring: make poll->wait dynamically allocated

In the quest to bring io_kiocb down to 3 cachelines, this one does
the trick. Make the wait_queue_entry for the poll command come out
of kmalloc instead of embedding it in struct io_poll_iocb, as the
latter is the largest member of io_kiocb. Once we trim this down a
bit, we're back at a healthy 192 bytes for struct io_kiocb.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e44b0e01f1b5..2c2e8c25da01 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -291,7 +291,7 @@ struct io_poll_iocb {
 	__poll_t			events;
 	bool				done;
 	bool				canceled;
-	struct wait_queue_entry		wait;
+	struct wait_queue_entry		*wait;
 };
 
 struct io_timeout_data {
@@ -2030,8 +2030,8 @@ static void io_poll_remove_one(struct io_kiocb *req)
 
 	spin_lock(&poll->head->lock);
 	WRITE_ONCE(poll->canceled, true);
-	if (!list_empty(&poll->wait.entry)) {
-		list_del_init(&poll->wait.entry);
+	if (!list_empty(&poll->wait->entry)) {
+		list_del_init(&poll->wait->entry);
 		io_queue_async_work(req);
 	}
 	spin_unlock(&poll->head->lock);
@@ -2104,6 +2104,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
 	struct io_ring_ctx *ctx = req->ctx;
 
 	req->poll.done = true;
+	kfree(req->poll.wait);
 	if (error)
 		io_cqring_fill_event(req, error);
 	else
@@ -2141,7 +2142,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 	 */
 	spin_lock_irq(&ctx->completion_lock);
 	if (!mask && ret != -ECANCELED) {
-		add_wait_queue(poll->head, &poll->wait);
+		add_wait_queue(poll->head, poll->wait);
 		spin_unlock_irq(&ctx->completion_lock);
 		return;
 	}
@@ -2161,8 +2162,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 			void *key)
 {
-	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
-							wait);
+	struct io_poll_iocb *poll = wait->private;
 	struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
 	struct io_ring_ctx *ctx = req->ctx;
 	__poll_t mask = key_to_poll(key);
@@ -2172,7 +2172,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	if (mask && !(mask & poll->events))
 		return 0;
 
-	list_del_init(&poll->wait.entry);
+	list_del_init(&poll->wait->entry);
 
 	/*
 	 * Run completion inline if we can. We're using trylock here because
@@ -2213,7 +2213,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 
 	pt->error = 0;
 	pt->req->poll.head = head;
-	add_wait_queue(head, &pt->req->poll.wait);
+	add_wait_queue(head, pt->req->poll.wait);
 }
 
 static void io_poll_req_insert(struct io_kiocb *req)
@@ -2252,6 +2252,10 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (!poll->file)
 		return -EBADF;
 
+	poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL);
+	if (!poll->wait)
+		return -ENOMEM;
+
 	req->sqe = NULL;
 	INIT_IO_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
@@ -2268,8 +2272,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
 
 	/* initialized the list so that we can do list_empty checks */
-	INIT_LIST_HEAD(&poll->wait.entry);
-	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+	INIT_LIST_HEAD(&poll->wait->entry);
+	init_waitqueue_func_entry(poll->wait, io_poll_wake);
+	poll->wait->private = poll;
 
 	INIT_LIST_HEAD(&req->list);
 
@@ -2278,14 +2283,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	spin_lock_irq(&ctx->completion_lock);
 	if (likely(poll->head)) {
 		spin_lock(&poll->head->lock);
-		if (unlikely(list_empty(&poll->wait.entry))) {
+		if (unlikely(list_empty(&poll->wait->entry))) {
 			if (ipt.error)
 				cancel = true;
 			ipt.error = 0;
 			mask = 0;
 		}
 		if (mask || ipt.error)
-			list_del_init(&poll->wait.entry);
+			list_del_init(&poll->wait->entry);
 		else if (cancel)
 			WRITE_ONCE(poll->canceled, true);
 		else if (!poll->done) /* actually waiting for an event */