From 95a5bbae05ef1ec1cceb8c1b04a482aa0b7c177c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Nov 2019 22:40:44 -0700 Subject: [PATCH 01/42] io_uring: io_async_cancel() should pass in 'nxt' request pointer If we have a linked request, this enables us to pass it back directly without having to go through async context. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 4c030a92de79..011281856ff7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2480,7 +2480,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, sqe->cancel_flags) return -EINVAL; - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), NULL); + io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt); return 0; } From 0e0702dac26b282603261f04a62711a2d9aac17b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Nov 2019 21:42:10 -0700 Subject: [PATCH 02/42] io_uring: cleanup return values from the queueing functions __io_queue_sqe(), io_queue_sqe(), io_queue_link_head() all return 0/err, but the caller doesn't care since the errors are handled inline. Clean these up and just make them void. Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 011281856ff7..d877c7f6368e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2840,7 +2840,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req, return nxt; } -static int __io_queue_sqe(struct io_kiocb *req) +static void __io_queue_sqe(struct io_kiocb *req) { enum hrtimer_mode mode; struct io_kiocb *nxt; @@ -2885,7 +2885,7 @@ static int __io_queue_sqe(struct io_kiocb *req) if (nxt) io_queue_linked_timeout(nxt, &ts, &mode); - return 0; + return; } } @@ -2907,11 +2907,9 @@ err: req->flags |= REQ_F_FAIL_LINK; io_put_req(req); } - - return ret; } -static int io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe(struct io_kiocb *req) { int ret; @@ -2921,20 +2919,20 @@ static int io_queue_sqe(struct io_kiocb *req) io_cqring_add_event(req, ret); io_double_put_req(req); } - return 0; - } - - return __io_queue_sqe(req); + } else + __io_queue_sqe(req); } -static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) +static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) { int ret; int need_submit = false; struct io_ring_ctx *ctx = req->ctx; - if (!shadow) - return io_queue_sqe(req); + if (!shadow) { + io_queue_sqe(req); + return; + } /* * Mark the first IO in link list as DRAIN, let all the following @@ -2948,7 +2946,7 @@ static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) io_cqring_add_event(req, ret); io_double_put_req(req); __io_free_req(shadow); - return 0; + return; } } else { /* @@ -2965,9 +2963,7 @@ static int io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) spin_unlock_irq(&ctx->completion_lock); if (need_submit) - return __io_queue_sqe(req); - - return 0; + __io_queue_sqe(req); } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) From 978db57e2c329fc612ff669cab9bf0007efd3ca3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Nov 2019 22:39:04 -0700 Subject: [PATCH 03/42] io_uring: make io_double_put_req() use normal completion path If we don't use the normal completion path, we may skip killing links that should be errored and freed. Add __io_double_put_req() for use within the completion path itself, other calls should just use io_double_put_req(). Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d877c7f6368e..1da103df2bfa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -383,6 +383,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res); static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); +static void __io_double_put_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -916,7 +917,7 @@ static void io_fail_links(struct io_kiocb *req) io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); - io_double_put_req(link); + __io_double_put_req(link); } } @@ -990,13 +991,24 @@ static void io_put_req(struct io_kiocb *req) io_free_req(req); } -static void io_double_put_req(struct io_kiocb *req) +/* + * Must only be used if we don't need to care about links, usually from + * within the completion handling itself. + */ +static void __io_double_put_req(struct io_kiocb *req) { /* drop both submit and complete references */ if (refcount_sub_and_test(2, &req->refs)) __io_free_req(req); } +static void io_double_put_req(struct io_kiocb *req) +{ + /* drop both submit and complete references */ + if (refcount_sub_and_test(2, &req->refs)) + io_free_req(req); +} + static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; From ad8a48acc23cb13cbf4332ebabb867b1baa81842 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Nov 2019 08:49:11 -0700 Subject: [PATCH 04/42] io_uring: make req->timeout be dynamically allocated There are a few reasons for this: - As a prep to improving the linked timeout logic - io_timeout is the biggest member in the io_kiocb opcode union This also enables a few cleanups, like unifying the timer setup between IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT, and not needing multiple arguments to the link/prep helpers. Signed-off-by: Jens Axboe --- fs/io_uring.c | 129 +++++++++++++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 59 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1da103df2bfa..8cd3505d167b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -301,9 +301,16 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_timeout_data { + struct io_kiocb *req; + struct hrtimer timer; + struct timespec64 ts; + enum hrtimer_mode mode; +}; + struct io_timeout { struct file *file; - struct hrtimer timer; + struct io_timeout_data *data; }; /* @@ -573,7 +580,7 @@ static void io_kill_timeout(struct io_kiocb *req) { int ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->timeout.data->timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); @@ -828,6 +835,8 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } + if (req->flags & REQ_F_TIMEOUT) + kfree(req->timeout.data); percpu_ref_put(&ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -840,7 +849,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->timeout.data->timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -2236,12 +2245,12 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { - struct io_ring_ctx *ctx; - struct io_kiocb *req; + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - req = container_of(timer, struct io_kiocb, timeout.timer); - ctx = req->ctx; atomic_inc(&ctx->cq_timeouts); spin_lock_irqsave(&ctx->completion_lock, flags); @@ -2291,7 +2300,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->timeout.timer); + ret = hrtimer_try_to_cancel(&req->timeout.data->timer); if (ret == -1) return -EALREADY; @@ -2331,34 +2340,54 @@ static int io_timeout_remove(struct io_kiocb *req, return 0; } -static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_timeout_setup(struct io_kiocb *req) { - unsigned count; - struct io_ring_ctx *ctx = req->ctx; - struct list_head *entry; - enum hrtimer_mode mode; - struct timespec64 ts; - unsigned span = 0; + const struct io_uring_sqe *sqe = req->submit.sqe; + struct io_timeout_data *data; unsigned flags; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1) + if (sqe->ioprio || sqe->buf_index || sqe->len != 1) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) + data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->req = req; + req->timeout.data = data; + req->flags |= REQ_F_TIMEOUT; + + if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; if (flags & IORING_TIMEOUT_ABS) - mode = HRTIMER_MODE_ABS; + data->mode = HRTIMER_MODE_ABS; else - mode = HRTIMER_MODE_REL; + data->mode = HRTIMER_MODE_REL; - hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, mode); - req->flags |= REQ_F_TIMEOUT; + hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); + return 0; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned count; + struct io_ring_ctx *ctx = req->ctx; + struct io_timeout_data *data; + struct list_head *entry; + unsigned span = 0; + int ret; + + ret = io_timeout_setup(req); + /* common setup allows flags (like links) set, we don't */ + if (!ret && sqe->flags) + ret = -EINVAL; + if (ret) + return ret; /* * sqe->off holds how many events that need to occur for this @@ -2418,8 +2447,9 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->sequence -= span; add: list_add(&req->list, entry); - req->timeout.timer.function = io_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), mode); + data = req->timeout.data; + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); return 0; } @@ -2754,8 +2784,9 @@ static int io_grab_files(struct io_kiocb *req) static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { - struct io_kiocb *req = container_of(timer, struct io_kiocb, - timeout.timer); + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *prev = NULL; unsigned long flags; @@ -2786,9 +2817,9 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, - enum hrtimer_mode *mode) +static void io_queue_linked_timeout(struct io_kiocb *req) { + struct io_timeout_data *data = req->timeout.data; struct io_ring_ctx *ctx = req->ctx; /* @@ -2797,9 +2828,9 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, */ spin_lock_irq(&ctx->completion_lock); if (!list_empty(&req->list)) { - req->timeout.timer.function = io_link_timeout_fn; - hrtimer_start(&req->timeout.timer, timespec64_to_ktime(*ts), - *mode); + data->timer.function = io_link_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), + data->mode); } spin_unlock_irq(&ctx->completion_lock); @@ -2807,22 +2838,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req, struct timespec64 *ts, io_put_req(req); } -static int io_validate_link_timeout(const struct io_uring_sqe *sqe, - struct timespec64 *ts) -{ - if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || sqe->off) - return -EINVAL; - if (sqe->timeout_flags & ~IORING_TIMEOUT_ABS) - return -EINVAL; - if (get_timespec64(ts, u64_to_user_ptr(sqe->addr))) - return -EFAULT; - - return 0; -} - -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req, - struct timespec64 *ts, - enum hrtimer_mode *mode) +static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { struct io_kiocb *nxt; int ret; @@ -2834,7 +2850,10 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req, if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT) return NULL; - ret = io_validate_link_timeout(nxt->submit.sqe, ts); + ret = io_timeout_setup(nxt); + /* common setup allows offset being set, we don't */ + if (!ret && nxt->submit.sqe->off) + ret = -EINVAL; if (ret) { list_del_init(&nxt->list); io_cqring_add_event(nxt, ret); @@ -2842,24 +2861,16 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req, return ERR_PTR(-ECANCELED); } - if (nxt->submit.sqe->timeout_flags & IORING_TIMEOUT_ABS) - *mode = HRTIMER_MODE_ABS; - else - *mode = HRTIMER_MODE_REL; - req->flags |= REQ_F_LINK_TIMEOUT; - hrtimer_init(&nxt->timeout.timer, CLOCK_MONOTONIC, *mode); return nxt; } static void __io_queue_sqe(struct io_kiocb *req) { - enum hrtimer_mode mode; struct io_kiocb *nxt; - struct timespec64 ts; int ret; - nxt = io_prep_linked_timeout(req, &ts, &mode); + nxt = io_prep_linked_timeout(req); if (IS_ERR(nxt)) { ret = PTR_ERR(nxt); nxt = NULL; @@ -2895,7 +2906,7 @@ static void __io_queue_sqe(struct io_kiocb *req) io_queue_async_work(req); if (nxt) - io_queue_linked_timeout(nxt, &ts, &mode); + io_queue_linked_timeout(nxt); return; } @@ -2907,7 +2918,7 @@ err: if (nxt) { if (!ret) - io_queue_linked_timeout(nxt, &ts, &mode); + io_queue_linked_timeout(nxt); else io_put_req(nxt); } From 94ae5e77a9150a8c6c57432e2db290c6868ddfad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Nov 2019 19:39:52 -0700 Subject: [PATCH 05/42] io_uring: fix sequencing issues with linked timeouts We have an issue with timeout links that are deeper in the submit chain, because we only handle it upfront, not from later submissions. Move the prep + issue of the timeout link to the async work prep handler, and do it normally for non-async queue. If we validate and prepare the timeout links upfront when we first see them, there's nothing stopping us from supporting any sort of nesting. Fixes: 2665abfd757f ("io_uring: add support for linked SQE timeouts") Reported-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 102 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 41 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8cd3505d167b..759a30e52fda 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -353,6 +353,7 @@ struct io_kiocb { #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ +#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */ u64 user_data; u32 result; u32 sequence; @@ -391,6 +392,8 @@ static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); +static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); +static void io_queue_linked_timeout(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -529,7 +532,8 @@ static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) opcode == IORING_OP_WRITE_FIXED); } -static inline bool io_prep_async_work(struct io_kiocb *req) +static inline bool io_prep_async_work(struct io_kiocb *req, + struct io_kiocb **link) { bool do_hashed = false; @@ -558,13 +562,17 @@ static inline bool io_prep_async_work(struct io_kiocb *req) req->work.flags |= IO_WQ_WORK_NEEDS_USER; } + *link = io_prep_linked_timeout(req); return do_hashed; } static inline void io_queue_async_work(struct io_kiocb *req) { - bool do_hashed = io_prep_async_work(req); struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *link; + bool do_hashed; + + do_hashed = io_prep_async_work(req, &link); trace_io_uring_queue_async_work(ctx, do_hashed, req, &req->work, req->flags); @@ -574,6 +582,9 @@ static inline void io_queue_async_work(struct io_kiocb *req) io_wq_enqueue_hashed(ctx->io_wq, &req->work, file_inode(req->file)); } + + if (link) + io_queue_linked_timeout(link); } static void io_kill_timeout(struct io_kiocb *req) @@ -875,6 +886,15 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); while (nxt) { list_del_init(&nxt->list); + + if ((req->flags & REQ_F_LINK_TIMEOUT) && + (nxt->flags & REQ_F_TIMEOUT)) { + wake_ev |= io_link_cancel_timeout(nxt); + nxt = list_first_entry_or_null(&req->link_list, + struct io_kiocb, list); + req->flags &= ~REQ_F_LINK_TIMEOUT; + continue; + } if (!list_empty(&req->link_list)) { INIT_LIST_HEAD(&nxt->link_list); list_splice(&req->link_list, &nxt->link_list); @@ -885,19 +905,13 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * If we're in async work, we can continue processing the chain * in this context instead of having to queue up new async work. */ - if (req->flags & REQ_F_LINK_TIMEOUT) { - wake_ev = io_link_cancel_timeout(nxt); - - /* we dropped this link, get next */ - nxt = list_first_entry_or_null(&req->link_list, - struct io_kiocb, list); - } else if (nxtptr && io_wq_current_is_worker()) { - *nxtptr = nxt; - break; - } else { - io_queue_async_work(nxt); - break; + if (nxt) { + if (nxtptr && io_wq_current_is_worker()) + *nxtptr = nxt; + else + io_queue_async_work(nxt); } + break; } if (wake_ev) @@ -916,11 +930,16 @@ static void io_fail_links(struct io_kiocb *req) spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { + const struct io_uring_sqe *sqe_to_free = NULL; + link = list_first_entry(&req->link_list, struct io_kiocb, list); list_del_init(&link->list); trace_io_uring_fail_link(req, link); + if (link->flags & REQ_F_FREE_SQE) + sqe_to_free = link->submit.sqe; + if ((req->flags & REQ_F_LINK_TIMEOUT) && link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); @@ -928,6 +947,7 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_fill_event(link, -ECANCELED); __io_double_put_req(link); } + kfree(sqe_to_free); } io_commit_cqring(ctx); @@ -2683,8 +2703,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr) /* if a dependent link is ready, pass it back */ if (!ret && nxt) { - io_prep_async_work(nxt); + struct io_kiocb *link; + + io_prep_async_work(nxt, &link); *workptr = &nxt->work; + if (link) + io_queue_linked_timeout(link); } } @@ -2819,7 +2843,6 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) static void io_queue_linked_timeout(struct io_kiocb *req) { - struct io_timeout_data *data = req->timeout.data; struct io_ring_ctx *ctx = req->ctx; /* @@ -2828,6 +2851,8 @@ static void io_queue_linked_timeout(struct io_kiocb *req) */ spin_lock_irq(&ctx->completion_lock); if (!list_empty(&req->list)) { + struct io_timeout_data *data = req->timeout.data; + data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); @@ -2841,7 +2866,6 @@ static void io_queue_linked_timeout(struct io_kiocb *req) static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { struct io_kiocb *nxt; - int ret; if (!(req->flags & REQ_F_LINK)) return NULL; @@ -2850,33 +2874,15 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT) return NULL; - ret = io_timeout_setup(nxt); - /* common setup allows offset being set, we don't */ - if (!ret && nxt->submit.sqe->off) - ret = -EINVAL; - if (ret) { - list_del_init(&nxt->list); - io_cqring_add_event(nxt, ret); - io_double_put_req(nxt); - return ERR_PTR(-ECANCELED); - } - req->flags |= REQ_F_LINK_TIMEOUT; return nxt; } static void __io_queue_sqe(struct io_kiocb *req) { - struct io_kiocb *nxt; + struct io_kiocb *nxt = io_prep_linked_timeout(req); int ret; - nxt = io_prep_linked_timeout(req); - if (IS_ERR(nxt)) { - ret = PTR_ERR(nxt); - nxt = NULL; - goto err; - } - ret = __io_submit_sqe(req, NULL, true); /* @@ -2904,10 +2910,6 @@ static void __io_queue_sqe(struct io_kiocb *req) * submit reference when the iocb is actually submitted. */ io_queue_async_work(req); - - if (nxt) - io_queue_linked_timeout(nxt); - return; } } @@ -2952,6 +2954,10 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) int need_submit = false; struct io_ring_ctx *ctx = req->ctx; + if (unlikely(req->flags & REQ_F_FAIL_LINK)) { + ret = -ECANCELED; + goto err; + } if (!shadow) { io_queue_sqe(req); return; @@ -2966,9 +2972,11 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) ret = io_req_defer(req); if (ret) { if (ret != -EIOCBQUEUED) { +err: io_cqring_add_event(req, ret); io_double_put_req(req); - __io_free_req(shadow); + if (shadow) + __io_free_req(shadow); return; } } else { @@ -3025,6 +3033,17 @@ err_req: if (*link) { struct io_kiocb *prev = *link; + if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { + ret = io_timeout_setup(req); + /* common setup allows offset being set, we don't */ + if (!ret && s->sqe->off) + ret = -EINVAL; + if (ret) { + prev->flags |= REQ_F_FAIL_LINK; + goto err_req; + } + } + sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (!sqe_copy) { ret = -EAGAIN; @@ -3032,6 +3051,7 @@ err_req: } s->sqe = sqe_copy; + req->flags |= REQ_F_FREE_SQE; trace_io_uring_link(ctx, req, prev); list_add_tail(&req->list, &prev->link_list); } else if (s->sqe->flags & IOSQE_IO_LINK) { From e0e328c4b330712e45ba799dc589bda751323110 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Nov 2019 17:59:46 -0700 Subject: [PATCH 06/42] io_uring: remove dead REQ_F_SEQ_PREV flag With the conversion to io-wq, we no longer use that flag. Kill it. Fixes: 561fb04a6a22 ("io_uring: replace workqueue usage with io-wq") Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 759a30e52fda..942c04d5e63d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -340,7 +340,6 @@ struct io_kiocb { #define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_SEQ_PREV 8 /* sequential with previous */ #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ From b0dd8a412699afe3420a08f841333f3474ad45c5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Nov 2019 12:14:54 -0700 Subject: [PATCH 07/42] io_uring: correct poll cancel and linked timeout expiration completion Currently a poll request fills a completion entry of 0, even if it got cancelled. This is odd, and it makes it harder to support with chains. Ensure that it returns -ECANCELED in the completions events if it got cancelled, and furthermore ensure that the linked timeout that triggered it completes with -ETIME if we did indeed trigger the completions through a timeout. Signed-off-by: Jens Axboe --- fs/io_uring.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 942c04d5e63d..40c351a9ed26 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2066,12 +2066,15 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void io_poll_complete(struct io_kiocb *req, __poll_t mask) +static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - io_cqring_fill_event(req, mangle_poll(mask)); + if (error) + io_cqring_fill_event(req, error); + else + io_cqring_fill_event(req, mangle_poll(mask)); io_commit_cqring(ctx); } @@ -2084,11 +2087,16 @@ static void io_poll_complete_work(struct io_wq_work **workptr) struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt = NULL; __poll_t mask = 0; + int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) + if (work->flags & IO_WQ_WORK_CANCEL) { WRITE_ONCE(poll->canceled, true); + ret = -ECANCELED; + } else if (READ_ONCE(poll->canceled)) { + ret = -ECANCELED; + } - if (!READ_ONCE(poll->canceled)) + if (ret != -ECANCELED) mask = vfs_poll(poll->file, &pt) & poll->events; /* @@ -2099,13 +2107,13 @@ static void io_poll_complete_work(struct io_wq_work **workptr) * avoid further branches in the fast path. */ spin_lock_irq(&ctx->completion_lock); - if (!mask && !READ_ONCE(poll->canceled)) { + if (!mask && ret != -ECANCELED) { add_wait_queue(poll->head, &poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } io_poll_remove_req(req); - io_poll_complete(req, mask); + io_poll_complete(req, mask, ret); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -2139,7 +2147,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, */ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { io_poll_remove_req(req); - io_poll_complete(req, mask); + io_poll_complete(req, mask, 0); req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2251,7 +2259,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, } if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - io_poll_complete(req, mask); + io_poll_complete(req, mask, 0); } spin_unlock_irq(&ctx->completion_lock); @@ -2503,7 +2511,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) static void io_async_find_and_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req, __u64 sqe_addr, - struct io_kiocb **nxt) + struct io_kiocb **nxt, int success_ret) { unsigned long flags; int ret; @@ -2520,6 +2528,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, goto done; ret = io_poll_cancel(ctx, sqe_addr); done: + if (!ret) + ret = success_ret; io_cqring_fill_event(req, ret); io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2541,7 +2551,7 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, sqe->cancel_flags) return -EINVAL; - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt); + io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); return 0; } @@ -2831,7 +2841,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, NULL); + io_async_find_and_cancel(ctx, req, prev->user_data, NULL, + -ETIME); io_put_req(prev); } else { io_cqring_add_event(req, -ETIME); From fba38c272a0385148935d6443cb9dc68cf1f37a7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Nov 2019 12:27:57 -0700 Subject: [PATCH 08/42] io_uring: request cancellations should break links We currently don't explicitly break links if a request is cancelled, but we should. Add explicitly link breakage for all types of request cancellations that we support. Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 40c351a9ed26..6550b7eab7d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2118,6 +2118,8 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_cqring_ev_posted(ctx); + if (ret < 0 && req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_put_req_find_next(req, &nxt); if (nxt) *workptr = &nxt->work; @@ -2331,6 +2333,8 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -1) return -EALREADY; + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; @@ -2841,6 +2845,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { + if (prev->flags & REQ_F_LINK) + prev->flags |= REQ_F_FAIL_LINK; io_async_find_and_cancel(ctx, req, prev->user_data, NULL, -ETIME); io_put_req(prev); From b60fda6000a99a7ccac36005ab78b14b47c06de3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Nov 2019 08:37:07 -0700 Subject: [PATCH 09/42] io-wq: wait for io_wq_create() to setup necessary workers We currently have a race where if setup is really slow, we can be calling io_wq_destroy() before we're done setting up. This will cause the caller to get stuck waiting for the manager to set things up, but the manager already exited. Fix this by doing a sync setup of the manager. This also fixes the case where if we failed creating workers, we'd also get stuck. In practice this race window was really small, as we already wait for the manager to start. Hence someone would have to call io_wq_destroy() after the task has started, but before it started the first loop. The reported test case forked tons of these, which is why it became an issue. Reported-by: syzbot+0f1cc17f85154f400465@syzkaller.appspotmail.com Fixes: 771b53d033e8 ("io-wq: small threadpool implementation for io_uring") Signed-off-by: Jens Axboe --- fs/io-wq.c | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 9174007ce107..243eb1d4d2bd 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -33,6 +33,7 @@ enum { enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ IO_WQ_BIT_CANCEL = 1, /* cancel work on list */ + IO_WQ_BIT_ERROR = 2, /* error on setup */ }; enum { @@ -562,14 +563,14 @@ void io_wq_worker_sleeping(struct task_struct *tsk) spin_unlock_irq(&wqe->lock); } -static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) { struct io_wqe_acct *acct =&wqe->acct[index]; struct io_worker *worker; worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node); if (!worker) - return; + return false; refcount_set(&worker->ref, 1); worker->nulls_node.pprev = NULL; @@ -581,7 +582,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) "io_wqe_worker-%d/%d", index, wqe->node); if (IS_ERR(worker->task)) { kfree(worker); - return; + return false; } spin_lock_irq(&wqe->lock); @@ -599,6 +600,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) atomic_inc(&wq->user->processes); wake_up_process(worker->task); + return true; } static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) @@ -606,9 +608,6 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) { struct io_wqe_acct *acct = &wqe->acct[index]; - /* always ensure we have one bounded worker */ - if (index == IO_WQ_ACCT_BOUND && !acct->nr_workers) - return true; /* if we have available workers or no work, no need */ if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe)) return false; @@ -621,10 +620,19 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) static int io_wq_manager(void *data) { struct io_wq *wq = data; + int i; + + /* create fixed workers */ + refcount_set(&wq->refs, wq->nr_wqes); + for (i = 0; i < wq->nr_wqes; i++) { + if (create_io_worker(wq, wq->wqes[i], IO_WQ_ACCT_BOUND)) + continue; + goto err; + } + + complete(&wq->done); while (!kthread_should_stop()) { - int i; - for (i = 0; i < wq->nr_wqes; i++) { struct io_wqe *wqe = wq->wqes[i]; bool fork_worker[2] = { false, false }; @@ -645,6 +653,12 @@ static int io_wq_manager(void *data) } return 0; +err: + set_bit(IO_WQ_BIT_ERROR, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + if (refcount_sub_and_test(wq->nr_wqes - i, &wq->refs)) + complete(&wq->done); + return 0; } static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, @@ -982,7 +996,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, wq->user = user; i = 0; - refcount_set(&wq->refs, wq->nr_wqes); for_each_online_node(node) { struct io_wqe *wqe; @@ -1020,14 +1033,22 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); if (!IS_ERR(wq->manager)) { wake_up_process(wq->manager); + wait_for_completion(&wq->done); + if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { + ret = -ENOMEM; + goto err; + } + reinit_completion(&wq->done); return wq; } ret = PTR_ERR(wq->manager); - wq->manager = NULL; -err: complete(&wq->done); - io_wq_destroy(wq); +err: + for (i = 0; i < wq->nr_wqes; i++) + kfree(wq->wqes[i]); + kfree(wq->wqes); + kfree(wq); return ERR_PTR(ret); } @@ -1041,10 +1062,9 @@ void io_wq_destroy(struct io_wq *wq) { int i; - if (wq->manager) { - set_bit(IO_WQ_BIT_EXIT, &wq->state); + set_bit(IO_WQ_BIT_EXIT, &wq->state); + if (wq->manager) kthread_stop(wq->manager); - } rcu_read_lock(); for (i = 0; i < wq->nr_wqes; i++) { From b2e9c7d64b7ecacc1d0f15a6af88a73cab7d8db9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Nov 2019 09:22:16 +0300 Subject: [PATCH 10/42] io-wq: remove extra space characters These lines are indented an extra space character. Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- fs/io-wq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 243eb1d4d2bd..b4bc377dda61 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -328,9 +328,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, * If worker is moving from bound to unbound (or vice versa), then * ensure we update the running accounting. */ - worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; - work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; - if (worker_bound != work_bound) { + worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; + work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; + if (worker_bound != work_bound) { io_wqe_dec_running(wqe, worker); if (work_bound) { worker->flags |= IO_WORKER_F_BOUND; From d3b35796b1e3f118017491d621f624e0de7ff9fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 19 Nov 2019 23:32:48 +0300 Subject: [PATCH 11/42] io_uring: break links for failed defer If io_req_defer() failed, it needs to cancel a dependant link. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6550b7eab7d2..c1226f609e18 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2958,6 +2958,8 @@ static void io_queue_sqe(struct io_kiocb *req) if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_double_put_req(req); } } else @@ -2990,6 +2992,8 @@ static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) if (ret != -EIOCBQUEUED) { err: io_cqring_add_event(req, ret); + if (req->flags & REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; io_double_put_req(req); if (shadow) __io_free_req(shadow); From f70193d6d8cad4cc614223fef349e6ea9d48c61f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 19 Nov 2019 23:32:49 +0300 Subject: [PATCH 12/42] io_uring: remove redundant check Pass any IORING_OP_LINK_TIMEOUT request further, where it will eventually fail in io_issue_sqe(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c1226f609e18..fd60939a8a59 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3079,10 +3079,6 @@ err_req: INIT_LIST_HEAD(&req->link_list); *link = req; - } else if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { - /* Only valid as a linked SQE */ - ret = -EINVAL; - goto err_req; } else { io_queue_sqe(req); } From 09fbb0a83ec6ab5a4037766261c031151985fff6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 19 Nov 2019 23:32:50 +0300 Subject: [PATCH 13/42] io_uring: Fix leaking linked timeouts let have a dependant link: REQ -> LINK_TIMEOUT -> LINK_TIMEOUT 1. submission stage: submission references for REQ and LINK_TIMEOUT are dropped. So, references respectively (1,1,2) 2. io_put(REQ) + FAIL_LINKS stage: calls io_fail_links(), which for all linked timeouts will call cancel_timeout() and drop 1 reference. So, references after: (0,0,1). That's a leak. Make it treat only the first linked timeout as such, and pass others through __io_double_put_req(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index fd60939a8a59..f88e12b971c7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -942,6 +942,7 @@ static void io_fail_links(struct io_kiocb *req) if ((req->flags & REQ_F_LINK_TIMEOUT) && link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); + req->flags &= ~REQ_F_LINK_TIMEOUT; } else { io_cqring_fill_event(link, -ECANCELED); __io_double_put_req(link); From 5d960724b0cb0d12469d1c62912e4a8c09c9fd92 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Nov 2019 15:31:28 -0700 Subject: [PATCH 14/42] io_uring: io_fail_links() should only consider first linked timeout We currently clear the linked timeout field if we cancel such a timeout, but we should only attempt to cancel if it's the first one we see. Others should simply be freed like other requests, as they haven't been started yet. Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f88e12b971c7..09fc29599e96 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -942,12 +942,12 @@ static void io_fail_links(struct io_kiocb *req) if ((req->flags & REQ_F_LINK_TIMEOUT) && link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); - req->flags &= ~REQ_F_LINK_TIMEOUT; } else { io_cqring_fill_event(link, -ECANCELED); __io_double_put_req(link); } kfree(sqe_to_free); + req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); @@ -2837,9 +2837,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) */ if (!list_empty(&req->list)) { prev = list_entry(req->list.prev, struct io_kiocb, link_list); - if (refcount_inc_not_zero(&prev->refs)) + if (refcount_inc_not_zero(&prev->refs)) { list_del_init(&req->list); - else + prev->flags &= ~REQ_F_LINK_TIMEOUT; + } else prev = NULL; } From bbad27b2f622fa26d107f8a72c0cd5cc102dc56e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 19 Nov 2019 23:32:47 +0300 Subject: [PATCH 15/42] io_uring: Always REQ_F_FREE_SQE for allocated sqe Always mark requests with allocated sqe and deallocate it in __io_free_req(). It's easier to follow and doesn't add edge cases. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 49 ++++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 09fc29599e96..6d52a4d643e6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -834,6 +834,8 @@ static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + if (req->flags & REQ_F_FREE_SQE) + kfree(req->submit.sqe); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); if (req->flags & REQ_F_INFLIGHT) { @@ -929,16 +931,11 @@ static void io_fail_links(struct io_kiocb *req) spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { - const struct io_uring_sqe *sqe_to_free = NULL; - link = list_first_entry(&req->link_list, struct io_kiocb, list); list_del_init(&link->list); trace_io_uring_fail_link(req, link); - if (link->flags & REQ_F_FREE_SQE) - sqe_to_free = link->submit.sqe; - if ((req->flags & REQ_F_LINK_TIMEOUT) && link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); @@ -946,7 +943,6 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_fill_event(link, -ECANCELED); __io_double_put_req(link); } - kfree(sqe_to_free); req->flags &= ~REQ_F_LINK_TIMEOUT; } @@ -1089,7 +1085,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * completions for those, only batch free for fixed * file and non-linked commands. */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + if (((req->flags & + (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) == REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) @@ -2582,6 +2579,7 @@ static int io_req_defer(struct io_kiocb *req) } memcpy(sqe_copy, sqe, sizeof(*sqe_copy)); + req->flags |= REQ_F_FREE_SQE; req->submit.sqe = sqe_copy; trace_io_uring_defer(ctx, req, false); @@ -2676,7 +2674,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct sqe_submit *s = &req->submit; - const struct io_uring_sqe *sqe = s->sqe; struct io_kiocb *nxt = NULL; int ret = 0; @@ -2712,9 +2709,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_put_req(req); } - /* async context always use a copy of the sqe */ - kfree(sqe); - /* if a dependent link is ready, pass it back */ if (!ret && nxt) { struct io_kiocb *link; @@ -2913,23 +2907,24 @@ static void __io_queue_sqe(struct io_kiocb *req) struct io_uring_sqe *sqe_copy; sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (sqe_copy) { - s->sqe = sqe_copy; - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { - ret = io_grab_files(req); - if (ret) { - kfree(sqe_copy); - goto err; - } - } + if (!sqe_copy) + goto err; - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req); - return; + s->sqe = sqe_copy; + req->flags |= REQ_F_FREE_SQE; + + if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { + ret = io_grab_files(req); + if (ret) + goto err; } + + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req); + return; } err: @@ -3024,7 +3019,6 @@ err: static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_kiocb **link) { - struct io_uring_sqe *sqe_copy; struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; int ret; @@ -3054,6 +3048,7 @@ err_req: */ if (*link) { struct io_kiocb *prev = *link; + struct io_uring_sqe *sqe_copy; if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { ret = io_timeout_setup(req); From eb065d301e8c83643367bdb0898becc364046bda Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Nov 2019 09:26:29 -0700 Subject: [PATCH 16/42] io_uring: io_allocate_scq_urings() should return a sane state We currently rely on the ring destroy on cleaning things up in case of failure, but io_allocate_scq_urings() can leave things half initialized if only parts of it fails. Be nice and return with either everything setup in success, or return an error with things nicely cleaned up. Reported-by: syzbot+0d818c0d39399188f393@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6d52a4d643e6..55bc890f863d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4583,12 +4583,18 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); - if (size == SIZE_MAX) + if (size == SIZE_MAX) { + io_mem_free(ctx->rings); + ctx->rings = NULL; return -EOVERFLOW; + } ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) + if (!ctx->sq_sqes) { + io_mem_free(ctx->rings); + ctx->rings = NULL; return -ENOMEM; + } return 0; } From 4d7dd462971405c65bfb3821dbb6b9ce13b5e8d6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Nov 2019 13:03:52 -0700 Subject: [PATCH 17/42] io_uring: allow finding next link independent of req reference count We currently try and start the next link when we put the request, and only if we were going to free it. This means that the optimization to continue executing requests from the same context often fails, as we're not putting the final reference. Add REQ_F_LINK_NEXT to keep track of this, and allow io_uring to find the next request more efficiently. Signed-off-by: Jens Axboe --- fs/io_uring.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 55bc890f863d..e9980c584120 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -340,6 +340,7 @@ struct io_kiocb { #define REQ_F_NOWAIT 1 /* must not punt to workers */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ #define REQ_F_FIXED_FILE 4 /* ctx owns file */ +#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ #define REQ_F_IO_DRAIN 16 /* drain existing IO first */ #define REQ_F_IO_DRAINED 32 /* drain done */ #define REQ_F_LINK 64 /* linked sqes */ @@ -879,6 +880,10 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) struct io_kiocb *nxt; bool wake_ev = false; + /* Already got next link */ + if (req->flags & REQ_F_LINK_NEXT) + return; + /* * The list should never be empty when we are called here. But could * potentially happen if the chain is messed up, check to be on the @@ -915,6 +920,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) break; } + req->flags |= REQ_F_LINK_NEXT; if (wake_ev) io_cqring_ev_posted(ctx); } @@ -951,12 +957,10 @@ static void io_fail_links(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) { - if (likely(!(req->flags & REQ_F_LINK))) { - __io_free_req(req); + if (likely(!(req->flags & REQ_F_LINK))) return; - } /* * If LINK is set, we have dependent requests in this chain. If we @@ -982,7 +986,11 @@ static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) } else { io_req_link_next(req, nxt); } +} +static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) +{ + io_req_find_next(req, nxt); __io_free_req(req); } @@ -999,8 +1007,10 @@ static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { struct io_kiocb *nxt = NULL; + io_req_find_next(req, &nxt); + if (refcount_dec_and_test(&req->refs)) - io_free_req_find_next(req, &nxt); + __io_free_req(req); if (nxt) { if (nxtptr) From b76da70fc3759df13e0991706451f1a2e06ba19e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Nov 2019 13:05:32 -0700 Subject: [PATCH 18/42] io_uring: close lookup gap for dependent next work When we find new work to process within the work handler, we queue the linked timeout before we have issued the new work. This can be problematic for very short timeouts, as we have a window where the new work isn't visible. Allow the work handler to store a callback function for this in the work item, and flag it with IO_WQ_WORK_CB if the caller has done so. If that is set, then io-wq will call the callback when it has setup the new work item. Reported-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 3 +++ fs/io-wq.h | 6 +++++- fs/io_uring.c | 16 ++++++++++++++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index b4bc377dda61..9b32b3c811f5 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -427,6 +427,9 @@ next: worker->cur_work = work; spin_unlock_irq(&worker->lock); + if (work->flags & IO_WQ_WORK_CB) + work->func(&work); + if ((work->flags & IO_WQ_WORK_NEEDS_FILES) && current->files != work->files) { task_lock(current); diff --git a/fs/io-wq.h b/fs/io-wq.h index 4b29f922f80c..b68b11bf3633 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -11,6 +11,7 @@ enum { IO_WQ_WORK_NEEDS_FILES = 16, IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_INTERNAL = 64, + IO_WQ_WORK_CB = 128, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -22,7 +23,10 @@ enum io_wq_cancel { }; struct io_wq_work { - struct list_head list; + union { + struct list_head list; + void *data; + }; void (*func)(struct io_wq_work **); unsigned flags; struct files_struct *files; diff --git a/fs/io_uring.c b/fs/io_uring.c index e9980c584120..27fefb52910c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2679,6 +2679,15 @@ static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static void io_link_work_cb(struct io_wq_work **workptr) +{ + struct io_wq_work *work = *workptr; + struct io_kiocb *link = work->data; + + io_queue_linked_timeout(link); + work->func = io_wq_submit_work; +} + static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; @@ -2725,8 +2734,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_prep_async_work(nxt, &link); *workptr = &nxt->work; - if (link) - io_queue_linked_timeout(link); + if (link) { + nxt->work.flags |= IO_WQ_WORK_CB; + nxt->work.func = io_link_work_cb; + nxt->work.data = link; + } } } From 1b4a51b6d03d21f55effbcf609ba5526d87d9e9d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 21 Nov 2019 11:54:28 +0300 Subject: [PATCH 19/42] io_uring: drain next sqe instead of shadowing There's an issue with the shadow drain logic in that we drop the completion lock after deciding to defer a request, then re-grab it later and assume that the state is still the same. In the mean time, someone else completing a request could have found and issued it. This can cause a stall in the queue, by having a shadow request inserted that nobody is going to drain. Additionally, if we fail allocating the shadow request, we simply ignore the drain. Instead of using a shadow request, defer the next request/link instead. This also has the following advantages: - removes semi-duplicated code - doesn't allocate memory for shadows - works better if only the head marked for drain - doesn't need complex synchronisation On the flip side, it removes the shadow->seq == last_drain_in_in_link->seq optimization. That shouldn't be a common case, and can always be added back, if needed. Fixes: 4fe2c963154c ("io_uring: add support for link with drain") Cc: Jackie Liu Reported-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 86 +++++++++++---------------------------------------- 1 file changed, 18 insertions(+), 68 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 27fefb52910c..ca980c5878e9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -186,6 +186,7 @@ struct io_ring_ctx { bool compat; bool account_mem; bool cq_overflow_flushed; + bool drain_next; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -346,7 +347,7 @@ struct io_kiocb { #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ +#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ #define REQ_F_TIMEOUT 1024 /* timeout request */ #define REQ_F_ISREG 2048 /* regular file */ #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ @@ -620,11 +621,6 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { - if (req->flags & REQ_F_SHADOW_DRAIN) { - /* Just for drain, free it. */ - __io_free_req(req); - continue; - } req->flags |= REQ_F_IO_DRAINED; io_queue_async_work(req); } @@ -2973,6 +2969,12 @@ static void io_queue_sqe(struct io_kiocb *req) { int ret; + if (unlikely(req->ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = false; + } + req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); + ret = io_req_defer(req); if (ret) { if (ret != -EIOCBQUEUED) { @@ -2985,57 +2987,16 @@ static void io_queue_sqe(struct io_kiocb *req) __io_queue_sqe(req); } -static void io_queue_link_head(struct io_kiocb *req, struct io_kiocb *shadow) +static inline void io_queue_link_head(struct io_kiocb *req) { - int ret; - int need_submit = false; - struct io_ring_ctx *ctx = req->ctx; - if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - ret = -ECANCELED; - goto err; - } - if (!shadow) { + io_cqring_add_event(req, -ECANCELED); + io_double_put_req(req); + } else io_queue_sqe(req); - return; - } - - /* - * Mark the first IO in link list as DRAIN, let all the following - * IOs enter the defer list. all IO needs to be completed before link - * list. - */ - req->flags |= REQ_F_IO_DRAIN; - ret = io_req_defer(req); - if (ret) { - if (ret != -EIOCBQUEUED) { -err: - io_cqring_add_event(req, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; - io_double_put_req(req); - if (shadow) - __io_free_req(shadow); - return; - } - } else { - /* - * If ret == 0 means that all IOs in front of link io are - * running done. let's queue link head. - */ - need_submit = true; - } - - /* Insert shadow req to defer_list, blocking next IOs */ - spin_lock_irq(&ctx->completion_lock); - trace_io_uring_defer(ctx, shadow, true); - list_add_tail(&shadow->list, &ctx->defer_list); - spin_unlock_irq(&ctx->completion_lock); - - if (need_submit) - __io_queue_sqe(req); } + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, @@ -3072,6 +3033,9 @@ err_req: struct io_kiocb *prev = *link; struct io_uring_sqe *sqe_copy; + if (s->sqe->flags & IOSQE_IO_DRAIN) + (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; + if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { ret = io_timeout_setup(req); /* common setup allows offset being set, we don't */ @@ -3190,7 +3154,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; - struct io_kiocb *shadow_req = NULL; int i, submitted = 0; bool mm_fault = false; @@ -3229,18 +3192,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, sqe_flags = req->submit.sqe->flags; - if (link && (sqe_flags & IOSQE_IO_DRAIN)) { - if (!shadow_req) { - shadow_req = io_get_req(ctx, NULL); - if (unlikely(!shadow_req)) - goto out; - shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); - refcount_dec(&shadow_req->refs); - } - shadow_req->sequence = req->submit.sequence; - } - -out: req->submit.ring_file = ring_file; req->submit.ring_fd = ring_fd; req->submit.has_user = *mm != NULL; @@ -3256,14 +3207,13 @@ out: * that's the end of the chain. Submit the previous link. */ if (!(sqe_flags & IOSQE_IO_LINK) && link) { - io_queue_link_head(link, shadow_req); + io_queue_link_head(link); link = NULL; - shadow_req = NULL; } } if (link) - io_queue_link_head(link, shadow_req); + io_queue_link_head(link); if (statep) io_submit_state_end(&state); From 915967f69c591b34c5a18d6618af021a81ffd700 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Nov 2019 09:01:20 -0700 Subject: [PATCH 20/42] io_uring: improve trace_io_uring_defer() trace point We don't have shadow requests anymore, so get rid of the shadow argument. Add the user_data argument, as that's often useful to easily match up requests, instead of having to look at request pointers. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- include/trace/events/io_uring.h | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ca980c5878e9..736f27808f99 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2588,7 +2588,7 @@ static int io_req_defer(struct io_kiocb *req) req->flags |= REQ_F_FREE_SQE; req->submit.sqe = sqe_copy; - trace_io_uring_defer(ctx, req, false); + trace_io_uring_defer(ctx, req, req->user_data); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); return -EIOCBQUEUED; diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 72a4d0174b02..b352d66b5d51 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -163,35 +163,35 @@ TRACE_EVENT(io_uring_queue_async_work, ); /** - * io_uring_defer_list - called before the io_uring work added into defer_list + * io_uring_defer - called when an io_uring request is deferred * * @ctx: pointer to a ring context structure * @req: pointer to a deferred request - * @shadow: whether request is shadow or not + * @user_data: user data associated with the request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. */ TRACE_EVENT(io_uring_defer, - TP_PROTO(void *ctx, void *req, bool shadow), + TP_PROTO(void *ctx, void *req, unsigned long long user_data), - TP_ARGS(ctx, req, shadow), + TP_ARGS(ctx, req, user_data), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) - __field( bool, shadow ) + __field( unsigned long long, data ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; - __entry->shadow = shadow; + __entry->data = user_data; ), - TP_printk("ring %p, request %p%s", __entry->ctx, __entry->req, - __entry->shadow ? ", shadow": "") + TP_printk("ring %p, request %p user_data %llu", __entry->ctx, + __entry->req, __entry->data) ); /** From d732447fed7d6b4c22907f630cd25d574bae5276 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 21 Nov 2019 21:24:36 +0300 Subject: [PATCH 21/42] io_uring: rename __io_submit_sqe() __io_submit_sqe() is issuing requests, so call it as such. Moreover, it ends by calling io_iopoll_req_issued(). Rename it and make terminology clearer. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 736f27808f99..956a61cb0b13 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2594,8 +2594,8 @@ static int io_req_defer(struct io_kiocb *req) return -EIOCBQUEUED; } -static int __io_submit_sqe(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { int ret, opcode; struct sqe_submit *s = &req->submit; @@ -2702,7 +2702,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; s->in_async = true; do { - ret = __io_submit_sqe(req, &nxt, false); + ret = io_issue_sqe(req, &nxt, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -2913,7 +2913,7 @@ static void __io_queue_sqe(struct io_kiocb *req) struct io_kiocb *nxt = io_prep_linked_timeout(req); int ret; - ret = __io_submit_sqe(req, NULL, true); + ret = io_issue_sqe(req, NULL, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file From 9835d6fafba58e6d9386a6d5af800789bdb52e5b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 21 Nov 2019 21:24:56 +0300 Subject: [PATCH 22/42] io_uring: add likely/unlikely in io_get_sqring() The number of SQEs to submit is specified by a user, so io_get_sqring() in most of the cases succeeds. Hint compilers about that. Checking ASM genereted by gcc 9.2.0 for x64, there is one branch misprediction. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 956a61cb0b13..63e0448f3f8d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3129,11 +3129,11 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) */ head = ctx->cached_sq_head; /* make sure SQ entry isn't read before tail */ - if (head == smp_load_acquire(&rings->sq.tail)) + if (unlikely(head == smp_load_acquire(&rings->sq.tail))) return false; head = READ_ONCE(sq_array[head & ctx->sq_mask]); - if (head < ctx->sq_entries) { + if (likely(head < ctx->sq_entries)) { s->ring_file = NULL; s->sqe = &ctx->sq_sqes[head]; s->sequence = ctx->cached_sq_head; From 70cf9f3270a5c5148e93a526dc1e51965259e70c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 21 Nov 2019 23:21:00 +0300 Subject: [PATCH 23/42] io_uring: remove io_free_req_find_next() There is only one one-liner user of io_free_req_find_next(). Inline it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 63e0448f3f8d..6d8665e8f0e8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -984,15 +984,10 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) } } -static void io_free_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) -{ - io_req_find_next(req, nxt); - __io_free_req(req); -} - static void io_free_req(struct io_kiocb *req) { - io_free_req_find_next(req, NULL); + io_req_find_next(req, NULL); + __io_free_req(req); } /* From 944e58bfeda0e9b97cd611adafc823c78e0bc464 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 21 Nov 2019 23:21:01 +0300 Subject: [PATCH 24/42] io_uring: pass only !null to io_req_find_next() Make io_req_find_next() and io_req_link_next() to accept only non-null nxt, and handle it in callers. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6d8665e8f0e8..95deb45e89cf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -908,7 +908,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * in this context instead of having to queue up new async work. */ if (nxt) { - if (nxtptr && io_wq_current_is_worker()) + if (io_wq_current_is_worker()) *nxtptr = nxt; else io_queue_async_work(nxt); @@ -986,8 +986,13 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt) static void io_free_req(struct io_kiocb *req) { - io_req_find_next(req, NULL); + struct io_kiocb *nxt = NULL; + + io_req_find_next(req, &nxt); __io_free_req(req); + + if (nxt) + io_queue_async_work(nxt); } /* From b18fdf71e01fba29a804d63f8c1e2ed61011170d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 21 Nov 2019 23:21:02 +0300 Subject: [PATCH 25/42] io_uring: simplify io_req_link_next() "if (nxt)" is always true, as it was checked in the while's condition. io_wq_current_is_worker() is unnecessary, as non-async callers don't pass nxt, so io_queue_async_work() will be called for them anyway. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 95deb45e89cf..f6f5871bd7d2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -903,16 +903,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) nxt->flags |= REQ_F_LINK; } - /* - * If we're in async work, we can continue processing the chain - * in this context instead of having to queue up new async work. - */ - if (nxt) { - if (io_wq_current_is_worker()) - *nxtptr = nxt; - else - io_queue_async_work(nxt); - } + *nxtptr = nxt; break; } From f9bd67f69af56d712bfd498f5ad9cf7bb177d600 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 21 Nov 2019 23:21:03 +0300 Subject: [PATCH 26/42] io_uring: only !null ptr to io_issue_sqe() Pass only non-null @nxt to io_issue_sqe() and handle it at the caller's side. And propagate it. - kiocb_done() is only called from io_read() and io_write(), which are only called from io_issue_sqe(), so it's @nxt != NULL - io_put_req_find_next() is called either with explicitly non-null local nxt, or from one of the functions in io_issue_sqe() switch (or their callees). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f6f5871bd7d2..553fa23120e6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -990,21 +990,13 @@ static void io_free_req(struct io_kiocb *req) * Drop reference to request, return next in chain (if there is one) if this * was the last reference to this request. */ +__attribute__((nonnull)) static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { - struct io_kiocb *nxt = NULL; - - io_req_find_next(req, &nxt); + io_req_find_next(req, nxtptr); if (refcount_dec_and_test(&req->refs)) __io_free_req(req); - - if (nxt) { - if (nxtptr) - *nxtptr = nxt; - else - io_queue_async_work(nxt); - } } static void io_put_req(struct io_kiocb *req) @@ -1488,7 +1480,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { - if (in_async && ret >= 0 && nxt && kiocb->ki_complete == io_complete_rw) + if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else io_rw_done(kiocb, ret); @@ -2585,6 +2577,7 @@ static int io_req_defer(struct io_kiocb *req) return -EIOCBQUEUED; } +__attribute__((nonnull)) static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -2901,10 +2894,13 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req) { - struct io_kiocb *nxt = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *nxt = NULL; int ret; - ret = io_issue_sqe(req, NULL, true); + ret = io_issue_sqe(req, &nxt, true); + if (nxt) + io_queue_async_work(nxt); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -2940,11 +2936,11 @@ err: /* drop submission reference */ io_put_req(req); - if (nxt) { + if (linked_timeout) { if (!ret) - io_queue_linked_timeout(nxt); + io_queue_linked_timeout(linked_timeout); else - io_put_req(nxt); + io_put_req(linked_timeout); } /* and drop final reference, if we failed */ From c4a2ed72c9a61594b6afc23e1fbc78878d32b5a3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Nov 2019 21:01:26 -0700 Subject: [PATCH 27/42] io_uring: only return -EBUSY for submit on non-flushed backlog We return -EBUSY on submit when we have a CQ ring overflow backlog, but that can be a bit problematic if the application is using pure userspace poll of the CQ ring. For that case, if the ring briefly overflowed and we have pending entries in the backlog, the submit flushes the backlog successfully but still returns -EBUSY. If we're able to fully flush the CQ ring backlog, let the submission proceed. Reported-by: Dan Melnic Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 553fa23120e6..129723087bad 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -654,7 +654,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } -static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +/* Returns true if there are no backlogged entries after the flush */ +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { struct io_rings *rings = ctx->rings; struct io_uring_cqe *cqe; @@ -664,10 +665,10 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (!force) { if (list_empty_careful(&ctx->cq_overflow_list)) - return; + return true; if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)) - return; + return false; } spin_lock_irqsave(&ctx->completion_lock, flags); @@ -676,6 +677,7 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (force) ctx->cq_overflow_flushed = true; + cqe = NULL; while (!list_empty(&ctx->cq_overflow_list)) { cqe = io_get_cqring(ctx); if (!cqe && !force) @@ -703,6 +705,8 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) list_del(&req->list); io_put_req(req); } + + return cqe != NULL; } static void io_cqring_fill_event(struct io_kiocb *req, long res) @@ -3144,10 +3148,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, int i, submitted = 0; bool mm_fault = false; - if (!list_empty(&ctx->cq_overflow_list)) { - io_cqring_overflow_flush(ctx, false); + /* if we have a backlog and couldn't flush it all, return BUSY */ + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) return -EBUSY; - } if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, ctx, nr); From bd3ded3146daa2cbb57ed353749ef99cf75371b0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 23 Nov 2019 14:17:16 -0700 Subject: [PATCH 28/42] net: add __sys_connect_file() helper This is identical to __sys_connect(), except it takes a struct file instead of an fd, and it also allows passing in extra file->f_flags flags. The latter is done to support masking in O_NONBLOCK without manipulating the original file flags. No functional changes in this patch. Cc: netdev@vger.kernel.org Acked-by: David S. Miller Signed-off-by: Jens Axboe --- include/linux/socket.h | 3 +++ net/socket.c | 30 ++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 09c32a21555b..4bde63021c09 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -399,6 +399,9 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); +extern int __sys_connect_file(struct file *file, + struct sockaddr __user *uservaddr, int addrlen, + int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); diff --git a/net/socket.c b/net/socket.c index 17bc1eee198a..274df4ddfc2c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1825,32 +1825,46 @@ SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr, * include the -EINPROGRESS status for such sockets. */ -int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) +int __sys_connect_file(struct file *file, struct sockaddr __user *uservaddr, + int addrlen, int file_flags) { struct socket *sock; struct sockaddr_storage address; - int err, fput_needed; + int err; - sock = sockfd_lookup_light(fd, &err, &fput_needed); + sock = sock_from_file(file, &err); if (!sock) goto out; err = move_addr_to_kernel(uservaddr, addrlen, &address); if (err < 0) - goto out_put; + goto out; err = security_socket_connect(sock, (struct sockaddr *)&address, addrlen); if (err) - goto out_put; + goto out; err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen, - sock->file->f_flags); -out_put: - fput_light(sock->file, fput_needed); + sock->file->f_flags | file_flags); out: return err; } +int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) +{ + int ret = -EBADF; + struct fd f; + + f = fdget(fd); + if (f.file) { + ret = __sys_connect_file(f.file, uservaddr, addrlen, 0); + if (f.flags) + fput(f.file); + } + + return ret; +} + SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, int, addrlen) { From f8e85cf255ad57d65eeb9a9d0e59e3dec55bdd9e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 23 Nov 2019 14:24:24 -0700 Subject: [PATCH 29/42] io_uring: add support for IORING_OP_CONNECT This allows an application to call connect() in an async fashion. Like other opcodes, we first try a non-blocking connect, then punt to async context if we have to. Note that we can still return -EINPROGRESS, and in that case the caller should use IORING_OP_POLL_ADD to do an async wait for completion of the connect request (just like for regular connect(2), except we can do it async here too). Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 37 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 129723087bad..02254929231b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -550,6 +550,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req, case IORING_OP_RECVMSG: case IORING_OP_ACCEPT: case IORING_OP_POLL_ADD: + case IORING_OP_CONNECT: /* * We know REQ_F_ISREG is not set on some of these * opcodes, but this enables us to keep the check in @@ -1974,6 +1975,38 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } +static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct sockaddr __user *addr; + unsigned file_flags; + int addr_len, ret; + + if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = READ_ONCE(sqe->addr2); + file_flags = force_nonblock ? O_NONBLOCK : 0; + + ret = __sys_connect_file(req->file, addr, addr_len, file_flags); + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret < 0 && (req->flags & REQ_F_LINK)) + req->flags |= REQ_F_FAIL_LINK; + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static inline void io_poll_remove_req(struct io_kiocb *req) { if (!RB_EMPTY_NODE(&req->rb_node)) { @@ -2637,6 +2670,9 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, case IORING_OP_ACCEPT: ret = io_accept(req, s->sqe, nxt, force_nonblock); break; + case IORING_OP_CONNECT: + ret = io_connect(req, s->sqe, nxt, force_nonblock); + break; case IORING_OP_ASYNC_CANCEL: ret = io_async_cancel(req, s->sqe, nxt); break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2a1569211d87..4637ed1d9949 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -73,6 +73,7 @@ struct io_uring_sqe { #define IORING_OP_ACCEPT 13 #define IORING_OP_ASYNC_CANCEL 14 #define IORING_OP_LINK_TIMEOUT 15 +#define IORING_OP_CONNECT 16 /* * sqe->fsync_flags From 311ae9e159d81a1ec1cf645daf40b39ae5a0bd84 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 24 Nov 2019 11:58:24 +0300 Subject: [PATCH 30/42] io_uring: fix dead-hung for non-iter fixed rw Read/write requests to devices without implemented read/write_iter using fixed buffers can cause general protection fault, which totally hangs a machine. io_import_fixed() initialises iov_iter with bvec, but loop_rw_iter() accesses it as iovec, dereferencing random address. kmap() page by page in this case Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 02254929231b..fc832164bbb8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1622,9 +1622,19 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return -EAGAIN; while (iov_iter_count(iter)) { - struct iovec iovec = iov_iter_iovec(iter); + struct iovec iovec; ssize_t nr; + if (!iov_iter_is_bvec(iter)) { + iovec = iov_iter_iovec(iter); + } else { + /* fixed buffers import bvec */ + iovec.iov_base = kmap(iter->bvec->bv_page) + + iter->iov_offset; + iovec.iov_len = min(iter->count, + iter->bvec->bv_len - iter->iov_offset); + } + if (rw == READ) { nr = file->f_op->read(file, iovec.iov_base, iovec.iov_len, &kiocb->ki_pos); @@ -1633,6 +1643,9 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, iovec.iov_len, &kiocb->ki_pos); } + if (iov_iter_is_bvec(iter)) + kunmap(iter->bvec->bv_page); + if (nr < 0) { if (!ret) ret = nr; From 576a347b7af8abfbddc80783fb6629c2894d036e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Nov 2019 08:49:20 -0700 Subject: [PATCH 31/42] io-wq: have io_wq_create() take a 'data' argument We currently pass in 4 arguments outside of the bounded size. In preparation for adding one more argument, let's bundle them up in a struct to make it more readable. No functional changes in this patch. Signed-off-by: Jens Axboe --- fs/io-wq.c | 14 ++++++-------- fs/io-wq.h | 12 +++++++++--- fs/io_uring.c | 9 +++++++-- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 9b32b3c811f5..2b4276990571 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -974,9 +974,7 @@ void io_wq_flush(struct io_wq *wq) } } -struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, - struct user_struct *user, get_work_fn *get_work, - put_work_fn *put_work) +struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, i, node; struct io_wq *wq; @@ -992,11 +990,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, return ERR_PTR(-ENOMEM); } - wq->get_work = get_work; - wq->put_work = put_work; + wq->get_work = data->get_work; + wq->put_work = data->put_work; /* caller must already hold a reference to this */ - wq->user = user; + wq->user = data->user; i = 0; for_each_online_node(node) { @@ -1009,7 +1007,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, wqe->node = node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); - if (user) { + if (wq->user) { wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); } @@ -1031,7 +1029,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, goto err; /* caller must have already done mmgrab() on this mm */ - wq->mm = mm; + wq->mm = data->mm; wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); if (!IS_ERR(wq->manager)) { diff --git a/fs/io-wq.h b/fs/io-wq.h index b68b11bf3633..bb8f1c8f8e24 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -42,9 +42,15 @@ struct io_wq_work { typedef void (get_work_fn)(struct io_wq_work *); typedef void (put_work_fn)(struct io_wq_work *); -struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm, - struct user_struct *user, - get_work_fn *get_work, put_work_fn *put_work); +struct io_wq_data { + struct mm_struct *mm; + struct user_struct *user; + + get_work_fn *get_work; + put_work_fn *put_work; +}; + +struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); diff --git a/fs/io_uring.c b/fs/io_uring.c index fc832164bbb8..fabae84396bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3962,6 +3962,7 @@ static void io_get_work(struct io_wq_work *work) static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_wq_data data; unsigned concurrency; int ret; @@ -4006,10 +4007,14 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } + data.mm = ctx->sqo_mm; + data.user = ctx->user; + data.get_work = io_get_work; + data.put_work = io_put_work; + /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, ctx->sqo_mm, ctx->user, - io_get_work, io_put_work); + ctx->io_wq = io_wq_create(concurrency, &data); if (IS_ERR(ctx->io_wq)) { ret = PTR_ERR(ctx->io_wq); ctx->io_wq = NULL; From 181e448d8709e517c9c7b523fcd209f24eb38ca7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Nov 2019 08:52:30 -0700 Subject: [PATCH 32/42] io_uring: async workers should inherit the user creds If we don't inherit the original task creds, then we can confuse users like fuse that pass creds in the request header. See link below on identical aio issue. Link: https://lore.kernel.org/linux-fsdevel/26f0d78e-99ca-2f1b-78b9-433088053a61@scylladb.com/T/#u Signed-off-by: Jens Axboe --- fs/io-wq.c | 10 ++++++++++ fs/io-wq.h | 1 + fs/io_uring.c | 8 ++++++++ 3 files changed, 19 insertions(+) diff --git a/fs/io-wq.c b/fs/io-wq.c index 2b4276990571..31c5a10b0825 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -57,6 +57,7 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; + const struct cred *creds; struct files_struct *restore_files; }; @@ -111,6 +112,7 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; + struct cred *creds; struct mm_struct *mm; refcount_t refs; struct completion done; @@ -136,6 +138,11 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) { bool dropped_lock = false; + if (worker->creds) { + revert_creds(worker->creds); + worker->creds = NULL; + } + if (current->files != worker->restore_files) { __acquire(&wqe->lock); spin_unlock_irq(&wqe->lock); @@ -442,6 +449,8 @@ next: set_fs(USER_DS); worker->mm = wq->mm; } + if (!worker->creds) + worker->creds = override_creds(wq->creds); if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; if (worker->mm) @@ -995,6 +1004,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) /* caller must already hold a reference to this */ wq->user = data->user; + wq->creds = data->creds; i = 0; for_each_online_node(node) { diff --git a/fs/io-wq.h b/fs/io-wq.h index bb8f1c8f8e24..5cd8c7697e88 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -45,6 +45,7 @@ typedef void (put_work_fn)(struct io_wq_work *); struct io_wq_data { struct mm_struct *mm; struct user_struct *user; + struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; diff --git a/fs/io_uring.c b/fs/io_uring.c index fabae84396bc..b6c6fdc12de7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -237,6 +237,8 @@ struct io_ring_ctx { struct user_struct *user; + struct cred *creds; + /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ struct completion *completions; @@ -3267,6 +3269,7 @@ static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; struct mm_struct *cur_mm = NULL; + const struct cred *old_cred; mm_segment_t old_fs; DEFINE_WAIT(wait); unsigned inflight; @@ -3277,6 +3280,7 @@ static int io_sq_thread(void *data) old_fs = get_fs(); set_fs(USER_DS); + old_cred = override_creds(ctx->creds); ret = timeout = inflight = 0; while (!kthread_should_park()) { @@ -3383,6 +3387,7 @@ static int io_sq_thread(void *data) unuse_mm(cur_mm); mmput(cur_mm); } + revert_creds(old_cred); kthread_parkme(); @@ -4009,6 +4014,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, data.mm = ctx->sqo_mm; data.user = ctx->user; + data.creds = ctx->creds; data.get_work = io_get_work; data.put_work = io_put_work; @@ -4363,6 +4369,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_unaccount_mem(ctx->user, ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); + put_cred(ctx->creds); kfree(ctx->completions); kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); @@ -4715,6 +4722,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ctx->compat = in_compat_syscall(); ctx->account_mem = account_mem; ctx->user = user; + ctx->creds = prepare_creds(); ret = io_allocate_scq_urings(ctx, p); if (ret) From 8042d6ce8c40df0abb0d91662a754d074a3d3f16 Mon Sep 17 00:00:00 2001 From: Hrvoje Zeba Date: Mon, 25 Nov 2019 14:40:22 -0500 Subject: [PATCH 33/42] io_uring: remove superfluous check for sqe->off in io_accept() This field contains a pointer to addrlen and checking to see if it's set returns -EINVAL if the caller sets addr & addrlen pointers. Fixes: 17f2fe35d080 ("io_uring: add support for IORING_OP_ACCEPT") Signed-off-by: Hrvoje Zeba Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b6c6fdc12de7..7412fdefa35a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1965,7 +1965,7 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); From 4257c8ca13b084550574b8c9a667d9c90ff746eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Nov 2019 14:27:34 -0700 Subject: [PATCH 34/42] net: separate out the msghdr copy from ___sys_{send,recv}msg() This is in preparation for enabling the io_uring helpers for sendmsg and recvmsg to first copy the header for validation before continuing with the operation. There should be no functional changes in this patch. Acked-by: David S. Miller Signed-off-by: Jens Axboe --- net/socket.c | 141 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 46 deletions(-) diff --git a/net/socket.c b/net/socket.c index 274df4ddfc2c..98568ae58bad 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2264,15 +2264,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, return err < 0 ? err : 0; } -static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, - struct msghdr *msg_sys, unsigned int flags, - struct used_address *used_address, - unsigned int allowed_msghdr_flags) +static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, + unsigned int flags, struct used_address *used_address, + unsigned int allowed_msghdr_flags) { - struct compat_msghdr __user *msg_compat = - (struct compat_msghdr __user *)msg; - struct sockaddr_storage address; - struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; unsigned char ctl[sizeof(struct cmsghdr) + 20] __aligned(sizeof(__kernel_size_t)); /* 20 is size of ipv6_pktinfo */ @@ -2280,19 +2275,10 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, int ctl_len; ssize_t err; - msg_sys->msg_name = &address; - - if (MSG_CMSG_COMPAT & flags) - err = get_compat_msghdr(msg_sys, msg_compat, NULL, &iov); - else - err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov); - if (err < 0) - return err; - err = -ENOBUFS; if (msg_sys->msg_controllen > INT_MAX) - goto out_freeiov; + goto out; flags |= (msg_sys->msg_flags & allowed_msghdr_flags); ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { @@ -2300,7 +2286,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, sizeof(ctl)); if (err) - goto out_freeiov; + goto out; ctl_buf = msg_sys->msg_control; ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { @@ -2309,7 +2295,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); if (ctl_buf == NULL) - goto out_freeiov; + goto out; } err = -EFAULT; /* @@ -2355,7 +2341,47 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); -out_freeiov: +out: + return err; +} + +static int sendmsg_copy_msghdr(struct msghdr *msg, + struct user_msghdr __user *umsg, unsigned flags, + struct iovec **iov) +{ + int err; + + if (flags & MSG_CMSG_COMPAT) { + struct compat_msghdr __user *msg_compat; + + msg_compat = (struct compat_msghdr __user *) umsg; + err = get_compat_msghdr(msg, msg_compat, NULL, iov); + } else { + err = copy_msghdr_from_user(msg, umsg, NULL, iov); + } + if (err < 0) + return err; + + return 0; +} + +static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, + struct msghdr *msg_sys, unsigned int flags, + struct used_address *used_address, + unsigned int allowed_msghdr_flags) +{ + struct sockaddr_storage address; + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; + ssize_t err; + + msg_sys->msg_name = &address; + + err = sendmsg_copy_msghdr(msg_sys, msg, flags, &iov); + if (err < 0) + return err; + + err = ____sys_sendmsg(sock, msg_sys, flags, used_address, + allowed_msghdr_flags); kfree(iov); return err; } @@ -2474,33 +2500,41 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, return __sys_sendmmsg(fd, mmsg, vlen, flags, true); } -static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, - struct msghdr *msg_sys, unsigned int flags, int nosec) +static int recvmsg_copy_msghdr(struct msghdr *msg, + struct user_msghdr __user *umsg, unsigned flags, + struct sockaddr __user **uaddr, + struct iovec **iov) +{ + ssize_t err; + + if (MSG_CMSG_COMPAT & flags) { + struct compat_msghdr __user *msg_compat; + + msg_compat = (struct compat_msghdr __user *) umsg; + err = get_compat_msghdr(msg, msg_compat, uaddr, iov); + } else { + err = copy_msghdr_from_user(msg, umsg, uaddr, iov); + } + if (err < 0) + return err; + + return 0; +} + +static int ____sys_recvmsg(struct socket *sock, struct msghdr *msg_sys, + struct user_msghdr __user *msg, + struct sockaddr __user *uaddr, + unsigned int flags, int nosec) { struct compat_msghdr __user *msg_compat = - (struct compat_msghdr __user *)msg; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov = iovstack; + (struct compat_msghdr __user *) msg; + int __user *uaddr_len = COMPAT_NAMELEN(msg); + struct sockaddr_storage addr; unsigned long cmsg_ptr; int len; ssize_t err; - /* kernel mode address */ - struct sockaddr_storage addr; - - /* user mode address pointers */ - struct sockaddr __user *uaddr; - int __user *uaddr_len = COMPAT_NAMELEN(msg); - msg_sys->msg_name = &addr; - - if (MSG_CMSG_COMPAT & flags) - err = get_compat_msghdr(msg_sys, msg_compat, &uaddr, &iov); - else - err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov); - if (err < 0) - return err; - cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); @@ -2511,7 +2545,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, flags |= MSG_DONTWAIT; err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, flags); if (err < 0) - goto out_freeiov; + goto out; len = err; if (uaddr != NULL) { @@ -2519,12 +2553,12 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, msg_sys->msg_namelen, uaddr, uaddr_len); if (err < 0) - goto out_freeiov; + goto out; } err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(msg)); if (err) - goto out_freeiov; + goto out; if (MSG_CMSG_COMPAT & flags) err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg_compat->msg_controllen); @@ -2532,10 +2566,25 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr, &msg->msg_controllen); if (err) - goto out_freeiov; + goto out; err = len; +out: + return err; +} -out_freeiov: +static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, + struct msghdr *msg_sys, unsigned int flags, int nosec) +{ + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; + /* user mode address pointers */ + struct sockaddr __user *uaddr; + ssize_t err; + + err = recvmsg_copy_msghdr(msg_sys, msg, flags, &uaddr, &iov); + if (err < 0) + return err; + + err = ____sys_recvmsg(sock, msg_sys, msg, uaddr, flags, nosec); kfree(iov); return err; } From d69e07793f891524c6bbf1e75b9ae69db4450953 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Nov 2019 17:04:13 -0700 Subject: [PATCH 35/42] net: disallow ancillary data for __sys_{send,recv}msg_file() Only io_uring uses (and added) these, and we want to disallow the use of sendmsg/recvmsg for anything but regular data transfers. Use the newly added prep helper to split the msghdr copy out from the core function, to check for msg_control and msg_controllen settings. If either is set, we return -EINVAL. Acked-by: David S. Miller Signed-off-by: Jens Axboe --- net/socket.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/net/socket.c b/net/socket.c index 98568ae58bad..c78c3d37c884 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2389,12 +2389,27 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, /* * BSD sendmsg interface */ -long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg, +long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *umsg, unsigned int flags) { - struct msghdr msg_sys; + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; + struct sockaddr_storage address; + struct msghdr msg = { .msg_name = &address }; + ssize_t err; - return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); + err = sendmsg_copy_msghdr(&msg, umsg, flags, &iov); + if (err) + return err; + /* disallow ancillary data requests from this path */ + if (msg.msg_control || msg.msg_controllen) { + err = -EINVAL; + goto out; + } + + err = ____sys_sendmsg(sock, &msg, flags, NULL, 0); +out: + kfree(iov); + return err; } long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, @@ -2593,12 +2608,28 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, * BSD recvmsg interface */ -long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg, +long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *umsg, unsigned int flags) { - struct msghdr msg_sys; + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; + struct sockaddr_storage address; + struct msghdr msg = { .msg_name = &address }; + struct sockaddr __user *uaddr; + ssize_t err; - return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); + err = recvmsg_copy_msghdr(&msg, umsg, flags, &uaddr, &iov); + if (err) + return err; + /* disallow ancillary data requests from this path */ + if (msg.msg_control || msg.msg_controllen) { + err = -EINVAL; + goto out; + } + + err = ____sys_recvmsg(sock, &msg, umsg, uaddr, flags, 0); +out: + kfree(iov); + return err; } long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, From cc42e0ac17d3664a70e020dfe7897f14e7aa7453 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 25 Nov 2019 23:14:38 +0300 Subject: [PATCH 36/42] io_uring: store timeout's sqe->off in proper place Timeouts' sequence offset (i.e. sqe->off) is stored in req->submit.sequence under a false name. Keep it in timeout.data instead. The unused space for sequence will be reclaimed in the following patches. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7412fdefa35a..39409934e9e6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -309,6 +309,7 @@ struct io_timeout_data { struct hrtimer timer; struct timespec64 ts; enum hrtimer_mode mode; + u32 seq_offset; }; struct io_timeout { @@ -2473,8 +2474,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->sequence = ctx->cached_sq_head + count - 1; - /* reuse it to store the count */ - req->submit.sequence = count; + req->timeout.data->seq_offset = count; /* * Insertion sort, ensuring the first entry in the list is always @@ -2485,6 +2485,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); unsigned nxt_sq_head; long long tmp, tmp_nxt; + u32 nxt_offset = nxt->timeout.data->seq_offset; if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) continue; @@ -2494,8 +2495,8 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) * long to store it. */ tmp = (long long)ctx->cached_sq_head + count - 1; - nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1; - tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1; + nxt_sq_head = nxt->sequence - nxt_offset + 1; + tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1; /* * cached_sq_head may overflow, and it will never overflow twice From cf6fd4bd559ee61a4454b161863c8de6f30f8dca Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 25 Nov 2019 23:14:39 +0300 Subject: [PATCH 37/42] io_uring: inline struct sqe_submit There is no point left in keeping struct sqe_submit. Inline it into struct io_kiocb, so any req->submit.field is now just req->field - moves initialisation of ring_file into io_get_req() - removes duplicated req->sequence. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 169 +++++++++++++++++++++++--------------------------- 1 file changed, 78 insertions(+), 91 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 39409934e9e6..c846605b8361 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -281,16 +281,6 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; }; -struct sqe_submit { - const struct io_uring_sqe *sqe; - struct file *ring_file; - int ring_fd; - u32 sequence; - bool has_user; - bool in_async; - bool needs_fixed_file; -}; - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in @@ -331,7 +321,12 @@ struct io_kiocb { struct io_timeout timeout; }; - struct sqe_submit submit; + const struct io_uring_sqe *sqe; + struct file *ring_file; + int ring_fd; + bool has_user; + bool in_async; + bool needs_fixed_file; struct io_ring_ctx *ctx; union { @@ -541,8 +536,8 @@ static inline bool io_prep_async_work(struct io_kiocb *req, { bool do_hashed = false; - if (req->submit.sqe) { - switch (req->submit.sqe->opcode) { + if (req->sqe) { + switch (req->sqe->opcode) { case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: do_hashed = true; @@ -563,7 +558,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; break; } - if (io_sqe_needs_user(req->submit.sqe)) + if (io_sqe_needs_user(req->sqe)) req->work.flags |= IO_WQ_WORK_NEEDS_USER; } @@ -810,6 +805,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, } got_it: + req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -840,7 +836,7 @@ static void __io_free_req(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; if (req->flags & REQ_F_FREE_SQE) - kfree(req->submit.sqe); + kfree(req->sqe); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); if (req->flags & REQ_F_INFLIGHT) { @@ -938,7 +934,7 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->submit.sqe->opcode == IORING_OP_LINK_TIMEOUT) { + link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); @@ -1401,7 +1397,7 @@ static bool io_file_supports_async(struct file *file) static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) { - const struct io_uring_sqe *sqe = req->submit.sqe; + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; unsigned ioprio; @@ -1568,11 +1564,10 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, return len; } -static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, - const struct sqe_submit *s, struct iovec **iovec, - struct iov_iter *iter) +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct iov_iter *iter) { - const struct io_uring_sqe *sqe = s->sqe; + const struct io_uring_sqe *sqe = req->sqe; void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); size_t sqe_len = READ_ONCE(sqe->len); u8 opcode; @@ -1588,16 +1583,16 @@ static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, opcode = READ_ONCE(sqe->opcode); if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); + ssize_t ret = io_import_fixed(req->ctx, rw, sqe, iter); *iovec = NULL; return ret; } - if (!s->has_user) + if (!req->has_user) return -EFAULT; #ifdef CONFIG_COMPAT - if (ctx->compat) + if (req->ctx->compat) return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); #endif @@ -1681,7 +1676,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - ret = io_import_iovec(req->ctx, READ, &req->submit, &iovec, &iter); + ret = io_import_iovec(READ, req, &iovec, &iter); if (ret < 0) return ret; @@ -1713,7 +1708,7 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) - kiocb_done(kiocb, ret2, nxt, req->submit.in_async); + kiocb_done(kiocb, ret2, nxt, req->in_async); else ret = -EAGAIN; } @@ -1739,7 +1734,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; - ret = io_import_iovec(req->ctx, WRITE, &req->submit, &iovec, &iter); + ret = io_import_iovec(WRITE, req, &iovec, &iter); if (ret < 0) return ret; @@ -1776,7 +1771,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) - kiocb_done(kiocb, ret2, nxt, req->submit.in_async); + kiocb_done(kiocb, ret2, nxt, req->in_async); else ret = -EAGAIN; } @@ -2259,7 +2254,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; - req->submit.sqe = NULL; + req->sqe = NULL; INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; @@ -2413,7 +2408,7 @@ static int io_timeout_remove(struct io_kiocb *req, static int io_timeout_setup(struct io_kiocb *req) { - const struct io_uring_sqe *sqe = req->submit.sqe; + const struct io_uring_sqe *sqe = req->sqe; struct io_timeout_data *data; unsigned flags; @@ -2601,7 +2596,6 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, static int io_req_defer(struct io_kiocb *req) { - const struct io_uring_sqe *sqe = req->submit.sqe; struct io_uring_sqe *sqe_copy; struct io_ring_ctx *ctx = req->ctx; @@ -2620,9 +2614,9 @@ static int io_req_defer(struct io_kiocb *req) return 0; } - memcpy(sqe_copy, sqe, sizeof(*sqe_copy)); + memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy)); req->flags |= REQ_F_FREE_SQE; - req->submit.sqe = sqe_copy; + req->sqe = sqe_copy; trace_io_uring_defer(ctx, req, req->user_data); list_add_tail(&req->list, &ctx->defer_list); @@ -2635,21 +2629,20 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { int ret, opcode; - struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; - opcode = READ_ONCE(s->sqe->opcode); + opcode = READ_ONCE(req->sqe->opcode); switch (opcode) { case IORING_OP_NOP: ret = io_nop(req); break; case IORING_OP_READV: - if (unlikely(s->sqe->buf_index)) + if (unlikely(req->sqe->buf_index)) return -EINVAL; ret = io_read(req, nxt, force_nonblock); break; case IORING_OP_WRITEV: - if (unlikely(s->sqe->buf_index)) + if (unlikely(req->sqe->buf_index)) return -EINVAL; ret = io_write(req, nxt, force_nonblock); break; @@ -2660,37 +2653,37 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, s->sqe, nxt, force_nonblock); + ret = io_fsync(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, s->sqe, nxt); + ret = io_poll_add(req, req->sqe, nxt); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, s->sqe); + ret = io_poll_remove(req, req->sqe); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, s->sqe, nxt, force_nonblock); + ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, s->sqe, nxt, force_nonblock); + ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, s->sqe, nxt, force_nonblock); + ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req, s->sqe); + ret = io_timeout(req, req->sqe); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, s->sqe); + ret = io_timeout_remove(req, req->sqe); break; case IORING_OP_ACCEPT: - ret = io_accept(req, s->sqe, nxt, force_nonblock); + ret = io_accept(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_CONNECT: - ret = io_connect(req, s->sqe, nxt, force_nonblock); + ret = io_connect(req, req->sqe, nxt, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, s->sqe, nxt); + ret = io_async_cancel(req, req->sqe, nxt); break; default: ret = -EINVAL; @@ -2705,10 +2698,10 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, return -EAGAIN; /* workqueue context doesn't hold uring_lock, grab it now */ - if (s->in_async) + if (req->in_async) mutex_lock(&ctx->uring_lock); io_iopoll_req_issued(req); - if (s->in_async) + if (req->in_async) mutex_unlock(&ctx->uring_lock); } @@ -2728,7 +2721,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct sqe_submit *s = &req->submit; struct io_kiocb *nxt = NULL; int ret = 0; @@ -2739,8 +2731,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr) ret = -ECANCELED; if (!ret) { - s->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; - s->in_async = true; + req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; + req->in_async = true; do { ret = io_issue_sqe(req, &nxt, false); /* @@ -2806,24 +2798,17 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) { - struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd; - flags = READ_ONCE(s->sqe->flags); - fd = READ_ONCE(s->sqe->fd); + flags = READ_ONCE(req->sqe->flags); + fd = READ_ONCE(req->sqe->fd); if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = s->sequence; - if (!io_op_needs_file(s->sqe)) + if (!io_op_needs_file(req->sqe)) return 0; if (flags & IOSQE_FIXED_FILE) { @@ -2836,7 +2821,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) return -EBADF; req->flags |= REQ_F_FIXED_FILE; } else { - if (s->needs_fixed_file) + if (req->needs_fixed_file) return -EBADF; trace_io_uring_file_get(ctx, fd); req->file = io_file_get(state, fd); @@ -2860,7 +2845,7 @@ static int io_grab_files(struct io_kiocb *req) * the fd has changed since we started down this path, and disallow * this operation if it has. */ - if (fcheck(req->submit.ring_fd) == req->submit.ring_file) { + if (fcheck(req->ring_fd) == req->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; req->work.files = current->files; @@ -2941,7 +2926,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) return NULL; nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - if (!nxt || nxt->submit.sqe->opcode != IORING_OP_LINK_TIMEOUT) + if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) return NULL; req->flags |= REQ_F_LINK_TIMEOUT; @@ -2964,14 +2949,13 @@ static void __io_queue_sqe(struct io_kiocb *req) */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - struct sqe_submit *s = &req->submit; struct io_uring_sqe *sqe_copy; - sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); + sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (!sqe_copy) goto err; - s->sqe = sqe_copy; + req->sqe = sqe_copy; req->flags |= REQ_F_FREE_SQE; if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { @@ -3045,14 +3029,13 @@ static inline void io_queue_link_head(struct io_kiocb *req) static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_kiocb **link) { - struct sqe_submit *s = &req->submit; struct io_ring_ctx *ctx = req->ctx; int ret; - req->user_data = s->sqe->user_data; + req->user_data = req->sqe->user_data; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } @@ -3076,13 +3059,13 @@ err_req: struct io_kiocb *prev = *link; struct io_uring_sqe *sqe_copy; - if (s->sqe->flags & IOSQE_IO_DRAIN) + if (req->sqe->flags & IOSQE_IO_DRAIN) (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - if (READ_ONCE(s->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { + if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { ret = io_timeout_setup(req); /* common setup allows offset being set, we don't */ - if (!ret && s->sqe->off) + if (!ret && req->sqe->off) ret = -EINVAL; if (ret) { prev->flags |= REQ_F_FAIL_LINK; @@ -3090,17 +3073,17 @@ err_req: } } - sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); + sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (!sqe_copy) { ret = -EAGAIN; goto err_req; } - s->sqe = sqe_copy; + req->sqe = sqe_copy; req->flags |= REQ_F_FREE_SQE; trace_io_uring_link(ctx, req, prev); list_add_tail(&req->list, &prev->link_list); - } else if (s->sqe->flags & IOSQE_IO_LINK) { + } else if (req->sqe->flags & IOSQE_IO_LINK) { req->flags |= REQ_F_LINK; INIT_LIST_HEAD(&req->link_list); @@ -3156,7 +3139,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) +static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; @@ -3177,9 +3160,13 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) head = READ_ONCE(sq_array[head & ctx->sq_mask]); if (likely(head < ctx->sq_entries)) { - s->ring_file = NULL; - s->sqe = &ctx->sq_sqes[head]; - s->sequence = ctx->cached_sq_head; + /* + * All io need record the previous position, if LINK vs DARIN, + * it can be used to mark the position of the first IO in the + * link list. + */ + req->sequence = ctx->cached_sq_head; + req->sqe = &ctx->sq_sqes[head]; ctx->cached_sq_head++; return true; } @@ -3220,12 +3207,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, &req->submit)) { + if (!io_get_sqring(ctx, req)) { __io_free_req(req); break; } - if (io_sqe_needs_user(req->submit.sqe) && !*mm) { + if (io_sqe_needs_user(req->sqe) && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3233,14 +3220,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - sqe_flags = req->submit.sqe->flags; + sqe_flags = req->sqe->flags; - req->submit.ring_file = ring_file; - req->submit.ring_fd = ring_fd; - req->submit.has_user = *mm != NULL; - req->submit.in_async = async; - req->submit.needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->submit.sqe->user_data, + req->ring_file = ring_file; + req->ring_fd = ring_fd; + req->has_user = *mm != NULL; + req->in_async = async; + req->needs_fixed_file = async; + trace_io_uring_submit_sqe(ctx, req->sqe->user_data, true, async); io_submit_sqe(req, statep, &link); submitted++; From 7d009165550adc64e3561c65ecce564125052e00 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 25 Nov 2019 23:14:40 +0300 Subject: [PATCH 38/42] io_uring: cleanup io_import_fixed() Clean io_import_fixed() call site and make it return proper type. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c846605b8361..e44b0e01f1b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1490,9 +1490,9 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, io_rw_done(kiocb, ret); } -static int io_import_fixed(struct io_ring_ctx *ctx, int rw, - const struct io_uring_sqe *sqe, - struct iov_iter *iter) +static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, + const struct io_uring_sqe *sqe, + struct iov_iter *iter) { size_t len = READ_ONCE(sqe->len); struct io_mapped_ubuf *imu; @@ -1581,11 +1581,9 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, * flag. */ opcode = READ_ONCE(sqe->opcode); - if (opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED) { - ssize_t ret = io_import_fixed(req->ctx, rw, sqe, iter); + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; - return ret; + return io_import_fixed(req->ctx, rw, sqe, iter); } if (!req->has_user) From ad6e005ca68de7af76f9ed3e4c9b6f0aa2f842e3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 26 Nov 2019 17:39:45 +0100 Subject: [PATCH 39/42] io_uring: use kzalloc instead of kcalloc for single-element allocations These allocations are single-element allocations, so don't use the array allocation wrapper for them. Signed-off-by: Jann Horn Signed-off-by: Jens Axboe --- fs/io-wq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 31c5a10b0825..dffea3c25a6a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -580,7 +580,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) struct io_wqe_acct *acct =&wqe->acct[index]; struct io_worker *worker; - worker = kcalloc_node(1, sizeof(*worker), GFP_KERNEL, wqe->node); + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); if (!worker) return false; @@ -988,7 +988,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret = -ENOMEM, i, node; struct io_wq *wq; - wq = kcalloc(1, sizeof(*wq), GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); @@ -1010,7 +1010,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) for_each_online_node(node) { struct io_wqe *wqe; - wqe = kcalloc_node(1, sizeof(struct io_wqe), GFP_KERNEL, node); + wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node); if (!wqe) break; wq->wqes[i] = wqe; From 3fc50ab559f5ae400aa33bd0836b3602da7fa51b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 26 Nov 2019 19:10:20 +0100 Subject: [PATCH 40/42] io-wq: fix handling of NUMA node IDs There are several things that can go wrong in the current code on NUMA systems, especially if not all nodes are online all the time: - If the identifiers of the online nodes do not form a single contiguous block starting at zero, wq->wqes will be too small, and OOB memory accesses will occur e.g. in the loop in io_wq_create(). - If a node comes online between the call to num_online_nodes() and the for_each_node() loop in io_wq_create(), an OOB write will occur. - If a node comes online between io_wq_create() and io_wq_enqueue(), a lookup is performed for an element that doesn't exist, and an OOB read will probably occur. Fix it by: - using nr_node_ids instead of num_online_nodes() for the allocation size; nr_node_ids is calculated by setup_nr_node_ids() to be bigger than the highest node ID that could possibly come online at some point, even if those nodes' identifiers are not a contiguous block - creating workers for all possible CPUs, not just all online ones This is basically what the normal workqueue code also does, as far as I can tell. Signed-off-by: Jann Horn Signed-off-by: Jens Axboe --- fs/io-wq.c | 80 +++++++++++++++++++++++------------------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index dffea3c25a6a..d3e8907cc182 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -105,7 +105,6 @@ struct io_wqe { struct io_wq { struct io_wqe **wqes; unsigned long state; - unsigned nr_wqes; get_work_fn *get_work; put_work_fn *put_work; @@ -632,21 +631,22 @@ static inline bool io_wqe_need_worker(struct io_wqe *wqe, int index) static int io_wq_manager(void *data) { struct io_wq *wq = data; - int i; + int workers_to_create = num_possible_nodes(); + int node; /* create fixed workers */ - refcount_set(&wq->refs, wq->nr_wqes); - for (i = 0; i < wq->nr_wqes; i++) { - if (create_io_worker(wq, wq->wqes[i], IO_WQ_ACCT_BOUND)) - continue; - goto err; + refcount_set(&wq->refs, workers_to_create); + for_each_node(node) { + if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) + goto err; + workers_to_create--; } complete(&wq->done); while (!kthread_should_stop()) { - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; bool fork_worker[2] = { false, false }; spin_lock_irq(&wqe->lock); @@ -668,7 +668,7 @@ static int io_wq_manager(void *data) err: set_bit(IO_WQ_BIT_ERROR, &wq->state); set_bit(IO_WQ_BIT_EXIT, &wq->state); - if (refcount_sub_and_test(wq->nr_wqes - i, &wq->refs)) + if (refcount_sub_and_test(workers_to_create, &wq->refs)) complete(&wq->done); return 0; } @@ -776,7 +776,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, void io_wq_cancel_all(struct io_wq *wq) { - int i; + int node; set_bit(IO_WQ_BIT_CANCEL, &wq->state); @@ -785,8 +785,8 @@ void io_wq_cancel_all(struct io_wq *wq) * to a worker and the worker putting itself on the busy_list */ rcu_read_lock(); - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; io_wq_for_each_worker(wqe, io_wqe_worker_send_sig, NULL); } @@ -859,10 +859,10 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, void *data) { enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; - int i; + int node; - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; ret = io_wqe_cancel_cb_work(wqe, cancel, data); if (ret != IO_WQ_CANCEL_NOTFOUND) @@ -936,10 +936,10 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) { enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; - int i; + int node; - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; ret = io_wqe_cancel_work(wqe, cwork); if (ret != IO_WQ_CANCEL_NOTFOUND) @@ -970,10 +970,10 @@ static void io_wq_flush_func(struct io_wq_work **workptr) void io_wq_flush(struct io_wq *wq) { struct io_wq_flush_data data; - int i; + int node; - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; init_completion(&data.done); INIT_IO_WORK(&data.work, io_wq_flush_func); @@ -985,15 +985,14 @@ void io_wq_flush(struct io_wq *wq) struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { - int ret = -ENOMEM, i, node; + int ret = -ENOMEM, node; struct io_wq *wq; wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); - wq->nr_wqes = num_online_nodes(); - wq->wqes = kcalloc(wq->nr_wqes, sizeof(struct io_wqe *), GFP_KERNEL); + wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL); if (!wq->wqes) { kfree(wq); return ERR_PTR(-ENOMEM); @@ -1006,14 +1005,13 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wq->user = data->user; wq->creds = data->creds; - i = 0; - for_each_online_node(node) { + for_each_node(node) { struct io_wqe *wqe; wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node); if (!wqe) - break; - wq->wqes[i] = wqe; + goto err; + wq->wqes[node] = wqe; wqe->node = node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); @@ -1029,15 +1027,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1); INIT_LIST_HEAD(&wqe->all_list); - - i++; } init_completion(&wq->done); - if (i != wq->nr_wqes) - goto err; - /* caller must have already done mmgrab() on this mm */ wq->mm = data->mm; @@ -1056,8 +1049,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ret = PTR_ERR(wq->manager); complete(&wq->done); err: - for (i = 0; i < wq->nr_wqes; i++) - kfree(wq->wqes[i]); + for_each_node(node) + kfree(wq->wqes[node]); kfree(wq->wqes); kfree(wq); return ERR_PTR(ret); @@ -1071,26 +1064,21 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) void io_wq_destroy(struct io_wq *wq) { - int i; + int node; set_bit(IO_WQ_BIT_EXIT, &wq->state); if (wq->manager) kthread_stop(wq->manager); rcu_read_lock(); - for (i = 0; i < wq->nr_wqes; i++) { - struct io_wqe *wqe = wq->wqes[i]; - - if (!wqe) - continue; - io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - } + for_each_node(node) + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); rcu_read_unlock(); wait_for_completion(&wq->done); - for (i = 0; i < wq->nr_wqes; i++) - kfree(wq->wqes[i]); + for_each_node(node) + kfree(wq->wqes[node]); kfree(wq->wqes); kfree(wq); } From 6206f0e180d4eddc0a178f57120ab1b913701f6e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 Nov 2019 11:59:32 -0700 Subject: [PATCH 41/42] io-wq: shrink io_wq_work a bit Currently we're using 40 bytes for the io_wq_work structure, and 16 of those is the doubly link list node. We don't need doubly linked lists, we always add to tail to keep things ordered, and any other use case is list traversal with deletion. For the deletion case, we can easily support any node deletion by keeping track of the previous entry. This shrinks io_wq_work to 32 bytes, and subsequently io_kiock from io_uring to 216 to 208 bytes. Signed-off-by: Jens Axboe --- fs/io-wq.c | 36 +++++++++++++++++++++++------------- fs/io-wq.h | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 15 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index d3e8907cc182..91b85df0861e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -84,7 +84,7 @@ enum { struct io_wqe { struct { spinlock_t lock; - struct list_head work_list; + struct io_wq_work_list work_list; unsigned long hash_map; unsigned flags; } ____cacheline_aligned_in_smp; @@ -236,7 +236,8 @@ static void io_worker_exit(struct io_worker *worker) static inline bool io_wqe_run_queue(struct io_wqe *wqe) __must_hold(wqe->lock) { - if (!list_empty(&wqe->work_list) && !(wqe->flags & IO_WQE_FLAG_STALLED)) + if (!wq_list_empty(&wqe->work_list) && + !(wqe->flags & IO_WQE_FLAG_STALLED)) return true; return false; } @@ -375,12 +376,15 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) __must_hold(wqe->lock) { + struct io_wq_work_node *node, *prev; struct io_wq_work *work; - list_for_each_entry(work, &wqe->work_list, list) { + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + /* not hashed, can run anytime */ if (!(work->flags & IO_WQ_WORK_HASHED)) { - list_del(&work->list); + wq_node_del(&wqe->work_list, node, prev); return work; } @@ -388,7 +392,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) *hash = work->flags >> IO_WQ_HASH_SHIFT; if (!(wqe->hash_map & BIT_ULL(*hash))) { wqe->hash_map |= BIT_ULL(*hash); - list_del(&work->list); + wq_node_del(&wqe->work_list, node, prev); return work; } } @@ -416,7 +420,7 @@ static void io_worker_handle_work(struct io_worker *worker) work = io_get_next_work(wqe, &hash); if (work) __io_worker_busy(wqe, worker, work); - else if (!list_empty(&wqe->work_list)) + else if (!wq_list_empty(&wqe->work_list)) wqe->flags |= IO_WQE_FLAG_STALLED; spin_unlock_irq(&wqe->lock); @@ -526,7 +530,7 @@ static int io_wqe_worker(void *data) if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { spin_lock_irq(&wqe->lock); - if (!list_empty(&wqe->work_list)) + if (!wq_list_empty(&wqe->work_list)) io_worker_handle_work(worker); else spin_unlock_irq(&wqe->lock); @@ -714,7 +718,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) } spin_lock_irqsave(&wqe->lock, flags); - list_add_tail(&work->list, &wqe->work_list); + wq_list_add_tail(&work->list, &wqe->work_list); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); @@ -829,14 +833,17 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, .cancel = cancel, .caller_data = cancel_data, }; + struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; bool found = false; spin_lock_irqsave(&wqe->lock, flags); - list_for_each_entry(work, &wqe->work_list, list) { + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + if (cancel(work, cancel_data)) { - list_del(&work->list); + wq_node_del(&wqe->work_list, node, prev); found = true; break; } @@ -894,6 +901,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, struct io_wq_work *cwork) { + struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; bool found = false; @@ -906,9 +914,11 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, * no completion will be posted for it. */ spin_lock_irqsave(&wqe->lock, flags); - list_for_each_entry(work, &wqe->work_list, list) { + wq_list_for_each(node, prev, &wqe->work_list) { + work = container_of(node, struct io_wq_work, list); + if (work == cwork) { - list_del(&work->list); + wq_node_del(&wqe->work_list, node, prev); found = true; break; } @@ -1023,7 +1033,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wqe->node = node; wqe->wq = wq; spin_lock_init(&wqe->lock); - INIT_LIST_HEAD(&wqe->work_list); + INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); INIT_HLIST_NULLS_HEAD(&wqe->busy_list, 1); INIT_LIST_HEAD(&wqe->all_list); diff --git a/fs/io-wq.h b/fs/io-wq.h index 5cd8c7697e88..600e0158cba7 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -22,18 +22,60 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; +struct io_wq_work_node { + struct io_wq_work_node *next; +}; + +struct io_wq_work_list { + struct io_wq_work_node *first; + struct io_wq_work_node *last; +}; + +static inline void wq_list_add_tail(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + if (!list->first) { + list->first = list->last = node; + } else { + list->last->next = node; + list->last = node; + } +} + +static inline void wq_node_del(struct io_wq_work_list *list, + struct io_wq_work_node *node, + struct io_wq_work_node *prev) +{ + if (node == list->first) + list->first = node->next; + if (node == list->last) + list->last = prev; + if (prev) + prev->next = node->next; +} + +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) ((list)->first == NULL) +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ + (list)->last = NULL; \ +} while (0) + struct io_wq_work { union { - struct list_head list; + struct io_wq_work_node list; void *data; }; void (*func)(struct io_wq_work **); - unsigned flags; struct files_struct *files; + unsigned flags; }; #define INIT_IO_WORK(work, _func) \ do { \ + (work)->list.next = NULL; \ (work)->func = _func; \ (work)->flags = 0; \ (work)->files = NULL; \ From e944475e69849273ca8f1fe04a3ce81b5901d165 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 Nov 2019 15:02:04 -0700 Subject: [PATCH 42/42] io_uring: make poll->wait dynamically allocated In the quest to bring io_kiocb down to 3 cachelines, this one does the trick. Make the wait_queue_entry for the poll command come out of kmalloc instead of embedding it in struct io_poll_iocb, as the latter is the largest member of io_kiocb. Once we trim this down a bit, we're back at a healthy 192 bytes for struct io_kiocb. Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e44b0e01f1b5..2c2e8c25da01 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -291,7 +291,7 @@ struct io_poll_iocb { __poll_t events; bool done; bool canceled; - struct wait_queue_entry wait; + struct wait_queue_entry *wait; }; struct io_timeout_data { @@ -2030,8 +2030,8 @@ static void io_poll_remove_one(struct io_kiocb *req) spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait.entry)) { - list_del_init(&poll->wait.entry); + if (!list_empty(&poll->wait->entry)) { + list_del_init(&poll->wait->entry); io_queue_async_work(req); } spin_unlock(&poll->head->lock); @@ -2104,6 +2104,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; + kfree(req->poll.wait); if (error) io_cqring_fill_event(req, error); else @@ -2141,7 +2142,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) */ spin_lock_irq(&ctx->completion_lock); if (!mask && ret != -ECANCELED) { - add_wait_queue(poll->head, &poll->wait); + add_wait_queue(poll->head, poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } @@ -2161,8 +2162,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, - wait); + struct io_poll_iocb *poll = wait->private; struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); @@ -2172,7 +2172,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & poll->events)) return 0; - list_del_init(&poll->wait.entry); + list_del_init(&poll->wait->entry); /* * Run completion inline if we can. We're using trylock here because @@ -2213,7 +2213,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, pt->error = 0; pt->req->poll.head = head; - add_wait_queue(head, &pt->req->poll.wait); + add_wait_queue(head, pt->req->poll.wait); } static void io_poll_req_insert(struct io_kiocb *req) @@ -2252,6 +2252,10 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; + poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL); + if (!poll->wait) + return -ENOMEM; + req->sqe = NULL; INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); @@ -2268,8 +2272,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, io_poll_wake); + INIT_LIST_HEAD(&poll->wait->entry); + init_waitqueue_func_entry(poll->wait, io_poll_wake); + poll->wait->private = poll; INIT_LIST_HEAD(&req->list); @@ -2278,14 +2283,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, spin_lock_irq(&ctx->completion_lock); if (likely(poll->head)) { spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait.entry))) { + if (unlikely(list_empty(&poll->wait->entry))) { if (ipt.error) cancel = true; ipt.error = 0; mask = 0; } if (mask || ipt.error) - list_del_init(&poll->wait.entry); + list_del_init(&poll->wait->entry); else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */