diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 8019103aac10..cf33de56da27 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -450,6 +450,35 @@ this allows system administrators to override the
 ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded.
 
 
+io_uring_disabled
+=================
+
+Prevents all processes from creating new io_uring instances. Enabling this
+shrinks the kernel's attack surface.
+
+= ======================================================================
+0 All processes can create io_uring instances as normal. This is the
+  default setting.
+1 io_uring creation is disabled (io_uring_setup() will fail with
+  -EPERM) for unprivileged processes not in the io_uring_group group.
+  Existing io_uring instances can still be used. See the
+  documentation for io_uring_group for more information.
+2 io_uring creation is disabled for all processes. io_uring_setup()
+  always fails with -EPERM. Existing io_uring instances can still be
+  used.
+= ======================================================================
+
+
+io_uring_group
+==============
+
+When io_uring_disabled is set to 1, a process must either be
+privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order
+to create an io_uring instance. If io_uring_group is set to -1 (the
+default), only processes with the CAP_SYS_ADMIN capability may create
+io_uring instances.
+
+
 kexec_load_disabled
 ===================
 
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 300455b4bc12..c53678875416 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -93,6 +93,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 		struct io_uring_sqe *sqe;
 		unsigned int sq_idx;
 
+		if (ctx->flags & IORING_SETUP_NO_SQARRAY)
+			break;
 		sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
 		if (sq_idx > sq_mask)
 			continue;
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 62f345587df5..1ecc8c748768 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -174,6 +174,16 @@ static void io_worker_ref_put(struct io_wq *wq)
 		complete(&wq->worker_done);
 }
 
+bool io_wq_worker_stopped(void)
+{
+	struct io_worker *worker = current->worker_private;
+
+	if (WARN_ON_ONCE(!io_wq_current_is_worker()))
+		return true;
+
+	return test_bit(IO_WQ_BIT_EXIT, &worker->wq->state);
+}
+
 static void io_worker_cancel_cb(struct io_worker *worker)
 {
 	struct io_wq_acct *acct = io_wq_get_acct(worker);
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index 06d9ca90c577..2b2a6406dd8e 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -52,6 +52,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val);
 
 int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask);
 int io_wq_max_workers(struct io_wq *wq, int *new_count);
+bool io_wq_worker_stopped(void);
 
 static inline bool io_wq_is_hashed(struct io_wq_work *work)
 {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e7675355048d..783ed0fff71b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -150,6 +150,31 @@ static void io_queue_sqe(struct io_kiocb *req);
 
 struct kmem_cache *req_cachep;
 
+static int __read_mostly sysctl_io_uring_disabled;
+static int __read_mostly sysctl_io_uring_group = -1;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kernel_io_uring_disabled_table[] = {
+	{
+		.procname = "io_uring_disabled",
+		.data = &sysctl_io_uring_disabled,
+		.maxlen = sizeof(sysctl_io_uring_disabled),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_TWO,
+	},
+	{
+		.procname = "io_uring_group",
+		.data = &sysctl_io_uring_group,
+		.maxlen = sizeof(gid_t),
+		.mode = 0644,
+		.proc_handler = proc_dointvec,
+	},
+	{},
+};
+#endif
+
 struct sock *io_uring_get_socket(struct file *file)
 {
 #if defined(CONFIG_UNIX)
@@ -883,7 +908,7 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
 		struct io_uring_cqe *cqe = &ctx->completion_cqes[i];
 
 		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
-			if (ctx->task_complete) {
+			if (ctx->lockless_cq) {
 				spin_lock(&ctx->completion_lock);
 				io_cqring_event_overflow(ctx, cqe->user_data,
							cqe->res, cqe->flags, 0, 0);
@@ -1541,7 +1566,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 
 		if (!(req->flags & REQ_F_CQE_SKIP) &&
 		    unlikely(!io_fill_cqe_req(ctx, req))) {
-			if (ctx->task_complete) {
+			if (ctx->lockless_cq) {
 				spin_lock(&ctx->completion_lock);
 				io_req_cqe_overflow(req);
 				spin_unlock(&ctx->completion_lock);
@@ -1950,6 +1975,8 @@ fail:
 		if (!needs_poll) {
 			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
 				break;
+			if (io_wq_worker_stopped())
+				break;
 			cond_resched();
 			continue;
 		}
@@ -4038,9 +4065,30 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 	return io_uring_create(entries, &p, params);
 }
 
+static inline bool io_uring_allowed(void)
+{
+	int disabled = READ_ONCE(sysctl_io_uring_disabled);
+	kgid_t io_uring_group;
+
+	if (disabled == 2)
+		return false;
+
+	if (disabled == 0 || capable(CAP_SYS_ADMIN))
+		return true;
+
+	io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
+	if (!gid_valid(io_uring_group))
+		return false;
+
+	return in_group_p(io_uring_group);
+}
+
 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
 		struct io_uring_params __user *, params)
 {
+	if (!io_uring_allowed())
+		return -EPERM;
+
 	return io_uring_setup(entries, params);
 }
 
@@ -4634,6 +4682,10 @@ static int __init io_uring_init(void)
 				offsetof(struct io_kiocb, cmd.data),
 				sizeof_field(struct io_kiocb, cmd.data), NULL);
 
+#ifdef CONFIG_SYSCTL
+	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
+#endif
+
 	return 0;
 };
 __initcall(io_uring_init);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index ee2d2c687fda..bd6c2c7959a5 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -430,7 +430,9 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
 
 	if (sqd) {
 		io_sq_thread_park(sqd);
-		ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
+		/* Don't set affinity for a dying thread */
+		if (sqd->thread)
+			ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
 		io_sq_thread_unpark(sqd);
 	}
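
For reference, a minimal userspace sketch (not part of the patch) of how the
io_uring_allowed() gate looks from the caller's side. It invokes the raw
io_uring_setup(2) syscall directly so no liburing dependency is needed:

/*
 * Sketch only, not part of the patch: probe whether this process may
 * create an io_uring instance under the new sysctls.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

int main(void)
{
	struct io_uring_params p;
	int fd;

	memset(&p, 0, sizeof(p));

	/*
	 * With kernel.io_uring_disabled=2, or =1 for a task that is
	 * neither CAP_SYS_ADMIN nor a member of kernel.io_uring_group,
	 * io_uring_setup() is expected to fail with EPERM.
	 */
	fd = syscall(__NR_io_uring_setup, 8, &p);
	if (fd < 0) {
		printf("io_uring_setup: %s\n", strerror(errno));
		return 1;
	}
	printf("io_uring_setup succeeded (fd %d)\n", fd);
	close(fd);
	return 0;
}

Running this once with the default settings and once after
sysctl -w kernel.io_uring_disabled=2 should show a ring being created in the
first case and EPERM in the second, while rings created beforehand remain
usable.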