Merge branch 'for-6.7/io_uring' into io_uring-futex

* for-6.7/io_uring:
  io_uring: cancelable uring_cmd
  io_uring: retain top 8bits of uring_cmd flags for kernel internal use
  io_uring: add IORING_OP_WAITID support
  exit: add internal include file with helpers
  exit: add kernel_waitid_prepare() helper
  exit: move core of do_wait() into helper
  exit: abstract out should_wake helper for child_wait_callback()
  io_uring/rw: add support for IORING_OP_READ_MULTISHOT
  io_uring/rw: mark readv/writev as vectored in the opcode definition
  io_uring/rw: split io_read() into a helper
Jens Axboe 2023-09-28 07:46:55 -06:00
commit 73c7e7a91f
15 changed files with 727 additions and 71 deletions
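The headline features here are IORING_OP_WAITID (async waitid(2)), IORING_OP_READ_MULTISHOT, and cancelable uring_cmd support. As a rough illustration of the waitid side, userspace drives the new opcode as follows - a minimal sketch assuming liburing >= 2.5, which wraps the opcode as io_uring_prep_waitid():

/*
 * Sketch: reap a child asynchronously via IORING_OP_WAITID.
 * Assumes liburing >= 2.5; error handling trimmed for brevity.
 */
#include <liburing.h>
#include <sys/wait.h>
#include <stdio.h>

int wait_child_async(pid_t pid)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	siginfo_t si;
	int ret;

	io_uring_queue_init(8, &ring, 0);
	sqe = io_uring_get_sqe(&ring);
	/* same semantics as waitid(P_PID, pid, &si, WEXITED) */
	io_uring_prep_waitid(sqe, P_PID, pid, &si, WEXITED, 0);
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	ret = cqe->res;		/* 0 on success, -errno on failure */
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);

	if (!ret)
		printf("child %d exited, status %d\n", si.si_pid, si.si_status);
	return ret;
}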

include/linux/io_uring.h

@@ -20,8 +20,15 @@ enum io_uring_cmd_flags {
IO_URING_F_SQE128 = (1 << 8),
IO_URING_F_CQE32 = (1 << 9),
IO_URING_F_IOPOLL = (1 << 10),
/* set when uring wants to cancel a previously issued command */
IO_URING_F_CANCEL = (1 << 11),
};
/* only the top 8 bits of sqe->uring_cmd_flags are for kernel internal use */
#define IORING_URING_CMD_CANCELABLE (1U << 30)
#define IORING_URING_CMD_POLLED (1U << 31)
struct io_uring_cmd {
struct file *file;
const struct io_uring_sqe *sqe;
@@ -82,6 +89,9 @@ static inline void io_uring_free(struct task_struct *tsk)
__io_uring_free(tsk);
}
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags);
struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
#else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd)
@@ -122,6 +132,14 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
{
return -EOPNOTSUPP;
}
static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
}
static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
{
return NULL;
}
#endif
#endif

include/linux/io_uring_types.h

@@ -265,6 +265,12 @@ struct io_ring_ctx {
*/
struct io_wq_work_list iopoll_list;
bool poll_multi_queue;
/*
* Any cancelable uring_cmd is added to this list in
* ->uring_cmd() by io_uring_cmd_mark_cancelable()
*/
struct hlist_head cancelable_uring_cmd;
} ____cacheline_aligned_in_smp;
struct {
@@ -313,6 +319,8 @@ struct io_ring_ctx {
struct list_head cq_overflow_list;
struct io_hash_table cancel_table;
struct hlist_head waitid_list;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */

include/uapi/linux/io_uring.h

@@ -65,6 +65,7 @@ struct io_uring_sqe {
__u32 xattr_flags;
__u32 msg_ring_flags;
__u32 uring_cmd_flags;
__u32 waitid_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -240,19 +241,20 @@ enum io_uring_op {
IORING_OP_URING_CMD,
IORING_OP_SEND_ZC,
IORING_OP_SENDMSG_ZC,
IORING_OP_READ_MULTISHOT,
IORING_OP_WAITID,
/* this goes last, obviously */
IORING_OP_LAST,
};
/*
* sqe->uring_cmd_flags
* sqe->uring_cmd_flags top 8 bits aren't available for userspace
* IORING_URING_CMD_FIXED use registered buffer; pass this flag
* along with setting sqe->buf_index.
* IORING_URING_CMD_POLLED driver use only
*/
#define IORING_URING_CMD_FIXED (1U << 0)
#define IORING_URING_CMD_POLLED (1U << 31)
#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED
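These defines are the userspace-visible contract: the top 8 bits of sqe->uring_cmd_flags now belong to the kernel, and IORING_URING_CMD_FIXED remains the only flag an application may set (the prep change further down rejects anything outside IORING_URING_CMD_MASK with -EINVAL). A raw-SQE sketch of a passthrough command using a registered buffer; the command code and buffer index are hypothetical, driver-specific values:

/*
 * Sketch: fill an SQE for IORING_OP_URING_CMD against a registered
 * buffer. cmd_op is a hypothetical driver-private command code.
 */
#include <string.h>
#include <linux/io_uring.h>

static void prep_fixed_uring_cmd(struct io_uring_sqe *sqe, int fd,
				 __u32 cmd_op, __u16 buf_idx)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = cmd_op;
	/* only bits in IORING_URING_CMD_MASK are accepted from userspace */
	sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
	sqe->buf_index = buf_idx;	/* index of the registered buffer */
}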

io_uring/Makefile

@@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
openclose.o uring_cmd.o epoll.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
cancel.o kbuf.o rsrc.o rw.o opdef.o \
notif.o waitid.o
obj-$(CONFIG_IO_WQ) += io-wq.o

io_uring/cancel.c

@@ -15,6 +15,7 @@
#include "tctx.h"
#include "poll.h"
#include "timeout.h"
#include "waitid.h"
#include "cancel.h"
struct io_cancel {
@@ -119,6 +120,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
if (ret != -ENOENT)
return ret;
ret = io_waitid_cancel(ctx, cd, issue_flags);
if (ret != -ENOENT)
return ret;
spin_lock(&ctx->completion_lock);
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
ret = io_timeout_cancel(ctx, cd);

io_uring/io_uring.c

@@ -92,6 +92,7 @@
#include "cancel.h"
#include "net.h"
#include "notif.h"
#include "waitid.h"
#include "timeout.h"
#include "poll.h"
@@ -348,8 +349,10 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_HLIST_HEAD(&ctx->waitid_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
return ctx;
err:
kfree(ctx->cancel_table.hbs);
@@ -3256,6 +3259,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct task_struct *task, bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
bool ret = false;
lockdep_assert_held(&ctx->uring_lock);
hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
hash_node) {
struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
struct io_uring_cmd);
struct file *file = req->file;
if (!cancel_all && req->task != task)
continue;
if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
/* ->sqe isn't available if no async data */
if (!req_has_async_data(req))
cmd->sqe = NULL;
file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
ret = true;
}
}
io_submit_flush_completions(ctx);
return ret;
}
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
@@ -3303,6 +3337,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
ret |= io_cancel_defer_files(ctx, task, cancel_all);
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
ret |= io_waitid_remove_all(ctx, task, cancel_all);
ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, task, cancel_all);
if (task)
@@ -4666,6 +4702,9 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
/* top 8 bits are for internal use */
BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
io_uring_optable_init();

io_uring/opdef.c

@@ -33,6 +33,7 @@
#include "poll.h"
#include "cancel.h"
#include "rw.h"
#include "waitid.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -63,6 +64,7 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
.prep = io_prep_rw,
.issue = io_read,
},
@@ -76,6 +78,7 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.vectored = 1,
.prep = io_prep_rw,
.issue = io_write,
},
@@ -428,9 +431,21 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_READ_MULTISHOT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.audit_skip = 1,
.prep = io_read_mshot_prep,
.issue = io_read_mshot,
},
[IORING_OP_WAITID] = {
.prep = io_waitid_prep,
.issue = io_waitid,
},
};
const struct io_cold_def io_cold_defs[] = {
[IORING_OP_NOP] = {
.name = "NOP",
@@ -648,6 +663,13 @@ const struct io_cold_def io_cold_defs[] = {
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_READ_MULTISHOT] = {
.name = "READ_MULTISHOT",
},
[IORING_OP_WAITID] = {
.name = "WAITID",
.async_size = sizeof(struct io_waitid_async),
},
};
const char *io_uring_get_opcode(u8 opcode)

io_uring/opdef.h

@@ -29,6 +29,8 @@ struct io_issue_def {
unsigned iopoll_queue : 1;
/* opcode specific path will handle ->async_data allocation if needed */
unsigned manual_alloc : 1;
/* vectored opcode, set if 1) vectored, and 2) handler needs to know */
unsigned vectored : 1;
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);

io_uring/rw.c

@@ -123,6 +123,22 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
/*
* Multishot read is prepared just like a normal read/write request; the
* only difference is that we set the MULTISHOT flag.
*/
int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
int ret;
ret = io_prep_rw(req, sqe);
if (unlikely(ret))
return ret;
req->flags |= REQ_F_APOLL_MULTISHOT;
return 0;
}
void io_readv_writev_cleanup(struct io_kiocb *req)
{
struct io_async_rw *io = req->async_data;
@@ -388,8 +404,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
buf = u64_to_user_ptr(rw->addr);
sqe_len = rw->len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
(req->flags & REQ_F_BUFFER_SELECT)) {
if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
if (io_do_buffer_select(req)) {
buf = io_buffer_select(req, &sqe_len, issue_flags);
if (!buf)
@@ -708,7 +723,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
return 0;
}
int io_read(struct io_kiocb *req, unsigned int issue_flags)
static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct io_rw_state __s, *s = &__s;
@@ -776,8 +791,11 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
/* if we can poll, just do that */
if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
/*
* If we can poll, just do that. For a vectored read, we'll
* need to copy state first.
*/
if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -853,7 +871,69 @@ done:
/* it's faster to check here than to delegate to kfree */
if (iovec)
kfree(iovec);
return kiocb_done(req, ret, issue_flags);
return ret;
}
int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
int ret;
ret = __io_read(req, issue_flags);
if (ret >= 0)
return kiocb_done(req, ret, issue_flags);
return ret;
}
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
{
unsigned int cflags = 0;
int ret;
/*
* Multishot MUST be used on a pollable file
*/
if (!file_can_poll(req->file))
return -EBADFD;
ret = __io_read(req, issue_flags);
/*
* If we get -EAGAIN, recycle our buffer and just let normal poll
* handling arm it.
*/
if (ret == -EAGAIN) {
io_kbuf_recycle(req, issue_flags);
return -EAGAIN;
}
/*
* Any successful return value will keep the multishot read armed.
*/
if (ret > 0) {
/*
* Put our buffer and post a CQE. If we fail to post a CQE, then
* jump to the termination path. This request is then done.
*/
cflags = io_put_kbuf(req, issue_flags);
if (io_fill_cqe_req_aux(req,
issue_flags & IO_URING_F_COMPLETE_DEFER,
ret, cflags | IORING_CQE_F_MORE)) {
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_ISSUE_SKIP_COMPLETE;
return -EAGAIN;
}
}
/*
* Either an error, or we've hit overflow posting the CQE. For any
* multishot request, hitting overflow will terminate it.
*/
io_req_set_res(req, ret, cflags);
if (issue_flags & IO_URING_F_MULTISHOT)
return IOU_STOP_MULTISHOT;
return IOU_OK;
}
int io_write(struct io_kiocb *req, unsigned int issue_flags)

io_uring/rw.h

@@ -23,3 +23,5 @@ int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req);
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
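Tying the rw.c/rw.h pieces together: io_read_mshot() requires a pollable file and provided buffers, and each completion carries IORING_CQE_F_MORE for as long as the request stays armed. A minimal userspace sketch, assuming liburing >= 2.5 for io_uring_prep_read_multishot(); the buffer-group id and sizes are arbitrary choices:

/*
 * Sketch: multishot read from a pipe using a provided-buffer ring.
 * Assumes liburing >= 2.5; BGID/NBUFS/BUFSZ are arbitrary.
 */
#include <liburing.h>
#include <stdlib.h>

#define BGID	0
#define NBUFS	8
#define BUFSZ	4096

int read_multishot(int pipe_fd)
{
	struct io_uring ring;
	struct io_uring_buf_ring *br;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	unsigned int cflags;
	char *bufs;
	int i, ret;

	io_uring_queue_init(8, &ring, 0);
	br = io_uring_setup_buf_ring(&ring, NBUFS, BGID, 0, &ret);
	if (!br)
		return ret;

	bufs = malloc(NBUFS * BUFSZ);
	for (i = 0; i < NBUFS; i++)
		io_uring_buf_ring_add(br, bufs + i * BUFSZ, BUFSZ, i,
				      io_uring_buf_ring_mask(NBUFS), i);
	io_uring_buf_ring_advance(br, NBUFS);

	/* len == 0: read whatever fits into the picked buffer */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_multishot(sqe, pipe_fd, 0, 0, BGID);
	io_uring_submit(&ring);

	do {
		io_uring_wait_cqe(&ring, &cqe);
		ret = cqe->res;		/* bytes read, or -errno */
		cflags = cqe->flags;	/* picked buffer id lives in the upper bits */
		io_uring_cqe_seen(&ring, cqe);
		/* the request stays armed while IORING_CQE_F_MORE is set */
	} while (ret > 0 && (cflags & IORING_CQE_F_MORE));

	io_uring_queue_exit(&ring);
	return ret;
}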

io_uring/uring_cmd.c

@@ -13,6 +13,51 @@
#include "rsrc.h"
#include "uring_cmd.h"
static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
struct io_kiocb *req = cmd_to_io_kiocb(cmd);
struct io_ring_ctx *ctx = req->ctx;
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
return;
cmd->flags &= ~IORING_URING_CMD_CANCELABLE;
io_ring_submit_lock(ctx, issue_flags);
hlist_del(&req->hash_node);
io_ring_submit_unlock(ctx, issue_flags);
}
/*
* Mark this command as cancelable, then io_uring_try_cancel_uring_cmd()
* will try to cancel this issued command by sending ->uring_cmd() with
* issue_flags of IO_URING_F_CANCEL.
*
* The command is guaranteed to not be done when calling ->uring_cmd()
* with IO_URING_F_CANCEL, but it is the driver's responsibility to deal
* with the race between io_uring cancelation and normal completion.
*/
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
struct io_kiocb *req = cmd_to_io_kiocb(cmd);
struct io_ring_ctx *ctx = req->ctx;
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
cmd->flags |= IORING_URING_CMD_CANCELABLE;
io_ring_submit_lock(ctx, issue_flags);
hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd);
io_ring_submit_unlock(ctx, issue_flags);
}
}
EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
{
return cmd_to_io_kiocb(cmd)->task;
}
EXPORT_SYMBOL_GPL(io_uring_cmd_get_task);
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -56,6 +101,8 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
io_uring_cmd_del_cancelable(ioucmd, issue_flags);
if (ret < 0)
req_set_fail(req);
@@ -91,7 +138,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
if (ioucmd->flags & ~IORING_URING_CMD_FIXED)
if (ioucmd->flags & ~IORING_URING_CMD_MASK)
return -EINVAL;
if (ioucmd->flags & IORING_URING_CMD_FIXED) {

io_uring/waitid.c (new file, 372 lines)

@@ -0,0 +1,372 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Support for async notification of waitid
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "cancel.h"
#include "waitid.h"
#include "../kernel/exit.h"
static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts);
#define IO_WAITID_CANCEL_FLAG BIT(31)
#define IO_WAITID_REF_MASK GENMASK(30, 0)
struct io_waitid {
struct file *file;
int which;
pid_t upid;
int options;
atomic_t refs;
struct wait_queue_head *head;
struct siginfo __user *infop;
struct waitid_info info;
};
static void io_waitid_free(struct io_kiocb *req)
{
struct io_waitid_async *iwa = req->async_data;
put_pid(iwa->wo.wo_pid);
kfree(req->async_data);
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
}
#ifdef CONFIG_COMPAT
static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)
{
struct compat_siginfo __user *infop;
bool ret;
infop = (struct compat_siginfo __user *) iw->infop;
if (!user_write_access_begin(infop, sizeof(*infop)))
return false;
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
unsafe_put_user(iw->info.cause, &infop->si_code, Efault);
unsafe_put_user(iw->info.pid, &infop->si_pid, Efault);
unsafe_put_user(iw->info.uid, &infop->si_uid, Efault);
unsafe_put_user(iw->info.status, &infop->si_status, Efault);
ret = true;
done:
user_write_access_end();
return ret;
Efault:
ret = false;
goto done;
}
#endif
static bool io_waitid_copy_si(struct io_kiocb *req, int signo)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
bool ret;
if (!iw->infop)
return true;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
return io_waitid_compat_copy_si(iw, signo);
#endif
if (!user_write_access_begin(iw->infop, sizeof(*iw->infop)))
return false;
unsafe_put_user(signo, &iw->infop->si_signo, Efault);
unsafe_put_user(0, &iw->infop->si_errno, Efault);
unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault);
unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault);
unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault);
unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault);
ret = true;
done:
user_write_access_end();
return ret;
Efault:
ret = false;
goto done;
}
static int io_waitid_finish(struct io_kiocb *req, int ret)
{
int signo = 0;
if (ret > 0) {
signo = SIGCHLD;
ret = 0;
}
if (!io_waitid_copy_si(req, signo))
ret = -EFAULT;
io_waitid_free(req);
return ret;
}
static void io_waitid_complete(struct io_kiocb *req, int ret)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct io_tw_state ts = { .locked = true };
/* anyone completing better be holding a reference */
WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
lockdep_assert_held(&req->ctx->uring_lock);
/*
* Did cancel find it meanwhile?
*/
if (hlist_unhashed(&req->hash_node))
return;
hlist_del_init(&req->hash_node);
ret = io_waitid_finish(req, ret);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
io_req_task_complete(req, &ts);
}
static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct io_waitid_async *iwa = req->async_data;
/*
* Mark us canceled regardless of ownership. This will prevent a
* potential retry from a spurious wakeup.
*/
atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs);
/* claim ownership */
if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
return false;
spin_lock_irq(&iw->head->lock);
list_del_init(&iwa->wo.child_wait.entry);
spin_unlock_irq(&iw->head->lock);
io_waitid_complete(req, -ECANCELED);
return true;
}
int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags)
{
struct hlist_node *tmp;
struct io_kiocb *req;
int nr = 0;
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
return -ENOENT;
io_ring_submit_lock(ctx, issue_flags);
hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
if (req->cqe.user_data != cd->data &&
!(cd->flags & IORING_ASYNC_CANCEL_ANY))
continue;
if (__io_waitid_cancel(ctx, req))
nr++;
if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
break;
}
io_ring_submit_unlock(ctx, issue_flags);
if (nr)
return nr;
return -ENOENT;
}
bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
bool found = false;
lockdep_assert_held(&ctx->uring_lock);
hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
if (!io_match_task_safe(req, task, cancel_all))
continue;
__io_waitid_cancel(ctx, req);
found = true;
}
return found;
}
static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct io_waitid_async *iwa = req->async_data;
if (!atomic_sub_return(1, &iw->refs))
return false;
/*
* A wakeup raced with us and was prevented from completing the
* request because of that; queue up task_work to finish it.
*/
req->io_task_work.func = io_waitid_cb;
io_req_task_work_add(req);
remove_wait_queue(iw->head, &iwa->wo.child_wait);
return true;
}
static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_waitid_async *iwa = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
int ret;
io_tw_lock(ctx, ts);
ret = __do_wait(&iwa->wo);
/*
* If we get -ERESTARTSYS here, we need to re-arm and check again
* to ensure we get another callback. If the retry works, then we can
* just remove ourselves from the waitqueue again and finish the
* request.
*/
if (unlikely(ret == -ERESTARTSYS)) {
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
/* Don't retry if cancel found it meanwhile */
ret = -ECANCELED;
if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) {
iw->head = &current->signal->wait_chldexit;
add_wait_queue(iw->head, &iwa->wo.child_wait);
ret = __do_wait(&iwa->wo);
if (ret == -ERESTARTSYS) {
/* retry armed, drop our ref */
io_waitid_drop_issue_ref(req);
return;
}
remove_wait_queue(iw->head, &iwa->wo.child_wait);
}
}
io_waitid_complete(req, ret);
}
static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait);
struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo);
struct io_kiocb *req = iwa->req;
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct task_struct *p = key;
if (!pid_child_should_wake(wo, p))
return 0;
/* cancel is in progress */
if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
return 1;
req->io_task_work.func = io_waitid_cb;
io_req_task_work_add(req);
list_del_init(&wait->entry);
return 1;
}
int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags)
return -EINVAL;
iw->which = READ_ONCE(sqe->len);
iw->upid = READ_ONCE(sqe->fd);
iw->options = READ_ONCE(sqe->file_index);
iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2));
return 0;
}
int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
struct io_ring_ctx *ctx = req->ctx;
struct io_waitid_async *iwa;
int ret;
if (io_alloc_async_data(req))
return -ENOMEM;
iwa = req->async_data;
iwa->req = req;
ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
iw->options, NULL);
if (ret)
goto done;
/*
* Mark the request as busy upfront, in case we're racing with the
* wakeup. If we are, then we'll notice when we drop this initial
* reference again after arming.
*/
atomic_set(&iw->refs, 1);
/*
* Cancel must hold the ctx lock, so cancelation cannot find us
* until a) we are on the list, and b) the lock has been dropped.
* We only need to worry about racing with the wakeup callback.
*/
io_ring_submit_lock(ctx, issue_flags);
hlist_add_head(&req->hash_node, &ctx->waitid_list);
init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
iwa->wo.child_wait.private = req->task;
iw->head = &current->signal->wait_chldexit;
add_wait_queue(iw->head, &iwa->wo.child_wait);
ret = __do_wait(&iwa->wo);
if (ret == -ERESTARTSYS) {
/*
* Nobody else grabbed a reference; it'll complete when we get
* a waitqueue callback, or if someone cancels it.
*/
if (!io_waitid_drop_issue_ref(req)) {
io_ring_submit_unlock(ctx, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
/*
* A wakeup raced with us and was prevented from completing the
* request because of that; queue up task_work to finish it.
*/
io_ring_submit_unlock(ctx, issue_flags);
return IOU_ISSUE_SKIP_COMPLETE;
}
hlist_del_init(&req->hash_node);
remove_wait_queue(iw->head, &iwa->wo.child_wait);
ret = io_waitid_finish(req, ret);
io_ring_submit_unlock(ctx, issue_flags);
done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return IOU_OK;
}

io_uring/waitid.h (new file, 15 lines)

@@ -0,0 +1,15 @@
// SPDX-License-Identifier: GPL-2.0
#include "../kernel/exit.h"
struct io_waitid_async {
struct io_kiocb *req;
struct wait_opts wo;
};
int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_waitid(struct io_kiocb *req, unsigned int issue_flags);
int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags);
bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
bool cancel_all);

kernel/exit.c

@@ -74,6 +74,8 @@
#include <asm/unistd.h>
#include <asm/mmu_context.h>
#include "exit.h"
/*
* The default value should be high enough to not crash a system that randomly
* crashes its kernel from time to time, but low enough to at least not permit
@@ -1037,26 +1039,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
struct waitid_info {
pid_t pid;
uid_t uid;
int status;
int cause;
};
struct wait_opts {
enum pid_type wo_type;
int wo_flags;
struct pid *wo_pid;
struct waitid_info *wo_info;
int wo_stat;
struct rusage *wo_rusage;
wait_queue_entry_t child_wait;
int notask_error;
};
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
@@ -1520,6 +1502,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
return 0;
}
bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
{
if (!eligible_pid(wo, p))
return false;
if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
return false;
return true;
}
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
@@ -1527,13 +1520,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
child_wait);
struct task_struct *p = key;
if (!eligible_pid(wo, p))
return 0;
if (pid_child_should_wake(wo, p))
return default_wake_function(wait, mode, sync, key);
if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
return 0;
return default_wake_function(wait, mode, sync, key);
return 0;
}
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
@@ -1582,16 +1572,10 @@ static int do_wait_pid(struct wait_opts *wo)
return 0;
}
static long do_wait(struct wait_opts *wo)
long __do_wait(struct wait_opts *wo)
{
int retval;
long retval;
trace_sched_process_wait(wo->wo_pid);
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
wo->child_wait.private = current;
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
/*
* If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
@@ -1603,24 +1587,23 @@ repeat:
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
if (wo->wo_type == PIDTYPE_PID) {
retval = do_wait_pid(wo);
if (retval)
goto end;
return retval;
} else {
struct task_struct *tsk = current;
do {
retval = do_wait_thread(wo, tsk);
if (retval)
goto end;
return retval;
retval = ptrace_do_wait(wo, tsk);
if (retval)
goto end;
return retval;
if (wo->wo_flags & __WNOTHREAD)
break;
@@ -1630,27 +1613,44 @@ repeat:
notask:
retval = wo->notask_error;
if (!retval && !(wo->wo_flags & WNOHANG)) {
retval = -ERESTARTSYS;
if (!signal_pending(current)) {
schedule();
goto repeat;
}
}
end:
if (!retval && !(wo->wo_flags & WNOHANG))
return -ERESTARTSYS;
return retval;
}
static long do_wait(struct wait_opts *wo)
{
int retval;
trace_sched_process_wait(wo->wo_pid);
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
wo->child_wait.private = current;
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
do {
set_current_state(TASK_INTERRUPTIBLE);
retval = __do_wait(wo);
if (retval != -ERESTARTSYS)
break;
if (signal_pending(current))
break;
schedule();
} while (1);
__set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
return retval;
}
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
struct waitid_info *infop, int options,
struct rusage *ru)
{
struct wait_opts wo;
unsigned int f_flags = 0;
struct pid *pid = NULL;
enum pid_type type;
long ret;
unsigned int f_flags = 0;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
@@ -1693,19 +1693,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
return -EINVAL;
}
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options;
wo.wo_info = infop;
wo.wo_rusage = ru;
wo->wo_type = type;
wo->wo_pid = pid;
wo->wo_flags = options;
wo->wo_info = infop;
wo->wo_rusage = ru;
if (f_flags & O_NONBLOCK)
wo.wo_flags |= WNOHANG;
wo->wo_flags |= WNOHANG;
return 0;
}
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
{
struct wait_opts wo;
long ret;
ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
if (ret)
return ret;
ret = do_wait(&wo);
if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
ret = -EAGAIN;
put_pid(pid);
put_pid(wo.wo_pid);
return ret;
}

kernel/exit.h (new file, 30 lines)

@@ -0,0 +1,30 @@
// SPDX-License-Identifier: GPL-2.0-only
#ifndef LINUX_WAITID_H
#define LINUX_WAITID_H
struct waitid_info {
pid_t pid;
uid_t uid;
int status;
int cause;
};
struct wait_opts {
enum pid_type wo_type;
int wo_flags;
struct pid *wo_pid;
struct waitid_info *wo_info;
int wo_stat;
struct rusage *wo_rusage;
wait_queue_entry_t child_wait;
int notask_error;
};
bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p);
long __do_wait(struct wait_opts *wo);
int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
struct waitid_info *infop, int options,
struct rusage *ru);
#endif