for-5.19/io_uring-2022-05-22
-----BEGIN PGP SIGNATURE-----

iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmKKol0QHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpn+sEACbdEQqG6OoCOhJ0ZuxTdQqNMGxCImKBxjP
8Bqf+0hYNgwfG+80/UQvmc7olb+KxvZ6KtrgViC/ujhvMQmX0Xf/881kiiKG/iHJ
XKoL9PdqIkenIGnlyEp1uRmnUbooYF+s4iT6Gj/pjnn29GbcKjsPzKV1CUNkt3GC
R+wpdKczHQDaSwzDY5Ntyjf68QUQOyUznkHW+6JOcBeih3ET7NfapR/zsFS93RlL
B9pQ9NiBBQfzCAUycVyQMC+p/rJbKWgidAiFk4fXKRm8/7iNwT4dB0+oUymlECxt
xvalRVK6ER1s4RSdQcUTZoQA+SrzzOnK1DYja9cvcLT3wH+aojana6S0rOMDi8wp
hoWT5jdMaZN09Vcm7J4sBN15i50m9aDITp21PKOVDZXSMVsebltCL9phaN5+9x/j
AfF6Vki1WTB4gYaDHR8v6UkW+HcF1WOmMdq8GB9UMfnTya6EJqAooYT9lhQBP/rv
jxkdj9Fu98O87dOfy1Av9AxH1UB8d7ypCJKkSEMAUPoWf0rC9HjYr0cRq/yppAj8
pI/0PwXaXRfQuoHPqZyETrPel77VQdBw+Hg+6TS0KlTd3WlVEJMZJPtXK466IFLp
pYSRVnSI9PuhiClOpxriTCw0cppfRIv11IerCxRziqH9S1zijk0VBCN40//XDs1o
JfvoA6htKQ==
=S+Uf
-----END PGP SIGNATURE-----

Merge tag 'for-5.19/io_uring-2022-05-22' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Here are the main io_uring changes for 5.19. This contains:

   - Fixes for sparse type warnings (Christoph, Vasily)

   - Support for multi-shot accept (Hao)

   - Support for io_uring managed fixed files, rather than always
     needing the application to manage the indices (me)

   - Fix for a spurious poll wakeup (Dylan)

   - CQE overflow fixes (Dylan)

   - Support more types of cancelations (me)

   - Support for co-operative task_work signaling, rather than always
     forcing an IPI (me)

   - Support for doing poll first when appropriate, rather than always
     attempting a transfer first (me)

   - Provided buffer cleanups and support for mapped buffers (me)

   - Improve how io_uring handles inflight SCM files (Pavel)

   - Speedups for registered files (Pavel, me)

   - Organize the completion data in a struct in io_kiocb rather than
     keep it in separate spots (Pavel)

   - task_work improvements (Pavel)

   - Cleanup and optimize the submission path, in general and for
     handling links (Pavel)

   - Speedups for registered resource handling (Pavel)

   - Support sparse buffers and file maps (Pavel, me)

   - Various fixes and cleanups (Almog, Pavel, me)"

* tag 'for-5.19/io_uring-2022-05-22' of git://git.kernel.dk/linux-block: (111 commits)
  io_uring: fix incorrect __kernel_rwf_t cast
  io_uring: disallow mixed provided buffer group registrations
  io_uring: initialize io_buffer_list head when shared ring is unregistered
  io_uring: add fully sparse buffer registration
  io_uring: use rcu_dereference in io_close
  io_uring: consistently use the EPOLL* defines
  io_uring: make apoll_events a __poll_t
  io_uring: drop a spurious inline on a forward declaration
  io_uring: don't use ERR_PTR for user pointers
  io_uring: use a rwf_t for io_rw.flags
  io_uring: add support for ring mapped supplied buffers
  io_uring: add io_pin_pages() helper
  io_uring: add buffer selection support to IORING_OP_NOP
  io_uring: fix locking state for empty buffer group
  io_uring: implement multishot mode for accept
  io_uring: let fast poll support multishot
  io_uring: add REQ_F_APOLL_MULTISHOT for requests
  io_uring: add IORING_ACCEPT_MULTISHOT for accept
  io_uring: only wake when the correct events are set
  io_uring: avoid io-wq -EAGAIN looping for !IOPOLL
  ...
commit 3a166bdbf3
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
 
 static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 {
-	set_notify_signal(worker->task);
+	__set_notify_signal(worker->task);
 	wake_up_process(worker->task);
 	return false;
 }
@@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
 {
 	if (work && match->fn(work, match->data)) {
 		work->flags |= IO_WQ_WORK_CANCEL;
-		set_notify_signal(worker->task);
+		__set_notify_signal(worker->task);
 		return true;
 	}
 
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
 struct io_wq_work {
 	struct io_wq_work_node list;
 	unsigned flags;
+	int cancel_seq;
 };
 
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
fs/io_uring.c: 2671 changed lines (file diff suppressed because it is too large)
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -355,14 +355,23 @@ static inline void clear_notify_signal(void)
 	smp_mb__after_atomic();
 }
 
+/*
+ * Returns 'true' if kick_process() is needed to force a transition from
+ * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work.
+ */
+static inline bool __set_notify_signal(struct task_struct *task)
+{
+	return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
+	       !wake_up_state(task, TASK_INTERRUPTIBLE);
+}
+
 /*
  * Called to break out of interruptible wait loops, and enter the
  * exit_to_user_mode_loop().
  */
 static inline void set_notify_signal(struct task_struct *task)
 {
-	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
-	    !wake_up_state(task, TASK_INTERRUPTIBLE))
+	if (__set_notify_signal(task))
 		kick_process(task);
 }
 
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -17,6 +17,7 @@ enum task_work_notify_mode {
 	TWA_NONE,
 	TWA_RESUME,
 	TWA_SIGNAL,
+	TWA_SIGNAL_NO_IPI,
 };
 
 static inline bool task_work_pending(struct task_struct *task)
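To illustrate how a caller might pick between the two signal-based notify modes, here is a minimal kernel-side sketch; it is not taken from this merge, and queue_completion_work() plus its policy flag are hypothetical:

/*
 * Hypothetical sketch, not from this merge: queue completion work on a
 * target task, only forcing an IPI when the work must run promptly.
 */
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/task_work.h>

static int queue_completion_work(struct task_struct *task,
				 struct callback_head *cb,
				 bool defer_to_kernel_entry)
{
	/*
	 * TWA_SIGNAL_NO_IPI still sets TIF_NOTIFY_SIGNAL (via
	 * __set_notify_signal()), but skips kick_process(), so @cb runs
	 * whenever @task next enters the kernel rather than interrupting it.
	 */
	enum task_work_notify_mode mode =
		defer_to_kernel_entry ? TWA_SIGNAL_NO_IPI : TWA_SIGNAL;

	return task_work_add(task, cb, mode);
}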
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -520,7 +520,7 @@ TRACE_EVENT(io_uring_req_failed,
 		__entry->off		= sqe->off;
 		__entry->addr		= sqe->addr;
 		__entry->len		= sqe->len;
-		__entry->op_flags	= sqe->rw_flags;
+		__entry->op_flags	= sqe->poll32_events;
 		__entry->buf_index	= sqe->buf_index;
 		__entry->personality	= sqe->personality;
 		__entry->file_index	= sqe->file_index;
@@ -530,7 +530,7 @@ TRACE_EVENT(io_uring_req_failed,
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, "
 		  "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
 		  "len=%u, rw_flags=0x%x, buf_index=%d, "
 		  "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
 		  __entry->ctx, __entry->req, __entry->user_data,
@@ -543,6 +543,46 @@ TRACE_EVENT(io_uring_req_failed,
 		  (unsigned long long) __entry->pad2, __entry->error)
 );
 
+/*
+ * io_uring_cqe_overflow - a CQE overflowed
+ *
+ * @ctx:		pointer to a ring context structure
+ * @user_data:		user data associated with the request
+ * @res:		CQE result
+ * @cflags:		CQE flags
+ * @ocqe:		pointer to the overflow cqe (if available)
+ *
+ */
+TRACE_EVENT(io_uring_cqe_overflow,
+
+	TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags,
+		 void *ocqe),
+
+	TP_ARGS(ctx, user_data, res, cflags, ocqe),
+
+	TP_STRUCT__entry (
+		__field(  void *,		ctx		)
+		__field(  unsigned long long,	user_data	)
+		__field(  s32,			res		)
+		__field(  u32,			cflags		)
+		__field(  void *,		ocqe		)
+	),
+
+	TP_fast_assign(
+		__entry->ctx		= ctx;
+		__entry->user_data	= user_data;
+		__entry->res		= res;
+		__entry->cflags		= cflags;
+		__entry->ocqe		= ocqe;
+	),
+
+	TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, "
+		  "overflow_cqe %p",
+		  __entry->ctx, __entry->user_data, __entry->res,
+		  __entry->cflags, __entry->ocqe)
+);
+
 #endif /* _TRACE_IO_URING_H */
 
 /* This part must be outside protection */
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -63,6 +63,15 @@ struct io_uring_sqe {
 	__u64	__pad2[2];
 };
 
+/*
+ * If sqe->file_index is set to this for opcodes that instantiate a new
+ * direct descriptor (like openat/openat2/accept), then io_uring will allocate
+ * an available direct descriptor instead of having the application pass one
+ * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
+ * if the space is full.
+ */
+#define IORING_FILE_INDEX_ALLOC		(~0U)
+
 enum {
 	IOSQE_FIXED_FILE_BIT,
 	IOSQE_IO_DRAIN_BIT,
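A minimal userspace sketch of the allocated-slot flow, using only the raw SQE layout and assuming a ring with a registered (possibly sparse) file table; prep_openat_direct_alloc() is a hypothetical helper, not a liburing API:

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <linux/io_uring.h>

/* Fill an already-acquired SQE; submission and CQE reaping are elided. */
static void prep_openat_direct_alloc(struct io_uring_sqe *sqe, const char *path)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode     = IORING_OP_OPENAT;
	sqe->fd         = AT_FDCWD;
	sqe->addr       = (unsigned long long)(uintptr_t)path;
	sqe->open_flags = O_RDONLY;
	/* let the kernel pick a free slot in the registered file table;
	 * the chosen slot comes back in cqe->res, or -ENFILE if it is full */
	sqe->file_index = IORING_FILE_INDEX_ALLOC;
}

The returned slot can then be used by later requests via IOSQE_FIXED_FILE, without the application having to track free indices itself.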
@@ -102,6 +111,20 @@ enum {
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
 #define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
+/*
+ * Cooperative task running. When requests complete, they often require
+ * forcing the submitter to transition to the kernel to complete. If this
+ * flag is set, work will be done when the task transitions anyway, rather
+ * than force an inter-processor interrupt reschedule. This avoids interrupting
+ * a task running in userspace, and saves an IPI.
+ */
+#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
+/*
+ * If COOP_TASKRUN is set, get notified if task work is available for
+ * running and a kernel transition would be needed to run it. This sets
+ * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
+ */
+#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)
 
 enum {
 	IORING_OP_NOP,
@@ -187,6 +210,33 @@ enum {
 #define IORING_POLL_UPDATE_EVENTS	(1U << 1)
 #define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
 
+/*
+ * ASYNC_CANCEL flags.
+ *
+ * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
+ * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
+ *				request 'user_data'
+ * IORING_ASYNC_CANCEL_ANY	Match any request
+ */
+#define IORING_ASYNC_CANCEL_ALL	(1U << 0)
+#define IORING_ASYNC_CANCEL_FD	(1U << 1)
+#define IORING_ASYNC_CANCEL_ANY	(1U << 2)
+
+/*
+ * send/sendmsg and recv/recvmsg flags (sqe->addr2)
+ *
+ * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
+ *				or receive and arm poll if that yields an
+ *				-EAGAIN result, arm poll upfront and skip
+ *				the initial transfer attempt.
+ */
+#define IORING_RECVSEND_POLL_FIRST	(1U << 0)
+
+/*
+ * accept flags stored in sqe->ioprio
+ */
+#define IORING_ACCEPT_MULTISHOT	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
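A hedged userspace sketch of the new accept and cancel flags, again working directly on the raw SQE layout (both helper names are hypothetical, not liburing functions):

#include <string.h>
#include <linux/io_uring.h>

/* Arm a multishot accept on a listening socket: one SQE, a CQE per
 * accepted connection, with IORING_CQE_F_MORE set while it stays armed. */
static void prep_accept_multishot(struct io_uring_sqe *sqe, int listen_fd)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ACCEPT;
	sqe->fd     = listen_fd;
	sqe->ioprio = IORING_ACCEPT_MULTISHOT;	/* accept flags live in ioprio */
}

/* Cancel every pending request that targets @fd, not just the first match. */
static void prep_cancel_by_fd(struct io_uring_sqe *sqe, int fd)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode       = IORING_OP_ASYNC_CANCEL;
	sqe->fd           = fd;
	sqe->cancel_flags = IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_ALL;
}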
@@ -236,6 +286,7 @@ struct io_sqring_offsets {
  */
#define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
 #define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring is overflown */
+#define IORING_SQ_TASKRUN	(1U << 2) /* task should enter the kernel */
 
 struct io_cqring_offsets {
 	__u32 head;
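A rough sketch of how an application might opt into cooperative task running and use the new SQ ring flag, assuming the usual raw io_uring_setup()/mmap flow; setup_coop_ring() and need_enter_for_taskrun() are hypothetical helpers:

#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Create a ring that runs completion task_work on normal kernel transitions
 * instead of IPIs, and publishes IORING_SQ_TASKRUN when work is pending. */
static int setup_coop_ring(unsigned int entries, struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG;
	return syscall(__NR_io_uring_setup, entries, p);
}

/* @sq_flags points at the SQ ring flags word (sq ring mmap + sq_off.flags);
 * a real implementation should use an acquire load here. */
static int need_enter_for_taskrun(const unsigned int *sq_flags)
{
	return *(const volatile unsigned int *)sq_flags & IORING_SQ_TASKRUN;
}

When the flag is observed, an io_uring_enter() call with IORING_ENTER_GETEVENTS should be enough to run the pending task_work and reap the resulting completions.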
@@ -333,6 +384,10 @@ enum {
 	IORING_REGISTER_RING_FDS		= 20,
 	IORING_UNREGISTER_RING_FDS		= 21,
 
+	/* register ring based provide buffer group */
+	IORING_REGISTER_PBUF_RING		= 22,
+	IORING_UNREGISTER_PBUF_RING		= 23,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -350,9 +405,15 @@ struct io_uring_files_update {
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+/*
+ * Register a fully sparse file space, rather than pass in an array of all
+ * -1 file descriptors.
+ */
+#define IORING_RSRC_REGISTER_SPARSE	(1U << 0)
+
 struct io_uring_rsrc_register {
 	__u32 nr;
-	__u32 resv;
+	__u32 flags;
 	__u64 resv2;
 	__aligned_u64 data;
 	__aligned_u64 tags;
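A minimal sketch of sparse file table registration, under the assumption that IORING_REGISTER_FILES2 with a struct io_uring_rsrc_register is the transport; register_sparse_files() is a hypothetical helper:

#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Reserve @nr_slots direct-descriptor slots without passing an array of -1
 * fds; individual slots are populated later (e.g. via file_index allocation
 * or IORING_OP_FILES_UPDATE). */
static int register_sparse_files(int ring_fd, unsigned int nr_slots)
{
	struct io_uring_rsrc_register reg;

	memset(&reg, 0, sizeof(reg));
	reg.nr    = nr_slots;
	reg.flags = IORING_RSRC_REGISTER_SPARSE;	/* data/tags stay NULL */

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES2, &reg, sizeof(reg));
}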
@@ -404,6 +465,38 @@ struct io_uring_restriction {
 	__u32 resv2[3];
 };
 
+struct io_uring_buf {
+	__u64	addr;
+	__u32	len;
+	__u16	bid;
+	__u16	resv;
+};
+
+struct io_uring_buf_ring {
+	union {
+		/*
+		 * To avoid spilling into more pages than we need to, the
+		 * ring tail is overlaid with the io_uring_buf->resv field.
+		 */
+		struct {
+			__u64	resv1;
+			__u32	resv2;
+			__u16	resv3;
+			__u16	tail;
+		};
+		struct io_uring_buf	bufs[0];
+	};
+};
+
+/* argument for IORING_(UN)REGISTER_PBUF_RING */
+struct io_uring_buf_reg {
+	__u64	ring_addr;
+	__u32	ring_entries;
+	__u16	bgid;
+	__u16	pad;
+	__u64	resv[3];
+};
+
 /*
  * io_uring_restriction->opcode values
  */
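A sketch of registering a ring mapped provided-buffer group with the new structures, under the assumptions that the ring memory must be page aligned, ring_entries a power of two, and nr_args passed as 1; register_buf_ring() is a hypothetical helper:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static struct io_uring_buf_ring *register_buf_ring(int ring_fd,
						   unsigned int entries,
						   unsigned int bgid)
{
	struct io_uring_buf_ring *br;
	struct io_uring_buf_reg reg;

	/* the ring is just entries * sizeof(struct io_uring_buf) bytes; the
	 * tail is overlaid on bufs[0].resv, so no extra header is needed */
	if (posix_memalign((void **)&br, 4096,
			   entries * sizeof(struct io_uring_buf)))
		return NULL;
	memset(br, 0, entries * sizeof(struct io_uring_buf));

	memset(&reg, 0, sizeof(reg));
	reg.ring_addr    = (unsigned long long)(uintptr_t)br;
	reg.ring_entries = entries;
	reg.bgid         = bgid;

	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING,
		    &reg, 1) < 0) {
		free(br);
		return NULL;
	}
	return br;
}

Buffers would then be published by filling br->bufs[i & (entries - 1)] and storing the new tail with release semantics, while requests consume them via IOSQE_BUFFER_SELECT and a matching sqe->buf_group.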
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
  * @notify: how to notify the targeted task
  *
  * Queue @work for task_work_run() below and notify the @task if @notify
- * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the
- * it will interrupt the targeted task and run the task_work. @TWA_RESUME
- * work is run only when the task exits the kernel and returns to user mode,
- * or before entering guest mode. Fails if the @task is exiting/exited and thus
- * it can't process this @work. Otherwise @work->func() will be called when the
- * @task goes through one of the aforementioned transitions, or exits.
+ * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
+ *
+ * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted
+ * task and run the task_work, regardless of whether the task is currently
+ * running in the kernel or userspace.
+ * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
+ * reschedule IPI to force the targeted task to reschedule and run task_work.
+ * This can be advantageous if there's no strict requirement that the
+ * task_work be run as soon as possible, just whenever the task enters the
+ * kernel anyway.
+ * @TWA_RESUME work is run only when the task exits the kernel and returns to
+ * user mode, or before entering guest mode.
+ *
+ * Fails if the @task is exiting/exited and thus it can't process this @work.
+ * Otherwise @work->func() will be called when the @task goes through one of
+ * the aforementioned transitions, or exits.
  *
  * If the targeted task is exiting, then an error is returned and the work item
  * is not queued. It's up to the caller to arrange for an alternative mechanism
@@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 	case TWA_SIGNAL:
 		set_notify_signal(task);
 		break;
+	case TWA_SIGNAL_NO_IPI:
+		__set_notify_signal(task);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		break;