for-5.19/io_uring-2022-05-22

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmKKol0QHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpn+sEACbdEQqG6OoCOhJ0ZuxTdQqNMGxCImKBxjP
 8Bqf+0hYNgwfG+80/UQvmc7olb+KxvZ6KtrgViC/ujhvMQmX0Xf/881kiiKG/iHJ
 XKoL9PdqIkenIGnlyEp1uRmnUbooYF+s4iT6Gj/pjnn29GbcKjsPzKV1CUNkt3GC
 R+wpdKczHQDaSwzDY5Ntyjf68QUQOyUznkHW+6JOcBeih3ET7NfapR/zsFS93RlL
 B9pQ9NiBBQfzCAUycVyQMC+p/rJbKWgidAiFk4fXKRm8/7iNwT4dB0+oUymlECxt
 xvalRVK6ER1s4RSdQcUTZoQA+SrzzOnK1DYja9cvcLT3wH+aojana6S0rOMDi8wp
 hoWT5jdMaZN09Vcm7J4sBN15i50m9aDITp21PKOVDZXSMVsebltCL9phaN5+9x/j
 AfF6Vki1WTB4gYaDHR8v6UkW+HcF1WOmMdq8GB9UMfnTya6EJqAooYT9lhQBP/rv
 jxkdj9Fu98O87dOfy1Av9AxH1UB8d7ypCJKkSEMAUPoWf0rC9HjYr0cRq/yppAj8
 pI/0PwXaXRfQuoHPqZyETrPel77VQdBw+Hg+6TS0KlTd3WlVEJMZJPtXK466IFLp
 pYSRVnSI9PuhiClOpxriTCw0cppfRIv11IerCxRziqH9S1zijk0VBCN40//XDs1o
 JfvoA6htKQ==
 =S+Uf
 -----END PGP SIGNATURE-----

Merge tag 'for-5.19/io_uring-2022-05-22' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Here are the main io_uring changes for 5.19. This contains:

   - Fixes for sparse type warnings (Christoph, Vasily)

   - Support for multi-shot accept (Hao)

   - Support for io_uring managed fixed files, rather than always
     needing the application to manage the indices (me)

   - Fix for a spurious poll wakeup (Dylan)

   - CQE overflow fixes (Dylan)

   - Support more types of cancelations (me)

   - Support for co-operative task_work signaling, rather than always
     forcing an IPI (me)

   - Support for doing poll first when appropriate, rather than always
     attempting a transfer first (me)

   - Provided buffer cleanups and support for mapped buffers (me)

   - Improve how io_uring handles inflight SCM files (Pavel)

   - Speedups for registered files (Pavel, me)

   - Organize the completion data in a struct in io_kiocb rather than
     keep it in separate spots (Pavel)

   - task_work improvements (Pavel)

   - Cleanup and optimize the submission path, in general and for
     handling links (Pavel)

   - Speedups for registered resource handling (Pavel)

   - Support sparse buffers and file maps (Pavel, me)

   - Various fixes and cleanups (Almog, Pavel, me)"

* tag 'for-5.19/io_uring-2022-05-22' of git://git.kernel.dk/linux-block: (111 commits)
  io_uring: fix incorrect __kernel_rwf_t cast
  io_uring: disallow mixed provided buffer group registrations
  io_uring: initialize io_buffer_list head when shared ring is unregistered
  io_uring: add fully sparse buffer registration
  io_uring: use rcu_dereference in io_close
  io_uring: consistently use the EPOLL* defines
  io_uring: make apoll_events a __poll_t
  io_uring: drop a spurious inline on a forward declaration
  io_uring: don't use ERR_PTR for user pointers
  io_uring: use a rwf_t for io_rw.flags
  io_uring: add support for ring mapped supplied buffers
  io_uring: add io_pin_pages() helper
  io_uring: add buffer selection support to IORING_OP_NOP
  io_uring: fix locking state for empty buffer group
  io_uring: implement multishot mode for accept
  io_uring: let fast poll support multishot
  io_uring: add REQ_F_APOLL_MULTISHOT for requests
  io_uring: add IORING_ACCEPT_MULTISHOT for accept
  io_uring: only wake when the correct events are set
  io_uring: avoid io-wq -EAGAIN looping for !IOPOLL
  ...
This commit is contained in:
Linus Torvalds 2022-05-23 12:22:49 -07:00
commit 3a166bdbf3
8 changed files with 1818 additions and 1036 deletions

View File

@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
static bool io_wq_worker_wake(struct io_worker *worker, void *data) static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{ {
set_notify_signal(worker->task); __set_notify_signal(worker->task);
wake_up_process(worker->task); wake_up_process(worker->task);
return false; return false;
} }
@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
{ {
if (work && match->fn(work, match->data)) { if (work && match->fn(work, match->data)) {
work->flags |= IO_WQ_WORK_CANCEL; work->flags |= IO_WQ_WORK_CANCEL;
set_notify_signal(worker->task); __set_notify_signal(worker->task);
return true; return true;
} }

View File

@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
struct io_wq_work { struct io_wq_work {
struct io_wq_work_node list; struct io_wq_work_node list;
unsigned flags; unsigned flags;
int cancel_seq;
}; };
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)

File diff suppressed because it is too large Load Diff

View File

@ -355,14 +355,23 @@ static inline void clear_notify_signal(void)
smp_mb__after_atomic(); smp_mb__after_atomic();
} }
/*
* Returns 'true' if kick_process() is needed to force a transition from
* user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work.
*/
static inline bool __set_notify_signal(struct task_struct *task)
{
return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
!wake_up_state(task, TASK_INTERRUPTIBLE);
}
/* /*
* Called to break out of interruptible wait loops, and enter the * Called to break out of interruptible wait loops, and enter the
* exit_to_user_mode_loop(). * exit_to_user_mode_loop().
*/ */
static inline void set_notify_signal(struct task_struct *task) static inline void set_notify_signal(struct task_struct *task)
{ {
if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && if (__set_notify_signal(task))
!wake_up_state(task, TASK_INTERRUPTIBLE))
kick_process(task); kick_process(task);
} }

View File

@ -17,6 +17,7 @@ enum task_work_notify_mode {
TWA_NONE, TWA_NONE,
TWA_RESUME, TWA_RESUME,
TWA_SIGNAL, TWA_SIGNAL,
TWA_SIGNAL_NO_IPI,
}; };
static inline bool task_work_pending(struct task_struct *task) static inline bool task_work_pending(struct task_struct *task)

View File

@ -520,7 +520,7 @@ TRACE_EVENT(io_uring_req_failed,
__entry->off = sqe->off; __entry->off = sqe->off;
__entry->addr = sqe->addr; __entry->addr = sqe->addr;
__entry->len = sqe->len; __entry->len = sqe->len;
__entry->op_flags = sqe->rw_flags; __entry->op_flags = sqe->poll32_events;
__entry->buf_index = sqe->buf_index; __entry->buf_index = sqe->buf_index;
__entry->personality = sqe->personality; __entry->personality = sqe->personality;
__entry->file_index = sqe->file_index; __entry->file_index = sqe->file_index;
@ -530,7 +530,7 @@ TRACE_EVENT(io_uring_req_failed,
), ),
TP_printk("ring %p, req %p, user_data 0x%llx, " TP_printk("ring %p, req %p, user_data 0x%llx, "
"op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
"len=%u, rw_flags=0x%x, buf_index=%d, " "len=%u, rw_flags=0x%x, buf_index=%d, "
"personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
__entry->ctx, __entry->req, __entry->user_data, __entry->ctx, __entry->req, __entry->user_data,
@ -543,6 +543,46 @@ TRACE_EVENT(io_uring_req_failed,
(unsigned long long) __entry->pad2, __entry->error) (unsigned long long) __entry->pad2, __entry->error)
); );
/*
* io_uring_cqe_overflow - a CQE overflowed
*
* @ctx: pointer to a ring context structure
* @user_data: user data associated with the request
* @res: CQE result
* @cflags: CQE flags
* @ocqe: pointer to the overflow cqe (if available)
*
*/
TRACE_EVENT(io_uring_cqe_overflow,
TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags,
void *ocqe),
TP_ARGS(ctx, user_data, res, cflags, ocqe),
TP_STRUCT__entry (
__field( void *, ctx )
__field( unsigned long long, user_data )
__field( s32, res )
__field( u32, cflags )
__field( void *, ocqe )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->user_data = user_data;
__entry->res = res;
__entry->cflags = cflags;
__entry->ocqe = ocqe;
),
TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, "
"overflow_cqe %p",
__entry->ctx, __entry->user_data, __entry->res,
__entry->cflags, __entry->ocqe)
);
#endif /* _TRACE_IO_URING_H */ #endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */ /* This part must be outside protection */

View File

@ -63,6 +63,15 @@ struct io_uring_sqe {
__u64 __pad2[2]; __u64 __pad2[2];
}; };
/*
* If sqe->file_index is set to this for opcodes that instantiate a new
* direct descriptor (like openat/openat2/accept), then io_uring will allocate
* an available direct descriptor instead of having the application pass one
* in. The picked direct descriptor will be returned in cqe->res, or -ENFILE
* if the space is full.
*/
#define IORING_FILE_INDEX_ALLOC (~0U)
enum { enum {
IOSQE_FIXED_FILE_BIT, IOSQE_FIXED_FILE_BIT,
IOSQE_IO_DRAIN_BIT, IOSQE_IO_DRAIN_BIT,
@ -102,6 +111,20 @@ enum {
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ #define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */
/*
* Cooperative task running. When requests complete, they often require
* forcing the submitter to transition to the kernel to complete. If this
* flag is set, work will be done when the task transitions anyway, rather
* than force an inter-processor interrupt reschedule. This avoids interrupting
* a task running in userspace, and saves an IPI.
*/
#define IORING_SETUP_COOP_TASKRUN (1U << 8)
/*
* If COOP_TASKRUN is set, get notified if task work is available for
* running and a kernel transition would be needed to run it. This sets
* IORING_SQ_TASKRUN in the sq ring flags. Not valid without COOP_TASKRUN.
*/
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
enum { enum {
IORING_OP_NOP, IORING_OP_NOP,
@ -187,6 +210,33 @@ enum {
#define IORING_POLL_UPDATE_EVENTS (1U << 1) #define IORING_POLL_UPDATE_EVENTS (1U << 1)
#define IORING_POLL_UPDATE_USER_DATA (1U << 2) #define IORING_POLL_UPDATE_USER_DATA (1U << 2)
/*
* ASYNC_CANCEL flags.
*
* IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key
* IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the
* request 'user_data'
* IORING_ASYNC_CANCEL_ANY Match any request
*/
#define IORING_ASYNC_CANCEL_ALL (1U << 0)
#define IORING_ASYNC_CANCEL_FD (1U << 1)
#define IORING_ASYNC_CANCEL_ANY (1U << 2)
/*
* send/sendmsg and recv/recvmsg flags (sqe->addr2)
*
* IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send
* or receive and arm poll if that yields an
* -EAGAIN result, arm poll upfront and skip
* the initial transfer attempt.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
/*
* accept flags stored in sqe->ioprio
*/
#define IORING_ACCEPT_MULTISHOT (1U << 0)
/* /*
* IO completion data structure (Completion Queue Entry) * IO completion data structure (Completion Queue Entry)
*/ */
@ -236,6 +286,7 @@ struct io_sqring_offsets {
*/ */
#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */
#define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */ #define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */
#define IORING_SQ_TASKRUN (1U << 2) /* task should enter the kernel */
struct io_cqring_offsets { struct io_cqring_offsets {
__u32 head; __u32 head;
@ -333,6 +384,10 @@ enum {
IORING_REGISTER_RING_FDS = 20, IORING_REGISTER_RING_FDS = 20,
IORING_UNREGISTER_RING_FDS = 21, IORING_UNREGISTER_RING_FDS = 21,
/* register ring based provide buffer group */
IORING_REGISTER_PBUF_RING = 22,
IORING_UNREGISTER_PBUF_RING = 23,
/* this goes last */ /* this goes last */
IORING_REGISTER_LAST IORING_REGISTER_LAST
}; };
@ -350,9 +405,15 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds; __aligned_u64 /* __s32 * */ fds;
}; };
/*
* Register a fully sparse file space, rather than pass in an array of all
* -1 file descriptors.
*/
#define IORING_RSRC_REGISTER_SPARSE (1U << 0)
struct io_uring_rsrc_register { struct io_uring_rsrc_register {
__u32 nr; __u32 nr;
__u32 resv; __u32 flags;
__u64 resv2; __u64 resv2;
__aligned_u64 data; __aligned_u64 data;
__aligned_u64 tags; __aligned_u64 tags;
@ -404,6 +465,38 @@ struct io_uring_restriction {
__u32 resv2[3]; __u32 resv2[3];
}; };
struct io_uring_buf {
__u64 addr;
__u32 len;
__u16 bid;
__u16 resv;
};
struct io_uring_buf_ring {
union {
/*
* To avoid spilling into more pages than we need to, the
* ring tail is overlaid with the io_uring_buf->resv field.
*/
struct {
__u64 resv1;
__u32 resv2;
__u16 resv3;
__u16 tail;
};
struct io_uring_buf bufs[0];
};
};
/* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg {
__u64 ring_addr;
__u32 ring_entries;
__u16 bgid;
__u16 pad;
__u64 resv[3];
};
/* /*
* io_uring_restriction->opcode values * io_uring_restriction->opcode values
*/ */

View File

@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* @notify: how to notify the targeted task * @notify: how to notify the targeted task
* *
* Queue @work for task_work_run() below and notify the @task if @notify * Queue @work for task_work_run() below and notify the @task if @notify
* is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
* it will interrupt the targeted task and run the task_work. @TWA_RESUME *
* work is run only when the task exits the kernel and returns to user mode, * @TWA_SIGNAL works like signals, in that it will interrupt the targeted
* or before entering guest mode. Fails if the @task is exiting/exited and thus * task and run the task_work, regardless of whether the task is currently
* it can't process this @work. Otherwise @work->func() will be called when the * running in the kernel or userspace.
* @task goes through one of the aforementioned transitions, or exits. * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
* reschedule IPI to force the targeted task to reschedule and run task_work.
* This can be advantageous if there's no strict requirement that the
* task_work be run as soon as possible, just whenever the task enters the
* kernel anyway.
* @TWA_RESUME work is run only when the task exits the kernel and returns to
* user mode, or before entering guest mode.
*
* Fails if the @task is exiting/exited and thus it can't process this @work.
* Otherwise @work->func() will be called when the @task goes through one of
* the aforementioned transitions, or exits.
* *
* If the targeted task is exiting, then an error is returned and the work item * If the targeted task is exiting, then an error is returned and the work item
* is not queued. It's up to the caller to arrange for an alternative mechanism * is not queued. It's up to the caller to arrange for an alternative mechanism
@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
case TWA_SIGNAL: case TWA_SIGNAL:
set_notify_signal(task); set_notify_signal(task);
break; break;
case TWA_SIGNAL_NO_IPI:
__set_notify_signal(task);
break;
default: default:
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
break; break;