2022-05-24 21:45:38 +03:00
# ifndef IOU_CORE_H
# define IOU_CORE_H
# include <linux/errno.h>
2022-05-25 06:54:43 +03:00
# include <linux/llist.h>
# include <linux/lockdep.h>
2023-01-24 18:24:25 +03:00
# include <linux/resume_user_mode.h>
2023-01-18 18:56:30 +03:00
# include <linux/kasan.h>
2022-06-16 15:57:19 +03:00
# include <linux/io_uring_types.h>
2022-11-20 20:18:45 +03:00
# include <uapi/linux/eventpoll.h>
2022-06-16 15:57:19 +03:00
# include "io-wq.h"
2022-06-21 12:09:01 +03:00
# include "slist.h"
2022-06-16 15:57:19 +03:00
# include "filetable.h"
2022-05-24 21:45:38 +03:00
2022-06-13 16:27:03 +03:00
# ifndef CREATE_TRACE_POINTS
# include <trace/events/io_uring.h>
# endif
2023-04-06 16:20:10 +03:00
/*
 * Task-work queueing flag bits. Presumably these are the values for the
 * 'flags' argument of __io_req_task_work_add() -- confirm against callers.
 */
enum {
2023-04-06 16:20:12 +03:00
/*
 * A hint to not wake right away but delay until there are enough
 * tw's queued to match the number of CQEs the task is waiting for.
 *
 * Must not be used with requests generating more than one CQE.
 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
 */
2023-06-23 14:23:26 +03:00
IOU_F_TWQ_LAZY_WAKE = 1 ,
2023-04-06 16:20:10 +03:00
} ;
2022-05-25 00:21:00 +03:00
/*
 * Internal issue return codes. IOU_OK is 0; the others alias negative
 * errno values (-EIOCBQUEUED and -ECANCELED respectively).
 */
enum {
IOU_OK = 0 ,
IOU_ISSUE_SKIP_COMPLETE = - EIOCBQUEUED ,
2022-06-30 12:12:25 +03:00
/*
 * Intended only for use when IO_URING_F_MULTISHOT is passed, to
 * indicate to the poll runner that multishot should be removed
 * and that the result is set on req->cqe.res.
 */
IOU_STOP_MULTISHOT = - ECANCELED ,
2022-05-25 00:21:00 +03:00
} ;
2023-08-25 01:53:26 +03:00
/*
 * Prototypes for io_uring core functions that are not defined in this
 * header; the inline helpers further down call several of them
 * (e.g. io_cqe_cache_refill(), __io_req_task_work_add(),
 * __io_commit_cqring_flush(), io_task_refs_refill(),
 * __io_alloc_req_refill(), io_req_task_complete()).
 */
bool io_cqe_cache_refill ( struct io_ring_ctx * ctx , bool overflow ) ;
2023-08-11 15:53:44 +03:00
void io_req_cqe_overflow ( struct io_kiocb * req ) ;
2022-08-30 15:50:10 +03:00
int io_run_task_work_sig ( struct io_ring_ctx * ctx ) ;
2022-11-24 12:35:53 +03:00
void io_req_defer_failed ( struct io_kiocb * req , s32 res ) ;
2022-11-23 14:33:41 +03:00
void io_req_complete_post ( struct io_kiocb * req , unsigned issue_flags ) ;
2022-11-24 12:35:58 +03:00
bool io_post_aux_cqe ( struct io_ring_ctx * ctx , u64 user_data , s32 res , u32 cflags ) ;
2023-08-11 15:53:45 +03:00
bool io_fill_cqe_req_aux ( struct io_kiocb * req , bool defer , s32 res , u32 cflags ) ;
2022-06-19 14:26:05 +03:00
void __io_commit_cqring_flush ( struct io_ring_ctx * ctx ) ;
struct page * * io_pin_pages ( unsigned long ubuf , unsigned long len , int * npages ) ;
struct file * io_file_get_normal ( struct io_kiocb * req , int fd ) ;
struct file * io_file_get_fixed ( struct io_kiocb * req , int fd ,
unsigned issue_flags ) ;
2023-04-06 16:20:10 +03:00
void __io_req_task_work_add ( struct io_kiocb * req , unsigned flags ) ;
2022-06-19 14:26:05 +03:00
bool io_is_uring_fops ( struct file * file ) ;
bool io_alloc_async_data ( struct io_kiocb * req ) ;
void io_req_task_queue ( struct io_kiocb * req ) ;
2023-03-27 18:38:15 +03:00
void io_queue_iowq ( struct io_kiocb * req , struct io_tw_state * ts_dont_use ) ;
void io_req_task_complete ( struct io_kiocb * req , struct io_tw_state * ts ) ;
2022-06-19 14:26:05 +03:00
void io_req_task_queue_fail ( struct io_kiocb * req , int ret ) ;
2023-03-27 18:38:15 +03:00
void io_req_task_submit ( struct io_kiocb * req , struct io_tw_state * ts ) ;
2022-06-19 14:26:05 +03:00
void tctx_task_work ( struct callback_head * cb ) ;
__cold void io_uring_cancel_generic ( bool cancel_all , struct io_sq_data * sqd ) ;
int io_uring_alloc_task_context ( struct task_struct * task ,
struct io_ring_ctx * ctx ) ;
2023-04-28 19:40:30 +03:00
int io_ring_add_registered_file ( struct io_uring_task * tctx , struct file * file ,
int start , int end ) ;
2023-03-27 18:38:15 +03:00
int io_poll_issue ( struct io_kiocb * req , struct io_tw_state * ts ) ;
2022-06-19 14:26:05 +03:00
int io_submit_sqes ( struct io_ring_ctx * ctx , unsigned int nr ) ;
int io_do_iopoll ( struct io_ring_ctx * ctx , bool force_nonspin ) ;
2023-08-25 01:53:29 +03:00
void __io_submit_flush_completions ( struct io_ring_ctx * ctx ) ;
2022-06-19 14:26:05 +03:00
int io_req_prep_async ( struct io_kiocb * req ) ;
struct io_wq_work * io_wq_free_work ( struct io_wq_work * work ) ;
void io_wq_submit_work ( struct io_wq_work * work ) ;
void io_free_req ( struct io_kiocb * req ) ;
void io_queue_next ( struct io_kiocb * req ) ;
2022-07-12 23:52:47 +03:00
void io_task_refs_refill ( struct io_uring_task * tctx ) ;
2022-07-27 12:30:40 +03:00
bool __io_alloc_req_refill ( struct io_ring_ctx * ctx ) ;
2022-06-19 14:26:05 +03:00
bool io_match_task_safe ( struct io_kiocb * head , struct task_struct * task ,
bool cancel_all ) ;
2023-10-03 04:51:38 +03:00
#if defined(CONFIG_PROVE_LOCKING)
/*
 * Lockdep sanity check run before a CQE slot is handed out for @ctx.
 *
 * Always asserts task context. What else is asserted depends on the ring:
 * ->uring_lock for IORING_SETUP_IOPOLL rings, ->completion_lock when
 * ->task_complete is clear, otherwise the identity of the running task.
 */
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
	lockdep_assert(in_task());

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		lockdep_assert_held(&ctx->uring_lock);
		return;
	}

	if (!ctx->task_complete) {
		lockdep_assert_held(&ctx->completion_lock);
		return;
	}

	/*
	 * ->submitter_task may be NULL and we can still post a CQE,
	 * if the ring has been setup with IORING_SETUP_R_DISABLED.
	 * Not from an SQE, as those cannot be submitted, but via
	 * updating tagged resources. Nothing to check in that case.
	 */
	if (!ctx->submitter_task)
		return;

	if (ctx->submitter_task->flags & PF_EXITING)
		lockdep_assert(current_work());
	else
		lockdep_assert(current == ctx->submitter_task);
}
#else
/* Stub used when CONFIG_PROVE_LOCKING is not defined: checks nothing. */
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
}
#endif
2023-01-04 04:34:57 +03:00
2022-11-11 19:54:08 +03:00
/*
 * Convenience wrapper: add @req's task work via __io_req_task_work_add()
 * with a flags value of 0.
 */
static inline void io_req_task_work_add(struct io_kiocb *req)
{
	__io_req_task_work_add(req, 0);
}
2022-06-19 14:26:05 +03:00
/*
 * Walk a chain connected through ->link.
 *
 * @pos:  lvalue used as the cursor; it is assigned @head first and then each
 *        successive ->link until that becomes NULL (so @pos is NULL after a
 *        full walk).
 * @head: first element of the chain, may be NULL for an empty walk.
 *
 * Both arguments are parenthesised so that any lvalue expression (for
 * example "*cursor") can serve as @pos. @pos is evaluated several times
 * per iteration, so it must be free of side effects.
 */
#define io_for_each_link(pos, head) \
	for ((pos) = (head); (pos); (pos) = (pos)->link)
2022-06-13 16:27:03 +03:00
2023-08-25 01:53:27 +03:00
/*
 * Hand out the next cached CQE slot of @ctx.
 *
 * @ctx:      ring to take the slot from
 * @ret:      set to the slot on success
 * @overflow: passed through to io_cqe_cache_refill() when the cache is
 *            exhausted
 *
 * Returns false if the cache was exhausted and could not be refilled,
 * true otherwise. On success the cached CQ tail is advanced by one and the
 * cache cursor by one slot, or by two slots for IORING_SETUP_CQE32 rings.
 */
static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
					struct io_uring_cqe **ret,
					bool overflow)
{
	io_lockdep_assert_cq_locked(ctx);

	if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel) &&
	    unlikely(!io_cqe_cache_refill(ctx, overflow)))
		return false;

	*ret = ctx->cqe_cached;
	ctx->cached_cq_tail++;
	ctx->cqe_cached += (ctx->flags & IORING_SETUP_CQE32) ? 2 : 1;
	return true;
}
2023-08-25 01:53:27 +03:00
static inline bool io_get_cqe ( struct io_ring_ctx * ctx , struct io_uring_cqe * * ret )
2022-09-23 16:53:25 +03:00
{
2023-08-25 01:53:27 +03:00
return io_get_cqe_overflow ( ctx , ret , false ) ;
2022-06-13 16:27:03 +03:00
}
2023-08-25 01:53:30 +03:00
/*
 * Write @req's completion into the next CQE slot of @ctx.
 *
 * Returns false if no CQE slot could be obtained (nothing is copied in that
 * case), true once the CQE has been filled in. For IORING_SETUP_CQE32 rings
 * the request's big_cqe payload is copied as well and then zeroed in @req.
 */
static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
					    struct io_kiocb *req)
{
	struct io_uring_cqe *slot;

	/* No slot available: let the caller deal with it. */
	if (unlikely(!io_get_cqe(ctx, &slot)))
		return false;

	/*
	 * The explicit trace_io_uring_complete_enabled() test is an
	 * optimisation: without it the compiler emits the loads for all
	 * tracepoint arguments in this hot path even when tracing is off.
	 * It can go away once that is solved generically or tracing is
	 * reworked.
	 */
	if (trace_io_uring_complete_enabled())
		trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
					req->cqe.res, req->cqe.flags,
					req->big_cqe.extra1, req->big_cqe.extra2);

	memcpy(slot, &req->cqe, sizeof(*slot));
	if (!(ctx->flags & IORING_SETUP_CQE32))
		return true;

	memcpy(slot->big_cqe, &req->big_cqe, sizeof(*slot));
	memset(&req->big_cqe, 0, sizeof(req->big_cqe));
	return true;
}
2022-05-25 06:19:47 +03:00
static inline void req_set_fail ( struct io_kiocb * req )
{
req - > flags | = REQ_F_FAIL ;
if ( req - > flags & REQ_F_CQE_SKIP ) {
req - > flags & = ~ REQ_F_CQE_SKIP ;
req - > flags | = REQ_F_SKIP_LINK_CQES ;
}
}
2022-05-24 21:45:38 +03:00
/* Store the completion result @res and CQE flags @cflags in @req->cqe. */
static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
{
	req->cqe.flags = cflags;
	req->cqe.res = res;
}
2022-05-25 14:59:19 +03:00
static inline bool req_has_async_data ( struct io_kiocb * req )
{
return req - > flags & REQ_F_ASYNC_DATA ;
}
2023-07-07 20:14:40 +03:00
static inline void io_put_file ( struct io_kiocb * req )
2022-05-25 06:19:47 +03:00
{
2023-07-07 20:14:40 +03:00
if ( ! ( req - > flags & REQ_F_FIXED_FILE ) & & req - > file )
fput ( req - > file ) ;
2022-05-25 06:19:47 +03:00
}
2022-05-25 06:54:43 +03:00
static inline void io_ring_submit_unlock ( struct io_ring_ctx * ctx ,
unsigned issue_flags )
{
lockdep_assert_held ( & ctx - > uring_lock ) ;
if ( issue_flags & IO_URING_F_UNLOCKED )
mutex_unlock ( & ctx - > uring_lock ) ;
}
static inline void io_ring_submit_lock ( struct io_ring_ctx * ctx ,
unsigned issue_flags )
{
/*
* " Normal " inline submissions always hold the uring_lock , since we
* grab it from the system call . Same is true for the SQPOLL offload .
* The only exception is when we ' ve detached the request and issue it
* from an async worker thread , grab the lock for that case .
*/
if ( issue_flags & IO_URING_F_UNLOCKED )
mutex_lock ( & ctx - > uring_lock ) ;
lockdep_assert_held ( & ctx - > uring_lock ) ;
}
2022-05-25 15:25:13 +03:00
static inline void io_commit_cqring ( struct io_ring_ctx * ctx )
{
/* order cqe stores with ring update */
smp_store_release ( & ctx - > rings - > cq . tail , ctx - > cached_cq_tail ) ;
}
2023-01-09 17:46:08 +03:00
static inline void io_poll_wq_wake ( struct io_ring_ctx * ctx )
{
2023-01-09 17:46:09 +03:00
if ( wq_has_sleeper ( & ctx - > poll_wq ) )
2023-01-09 17:46:08 +03:00
__wake_up ( & ctx - > poll_wq , TASK_NORMAL , 0 ,
poll_to_key ( EPOLL_URING_WAKE | EPOLLIN ) ) ;
}
2023-04-06 16:20:09 +03:00
static inline void io_cqring_wake ( struct io_ring_ctx * ctx )
2022-06-13 16:27:03 +03:00
{
/*
2022-11-20 20:18:45 +03:00
* Trigger waitqueue handler on all waiters on our waitqueue . This
* won ' t necessarily wake up all the tasks , io_should_wake ( ) will make
* that decision .
*
* Pass in EPOLLIN | EPOLL_URING_WAKE as the poll wakeup key . The latter
* set in the mask so that if we recurse back into our own poll
* waitqueue handlers , we know we have a dependency between eventfd or
* epoll and should terminate multishot poll at that point .
2022-06-13 16:27:03 +03:00
*/
2023-04-06 16:20:09 +03:00
if ( wq_has_sleeper ( & ctx - > cq_wait ) )
2022-11-20 20:18:45 +03:00
__wake_up ( & ctx - > cq_wait , TASK_NORMAL , 0 ,
poll_to_key ( EPOLL_URING_WAKE | EPOLLIN ) ) ;
2022-06-13 16:27:03 +03:00
}
2022-05-25 18:13:39 +03:00
static inline bool io_sqring_full ( struct io_ring_ctx * ctx )
{
struct io_rings * r = ctx - > rings ;
return READ_ONCE ( r - > sq . tail ) - ctx - > cached_sq_head = = ctx - > sq_entries ;
}
static inline unsigned int io_sqring_entries ( struct io_ring_ctx * ctx )
{
struct io_rings * rings = ctx - > rings ;
2023-03-30 19:05:31 +03:00
unsigned int entries ;
2022-05-25 18:13:39 +03:00
/* make sure SQ entry isn't read before tail */
2023-03-30 19:05:31 +03:00
entries = smp_load_acquire ( & rings - > sq . tail ) - ctx - > cached_sq_head ;
return min ( entries , ctx - > sq_entries ) ;
2022-05-25 18:13:39 +03:00
}
2022-08-30 15:50:10 +03:00
static inline int io_run_task_work ( void )
2022-05-25 18:13:39 +03:00
{
2022-11-25 19:36:29 +03:00
/*
* Always check - and - clear the task_work notification signal . With how
* signaling works for task_work , we can find it set with nothing to
* run . We need to clear it for that case , like get_signal ( ) does .
*/
if ( test_thread_flag ( TIF_NOTIFY_SIGNAL ) )
clear_notify_signal ( ) ;
2023-01-24 18:24:25 +03:00
/*
* PF_IO_WORKER never returns to userspace , so check here if we have
* notify work that needs processing .
*/
if ( current - > flags & PF_IO_WORKER & &
2023-02-06 18:20:46 +03:00
test_thread_flag ( TIF_NOTIFY_RESUME ) ) {
__set_current_state ( TASK_RUNNING ) ;
2023-01-24 18:24:25 +03:00
resume_user_mode_work ( NULL ) ;
2023-02-06 18:20:46 +03:00
}
2022-09-30 00:29:13 +03:00
if ( task_work_pending ( current ) ) {
2022-05-25 18:13:39 +03:00
__set_current_state ( TASK_RUNNING ) ;
2022-09-30 00:29:13 +03:00
task_work_run ( ) ;
2022-08-30 15:50:10 +03:00
return 1 ;
2022-05-25 18:13:39 +03:00
}
2022-08-30 15:50:10 +03:00
return 0 ;
}
2022-09-03 18:52:01 +03:00
static inline bool io_task_work_pending ( struct io_ring_ctx * ctx )
{
2022-12-17 23:40:17 +03:00
return task_work_pending ( current ) | | ! wq_list_empty ( & ctx - > work_llist ) ;
2022-09-03 18:52:01 +03:00
}
2023-03-27 18:38:15 +03:00
static inline void io_tw_lock ( struct io_ring_ctx * ctx , struct io_tw_state * ts )
2022-06-15 19:33:51 +03:00
{
2023-03-27 18:38:15 +03:00
if ( ! ts - > locked ) {
2022-06-15 19:33:51 +03:00
mutex_lock ( & ctx - > uring_lock ) ;
2023-03-27 18:38:15 +03:00
ts - > locked = true ;
2022-06-15 19:33:51 +03:00
}
}
2022-06-20 03:26:00 +03:00
/*
 * Defer completion of @req: rather than completing it right away, append it
 * to the submit state's compl_reqs list.
 *
 * Protected by ->uring_lock, so it can only be used either with
 * IO_URING_F_COMPLETE_DEFER or from inside a tw handler holding the mutex.
 */
static inline void io_req_complete_defer(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	struct io_ring_ctx *ctx = req->ctx;

	lockdep_assert_held(&ctx->uring_lock);

	wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs);
}
2022-06-20 03:25:57 +03:00
static inline void io_commit_cqring_flush ( struct io_ring_ctx * ctx )
{
2023-01-09 17:46:09 +03:00
if ( unlikely ( ctx - > off_timeout_used | | ctx - > drain_active | |
ctx - > has_evfd | | ctx - > poll_activated ) )
2022-06-20 03:25:57 +03:00
__io_commit_cqring_flush ( ctx ) ;
}
2022-07-12 23:52:47 +03:00
static inline void io_get_task_refs ( int nr )
{
struct io_uring_task * tctx = current - > io_uring ;
tctx - > cached_refs - = nr ;
if ( unlikely ( tctx - > cached_refs < 0 ) )
io_task_refs_refill ( tctx ) ;
}
2022-07-27 12:30:40 +03:00
static inline bool io_req_cache_empty ( struct io_ring_ctx * ctx )
{
return ! ctx - > submit_state . free_list . next ;
}
2023-01-18 18:56:30 +03:00
/*
 * Defined elsewhere. NOTE(review): presumably the kmem_cache that
 * struct io_kiocb requests are allocated from -- confirm at the definition.
 */
extern struct kmem_cache * req_cachep ;
2023-01-23 17:37:16 +03:00
static inline struct io_kiocb * io_extract_req ( struct io_ring_ctx * ctx )
2022-07-27 12:30:40 +03:00
{
2023-01-18 18:56:30 +03:00
struct io_kiocb * req ;
2022-07-27 12:30:40 +03:00
2023-01-18 18:56:30 +03:00
req = container_of ( ctx - > submit_state . free_list . next , struct io_kiocb , comp_list ) ;
wq_stack_extract ( & ctx - > submit_state . free_list ) ;
return req ;
2022-07-27 12:30:40 +03:00
}
2023-01-23 17:37:16 +03:00
/*
 * Get a request from @ctx's request cache, refilling the cache first if it
 * is empty.
 *
 * Returns false if the cache was empty and __io_alloc_req_refill() failed;
 * otherwise stores the request in *@req and returns true.
 */
static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req)
{
	if (unlikely(io_req_cache_empty(ctx)) && !__io_alloc_req_refill(ctx))
		return false;

	*req = io_extract_req(ctx);
	return true;
}
2023-01-05 14:22:23 +03:00
static inline bool io_allowed_defer_tw_run ( struct io_ring_ctx * ctx )
{
return likely ( ctx - > submitter_task = = current ) ;
}
2022-09-08 18:56:52 +03:00
static inline bool io_allowed_run_tw ( struct io_ring_ctx * ctx )
{
2022-09-08 18:56:53 +03:00
return likely ( ! ( ctx - > flags & IORING_SETUP_DEFER_TASKRUN ) | |
ctx - > submitter_task = = current ) ;
2022-09-08 18:56:52 +03:00
}
2022-11-23 14:33:39 +03:00
/*
 * Record @res (with CQE flags 0) as @req's result and queue task work that
 * runs io_req_task_complete() for it.
 */
static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
{
	req->io_task_work.func = io_req_task_complete;
	io_req_set_res(req, res, 0);
	io_req_task_work_add(req);
}
2023-05-04 15:18:54 +03:00
/*
* IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
* slot .
*/
static inline size_t uring_sqe_size ( struct io_ring_ctx * ctx )
{
if ( ctx - > flags & IORING_SETUP_SQE128 )
return 2 * sizeof ( struct io_uring_sqe ) ;
return sizeof ( struct io_uring_sqe ) ;
}
2022-05-24 21:45:38 +03:00
# endif