2022-05-25 09:13:39 -06:00
// SPDX-License-Identifier: GPL-2.0
/*
 * Contains the core associated with submission side polling of the SQ
 * ring, offloading submissions from the application to a kernel thread.
 */
# include <linux/kernel.h>
# include <linux/errno.h>
# include <linux/file.h>
# include <linux/mm.h>
# include <linux/slab.h>
# include <linux/audit.h>
# include <linux/security.h>
# include <linux/io_uring.h>
# include <uapi/linux/io_uring.h>
# include "io_uring.h"
2023-06-08 09:38:37 -07:00
# include "napi.h"
2022-05-25 09:13:39 -06:00
# include "sqpoll.h"
/* Per-ring cap on SQEs submitted in one pass when the thread serves several rings */
# define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
2024-02-02 10:20:05 -07:00
/* Budget of task_work entries handed to io_sq_tw() per main-loop iteration */
# define IORING_TW_CAP_ENTRIES_VALUE 8
2022-05-25 09:13:39 -06:00
/* Bit numbers in io_sq_data->state, manipulated with set_bit()/test_bit() */
enum {
	/* the SQPOLL thread must leave its main loop and exit */
	IO_SQ_THREAD_SHOULD_STOP	= 0,
	/* someone wants sqd->lock; the thread must drop it and yield */
	IO_SQ_THREAD_SHOULD_PARK	= 1,
};
/*
 * Drop one park request and release sqd->lock. Counterpart of
 * io_sq_thread_park(); warns if invoked from the SQPOLL thread itself.
 */
void io_sq_thread_unpark(struct io_sq_data *sqd)
	__releases(&sqd->lock)
{
	int still_pending;

	WARN_ON_ONCE(sqd->thread == current);

	/*
	 * Clear the bit unconditionally and re-set it if other parkers are
	 * still outstanding. A conditional clear_bit() would race with other
	 * threads incrementing park_pending and setting the bit.
	 */
	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	still_pending = atomic_dec_return(&sqd->park_pending);
	if (still_pending)
		set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);

	mutex_unlock(&sqd->lock);
}
/*
 * Register a park request, take sqd->lock and kick the SQPOLL thread (if
 * there is one). Returns with sqd->lock held; release it with
 * io_sq_thread_unpark(). Warns if invoked from the SQPOLL thread itself.
 */
void io_sq_thread_park(struct io_sq_data *sqd)
	__acquires(&sqd->lock)
{
	struct task_struct *sq_task;

	WARN_ON_ONCE(sqd->thread == current);

	/* Announce the request before blocking on the lock */
	atomic_inc(&sqd->park_pending);
	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
	mutex_lock(&sqd->lock);

	sq_task = sqd->thread;
	if (sq_task)
		wake_up_process(sq_task);
}
void io_sq_thread_stop ( struct io_sq_data * sqd )
{
WARN_ON_ONCE ( sqd - > thread = = current ) ;
WARN_ON_ONCE ( test_bit ( IO_SQ_THREAD_SHOULD_STOP , & sqd - > state ) ) ;
set_bit ( IO_SQ_THREAD_SHOULD_STOP , & sqd - > state ) ;
mutex_lock ( & sqd - > lock ) ;
if ( sqd - > thread )
wake_up_process ( sqd - > thread ) ;
mutex_unlock ( & sqd - > lock ) ;
wait_for_completion ( & sqd - > exited ) ;
}
void io_put_sq_data ( struct io_sq_data * sqd )
{
if ( refcount_dec_and_test ( & sqd - > refs ) ) {
WARN_ON_ONCE ( atomic_read ( & sqd - > park_pending ) ) ;
io_sq_thread_stop ( sqd ) ;
kfree ( sqd ) ;
}
}
static __cold void io_sqd_update_thread_idle ( struct io_sq_data * sqd )
{
struct io_ring_ctx * ctx ;
unsigned sq_thread_idle = 0 ;
list_for_each_entry ( ctx , & sqd - > ctx_list , sqd_list )
sq_thread_idle = max ( sq_thread_idle , ctx - > sq_thread_idle ) ;
sqd - > sq_thread_idle = sq_thread_idle ;
}
void io_sq_thread_finish ( struct io_ring_ctx * ctx )
{
struct io_sq_data * sqd = ctx - > sq_data ;
if ( sqd ) {
io_sq_thread_park ( sqd ) ;
list_del_init ( & ctx - > sqd_list ) ;
io_sqd_update_thread_idle ( sqd ) ;
io_sq_thread_unpark ( sqd ) ;
io_put_sq_data ( sqd ) ;
ctx - > sq_data = NULL ;
}
}
static struct io_sq_data * io_attach_sq_data ( struct io_uring_params * p )
{
struct io_ring_ctx * ctx_attach ;
struct io_sq_data * sqd ;
struct fd f ;
f = fdget ( p - > wq_fd ) ;
if ( ! f . file )
return ERR_PTR ( - ENXIO ) ;
if ( ! io_is_uring_fops ( f . file ) ) {
fdput ( f ) ;
return ERR_PTR ( - EINVAL ) ;
}
ctx_attach = f . file - > private_data ;
sqd = ctx_attach - > sq_data ;
if ( ! sqd ) {
fdput ( f ) ;
return ERR_PTR ( - EINVAL ) ;
}
if ( sqd - > task_tgid ! = current - > tgid ) {
fdput ( f ) ;
return ERR_PTR ( - EPERM ) ;
}
refcount_inc ( & sqd - > refs ) ;
fdput ( f ) ;
return sqd ;
}
/*
 * Obtain the io_sq_data for a new SQPOLL ring.
 *
 * With IORING_SETUP_ATTACH_WQ an existing sqd is attached to and *attached
 * is set to true. If attaching fails with -EPERM, or attaching was not
 * requested, a fresh sqd is allocated and initialised with one reference.
 * Any other attach error, or -ENOMEM, is returned as an ERR_PTR().
 */
static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
					 bool *attached)
{
	struct io_sq_data *sqd;

	*attached = false;

	if (p->flags & IORING_SETUP_ATTACH_WQ) {
		struct io_sq_data *existing = io_attach_sq_data(p);

		if (IS_ERR(existing)) {
			if (PTR_ERR(existing) != -EPERM)
				return existing;
			/* -EPERM: fall through and set up a new sqd/task */
		} else {
			*attached = true;
			return existing;
		}
	}

	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
	if (!sqd)
		return ERR_PTR(-ENOMEM);

	atomic_set(&sqd->park_pending, 0);
	refcount_set(&sqd->refs, 1);
	INIT_LIST_HEAD(&sqd->ctx_list);
	mutex_init(&sqd->lock);
	init_waitqueue_head(&sqd->wait);
	init_completion(&sqd->exited);
	return sqd;
}
static inline bool io_sqd_events_pending ( struct io_sq_data * sqd )
{
return READ_ONCE ( sqd - > state ) ;
}
/*
 * Do one round of work for a single ring: reap iopoll completions, submit
 * pending SQEs, and run NAPI busy polling if enabled for the ring.
 *
 * @cap_entries: limit the submit batch to IORING_SQPOLL_CAP_ENTRIES_VALUE
 *
 * Returns the result of io_submit_sqes() (0 if it was not called) plus
 * whatever io_napi_sqpoll_busy_poll() returned.
 */
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
	const struct cred *saved_creds = NULL;
	unsigned int to_submit;
	int ret = 0;

	to_submit = io_sqring_entries(ctx);
	/* if we're handling multiple rings, cap submit size for fairness */
	if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
		to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;

	/* Nothing to poll for and nothing to submit */
	if (wq_list_empty(&ctx->iopoll_list) && !to_submit)
		return 0;

	/* Run with the ring's credentials if they differ from ours */
	if (ctx->sq_creds != current_cred())
		saved_creds = override_creds(ctx->sq_creds);

	mutex_lock(&ctx->uring_lock);
	if (!wq_list_empty(&ctx->iopoll_list))
		io_do_iopoll(ctx, true);

	/*
	 * Don't submit if refs are dying, good for io_uring_register(),
	 * but also it is relied upon by io_ring_exit_work()
	 */
	if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		ret = io_submit_sqes(ctx, to_submit);
	mutex_unlock(&ctx->uring_lock);

	if (io_napi(ctx))
		ret += io_napi_sqpoll_busy_poll(ctx);

	/* Entries may have been consumed; wake anyone waiting for SQ space */
	if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
		wake_up(&ctx->sqo_sq_wait);

	if (saved_creds)
		revert_creds(saved_creds);
	return ret;
}
static bool io_sqd_handle_event ( struct io_sq_data * sqd )
{
bool did_sig = false ;
struct ksignal ksig ;
if ( test_bit ( IO_SQ_THREAD_SHOULD_PARK , & sqd - > state ) | |
signal_pending ( current ) ) {
mutex_unlock ( & sqd - > lock ) ;
if ( signal_pending ( current ) )
did_sig = get_signal ( & ksig ) ;
cond_resched ( ) ;
mutex_lock ( & sqd - > lock ) ;
io_uring/fdinfo: remove need for sqpoll lock for thread/pid retrieval
A previous commit added a trylock for getting the SQPOLL thread info via
fdinfo, but this introduced a regression where we often fail to get it if
the thread is busy. For that case, we end up not printing the current CPU
and PID info.
Rather than rely on this lock, just print the pid we already stored in
the io_sq_data struct, and ensure we update the current CPU every time
we've slept or potentially rescheduled. The latter won't potentially be
100% accurate, but that wasn't the case before either as the task can
get migrated at any time unless it has been pinned at creation time.
We retain keeping the io_sq_data dereference inside the ctx->uring_lock,
as it has always been, as destruction of the thread and data happen below
that. We could make this RCU safe, but there's little point in doing that.
With this, we always print the last valid information we had, rather than
have spurious outputs with missing information.
Fixes: 7644b1a1c9a7 ("io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2023-11-14 09:55:50 -07:00
sqd - > sq_cpu = raw_smp_processor_id ( ) ;
2022-05-25 09:13:39 -06:00
}
return did_sig | | test_bit ( IO_SQ_THREAD_SHOULD_STOP , & sqd - > state ) ;
}
2024-02-02 10:20:05 -07:00
/*
* Run task_work , processing the retry_list first . The retry_list holds
* entries that we passed on in the previous run , if we had more task_work
* than we were asked to process . Newly queued task_work isn ' t run until the
* retry list has been fully processed .
*/
static unsigned int io_sq_tw ( struct llist_node * * retry_list , int max_entries )
{
struct io_uring_task * tctx = current - > io_uring ;
unsigned int count = 0 ;
if ( * retry_list ) {
* retry_list = io_handle_tw_list ( * retry_list , & count , max_entries ) ;
if ( count > = max_entries )
2024-05-21 12:31:12 -06:00
goto out ;
2024-02-02 10:20:05 -07:00
max_entries - = count ;
}
* retry_list = tctx_task_work_run ( tctx , max_entries , & count ) ;
2024-05-21 12:31:12 -06:00
out :
if ( task_work_pending ( current ) )
task_work_run ( ) ;
2024-02-02 10:20:05 -07:00
return count ;
}
2024-02-14 13:57:20 -07:00
static bool io_sq_tw_pending ( struct llist_node * retry_list )
{
struct io_uring_task * tctx = current - > io_uring ;
return retry_list | | ! llist_empty ( & tctx - > task_list ) ;
}
2024-02-28 17:12:51 +08:00
static void io_sq_update_worktime ( struct io_sq_data * sqd , struct rusage * start )
{
struct rusage end ;
getrusage ( current , RUSAGE_SELF , & end ) ;
end . ru_stime . tv_sec - = start - > ru_stime . tv_sec ;
end . ru_stime . tv_usec - = start - > ru_stime . tv_usec ;
sqd - > work_time + = end . ru_stime . tv_usec + end . ru_stime . tv_sec * 1000000 ;
}
2022-05-25 09:13:39 -06:00
static int io_sq_thread ( void * data )
{
2024-02-02 10:20:05 -07:00
struct llist_node * retry_list = NULL ;
2022-05-25 09:13:39 -06:00
struct io_sq_data * sqd = data ;
struct io_ring_ctx * ctx ;
2024-02-28 17:12:51 +08:00
struct rusage start ;
2022-05-25 09:13:39 -06:00
unsigned long timeout = 0 ;
char buf [ TASK_COMM_LEN ] ;
DEFINE_WAIT ( wait ) ;
2024-03-18 20:22:42 -06:00
/* offload context creation failed, just exit */
if ( ! current - > io_uring )
goto err_out ;
2022-05-25 09:13:39 -06:00
snprintf ( buf , sizeof ( buf ) , " iou-sqp-%d " , sqd - > task_pid ) ;
set_task_comm ( current , buf ) ;
io_uring/fdinfo: remove need for sqpoll lock for thread/pid retrieval
A previous commit added a trylock for getting the SQPOLL thread info via
fdinfo, but this introduced a regression where we often fail to get it if
the thread is busy. For that case, we end up not printing the current CPU
and PID info.
Rather than rely on this lock, just print the pid we already stored in
the io_sq_data struct, and ensure we update the current CPU every time
we've slept or potentially rescheduled. The latter won't potentially be
100% accurate, but that wasn't the case before either as the task can
get migrated at any time unless it has been pinned at creation time.
We retain keeping the io_sq_data dereference inside the ctx->uring_lock,
as it has always been, as destruction of the thread and data happen below
that. We could make this RCU safe, but there's little point in doing that.
With this, we always print the last valid information we had, rather than
have spurious outputs with missing information.
Fixes: 7644b1a1c9a7 ("io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2023-11-14 09:55:50 -07:00
/* reset to our pid after we've set task_comm, for fdinfo */
sqd - > task_pid = current - > pid ;
if ( sqd - > sq_cpu ! = - 1 ) {
2022-05-25 09:13:39 -06:00
set_cpus_allowed_ptr ( current , cpumask_of ( sqd - > sq_cpu ) ) ;
io_uring/fdinfo: remove need for sqpoll lock for thread/pid retrieval
A previous commit added a trylock for getting the SQPOLL thread info via
fdinfo, but this introduced a regression where we often fail to get it if
the thread is busy. For that case, we end up not printing the current CPU
and PID info.
Rather than rely on this lock, just print the pid we already stored in
the io_sq_data struct, and ensure we update the current CPU every time
we've slept or potentially rescheduled. The latter won't potentially be
100% accurate, but that wasn't the case before either as the task can
get migrated at any time unless it has been pinned at creation time.
We retain keeping the io_sq_data dereference inside the ctx->uring_lock,
as it has always been, as destruction of the thread and data happen below
that. We could make this RCU safe, but there's little point in doing that.
With this, we always print the last valid information we had, rather than
have spurious outputs with missing information.
Fixes: 7644b1a1c9a7 ("io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2023-11-14 09:55:50 -07:00
} else {
2022-05-25 09:13:39 -06:00
set_cpus_allowed_ptr ( current , cpu_online_mask ) ;
io_uring/fdinfo: remove need for sqpoll lock for thread/pid retrieval
A previous commit added a trylock for getting the SQPOLL thread info via
fdinfo, but this introduced a regression where we often fail to get it if
the thread is busy. For that case, we end up not printing the current CPU
and PID info.
Rather than rely on this lock, just print the pid we already stored in
the io_sq_data struct, and ensure we update the current CPU every time
we've slept or potentially rescheduled. The latter won't potentially be
100% accurate, but that wasn't the case before either as the task can
get migrated at any time unless it has been pinned at creation time.
We retain keeping the io_sq_data dereference inside the ctx->uring_lock,
as it has always been, as destruction of the thread and data happen below
that. We could make this RCU safe, but there's little point in doing that.
With this, we always print the last valid information we had, rather than
have spurious outputs with missing information.
Fixes: 7644b1a1c9a7 ("io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2023-11-14 09:55:50 -07:00
sqd - > sq_cpu = raw_smp_processor_id ( ) ;
}
2022-05-25 09:13:39 -06:00
2024-03-21 07:38:38 -06:00
/*
* Force audit context to get setup , in case we do prep side async
* operations that would trigger an audit call before any issue side
* audit has been done .
*/
audit_uring_entry ( IORING_OP_NOP ) ;
audit_uring_exit ( true , 0 ) ;
2022-05-25 09:13:39 -06:00
mutex_lock ( & sqd - > lock ) ;
while ( 1 ) {
bool cap_entries , sqt_spin = false ;
if ( io_sqd_events_pending ( sqd ) | | signal_pending ( current ) ) {
if ( io_sqd_handle_event ( sqd ) )
break ;
timeout = jiffies + sqd - > sq_thread_idle ;
}
cap_entries = ! list_is_singular ( & sqd - > ctx_list ) ;
2024-02-28 17:12:51 +08:00
getrusage ( current , RUSAGE_SELF , & start ) ;
2022-05-25 09:13:39 -06:00
list_for_each_entry ( ctx , & sqd - > ctx_list , sqd_list ) {
int ret = __io_sq_thread ( ctx , cap_entries ) ;
if ( ! sqt_spin & & ( ret > 0 | | ! wq_list_empty ( & ctx - > iopoll_list ) ) )
sqt_spin = true ;
}
2024-02-02 10:20:05 -07:00
if ( io_sq_tw ( & retry_list , IORING_TW_CAP_ENTRIES_VALUE ) )
2022-05-25 09:13:39 -06:00
sqt_spin = true ;
if ( sqt_spin | | ! time_after ( jiffies , timeout ) ) {
2024-02-28 17:12:51 +08:00
if ( sqt_spin ) {
io_sq_update_worktime ( sqd , & start ) ;
2022-05-25 09:13:39 -06:00
timeout = jiffies + sqd - > sq_thread_idle ;
2024-02-28 17:12:51 +08:00
}
2023-05-25 16:26:26 +08:00
if ( unlikely ( need_resched ( ) ) ) {
mutex_unlock ( & sqd - > lock ) ;
cond_resched ( ) ;
mutex_lock ( & sqd - > lock ) ;
io_uring/fdinfo: remove need for sqpoll lock for thread/pid retrieval
A previous commit added a trylock for getting the SQPOLL thread info via
fdinfo, but this introduced a regression where we often fail to get it if
the thread is busy. For that case, we end up not printing the current CPU
and PID info.
Rather than rely on this lock, just print the pid we already stored in
the io_sq_data struct, and ensure we update the current CPU every time
we've slept or potentially rescheduled. The latter won't potentially be
100% accurate, but that wasn't the case before either as the task can
get migrated at any time unless it has been pinned at creation time.
We retain keeping the io_sq_data dereference inside the ctx->uring_lock,
as it has always been, as destruction of the thread and data happen below
that. We could make this RCU safe, but there's little point in doing that.
With this, we always print the last valid information we had, rather than
have spurious outputs with missing information.
Fixes: 7644b1a1c9a7 ("io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2023-11-14 09:55:50 -07:00
sqd - > sq_cpu = raw_smp_processor_id ( ) ;
2023-05-25 16:26:26 +08:00
}
2022-05-25 09:13:39 -06:00
continue ;
}
prepare_to_wait ( & sqd - > wait , & wait , TASK_INTERRUPTIBLE ) ;
2024-02-14 13:57:20 -07:00
if ( ! io_sqd_events_pending ( sqd ) & & ! io_sq_tw_pending ( retry_list ) ) {
2022-05-25 09:13:39 -06:00
bool needs_sched = true ;
list_for_each_entry ( ctx , & sqd - > ctx_list , sqd_list ) {
atomic_or ( IORING_SQ_NEED_WAKEUP ,
& ctx - > rings - > sq_flags ) ;
if ( ( ctx - > flags & IORING_SETUP_IOPOLL ) & &
! wq_list_empty ( & ctx - > iopoll_list ) ) {
needs_sched = false ;
break ;
}
/*
* Ensure the store of the wakeup flag is not
* reordered with the load of the SQ tail
*/
smp_mb__after_atomic ( ) ;
if ( io_sqring_entries ( ctx ) ) {
needs_sched = false ;
break ;
}
}
if ( needs_sched ) {
mutex_unlock ( & sqd - > lock ) ;
schedule ( ) ;
mutex_lock ( & sqd - > lock ) ;
io_uring/fdinfo: remove need for sqpoll lock for thread/pid retrieval
A previous commit added a trylock for getting the SQPOLL thread info via
fdinfo, but this introduced a regression where we often fail to get it if
the thread is busy. For that case, we end up not printing the current CPU
and PID info.
Rather than rely on this lock, just print the pid we already stored in
the io_sq_data struct, and ensure we update the current CPU every time
we've slept or potentially rescheduled. The latter won't potentially be
100% accurate, but that wasn't the case before either as the task can
get migrated at any time unless it has been pinned at creation time.
We retain keeping the io_sq_data dereference inside the ctx->uring_lock,
as it has always been, as destruction of the thread and data happen below
that. We could make this RCU safe, but there's little point in doing that.
With this, we always print the last valid information we had, rather than
have spurious outputs with missing information.
Fixes: 7644b1a1c9a7 ("io_uring/fdinfo: lock SQ thread while retrieving thread cpu/pid")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2023-11-14 09:55:50 -07:00
sqd - > sq_cpu = raw_smp_processor_id ( ) ;
2022-05-25 09:13:39 -06:00
}
list_for_each_entry ( ctx , & sqd - > ctx_list , sqd_list )
atomic_andnot ( IORING_SQ_NEED_WAKEUP ,
& ctx - > rings - > sq_flags ) ;
}
finish_wait ( & sqd - > wait , & wait ) ;
timeout = jiffies + sqd - > sq_thread_idle ;
}
2024-02-02 10:20:05 -07:00
if ( retry_list )
io_sq_tw ( & retry_list , UINT_MAX ) ;
2022-05-25 09:13:39 -06:00
io_uring_cancel_generic ( true , sqd ) ;
sqd - > thread = NULL ;
list_for_each_entry ( ctx , & sqd - > ctx_list , sqd_list )
atomic_or ( IORING_SQ_NEED_WAKEUP , & ctx - > rings - > sq_flags ) ;
io_run_task_work ( ) ;
mutex_unlock ( & sqd - > lock ) ;
2024-03-18 20:22:42 -06:00
err_out :
2022-05-25 09:13:39 -06:00
complete ( & sqd - > exited ) ;
do_exit ( 0 ) ;
}
2023-01-15 15:15:19 +08:00
void io_sqpoll_wait_sq ( struct io_ring_ctx * ctx )
2022-05-25 09:13:39 -06:00
{
DEFINE_WAIT ( wait ) ;
do {
if ( ! io_sqring_full ( ctx ) )
break ;
prepare_to_wait ( & ctx - > sqo_sq_wait , & wait , TASK_INTERRUPTIBLE ) ;
if ( ! io_sqring_full ( ctx ) )
break ;
schedule ( ) ;
} while ( ! signal_pending ( current ) ) ;
finish_wait ( & ctx - > sqo_sq_wait , & wait ) ;
}
__cold int io_sq_offload_create ( struct io_ring_ctx * ctx ,
struct io_uring_params * p )
{
int ret ;
/* Retain compatibility with failing for an invalid attach attempt */
if ( ( ctx - > flags & ( IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL ) ) = =
IORING_SETUP_ATTACH_WQ ) {
struct fd f ;
f = fdget ( p - > wq_fd ) ;
if ( ! f . file )
return - ENXIO ;
if ( ! io_is_uring_fops ( f . file ) ) {
fdput ( f ) ;
return - EINVAL ;
}
fdput ( f ) ;
}
if ( ctx - > flags & IORING_SETUP_SQPOLL ) {
struct task_struct * tsk ;
struct io_sq_data * sqd ;
bool attached ;
ret = security_uring_sqpoll ( ) ;
if ( ret )
return ret ;
sqd = io_get_sq_data ( p , & attached ) ;
if ( IS_ERR ( sqd ) ) {
ret = PTR_ERR ( sqd ) ;
goto err ;
}
ctx - > sq_creds = get_current_cred ( ) ;
ctx - > sq_data = sqd ;
ctx - > sq_thread_idle = msecs_to_jiffies ( p - > sq_thread_idle ) ;
if ( ! ctx - > sq_thread_idle )
ctx - > sq_thread_idle = HZ ;
io_sq_thread_park ( sqd ) ;
list_add ( & ctx - > sqd_list , & sqd - > ctx_list ) ;
io_sqd_update_thread_idle ( sqd ) ;
/* don't attach to a dying SQPOLL thread, would be racy */
ret = ( attached & & ! sqd - > thread ) ? - ENXIO : 0 ;
io_sq_thread_unpark ( sqd ) ;
if ( ret < 0 )
goto err ;
if ( attached )
return 0 ;
if ( p - > flags & IORING_SETUP_SQ_AFF ) {
int cpu = p - > sq_thread_cpu ;
ret = - EINVAL ;
if ( cpu > = nr_cpu_ids | | ! cpu_online ( cpu ) )
goto err_sqpoll ;
sqd - > sq_cpu = cpu ;
} else {
sqd - > sq_cpu = - 1 ;
}
sqd - > task_pid = current - > pid ;
sqd - > task_tgid = current - > tgid ;
tsk = create_io_thread ( io_sq_thread , sqd , NUMA_NO_NODE ) ;
if ( IS_ERR ( tsk ) ) {
ret = PTR_ERR ( tsk ) ;
goto err_sqpoll ;
}
sqd - > thread = tsk ;
ret = io_uring_alloc_task_context ( tsk , ctx ) ;
wake_up_new_task ( tsk ) ;
if ( ret )
goto err ;
} else if ( p - > flags & IORING_SETUP_SQ_AFF ) {
/* Can't have SQ_AFF without SQPOLL */
ret = - EINVAL ;
goto err ;
}
return 0 ;
err_sqpoll :
complete ( & ctx - > sq_data - > exited ) ;
err :
io_sq_thread_finish ( ctx ) ;
return ret ;
}
2023-08-13 11:05:36 -06:00
/*
 * Apply @mask as the io-wq CPU affinity of the SQPOLL thread serving @ctx.
 *
 * Returns -EINVAL if the ring has no sq_data or the SQPOLL thread is already
 * gone, otherwise the result of io_wq_cpu_affinity().
 */
__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
				     cpumask_var_t mask)
{
	struct io_sq_data *sqd = ctx->sq_data;
	int ret = -EINVAL;

	if (!sqd)
		return -EINVAL;

	/* sqd->thread is only looked at between park and unpark */
	io_sq_thread_park(sqd);
	/* Don't set affinity for a dying thread */
	if (sqd->thread)
		ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
	io_sq_thread_unpark(sqd);

	return ret;
}