2019-10-22 19:25:58 +03:00
// SPDX-License-Identifier: GPL-2.0
/*
* Basic worker thread pool for io_uring
*
* Copyright ( C ) 2019 Jens Axboe
*
*/
# include <linux/kernel.h>
# include <linux/init.h>
# include <linux/errno.h>
# include <linux/sched/signal.h>
# include <linux/mm.h>
# include <linux/sched/mm.h>
# include <linux/percpu.h>
# include <linux/slab.h>
# include <linux/kthread.h>
# include <linux/rculist_nulls.h>
2020-02-07 07:42:51 +03:00
# include <linux/fs_struct.h>
2020-04-03 20:26:26 +03:00
# include <linux/task_work.h>
2020-09-16 23:41:05 +03:00
# include <linux/blk-cgroup.h>
2020-10-15 22:46:44 +03:00
# include <linux/audit.h>
2020-10-22 18:02:50 +03:00
# include <linux/cpu.h>
2019-10-22 19:25:58 +03:00
2020-10-22 18:02:50 +03:00
# include "../kernel/sched/sched.h"
2019-10-22 19:25:58 +03:00
# include "io-wq.h"
# define WORKER_IDLE_TIMEOUT (5 * HZ)
/* Per-worker state flags (worker->flags) */
enum {
	IO_WORKER_F_UP		= 1,	/* up and active */
	IO_WORKER_F_RUNNING	= 2,	/* account as running */
	IO_WORKER_F_FREE	= 4,	/* worker on free list */
	IO_WORKER_F_FIXED	= 8,	/* static idle worker */
	IO_WORKER_F_BOUND	= 16,	/* is doing bounded work */
};
/* Bits for io_wq->state */
enum {
	IO_WQ_BIT_EXIT		= 0,	/* wq exiting */
	IO_WQ_BIT_ERROR		= 1,	/* error on setup */
};
/* Bits for io_wqe->flags */
enum {
	IO_WQE_FLAG_STALLED	= 1,	/* stalled on hash */
};
/*
* One for each thread in a wqe pool
*/
struct io_worker {
refcount_t ref ;
unsigned flags ;
struct hlist_nulls_node nulls_node ;
2019-11-13 23:54:49 +03:00
struct list_head all_list ;
2019-10-22 19:25:58 +03:00
struct task_struct * task ;
struct io_wqe * wqe ;
2019-11-13 19:43:34 +03:00
2019-10-22 19:25:58 +03:00
struct io_wq_work * cur_work ;
2019-11-13 19:43:34 +03:00
spinlock_t lock ;
2019-10-22 19:25:58 +03:00
struct rcu_head rcu ;
struct mm_struct * mm ;
2020-09-16 23:41:05 +03:00
# ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state * blkcg_css ;
# endif
2020-01-28 02:34:48 +03:00
const struct cred * cur_creds ;
const struct cred * saved_creds ;
2019-10-24 21:39:47 +03:00
struct files_struct * restore_files ;
2020-09-19 05:13:06 +03:00
struct nsproxy * restore_nsproxy ;
2020-02-07 07:42:51 +03:00
struct fs_struct * restore_fs ;
2019-10-22 19:25:58 +03:00
} ;
# if BITS_PER_LONG == 64
# define IO_WQ_HASH_ORDER 6
# else
# define IO_WQ_HASH_ORDER 5
# endif
2020-03-23 22:57:22 +03:00
# define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER)
2019-11-07 21:41:16 +03:00
/* Per-node accounting of bound/unbound workers */
struct io_wqe_acct {
	unsigned nr_workers;
	unsigned max_workers;
	atomic_t nr_running;
};
/* Index into io_wqe->acct[] */
enum {
	IO_WQ_ACCT_BOUND,
	IO_WQ_ACCT_UNBOUND,
};
2019-10-22 19:25:58 +03:00
/*
* Per - node worker thread pool
*/
struct io_wqe {
struct {
2020-09-01 11:41:46 +03:00
raw_spinlock_t lock ;
2019-11-26 21:59:32 +03:00
struct io_wq_work_list work_list ;
2019-10-22 19:25:58 +03:00
unsigned long hash_map ;
unsigned flags ;
} ____cacheline_aligned_in_smp ;
int node ;
2019-11-07 21:41:16 +03:00
struct io_wqe_acct acct [ 2 ] ;
2019-10-22 19:25:58 +03:00
2019-11-14 18:00:41 +03:00
struct hlist_nulls_head free_list ;
2019-11-13 23:54:49 +03:00
struct list_head all_list ;
2019-10-22 19:25:58 +03:00
struct io_wq * wq ;
2020-03-23 22:57:22 +03:00
struct io_wq_work * hash_tail [ IO_WQ_NR_HASH_BUCKETS ] ;
2019-10-22 19:25:58 +03:00
} ;
/*
* Per io_wq state
*/
struct io_wq {
struct io_wqe * * wqes ;
unsigned long state ;
2020-03-04 16:14:12 +03:00
free_work_fn * free_work ;
2020-06-08 21:08:20 +03:00
io_wq_work_fn * do_work ;
2019-11-13 08:31:31 +03:00
2019-10-22 19:25:58 +03:00
struct task_struct * manager ;
2019-11-07 21:41:16 +03:00
struct user_struct * user ;
2019-10-22 19:25:58 +03:00
refcount_t refs ;
struct completion done ;
2020-01-24 01:33:32 +03:00
2020-10-22 18:02:50 +03:00
struct hlist_node cpuhp_node ;
2020-01-24 01:33:32 +03:00
refcount_t use_refs ;
2019-10-22 19:25:58 +03:00
} ;
2020-10-22 18:02:50 +03:00
static enum cpuhp_state io_wq_online ;
2019-10-22 19:25:58 +03:00
static bool io_worker_get ( struct io_worker * worker )
{
return refcount_inc_not_zero ( & worker - > ref ) ;
}
static void io_worker_release ( struct io_worker * worker )
{
if ( refcount_dec_and_test ( & worker - > ref ) )
wake_up_process ( worker - > task ) ;
}
/*
* Note : drops the wqe - > lock if returning true ! The caller must re - acquire
* the lock in that case . Some callers need to restart handling if this
* happens , so we can ' t just re - acquire the lock on behalf of the caller .
*/
static bool __io_worker_unuse ( struct io_wqe * wqe , struct io_worker * worker )
{
2019-10-24 21:39:47 +03:00
bool dropped_lock = false ;
2020-01-28 02:34:48 +03:00
if ( worker - > saved_creds ) {
revert_creds ( worker - > saved_creds ) ;
worker - > cur_creds = worker - > saved_creds = NULL ;
2019-11-25 18:52:30 +03:00
}
2019-10-24 21:39:47 +03:00
if ( current - > files ! = worker - > restore_files ) {
__acquire ( & wqe - > lock ) ;
2020-09-01 11:41:46 +03:00
raw_spin_unlock_irq ( & wqe - > lock ) ;
2019-10-24 21:39:47 +03:00
dropped_lock = true ;
task_lock ( current ) ;
current - > files = worker - > restore_files ;
2020-09-19 05:13:06 +03:00
current - > nsproxy = worker - > restore_nsproxy ;
2019-10-24 21:39:47 +03:00
task_unlock ( current ) ;
}
2020-02-07 07:42:51 +03:00
if ( current - > fs ! = worker - > restore_fs )
current - > fs = worker - > restore_fs ;
2019-10-22 19:25:58 +03:00
/*
* If we have an active mm , we need to drop the wq lock before unusing
* it . If we do , return true and let the caller retry the idle loop .
*/
if ( worker - > mm ) {
2019-10-24 21:39:47 +03:00
if ( ! dropped_lock ) {
__acquire ( & wqe - > lock ) ;
2020-09-01 11:41:46 +03:00
raw_spin_unlock_irq ( & wqe - > lock ) ;
2019-10-24 21:39:47 +03:00
dropped_lock = true ;
}
2019-10-22 19:25:58 +03:00
__set_current_state ( TASK_RUNNING ) ;
2020-06-11 04:42:06 +03:00
kthread_unuse_mm ( worker - > mm ) ;
2019-10-22 19:25:58 +03:00
mmput ( worker - > mm ) ;
worker - > mm = NULL ;
}
2020-09-16 23:41:05 +03:00
# ifdef CONFIG_BLK_CGROUP
if ( worker - > blkcg_css ) {
kthread_associate_blkcg ( NULL ) ;
worker - > blkcg_css = NULL ;
}
# endif
2020-10-20 23:28:41 +03:00
if ( current - > signal - > rlim [ RLIMIT_FSIZE ] . rlim_cur ! = RLIM_INFINITY )
current - > signal - > rlim [ RLIMIT_FSIZE ] . rlim_cur = RLIM_INFINITY ;
2019-10-24 21:39:47 +03:00
return dropped_lock ;
2019-10-22 19:25:58 +03:00
}
2019-11-07 21:41:16 +03:00
static inline struct io_wqe_acct * io_work_get_acct ( struct io_wqe * wqe ,
struct io_wq_work * work )
{
if ( work - > flags & IO_WQ_WORK_UNBOUND )
return & wqe - > acct [ IO_WQ_ACCT_UNBOUND ] ;
return & wqe - > acct [ IO_WQ_ACCT_BOUND ] ;
}
static inline struct io_wqe_acct * io_wqe_get_acct ( struct io_wqe * wqe ,
struct io_worker * worker )
{
if ( worker - > flags & IO_WORKER_F_BOUND )
return & wqe - > acct [ IO_WQ_ACCT_BOUND ] ;
return & wqe - > acct [ IO_WQ_ACCT_UNBOUND ] ;
}
2019-10-22 19:25:58 +03:00
static void io_worker_exit ( struct io_worker * worker )
{
struct io_wqe * wqe = worker - > wqe ;
2019-11-07 21:41:16 +03:00
struct io_wqe_acct * acct = io_wqe_get_acct ( wqe , worker ) ;
2019-10-22 19:25:58 +03:00
/*
* If we ' re not at zero , someone else is holding a brief reference
* to the worker . Wait for that to go away .
*/
set_current_state ( TASK_INTERRUPTIBLE ) ;
if ( ! refcount_dec_and_test ( & worker - > ref ) )
schedule ( ) ;
__set_current_state ( TASK_RUNNING ) ;
preempt_disable ( ) ;
current - > flags & = ~ PF_IO_WORKER ;
if ( worker - > flags & IO_WORKER_F_RUNNING )
2019-11-07 21:41:16 +03:00
atomic_dec ( & acct - > nr_running ) ;
if ( ! ( worker - > flags & IO_WORKER_F_BOUND ) )
atomic_dec ( & wqe - > wq - > user - > processes ) ;
2019-10-22 19:25:58 +03:00
worker - > flags = 0 ;
preempt_enable ( ) ;
2020-09-01 11:41:46 +03:00
raw_spin_lock_irq ( & wqe - > lock ) ;
2019-10-22 19:25:58 +03:00
hlist_nulls_del_rcu ( & worker - > nulls_node ) ;
2019-11-13 23:54:49 +03:00
list_del_rcu ( & worker - > all_list ) ;
2019-10-22 19:25:58 +03:00
if ( __io_worker_unuse ( wqe , worker ) ) {
__release ( & wqe - > lock ) ;
2020-09-01 11:41:46 +03:00
raw_spin_lock_irq ( & wqe - > lock ) ;
2019-10-22 19:25:58 +03:00
}
2019-11-07 21:41:16 +03:00
acct - > nr_workers - - ;
2020-09-01 11:41:46 +03:00
raw_spin_unlock_irq ( & wqe - > lock ) ;
2019-10-22 19:25:58 +03:00
2019-11-02 10:55:01 +03:00
kfree_rcu ( worker , rcu ) ;
2020-09-26 16:26:55 +03:00
if ( refcount_dec_and_test ( & wqe - > wq - > refs ) )
complete ( & wqe - > wq - > done ) ;
2019-10-22 19:25:58 +03:00
}
2019-11-07 21:41:16 +03:00
/* True if this wqe has runnable work (non-empty list and not stalled). */
static inline bool io_wqe_run_queue(struct io_wqe *wqe)
	__must_hold(wqe->lock)
{
	return !wq_list_empty(&wqe->work_list) &&
	       !(wqe->flags & IO_WQE_FLAG_STALLED);
}
/*
 * Check head of free list for an available worker. If one isn't available,
 * caller must wake up the wq manager to create one.
 */
static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
	__must_hold(RCU)
{
	struct hlist_nulls_node *n;
	struct io_worker *worker;

	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
	if (is_a_nulls(n))
		return false;

	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
	if (io_worker_get(worker)) {
		wake_up_process(worker->task);
		io_worker_release(worker);
		return true;
	}

	return false;
}
/*
* We need a worker . If we find a free one , we ' re good . If not , and we ' re
* below the max number of workers , wake up the manager to create one .
*/
static void io_wqe_wake_worker ( struct io_wqe * wqe , struct io_wqe_acct * acct )
{
bool ret ;
/*
* Most likely an attempt to queue unbounded work on an io_wq that
* wasn ' t setup with any unbounded workers .
*/
WARN_ON_ONCE ( ! acct - > max_workers ) ;
rcu_read_lock ( ) ;
ret = io_wqe_activate_free_worker ( wqe ) ;
rcu_read_unlock ( ) ;
if ( ! ret & & acct - > nr_workers < acct - > max_workers )
wake_up_process ( wqe - > wq - > manager ) ;
}
static void io_wqe_inc_running ( struct io_wqe * wqe , struct io_worker * worker )
{
struct io_wqe_acct * acct = io_wqe_get_acct ( wqe , worker ) ;
atomic_inc ( & acct - > nr_running ) ;
}
/*
 * Account one fewer running worker; if that was the last one and work is
 * still queued, kick a free worker (or the manager) to pick it up.
 */
static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker)
	__must_hold(wqe->lock)
{
	struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker);

	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe))
		io_wqe_wake_worker(wqe, acct);
}
2019-10-22 19:25:58 +03:00
static void io_worker_start ( struct io_wqe * wqe , struct io_worker * worker )
{
allow_kernel_signal ( SIGINT ) ;
current - > flags | = PF_IO_WORKER ;
worker - > flags | = ( IO_WORKER_F_UP | IO_WORKER_F_RUNNING ) ;
2019-10-24 21:39:47 +03:00
worker - > restore_files = current - > files ;
2020-09-19 05:13:06 +03:00
worker - > restore_nsproxy = current - > nsproxy ;
2020-02-07 07:42:51 +03:00
worker - > restore_fs = current - > fs ;
2019-11-07 21:41:16 +03:00
io_wqe_inc_running ( wqe , worker ) ;
2019-10-22 19:25:58 +03:00
}
/*
 * Worker will start processing some work. Move it to the busy list, if
 * it's currently on the freelist
 */
static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
			     struct io_wq_work *work)
	__must_hold(wqe->lock)
{
	bool worker_bound, work_bound;

	if (worker->flags & IO_WORKER_F_FREE) {
		worker->flags &= ~IO_WORKER_F_FREE;
		hlist_nulls_del_init_rcu(&worker->nulls_node);
	}

	/*
	 * If worker is moving from bound to unbound (or vice versa), then
	 * ensure we update the running accounting.
	 */
	worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
	work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
	if (worker_bound != work_bound) {
		io_wqe_dec_running(wqe, worker);
		if (work_bound) {
			worker->flags |= IO_WORKER_F_BOUND;
			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--;
			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++;
			atomic_dec(&wqe->wq->user->processes);
		} else {
			worker->flags &= ~IO_WORKER_F_BOUND;
			wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++;
			wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--;
			atomic_inc(&wqe->wq->user->processes);
		}
		io_wqe_inc_running(wqe, worker);
	}
}
/*
 * No work, worker going to sleep. Move to freelist, and unuse mm if we
 * have one attached. Dropping the mm may potentially sleep, so we drop
 * the lock in that case and return success. Since the caller has to
 * retry the loop in that case (we changed task state), we don't regrab
 * the lock if we return success.
 */
static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker)
	__must_hold(wqe->lock)
{
	if (!(worker->flags & IO_WORKER_F_FREE)) {
		worker->flags |= IO_WORKER_F_FREE;
		hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
	}

	return __io_worker_unuse(wqe, worker);
}
2020-03-14 00:31:05 +03:00
static inline unsigned int io_get_work_hash ( struct io_wq_work * work )
{
return work - > flags > > IO_WQ_HASH_SHIFT ;
}
static struct io_wq_work * io_get_next_work ( struct io_wqe * wqe )
2019-10-22 19:25:58 +03:00
__must_hold ( wqe - > lock )
{
2019-11-26 21:59:32 +03:00
struct io_wq_work_node * node , * prev ;
2020-03-23 22:57:22 +03:00
struct io_wq_work * work , * tail ;
2020-03-14 00:31:05 +03:00
unsigned int hash ;
2019-10-22 19:25:58 +03:00
2019-11-26 21:59:32 +03:00
wq_list_for_each ( node , prev , & wqe - > work_list ) {
work = container_of ( node , struct io_wq_work , list ) ;
2019-10-22 19:25:58 +03:00
/* not hashed, can run anytime */
2020-03-14 00:31:04 +03:00
if ( ! io_wq_is_hashed ( work ) ) {
2020-03-23 22:57:22 +03:00
wq_list_del ( & wqe - > work_list , node , prev ) ;
2019-10-22 19:25:58 +03:00
return work ;
}
/* hashed, can run if not already running */
2020-03-14 00:31:05 +03:00
hash = io_get_work_hash ( work ) ;
if ( ! ( wqe - > hash_map & BIT ( hash ) ) ) {
wqe - > hash_map | = BIT ( hash ) ;
2020-03-23 22:57:22 +03:00
/* all items with this hash lie in [work, tail] */
tail = wqe - > hash_tail [ hash ] ;
wqe - > hash_tail [ hash ] = NULL ;
wq_list_cut ( & wqe - > work_list , & tail - > list , prev ) ;
2019-10-22 19:25:58 +03:00
return work ;
}
}
return NULL ;
}
2020-01-28 02:34:48 +03:00
static void io_wq_switch_mm ( struct io_worker * worker , struct io_wq_work * work )
{
if ( worker - > mm ) {
2020-06-11 04:42:06 +03:00
kthread_unuse_mm ( worker - > mm ) ;
2020-01-28 02:34:48 +03:00
mmput ( worker - > mm ) ;
worker - > mm = NULL ;
}
2020-06-11 04:42:10 +03:00
2020-10-14 19:48:51 +03:00
if ( mmget_not_zero ( work - > identity - > mm ) ) {
kthread_use_mm ( work - > identity - > mm ) ;
worker - > mm = work - > identity - > mm ;
2020-01-28 02:34:48 +03:00
return ;
}
/* failed grabbing mm, ensure work gets cancelled */
work - > flags | = IO_WQ_WORK_CANCEL ;
}
2020-09-16 23:41:05 +03:00
/* Associate the worker kthread with the work identity's blk-cgroup. */
static inline void io_wq_switch_blkcg(struct io_worker *worker,
				      struct io_wq_work *work)
{
#ifdef CONFIG_BLK_CGROUP
	if (!(work->flags & IO_WQ_WORK_BLKCG))
		return;
	if (work->identity->blkcg_css != worker->blkcg_css) {
		kthread_associate_blkcg(work->identity->blkcg_css);
		worker->blkcg_css = work->identity->blkcg_css;
	}
#endif
}
2020-01-28 02:34:48 +03:00
static void io_wq_switch_creds ( struct io_worker * worker ,
struct io_wq_work * work )
{
2020-10-14 19:48:51 +03:00
const struct cred * old_creds = override_creds ( work - > identity - > creds ) ;
2020-01-28 02:34:48 +03:00
2020-10-14 19:48:51 +03:00
worker - > cur_creds = work - > identity - > creds ;
2020-01-28 02:34:48 +03:00
if ( worker - > saved_creds )
put_cred ( old_creds ) ; /* creds set by previous switch */
else
worker - > saved_creds = old_creds ;
}
2020-03-04 16:14:09 +03:00
static void io_impersonate_work ( struct io_worker * worker ,
struct io_wq_work * work )
{
2020-10-14 19:48:51 +03:00
if ( ( work - > flags & IO_WQ_WORK_FILES ) & &
current - > files ! = work - > identity - > files ) {
2020-03-04 16:14:09 +03:00
task_lock ( current ) ;
2020-10-14 19:48:51 +03:00
current - > files = work - > identity - > files ;
current - > nsproxy = work - > identity - > nsproxy ;
2020-03-04 16:14:09 +03:00
task_unlock ( current ) ;
2020-10-30 18:36:41 +03:00
if ( ! work - > identity - > files ) {
/* failed grabbing files, ensure work gets cancelled */
work - > flags | = IO_WQ_WORK_CANCEL ;
}
2020-03-04 16:14:09 +03:00
}
2020-10-14 19:48:51 +03:00
if ( ( work - > flags & IO_WQ_WORK_FS ) & & current - > fs ! = work - > identity - > fs )
current - > fs = work - > identity - > fs ;
if ( ( work - > flags & IO_WQ_WORK_MM ) & & work - > identity - > mm ! = worker - > mm )
2020-03-04 16:14:09 +03:00
io_wq_switch_mm ( worker , work ) ;
2020-10-14 19:48:51 +03:00
if ( ( work - > flags & IO_WQ_WORK_CREDS ) & &
worker - > cur_creds ! = work - > identity - > creds )
2020-03-04 16:14:09 +03:00
io_wq_switch_creds ( worker , work ) ;
2020-10-20 23:28:41 +03:00
if ( work - > flags & IO_WQ_WORK_FSIZE )
current - > signal - > rlim [ RLIMIT_FSIZE ] . rlim_cur = work - > identity - > fsize ;
else if ( current - > signal - > rlim [ RLIMIT_FSIZE ] . rlim_cur ! = RLIM_INFINITY )
current - > signal - > rlim [ RLIMIT_FSIZE ] . rlim_cur = RLIM_INFINITY ;
2020-09-16 23:41:05 +03:00
io_wq_switch_blkcg ( worker , work ) ;
2020-10-15 22:46:44 +03:00
# ifdef CONFIG_AUDIT
current - > loginuid = work - > identity - > loginuid ;
current - > sessionid = work - > identity - > sessionid ;
# endif
2020-03-04 16:14:09 +03:00
}
static void io_assign_current_work ( struct io_worker * worker ,
struct io_wq_work * work )
{
2020-03-14 00:31:03 +03:00
if ( work ) {
/* flush pending signals before assigning new work */
if ( signal_pending ( current ) )
flush_signals ( current ) ;
cond_resched ( ) ;
}
2020-03-04 16:14:09 +03:00
2020-10-15 22:46:44 +03:00
# ifdef CONFIG_AUDIT
current - > loginuid = KUIDT_INIT ( AUDIT_UID_UNSET ) ;
current - > sessionid = AUDIT_SID_UNSET ;
# endif
2020-03-04 16:14:09 +03:00
spin_lock_irq ( & worker - > lock ) ;
worker - > cur_work = work ;
spin_unlock_irq ( & worker - > lock ) ;
}
2020-03-14 00:31:05 +03:00
static void io_wqe_enqueue ( struct io_wqe * wqe , struct io_wq_work * work ) ;
2019-10-22 19:25:58 +03:00
/*
 * Main work-execution loop for a worker. Pulls items off the wqe list and
 * runs each one, including any dependent link chained off it.
 */
static void io_worker_handle_work(struct io_worker *worker)
	__releases(wqe->lock)
{
	struct io_wqe *wqe = worker->wqe;
	struct io_wq *wq = wqe->wq;

	do {
		struct io_wq_work *work;
get_next:
		/*
		 * If we got some work, mark us as busy. If we didn't, but
		 * the list isn't empty, it means we stalled on hashed work.
		 * Mark us stalled so we don't keep looking for work when we
		 * can't make progress, any work completion or insertion will
		 * clear the stalled flag.
		 */
		work = io_get_next_work(wqe);
		if (work)
			__io_worker_busy(wqe, worker, work);
		else if (!wq_list_empty(&wqe->work_list))
			wqe->flags |= IO_WQE_FLAG_STALLED;

		raw_spin_unlock_irq(&wqe->lock);
		if (!work)
			break;
		io_assign_current_work(worker, work);

		/* handle a whole dependent link */
		do {
			struct io_wq_work *old_work, *next_hashed, *linked;
			unsigned int hash = io_get_work_hash(work);

			next_hashed = wq_next_work(work);
			io_impersonate_work(worker, work);

			old_work = work;
			linked = wq->do_work(work);

			work = next_hashed;
			if (!work && linked && !io_wq_is_hashed(linked)) {
				work = linked;
				linked = NULL;
			}
			io_assign_current_work(worker, work);
			wq->free_work(old_work);

			if (linked)
				io_wqe_enqueue(wqe, linked);

			if (hash != -1U && !next_hashed) {
				raw_spin_lock_irq(&wqe->lock);
				wqe->hash_map &= ~BIT_ULL(hash);
				wqe->flags &= ~IO_WQE_FLAG_STALLED;
				/* skip unnecessary unlock-lock wqe->lock */
				if (!work)
					goto get_next;
				raw_spin_unlock_irq(&wqe->lock);
			}
		} while (work);

		raw_spin_lock_irq(&wqe->lock);
	} while (1);
}
static int io_wqe_worker ( void * data )
{
struct io_worker * worker = data ;
struct io_wqe * wqe = worker - > wqe ;
struct io_wq * wq = wqe - > wq ;
io_worker_start ( wqe , worker ) ;
while ( ! test_bit ( IO_WQ_BIT_EXIT , & wq - > state ) ) {
2019-12-08 07:03:59 +03:00
set_current_state ( TASK_INTERRUPTIBLE ) ;
2019-12-08 07:06:46 +03:00
loop :
2020-09-01 11:41:46 +03:00
raw_spin_lock_irq ( & wqe - > lock ) ;
2019-10-22 19:25:58 +03:00
if ( io_wqe_run_queue ( wqe ) ) {
__set_current_state ( TASK_RUNNING ) ;
io_worker_handle_work ( worker ) ;
2019-12-08 07:06:46 +03:00
goto loop ;
2019-10-22 19:25:58 +03:00
}
/* drops the lock on success, retry */
if ( __io_worker_idle ( wqe , worker ) ) {
__release ( & wqe - > lock ) ;
2019-12-08 07:06:46 +03:00
goto loop ;
2019-10-22 19:25:58 +03:00
}
2020-09-01 11:41:46 +03:00
raw_spin_unlock_irq ( & wqe - > lock ) ;
2019-10-22 19:25:58 +03:00
if ( signal_pending ( current ) )
flush_signals ( current ) ;
if ( schedule_timeout ( WORKER_IDLE_TIMEOUT ) )
continue ;
/* timed out, exit unless we're the fixed worker */
if ( test_bit ( IO_WQ_BIT_EXIT , & wq - > state ) | |
! ( worker - > flags & IO_WORKER_F_FIXED ) )
break ;
}
if ( test_bit ( IO_WQ_BIT_EXIT , & wq - > state ) ) {
2020-09-01 11:41:46 +03:00
raw_spin_lock_irq ( & wqe - > lock ) ;
2019-11-26 21:59:32 +03:00
if ( ! wq_list_empty ( & wqe - > work_list ) )
2019-10-22 19:25:58 +03:00
io_worker_handle_work ( worker ) ;
else
2020-09-01 11:41:46 +03:00
raw_spin_unlock_irq ( & wqe - > lock ) ;
2019-10-22 19:25:58 +03:00
}
io_worker_exit ( worker ) ;
return 0 ;
}
/*
* Called when a worker is scheduled in . Mark us as currently running .
*/
void io_wq_worker_running ( struct task_struct * tsk )
{
struct io_worker * worker = kthread_data ( tsk ) ;
struct io_wqe * wqe = worker - > wqe ;
if ( ! ( worker - > flags & IO_WORKER_F_UP ) )
return ;
if ( worker - > flags & IO_WORKER_F_RUNNING )
return ;
worker - > flags | = IO_WORKER_F_RUNNING ;
2019-11-07 21:41:16 +03:00
io_wqe_inc_running ( wqe , worker ) ;
2019-10-22 19:25:58 +03:00
}
/*
* Called when worker is going to sleep . If there are no workers currently
* running and we have work pending , wake up a free one or have the manager
* set one up .
*/
void io_wq_worker_sleeping ( struct task_struct * tsk )
{
struct io_worker * worker = kthread_data ( tsk ) ;
struct io_wqe * wqe = worker - > wqe ;
if ( ! ( worker - > flags & IO_WORKER_F_UP ) )
return ;
if ( ! ( worker - > flags & IO_WORKER_F_RUNNING ) )
return ;
worker - > flags & = ~ IO_WORKER_F_RUNNING ;
2020-09-01 11:41:46 +03:00
raw_spin_lock_irq ( & wqe - > lock ) ;
2019-11-07 21:41:16 +03:00
io_wqe_dec_running ( wqe , worker ) ;
2020-09-01 11:41:46 +03:00
raw_spin_unlock_irq ( & wqe - > lock ) ;
2019-10-22 19:25:58 +03:00
}
2019-11-19 18:37:07 +03:00
static bool create_io_worker ( struct io_wq * wq , struct io_wqe * wqe , int index )
2019-10-22 19:25:58 +03:00
{
2020-09-26 16:26:55 +03:00
struct io_wqe_acct * acct = & wqe - > acct [ index ] ;
2019-10-22 19:25:58 +03:00
struct io_worker * worker ;
2019-11-26 19:39:45 +03:00
worker = kzalloc_node ( sizeof ( * worker ) , GFP_KERNEL , wqe - > node ) ;
2019-10-22 19:25:58 +03:00
if ( ! worker )
2019-11-19 18:37:07 +03:00
return false ;
2019-10-22 19:25:58 +03:00
refcount_set ( & worker - > ref , 1 ) ;
worker - > nulls_node . pprev = NULL ;
worker - > wqe = wqe ;
2019-11-13 19:43:34 +03:00
spin_lock_init ( & worker - > lock ) ;
2019-10-22 19:25:58 +03:00
worker - > task = kthread_create_on_node ( io_wqe_worker , worker , wqe - > node ,
2019-11-07 21:41:16 +03:00
" io_wqe_worker-%d/%d " , index , wqe - > node ) ;
2019-10-22 19:25:58 +03:00
if ( IS_ERR ( worker - > task ) ) {
kfree ( worker ) ;
2019-11-19 18:37:07 +03:00
return false ;
2019-10-22 19:25:58 +03:00
}
2020-10-15 19:13:07 +03:00
kthread_bind_mask ( worker - > task , cpumask_of_node ( wqe - > node ) ) ;
2019-10-22 19:25:58 +03:00
2020-09-01 11:41:46 +03:00
raw_spin_lock_irq ( & wqe - > lock ) ;
2019-11-14 18:00:41 +03:00
hlist_nulls_add_head_rcu ( & worker - > nulls_node , & wqe - > free_list ) ;
2019-11-13 23:54:49 +03:00
list_add_tail_rcu ( & worker - > all_list , & wqe - > all_list ) ;
2019-10-22 19:25:58 +03:00
worker - > flags | = IO_WORKER_F_FREE ;
2019-11-07 21:41:16 +03:00
if ( index = = IO_WQ_ACCT_BOUND )
worker - > flags | = IO_WORKER_F_BOUND ;
if ( ! acct - > nr_workers & & ( worker - > flags & IO_WORKER_F_BOUND ) )
2019-10-22 19:25:58 +03:00
worker - > flags | = IO_WORKER_F_FIXED ;
2019-11-07 21:41:16 +03:00
acct - > nr_workers + + ;
2020-09-01 11:41:46 +03:00
raw_spin_unlock_irq ( & wqe - > lock ) ;
2019-10-22 19:25:58 +03:00
2019-11-07 21:41:16 +03:00
if ( index = = IO_WQ_ACCT_UNBOUND )
atomic_inc ( & wq - > user - > processes ) ;
2020-09-26 16:26:55 +03:00
refcount_inc ( & wq - > refs ) ;
2019-10-22 19:25:58 +03:00
wake_up_process ( worker - > task ) ;
2019-11-19 18:37:07 +03:00
return true ;
2019-10-22 19:25:58 +03:00
}
2019-11-07 21:41:16 +03:00
static inline bool io_wqe_need_worker ( struct io_wqe * wqe , int index )
2019-10-22 19:25:58 +03:00
__must_hold ( wqe - > lock )
{
2019-11-07 21:41:16 +03:00
struct io_wqe_acct * acct = & wqe - > acct [ index ] ;
2019-10-22 19:25:58 +03:00
2019-11-07 21:41:16 +03:00
/* if we have available workers or no work, no need */
2019-11-14 18:00:41 +03:00
if ( ! hlist_nulls_empty ( & wqe - > free_list ) | | ! io_wqe_run_queue ( wqe ) )
2019-11-07 21:41:16 +03:00
return false ;
return acct - > nr_workers < acct - > max_workers ;
2019-10-22 19:25:58 +03:00
}
2020-09-26 16:26:55 +03:00
/*
 * Iterate the passed in list and call the specific function for each
 * worker that isn't exiting
 */
static bool io_wq_for_each_worker(struct io_wqe *wqe,
				  bool (*func)(struct io_worker *, void *),
				  void *data)
{
	struct io_worker *worker;
	bool ret = false;

	list_for_each_entry_rcu(worker, &wqe->all_list, all_list) {
		if (io_worker_get(worker)) {
			/* no task if node is/was offline */
			if (worker->task)
				ret = func(worker, data);
			io_worker_release(worker);
			if (ret)
				break;
		}
	}

	return ret;
}
static bool io_wq_worker_wake ( struct io_worker * worker , void * data )
{
wake_up_process ( worker - > task ) ;
return false ;
}
2019-10-22 19:25:58 +03:00
/*
* Manager thread . Tasked with creating new workers , if we need them .
*/
static int io_wq_manager ( void * data )
{
struct io_wq * wq = data ;
2019-11-26 21:10:20 +03:00
int node ;
2019-10-22 19:25:58 +03:00
2019-11-19 18:37:07 +03:00
/* create fixed workers */
2020-09-26 16:26:55 +03:00
refcount_set ( & wq - > refs , 1 ) ;
2019-11-26 21:10:20 +03:00
for_each_node ( node ) {
2020-02-11 16:30:06 +03:00
if ( ! node_online ( node ) )
continue ;
2020-09-26 16:26:55 +03:00
if ( create_io_worker ( wq , wq - > wqes [ node ] , IO_WQ_ACCT_BOUND ) )
continue ;
set_bit ( IO_WQ_BIT_ERROR , & wq - > state ) ;
set_bit ( IO_WQ_BIT_EXIT , & wq - > state ) ;
goto out ;
2019-11-19 18:37:07 +03:00
}
2019-10-22 19:25:58 +03:00
2019-11-19 18:37:07 +03:00
complete ( & wq - > done ) ;
while ( ! kthread_should_stop ( ) ) {
2020-04-03 20:26:26 +03:00
if ( current - > task_works )
task_work_run ( ) ;
2019-11-26 21:10:20 +03:00
for_each_node ( node ) {
struct io_wqe * wqe = wq - > wqes [ node ] ;
2019-11-07 21:41:16 +03:00
bool fork_worker [ 2 ] = { false , false } ;
2019-10-22 19:25:58 +03:00
2020-02-11 16:30:06 +03:00
if ( ! node_online ( node ) )
continue ;
2020-09-01 11:41:46 +03:00
raw_spin_lock_irq ( & wqe - > lock ) ;
2019-11-07 21:41:16 +03:00
if ( io_wqe_need_worker ( wqe , IO_WQ_ACCT_BOUND ) )
fork_worker [ IO_WQ_ACCT_BOUND ] = true ;
if ( io_wqe_need_worker ( wqe , IO_WQ_ACCT_UNBOUND ) )
fork_worker [ IO_WQ_ACCT_UNBOUND ] = true ;
2020-09-01 11:41:46 +03:00
raw_spin_unlock_irq ( & wqe - > lock ) ;
2019-11-07 21:41:16 +03:00
if ( fork_worker [ IO_WQ_ACCT_BOUND ] )
create_io_worker ( wq , wqe , IO_WQ_ACCT_BOUND ) ;
if ( fork_worker [ IO_WQ_ACCT_UNBOUND ] )
create_io_worker ( wq , wqe , IO_WQ_ACCT_UNBOUND ) ;
2019-10-22 19:25:58 +03:00
}
set_current_state ( TASK_INTERRUPTIBLE ) ;
schedule_timeout ( HZ ) ;
}
2020-04-03 20:26:26 +03:00
if ( current - > task_works )
task_work_run ( ) ;
2020-09-26 16:26:55 +03:00
out :
if ( refcount_dec_and_test ( & wq - > refs ) ) {
2019-11-19 18:37:07 +03:00
complete ( & wq - > done ) ;
2020-09-26 16:26:55 +03:00
return 0 ;
}
/* if ERROR is set and we get here, we have workers to wake */
if ( test_bit ( IO_WQ_BIT_ERROR , & wq - > state ) ) {
rcu_read_lock ( ) ;
for_each_node ( node )
io_wq_for_each_worker ( wq - > wqes [ node ] , io_wq_worker_wake , NULL ) ;
rcu_read_unlock ( ) ;
}
2019-11-19 18:37:07 +03:00
return 0 ;
2019-10-22 19:25:58 +03:00
}
2019-11-07 21:41:16 +03:00
static bool io_wq_can_queue ( struct io_wqe * wqe , struct io_wqe_acct * acct ,
struct io_wq_work * work )
{
bool free_worker ;
if ( ! ( work - > flags & IO_WQ_WORK_UNBOUND ) )
return true ;
if ( atomic_read ( & acct - > nr_running ) )
return true ;
rcu_read_lock ( ) ;
2019-11-14 18:00:41 +03:00
free_worker = ! hlist_nulls_empty ( & wqe - > free_list ) ;
2019-11-07 21:41:16 +03:00
rcu_read_unlock ( ) ;
if ( free_worker )
return true ;
if ( atomic_read ( & wqe - > wq - > user - > processes ) > = acct - > max_workers & &
! ( capable ( CAP_SYS_RESOURCE ) | | capable ( CAP_SYS_ADMIN ) ) )
return false ;
return true ;
}
2020-03-04 16:14:12 +03:00
static void io_run_cancel ( struct io_wq_work * work , struct io_wqe * wqe )
2020-03-01 19:18:19 +03:00
{
2020-03-04 16:14:12 +03:00
struct io_wq * wq = wqe - > wq ;
2020-03-01 19:18:19 +03:00
do {
struct io_wq_work * old_work = work ;
work - > flags | = IO_WQ_WORK_CANCEL ;
2020-06-25 18:20:54 +03:00
work = wq - > do_work ( work ) ;
2020-03-04 16:14:12 +03:00
wq - > free_work ( old_work ) ;
2020-03-01 19:18:19 +03:00
} while ( work ) ;
}
2020-03-23 22:57:22 +03:00
static void io_wqe_insert_work ( struct io_wqe * wqe , struct io_wq_work * work )
{
unsigned int hash ;
struct io_wq_work * tail ;
if ( ! io_wq_is_hashed ( work ) ) {
append :
wq_list_add_tail ( & work - > list , & wqe - > work_list ) ;
return ;
}
hash = io_get_work_hash ( work ) ;
tail = wqe - > hash_tail [ hash ] ;
wqe - > hash_tail [ hash ] = work ;
if ( ! tail )
goto append ;
wq_list_add_after ( & work - > list , & tail - > list , & wqe - > work_list ) ;
}
2019-10-22 19:25:58 +03:00
/*
 * Queue @work on @wqe, cancelling it up front if unbound-worker limits
 * forbid queueing, and waking a worker when one is needed.
 */
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
	int work_flags;
	unsigned long flags;

	/*
	 * Do early check to see if we need a new unbound worker, and if we do,
	 * if we're allowed to do so. This isn't 100% accurate as there's a
	 * gap between this check and incrementing the value, but that's OK.
	 * It's close enough to not be an issue, fork() has the same delay.
	 */
	if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
		io_run_cancel(work, wqe);
		return;
	}

	/*
	 * Cache the flags before making the work visible on the list; once
	 * queued it may be run (and presumably freed) at any time, so @work
	 * must not be dereferenced after the unlock below.
	 */
	work_flags = work->flags;
	raw_spin_lock_irqsave(&wqe->lock, flags);
	io_wqe_insert_work(wqe, work);
	/* new work means a stalled (hash-blocked) wqe may make progress */
	wqe->flags &= ~IO_WQE_FLAG_STALLED;
	raw_spin_unlock_irqrestore(&wqe->lock, flags);

	/* wake a worker if the work is concurrent or none are running */
	if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
	    !atomic_read(&acct->nr_running))
		io_wqe_wake_worker(wqe, acct);
}
void io_wq_enqueue ( struct io_wq * wq , struct io_wq_work * work )
{
struct io_wqe * wqe = wq - > wqes [ numa_node_id ( ) ] ;
io_wqe_enqueue ( wqe , work ) ;
}
/*
2020-03-14 00:31:04 +03:00
* Work items that hash to the same value will not be done in parallel .
* Used to limit concurrent writes , generally hashed by inode .
2019-10-22 19:25:58 +03:00
*/
2020-03-14 00:31:04 +03:00
void io_wq_hash_work ( struct io_wq_work * work , void * val )
2019-10-22 19:25:58 +03:00
{
2020-03-14 00:31:04 +03:00
unsigned int bit ;
2019-10-22 19:25:58 +03:00
bit = hash_ptr ( val , IO_WQ_HASH_ORDER ) ;
work - > flags | = ( IO_WQ_WORK_HASHED | ( bit < < IO_WQ_HASH_SHIFT ) ) ;
}
2019-10-29 06:49:21 +03:00
/*
 * State shared across the cancellation walks: the match callback plus
 * counters of how many items were dealt with in each state.
 */
struct io_cb_cancel_data {
	work_cancel_fn *fn;	/* returns true if the work should be cancelled */
	void *data;		/* opaque argument passed through to @fn */

	int nr_running;		/* matched items found running (signalled) */
	int nr_pending;		/* matched items removed while still pending */
	bool cancel_all;	/* keep scanning past the first match */
};
2020-03-07 01:15:39 +03:00
/*
 * Per-worker callback for cancelling in-flight work. If the worker's
 * current item matches and isn't flagged NO_CANCEL, interrupt the worker
 * with SIGINT and bump the running-match count. The return value stops
 * the worker iteration once something matched, unless the caller asked
 * to cancel all matches.
 */
static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
{
	struct io_cb_cancel_data *match = data;
	unsigned long flags;

	/*
	 * Hold the lock to avoid ->cur_work going out of scope, caller
	 * may dereference the passed in work.
	 */
	spin_lock_irqsave(&worker->lock, flags);
	if (worker->cur_work &&
	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
	    match->fn(worker->cur_work, match->data)) {
		send_sig(SIGINT, worker->task, 1);
		match->nr_running++;
	}
	spin_unlock_irqrestore(&worker->lock, flags);

	return match->nr_running && !match->cancel_all;
}
2020-08-23 20:33:10 +03:00
static inline void io_wqe_remove_pending ( struct io_wqe * wqe ,
struct io_wq_work * work ,
struct io_wq_work_node * prev )
{
unsigned int hash = io_get_work_hash ( work ) ;
struct io_wq_work * prev_work = NULL ;
if ( io_wq_is_hashed ( work ) & & work = = wqe - > hash_tail [ hash ] ) {
if ( prev )
prev_work = container_of ( prev , struct io_wq_work , list ) ;
if ( prev_work & & io_get_work_hash ( prev_work ) = = hash )
wqe - > hash_tail [ hash ] = prev_work ;
else
wqe - > hash_tail [ hash ] = NULL ;
}
wq_list_del ( & wqe - > work_list , & work - > list , prev ) ;
}
2020-06-15 10:24:03 +03:00
/*
 * Cancel matching work still sitting on the pending list. Each match is
 * unlinked under the lock, then run through io_run_cancel() with the lock
 * dropped; when cancelling everything, the scan restarts from the top
 * because the list may have changed while unlocked.
 */
static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
				       struct io_cb_cancel_data *match)
{
	struct io_wq_work_node *node, *prev;
	struct io_wq_work *work;
	unsigned long flags;

retry:
	raw_spin_lock_irqsave(&wqe->lock, flags);
	wq_list_for_each(node, prev, &wqe->work_list) {
		work = container_of(node, struct io_wq_work, list);
		if (!match->fn(work, match->data))
			continue;

		io_wqe_remove_pending(wqe, work, prev);
		raw_spin_unlock_irqrestore(&wqe->lock, flags);
		io_run_cancel(work, wqe);
		match->nr_pending++;
		if (!match->cancel_all)
			return;

		/* not safe to continue after unlock */
		goto retry;
	}
	raw_spin_unlock_irqrestore(&wqe->lock, flags);
}
2020-06-15 10:24:03 +03:00
/*
 * Attempt to cancel matching work currently being executed: walk all of
 * this wqe's workers (under RCU) and signal those running a match via
 * io_wq_worker_cancel().
 */
static void io_wqe_cancel_running_work(struct io_wqe *wqe,
				       struct io_cb_cancel_data *match)
{
	rcu_read_lock();
	io_wq_for_each_worker(wqe, io_wq_worker_cancel, match);
	rcu_read_unlock();
}
2020-03-07 01:15:39 +03:00
/*
 * Cancel work matching @cancel(@data) across all nodes. Returns
 * IO_WQ_CANCEL_OK if (any) matching work was removed before running,
 * IO_WQ_CANCEL_RUNNING if a running match was signalled, and
 * IO_WQ_CANCEL_NOTFOUND if nothing matched. With @cancel_all, every
 * match everywhere is dealt with before returning.
 */
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
				  void *data, bool cancel_all)
{
	struct io_cb_cancel_data match = {
		.fn		= cancel,
		.data		= data,
		.cancel_all	= cancel_all,
	};
	int node;

	/*
	 * Pass 1: the pending lists. If we're lucky we can just remove the
	 * work from there. CANCEL_OK means the work is returned as-new and
	 * no completion will be posted for it.
	 */
	for_each_node(node) {
		io_wqe_cancel_pending_work(wq->wqes[node], &match);
		if (match.nr_pending && !match.cancel_all)
			return IO_WQ_CANCEL_OK;
	}

	/*
	 * Pass 2: workers that may have the work running right now (free
	 * going-busy, or busy). CANCEL_RUNNING indicates we attempted to
	 * signal cancellation; the completion will run normally.
	 */
	for_each_node(node) {
		io_wqe_cancel_running_work(wq->wqes[node], &match);
		if (match.nr_running && !match.cancel_all)
			return IO_WQ_CANCEL_RUNNING;
	}

	/* cancel_all: report the strongest result we achieved */
	if (match.nr_running)
		return IO_WQ_CANCEL_RUNNING;
	return match.nr_pending ? IO_WQ_CANCEL_OK : IO_WQ_CANCEL_NOTFOUND;
}
2019-11-25 18:49:20 +03:00
struct io_wq * io_wq_create ( unsigned bounded , struct io_wq_data * data )
2019-10-22 19:25:58 +03:00
{
2019-11-26 21:10:20 +03:00
int ret = - ENOMEM , node ;
2019-10-22 19:25:58 +03:00
struct io_wq * wq ;
2020-06-08 21:08:20 +03:00
if ( WARN_ON_ONCE ( ! data - > free_work | | ! data - > do_work ) )
2020-03-04 16:14:12 +03:00
return ERR_PTR ( - EINVAL ) ;
2019-11-26 19:39:45 +03:00
wq = kzalloc ( sizeof ( * wq ) , GFP_KERNEL ) ;
2019-10-22 19:25:58 +03:00
if ( ! wq )
return ERR_PTR ( - ENOMEM ) ;
2019-11-26 21:10:20 +03:00
wq - > wqes = kcalloc ( nr_node_ids , sizeof ( struct io_wqe * ) , GFP_KERNEL ) ;
2020-10-22 18:02:50 +03:00
if ( ! wq - > wqes )
goto err_wq ;
ret = cpuhp_state_add_instance_nocalls ( io_wq_online , & wq - > cpuhp_node ) ;
if ( ret )
goto err_wqes ;
2019-10-22 19:25:58 +03:00
2020-03-04 16:14:12 +03:00
wq - > free_work = data - > free_work ;
2020-06-08 21:08:20 +03:00
wq - > do_work = data - > do_work ;
2019-11-13 08:31:31 +03:00
2019-11-07 21:41:16 +03:00
/* caller must already hold a reference to this */
2019-11-25 18:49:20 +03:00
wq - > user = data - > user ;
2019-11-07 21:41:16 +03:00
2020-10-22 18:02:50 +03:00
ret = - ENOMEM ;
2019-11-26 21:10:20 +03:00
for_each_node ( node ) {
2019-10-22 19:25:58 +03:00
struct io_wqe * wqe ;
2020-02-11 16:30:06 +03:00
int alloc_node = node ;
2019-10-22 19:25:58 +03:00
2020-02-11 16:30:06 +03:00
if ( ! node_online ( alloc_node ) )
alloc_node = NUMA_NO_NODE ;
wqe = kzalloc_node ( sizeof ( struct io_wqe ) , GFP_KERNEL , alloc_node ) ;
2019-10-22 19:25:58 +03:00
if ( ! wqe )
2019-11-26 21:10:20 +03:00
goto err ;
wq - > wqes [ node ] = wqe ;
2020-02-11 16:30:06 +03:00
wqe - > node = alloc_node ;
2019-11-07 21:41:16 +03:00
wqe - > acct [ IO_WQ_ACCT_BOUND ] . max_workers = bounded ;
atomic_set ( & wqe - > acct [ IO_WQ_ACCT_BOUND ] . nr_running , 0 ) ;
2019-11-25 18:49:20 +03:00
if ( wq - > user ) {
2019-11-07 21:41:16 +03:00
wqe - > acct [ IO_WQ_ACCT_UNBOUND ] . max_workers =
task_rlimit ( current , RLIMIT_NPROC ) ;
}
atomic_set ( & wqe - > acct [ IO_WQ_ACCT_UNBOUND ] . nr_running , 0 ) ;
2019-10-22 19:25:58 +03:00
wqe - > wq = wq ;
2020-09-01 11:41:46 +03:00
raw_spin_lock_init ( & wqe - > lock ) ;
2019-11-26 21:59:32 +03:00
INIT_WQ_LIST ( & wqe - > work_list ) ;
2019-11-14 18:00:41 +03:00
INIT_HLIST_NULLS_HEAD ( & wqe - > free_list , 0 ) ;
2019-11-13 23:54:49 +03:00
INIT_LIST_HEAD ( & wqe - > all_list ) ;
2019-10-22 19:25:58 +03:00
}
init_completion ( & wq - > done ) ;
wq - > manager = kthread_create ( io_wq_manager , wq , " io_wq_manager " ) ;
if ( ! IS_ERR ( wq - > manager ) ) {
wake_up_process ( wq - > manager ) ;
2019-11-19 18:37:07 +03:00
wait_for_completion ( & wq - > done ) ;
if ( test_bit ( IO_WQ_BIT_ERROR , & wq - > state ) ) {
ret = - ENOMEM ;
goto err ;
}
2020-01-24 01:33:32 +03:00
refcount_set ( & wq - > use_refs , 1 ) ;
2019-11-19 18:37:07 +03:00
reinit_completion ( & wq - > done ) ;
2019-10-22 19:25:58 +03:00
return wq ;
}
ret = PTR_ERR ( wq - > manager ) ;
complete ( & wq - > done ) ;
2019-11-19 18:37:07 +03:00
err :
2020-10-22 18:02:50 +03:00
cpuhp_state_remove_instance_nocalls ( io_wq_online , & wq - > cpuhp_node ) ;
2019-11-26 21:10:20 +03:00
for_each_node ( node )
kfree ( wq - > wqes [ node ] ) ;
2020-10-22 18:02:50 +03:00
err_wqes :
2019-11-19 18:37:07 +03:00
kfree ( wq - > wqes ) ;
2020-10-22 18:02:50 +03:00
err_wq :
2019-11-19 18:37:07 +03:00
kfree ( wq ) ;
2019-10-22 19:25:58 +03:00
return ERR_PTR ( ret ) ;
}
2020-01-28 03:15:47 +03:00
bool io_wq_get ( struct io_wq * wq , struct io_wq_data * data )
{
2020-06-08 21:08:20 +03:00
if ( data - > free_work ! = wq - > free_work | | data - > do_work ! = wq - > do_work )
2020-01-28 03:15:47 +03:00
return false ;
return refcount_inc_not_zero ( & wq - > use_refs ) ;
}
2020-01-24 01:33:32 +03:00
/*
 * Tear down @wq: unregister from CPU hotplug, tell the manager to exit,
 * wake all workers so they notice the exit state, wait for full shutdown
 * (signalled through wq->done), then free everything.
 */
static void __io_wq_destroy(struct io_wq *wq)
{
	int node;

	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);

	/* mark exit before stopping the manager so it can't restart work */
	set_bit(IO_WQ_BIT_EXIT, &wq->state);
	if (wq->manager)
		kthread_stop(wq->manager);

	rcu_read_lock();
	for_each_node(node)
		io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL);
	rcu_read_unlock();

	wait_for_completion(&wq->done);

	for_each_node(node)
		kfree(wq->wqes[node]);
	kfree(wq->wqes);
	kfree(wq);
}
2020-01-24 01:33:32 +03:00
void io_wq_destroy ( struct io_wq * wq )
{
if ( refcount_dec_and_test ( & wq - > use_refs ) )
__io_wq_destroy ( wq ) ;
}
2020-04-03 20:26:26 +03:00
/* Return the wq's manager task (accessor for callers outside io-wq). */
struct task_struct *io_wq_get_task(struct io_wq *wq)
{
	return wq->manager;
}
2020-10-22 18:02:50 +03:00
static bool io_wq_worker_affinity ( struct io_worker * worker , void * data )
{
struct task_struct * task = worker - > task ;
struct rq_flags rf ;
struct rq * rq ;
rq = task_rq_lock ( task , & rf ) ;
do_set_cpus_allowed ( task , cpumask_of_node ( worker - > wqe - > node ) ) ;
task - > flags | = PF_NO_SETAFFINITY ;
task_rq_unlock ( rq , task , & rf ) ;
return false ;
}
/*
 * CPU hotplug (online) callback: re-apply NUMA node affinity to every
 * worker of this wq, in case the newly-onlined CPU widens a node's mask.
 */
static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node);
	int n;

	rcu_read_lock();
	for_each_node(n)
		io_wq_for_each_worker(wq->wqes[n], io_wq_worker_affinity, NULL);
	rcu_read_unlock();

	return 0;
}
/*
 * Register the multi-instance CPU hotplug state used by all io_wq
 * instances and remember its dynamically-allocated state id.
 */
static __init int io_wq_init(void)
{
	int ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online",
					  io_wq_cpu_online, NULL);

	if (ret < 0)
		return ret;

	io_wq_online = ret;
	return 0;
}
subsys_initcall(io_wq_init);