#ifndef __CGROUP_INTERNAL_H
#define __CGROUP_INTERNAL_H

#include <linux/cgroup.h>
#include <linux/kernfs.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>

/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 */
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/* list of cgrp_cset_links anchored at cgrp->cset_links */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
};
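
/*
 * Illustrative sketch, not part of this header: with css_set_lock held,
 * the links can be walked from the cgroup side through ->cset_links.
 * "count_linked_csets" is a hypothetical helper, not a kernel function.
 *
 *	static int count_linked_csets(struct cgroup *cgrp)
 *	{
 *		struct cgrp_cset_link *link;
 *		int n = 0;
 *
 *		lockdep_assert_held(&css_set_lock);
 *		list_for_each_entry(link, &cgrp->cset_links, cset_link)
 *			n++;
 *		return n;
 *	}
 *
 * The css_set side is walked the same way via css_set->cgrp_links and the
 * link's cgrp_link member.
 */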

/* used to track tasks and csets during migration */
struct cgroup_taskset {
	/* the src and dst cset list running through cset->mg_node */
	struct list_head	src_csets;
	struct list_head	dst_csets;

	/*
	 * The number of tasks in the set.  css_sets can stay pinned with
	 * no tasks in them, so this count, not the presence of source
	 * css_sets, decides whether migration methods are invoked.
	 */
	int			nr_tasks;

	/* the subsys currently being processed */
	int			ssid;

	/*
	 * Fields for cgroup_taskset_*() iteration.
	 *
	 * Before migration is committed, the target migration tasks are on
	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
	 * the csets on ->dst_csets.  ->csets points to either ->src_csets
	 * or ->dst_csets depending on whether migration is committed.
	 *
	 * ->cur_cset and ->cur_task point to the current task position
	 * during iteration.
	 */
	struct list_head	*csets;
	struct css_set		*cur_cset;
	struct task_struct	*cur_task;
};
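
/*
 * Illustrative sketch, not part of this header: subsystem callbacks such
 * as ->can_attach() walk the taskset with cgroup_taskset_for_each() from
 * <linux/cgroup.h>.  "example_can_attach" and "task_is_allowed" below are
 * hypothetical.
 *
 *	static int example_can_attach(struct cgroup_taskset *tset)
 *	{
 *		struct cgroup_subsys_state *css;
 *		struct task_struct *task;
 *
 *		cgroup_taskset_for_each(task, css, tset)
 *			if (!task_is_allowed(task))
 *				return -EINVAL;
 *		return 0;
 *	}
 *
 * nr_tasks above is what lets migration skip such callbacks entirely when
 * the set is empty.
 */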

/* migration context also tracks preloading */
struct cgroup_mgctx {
	/*
	 * Preloaded source and destination csets.  Used to guarantee
	 * atomic success or failure on actual migration.
	 */
	struct list_head	preloaded_src_csets;
	struct list_head	preloaded_dst_csets;

	/* tasks and csets to migrate */
	struct cgroup_taskset	tset;

	/* subsystems affected by migration */
	u16			ss_mask;
};

#define CGROUP_TASKSET_INIT(tset)					\
{									\
	.src_csets	= LIST_HEAD_INIT(tset.src_csets),		\
	.dst_csets	= LIST_HEAD_INIT(tset.dst_csets),		\
	.csets		= &tset.src_csets,				\
}

#define CGROUP_MGCTX_INIT(name)						\
{									\
	LIST_HEAD_INIT(name.preloaded_src_csets),			\
	LIST_HEAD_INIT(name.preloaded_dst_csets),			\
	CGROUP_TASKSET_INIT(name.tset),					\
}

#define DEFINE_CGROUP_MGCTX(name)					\
	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
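
/*
 * Illustrative sketch, not part of this header: DEFINE_CGROUP_MGCTX()
 * yields a fully initialized, empty on-stack context, so a migration
 * sequence needs no separate init step.  "example_move" is hypothetical;
 * see cgroup_attach_task() in cgroup.c for the real sequence.
 *
 *	static int example_move(struct task_struct *leader, bool threadgroup,
 *				struct cgroup *dst_cgrp)
 *	{
 *		DEFINE_CGROUP_MGCTX(mgctx);
 *		int ret;
 *
 *		cgroup_migrate_add_src(task_css_set(leader), dst_cgrp, &mgctx);
 *		ret = cgroup_migrate_prepare_dst(&mgctx);
 *		if (!ret)
 *			ret = cgroup_migrate(leader, threadgroup, &mgctx);
 *		cgroup_migrate_finish(&mgctx);
 *		return ret;
 *	}
 *
 * (Locking elided: the real caller holds cgroup_mutex, and css_set_lock
 * around cgroup_migrate_add_src().)
 */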

struct cgroup_sb_opts {
	u16 subsys_mask;
	unsigned int flags;
	char *release_agent;
	bool cpuset_clone_children;
	char *name;
	/* User explicitly requested empty subsystem */
	bool none;
};

extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
extern struct file_system_type cgroup_fs_type;

/* iterate across the hierarchies */
#define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)

/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
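
/*
 * Illustrative sketch, not part of this header: the assignment in the
 * loop condition lets the body use @ss directly, e.g. a (hypothetical)
 * debug dump of all enabled subsystems:
 *
 *	struct cgroup_subsys *ss;
 *	int ssid;
 *
 *	for_each_subsys(ss, ssid)
 *		pr_debug("subsys %d: %s\n", ssid, ss->name);
 */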

static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
	return !(cgrp->self.flags & CSS_ONLINE);
}

static inline bool notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}

void put_css_set_locked(struct css_set *cset);

static inline void put_css_set(struct css_set *cset)
{
	unsigned long flags;

	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it.  Similar to atomic_dec_and_lock(), but for an
	 * rwlock.
	 */
	if (refcount_dec_not_one(&cset->refcount))
		return;

	spin_lock_irqsave(&css_set_lock, flags);
	put_css_set_locked(cset);
	spin_unlock_irqrestore(&css_set_lock, flags);
}

/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cset)
{
	refcount_inc(&cset->refcount);
}
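
/*
 * Illustrative sketch, not part of this header: refcount_dec_not_one()
 * lets put_css_set() drop references without taking css_set_lock; only a
 * possible 1->0 transition falls through to the locked slow path.  A
 * typical (hypothetical) get/put pairing:
 *
 *	spin_lock_irq(&css_set_lock);
 *	cset = task_css_set(task);	// needs css_set_lock or RCU
 *	get_css_set(cset);
 *	spin_unlock_irq(&css_set_lock);
 *
 *	// ... use cset ...
 *
 *	put_css_set(cset);
 */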

bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root);
struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
			  struct cgroup_namespace *ns);

void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
			       struct cgroup_root *root, unsigned long magic,
			       struct cgroup_namespace *ns);

bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
			    struct cgroup_mgctx *mgctx);
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
		   struct cgroup_mgctx *mgctx);

int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
		       bool threadgroup);
ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off, bool threadgroup);
ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf,
			   size_t nbytes, loff_t off);

void cgroup_lock_and_drain_offline(struct cgroup *cgrp);

int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
int cgroup_rmdir(struct kernfs_node *kn);
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
		     struct kernfs_root *kf_root);

int cgroup_task_count(const struct cgroup *cgrp);

/*
 * namespace.c
 */
extern const struct proc_ns_operations cgroupns_operations;

/*
 * cgroup-v1.c
 */
extern struct cftype cgroup1_base_files[];
extern const struct file_operations proc_cgroupstats_operations;
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;

bool cgroup1_ssid_disabled(int ssid);
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
void cgroup1_release_agent(struct work_struct *work);
void cgroup1_check_for_release(struct cgroup *cgrp);
struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
			     void *data, unsigned long magic,
			     struct cgroup_namespace *ns);

#endif /* __CGROUP_INTERNAL_H */