#include "cgroup-internal.h"

#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/sort.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>

#include <trace/events/cgroup.h>

/*
 * pidlists linger the following amount before being destroyed.  The goal
 * is avoiding frequent destruction in the middle of consecutive read calls.
 * Expiring in the middle is a performance problem not a correctness one.
 * 1 sec should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;

/*
 * pidlist destructions need to be flushed on cgroup destruction.  Use a
 * separate workqueue as flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

/*
 * Protects cgroup_root->release_agent_path.  Modifying it also requires
 * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);
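
/* True if @ssid was blocked on the v1 hierarchies via the cgroup_no_v1= boot parameter. */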
bool cgroup1_ssid_disabled(int ssid)
{
	return cgroup_no_v1_mask & (1 << ssid);
}

/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroup_root *root;
	int retval = 0;

	mutex_lock(&cgroup_mutex);
	percpu_down_write(&cgroup_threadgroup_rwsem);
	for_each_root(root) {
		struct cgroup *from_cgrp;

		if (root == &cgrp_dfl_root)
			continue;

		spin_lock_irq(&css_set_lock);
		from_cgrp = task_cgroup_from_root(from, root);
		spin_unlock_irq(&css_set_lock);

		retval = cgroup_attach_task(from_cgrp, tsk, false);
		if (retval)
			break;
	}
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 *
 * Locking rules between cgroup_post_fork() and the migration path
 * guarantee that a new child forked while its parent is being migrated
 * is either visible in the source cgroup after the parent's migration
 * completes or put into the target cgroup.  No task can slip out of
 * migration through forking.
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
	DEFINE_CGROUP_MGCTX(mgctx);
	struct cgrp_cset_link *link;
	struct css_task_iter it;
	struct task_struct *task;
	int ret;

	if (cgroup_on_dfl(to))
		return -EINVAL;

	if (!cgroup_may_migrate_to(to))
		return -EBUSY;

	mutex_lock(&cgroup_mutex);
	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* all tasks in @from are being moved, all csets are source */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &from->cset_links, cset_link)
		cgroup_migrate_add_src(link->cset, to, &mgctx);
	spin_unlock_irq(&css_set_lock);

	ret = cgroup_migrate_prepare_dst(&mgctx);
	if (ret)
		goto out_err;

	/*
	 * Migrate tasks one-by-one until @from is empty.  This fails iff
	 * ->can_attach() fails.
	 */
	do {
		css_task_iter_start(&from->self, 0, &it);
		task = css_task_iter_next(&it);
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
			ret = cgroup_migrate(task, false, &mgctx);
			if (!ret)
				trace_cgroup_transfer_tasks(to, task, false);
			put_task_struct(task);
		}
	} while (task && !ret);
out_err:
	cgroup_migrate_finish(&mgctx);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

/*
 * Stuff for reading the 'tasks'/'procs' files.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks.  So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 */

/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,
	CGROUP_FILE_TASKS,
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks").  We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
	/*
	 * used to find which pidlist is wanted.  doesn't change as long as
	 * this particular list stays in the list.
	 */
	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
	/* array of xids */
	pid_t *list;
	/* how many elements the above list has */
	int length;
	/* each of these stored in a list by its cgroup */
	struct list_head links;
	/* pointer to the cgroup we belong to, for list removal purposes */
	struct cgroup *owner;
	/* for delayed destruction */
	struct delayed_work destroy_dwork;
};

/*
 * The following two functions "fix" the issue where there are more pids
 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
 * TODO: replace with a kernel-wide solution to this problem
 */
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
	if (PIDLIST_TOO_LARGE(count))
		return vmalloc(count * sizeof(pid_t));
	else
		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}

static void pidlist_free(void *p)
{
	kvfree(p);
}

/*
 * Used to destroy all pidlists still lingering on their destroy timer.
 * None should be left afterwards.
 */
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
{
	struct cgroup_pidlist *l, *tmp_l;

	mutex_lock(&cgrp->pidlist_mutex);
	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
	mutex_unlock(&cgrp->pidlist_mutex);

	flush_workqueue(cgroup_pidlist_destroy_wq);
	BUG_ON(!list_empty(&cgrp->pidlists));
}

static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
	 */
	if (!delayed_work_pending(dwork)) {
		list_del(&l->links);
		pidlist_free(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	mutex_unlock(&l->owner->pidlist_mutex);
	kfree(tofree);
}

/*
 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
 * Returns the number of unique elements.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int src, dest = 1;

	/*
	 * we presume the 0th element is unique, so src starts at 1.  trivial
	 * edge cases first; no work needs to be done for either
	 */
	if (length == 0 || length == 1)
		return length;
	/* src and dest walk down the list; dest counts unique elements */
	for (src = 1; src < length; src++) {
		/* find next unique element */
		while (list[src] == list[src - 1]) {
			src++;
			if (src == length)
				goto after;
		}
		/* dest always points to where the next unique element goes */
		list[dest] = list[src];
		dest++;
	}
after:
	return dest;
}

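/*
 * For example, pidlist_uniq() on the sorted array {1, 1, 2, 2, 3} compacts
 * it in place so that it begins with {1, 2, 3} and returns 3; only the
 * first 'dest' entries are meaningful afterwards.
 */
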
/*
 * The two pid files - tasks and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order differs
 * per namespace, each namespace needs its own sorted list, making it
 * impossible to use, for example, a single rbtree of member tasks sorted
 * by task pointer.  As pidlists can be fairly large, allocating one per
 * open file is dangerous, so cgroup had to implement a shared pool of
 * pidlists keyed by cgroup and namespace.
 */
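
/* sort() comparison callback: order pids ascending */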
static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
						  enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;
	/* don't need task_nsproxy() if we're looking at ourself */
	struct pid_namespace *ns = task_active_pid_ns(current);

	lockdep_assert_held(&cgrp->pidlist_mutex);

	list_for_each_entry(l, &cgrp->pidlists, links)
		if (l->key.type == type && l->key.ns == ns)
			return l;
	return NULL;
}

/*
 * Find the pidlist for our purpose (given procs vs tasks), or create a new
 * one if none exists yet.  Must be called with cgrp->pidlist_mutex held.
 * Returns the pidlist, or NULL if we're out of memory.
 */
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
						enum cgroup_filetype type)
{
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	l = cgroup_pidlist_find(cgrp, type);
	if (l)
		return l;

	/* entry not found; create a new one */
	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
	if (!l)
		return l;

	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
	l->key.type = type;
	/* don't need task_nsproxy() if we're looking at ourself */
	l->key.ns = get_pid_ns(task_active_pid_ns(current));
	l->owner = cgrp;
	list_add(&l->links, &cgrp->pidlists);
	return l;
}

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 */
int cgroup_task_count(const struct cgroup *cgrp)
{
	int count = 0;
	struct cgrp_cset_link *link;

	spin_lock_irq(&css_set_lock);
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += link->cset->nr_tasks;
	spin_unlock_irq(&css_set_lock);
	return count;
}

/*
 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
 */
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
			      struct cgroup_pidlist **lp)
{
	pid_t *array;
	int length;
	int pid, n = 0; /* used for populating the array */
	struct css_task_iter it;
	struct task_struct *tsk;
	struct cgroup_pidlist *l;

	lockdep_assert_held(&cgrp->pidlist_mutex);

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough.  This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	length = cgroup_task_count(cgrp);
	array = pidlist_allocate(length);
	if (!array)
		return -ENOMEM;
	/* now, populate the array */
	css_task_iter_start(&cgrp->self, 0, &it);
	while ((tsk = css_task_iter_next(&it))) {
		if (unlikely(n == length))
			break;
		/* get tgid or pid for procs or tasks file respectively */
		if (type == CGROUP_FILE_PROCS)
			pid = task_tgid_vnr(tsk);
		else
			pid = task_pid_vnr(tsk);
		if (pid > 0) /* make sure to only use valid results */
			array[n++] = pid;
	}
	css_task_iter_end(&it);
	length = n;
	/* now sort & (if procs) strip out duplicates */
	sort(array, length, sizeof(pid_t), cmppid, NULL);
	if (type == CGROUP_FILE_PROCS)
		length = pidlist_uniq(array, length);

	l = cgroup_pidlist_find_create(cgrp, type);
	if (!l) {
		pidlist_free(array);
		return -ENOMEM;
	}

	/* store array, freeing old if necessary */
	pidlist_free(l->list);
	l->list = array;
	l->length = length;
	*lp = l;
	return 0;
}

/*
 * seq_file methods for the tasks/procs files.  The seq_file position is the
 * next pid to display; the seq_file iterator is a pointer to the pid
 * in the cgroup->l->list array.
 */
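
/*
 * For each read() chunk the seq_file core roughly does:
 *
 *	v = start(s, &pos);
 *	while (v) {
 *		show(s, v);
 *		v = next(s, v, &pos);
 *	}
 *	stop(s, v);
 *
 * so start() below must cope with being called repeatedly at an arbitrary
 * position, and stop() is where the pidlist_mutex taken in start() is
 * released.
 */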
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start).  Use a binary-search to find the
	 * next pid to display, if any
	 */
	struct kernfs_open_file *of = s->private;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct cgroup_pidlist *l;
	enum cgroup_filetype type = seq_cft(s)->private;
	int index = 0, pid = *pos;
	int *iter, ret;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
	 * !NULL @of->priv indicates that this isn't the first start()
	 * after open.  If the matching pidlist is around, we can use that.
	 * Look for it.  Note that @of->priv can't be used directly.  It
	 * could already have been destroyed.
	 */
	if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed in between.  Create a new one.
	 */
	if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
					 (struct cgroup_pidlist **)&of->priv);
		if (ret)
			return ERR_PTR(ret);
	}
	l = of->priv;

	if (pid) {
		int end = l->length;

		while (index < end) {
			int mid = (index + end) / 2;
			if (l->list[mid] == pid) {
				index = mid;
				break;
			} else if (l->list[mid] <= pid)
				index = mid + 1;
			else
				end = mid;
		}
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = *iter;
	return iter;
}

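/*
 * Rather than freeing the pidlist immediately, stop() re-arms its destroy
 * work with CGROUP_PIDLIST_DESTROY_DELAY so that the consecutive read()
 * calls of a single traversal can keep reusing it (see the comment at the
 * top of this file).
 */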
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;

	if (l)
		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
				 CGROUP_PIDLIST_DESTROY_DELAY);
	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}

static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;
	pid_t *p = v;
	pid_t *end = l->list + l->length;
	/*
	 * Advance to the next pid in the array.  If this goes off the
	 * end, we're done
	 */
	p++;
	if (p >= end) {
		return NULL;
	} else {
		*pos = *p;
		return p;
	}
}

static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	seq_printf(s, "%d\n", *(int *)v);
	return 0;
}

static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
				     char *buf, size_t nbytes, loff_t off,
				     bool threadgroup)
{
	struct cgroup *cgrp;
	struct task_struct *task;
	const struct cred *cred, *tcred;
	ssize_t ret;

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;

	task = cgroup_procs_write_start(buf, threadgroup);
	ret = PTR_ERR_OR_ZERO(task);
	if (ret)
		goto out_unlock;

	/*
	 * Even if we're attaching all tasks in the thread group, we only
	 * need to check permissions on one of them.
	 */
	cred = current_cred();
	tcred = get_task_cred(task);
	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
	    !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->euid, tcred->suid))
		ret = -EACCES;
	put_cred(tcred);
	if (ret)
		goto out_finish;

	ret = cgroup_attach_task(cgrp, task, threadgroup);

out_finish:
	cgroup_procs_write_finish(task);
out_unlock:
	cgroup_kn_unlock(of->kn);

	return ret ?: nbytes;
}

static ssize_t cgroup1_procs_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	return __cgroup1_procs_write(of, buf, nbytes, off, true);
}

static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	return __cgroup1_procs_write(of, buf, nbytes, off, false);
}

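/*
 * The two wrappers above back the "cgroup.procs" and "tasks" files
 * respectively, so e.g. "echo $$ > /sys/fs/cgroup/cpu/foo/cgroup.procs"
 * (path illustrative) moves the shell's whole thread group, while writing
 * the same pid to "tasks" moves only that single thread.
 */
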
static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;

	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);

	cgrp = cgroup_kn_lock_live(of->kn, false);
	if (!cgrp)
		return -ENODEV;
	spin_lock(&release_agent_path_lock);
	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
		sizeof(cgrp->root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
	cgroup_kn_unlock(of->kn);
	return nbytes;
}

static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	spin_lock(&release_agent_path_lock);
	seq_puts(seq, cgrp->root->release_agent_path);
	spin_unlock(&release_agent_path_lock);
	seq_putc(seq, '\n');
	return 0;
}

static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "0\n");
	return 0;
}

static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
					 struct cftype *cft)
{
	return notify_on_release(css->cgroup);
}

static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	else
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
	return 0;
}

static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}

static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	else
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	return 0;
}

/* cgroup core interface files for the legacy hierarchies */
struct cftype cgroup1_base_files[] = {
	{
		.name = "cgroup.procs",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_PROCS,
		.write = cgroup1_procs_write,
	},
	{
		.name = "cgroup.clone_children",
		.read_u64 = cgroup_clone_children_read,
		.write_u64 = cgroup_clone_children_write,
	},
	{
		.name = "cgroup.sane_behavior",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_sane_behavior_show,
	},
	{
		.name = "tasks",
		.seq_start = cgroup_pidlist_start,
		.seq_next = cgroup_pidlist_next,
		.seq_stop = cgroup_pidlist_stop,
		.seq_show = cgroup_pidlist_show,
		.private = CGROUP_FILE_TASKS,
		.write = cgroup1_tasks_write,
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
	},
	{
		.name = "release_agent",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.seq_show = cgroup_release_agent_show,
		.write = cgroup_release_agent_write,
		.max_write_len = PATH_MAX - 1,
	},
	{ }	/* terminate */
};

/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	struct cgroup_subsys *ss;
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
	mutex_lock(&cgroup_mutex);

	for_each_subsys(ss, i)
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->legacy_name, ss->root->hierarchy_id,
			   atomic_read(&ss->root->nr_cgrps),
			   cgroup_ssid_enabled(i));

	mutex_unlock(&cgroup_mutex);
	return 0;
}

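/*
 * The show routine above backs /proc/cgroups; a typical line of output
 * looks like (values illustrative):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset		2		4		1
 *	memory		3		112		1
 */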
static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}

const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct cgroup *cgrp;
	struct css_task_iter it;
	struct task_struct *tsk;

	/* it should be kernfs_node belonging to cgroupfs and is a directory */
	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
	    kernfs_type(kn) != KERNFS_DIR)
		return -EINVAL;

	mutex_lock(&cgroup_mutex);

	/*
	 * We aren't being called from kernfs and there's no guarantee on
	 * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
	 */
	rcu_read_lock();
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	if (!cgrp || cgroup_is_dead(cgrp)) {
		rcu_read_unlock();
		mutex_unlock(&cgroup_mutex);
		return -ENOENT;
	}
	rcu_read_unlock();

	css_task_iter_start(&cgrp->self, 0, &it);
	while ((tsk = css_task_iter_next(&it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	css_task_iter_end(&it);

	mutex_unlock(&cgroup_mutex);
	return 0;
}

void cgroup1_check_for_release(struct cgroup *cgrp)
{
	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
		schedule_work(&cgrp->release_agent_work);
}

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is in use
 * again by then, and this cgroup will be reprieved from its death
 * sentence, to continue to serve a useful existence.  Next time it's
 * released, we will get notified again, if it still has
 * 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
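
/*
 * For example (paths illustrative), with release_agent set to
 * /usr/local/sbin/cleanup-cgroup, emptying /sys/fs/cgroup/memory/jobs/42
 * results in roughly:
 *
 *	/usr/local/sbin/cleanup-cgroup /jobs/42
 *
 * i.e. argv[1] is the cgroup path relative to the hierarchy root, not the
 * full mount path.
 */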
void cgroup1_release_agent(struct work_struct *work)
{
	struct cgroup *cgrp =
		container_of(work, struct cgroup, release_agent_work);
	char *pathbuf = NULL, *agentbuf = NULL;
	char *argv[3], *envp[3];
	int ret;

	mutex_lock(&cgroup_mutex);

	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
	if (!pathbuf || !agentbuf)
		goto out;

	spin_lock_irq(&css_set_lock);
	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
	spin_unlock_irq(&css_set_lock);
	if (ret < 0 || ret >= PATH_MAX)
		goto out;

	argv[0] = agentbuf;
	argv[1] = pathbuf;
	argv[2] = NULL;

	/* minimal command environment */
	envp[0] = "HOME=/";
	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[2] = NULL;

	mutex_unlock(&cgroup_mutex);
	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
	goto out_free;
out:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(agentbuf);
	kfree(pathbuf);
}

/*
 * cgroup1_rename - Only allow simple rename of directories in place.
 */
static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			  const char *new_name_str)
{
	struct cgroup *cgrp = kn->priv;
	int ret;

	if (kernfs_type(kn) != KERNFS_DIR)
		return -ENOTDIR;
	if (kn->parent != new_parent)
		return -EIO;

	/*
	 * We're gonna grab cgroup_mutex which nests outside kernfs
	 * active_ref.  kernfs_rename() doesn't require active_ref
	 * protection.  Break them before grabbing cgroup_mutex.
	 */
	kernfs_break_active_protection(new_parent);
	kernfs_break_active_protection(kn);

	mutex_lock(&cgroup_mutex);

	ret = kernfs_rename(kn, new_parent, new_name_str);
	if (!ret)
		trace_cgroup_rename(cgrp);

	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	kernfs_unbreak_active_protection(new_parent);
	return ret;
}

static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
{
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_subsys *ss;
	int ssid;

	for_each_subsys(ss, ssid)
		if (root->subsys_mask & (1 << ssid))
			seq_show_option(seq, ss->legacy_name, NULL);
	if (root->flags & CGRP_ROOT_NOPREFIX)
		seq_puts(seq, ",noprefix");
	if (root->flags & CGRP_ROOT_XATTR)
		seq_puts(seq, ",xattr");

	spin_lock(&release_agent_path_lock);
	if (strlen(root->release_agent_path))
		seq_show_option(seq, "release_agent",
				root->release_agent_path);
	spin_unlock(&release_agent_path_lock);

	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
		seq_puts(seq, ",clone_children");
	if (strlen(root->name))
		seq_show_option(seq, "name", root->name);
	return 0;
}

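/*
 * Parses the option string of a v1 mount.  Typical examples (illustrative)
 * of what userspace passes here:
 *
 *	mount -t cgroup -o cpu,cpuacct cgroup /sys/fs/cgroup/cpu,cpuacct
 *	mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd
 *
 * i.e. a comma-separated mix of subsystem names and the flags handled below.
 */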
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
	char *token, *o = data;
	bool all_ss = false, one_ss = false;
	u16 mask = U16_MAX;
	struct cgroup_subsys *ss;
	int nr_opts = 0;
	int i;

#ifdef CONFIG_CPUSETS
	mask = ~((u16)1 << cpuset_cgrp_id);
#endif

	memset(opts, 0, sizeof(*opts));

	while ((token = strsep(&o, ",")) != NULL) {
		nr_opts++;

		if (!*token)
			return -EINVAL;
		if (!strcmp(token, "none")) {
			/* Explicitly have no subsystems */
			opts->none = true;
			continue;
		}
		if (!strcmp(token, "all")) {
			/* Mutually exclusive option 'all' + subsystem name */
			if (one_ss)
				return -EINVAL;
			all_ss = true;
			continue;
		}
		if (!strcmp(token, "noprefix")) {
			opts->flags |= CGRP_ROOT_NOPREFIX;
			continue;
		}
		if (!strcmp(token, "clone_children")) {
			opts->cpuset_clone_children = true;
			continue;
		}
		if (!strcmp(token, "xattr")) {
			opts->flags |= CGRP_ROOT_XATTR;
			continue;
		}
		if (!strncmp(token, "release_agent=", 14)) {
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
			opts->release_agent =
				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
			if (!opts->release_agent)
				return -ENOMEM;
			continue;
		}
		if (!strncmp(token, "name=", 5)) {
			const char *name = token + 5;
			/* Can't specify an empty name */
			if (!strlen(name))
				return -EINVAL;
			/* Must match [\w.-]+ */
			for (i = 0; i < strlen(name); i++) {
				char c = name[i];
				if (isalnum(c))
					continue;
				if ((c == '.') || (c == '-') || (c == '_'))
					continue;
				return -EINVAL;
			}
			/* Specifying two names is forbidden */
			if (opts->name)
				return -EINVAL;
			opts->name = kstrndup(name,
					      MAX_CGROUP_ROOT_NAMELEN - 1,
					      GFP_KERNEL);
			if (!opts->name)
				return -ENOMEM;

			continue;
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->legacy_name))
				continue;
			if (!cgroup_ssid_enabled(i))
				continue;
			if (cgroup1_ssid_disabled(i))
				continue;

			/* Mutually exclusive option 'all' + subsystem name */
			if (all_ss)
				return -EINVAL;
			opts->subsys_mask |= (1 << i);
			one_ss = true;

			break;
		}
		if (i == CGROUP_SUBSYS_COUNT)
			return -ENOENT;
	}

	/*
	 * If the 'all' option was specified, select all the subsystems.
	 * Otherwise, if neither 'none', 'name=' nor any subsystem name was
	 * specified, default to 'all'.
	 */
	if (all_ss || (!one_ss && !opts->none && !opts->name))
		for_each_subsys(ss, i)
			if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
				opts->subsys_mask |= (1 << i);

	/*
	 * We either have to specify by name or by subsystems. (So all
	 * empty hierarchies must have a name).
	 */
	if (!opts->subsys_mask && !opts->name)
		return -EINVAL;

	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
		return -EINVAL;

	/* Can't specify "none" and some subsystems */
	if (opts->subsys_mask && opts->none)
		return -EINVAL;

	return 0;
}

static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
	int ret = 0;
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	struct cgroup_sb_opts opts;
	u16 added_mask, removed_mask;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
		pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
			task_tgid_nr(current), current->comm);

	added_mask = opts.subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~opts.subsys_mask;

	/* Don't allow flags or name to change at remount */
	if ((opts.flags ^ root->flags) ||
	    (opts.name && strcmp(opts.name, root->name))) {
		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
		       opts.flags, opts.name ?: "", root->flags, root->name);
		ret = -EINVAL;
		goto out_unlock;
	}

	/* remounting is not allowed for populated hierarchies */
	if (!list_empty(&root->cgrp.self.children)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	ret = rebind_subsystems(root, added_mask);
	if (ret)
		goto out_unlock;

	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));

	if (opts.release_agent) {
		spin_lock(&release_agent_path_lock);
		strcpy(root->release_agent_path, opts.release_agent);
		spin_unlock(&release_agent_path_lock);
	}

	trace_cgroup_remount(root);

out_unlock:
	kfree(opts.release_agent);
	kfree(opts.name);
	mutex_unlock(&cgroup_mutex);
	return ret;
}

struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
	.rename			= cgroup1_rename,
	.show_options		= cgroup1_show_options,
	.remount_fs		= cgroup1_remount,
	.mkdir			= cgroup_mkdir,
	.rmdir			= cgroup_rmdir,
	.show_path		= cgroup_show_path,
};

struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
			     void *data, unsigned long magic,
			     struct cgroup_namespace *ns)
{
	struct super_block *pinned_sb = NULL;
	struct cgroup_sb_opts opts;
	struct cgroup_root *root;
	struct cgroup_subsys *ss;
	struct dentry *dentry;
	int i, ret;
	bool new_root = false;

	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);

	/* First find the desired set of subsystems */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	/*
	 * Destruction of cgroup root is asynchronous, so subsystems may
	 * still be dying after the previous unmount.  Let's drain the
	 * dying subsystems.  We just need to ensure that the ones
	 * unmounted previously finish dying and don't care about new ones
	 * starting.  Testing ref liveness is good enough.
	 */
	for_each_subsys(ss, i) {
		if (!(opts.subsys_mask & (1 << i)) ||
		    ss->root == &cgrp_dfl_root)
			continue;

		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}
		cgroup_put(&ss->root->cgrp);
	}

	for_each_root(root) {
		bool name_match = false;

		if (root == &cgrp_dfl_root)
			continue;

		/*
		 * If we asked for a name then it must match.  Also, if
		 * name matches but subsys_mask doesn't, we should fail.
		 * Remember whether name matched.
		 */
		if (opts.name) {
			if (strcmp(opts.name, root->name))
				continue;
			name_match = true;
		}

		/*
		 * If we asked for subsystems (or explicitly for no
		 * subsystems) then they must match.
		 */
		if ((opts.subsys_mask || opts.none) &&
		    (opts.subsys_mask != root->subsys_mask)) {
			if (!name_match)
				continue;
			ret = -EBUSY;
			goto out_unlock;
		}

		if (root->flags ^ opts.flags)
			pr_warn("new mount options do not match the existing superblock, will be ignored\n");

		/*
		 * We want to reuse @root whose lifetime is governed by its
		 * ->cgrp.  Let's check whether @root is alive and keep it
		 * that way.  As cgroup_kill_sb() can happen anytime, we
		 * want to block it by pinning the sb so that @root doesn't
		 * get killed before mount is complete.
		 *
		 * With the sb pinned, tryget_live can reliably indicate
		 * whether @root can be reused.  If it's being killed,
		 * drain it.  We can use wait_queue for the wait but this
		 * path is super cold.  Let's just sleep a bit and retry.
		 */
		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
		if (IS_ERR(pinned_sb) ||
		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
			mutex_unlock(&cgroup_mutex);
			if (!IS_ERR_OR_NULL(pinned_sb))
				deactivate_super(pinned_sb);
			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}

		ret = 0;
		goto out_unlock;
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create a new one without subsys specification.
	 */
	if (!opts.subsys_mask && !opts.none) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Hierarchies may only be created in the initial cgroup namespace. */
	if (ns != &init_cgroup_ns) {
		ret = -EPERM;
		goto out_unlock;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	new_root = true;

	init_cgroup_root(root, &opts);

	ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
	if (ret)
		cgroup_free_root(root);

out_unlock:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(opts.release_agent);
	kfree(opts.name);

	if (ret)
		return ERR_PTR(ret);

	dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
				 CGROUP_SUPER_MAGIC, ns);

	/*
	 * There's a race window after we release cgroup_mutex and before
	 * allocating a superblock.  Make sure a concurrent process won't
	 * be able to re-use the root during this window by delaying the
	 * initialization of root refcnt.
	 */
	if (new_root) {
		mutex_lock(&cgroup_mutex);
		percpu_ref_reinit(&root->cgrp.self.refcnt);
		mutex_unlock(&cgroup_mutex);
	}

	/*
	 * If @pinned_sb, we're reusing an existing root and holding an
	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
	 */
	if (pinned_sb)
		deactivate_super(pinned_sb);

	return dentry;
}

static int __init cgroup1_wq_init(void)
{
	/*
	 * Used to destroy pidlists and separate to serve as flush domain.
	 * Cap @max_active to 1 too.
	 */
	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
						    0, 1);
	BUG_ON(!cgroup_pidlist_destroy_wq);
	return 0;
}
core_initcall(cgroup1_wq_init);

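/*
 * Parses the cgroup_no_v1= boot parameter, e.g. "cgroup_no_v1=memory,blkio"
 * to block individual controllers from the v1 hierarchies, or
 * "cgroup_no_v1=all" to block every controller; the affected controllers
 * then remain available only on the v2 (default) hierarchy.
 */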
static int __init cgroup_no_v1(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		if (!strcmp(token, "all")) {
			cgroup_no_v1_mask = U16_MAX;
			break;
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
			    strcmp(token, ss->legacy_name))
				continue;

			cgroup_no_v1_mask |= 1 << i;
		}
	}
	return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);