2012-08-01 03:42:12 +04:00
/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */
# include <linux/cgroup.h>
# include <linux/slab.h>
# include <linux/hugetlb.h>
# include <linux/hugetlb_cgroup.h>
/*
 * Per-cgroup state of the hugetlb controller: the embedded css plus one
 * resource counter per huge page size slot.
 */
struct hugetlb_cgroup {
/* embedded css; hugetlb_cgroup_from_css() maps it back to this struct */
struct cgroup_subsys_state css ;
/*
 * The counters that account for hugepages from hugetlb. The array has
 * HUGE_MAX_HSTATE entries and is indexed by hstate index.
 */
struct res_counter hugepage [ HUGE_MAX_HSTATE ] ;
} ;
2012-08-01 03:42:24 +04:00
/*
 * Encoding of a control file's private value: the hstate index lives in
 * bits 16..31 and the resource attribute in bits 0..15.
 */
#define MEMFILE_PRIVATE(x, val)	((val) | ((x) << 16))
#define MEMFILE_IDX(val)	(0xffff & ((val) >> 16))
#define MEMFILE_ATTR(val)	(0xffff & (val))
2012-08-01 03:42:12 +04:00
/* Subsystem descriptor of the hugetlb controller; declared here without an initializer. */
struct cgroup_subsys hugetlb_subsys __read_mostly ;
/* The hugetlb_cgroup created without a parent; assigned in hugetlb_cgroup_css_alloc(). */
static struct hugetlb_cgroup * root_h_cgroup __read_mostly ;
static inline
struct hugetlb_cgroup * hugetlb_cgroup_from_css ( struct cgroup_subsys_state * s )
{
2013-08-09 04:11:23 +04:00
return s ? container_of ( s , struct hugetlb_cgroup , css ) : NULL ;
2012-08-01 03:42:12 +04:00
}
static inline
struct hugetlb_cgroup * hugetlb_cgroup_from_task ( struct task_struct * task )
{
2013-08-09 04:11:22 +04:00
return hugetlb_cgroup_from_css ( task_css ( task , hugetlb_subsys_id ) ) ;
2012-08-01 03:42:12 +04:00
}
/* True when @h_cg is the recorded root hugetlb cgroup. */
static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
	return root_h_cgroup == h_cg;
}
2013-08-09 04:11:22 +04:00
static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup ( struct hugetlb_cgroup * h_cg )
2012-08-01 03:42:12 +04:00
{
cgroup: add css_parent()
Currently, controllers have to explicitly follow the cgroup hierarchy
to find the parent of a given css. cgroup is moving towards using
cgroup_subsys_state as the main controller interface construct, so
let's provide a way to climb the hierarchy using just csses.
This patch implements css_parent() which, given a css, returns its
parent. The function is guarnateed to valid non-NULL parent css as
long as the target css is not at the top of the hierarchy.
freezer, cpuset, cpu, cpuacct, hugetlb, memory, net_cls and devices
are converted to use css_parent() instead of accessing cgroup->parent
directly.
* __parent_ca() is dropped from cpuacct and its usage is replaced with
parent_ca(). The only difference between the two was NULL test on
cgroup->parent which is now embedded in css_parent() making the
distinction moot. Note that eventually a css->parent field will be
added to css and the NULL check in css_parent() will go away.
This patch shouldn't cause any behavior differences.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
2013-08-09 04:11:23 +04:00
return hugetlb_cgroup_from_css ( css_parent ( & h_cg - > css ) ) ;
2012-08-01 03:42:12 +04:00
}
2013-08-09 04:11:22 +04:00
static inline bool hugetlb_cgroup_have_usage ( struct hugetlb_cgroup * h_cg )
2012-08-01 03:42:12 +04:00
{
int idx ;
for ( idx = 0 ; idx < hugetlb_max_hstate ; idx + + ) {
if ( ( res_counter_read_u64 ( & h_cg - > hugepage [ idx ] , RES_USAGE ) ) > 0 )
return true ;
}
return false ;
}
2013-08-09 04:11:23 +04:00
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc ( struct cgroup_subsys_state * parent_css )
2012-08-01 03:42:12 +04:00
{
2013-08-09 04:11:23 +04:00
struct hugetlb_cgroup * parent_h_cgroup = hugetlb_cgroup_from_css ( parent_css ) ;
struct hugetlb_cgroup * h_cgroup ;
2012-08-01 03:42:12 +04:00
int idx ;
h_cgroup = kzalloc ( sizeof ( * h_cgroup ) , GFP_KERNEL ) ;
if ( ! h_cgroup )
return ERR_PTR ( - ENOMEM ) ;
2013-08-09 04:11:23 +04:00
if ( parent_h_cgroup ) {
2012-08-01 03:42:12 +04:00
for ( idx = 0 ; idx < HUGE_MAX_HSTATE ; idx + + )
res_counter_init ( & h_cgroup - > hugepage [ idx ] ,
& parent_h_cgroup - > hugepage [ idx ] ) ;
} else {
root_h_cgroup = h_cgroup ;
for ( idx = 0 ; idx < HUGE_MAX_HSTATE ; idx + + )
res_counter_init ( & h_cgroup - > hugepage [ idx ] , NULL ) ;
}
return & h_cgroup - > css ;
}
2013-08-09 04:11:23 +04:00
/* Free the hugetlb_cgroup that embeds @css. */
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	kfree(hugetlb_cgroup_from_css(css));
}
2012-08-01 03:42:21 +04:00
/*
* Should be called with hugetlb_lock held .
* Since we are holding hugetlb_lock , pages cannot get moved from
* active list or uncharged from the cgroup , So no need to get
* page reference and test for page active here . This function
* cannot fail .
*/
2013-08-09 04:11:22 +04:00
static void hugetlb_cgroup_move_parent ( int idx , struct hugetlb_cgroup * h_cg ,
2012-08-01 03:42:21 +04:00
struct page * page )
{
int csize ;
struct res_counter * counter ;
struct res_counter * fail_res ;
struct hugetlb_cgroup * page_hcg ;
2013-08-09 04:11:22 +04:00
struct hugetlb_cgroup * parent = parent_hugetlb_cgroup ( h_cg ) ;
2012-08-01 03:42:21 +04:00
page_hcg = hugetlb_cgroup_from_page ( page ) ;
/*
* We can have pages in active list without any cgroup
* ie , hugepage with less than 3 pages . We can safely
* ignore those pages .
*/
if ( ! page_hcg | | page_hcg ! = h_cg )
goto out ;
csize = PAGE_SIZE < < compound_order ( page ) ;
if ( ! parent ) {
parent = root_h_cgroup ;
/* root has no limit */
res_counter_charge_nofail ( & parent - > hugepage [ idx ] ,
csize , & fail_res ) ;
}
counter = & h_cg - > hugepage [ idx ] ;
res_counter_uncharge_until ( counter , counter - > parent , csize ) ;
set_hugetlb_cgroup ( page , parent ) ;
out :
return ;
}
/*
* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
* the parent cgroup .
*/
2013-08-09 04:11:23 +04:00
static void hugetlb_cgroup_css_offline ( struct cgroup_subsys_state * css )
2012-08-01 03:42:12 +04:00
{
2013-08-09 04:11:23 +04:00
struct hugetlb_cgroup * h_cg = hugetlb_cgroup_from_css ( css ) ;
2012-08-01 03:42:21 +04:00
struct hstate * h ;
struct page * page ;
2012-10-26 15:37:33 +04:00
int idx = 0 ;
2012-08-01 03:42:21 +04:00
do {
for_each_hstate ( h ) {
spin_lock ( & hugetlb_lock ) ;
list_for_each_entry ( page , & h - > hugepage_activelist , lru )
2013-08-09 04:11:22 +04:00
hugetlb_cgroup_move_parent ( idx , h_cg , page ) ;
2012-08-01 03:42:21 +04:00
spin_unlock ( & hugetlb_lock ) ;
idx + + ;
}
cond_resched ( ) ;
2013-08-09 04:11:22 +04:00
} while ( hugetlb_cgroup_have_usage ( h_cg ) ) ;
2012-08-01 03:42:12 +04:00
}
2012-08-01 03:42:18 +04:00
int hugetlb_cgroup_charge_cgroup ( int idx , unsigned long nr_pages ,
struct hugetlb_cgroup * * ptr )
{
int ret = 0 ;
struct res_counter * fail_res ;
struct hugetlb_cgroup * h_cg = NULL ;
unsigned long csize = nr_pages * PAGE_SIZE ;
if ( hugetlb_cgroup_disabled ( ) )
goto done ;
/*
* We don ' t charge any cgroup if the compound page have less
* than 3 pages .
*/
if ( huge_page_order ( & hstates [ idx ] ) < HUGETLB_CGROUP_MIN_ORDER )
goto done ;
again :
rcu_read_lock ( ) ;
h_cg = hugetlb_cgroup_from_task ( current ) ;
if ( ! css_tryget ( & h_cg - > css ) ) {
rcu_read_unlock ( ) ;
goto again ;
}
rcu_read_unlock ( ) ;
ret = res_counter_charge ( & h_cg - > hugepage [ idx ] , csize , & fail_res ) ;
css_put ( & h_cg - > css ) ;
done :
* ptr = h_cg ;
return ret ;
}
2012-08-01 03:42:35 +04:00
/*
 * Record @h_cg as the owner of @page.  Does nothing when the controller
 * is disabled or @h_cg is NULL.  @idx and @nr_pages are not used here.
 *
 * Should be called with hugetlb_lock held.
 */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct page *page)
{
	if (!hugetlb_cgroup_disabled() && h_cg)
		set_hugetlb_cgroup(page, h_cg);
}
/*
* Should be called with hugetlb_lock held
*/
void hugetlb_cgroup_uncharge_page ( int idx , unsigned long nr_pages ,
struct page * page )
{
struct hugetlb_cgroup * h_cg ;
unsigned long csize = nr_pages * PAGE_SIZE ;
if ( hugetlb_cgroup_disabled ( ) )
return ;
VM_BUG_ON ( ! spin_is_locked ( & hugetlb_lock ) ) ;
h_cg = hugetlb_cgroup_from_page ( page ) ;
if ( unlikely ( ! h_cg ) )
return ;
set_hugetlb_cgroup ( page , NULL ) ;
res_counter_uncharge ( & h_cg - > hugepage [ idx ] , csize ) ;
return ;
}
void hugetlb_cgroup_uncharge_cgroup ( int idx , unsigned long nr_pages ,
struct hugetlb_cgroup * h_cg )
{
unsigned long csize = nr_pages * PAGE_SIZE ;
if ( hugetlb_cgroup_disabled ( ) | | ! h_cg )
return ;
if ( huge_page_order ( & hstates [ idx ] ) < HUGETLB_CGROUP_MIN_ORDER )
return ;
res_counter_uncharge ( & h_cg - > hugepage [ idx ] , csize ) ;
return ;
}
2013-12-05 21:28:03 +04:00
static u64 hugetlb_cgroup_read_u64 ( struct cgroup_subsys_state * css ,
struct cftype * cft )
2012-08-01 03:42:24 +04:00
{
2013-12-05 21:28:03 +04:00
int idx , name ;
2013-08-09 04:11:24 +04:00
struct hugetlb_cgroup * h_cg = hugetlb_cgroup_from_css ( css ) ;
2012-08-01 03:42:24 +04:00
idx = MEMFILE_IDX ( cft - > private ) ;
name = MEMFILE_ATTR ( cft - > private ) ;
2013-12-05 21:28:03 +04:00
return res_counter_read_u64 ( & h_cg - > hugepage [ idx ] , name ) ;
2012-08-01 03:42:24 +04:00
}
2013-08-09 04:11:24 +04:00
static int hugetlb_cgroup_write ( struct cgroup_subsys_state * css ,
struct cftype * cft , const char * buffer )
2012-08-01 03:42:24 +04:00
{
int idx , name , ret ;
unsigned long long val ;
2013-08-09 04:11:24 +04:00
struct hugetlb_cgroup * h_cg = hugetlb_cgroup_from_css ( css ) ;
2012-08-01 03:42:24 +04:00
idx = MEMFILE_IDX ( cft - > private ) ;
name = MEMFILE_ATTR ( cft - > private ) ;
switch ( name ) {
case RES_LIMIT :
if ( hugetlb_cgroup_is_root ( h_cg ) ) {
/* Can't set limit on root */
ret = - EINVAL ;
break ;
}
/* This function does all necessary parse...reuse it */
ret = res_counter_memparse_write_strategy ( buffer , & val ) ;
if ( ret )
break ;
ret = res_counter_set_limit ( & h_cg - > hugepage [ idx ] , val ) ;
break ;
default :
ret = - EINVAL ;
break ;
}
return ret ;
}
2013-08-09 04:11:24 +04:00
static int hugetlb_cgroup_reset ( struct cgroup_subsys_state * css ,
unsigned int event )
2012-08-01 03:42:24 +04:00
{
int idx , name , ret = 0 ;
2013-08-09 04:11:24 +04:00
struct hugetlb_cgroup * h_cg = hugetlb_cgroup_from_css ( css ) ;
2012-08-01 03:42:24 +04:00
idx = MEMFILE_IDX ( event ) ;
name = MEMFILE_ATTR ( event ) ;
switch ( name ) {
case RES_MAX_USAGE :
res_counter_reset_max ( & h_cg - > hugepage [ idx ] ) ;
break ;
case RES_FAILCNT :
res_counter_reset_failcnt ( & h_cg - > hugepage [ idx ] ) ;
break ;
default :
ret = - EINVAL ;
break ;
}
return ret ;
}
/*
 * Format a huge page size as a short label: whole GB when @hsize is at
 * least 1 GiB, whole MB when at least 1 MiB, otherwise KB.  The value is
 * shifted, so any remainder is truncated.
 *
 * @buf:   destination buffer
 * @size:  capacity of @buf in bytes
 * @hsize: size in bytes
 *
 * Returns @buf.
 */
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
	if (hsize >= (1UL << 30))
		snprintf(buf, size, "%luGB", hsize >> 30);
	else if (hsize >= (1UL << 20))
		snprintf(buf, size, "%luMB", hsize >> 20);
	else
		snprintf(buf, size, "%luKB", hsize >> 10);

	return buf;
}
2012-12-19 02:23:19 +04:00
/*
 * Build and register the control files for hstate @idx.
 *
 * Fills h->cgroup_files[0..3] with the limit, usage, max-usage and
 * failcnt files, each named "<size>.<suffix>" where <size> comes from
 * mem_fmt(), zeroes entry 4 as the terminator, and registers the array
 * with the hugetlb subsystem.  A registration failure only triggers a
 * WARN.
 */
static void __init __hugetlb_cgroup_file_init(int idx)
{
	char buf[32];
	struct cftype *cft;
	struct hstate *h = &hstates[idx];

	/* format the size */
	mem_fmt(buf, sizeof(buf), huge_page_size(h));

	/* Add the limit file: the only one that accepts writes */
	cft = &h->cgroup_files[0];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
	cft->read_u64 = hugetlb_cgroup_read_u64;
	cft->write_string = hugetlb_cgroup_write;

	/* Add the usage file */
	cft = &h->cgroup_files[1];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the MAX usage file; its trigger resets the maximum */
	cft = &h->cgroup_files[2];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
	cft->trigger = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* Add the failcnt file; its trigger resets the count */
	cft = &h->cgroup_files[3];
	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
	cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
	cft->trigger = hugetlb_cgroup_reset;
	cft->read_u64 = hugetlb_cgroup_read_u64;

	/* NULL terminate the last cft */
	cft = &h->cgroup_files[4];
	memset(cft, 0, sizeof(*cft));

	WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
}
/*
 * Create the cgroup control files for every hstate that is large enough
 * to be tracked by the controller.
 */
void __init hugetlb_cgroup_file_init(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/*
		 * Huge pages made of fewer than HUGETLB_CGROUP_MIN_ORDER
		 * orders get no control files: the cgroup details are
		 * stored in page[2].lru.next, so the huge page must consist
		 * of more than two normal pages.
		 */
		if (huge_page_order(h) < HUGETLB_CGROUP_MIN_ORDER)
			continue;

		__hugetlb_cgroup_file_init(hstate_index(h));
	}
}
2012-08-01 03:42:36 +04:00
/*
* hugetlb_lock will make sure a parallel cgroup rmdir won ' t happen
* when we migrate hugepages
*/
2012-08-01 03:42:27 +04:00
void hugetlb_cgroup_migrate ( struct page * oldhpage , struct page * newhpage )
{
struct hugetlb_cgroup * h_cg ;
2012-08-01 03:42:35 +04:00
struct hstate * h = page_hstate ( oldhpage ) ;
2012-08-01 03:42:27 +04:00
if ( hugetlb_cgroup_disabled ( ) )
return ;
2014-01-24 03:52:54 +04:00
VM_BUG_ON_PAGE ( ! PageHuge ( oldhpage ) , oldhpage ) ;
2012-08-01 03:42:27 +04:00
spin_lock ( & hugetlb_lock ) ;
h_cg = hugetlb_cgroup_from_page ( oldhpage ) ;
set_hugetlb_cgroup ( oldhpage , NULL ) ;
/* move the h_cg details to new cgroup */
set_hugetlb_cgroup ( newhpage , h_cg ) ;
2012-08-01 03:42:35 +04:00
list_move ( & newhpage - > lru , & h - > hugepage_activelist ) ;
2012-08-01 03:42:27 +04:00
spin_unlock ( & hugetlb_lock ) ;
return ;
}
2012-08-01 03:42:12 +04:00
struct cgroup_subsys hugetlb_subsys = {
. name = " hugetlb " ,
2012-11-19 20:13:38 +04:00
. css_alloc = hugetlb_cgroup_css_alloc ,
. css_offline = hugetlb_cgroup_css_offline ,
. css_free = hugetlb_cgroup_css_free ,
. subsys_id = hugetlb_subsys_id ,
2012-08-01 03:42:12 +04:00
} ;