2008-02-07 11:13:50 +03:00
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation , 2007
* Author Balbir Singh < balbir @ linux . vnet . ibm . com >
*
2008-02-07 11:13:51 +03:00
* Copyright 2007 OpenVZ SWsoft Inc
* Author : Pavel Emelianov < xemul @ openvz . org >
*
2008-02-07 11:13:50 +03:00
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 of the License , or
* ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*/
# include <linux/res_counter.h>
# include <linux/memcontrol.h>
# include <linux/cgroup.h>
2008-02-07 11:13:51 +03:00
# include <linux/mm.h>
2008-02-07 11:14:24 +03:00
# include <linux/smp.h>
2008-02-07 11:13:53 +03:00
# include <linux/page-flags.h>
2008-02-07 11:13:56 +03:00
# include <linux/backing-dev.h>
2008-02-07 11:13:53 +03:00
# include <linux/bit_spinlock.h>
# include <linux/rcupdate.h>
2008-02-07 11:13:56 +03:00
# include <linux/swap.h>
# include <linux/spinlock.h>
# include <linux/fs.h>
2008-02-07 11:14:25 +03:00
# include <linux/seq_file.h>
2008-02-07 11:13:50 +03:00
2008-02-07 11:13:59 +03:00
# include <asm/uaccess.h>
2008-02-07 11:13:50 +03:00
/* cgroup subsystem object for the memory controller (tentative definition). */
struct cgroup_subsys mem_cgroup_subsys ;
2008-02-07 11:13:56 +03:00
/*
 * Retry budget of the charge path: mem_cgroup_charge_common() seeds its
 * retry counter with this value and calls mem_cgroup_out_of_memory() once
 * the counter has run out.
 */
static const int MEM_CGROUP_RECLAIM_RETRIES = 5 ;
2008-02-07 11:13:50 +03:00
2008-02-07 11:14:24 +03:00
/*
 * Statistics for memory cgroup.
 * The enumerators index the count[] array of struct mem_cgroup_stat_cpu.
 */
enum mem_cgroup_stat_index {
/*
 * For MEM_CGROUP_TYPE_ALL, usage = pagecache + rss.
 * NOTE(review): the comment used to say MEM_CONTAINER_TYPE_ALL, a name that
 * does not appear in this file -- confirm MEM_CGROUP_TYPE_ALL is meant.
 */
MEM_CGROUP_STAT_CACHE , /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS , /* # of pages charged as rss */
MEM_CGROUP_STAT_NSTATS , /* number of counters; sizes the count[] array */
} ;
/*
 * Counter set of a single CPU slot, indexed by enum mem_cgroup_stat_index.
 * The type carries the ____cacheline_aligned_in_smp attribute.
 */
struct mem_cgroup_stat_cpu {
s64 count [ MEM_CGROUP_STAT_NSTATS ] ;
} ____cacheline_aligned_in_smp ;
/*
 * Statistics of one memory cgroup: NR_CPUS counter sets, one per CPU slot.
 * Writers update the slot of the CPU they run on
 * (__mem_cgroup_stat_add_safe()); mem_cgroup_read_stat() sums one counter
 * over all possible CPUs.
 */
struct mem_cgroup_stat {
struct mem_cgroup_stat_cpu cpustat [ NR_CPUS ] ;
} ;
/*
* For accounting under irq disable , no need for increment preempt count .
*/
static void __mem_cgroup_stat_add_safe ( struct mem_cgroup_stat * stat ,
enum mem_cgroup_stat_index idx , int val )
{
int cpu = smp_processor_id ( ) ;
stat - > cpustat [ cpu ] . count [ idx ] + = val ;
}
static s64 mem_cgroup_read_stat ( struct mem_cgroup_stat * stat ,
enum mem_cgroup_stat_index idx )
{
int cpu ;
s64 ret = 0 ;
for_each_possible_cpu ( cpu )
ret + = stat - > cpustat [ cpu ] . count [ idx ] ;
return ret ;
}
2008-02-07 11:14:31 +03:00
/*
 * per-zone information in memory controller.
 * The enumerators index mem_cgroup_per_zone.count[].
 */
enum mem_cgroup_zstat_index {
MEM_CGROUP_ZSTAT_ACTIVE , /* page_cgroups with PAGE_CGROUP_FLAG_ACTIVE set */
MEM_CGROUP_ZSTAT_INACTIVE , /* page_cgroups without that flag */
NR_MEM_CGROUP_ZSTAT , /* number of per-zone counters */
} ;
/* Counters kept per (node, zone) pair, indexed by enum mem_cgroup_zstat_index. */
struct mem_cgroup_per_zone {
unsigned long count [ NR_MEM_CGROUP_ZSTAT ] ;
} ;
/*
 * Macro for accessing counter @idx of the per-zone record @mz.
 * It expands to an lvalue, so it is used with += and -= as well as for reads.
 */
# define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
/* Per-node container: one per-zone record for each of MAX_NR_ZONES zones. */
struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo [ MAX_NR_ZONES ] ;
} ;
/*
 * Per-cgroup table of per-node records, indexed by node id. The entries are
 * pointers filled in by alloc_mem_cgroup_per_zone_info();
 * mem_cgroup_zoneinfo() BUGs on an entry that is still NULL.
 */
struct mem_cgroup_lru_info {
struct mem_cgroup_per_node * nodeinfo [ MAX_NUMNODES ] ;
} ;
2008-02-07 11:13:50 +03:00
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. May be even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark, this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
/* embedded cgroup state; container_of() on it recovers the mem_cgroup */
struct cgroup_subsys_state css ;
/*
 * the counter to account for memory usage
 */
struct res_counter res ;
2008-02-07 11:13:51 +03:00
/*
 * Per cgroup active and inactive list, similar to the
 * per zone LRU lists.
 * TODO: Consider making these lists per zone
 */
struct list_head active_list ;
struct list_head inactive_list ;
2008-02-07 11:14:31 +03:00
/* per-node / per-zone active and inactive counters */
struct mem_cgroup_lru_info info ;
2008-02-07 11:13:56 +03:00
/*
 * spin_lock to protect the per cgroup LRU
 */
spinlock_t lru_lock ;
2008-02-07 11:13:59 +03:00
unsigned long control_type ; /* control RSS or RSS+Pagecache */
2008-02-07 11:14:34 +03:00
int prev_priority ; /* for recording reclaim priority */
2008-02-07 11:14:24 +03:00
/*
 * statistics: per-cpu cache and rss page counters.
 */
struct mem_cgroup_stat stat ;
2008-02-07 11:13:50 +03:00
} ;
2008-02-07 11:13:53 +03:00
/*
 * We use the lowest bit of the page->page_cgroup word as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin), so that this bit is
 * never part of the stored page_cgroup pointer.
 */
/* bit number handed to the bit_spin_*() helpers */
# define PAGE_CGROUP_LOCK_BIT 0x0
/* mask form of the same bit, used to split pointer and lock state */
# define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
2008-02-07 11:13:50 +03:00
/*
 * A page_cgroup is associated with every page descriptor that has been
 * charged. The page_cgroup helps us identify information about the cgroup
 * the page was charged to.
 */
struct page_cgroup {
struct list_head lru ; /* per cgroup LRU list */
struct page * page ; /* the page this descriptor accounts for */
struct mem_cgroup * mem_cgroup ; /* the cgroup that was charged */
2008-02-07 11:13:53 +03:00
atomic_t ref_cnt ; /* Helpful when pages move b/w */
/* mapped and cached states */
2008-02-07 11:14:17 +03:00
/* bitmask of PAGE_CGROUP_FLAG_* values */
int flags ;
2008-02-07 11:13:50 +03:00
} ;
2008-02-07 11:14:17 +03:00
/* Bits stored in page_cgroup.flags. */
# define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
2008-02-07 11:14:23 +03:00
# define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
2008-02-07 11:13:50 +03:00
2008-02-07 11:14:30 +03:00
static inline int page_cgroup_nid ( struct page_cgroup * pc )
{
return page_to_nid ( pc - > page ) ;
}
static inline enum zone_type page_cgroup_zid ( struct page_cgroup * pc )
{
return page_zonenum ( pc - > page ) ;
}
2008-02-07 11:13:59 +03:00
/*
 * Values of mem_cgroup.control_type. mem_control_type_write() accepts only
 * values strictly between MEM_CGROUP_TYPE_UNSPEC and MEM_CGROUP_TYPE_MAX.
 */
enum {
MEM_CGROUP_TYPE_UNSPEC = 0 ,
MEM_CGROUP_TYPE_MAPPED ,
MEM_CGROUP_TYPE_CACHED ,
MEM_CGROUP_TYPE_ALL , /* default; the only type for which cache pages are charged */
MEM_CGROUP_TYPE_MAX ,
} ;
2008-02-07 11:14:17 +03:00
/*
 * Kind of charge requested from mem_cgroup_charge_common(); a CACHE charge
 * sets PAGE_CGROUP_FLAG_CACHE on the new page_cgroup.
 */
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0 ,
MEM_CGROUP_CHARGE_TYPE_MAPPED ,
} ;
2008-02-07 11:14:31 +03:00
2008-02-07 11:14:24 +03:00
/*
 * Account one page being charged (@charge true) or uncharged (@charge
 * false) in the statistics of @mem. PAGE_CGROUP_FLAG_CACHE in @flags picks
 * the cache counter; without it the rss counter is adjusted.
 *
 * Always modified under lru lock. Then, not necessary to preempt_disable().
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
					 bool charge)
{
	enum mem_cgroup_stat_index idx;
	int delta;

	VM_BUG_ON(!irqs_disabled());

	if (charge)
		delta = 1;
	else
		delta = -1;

	if (flags & PAGE_CGROUP_FLAG_CACHE)
		idx = MEM_CGROUP_STAT_CACHE;
	else
		idx = MEM_CGROUP_STAT_RSS;

	__mem_cgroup_stat_add_safe(&mem->stat, idx, delta);
}
static inline struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo ( struct mem_cgroup * mem , int nid , int zid )
{
BUG_ON ( ! mem - > info . nodeinfo [ nid ] ) ;
return & mem - > info . nodeinfo [ nid ] - > zoneinfo [ zid ] ;
}
static inline struct mem_cgroup_per_zone *
page_cgroup_zoneinfo ( struct page_cgroup * pc )
{
struct mem_cgroup * mem = pc - > mem_cgroup ;
int nid = page_cgroup_nid ( pc ) ;
int zid = page_cgroup_zid ( pc ) ;
2008-02-07 11:14:24 +03:00
2008-02-07 11:14:31 +03:00
return mem_cgroup_zoneinfo ( mem , nid , zid ) ;
}
static unsigned long mem_cgroup_get_all_zonestat ( struct mem_cgroup * mem ,
enum mem_cgroup_zstat_index idx )
{
int nid , zid ;
struct mem_cgroup_per_zone * mz ;
u64 total = 0 ;
for_each_online_node ( nid )
for ( zid = 0 ; zid < MAX_NR_ZONES ; zid + + ) {
mz = mem_cgroup_zoneinfo ( mem , nid , zid ) ;
total + = MEM_CGROUP_ZSTAT ( mz , idx ) ;
}
return total ;
2008-02-07 11:14:24 +03:00
}
2008-02-07 11:13:59 +03:00
/*
 * Statically allocated mem_cgroup. mem_cgroup_create() uses it for the
 * cgroup that has no parent and points init_mm.mem_cgroup at it.
 */
static struct mem_cgroup init_mem_cgroup ;
2008-02-07 11:13:50 +03:00
/* Map a cgroup to the mem_cgroup that embeds its memory-subsystem state. */
static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	struct cgroup_subsys_state *state;

	state = cgroup_subsys_state(cont, mem_cgroup_subsys_id);
	return container_of(state, struct mem_cgroup, css);
}
2008-02-07 11:13:51 +03:00
/* Map a task to the mem_cgroup that embeds its memory-subsystem state. */
static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	struct cgroup_subsys_state *state;

	state = task_subsys_state(p, mem_cgroup_subsys_id);
	return container_of(state, struct mem_cgroup, css);
}
void mm_init_cgroup ( struct mm_struct * mm , struct task_struct * p )
{
struct mem_cgroup * mem ;
mem = mem_cgroup_from_task ( p ) ;
css_get ( & mem - > css ) ;
mm - > mem_cgroup = mem ;
}
void mm_free_cgroup ( struct mm_struct * mm )
{
css_put ( & mm - > mem_cgroup - > css ) ;
}
2008-02-07 11:13:53 +03:00
static inline int page_cgroup_locked ( struct page * page )
{
return bit_spin_is_locked ( PAGE_CGROUP_LOCK_BIT ,
& page - > page_cgroup ) ;
}
2008-02-07 11:13:51 +03:00
void page_assign_page_cgroup ( struct page * page , struct page_cgroup * pc )
{
2008-02-07 11:13:53 +03:00
int locked ;
/*
* While resetting the page_cgroup we might not hold the
* page_cgroup lock . free_hot_cold_page ( ) is an example
* of such a scenario
*/
if ( pc )
VM_BUG_ON ( ! page_cgroup_locked ( page ) ) ;
locked = ( page - > page_cgroup & PAGE_CGROUP_LOCK ) ;
page - > page_cgroup = ( ( unsigned long ) pc | locked ) ;
2008-02-07 11:13:51 +03:00
}
struct page_cgroup * page_get_page_cgroup ( struct page * page )
{
2008-02-07 11:13:53 +03:00
return ( struct page_cgroup * )
( page - > page_cgroup & ~ PAGE_CGROUP_LOCK ) ;
}
2008-02-07 11:13:59 +03:00
/* Take the bit spin lock embedded in page->page_cgroup. */
static void __always_inline lock_page_cgroup(struct page *page)
{
	unsigned long *word = &page->page_cgroup;

	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, word);
	VM_BUG_ON(!page_cgroup_locked(page));
}
2008-02-07 11:13:59 +03:00
/* Release the bit spin lock embedded in page->page_cgroup. */
static void __always_inline unlock_page_cgroup(struct page *page)
{
	unsigned long *word = &page->page_cgroup;

	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, word);
}
2008-02-07 11:14:08 +03:00
/*
* Tie new page_cgroup to struct page under lock_page_cgroup ( )
* This can fail if the page has been tied to a page_cgroup .
* If success , returns 0.
*/
2008-02-07 11:14:24 +03:00
static int page_cgroup_assign_new_page_cgroup ( struct page * page ,
struct page_cgroup * pc )
2008-02-07 11:14:08 +03:00
{
int ret = 0 ;
lock_page_cgroup ( page ) ;
if ( ! page_get_page_cgroup ( page ) )
page_assign_page_cgroup ( page , pc ) ;
else /* A page is tied to other pc. */
ret = 1 ;
unlock_page_cgroup ( page ) ;
return ret ;
}
/*
* Clear page - > page_cgroup member under lock_page_cgroup ( ) .
* If given " pc " value is different from one page - > page_cgroup ,
* page - > cgroup is not cleared .
* Returns a value of page - > page_cgroup at lock taken .
* A can can detect failure of clearing by following
* clear_page_cgroup ( page , pc ) = = pc
*/
2008-02-07 11:14:24 +03:00
static struct page_cgroup * clear_page_cgroup ( struct page * page ,
struct page_cgroup * pc )
2008-02-07 11:14:08 +03:00
{
struct page_cgroup * ret ;
/* lock and clear */
lock_page_cgroup ( page ) ;
ret = page_get_page_cgroup ( page ) ;
if ( likely ( ret = = pc ) )
page_assign_page_cgroup ( page , NULL ) ;
unlock_page_cgroup ( page ) ;
return ret ;
}
2008-02-07 11:14:31 +03:00
static void __mem_cgroup_remove_list ( struct page_cgroup * pc )
{
int from = pc - > flags & PAGE_CGROUP_FLAG_ACTIVE ;
struct mem_cgroup_per_zone * mz = page_cgroup_zoneinfo ( pc ) ;
if ( from )
MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_ACTIVE ) - = 1 ;
else
MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_INACTIVE ) - = 1 ;
mem_cgroup_charge_statistics ( pc - > mem_cgroup , pc - > flags , false ) ;
list_del_init ( & pc - > lru ) ;
}
static void __mem_cgroup_add_list ( struct page_cgroup * pc )
{
int to = pc - > flags & PAGE_CGROUP_FLAG_ACTIVE ;
struct mem_cgroup_per_zone * mz = page_cgroup_zoneinfo ( pc ) ;
if ( ! to ) {
MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_INACTIVE ) + = 1 ;
list_add ( & pc - > lru , & pc - > mem_cgroup - > inactive_list ) ;
} else {
MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_ACTIVE ) + = 1 ;
list_add ( & pc - > lru , & pc - > mem_cgroup - > active_list ) ;
}
mem_cgroup_charge_statistics ( pc - > mem_cgroup , pc - > flags , true ) ;
}
2008-02-07 11:13:59 +03:00
/*
 * Move @pc to the active (@active true) or inactive (@active false) list of
 * its cgroup. The per-zone counter of the list it is currently flagged for
 * is decremented, the counter of the target list incremented, and
 * PAGE_CGROUP_FLAG_ACTIVE is updated to match.
 */
static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
	enum mem_cgroup_zstat_index old_idx, new_idx;
	struct list_head *target;

	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
		old_idx = MEM_CGROUP_ZSTAT_ACTIVE;
	else
		old_idx = MEM_CGROUP_ZSTAT_INACTIVE;
	MEM_CGROUP_ZSTAT(mz, old_idx) -= 1;

	if (active) {
		new_idx = MEM_CGROUP_ZSTAT_ACTIVE;
		target = &pc->mem_cgroup->active_list;
	} else {
		new_idx = MEM_CGROUP_ZSTAT_INACTIVE;
		target = &pc->mem_cgroup->inactive_list;
	}
	MEM_CGROUP_ZSTAT(mz, new_idx) += 1;

	if (active)
		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
	else
		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;

	list_move(&pc->lru, target);
}
2008-02-07 11:14:06 +03:00
int task_in_mem_cgroup ( struct task_struct * task , const struct mem_cgroup * mem )
{
int ret ;
task_lock ( task ) ;
ret = task - > mm & & mm_cgroup ( task - > mm ) = = mem ;
task_unlock ( task ) ;
return ret ;
}
2008-02-07 11:13:56 +03:00
/*
 * Move @pc between the active and inactive list of its cgroup, taking the
 * cgroup's lru_lock around the move. A NULL @pc is ignored.
 *
 * This routine assumes that the appropriate zone's lru lock is already held
 */
void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	spinlock_t *lock;

	if (!pc)
		return;

	lock = &pc->mem_cgroup->lru_lock;
	spin_lock(lock);
	__mem_cgroup_move_lists(pc, active);
	spin_unlock(lock);
}
2008-02-07 11:14:32 +03:00
/*
* Calculate mapped_ratio under memory controller . This will be used in
* vmscan . c for deteremining we have to reclaim mapped pages .
*/
int mem_cgroup_calc_mapped_ratio ( struct mem_cgroup * mem )
{
long total , rss ;
/*
* usage is recorded in bytes . But , here , we assume the number of
* physical pages can be represented by " long " on any arch .
*/
total = ( long ) ( mem - > res . usage > > PAGE_SHIFT ) + 1L ;
rss = ( long ) mem_cgroup_read_stat ( & mem - > stat , MEM_CGROUP_STAT_RSS ) ;
return ( int ) ( ( rss * 100L ) / total ) ;
}
2008-02-07 11:14:33 +03:00
/*
 * This function is called from vmscan.c. In the page reclaiming loop the
 * balance between active and inactive list is calculated. For memory
 * controller page reclaiming, we should use the mem_cgroup's imbalance
 * rather than the zone's global lru imbalance.
 *
 * Returns active / (inactive + 1), both taken from the per-zone counters.
 */
long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
{
	unsigned long nr_active, nr_inactive;

	/* both values are page counts, so 'long' is wide enough */
	nr_active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
	nr_inactive = mem_cgroup_get_all_zonestat(mem,
						  MEM_CGROUP_ZSTAT_INACTIVE);

	return (long)(nr_active / (nr_inactive + 1));
}
2008-02-07 11:14:32 +03:00
2008-02-07 11:14:34 +03:00
/*
* prev_priority control . . . this will be used in memory reclaim path .
*/
int mem_cgroup_get_reclaim_priority ( struct mem_cgroup * mem )
{
return mem - > prev_priority ;
}
void mem_cgroup_note_reclaim_priority ( struct mem_cgroup * mem , int priority )
{
if ( priority < mem - > prev_priority )
mem - > prev_priority = priority ;
}
void mem_cgroup_record_reclaim_priority ( struct mem_cgroup * mem , int priority )
{
mem - > prev_priority = priority ;
}
2008-02-07 11:13:56 +03:00
unsigned long mem_cgroup_isolate_pages ( unsigned long nr_to_scan ,
struct list_head * dst ,
unsigned long * scanned , int order ,
int mode , struct zone * z ,
struct mem_cgroup * mem_cont ,
int active )
{
unsigned long nr_taken = 0 ;
struct page * page ;
unsigned long scan ;
LIST_HEAD ( pc_list ) ;
struct list_head * src ;
2008-02-07 11:14:11 +03:00
struct page_cgroup * pc , * tmp ;
2008-02-07 11:13:56 +03:00
if ( active )
src = & mem_cont - > active_list ;
else
src = & mem_cont - > inactive_list ;
spin_lock ( & mem_cont - > lru_lock ) ;
2008-02-07 11:14:11 +03:00
scan = 0 ;
list_for_each_entry_safe_reverse ( pc , tmp , src , lru ) {
2008-02-07 11:14:12 +03:00
if ( scan > = nr_to_scan )
2008-02-07 11:14:11 +03:00
break ;
2008-02-07 11:13:56 +03:00
page = pc - > page ;
VM_BUG_ON ( ! pc ) ;
2008-02-07 11:14:12 +03:00
if ( unlikely ( ! PageLRU ( page ) ) )
2008-02-07 11:14:11 +03:00
continue ;
2008-02-07 11:13:56 +03:00
if ( PageActive ( page ) & & ! active ) {
__mem_cgroup_move_lists ( pc , true ) ;
continue ;
}
if ( ! PageActive ( page ) & & active ) {
__mem_cgroup_move_lists ( pc , false ) ;
continue ;
}
/*
* Reclaim , per zone
* TODO : make the active / inactive lists per zone
*/
if ( page_zone ( page ) ! = z )
continue ;
2008-02-07 11:14:12 +03:00
scan + + ;
list_move ( & pc - > lru , & pc_list ) ;
2008-02-07 11:13:56 +03:00
if ( __isolate_lru_page ( page , mode ) = = 0 ) {
list_move ( & page - > lru , dst ) ;
nr_taken + + ;
}
}
list_splice ( & pc_list , src ) ;
spin_unlock ( & mem_cont - > lru_lock ) ;
* scanned = scan ;
return nr_taken ;
}
2008-02-07 11:13:53 +03:00
/*
* Charge the memory controller for page usage .
* Return
* 0 if the charge was successful
* < 0 if the cgroup is over its limit
*/
2008-02-07 11:14:17 +03:00
static int mem_cgroup_charge_common ( struct page * page , struct mm_struct * mm ,
gfp_t gfp_mask , enum charge_type ctype )
2008-02-07 11:13:53 +03:00
{
struct mem_cgroup * mem ;
2008-02-07 11:14:08 +03:00
struct page_cgroup * pc ;
2008-02-07 11:13:56 +03:00
unsigned long flags ;
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES ;
2008-02-07 11:13:53 +03:00
/*
* Should page_cgroup ' s go to their own slab ?
* One could optimize the performance of the charging routine
* by saving a bit in the page_flags and using it as a lock
* to see if the cgroup page already has a page_cgroup associated
* with it
*/
2008-02-07 11:13:56 +03:00
retry :
2008-02-07 11:14:22 +03:00
if ( page ) {
lock_page_cgroup ( page ) ;
pc = page_get_page_cgroup ( page ) ;
/*
* The page_cgroup exists and
* the page has already been accounted .
*/
if ( pc ) {
if ( unlikely ( ! atomic_inc_not_zero ( & pc - > ref_cnt ) ) ) {
/* this page is under being uncharged ? */
unlock_page_cgroup ( page ) ;
cpu_relax ( ) ;
goto retry ;
} else {
unlock_page_cgroup ( page ) ;
goto done ;
}
2008-02-07 11:14:08 +03:00
}
2008-02-07 11:14:22 +03:00
unlock_page_cgroup ( page ) ;
2008-02-07 11:13:53 +03:00
}
2008-02-07 11:14:02 +03:00
pc = kzalloc ( sizeof ( struct page_cgroup ) , gfp_mask ) ;
2008-02-07 11:13:53 +03:00
if ( pc = = NULL )
goto err ;
/*
2008-02-07 11:14:19 +03:00
* We always charge the cgroup the mm_struct belongs to .
* The mm_struct ' s mem_cgroup changes on task migration if the
2008-02-07 11:13:53 +03:00
* thread group leader migrates . It ' s possible that mm is not
* set , if so charge the init_mm ( happens for pagecache usage ) .
*/
if ( ! mm )
mm = & init_mm ;
2008-02-07 11:14:19 +03:00
rcu_read_lock ( ) ;
2008-02-07 11:13:53 +03:00
mem = rcu_dereference ( mm - > mem_cgroup ) ;
/*
* For every charge from the cgroup , increment reference
* count
*/
css_get ( & mem - > css ) ;
rcu_read_unlock ( ) ;
/*
* If we created the page_cgroup , we should free it on exceeding
* the cgroup limit .
*/
2008-02-07 11:13:57 +03:00
while ( res_counter_charge ( & mem - > res , PAGE_SIZE ) ) {
2008-02-07 11:14:19 +03:00
if ( ! ( gfp_mask & __GFP_WAIT ) )
goto out ;
2008-02-07 11:14:02 +03:00
if ( try_to_free_mem_cgroup_pages ( mem , gfp_mask ) )
2008-02-07 11:13:56 +03:00
continue ;
/*
* try_to_free_mem_cgroup_pages ( ) might not give us a full
* picture of reclaim . Some pages are reclaimed and might be
* moved to swap cache or just unmapped from the cgroup .
* Check the limit again to see if the reclaim reduced the
* current usage of the cgroup before giving up
*/
if ( res_counter_check_under_limit ( & mem - > res ) )
continue ;
2008-02-07 11:14:19 +03:00
if ( ! nr_retries - - ) {
mem_cgroup_out_of_memory ( mem , gfp_mask ) ;
goto out ;
2008-02-07 11:13:56 +03:00
}
2008-02-07 11:14:19 +03:00
congestion_wait ( WRITE , HZ / 10 ) ;
2008-02-07 11:13:53 +03:00
}
atomic_set ( & pc - > ref_cnt , 1 ) ;
pc - > mem_cgroup = mem ;
pc - > page = page ;
2008-02-07 11:14:23 +03:00
pc - > flags = PAGE_CGROUP_FLAG_ACTIVE ;
2008-02-07 11:14:17 +03:00
if ( ctype = = MEM_CGROUP_CHARGE_TYPE_CACHE )
pc - > flags | = PAGE_CGROUP_FLAG_CACHE ;
2008-02-07 11:14:19 +03:00
2008-02-07 11:14:22 +03:00
if ( ! page | | page_cgroup_assign_new_page_cgroup ( page , pc ) ) {
2008-02-07 11:14:08 +03:00
/*
2008-02-07 11:14:19 +03:00
* Another charge has been added to this page already .
* We take lock_page_cgroup ( page ) again and read
2008-02-07 11:14:08 +03:00
* page - > cgroup , increment refcnt . . . . just retry is OK .
*/
res_counter_uncharge ( & mem - > res , PAGE_SIZE ) ;
css_put ( & mem - > css ) ;
kfree ( pc ) ;
2008-02-07 11:14:22 +03:00
if ( ! page )
goto done ;
2008-02-07 11:14:08 +03:00
goto retry ;
}
2008-02-07 11:13:53 +03:00
2008-02-07 11:13:56 +03:00
spin_lock_irqsave ( & mem - > lru_lock , flags ) ;
2008-02-07 11:14:24 +03:00
/* Update statistics vector */
2008-02-07 11:14:31 +03:00
__mem_cgroup_add_list ( pc ) ;
2008-02-07 11:13:56 +03:00
spin_unlock_irqrestore ( & mem - > lru_lock , flags ) ;
2008-02-07 11:13:53 +03:00
done :
return 0 ;
2008-02-07 11:14:19 +03:00
out :
css_put ( & mem - > css ) ;
2008-02-07 11:13:53 +03:00
kfree ( pc ) ;
err :
return - ENOMEM ;
}
2008-02-07 11:14:17 +03:00
/*
 * Charge @page to the cgroup of @mm as a mapped page.
 * Returns whatever mem_cgroup_charge_common() returns.
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
		      gfp_t gfp_mask)
{
	const enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;

	return mem_cgroup_charge_common(page, mm, gfp_mask, ctype);
}
2008-02-07 11:13:59 +03:00
/*
* See if the cached pages should be charged at all ?
*/
2008-02-07 11:14:02 +03:00
int mem_cgroup_cache_charge ( struct page * page , struct mm_struct * mm ,
gfp_t gfp_mask )
2008-02-07 11:13:59 +03:00
{
2008-02-07 11:14:18 +03:00
int ret = 0 ;
2008-02-07 11:13:59 +03:00
struct mem_cgroup * mem ;
if ( ! mm )
mm = & init_mm ;
2008-02-07 11:14:18 +03:00
rcu_read_lock ( ) ;
2008-02-07 11:13:59 +03:00
mem = rcu_dereference ( mm - > mem_cgroup ) ;
2008-02-07 11:14:18 +03:00
css_get ( & mem - > css ) ;
rcu_read_unlock ( ) ;
2008-02-07 11:13:59 +03:00
if ( mem - > control_type = = MEM_CGROUP_TYPE_ALL )
2008-02-07 11:14:18 +03:00
ret = mem_cgroup_charge_common ( page , mm , gfp_mask ,
2008-02-07 11:14:17 +03:00
MEM_CGROUP_CHARGE_TYPE_CACHE ) ;
2008-02-07 11:14:18 +03:00
css_put ( & mem - > css ) ;
return ret ;
2008-02-07 11:13:59 +03:00
}
2008-02-07 11:13:53 +03:00
/*
* Uncharging is always a welcome operation , we never complain , simply
* uncharge .
*/
void mem_cgroup_uncharge ( struct page_cgroup * pc )
{
struct mem_cgroup * mem ;
struct page * page ;
2008-02-07 11:13:56 +03:00
unsigned long flags ;
2008-02-07 11:13:53 +03:00
2008-02-07 11:13:59 +03:00
/*
* This can handle cases when a page is not charged at all and we
* are switching between handling the control_type .
*/
2008-02-07 11:13:53 +03:00
if ( ! pc )
return ;
if ( atomic_dec_and_test ( & pc - > ref_cnt ) ) {
page = pc - > page ;
2008-02-07 11:14:08 +03:00
/*
* get page - > cgroup and clear it under lock .
2008-02-07 11:14:16 +03:00
* force_empty can drop page - > cgroup without checking refcnt .
2008-02-07 11:14:08 +03:00
*/
if ( clear_page_cgroup ( page , pc ) = = pc ) {
mem = pc - > mem_cgroup ;
css_put ( & mem - > css ) ;
res_counter_uncharge ( & mem - > res , PAGE_SIZE ) ;
spin_lock_irqsave ( & mem - > lru_lock , flags ) ;
2008-02-07 11:14:31 +03:00
__mem_cgroup_remove_list ( pc ) ;
2008-02-07 11:14:08 +03:00
spin_unlock_irqrestore ( & mem - > lru_lock , flags ) ;
kfree ( pc ) ;
}
2008-02-07 11:13:53 +03:00
}
2008-02-07 11:13:51 +03:00
}
2008-02-07 11:14:31 +03:00
2008-02-07 11:14:10 +03:00
/*
* Returns non - zero if a page ( under migration ) has valid page_cgroup member .
* Refcnt of page_cgroup is incremented .
*/
int mem_cgroup_prepare_migration ( struct page * page )
{
struct page_cgroup * pc ;
int ret = 0 ;
lock_page_cgroup ( page ) ;
pc = page_get_page_cgroup ( page ) ;
if ( pc & & atomic_inc_not_zero ( & pc - > ref_cnt ) )
ret = 1 ;
unlock_page_cgroup ( page ) ;
return ret ;
}
/*
 * Uncharge whatever page_cgroup is attached to @page (none is fine).
 * NOTE(review): presumably this releases the reference taken by
 * mem_cgroup_prepare_migration() -- confirm against the migration caller.
 */
void mem_cgroup_end_migration(struct page *page)
{
	mem_cgroup_uncharge(page_get_page_cgroup(page));
}
/*
* We know both * page * and * newpage * are now not - on - LRU and Pg_locked .
* And no race with uncharge ( ) routines because page_cgroup for * page *
* has extra one reference by mem_cgroup_prepare_migration .
*/
void mem_cgroup_page_migration ( struct page * page , struct page * newpage )
{
struct page_cgroup * pc ;
2008-02-07 11:14:31 +03:00
struct mem_cgroup * mem ;
unsigned long flags ;
2008-02-07 11:14:10 +03:00
retry :
pc = page_get_page_cgroup ( page ) ;
if ( ! pc )
return ;
2008-02-07 11:14:31 +03:00
mem = pc - > mem_cgroup ;
2008-02-07 11:14:10 +03:00
if ( clear_page_cgroup ( page , pc ) ! = pc )
goto retry ;
2008-02-07 11:14:31 +03:00
spin_lock_irqsave ( & mem - > lru_lock , flags ) ;
__mem_cgroup_remove_list ( pc ) ;
2008-02-07 11:14:10 +03:00
pc - > page = newpage ;
lock_page_cgroup ( newpage ) ;
page_assign_page_cgroup ( newpage , pc ) ;
unlock_page_cgroup ( newpage ) ;
2008-02-07 11:14:31 +03:00
__mem_cgroup_add_list ( pc ) ;
spin_unlock_irqrestore ( & mem - > lru_lock , flags ) ;
2008-02-07 11:14:10 +03:00
return ;
}
2008-02-07 11:13:51 +03:00
2008-02-07 11:14:16 +03:00
/*
* This routine traverse page_cgroup in given list and drop them all .
* This routine ignores page_cgroup - > ref_cnt .
* * And * this routine doesn ' t reclaim page itself , just removes page_cgroup .
*/
# define FORCE_UNCHARGE_BATCH (128)
static void
mem_cgroup_force_empty_list ( struct mem_cgroup * mem , struct list_head * list )
{
struct page_cgroup * pc ;
struct page * page ;
int count ;
unsigned long flags ;
retry :
count = FORCE_UNCHARGE_BATCH ;
spin_lock_irqsave ( & mem - > lru_lock , flags ) ;
while ( - - count & & ! list_empty ( list ) ) {
pc = list_entry ( list - > prev , struct page_cgroup , lru ) ;
page = pc - > page ;
/* Avoid race with charge */
atomic_set ( & pc - > ref_cnt , 0 ) ;
if ( clear_page_cgroup ( page , pc ) = = pc ) {
css_put ( & mem - > css ) ;
res_counter_uncharge ( & mem - > res , PAGE_SIZE ) ;
2008-02-07 11:14:31 +03:00
__mem_cgroup_remove_list ( pc ) ;
2008-02-07 11:14:16 +03:00
kfree ( pc ) ;
} else /* being uncharged ? ...do relax */
break ;
}
spin_unlock_irqrestore ( & mem - > lru_lock , flags ) ;
if ( ! list_empty ( list ) ) {
cond_resched ( ) ;
goto retry ;
}
return ;
}
/*
* make mem_cgroup ' s charge to be 0 if there is no task .
* This enables deleting this mem_cgroup .
*/
int mem_cgroup_force_empty ( struct mem_cgroup * mem )
{
int ret = - EBUSY ;
css_get ( & mem - > css ) ;
/*
* page reclaim code ( kswapd etc . . ) will move pages between
` * active_list < - > inactive_list while we don ' t take a lock .
* So , we have to do loop here until all lists are empty .
*/
while ( ! ( list_empty ( & mem - > active_list ) & &
list_empty ( & mem - > inactive_list ) ) ) {
if ( atomic_read ( & mem - > css . cgroup - > count ) > 0 )
goto out ;
/* drop all page_cgroup in active_list */
mem_cgroup_force_empty_list ( mem , & mem - > active_list ) ;
/* drop all page_cgroup in inactive_list */
mem_cgroup_force_empty_list ( mem , & mem - > inactive_list ) ;
}
ret = 0 ;
out :
css_put ( & mem - > css ) ;
return ret ;
}
2008-02-07 11:13:57 +03:00
/*
 * Write strategy handed to res_counter_write() by mem_cgroup_write().
 *
 * Parses @buf with memparse() and stores the result, rounded up to a whole
 * number of pages, in *@tmp. Returns 0 on success and -EINVAL when
 * memparse() did not stop at the terminating NUL, i.e. when characters are
 * left over after the number.
 *
 * NOTE(review): the round-up adds PAGE_SIZE - 1 without an overflow check,
 * so a value within PAGE_SIZE - 1 of the maximum wraps around -- confirm
 * whether such input can reach this function.
 */
int mem_cgroup_write_strategy ( char * buf , unsigned long long * tmp )
{
/* memparse() advances buf to the first character it did not consume */
* tmp = memparse ( buf , & buf ) ;
if ( * buf ! = ' \0 ' )
return - EINVAL ;
/*
 * Round up the value to the closest page size
 */
* tmp = ( ( * tmp + PAGE_SIZE - 1 ) > > PAGE_SHIFT ) < < PAGE_SHIFT ;
return 0 ;
}
/*
 * cgroup file read handler for the res_counter backed files. cft->private
 * selects which member of the cgroup's res_counter is reported; the actual
 * work is done by res_counter_read() with no read strategy (NULL).
 */
static ssize_t mem_cgroup_read(struct cgroup *cont,
			       struct cftype *cft, struct file *file,
			       char __user *userbuf, size_t nbytes,
			       loff_t *ppos)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	return res_counter_read(&mem->res, cft->private, userbuf, nbytes,
				ppos, NULL);
}
/*
 * cgroup file write handler for the res_counter backed files. cft->private
 * selects which member of the cgroup's res_counter is written;
 * mem_cgroup_write_strategy() is passed along to parse the user's input.
 */
static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	return res_counter_write(&mem->res, cft->private, userbuf, nbytes,
				 ppos, mem_cgroup_write_strategy);
}
2008-02-07 11:13:59 +03:00
/*
 * Write handler of the control_type file.
 *
 * Copies @nbytes from @userbuf into a NUL-terminated kernel buffer, parses
 * it as a decimal number and stores it in mem->control_type.
 *
 * Returns @nbytes on success, -ENOMEM if the buffer cannot be allocated,
 * -EFAULT if the copy from user space fails, and -EINVAL if characters
 * follow the number or the value is not strictly between
 * MEM_CGROUP_TYPE_UNSPEC and MEM_CGROUP_TYPE_MAX.
 */
static ssize_t mem_control_type_write ( struct cgroup * cont ,
struct cftype * cft , struct file * file ,
const char __user * userbuf ,
size_t nbytes , loff_t * pos )
{
int ret ;
char * buf , * end ;
unsigned long tmp ;
struct mem_cgroup * mem ;
mem = mem_cgroup_from_cont ( cont ) ;
/* one extra byte for the terminator written below */
buf = kmalloc ( nbytes + 1 , GFP_KERNEL ) ;
ret = - ENOMEM ;
if ( buf = = NULL )
goto out ;
buf [ nbytes ] = 0 ;
ret = - EFAULT ;
if ( copy_from_user ( buf , userbuf , nbytes ) )
goto out_free ;
/* every failure from here on is reported as invalid input */
ret = - EINVAL ;
tmp = simple_strtoul ( buf , & end , 10 ) ;
if ( * end ! = ' \0 ' )
goto out_free ;
if ( tmp < = MEM_CGROUP_TYPE_UNSPEC | | tmp > = MEM_CGROUP_TYPE_MAX )
goto out_free ;
mem - > control_type = tmp ;
ret = nbytes ;
out_free :
kfree ( buf ) ;
out :
return ret ;
}
/*
 * Read handler of the control_type file: formats mem->control_type as an
 * unsigned decimal number into a 64-byte stack buffer and hands that
 * buffer to simple_read_from_buffer().
 */
static ssize_t mem_control_type_read ( struct cgroup * cont ,
struct cftype * cft ,
struct file * file , char __user * userbuf ,
size_t nbytes , loff_t * ppos )
{
unsigned long val ;
char buf [ 64 ] , * s ;
struct mem_cgroup * mem ;
mem = mem_cgroup_from_cont ( cont ) ;
s = buf ;
val = mem - > control_type ;
/* s ends up one past the last formatted character */
s + = sprintf ( s , " %lu \n " , val ) ;
return simple_read_from_buffer ( ( void __user * ) userbuf , nbytes ,
ppos , buf , s - buf ) ;
}
2008-02-07 11:14:16 +03:00
/*
 * Write handler of the force_empty file. The written data is not looked
 * at: any write triggers mem_cgroup_force_empty() on the cgroup.
 * Returns @nbytes on success or the error from mem_cgroup_force_empty().
 */
static ssize_t mem_force_empty_write(struct cgroup *cont,
				     struct cftype *cft, struct file *file,
				     const char __user *userbuf,
				     size_t nbytes, loff_t *ppos)
{
	int ret = mem_cgroup_force_empty(mem_cgroup_from_cont(cont));

	if (ret == 0)
		ret = nbytes;

	return ret;
}
/*
 * Read handler of the force_empty file: reading is not supported and
 * always fails with -EINVAL.
 *
 * Note: This should be removed if cgroup supports write-only files.
 */
static ssize_t mem_force_empty_read(struct cgroup *cont,
				    struct cftype *cft,
				    struct file *file, char __user *userbuf,
				    size_t nbytes, loff_t *ppos)
{
	return -EINVAL;
}
2008-02-07 11:14:25 +03:00
/*
 * Description of each statistics counter, indexed by
 * enum mem_cgroup_stat_index and used by mem_control_stat_show().
 */
static const struct mem_cgroup_stat_desc {
const char * msg ; /* label printed in front of the value */
u64 unit ; /* factor the raw counter is multiplied by before printing */
} mem_cgroup_stat_desc [ ] = {
[ MEM_CGROUP_STAT_CACHE ] = { " cache " , PAGE_SIZE , } ,
[ MEM_CGROUP_STAT_RSS ] = { " rss " , PAGE_SIZE , } ,
} ;
/*
 * seq_file show routine of the stat file. m->private is the cgroup that
 * was passed to single_open().
 *
 * Prints one line per statistics counter (label, then the counter value
 * multiplied by its unit), followed by an "active" and an "inactive" line
 * built from the summed per-zone counters. Always returns 0.
 */
static int mem_control_stat_show ( struct seq_file * m , void * arg )
{
struct cgroup * cont = m - > private ;
struct mem_cgroup * mem_cont = mem_cgroup_from_cont ( cont ) ;
struct mem_cgroup_stat * stat = & mem_cont - > stat ;
int i ;
/* one line per counter in enum mem_cgroup_stat_index */
for ( i = 0 ; i < ARRAY_SIZE ( stat - > cpustat [ 0 ] . count ) ; i + + ) {
s64 val ;
val = mem_cgroup_read_stat ( stat , i ) ;
val * = mem_cgroup_stat_desc [ i ] . unit ;
seq_printf ( m , " %s %lld \n " , mem_cgroup_stat_desc [ i ] . msg ,
( long long ) val ) ;
}
2008-02-07 11:14:31 +03:00
/*
 * showing the size of the active and inactive lists: page counts
 * multiplied by PAGE_SIZE.
 * NOTE(review): the values are unsigned long but printed with a signed
 * conversion -- confirm whether an unsigned conversion is wanted.
 */
{
unsigned long active , inactive ;
inactive = mem_cgroup_get_all_zonestat ( mem_cont ,
MEM_CGROUP_ZSTAT_INACTIVE ) ;
active = mem_cgroup_get_all_zonestat ( mem_cont ,
MEM_CGROUP_ZSTAT_ACTIVE ) ;
seq_printf ( m , " active %ld \n " , ( active ) * PAGE_SIZE ) ;
seq_printf ( m , " inactive %ld \n " , ( inactive ) * PAGE_SIZE ) ;
}
2008-02-07 11:14:25 +03:00
return 0 ;
}
/*
 * File operations installed on the stat file by mem_control_stat_open():
 * the generic seq_file read/llseek helpers plus single_release() to match
 * the single_open() done there.
 */
static const struct file_operations mem_control_stat_file_operations = {
. read = seq_read ,
. llseek = seq_lseek ,
. release = single_release ,
} ;
static int mem_control_stat_open ( struct inode * unused , struct file * file )
{
/* XXX __d_cont */
struct cgroup * cont = file - > f_dentry - > d_parent - > d_fsdata ;
file - > f_op = & mem_control_stat_file_operations ;
return single_open ( file , mem_control_stat_show , cont ) ;
}
2008-02-07 11:13:50 +03:00
static struct cftype mem_cgroup_files [ ] = {
{
2008-02-07 11:13:57 +03:00
. name = " usage_in_bytes " ,
2008-02-07 11:13:50 +03:00
. private = RES_USAGE ,
. read = mem_cgroup_read ,
} ,
{
2008-02-07 11:13:57 +03:00
. name = " limit_in_bytes " ,
2008-02-07 11:13:50 +03:00
. private = RES_LIMIT ,
. write = mem_cgroup_write ,
. read = mem_cgroup_read ,
} ,
{
. name = " failcnt " ,
. private = RES_FAILCNT ,
. read = mem_cgroup_read ,
} ,
2008-02-07 11:13:59 +03:00
{
. name = " control_type " ,
. write = mem_control_type_write ,
. read = mem_control_type_read ,
} ,
2008-02-07 11:14:16 +03:00
{
. name = " force_empty " ,
. write = mem_force_empty_write ,
. read = mem_force_empty_read ,
} ,
2008-02-07 11:14:25 +03:00
{
. name = " stat " ,
. open = mem_control_stat_open ,
} ,
2008-02-07 11:13:50 +03:00
} ;
2008-02-07 11:14:31 +03:00
static int alloc_mem_cgroup_per_zone_info ( struct mem_cgroup * mem , int node )
{
struct mem_cgroup_per_node * pn ;
pn = kmalloc_node ( sizeof ( * pn ) , GFP_KERNEL , node ) ;
if ( ! pn )
return 1 ;
mem - > info . nodeinfo [ node ] = pn ;
memset ( pn , 0 , sizeof ( * pn ) ) ;
return 0 ;
}
2008-02-07 11:13:51 +03:00
static struct mem_cgroup init_mem_cgroup ;
2008-02-07 11:13:50 +03:00
static struct cgroup_subsys_state *
mem_cgroup_create ( struct cgroup_subsys * ss , struct cgroup * cont )
{
struct mem_cgroup * mem ;
2008-02-07 11:14:31 +03:00
int node ;
2008-02-07 11:13:50 +03:00
2008-02-07 11:13:51 +03:00
if ( unlikely ( ( cont - > parent ) = = NULL ) ) {
mem = & init_mem_cgroup ;
init_mm . mem_cgroup = mem ;
} else
mem = kzalloc ( sizeof ( struct mem_cgroup ) , GFP_KERNEL ) ;
if ( mem = = NULL )
return NULL ;
2008-02-07 11:13:50 +03:00
res_counter_init ( & mem - > res ) ;
2008-02-07 11:13:53 +03:00
INIT_LIST_HEAD ( & mem - > active_list ) ;
INIT_LIST_HEAD ( & mem - > inactive_list ) ;
2008-02-07 11:13:56 +03:00
spin_lock_init ( & mem - > lru_lock ) ;
2008-02-07 11:13:59 +03:00
mem - > control_type = MEM_CGROUP_TYPE_ALL ;
2008-02-07 11:14:31 +03:00
memset ( & mem - > info , 0 , sizeof ( mem - > info ) ) ;
for_each_node_state ( node , N_POSSIBLE )
if ( alloc_mem_cgroup_per_zone_info ( mem , node ) )
goto free_out ;
2008-02-07 11:13:50 +03:00
return & mem - > css ;
2008-02-07 11:14:31 +03:00
free_out :
for_each_node_state ( node , N_POSSIBLE )
kfree ( mem - > info . nodeinfo [ node ] ) ;
if ( cont - > parent ! = NULL )
kfree ( mem ) ;
return NULL ;
2008-02-07 11:13:50 +03:00
}
2008-02-07 11:14:28 +03:00
/*
 * pre_destroy callback: run mem_cgroup_force_empty() on the memory cgroup
 * of @cont.  The result of the force-empty attempt is ignored.
 */
static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
}
2008-02-07 11:13:50 +03:00
/*
 * destroy callback: free the per-node information of every possible node
 * and then the mem_cgroup that belongs to @cont.
 */
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	int nid;

	for_each_node_state(nid, N_POSSIBLE)
		kfree(mem->info.nodeinfo[nid]);

	kfree(mem);
}
/*
 * populate callback: register every entry of mem_cgroup_files[] for @cont.
 * Returns the result of cgroup_add_files().
 */
static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	int nr_files = ARRAY_SIZE(mem_cgroup_files);

	return cgroup_add_files(cont, ss, mem_cgroup_files, nr_files);
}
2008-02-07 11:13:54 +03:00
static void mem_cgroup_move_task ( struct cgroup_subsys * ss ,
struct cgroup * cont ,
struct cgroup * old_cont ,
struct task_struct * p )
{
struct mm_struct * mm ;
struct mem_cgroup * mem , * old_mem ;
mm = get_task_mm ( p ) ;
if ( mm = = NULL )
return ;
mem = mem_cgroup_from_cont ( cont ) ;
old_mem = mem_cgroup_from_cont ( old_cont ) ;
if ( mem = = old_mem )
goto out ;
/*
* Only thread group leaders are allowed to migrate , the mm_struct is
* in effect owned by the leader
*/
if ( p - > tgid ! = p - > pid )
goto out ;
css_get ( & mem - > css ) ;
rcu_assign_pointer ( mm - > mem_cgroup , mem ) ;
css_put ( & old_mem - > css ) ;
out :
mmput ( mm ) ;
return ;
}
2008-02-07 11:13:50 +03:00
struct cgroup_subsys mem_cgroup_subsys = {
. name = " memory " ,
. subsys_id = mem_cgroup_subsys_id ,
. create = mem_cgroup_create ,
2008-02-07 11:14:28 +03:00
. pre_destroy = mem_cgroup_pre_destroy ,
2008-02-07 11:13:50 +03:00
. destroy = mem_cgroup_destroy ,
. populate = mem_cgroup_populate ,
2008-02-07 11:13:54 +03:00
. attach = mem_cgroup_move_task ,
2008-02-07 11:14:31 +03:00
. early_init = 0 ,
2008-02-07 11:13:50 +03:00
} ;