2008-02-07 11:13:50 +03:00
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation , 2007
* Author Balbir Singh < balbir @ linux . vnet . ibm . com >
*
2008-02-07 11:13:51 +03:00
* Copyright 2007 OpenVZ SWsoft Inc
* Author : Pavel Emelianov < xemul @ openvz . org >
*
2008-02-07 11:13:50 +03:00
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 of the License , or
* ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*/
# include <linux/res_counter.h>
# include <linux/memcontrol.h>
# include <linux/cgroup.h>
2008-02-07 11:13:51 +03:00
# include <linux/mm.h>
2008-02-07 11:14:24 +03:00
# include <linux/smp.h>
2008-02-07 11:13:53 +03:00
# include <linux/page-flags.h>
2008-02-07 11:13:56 +03:00
# include <linux/backing-dev.h>
2008-02-07 11:13:53 +03:00
# include <linux/bit_spinlock.h>
# include <linux/rcupdate.h>
2008-04-29 12:00:19 +04:00
# include <linux/slab.h>
2008-02-07 11:13:56 +03:00
# include <linux/swap.h>
# include <linux/spinlock.h>
# include <linux/fs.h>
2008-02-07 11:14:25 +03:00
# include <linux/seq_file.h>
2008-02-07 11:13:50 +03:00
2008-02-07 11:13:59 +03:00
# include <asm/uaccess.h>
2008-02-07 11:13:50 +03:00
struct cgroup_subsys mem_cgroup_subsys ;
2008-02-07 11:13:56 +03:00
static const int MEM_CGROUP_RECLAIM_RETRIES = 5 ;
2008-04-29 12:00:19 +04:00
static struct kmem_cache * page_cgroup_cache ;
2008-02-07 11:13:50 +03:00
2008-02-07 11:14:24 +03:00
/*
* Statistics for memory cgroup .
*/
enum mem_cgroup_stat_index {
	/*
	 * With MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE = 0,	/* pages charged as page cache */
	MEM_CGROUP_STAT_RSS,		/* pages charged as rss */

	/* Number of counters above; sizes the count[] arrays. Keep last. */
	MEM_CGROUP_STAT_NSTATS,
};
/*
 * Statistics counters belonging to one CPU: one s64 slot per
 * mem_cgroup_stat_index value.
 */
struct mem_cgroup_stat_cpu {
s64 count [ MEM_CGROUP_STAT_NSTATS ] ;
} ____cacheline_aligned_in_smp ;
/*
 * Statistics of one memory cgroup, kept as an array of per-cpu counter
 * sets indexed by CPU number (NR_CPUS entries).
 */
struct mem_cgroup_stat {
struct mem_cgroup_stat_cpu cpustat [ NR_CPUS ] ;
} ;
/*
* For accounting under irq disable , no need for increment preempt count .
*/
static void __mem_cgroup_stat_add_safe ( struct mem_cgroup_stat * stat ,
enum mem_cgroup_stat_index idx , int val )
{
int cpu = smp_processor_id ( ) ;
stat - > cpustat [ cpu ] . count [ idx ] + = val ;
}
static s64 mem_cgroup_read_stat ( struct mem_cgroup_stat * stat ,
enum mem_cgroup_stat_index idx )
{
int cpu ;
s64 ret = 0 ;
for_each_possible_cpu ( cpu )
ret + = stat - > cpustat [ cpu ] . count [ idx ] ;
return ret ;
}
2008-02-07 11:14:31 +03:00
/*
* per - zone information in memory controller .
*/
enum mem_cgroup_zstat_index {
	MEM_CGROUP_ZSTAT_ACTIVE = 0,	/* entries on the active list */
	MEM_CGROUP_ZSTAT_INACTIVE,	/* entries on the inactive list */

	/* Number of per-zone counters; sizes count[]. Keep last. */
	NR_MEM_CGROUP_ZSTAT,
};
/*
 * Per-zone LRU state of one memory cgroup: two lists of page_cgroups
 * plus the number of entries on each.
 */
struct mem_cgroup_per_zone {
2008-02-07 11:14:39 +03:00
/*
* spin_lock to protect the per cgroup LRU
*/
spinlock_t lru_lock ;
2008-02-07 11:14:38 +03:00
/* page_cgroups with PAGE_CGROUP_FLAG_ACTIVE set */
struct list_head active_list ;
/* page_cgroups with PAGE_CGROUP_FLAG_ACTIVE clear */
struct list_head inactive_list ;
2008-02-07 11:14:31 +03:00
/* list lengths, indexed by enum mem_cgroup_zstat_index */
unsigned long count [ NR_MEM_CGROUP_ZSTAT ] ;
} ;
/*
 * Accessor for one per-zone counter: mz points to a struct
 * mem_cgroup_per_zone and idx is a mem_cgroup_zstat_index value.
 * Expands to an lvalue, so it can be used with += and -= as well.
 */
# define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
/* Per-node container: one mem_cgroup_per_zone per zone id. */
struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo [ MAX_NR_ZONES ] ;
} ;
/* LRU bookkeeping of a cgroup: one per-node pointer per node id. */
struct mem_cgroup_lru_info {
struct mem_cgroup_per_node * nodeinfo [ MAX_NUMNODES ] ;
} ;
2008-02-07 11:13:50 +03:00
/*
* The memory controller data structure . The memory controller controls both
* page cache and RSS per cgroup . We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock - pro ,
* to help the administrator determine what knobs to tune .
*
* TODO : Add a water mark for the memory controller . Reclaim will begin when
2008-02-07 11:13:53 +03:00
* we hit the water mark . Maybe even add a low water mark , such that
* no reclaim occurs from a cgroup at its low water mark ; this is
* a feature that will be implemented much later in the future .
2008-02-07 11:13:50 +03:00
*/
struct mem_cgroup {
/*
 * Embedded cgroup subsystem state; container_of() on this member is
 * how a mem_cgroup is recovered from a cgroup or a task.
 */
struct cgroup_subsys_state css ;
/*
* the counter to account for memory usage
*/
struct res_counter res ;
2008-02-07 11:13:51 +03:00
/*
* Per cgroup active and inactive list , similar to the
* per zone LRU lists .
*/
2008-02-07 11:14:31 +03:00
struct mem_cgroup_lru_info info ;
2008-02-07 11:14:39 +03:00
2008-02-07 11:14:34 +03:00
int prev_priority ; /* for recording reclaim priority */
2008-02-07 11:14:24 +03:00
/*
* statistics .
*/
struct mem_cgroup_stat stat ;
2008-02-07 11:13:50 +03:00
} ;
2008-03-05 01:29:09 +03:00
static struct mem_cgroup init_mem_cgroup ;
2008-02-07 11:13:50 +03:00
2008-02-07 11:13:53 +03:00
/*
* We use the lower bit of the page - > page_cgroup pointer as a bit spin
2008-03-05 01:29:07 +03:00
* lock . We need to ensure that page - > page_cgroup is at least two
* byte aligned ( based on comments from Nick Piggin ) . But since
* bit_spin_lock doesn ' t actually set that lock bit in a non - debug
* uniprocessor kernel , we should avoid setting it here too .
2008-02-07 11:13:53 +03:00
*/
/* Bit number in page->page_cgroup handed to the bit_spin_* helpers. */
# define PAGE_CGROUP_LOCK_BIT 0x0
2008-03-05 01:29:07 +03:00
/*
 * Mask form of the lock bit, used when storing to or reading the pointer
 * out of page->page_cgroup.  It is 0 when neither CONFIG_SMP nor
 * CONFIG_DEBUG_SPINLOCK is defined.
 */
# if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
# define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
# else
# define PAGE_CGROUP_LOCK 0x0
# endif
2008-02-07 11:13:53 +03:00
2008-02-07 11:13:50 +03:00
/*
* A page_cgroup page is associated with every page descriptor . The
* page_cgroup helps us identify information about the cgroup
*/
struct page_cgroup {
struct list_head lru ; /* per cgroup LRU list */
/* the page this descriptor was set up for by the charge path */
struct page * page ;
/* the cgroup that was charged for the page */
struct mem_cgroup * mem_cgroup ;
2008-03-05 01:29:11 +03:00
/* set to 1 by the first charge, incremented by each further charge */
int ref_cnt ; /* cached, mapped, migrating */
2008-03-05 01:29:09 +03:00
/* PAGE_CGROUP_FLAG_* bits */
int flags ;
2008-02-07 11:13:50 +03:00
} ;
2008-02-07 11:14:17 +03:00
# define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
2008-02-07 11:14:23 +03:00
# define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
2008-02-07 11:13:50 +03:00
2008-03-05 01:29:10 +03:00
static int page_cgroup_nid ( struct page_cgroup * pc )
2008-02-07 11:14:30 +03:00
{
return page_to_nid ( pc - > page ) ;
}
2008-03-05 01:29:10 +03:00
static enum zone_type page_cgroup_zid ( struct page_cgroup * pc )
2008-02-07 11:14:30 +03:00
{
return page_zonenum ( pc - > page ) ;
}
2008-02-07 11:14:17 +03:00
/* Kind of charge requested from mem_cgroup_charge_common(). */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,	/* sets PAGE_CGROUP_FLAG_CACHE */
	MEM_CGROUP_CHARGE_TYPE_MAPPED = 1,	/* charged without the cache flag */
};
2008-02-07 11:14:24 +03:00
/*
* Always modified under lru lock . Then , not necessary to preempt_disable ( )
*/
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
					 bool charge)
{
	enum mem_cgroup_stat_index idx;
	int delta;

	/* The per-cpu update below relies on irqs being off. */
	VM_BUG_ON(!irqs_disabled());

	/* Cache-flagged charges count as page cache, all others as rss. */
	if (flags & PAGE_CGROUP_FLAG_CACHE)
		idx = MEM_CGROUP_STAT_CACHE;
	else
		idx = MEM_CGROUP_STAT_RSS;

	delta = charge ? 1 : -1;
	__mem_cgroup_stat_add_safe(&mem->stat, idx, delta);
}
2008-03-05 01:29:10 +03:00
static struct mem_cgroup_per_zone *
2008-02-07 11:14:31 +03:00
mem_cgroup_zoneinfo ( struct mem_cgroup * mem , int nid , int zid )
{
return & mem - > info . nodeinfo [ nid ] - > zoneinfo [ zid ] ;
}
2008-03-05 01:29:10 +03:00
static struct mem_cgroup_per_zone *
2008-02-07 11:14:31 +03:00
page_cgroup_zoneinfo ( struct page_cgroup * pc )
{
struct mem_cgroup * mem = pc - > mem_cgroup ;
int nid = page_cgroup_nid ( pc ) ;
int zid = page_cgroup_zid ( pc ) ;
2008-02-07 11:14:24 +03:00
2008-02-07 11:14:31 +03:00
return mem_cgroup_zoneinfo ( mem , nid , zid ) ;
}
static unsigned long mem_cgroup_get_all_zonestat ( struct mem_cgroup * mem ,
enum mem_cgroup_zstat_index idx )
{
int nid , zid ;
struct mem_cgroup_per_zone * mz ;
u64 total = 0 ;
for_each_online_node ( nid )
for ( zid = 0 ; zid < MAX_NR_ZONES ; zid + + ) {
mz = mem_cgroup_zoneinfo ( mem , nid , zid ) ;
total + = MEM_CGROUP_ZSTAT ( mz , idx ) ;
}
return total ;
2008-02-07 11:14:24 +03:00
}
2008-03-05 01:29:10 +03:00
static struct mem_cgroup * mem_cgroup_from_cont ( struct cgroup * cont )
2008-02-07 11:13:50 +03:00
{
return container_of ( cgroup_subsys_state ( cont ,
mem_cgroup_subsys_id ) , struct mem_cgroup ,
css ) ;
}
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead add an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_set by
cgroup_exit(), thus all future charges from running threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 12:00:16 +04:00
struct mem_cgroup * mem_cgroup_from_task ( struct task_struct * p )
2008-02-07 11:13:51 +03:00
{
return container_of ( task_subsys_state ( p , mem_cgroup_subsys_id ) ,
struct mem_cgroup , css ) ;
}
2008-02-07 11:13:53 +03:00
static inline int page_cgroup_locked ( struct page * page )
{
2008-03-05 01:29:09 +03:00
return bit_spin_is_locked ( PAGE_CGROUP_LOCK_BIT , & page - > page_cgroup ) ;
2008-02-07 11:13:53 +03:00
}
2008-03-05 01:29:07 +03:00
static void page_assign_page_cgroup ( struct page * page , struct page_cgroup * pc )
2008-02-07 11:13:51 +03:00
{
2008-03-05 01:29:07 +03:00
VM_BUG_ON ( ! page_cgroup_locked ( page ) ) ;
page - > page_cgroup = ( ( unsigned long ) pc | PAGE_CGROUP_LOCK ) ;
2008-02-07 11:13:51 +03:00
}
struct page_cgroup * page_get_page_cgroup ( struct page * page )
{
2008-03-05 01:29:09 +03:00
return ( struct page_cgroup * ) ( page - > page_cgroup & ~ PAGE_CGROUP_LOCK ) ;
2008-02-07 11:13:53 +03:00
}
2008-03-05 01:29:10 +03:00
static void lock_page_cgroup ( struct page * page )
2008-02-07 11:13:53 +03:00
{
bit_spin_lock ( PAGE_CGROUP_LOCK_BIT , & page - > page_cgroup ) ;
}
2008-03-05 01:29:13 +03:00
static int try_lock_page_cgroup ( struct page * page )
{
return bit_spin_trylock ( PAGE_CGROUP_LOCK_BIT , & page - > page_cgroup ) ;
}
2008-03-05 01:29:10 +03:00
static void unlock_page_cgroup ( struct page * page )
2008-02-07 11:13:53 +03:00
{
bit_spin_unlock ( PAGE_CGROUP_LOCK_BIT , & page - > page_cgroup ) ;
}
2008-02-07 11:14:31 +03:00
static void __mem_cgroup_remove_list ( struct page_cgroup * pc )
{
int from = pc - > flags & PAGE_CGROUP_FLAG_ACTIVE ;
struct mem_cgroup_per_zone * mz = page_cgroup_zoneinfo ( pc ) ;
if ( from )
MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_ACTIVE ) - = 1 ;
else
MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_INACTIVE ) - = 1 ;
mem_cgroup_charge_statistics ( pc - > mem_cgroup , pc - > flags , false ) ;
list_del_init ( & pc - > lru ) ;
}
static void __mem_cgroup_add_list ( struct page_cgroup * pc )
{
int to = pc - > flags & PAGE_CGROUP_FLAG_ACTIVE ;
struct mem_cgroup_per_zone * mz = page_cgroup_zoneinfo ( pc ) ;
if ( ! to ) {
MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_INACTIVE ) + = 1 ;
2008-02-07 11:14:38 +03:00
list_add ( & pc - > lru , & mz - > inactive_list ) ;
2008-02-07 11:14:31 +03:00
} else {
MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_ACTIVE ) + = 1 ;
2008-02-07 11:14:38 +03:00
list_add ( & pc - > lru , & mz - > active_list ) ;
2008-02-07 11:14:31 +03:00
}
mem_cgroup_charge_statistics ( pc - > mem_cgroup , pc - > flags , true ) ;
}
2008-02-07 11:13:59 +03:00
/*
 * Move @pc to the active (@active true) or inactive list of its zone,
 * keeping the per-list counters and the ACTIVE flag in step.
 */
static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
	enum mem_cgroup_zstat_index old_idx, new_idx;
	struct list_head *target;

	/* Leave the list the flag currently says we are on. */
	if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE)
		old_idx = MEM_CGROUP_ZSTAT_ACTIVE;
	else
		old_idx = MEM_CGROUP_ZSTAT_INACTIVE;
	MEM_CGROUP_ZSTAT(mz, old_idx) -= 1;

	if (active) {
		new_idx = MEM_CGROUP_ZSTAT_ACTIVE;
		target = &mz->active_list;
		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
	} else {
		new_idx = MEM_CGROUP_ZSTAT_INACTIVE;
		target = &mz->inactive_list;
		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
	}
	MEM_CGROUP_ZSTAT(mz, new_idx) += 1;
	list_move(&pc->lru, target);
}
2008-02-07 11:14:06 +03:00
int task_in_mem_cgroup ( struct task_struct * task , const struct mem_cgroup * mem )
{
int ret ;
task_lock ( task ) ;
2008-03-05 01:29:01 +03:00
ret = task - > mm & & mm_match_cgroup ( task - > mm , mem ) ;
2008-02-07 11:14:06 +03:00
task_unlock ( task ) ;
return ret ;
}
2008-02-07 11:13:56 +03:00
/*
* This routine assumes that the appropriate zone ' s lru lock is already held
*/
2008-03-05 01:29:03 +03:00
void mem_cgroup_move_lists(struct page *page, bool active)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;
	unsigned long flags;

	/*
	 * lock_page_cgroup cannot be taken while the zone's lru_lock is
	 * held: other holders of lock_page_cgroup can be interrupted by an
	 * attempt to rotate_reclaimable_page.  The page_cgroup cannot be
	 * reached safely without that lock either, so only try_lock it;
	 * mem_cgroup_isolate_pages copes with a page left on the wrong list.
	 */
	if (!try_lock_page_cgroup(page))
		return;

	pc = page_get_page_cgroup(page);
	if (!pc)
		goto unlock;

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_move_lists(pc, active);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock:
	unlock_page_cgroup(page);
}
2008-02-07 11:14:32 +03:00
/*
* Calculate mapped_ratio under memory controller . This will be used in
* vmscan . c for deteremining we have to reclaim mapped pages .
*/
int mem_cgroup_calc_mapped_ratio ( struct mem_cgroup * mem )
{
long total , rss ;
/*
* usage is recorded in bytes . But , here , we assume the number of
* physical pages can be represented by " long " on any arch .
*/
total = ( long ) ( mem - > res . usage > > PAGE_SHIFT ) + 1L ;
rss = ( long ) mem_cgroup_read_stat ( & mem - > stat , MEM_CGROUP_STAT_RSS ) ;
return ( int ) ( ( rss * 100L ) / total ) ;
}
2008-03-05 01:29:09 +03:00
2008-02-07 11:14:33 +03:00
/*
 * Called from the page reclaim loop in vmscan.c, where the balance between
 * the active and inactive lists is calculated.  For memory controller
 * reclaim the imbalance of the mem_cgroup is used rather than the zone's
 * global LRU imbalance.
 *
 * Returns active / (inactive + 1), both summed over all online nodes.
 */
long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
{
	unsigned long nr_active, nr_inactive;

	/* Both are page counts, so 'long' is wide enough for the result. */
	nr_active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
	nr_inactive = mem_cgroup_get_all_zonestat(mem,
						  MEM_CGROUP_ZSTAT_INACTIVE);

	return (long)(nr_active / (nr_inactive + 1));
}
2008-02-07 11:14:32 +03:00
2008-02-07 11:14:34 +03:00
/*
* prev_priority control . . . this will be used in memory reclaim path .
*/
int mem_cgroup_get_reclaim_priority ( struct mem_cgroup * mem )
{
return mem - > prev_priority ;
}
void mem_cgroup_note_reclaim_priority ( struct mem_cgroup * mem , int priority )
{
if ( priority < mem - > prev_priority )
mem - > prev_priority = priority ;
}
void mem_cgroup_record_reclaim_priority ( struct mem_cgroup * mem , int priority )
{
mem - > prev_priority = priority ;
}
2008-02-07 11:14:35 +03:00
/*
* Calculate # of pages to be scanned in this priority / zone .
* See also vmscan . c
*
* priority starts from " DEF_PRIORITY " and decremented in each loop .
* ( see include / linux / mmzone . h )
*/
long mem_cgroup_calc_reclaim_active ( struct mem_cgroup * mem ,
struct zone * zone , int priority )
{
long nr_active ;
int nid = zone - > zone_pgdat - > node_id ;
int zid = zone_idx ( zone ) ;
struct mem_cgroup_per_zone * mz = mem_cgroup_zoneinfo ( mem , nid , zid ) ;
nr_active = MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_ACTIVE ) ;
return ( nr_active > > priority ) ;
}
long mem_cgroup_calc_reclaim_inactive ( struct mem_cgroup * mem ,
struct zone * zone , int priority )
{
long nr_inactive ;
int nid = zone - > zone_pgdat - > node_id ;
int zid = zone_idx ( zone ) ;
struct mem_cgroup_per_zone * mz = mem_cgroup_zoneinfo ( mem , nid , zid ) ;
nr_inactive = MEM_CGROUP_ZSTAT ( mz , MEM_CGROUP_ZSTAT_INACTIVE ) ;
return ( nr_inactive > > priority ) ;
}
2008-02-07 11:13:56 +03:00
/*
 * Isolate pages from one per-zone LRU list of a memory cgroup.
 *
 * Walks, from the tail backwards, the active list (active != 0) or the
 * inactive list (active == 0) that mem_cont keeps for zone z, holding that
 * per-zone lru_lock for the whole walk.  The walk stops once nr_to_scan
 * entries have been counted.  For each entry:
 *  - a page without PageLRU is skipped and not counted;
 *  - a page whose PageActive state disagrees with the list being walked is
 *    moved to the matching list with __mem_cgroup_move_lists() and not
 *    counted;
 *  - any other entry is counted, parked on a local list and, when
 *    __isolate_lru_page(page, mode) returns 0, its page is moved to dst.
 * The parked entries are spliced back onto the source list afterwards.
 *
 * The count of scanned entries is stored in *scanned; the return value is
 * the number of pages moved to dst.  mem_cont must not be NULL (BUG_ON).
 * The order argument is not used in this function.
 *
 * NOTE(review): the lru_lock is taken with plain spin_lock(); presumably
 * the caller already runs with irqs disabled - confirm against callers.
 */
unsigned long mem_cgroup_isolate_pages ( unsigned long nr_to_scan ,
struct list_head * dst ,
unsigned long * scanned , int order ,
int mode , struct zone * z ,
struct mem_cgroup * mem_cont ,
int active )
{
unsigned long nr_taken = 0 ;
struct page * page ;
unsigned long scan ;
LIST_HEAD ( pc_list ) ;
struct list_head * src ;
2008-02-07 11:14:11 +03:00
struct page_cgroup * pc , * tmp ;
2008-02-07 11:14:38 +03:00
int nid = z - > zone_pgdat - > node_id ;
int zid = zone_idx ( z ) ;
struct mem_cgroup_per_zone * mz ;
2008-02-07 11:13:56 +03:00
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 12:00:16 +04:00
BUG_ON ( ! mem_cont ) ;
2008-02-07 11:14:38 +03:00
mz = mem_cgroup_zoneinfo ( mem_cont , nid , zid ) ;
2008-02-07 11:13:56 +03:00
if ( active )
2008-02-07 11:14:38 +03:00
src = & mz - > active_list ;
2008-02-07 11:13:56 +03:00
else
2008-02-07 11:14:38 +03:00
src = & mz - > inactive_list ;
2008-02-07 11:13:56 +03:00
2008-02-07 11:14:39 +03:00
spin_lock ( & mz - > lru_lock ) ;
2008-02-07 11:14:11 +03:00
scan = 0 ;
list_for_each_entry_safe_reverse ( pc , tmp , src , lru ) {
2008-02-07 11:14:12 +03:00
if ( scan > = nr_to_scan )
2008-02-07 11:14:11 +03:00
break ;
2008-02-07 11:13:56 +03:00
page = pc - > page ;
2008-02-07 11:14:12 +03:00
if ( unlikely ( ! PageLRU ( page ) ) )
2008-02-07 11:14:11 +03:00
continue ;
2008-02-07 11:13:56 +03:00
if ( PageActive ( page ) & & ! active ) {
__mem_cgroup_move_lists ( pc , true ) ;
continue ;
}
if ( ! PageActive ( page ) & & active ) {
__mem_cgroup_move_lists ( pc , false ) ;
continue ;
}
2008-02-07 11:14:12 +03:00
scan + + ;
list_move ( & pc - > lru , & pc_list ) ;
2008-02-07 11:13:56 +03:00
if ( __isolate_lru_page ( page , mode ) = = 0 ) {
list_move ( & page - > lru , dst ) ;
nr_taken + + ;
}
}
list_splice ( & pc_list , src ) ;
2008-02-07 11:14:39 +03:00
spin_unlock ( & mz - > lru_lock ) ;
2008-02-07 11:13:56 +03:00
* scanned = scan ;
return nr_taken ;
}
2008-02-07 11:13:53 +03:00
/*
 * Charge the memory controller for page usage.
 *
 * Returns 0 at once when the memory cgroup subsystem is disabled.  If the
 * page already has a page_cgroup, only its ref_cnt is incremented.
 * Otherwise a page_cgroup is allocated from page_cgroup_cache with
 * gfp_mask, the mem_cgroup of mm->owner is looked up under
 * rcu_read_lock() (init_mm is used when mm is NULL), a css reference is
 * taken and PAGE_SIZE is charged to the cgroup's res_counter.
 *
 * While the charge fails: give up if gfp_mask lacks __GFP_WAIT; otherwise
 * try to reclaim from the cgroup, re-check the limit, and once the retry
 * budget (MEM_CGROUP_RECLAIM_RETRIES) is used up call
 * mem_cgroup_out_of_memory() and give up; between attempts wait with
 * congestion_wait().
 *
 * After a successful charge the page is re-checked under
 * lock_page_cgroup(): if another page_cgroup was installed meanwhile, the
 * charge, css reference and allocation are undone and the whole sequence
 * is retried.  Otherwise the new page_cgroup is assigned to the page and
 * added to the per-zone LRU before the page_cgroup lock is dropped.
 *
 * ctype selects whether PAGE_CGROUP_FLAG_CACHE is set on the new entry.
 *
 * Return
 *   0        if the charge was successful
 *   -ENOMEM  if the allocation failed or the cgroup stayed over its limit
 */
2008-02-07 11:14:17 +03:00
static int mem_cgroup_charge_common ( struct page * page , struct mm_struct * mm ,
gfp_t gfp_mask , enum charge_type ctype )
2008-02-07 11:13:53 +03:00
{
struct mem_cgroup * mem ;
2008-02-07 11:14:08 +03:00
struct page_cgroup * pc ;
2008-02-07 11:13:56 +03:00
unsigned long flags ;
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES ;
2008-02-07 11:14:39 +03:00
struct mem_cgroup_per_zone * mz ;
2008-02-07 11:13:53 +03:00
2008-04-05 01:29:59 +04:00
if ( mem_cgroup_subsys . disabled )
return 0 ;
2008-02-07 11:13:53 +03:00
/*
* Should page_cgroup ' s go to their own slab ?
* One could optimize the performance of the charging routine
* by saving a bit in the page_flags and using it as a lock
* to see if the cgroup page already has a page_cgroup associated
* with it
*/
2008-02-07 11:13:56 +03:00
retry :
2008-03-05 01:29:08 +03:00
lock_page_cgroup ( page ) ;
pc = page_get_page_cgroup ( page ) ;
/*
* The page_cgroup exists and
* the page has already been accounted .
*/
if ( pc ) {
2008-03-05 01:29:11 +03:00
VM_BUG_ON ( pc - > page ! = page ) ;
VM_BUG_ON ( pc - > ref_cnt < = 0 ) ;
pc - > ref_cnt + + ;
unlock_page_cgroup ( page ) ;
goto done ;
2008-02-07 11:13:53 +03:00
}
2008-03-05 01:29:08 +03:00
unlock_page_cgroup ( page ) ;
2008-02-07 11:13:53 +03:00
2008-04-29 12:00:19 +04:00
pc = kmem_cache_zalloc ( page_cgroup_cache , gfp_mask ) ;
2008-02-07 11:13:53 +03:00
if ( pc = = NULL )
goto err ;
/*
2008-02-07 11:14:19 +03:00
* We always charge the cgroup the mm_struct belongs to .
* The mm_struct ' s mem_cgroup changes on task migration if the
2008-02-07 11:13:53 +03:00
* thread group leader migrates . It ' s possible that mm is not
* set , if so charge the init_mm ( happens for pagecache usage ) .
*/
if ( ! mm )
mm = & init_mm ;
2008-02-07 11:14:19 +03:00
rcu_read_lock ( ) ;
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 12:00:16 +04:00
mem = mem_cgroup_from_task ( rcu_dereference ( mm - > owner ) ) ;
2008-02-07 11:13:53 +03:00
/*
2008-03-05 01:29:09 +03:00
* For every charge from the cgroup , increment reference count
2008-02-07 11:13:53 +03:00
*/
css_get ( & mem - > css ) ;
rcu_read_unlock ( ) ;
2008-02-07 11:13:57 +03:00
while ( res_counter_charge ( & mem - > res , PAGE_SIZE ) ) {
2008-02-07 11:14:19 +03:00
if ( ! ( gfp_mask & __GFP_WAIT ) )
goto out ;
2008-02-07 11:14:02 +03:00
if ( try_to_free_mem_cgroup_pages ( mem , gfp_mask ) )
2008-02-07 11:13:56 +03:00
continue ;
/*
2008-03-05 01:29:09 +03:00
* try_to_free_mem_cgroup_pages ( ) might not give us a full
* picture of reclaim . Some pages are reclaimed and might be
* moved to swap cache or just unmapped from the cgroup .
* Check the limit again to see if the reclaim reduced the
* current usage of the cgroup before giving up
*/
2008-02-07 11:13:56 +03:00
if ( res_counter_check_under_limit ( & mem - > res ) )
continue ;
2008-02-07 11:14:19 +03:00
if ( ! nr_retries - - ) {
mem_cgroup_out_of_memory ( mem , gfp_mask ) ;
goto out ;
2008-02-07 11:13:56 +03:00
}
2008-02-07 11:14:19 +03:00
congestion_wait ( WRITE , HZ / 10 ) ;
2008-02-07 11:13:53 +03:00
}
2008-03-05 01:29:11 +03:00
pc - > ref_cnt = 1 ;
2008-02-07 11:13:53 +03:00
pc - > mem_cgroup = mem ;
pc - > page = page ;
2008-02-07 11:14:23 +03:00
pc - > flags = PAGE_CGROUP_FLAG_ACTIVE ;
2008-02-07 11:14:17 +03:00
if ( ctype = = MEM_CGROUP_CHARGE_TYPE_CACHE )
pc - > flags | = PAGE_CGROUP_FLAG_CACHE ;
2008-02-07 11:14:19 +03:00
2008-03-05 01:29:08 +03:00
lock_page_cgroup ( page ) ;
if ( page_get_page_cgroup ( page ) ) {
unlock_page_cgroup ( page ) ;
2008-02-07 11:14:08 +03:00
/*
2008-02-07 11:14:19 +03:00
* Another charge has been added to this page already .
* We take lock_page_cgroup ( page ) again and read
2008-02-07 11:14:08 +03:00
* page - > cgroup , increment refcnt . . . . just retry is OK .
*/
res_counter_uncharge ( & mem - > res , PAGE_SIZE ) ;
css_put ( & mem - > css ) ;
2008-04-29 12:00:19 +04:00
kmem_cache_free ( page_cgroup_cache , pc ) ;
2008-02-07 11:14:08 +03:00
goto retry ;
}
2008-03-05 01:29:08 +03:00
page_assign_page_cgroup ( page , pc ) ;
2008-02-07 11:13:53 +03:00
2008-02-07 11:14:39 +03:00
mz = page_cgroup_zoneinfo ( pc ) ;
spin_lock_irqsave ( & mz - > lru_lock , flags ) ;
2008-02-07 11:14:31 +03:00
__mem_cgroup_add_list ( pc ) ;
2008-02-07 11:14:39 +03:00
spin_unlock_irqrestore ( & mz - > lru_lock , flags ) ;
2008-02-07 11:13:56 +03:00
memcg: fix oops on NULL lru list
While testing force_empty, during an exit_mmap, __mem_cgroup_remove_list
called from mem_cgroup_uncharge_page oopsed on a NULL pointer in the lru list.
I couldn't see what racing tasks on other cpus were doing, but surmise that
another must have been in mem_cgroup_charge_common on the same page, between
its unlock_page_cgroup and spin_lock_irqsave near done (thanks to that kzalloc
which I'd almost changed to a kmalloc).
Normally such a race cannot happen, the ref_cnt prevents it, the final
uncharge cannot race with the initial charge. But force_empty buggers the
ref_cnt, that's what it's all about; and thereafter forced pages are
vulnerable to races such as this (just think of a shared page also mapped into
an mm of another mem_cgroup than that just emptied). And remain vulnerable
until they're freed indefinitely later.
This patch just fixes the oops by moving the unlock_page_cgroups down below
adding to and removing from the list (only possible given the previous patch);
and while we're at it, we might as well make it an invariant that
page->page_cgroup is always set while pc is on lru.
But this behaviour of force_empty seems highly unsatisfactory to me: why have
a ref_cnt if we always have to cope with it being violated (as in the earlier
page migration patch). We may prefer force_empty to move pages to an orphan
mem_cgroup (could be the root, but better not), from which other cgroups could
recover them; we might need to reverse the locking again; but no time now for
such concerns.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-03-05 01:29:16 +03:00
unlock_page_cgroup ( page ) ;
2008-02-07 11:13:53 +03:00
done :
return 0 ;
2008-02-07 11:14:19 +03:00
out :
css_put ( & mem - > css ) ;
2008-04-29 12:00:19 +04:00
kmem_cache_free ( page_cgroup_cache , pc ) ;
2008-02-07 11:13:53 +03:00
err :
return - ENOMEM ;
}
2008-03-05 01:29:09 +03:00
/*
 * Charge @page to the cgroup of @mm as a mapped page.
 * Returns whatever mem_cgroup_charge_common() returns.
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
	const enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;

	return mem_cgroup_charge_common(page, mm, gfp_mask, ctype);
}
2008-02-07 11:14:02 +03:00
/*
 * Charge @page as page cache.  A NULL @mm is replaced by init_mm before
 * the common charge path is entered.
 * Returns whatever mem_cgroup_charge_common() returns.
 */
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
			    gfp_t gfp_mask)
{
	struct mm_struct *target = mm ? mm : &init_mm;

	return mem_cgroup_charge_common(page, target, gfp_mask,
					MEM_CGROUP_CHARGE_TYPE_CACHE);
}
2008-02-07 11:13:53 +03:00
/*
* Uncharging is always a welcome operation , we never complain , simply
2008-03-05 01:29:08 +03:00
* uncharge .
2008-02-07 11:13:53 +03:00
*/
2008-03-05 01:29:08 +03:00
/*
 * Drop one reference on the page_cgroup attached to @page.
 *
 * Does nothing when the subsystem is disabled or when @page has no
 * page_cgroup.  When the reference count reaches zero the page_cgroup is
 * taken off its per-zone LRU list, detached from @page, its PAGE_SIZE
 * charge is returned to the owning mem_cgroup's res_counter, the css
 * reference is dropped and the page_cgroup is freed back to
 * page_cgroup_cache.
 */
void mem_cgroup_uncharge_page ( struct page * page )
2008-02-07 11:13:53 +03:00
{
2008-03-05 01:29:08 +03:00
struct page_cgroup * pc ;
2008-02-07 11:13:53 +03:00
struct mem_cgroup * mem ;
2008-02-07 11:14:39 +03:00
struct mem_cgroup_per_zone * mz ;
2008-02-07 11:13:56 +03:00
unsigned long flags ;
2008-02-07 11:13:53 +03:00
2008-04-05 01:29:59 +04:00
if ( mem_cgroup_subsys . disabled )
return ;
2008-02-07 11:13:59 +03:00
/*
2008-02-07 11:14:41 +03:00
* Check if our page_cgroup is valid
2008-02-07 11:13:59 +03:00
*/
2008-03-05 01:29:08 +03:00
lock_page_cgroup ( page ) ;
pc = page_get_page_cgroup ( page ) ;
2008-02-07 11:13:53 +03:00
if ( ! pc )
2008-03-05 01:29:08 +03:00
goto unlock ;
2008-02-07 11:13:53 +03:00
2008-03-05 01:29:11 +03:00
VM_BUG_ON ( pc - > page ! = page ) ;
VM_BUG_ON ( pc - > ref_cnt < = 0 ) ;
/* Last reference: tear the page_cgroup down. */
if ( - - ( pc - > ref_cnt ) = = 0 ) {
mz = page_cgroup_zoneinfo ( pc ) ;
/* The LRU removal happens while the page_cgroup lock is still held. */
spin_lock_irqsave ( & mz - > lru_lock , flags ) ;
__mem_cgroup_remove_list ( pc ) ;
spin_unlock_irqrestore ( & mz - > lru_lock , flags ) ;
memcg: fix oops on NULL lru list
While testing force_empty, during an exit_mmap, __mem_cgroup_remove_list
called from mem_cgroup_uncharge_page oopsed on a NULL pointer in the lru list.
I couldn't see what racing tasks on other cpus were doing, but surmise that
another must have been in mem_cgroup_charge_common on the same page, between
its unlock_page_cgroup and spin_lock_irqsave near done (thanks to that kzalloc
which I'd almost changed to a kmalloc).
Normally such a race cannot happen, the ref_cnt prevents it, the final
uncharge cannot race with the initial charge. But force_empty buggers the
ref_cnt, that's what it's all about; and thereafter forced pages are
vulnerable to races such as this (just think of a shared page also mapped into
an mm of another mem_cgroup than that just emptied). And remain vulnerable
until they're freed indefinitely later.
This patch just fixes the oops by moving the unlock_page_cgroups down below
adding to and removing from the list (only possible given the previous patch);
and while we're at it, we might as well make it an invariant that
page->page_cgroup is always set while pc is on lru.
But this behaviour of force_empty seems highly unsatisfactory to me: why have
a ref_cnt if we always have to cope with it being violated (as in the earlier
page migration patch). We may prefer force_empty to move pages to an orphan
mem_cgroup (could be the root, but better not), from which other cgroups could
recover them; we might need to reverse the locking again; but no time now for
such concerns.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-03-05 01:29:16 +03:00
/* Detach pc from the page only after it has left the LRU list. */
page_assign_page_cgroup ( page , NULL ) ;
unlock_page_cgroup ( page ) ;
2008-03-05 01:29:12 +03:00
/* Return the charge and release the resources held for this page. */
mem = pc - > mem_cgroup ;
res_counter_uncharge ( & mem - > res , PAGE_SIZE ) ;
css_put ( & mem - > css ) ;
2008-04-29 12:00:19 +04:00
kmem_cache_free ( page_cgroup_cache , pc ) ;
2008-03-05 01:29:11 +03:00
return ;
2008-02-07 11:13:53 +03:00
}
2008-02-07 11:14:31 +03:00
2008-03-05 01:29:08 +03:00
/* Reached when there is no page_cgroup or references remain. */
unlock :
2008-02-07 11:14:41 +03:00
unlock_page_cgroup ( page ) ;
}
2008-02-07 11:14:10 +03:00
/*
* Returns non - zero if a page ( under migration ) has valid page_cgroup member .
* Refcnt of page_cgroup is incremented .
*/
int mem_cgroup_prepare_migration ( struct page * page )
{
struct page_cgroup * pc ;
2008-03-05 01:29:09 +03:00
2008-04-05 01:29:59 +04:00
if ( mem_cgroup_subsys . disabled )
return 0 ;
2008-02-07 11:14:10 +03:00
lock_page_cgroup ( page ) ;
pc = page_get_page_cgroup ( page ) ;
2008-03-05 01:29:11 +03:00
if ( pc )
pc - > ref_cnt + + ;
2008-02-07 11:14:10 +03:00
unlock_page_cgroup ( page ) ;
2008-03-05 01:29:11 +03:00
return pc ! = NULL ;
2008-02-07 11:14:10 +03:00
}
/*
 * Finish a migration for @page by calling mem_cgroup_uncharge_page(),
 * which decrements the page_cgroup ref_cnt (the counter that
 * mem_cgroup_prepare_migration() increments).
 */
void mem_cgroup_end_migration ( struct page * page )
{
2008-03-05 01:29:08 +03:00
mem_cgroup_uncharge_page ( page ) ;
2008-02-07 11:14:10 +03:00
}
2008-03-05 01:29:09 +03:00
2008-02-07 11:14:10 +03:00
/*
2008-03-05 01:29:09 +03:00
* We know both * page * and * newpage * are now not - on - LRU and PG_locked .
2008-02-07 11:14:10 +03:00
* And no race with uncharge ( ) routines because page_cgroup for * page *
* has extra one reference by mem_cgroup_prepare_migration .
*/
/*
 * Move the page_cgroup of @page over to @newpage.
 *
 * Returns without doing anything if @page has no page_cgroup.  Otherwise
 * the page_cgroup is removed from its per-zone LRU list and detached from
 * @page, then pointed at @newpage, attached to it and added back to an
 * LRU list.  Each half runs under the page_cgroup lock of the page being
 * modified, with the zone's lru_lock taken around the list operation.
 */
void mem_cgroup_page_migration ( struct page * page , struct page * newpage )
{
struct page_cgroup * pc ;
2008-02-07 11:14:39 +03:00
struct mem_cgroup_per_zone * mz ;
2008-03-05 01:29:10 +03:00
unsigned long flags ;
2008-03-05 01:29:09 +03:00
2008-03-05 01:29:11 +03:00
lock_page_cgroup ( page ) ;
2008-02-07 11:14:10 +03:00
pc = page_get_page_cgroup ( page ) ;
2008-03-05 01:29:11 +03:00
if ( ! pc ) {
unlock_page_cgroup ( page ) ;
2008-02-07 11:14:10 +03:00
return ;
2008-03-05 01:29:11 +03:00
}
2008-03-05 01:29:09 +03:00
2008-03-05 01:29:11 +03:00
/* Step 1: take pc off the LRU list it currently sits on. */
mz = page_cgroup_zoneinfo ( pc ) ;
2008-03-05 01:29:09 +03:00
spin_lock_irqsave ( & mz - > lru_lock , flags ) ;
2008-02-07 11:14:31 +03:00
__mem_cgroup_remove_list ( pc ) ;
2008-02-07 11:14:39 +03:00
spin_unlock_irqrestore ( & mz - > lru_lock , flags ) ;
memcg: fix oops on NULL lru list
While testing force_empty, during an exit_mmap, __mem_cgroup_remove_list
called from mem_cgroup_uncharge_page oopsed on a NULL pointer in the lru list.
I couldn't see what racing tasks on other cpus were doing, but surmise that
another must have been in mem_cgroup_charge_common on the same page, between
its unlock_page_cgroup and spin_lock_irqsave near done (thanks to that kzalloc
which I'd almost changed to a kmalloc).
Normally such a race cannot happen, the ref_cnt prevents it, the final
uncharge cannot race with the initial charge. But force_empty buggers the
ref_cnt, that's what it's all about; and thereafter forced pages are
vulnerable to races such as this (just think of a shared page also mapped into
an mm of another mem_cgroup than that just emptied). And remain vulnerable
until they're freed indefinitely later.
This patch just fixes the oops by moving the unlock_page_cgroups down below
adding to and removing from the list (only possible given the previous patch);
and while we're at it, we might as well make it an invariant that
page->page_cgroup is always set while pc is on lru.
But this behaviour of force_empty seems highly unsatisfactory to me: why have
a ref_cnt if we always have to cope with it being violated (as in the earlier
page migration patch). We may prefer force_empty to move pages to an orphan
mem_cgroup (could be the root, but better not), from which other cgroups could
recover them; we might need to reverse the locking again; but no time now for
such concerns.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-03-05 01:29:16 +03:00
/* Step 2: detach pc from the old page, then release its lock. */
page_assign_page_cgroup ( page , NULL ) ;
unlock_page_cgroup ( page ) ;
2008-02-07 11:14:10 +03:00
/* Step 3: retarget pc and attach it to the new page under its lock. */
pc - > page = newpage ;
lock_page_cgroup ( newpage ) ;
page_assign_page_cgroup ( newpage , pc ) ;
2008-02-07 11:14:31 +03:00
2008-02-07 11:14:39 +03:00
/*
 * Step 4: look the zone info up again and add pc to an LRU list.
 * NOTE(review): presumably the lookup depends on pc->page, which was
 * just changed - confirm against page_cgroup_zoneinfo().
 */
mz = page_cgroup_zoneinfo ( pc ) ;
spin_lock_irqsave ( & mz - > lru_lock , flags ) ;
__mem_cgroup_add_list ( pc ) ;
spin_unlock_irqrestore ( & mz - > lru_lock , flags ) ;
memcg: fix oops on NULL lru list
While testing force_empty, during an exit_mmap, __mem_cgroup_remove_list
called from mem_cgroup_uncharge_page oopsed on a NULL pointer in the lru list.
I couldn't see what racing tasks on other cpus were doing, but surmise that
another must have been in mem_cgroup_charge_common on the same page, between
its unlock_page_cgroup and spin_lock_irqsave near done (thanks to that kzalloc
which I'd almost changed to a kmalloc).
Normally such a race cannot happen, the ref_cnt prevents it, the final
uncharge cannot race with the initial charge. But force_empty buggers the
ref_cnt, that's what it's all about; and thereafter forced pages are
vulnerable to races such as this (just think of a shared page also mapped into
an mm of another mem_cgroup than that just emptied). And remain vulnerable
until they're freed indefinitely later.
This patch just fixes the oops by moving the unlock_page_cgroups down below
adding to and removing from the list (only possible given the previous patch);
and while we're at it, we might as well make it an invariant that
page->page_cgroup is always set while pc is on lru.
But this behaviour of force_empty seems highly unsatisfactory to me: why have
a ref_cnt if we always have to cope with it being violated (as in the earlier
page migration patch). We may prefer force_empty to move pages to an orphan
mem_cgroup (could be the root, but better not), from which other cgroups could
recover them; we might need to reverse the locking again; but no time now for
such concerns.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-03-05 01:29:16 +03:00
unlock_page_cgroup ( newpage ) ;
2008-02-07 11:14:10 +03:00
}
2008-02-07 11:13:51 +03:00
2008-02-07 11:14:16 +03:00
/*
* This routine traverses the page_cgroups in the given list and drops them all .
* This routine ignores page_cgroup - > ref_cnt .
* * And * this routine doesn ' t reclaim the page itself , it just removes the page_cgroup .
*/
# define FORCE_UNCHARGE_BATCH (128)
2008-03-05 01:29:09 +03:00
/*
 * Uncharge every page_cgroup on one LRU list of @mz.
 *
 * @mem:    not referenced in the body of this function.
 * @mz:     per-zone info whose list is drained; its lru_lock guards the list.
 * @active: non-zero selects mz->active_list, zero selects mz->inactive_list.
 *
 * The loop runs until the selected list is empty.
 */
static void mem_cgroup_force_empty_list ( struct mem_cgroup * mem ,
2008-02-07 11:14:39 +03:00
struct mem_cgroup_per_zone * mz ,
int active )
2008-02-07 11:14:16 +03:00
{
struct page_cgroup * pc ;
struct page * page ;
2008-03-05 01:29:15 +03:00
int count = FORCE_UNCHARGE_BATCH ;
2008-02-07 11:14:16 +03:00
unsigned long flags ;
2008-02-07 11:14:39 +03:00
struct list_head * list ;
if ( active )
list = & mz - > active_list ;
else
list = & mz - > inactive_list ;
2008-02-07 11:14:16 +03:00
2008-02-07 11:14:39 +03:00
spin_lock_irqsave ( & mz - > lru_lock , flags ) ;
2008-03-05 01:29:15 +03:00
while ( ! list_empty ( list ) ) {
2008-02-07 11:14:16 +03:00
/* Work from the tail of the list. */
pc = list_entry ( list - > prev , struct page_cgroup , lru ) ;
page = pc - > page ;
2008-03-05 01:29:15 +03:00
/*
 * Hold a page reference across the uncharge, and drop lru_lock first:
 * mem_cgroup_uncharge_page() is called with lru_lock released.
 */
get_page ( page ) ;
spin_unlock_irqrestore ( & mz - > lru_lock , flags ) ;
mem_cgroup_uncharge_page ( page ) ;
put_page ( page ) ;
/* Offer to reschedule once every FORCE_UNCHARGE_BATCH pages. */
if ( - - count < = 0 ) {
count = FORCE_UNCHARGE_BATCH ;
cond_resched ( ) ;
2008-03-05 01:29:11 +03:00
}
2008-03-05 01:29:15 +03:00
/* Re-take the lock before looking at the list again. */
spin_lock_irqsave ( & mz - > lru_lock , flags ) ;
2008-02-07 11:14:16 +03:00
}
2008-02-07 11:14:39 +03:00
spin_unlock_irqrestore ( & mz - > lru_lock , flags ) ;
2008-02-07 11:14:16 +03:00
}
/*
* make mem_cgroup ' s charge to be 0 if there is no task .
* This enables deleting this mem_cgroup .
*/
2008-03-05 01:29:10 +03:00
static int mem_cgroup_force_empty ( struct mem_cgroup * mem )
2008-02-07 11:14:16 +03:00
{
int ret = - EBUSY ;
2008-02-07 11:14:38 +03:00
int node , zid ;
2008-03-05 01:29:09 +03:00
2008-04-05 01:29:59 +04:00
if ( mem_cgroup_subsys . disabled )
return 0 ;
2008-02-07 11:14:16 +03:00
css_get ( & mem - > css ) ;
/*
* page reclaim code ( kswapd etc . . ) will move pages between
2008-03-05 01:29:09 +03:00
* active_list < - > inactive_list while we don ' t take a lock .
2008-02-07 11:14:16 +03:00
* So , we have to do loop here until all lists are empty .
*/
2008-02-07 11:14:38 +03:00
while ( mem - > res . usage > 0 ) {
2008-02-07 11:14:16 +03:00
if ( atomic_read ( & mem - > css . cgroup - > count ) > 0 )
goto out ;
2008-02-07 11:14:38 +03:00
for_each_node_state ( node , N_POSSIBLE )
for ( zid = 0 ; zid < MAX_NR_ZONES ; zid + + ) {
struct mem_cgroup_per_zone * mz ;
mz = mem_cgroup_zoneinfo ( mem , node , zid ) ;
/* drop all page_cgroup in active_list */
2008-02-07 11:14:39 +03:00
mem_cgroup_force_empty_list ( mem , mz , 1 ) ;
2008-02-07 11:14:38 +03:00
/* drop all page_cgroup in inactive_list */
2008-02-07 11:14:39 +03:00
mem_cgroup_force_empty_list ( mem , mz , 0 ) ;
2008-02-07 11:14:38 +03:00
}
2008-02-07 11:14:16 +03:00
}
ret = 0 ;
out :
css_put ( & mem - > css ) ;
return ret ;
}
2008-03-05 01:29:10 +03:00
static int mem_cgroup_write_strategy ( char * buf , unsigned long long * tmp )
2008-02-07 11:13:57 +03:00
{
* tmp = memparse ( buf , & buf ) ;
if ( * buf ! = ' \0 ' )
return - EINVAL ;
/*
* Round up the value to the closest page size
*/
* tmp = ( ( * tmp + PAGE_SIZE - 1 ) > > PAGE_SHIFT ) < < PAGE_SHIFT ;
return 0 ;
}
2008-04-29 11:59:58 +04:00
static u64 mem_cgroup_read ( struct cgroup * cont , struct cftype * cft )
2008-02-07 11:13:50 +03:00
{
2008-04-29 11:59:58 +04:00
return res_counter_read_u64 ( & mem_cgroup_from_cont ( cont ) - > res ,
cft - > private ) ;
2008-02-07 11:13:50 +03:00
}
/*
 * write handler: pass the user buffer to res_counter_write() for the
 * res_counter member selected by cft->private, using
 * mem_cgroup_write_strategy() to parse the value.
 */
static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

	return res_counter_write(&memcg->res, cft->private, userbuf,
				 nbytes, ppos, mem_cgroup_write_strategy);
}
2008-04-29 12:00:21 +04:00
static int mem_cgroup_reset ( struct cgroup * cont , unsigned int event )
2008-04-29 12:00:17 +04:00
{
struct mem_cgroup * mem ;
mem = mem_cgroup_from_cont ( cont ) ;
2008-04-29 12:00:21 +04:00
switch ( event ) {
case RES_MAX_USAGE :
res_counter_reset_max ( & mem - > res ) ;
break ;
case RES_FAILCNT :
res_counter_reset_failcnt ( & mem - > res ) ;
break ;
}
2008-04-29 12:00:20 +04:00
return 0 ;
2008-04-29 12:00:17 +04:00
}
2008-04-29 12:00:20 +04:00
/*
 * trigger handler: run mem_cgroup_force_empty() on the mem_cgroup that
 * owns @cont and return its result.  @event is not used.
 */
static int mem_force_empty_write ( struct cgroup * cont , unsigned int event )
2008-02-07 11:14:16 +03:00
{
2008-04-29 12:00:20 +04:00
return mem_cgroup_force_empty ( mem_cgroup_from_cont ( cont ) ) ;
2008-02-07 11:14:16 +03:00
}
2008-02-07 11:14:25 +03:00
/*
 * Presentation table for the per-cgroup statistics, indexed by
 * enum mem_cgroup_stat_index.
 */
static const struct mem_cgroup_stat_desc {
const char * msg ; /* label reported for the statistic */
u64 unit ; /* multiplier applied to the raw counter before reporting */
} mem_cgroup_stat_desc [ ] = {
[ MEM_CGROUP_STAT_CACHE ] = { " cache " , PAGE_SIZE , } ,
[ MEM_CGROUP_STAT_RSS ] = { " rss " , PAGE_SIZE , } ,
} ;
2008-04-29 12:00:02 +04:00
/*
 * read_map handler: report the cgroup's statistics through cb->fill().
 *
 * Emits one entry per counter slot in stat->cpustat[0].count (value
 * scaled by the matching mem_cgroup_stat_desc unit), followed by the
 * active and inactive page totals converted to bytes.  Always returns 0.
 */
static int mem_control_stat_show ( struct cgroup * cont , struct cftype * cft ,
struct cgroup_map_cb * cb )
2008-02-07 11:14:25 +03:00
{
struct mem_cgroup * mem_cont = mem_cgroup_from_cont ( cont ) ;
struct mem_cgroup_stat * stat = & mem_cont - > stat ;
int i ;
/* The same index i selects both the counter and its descriptor entry. */
for ( i = 0 ; i < ARRAY_SIZE ( stat - > cpustat [ 0 ] . count ) ; i + + ) {
s64 val ;
val = mem_cgroup_read_stat ( stat , i ) ;
val * = mem_cgroup_stat_desc [ i ] . unit ;
2008-04-29 12:00:02 +04:00
cb - > fill ( cb , mem_cgroup_stat_desc [ i ] . msg , val ) ;
2008-02-07 11:14:25 +03:00
}
2008-02-07 11:14:31 +03:00
/* showing # of active pages */
{
unsigned long active , inactive ;
inactive = mem_cgroup_get_all_zonestat ( mem_cont ,
MEM_CGROUP_ZSTAT_INACTIVE ) ;
active = mem_cgroup_get_all_zonestat ( mem_cont ,
MEM_CGROUP_ZSTAT_ACTIVE ) ;
2008-04-29 12:00:02 +04:00
/* Zone statistics are multiplied by PAGE_SIZE before being reported. */
cb - > fill ( cb , " active " , ( active ) * PAGE_SIZE ) ;
cb - > fill ( cb , " inactive " , ( inactive ) * PAGE_SIZE ) ;
2008-02-07 11:14:31 +03:00
}
2008-02-07 11:14:25 +03:00
return 0 ;
}
2008-02-07 11:13:50 +03:00
/*
 * Control files of the memory cgroup, registered by mem_cgroup_populate().
 * For the res_counter backed files, .private selects the res_counter
 * member that mem_cgroup_read() / mem_cgroup_write() operate on and that
 * mem_cgroup_reset() resets.
 */
static struct cftype mem_cgroup_files [ ] = {
{
2008-02-07 11:13:57 +03:00
/* read-only: only a read handler is set */
. name = " usage_in_bytes " ,
2008-02-07 11:13:50 +03:00
. private = RES_USAGE ,
2008-04-29 11:59:58 +04:00
. read_u64 = mem_cgroup_read ,
2008-02-07 11:13:50 +03:00
} ,
2008-04-29 12:00:17 +04:00
{
/* readable; the trigger resets the recorded maximum */
. name = " max_usage_in_bytes " ,
. private = RES_MAX_USAGE ,
2008-04-29 12:00:21 +04:00
. trigger = mem_cgroup_reset ,
2008-04-29 12:00:17 +04:00
. read_u64 = mem_cgroup_read ,
} ,
2008-02-07 11:13:50 +03:00
{
2008-02-07 11:13:57 +03:00
/* readable and writable */
. name = " limit_in_bytes " ,
2008-02-07 11:13:50 +03:00
. private = RES_LIMIT ,
. write = mem_cgroup_write ,
2008-04-29 11:59:58 +04:00
. read_u64 = mem_cgroup_read ,
2008-02-07 11:13:50 +03:00
} ,
{
/* readable; the trigger resets the failure count */
. name = " failcnt " ,
. private = RES_FAILCNT ,
2008-04-29 12:00:21 +04:00
. trigger = mem_cgroup_reset ,
2008-04-29 11:59:58 +04:00
. read_u64 = mem_cgroup_read ,
2008-02-07 11:13:50 +03:00
} ,
2008-02-07 11:14:16 +03:00
{
/* trigger only: runs mem_cgroup_force_empty() */
. name = " force_empty " ,
2008-04-29 12:00:20 +04:00
. trigger = mem_force_empty_write ,
2008-02-07 11:14:16 +03:00
} ,
2008-02-07 11:14:25 +03:00
{
/* key/value map produced by mem_control_stat_show() */
. name = " stat " ,
2008-04-29 12:00:02 +04:00
. read_map = mem_control_stat_show ,
2008-02-07 11:14:25 +03:00
} ,
2008-02-07 11:13:50 +03:00
} ;
2008-02-07 11:14:31 +03:00
static int alloc_mem_cgroup_per_zone_info ( struct mem_cgroup * mem , int node )
{
struct mem_cgroup_per_node * pn ;
2008-02-07 11:14:38 +03:00
struct mem_cgroup_per_zone * mz ;
2008-04-09 04:41:54 +04:00
int zone , tmp = node ;
2008-02-07 11:14:38 +03:00
/*
* This routine is called against possible nodes .
* But it ' s BUG to call kmalloc ( ) against offline node .
*
* TODO : this routine can waste much memory for nodes which will
* never be onlined . It ' s better to use memory hotplug callback
* function .
*/
2008-04-09 04:41:54 +04:00
if ( ! node_state ( node , N_NORMAL_MEMORY ) )
tmp = - 1 ;
pn = kmalloc_node ( sizeof ( * pn ) , GFP_KERNEL , tmp ) ;
2008-02-07 11:14:31 +03:00
if ( ! pn )
return 1 ;
2008-02-07 11:14:38 +03:00
2008-02-07 11:14:31 +03:00
mem - > info . nodeinfo [ node ] = pn ;
memset ( pn , 0 , sizeof ( * pn ) ) ;
2008-02-07 11:14:38 +03:00
for ( zone = 0 ; zone < MAX_NR_ZONES ; zone + + ) {
mz = & pn - > zoneinfo [ zone ] ;
INIT_LIST_HEAD ( & mz - > active_list ) ;
INIT_LIST_HEAD ( & mz - > inactive_list ) ;
2008-02-07 11:14:39 +03:00
spin_lock_init ( & mz - > lru_lock ) ;
2008-02-07 11:14:38 +03:00
}
2008-02-07 11:14:31 +03:00
return 0 ;
}
2008-02-07 11:14:38 +03:00
/*
 * Free the per-node info of @mem for @node with kfree().  The pointer
 * stored in mem->info.nodeinfo[node] is not cleared afterwards.
 */
static void free_mem_cgroup_per_zone_info ( struct mem_cgroup * mem , int node )
{
kfree ( mem - > info . nodeinfo [ node ] ) ;
}
2008-02-07 11:13:50 +03:00
static struct cgroup_subsys_state *
mem_cgroup_create ( struct cgroup_subsys * ss , struct cgroup * cont )
{
struct mem_cgroup * mem ;
2008-02-07 11:14:31 +03:00
int node ;
2008-02-07 11:13:50 +03:00
2008-04-29 12:00:19 +04:00
if ( unlikely ( ( cont - > parent ) = = NULL ) ) {
2008-02-07 11:13:51 +03:00
mem = & init_mem_cgroup ;
2008-04-29 12:00:19 +04:00
page_cgroup_cache = KMEM_CACHE ( page_cgroup , SLAB_PANIC ) ;
} else {
2008-02-07 11:13:51 +03:00
mem = kzalloc ( sizeof ( struct mem_cgroup ) , GFP_KERNEL ) ;
2008-04-29 12:00:19 +04:00
}
2008-02-07 11:13:51 +03:00
if ( mem = = NULL )
2008-02-24 02:24:14 +03:00
return ERR_PTR ( - ENOMEM ) ;
2008-02-07 11:13:50 +03:00
res_counter_init ( & mem - > res ) ;
2008-02-07 11:14:38 +03:00
2008-02-07 11:14:31 +03:00
memset ( & mem - > info , 0 , sizeof ( mem - > info ) ) ;
for_each_node_state ( node , N_POSSIBLE )
if ( alloc_mem_cgroup_per_zone_info ( mem , node ) )
goto free_out ;
2008-02-07 11:13:50 +03:00
return & mem - > css ;
2008-02-07 11:14:31 +03:00
free_out :
for_each_node_state ( node , N_POSSIBLE )
2008-02-07 11:14:38 +03:00
free_mem_cgroup_per_zone_info ( mem , node ) ;
2008-02-07 11:14:31 +03:00
if ( cont - > parent ! = NULL )
kfree ( mem ) ;
2008-02-24 02:24:14 +03:00
return ERR_PTR ( - ENOMEM ) ;
2008-02-07 11:13:50 +03:00
}
2008-02-07 11:14:28 +03:00
/*
 * cgroup "pre_destroy" callback: run mem_cgroup_force_empty() on the
 * mem_cgroup of @cont.  Its return value is ignored.
 */
static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
				   struct cgroup *cont)
{
	mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
}
2008-02-07 11:13:50 +03:00
static void mem_cgroup_destroy ( struct cgroup_subsys * ss ,
struct cgroup * cont )
{
2008-02-07 11:14:31 +03:00
int node ;
struct mem_cgroup * mem = mem_cgroup_from_cont ( cont ) ;
for_each_node_state ( node , N_POSSIBLE )
2008-02-07 11:14:38 +03:00
free_mem_cgroup_per_zone_info ( mem , node ) ;
2008-02-07 11:14:31 +03:00
2008-02-07 11:13:50 +03:00
kfree ( mem_cgroup_from_cont ( cont ) ) ;
}
static int mem_cgroup_populate ( struct cgroup_subsys * ss ,
struct cgroup * cont )
{
2008-04-05 01:29:59 +04:00
if ( mem_cgroup_subsys . disabled )
return 0 ;
2008-02-07 11:13:50 +03:00
return cgroup_add_files ( cont , ss , mem_cgroup_files ,
ARRAY_SIZE ( mem_cgroup_files ) ) ;
}
2008-02-07 11:13:54 +03:00
static void mem_cgroup_move_task ( struct cgroup_subsys * ss ,
struct cgroup * cont ,
struct cgroup * old_cont ,
struct task_struct * p )
{
struct mm_struct * mm ;
struct mem_cgroup * mem , * old_mem ;
2008-04-05 01:29:59 +04:00
if ( mem_cgroup_subsys . disabled )
return ;
2008-02-07 11:13:54 +03:00
mm = get_task_mm ( p ) ;
if ( mm = = NULL )
return ;
mem = mem_cgroup_from_cont ( cont ) ;
old_mem = mem_cgroup_from_cont ( old_cont ) ;
if ( mem = = old_mem )
goto out ;
/*
* Only thread group leaders are allowed to migrate , the mm_struct is
* in effect owned by the leader
*/
2008-03-20 03:00:45 +03:00
if ( ! thread_group_leader ( p ) )
2008-02-07 11:13:54 +03:00
goto out ;
out :
mmput ( mm ) ;
}
2008-02-07 11:13:50 +03:00
/*
 * Subsystem descriptor for the memory controller: names the subsystem
 * and wires the callbacks defined in this file into the cgroup core.
 */
struct cgroup_subsys mem_cgroup_subsys = {
. name = " memory " ,
. subsys_id = mem_cgroup_subsys_id ,
. create = mem_cgroup_create ,
2008-02-07 11:14:28 +03:00
/* force-empties the cgroup before it is destroyed */
. pre_destroy = mem_cgroup_pre_destroy ,
2008-02-07 11:13:50 +03:00
. destroy = mem_cgroup_destroy ,
. populate = mem_cgroup_populate ,
2008-02-07 11:13:54 +03:00
. attach = mem_cgroup_move_task ,
2008-02-07 11:14:31 +03:00
. early_init = 0 ,
2008-02-07 11:13:50 +03:00
} ;