2008-10-19 07:28:16 +04:00
# include <linux/mm.h>
# include <linux/mmzone.h>
# include <linux/bootmem.h>
# include <linux/bit_spinlock.h>
# include <linux/page_cgroup.h>
# include <linux/hash.h>
2008-10-23 01:15:05 +04:00
# include <linux/slab.h>
2008-10-19 07:28:16 +04:00
# include <linux/memory.h>
2008-10-23 01:14:58 +04:00
# include <linux/vmalloc.h>
2008-10-23 01:15:05 +04:00
# include <linux/cgroup.h>
2009-01-08 05:07:58 +03:00
# include <linux/swapops.h>
2008-10-19 07:28:16 +04:00
/*
 * Put one page_cgroup descriptor into its initial state: no flags set,
 * no owning mem_cgroup, a back-pointer to the struct page of @pfn and an
 * empty lru list head.
 *
 * @pc:  descriptor to initialise
 * @pfn: page frame number the descriptor describes
 */
static void __meminit
__init_page_cgroup ( struct page_cgroup * pc , unsigned long pfn )
{
pc - > flags = 0 ;
pc - > mem_cgroup = NULL ;
/* remember which struct page this descriptor belongs to */
pc - > page = pfn_to_page ( pfn ) ;
memcg: synchronized LRU
A big patch for changing memcg's LRU semantics.
Now,
- page_cgroup is linked to mem_cgroup's its own LRU (per zone).
- LRU of page_cgroup is not synchronous with global LRU.
- page and page_cgroup is one-to-one and statically allocated.
- To find page_cgroup is on what LRU, you have to check pc->mem_cgroup as
- lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);
- SwapCache is handled.
And, when we handle LRU list of page_cgroup, we do following.
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc); .....................(1)
mz = page_cgroup_zoneinfo(pc);
spin_lock(&mz->lru_lock);
.....add to LRU
spin_unlock(&mz->lru_lock);
unlock_page_cgroup(pc);
But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.
This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
mem_cgroup_add/remove/etc_lru() {
pc = lookup_page_cgroup(page);
mz = page_cgroup_zoneinfo(pc);
if (PageCgroupUsed(pc)) {
....add to LRU
}
spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
1. When pc->mem_cgroup can be modified.
- at charge.
- at account_move().
2. at charge
the PCG_USED bit is not set before pc->mem_cgroup is fixed.
3. at account_move()
the page is isolated and not on LRU.
Pros.
- easy for maintenance.
- memcg can make use of laziness of pagevec.
- we don't have to duplicated LRU/Active/Unevictable bit in page_cgroup.
- LRU status of memcg will be synchronized with global LRU's one.
- # of locks are reduced.
- account_move() is simplified very much.
Cons.
- may increase cost of LRU rotation.
(no impact if memcg is not configured.)
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-01-08 05:08:01 +03:00
/* start with an empty list head; the descriptor is on no LRU list yet */
INIT_LIST_HEAD ( & pc - > lru ) ;
2008-10-19 07:28:16 +04:00
}
static unsigned long total_usage ;
# if !defined(CONFIG_SPARSEMEM)
2008-11-22 20:33:24 +03:00
/*
 * Non-SPARSEMEM variant: start the node without a page_cgroup table.
 * The pointer stays NULL until alloc_node_page_cgroup() installs one.
 */
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	pgdat->node_page_cgroup = NULL;
}
struct page_cgroup * lookup_page_cgroup ( struct page * page )
{
unsigned long pfn = page_to_pfn ( page ) ;
unsigned long offset ;
struct page_cgroup * base ;
base = NODE_DATA ( page_to_nid ( page ) ) - > node_page_cgroup ;
if ( unlikely ( ! base ) )
return NULL ;
offset = pfn - NODE_DATA ( page_to_nid ( page ) ) - > node_start_pfn ;
return base + offset ;
}
/*
 * Allocate and initialise the page_cgroup table of node @nid from bootmem.
 *
 * One descriptor is created for every pfn the node spans. A node spanning
 * no pages is skipped and counts as success.
 *
 * Returns 0 on success, -ENOMEM when the bootmem allocation fails.
 */
static int __init alloc_node_page_cgroup(int nid)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	unsigned long first_pfn = pgdat->node_start_pfn;
	unsigned long count = pgdat->node_spanned_pages;
	unsigned long bytes, i;
	struct page_cgroup *table;

	if (count == 0)
		return 0;

	bytes = sizeof(struct page_cgroup) * count;
	table = __alloc_bootmem_node_nopanic(pgdat, bytes, PAGE_SIZE,
					     __pa(MAX_DMA_ADDRESS));
	if (table == NULL)
		return -ENOMEM;

	for (i = 0; i < count; i++)
		__init_page_cgroup(&table[i], first_pfn + i);

	pgdat->node_page_cgroup = table;
	/* accounted for the boot-time usage message */
	total_usage += bytes;
	return 0;
}
2009-06-12 11:33:53 +04:00
/*
 * Boot-time setup for the non-SPARSEMEM case: allocate a page_cgroup table
 * for every online node. Does nothing when the memory cgroup is disabled.
 * Panics if any node's table cannot be allocated.
 */
void __init page_cgroup_init_flatmem(void)
{
	int nid, err;

	if (mem_cgroup_disabled())
		return;

	for_each_online_node(nid) {
		err = alloc_node_page_cgroup(nid);
		if (err)
			goto fail;
	}
	/* total_usage is unsigned long, so it must be printed with %lu */
	printk(KERN_INFO " allocated %lu bytes of page_cgroup \n ", total_usage);
	printk(KERN_INFO " please try 'cgroup_disable=memory' option if you "
	       " don't want memory cgroups \n ");
	return;
fail:
	printk(KERN_CRIT " allocation of page_cgroup failed. \n ");
	printk(KERN_CRIT " please try 'cgroup_disable=memory' boot option \n ");
	panic(" Out of memory ");
}
# else /* CONFIG_FLAT_NODE_MEM_MAP */
struct page_cgroup * lookup_page_cgroup ( struct page * page )
{
unsigned long pfn = page_to_pfn ( page ) ;
struct mem_section * section = __pfn_to_section ( pfn ) ;
2009-06-18 03:26:34 +04:00
if ( ! section - > page_cgroup )
return NULL ;
2008-10-19 07:28:16 +04:00
return section - > page_cgroup + pfn ;
}
2008-11-22 20:33:24 +03:00
/* Runs only once slab is available: tables come from kmalloc/vmalloc, not bootmem. */
2009-01-07 01:39:43 +03:00
/*
 * Set up the page_cgroup table of the memory section that starts at @pfn.
 *
 * If the section has no table yet, one is allocated: kmalloc first, vmalloc
 * as fallback, node-local when the node is in the N_HIGH_MEMORY state.
 * If a table exists it is reused, and is re-initialised only when its first
 * descriptor no longer points at the current struct page of @pfn (the
 * address of memmap may have changed).
 *
 * Returns 0 on success, -ENOMEM when no table could be allocated.
 */
static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
	struct mem_section *section = __pfn_to_section(pfn);
	struct page_cgroup *table;
	unsigned long bytes = 0;	/* stays 0 when an existing table is reused */
	int node, i;

	if (section->page_cgroup) {
		/* undo the "- pfn" bias to get the real start of the table */
		table = section->page_cgroup + pfn;
		/* memmap did not move: nothing to re-initialise */
		if (table->page == pfn_to_page(pfn))
			return 0;
	} else {
		node = page_to_nid(pfn_to_page(pfn));
		bytes = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
		VM_BUG_ON(!slab_is_available());
		if (node_state(node, N_HIGH_MEMORY)) {
			table = kmalloc_node(bytes,
					     GFP_KERNEL | __GFP_NOWARN, node);
			if (!table)
				table = vmalloc_node(bytes, node);
		} else {
			table = kmalloc(bytes, GFP_KERNEL | __GFP_NOWARN);
			if (!table)
				table = vmalloc(bytes);
		}
	}

	if (!table) {
		printk(KERN_ERR " page cgroup allocation failure \n ");
		return -ENOMEM;
	}

	for (i = 0; i < PAGES_PER_SECTION; i++)
		__init_page_cgroup(table + i, pfn + i);

	/* store biased by pfn so lookups can index with the absolute pfn */
	section->page_cgroup = table - pfn;
	total_usage += bytes;
	return 0;
}
# ifdef CONFIG_MEMORY_HOTPLUG
void __free_page_cgroup ( unsigned long pfn )
{
struct mem_section * ms ;
struct page_cgroup * base ;
ms = __pfn_to_section ( pfn ) ;
if ( ! ms | | ! ms - > page_cgroup )
return ;
base = ms - > page_cgroup + pfn ;
2008-10-23 01:15:05 +04:00
if ( is_vmalloc_addr ( base ) ) {
2008-10-19 07:28:16 +04:00
vfree ( base ) ;
2008-10-23 01:15:05 +04:00
ms - > page_cgroup = NULL ;
} else {
struct page * page = virt_to_page ( base ) ;
if ( ! PageReserved ( page ) ) { /* Is bootmem ? */
kfree ( base ) ;
ms - > page_cgroup = NULL ;
}
}
2008-10-19 07:28:16 +04:00
}
2008-11-22 20:33:24 +03:00
/*
 * Prepare page_cgroup tables for every present section overlapping
 * [start_pfn, start_pfn + nr_pages). The range is widened to section
 * boundaries. @nid is accepted but not used here.
 *
 * On the first failure, __free_page_cgroup() is called for every section in
 * the range and -ENOMEM is returned; otherwise returns 0.
 */
int __meminit online_page_cgroup(unsigned long start_pfn,
				 unsigned long nr_pages,
				 int nid)
{
	unsigned long first = start_pfn & ~(PAGES_PER_SECTION - 1);
	unsigned long limit = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
	unsigned long pfn;
	int err = 0;

	for (pfn = first; pfn < limit; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		err = init_section_page_cgroup(pfn);
		if (err)
			break;
	}
	if (!err)
		return 0;

	/* rollback */
	for (pfn = first; pfn < limit; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);

	return -ENOMEM;
}
2008-11-22 20:33:24 +03:00
/*
 * Drop the page_cgroup tables of every section overlapping
 * [start_pfn, start_pfn + nr_pages), widened to section boundaries.
 * @nid is accepted but not used here. Always returns 0.
 */
int __meminit offline_page_cgroup(unsigned long start_pfn,
				  unsigned long nr_pages, int nid)
{
	unsigned long pfn = start_pfn & ~(PAGES_PER_SECTION - 1);
	unsigned long limit = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);

	while (pfn < limit) {
		__free_page_cgroup(pfn);
		pfn += PAGES_PER_SECTION;
	}
	return 0;
}
2008-11-22 20:33:24 +03:00
/*
 * Memory hotplug notifier.
 *
 * MEM_GOING_ONLINE allocates tables for the incoming range and MEM_OFFLINE
 * frees them. Every other action is ignored.
 *
 * Returns NOTIFY_OK, or the notifier encoding of the errno reported by
 * online_page_cgroup().
 */
static int __meminit page_cgroup_callback(struct notifier_block *self,
					  unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int err = 0;

	if (action == MEM_GOING_ONLINE)
		err = online_page_cgroup(mn->start_pfn,
					 mn->nr_pages, mn->status_change_nid);
	else if (action == MEM_OFFLINE)
		offline_page_cgroup(mn->start_pfn,
				    mn->nr_pages, mn->status_change_nid);
	/*
	 * MEM_CANCEL_ONLINE, MEM_GOING_OFFLINE, MEM_ONLINE and
	 * MEM_CANCEL_OFFLINE need no work.
	 */

	return err ? notifier_from_errno(err) : NOTIFY_OK;
}
# endif
/*
 * Boot-time setup for the SPARSEMEM case: build a page_cgroup table for
 * every present section below max_pfn, then register the memory hotplug
 * notifier. Does nothing when the memory cgroup is disabled.
 * Panics if a table cannot be allocated.
 */
void __init page_cgroup_init(void)
{
	unsigned long pfn;
	int fail = 0;

	if (mem_cgroup_disabled())
		return;

	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn);
	}
	if (fail) {
		printk(KERN_CRIT " try 'cgroup_disable=memory' boot option \n ");
		panic(" Out of memory ");
	} else {
		hotplug_memory_notifier(page_cgroup_callback, 0);
	}
	/* total_usage is unsigned long, so it must be printed with %lu */
	printk(KERN_INFO " allocated %lu bytes of page_cgroup \n ", total_usage);
	printk(KERN_INFO " please try 'cgroup_disable=memory' option if you don't "
	       " want memory cgroups \n ");
}
2008-11-22 20:33:24 +03:00
/*
 * SPARSEMEM variant: tables hang off memory sections, so there is no
 * per-node state to set up.
 */
void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
}
# endif
2009-01-08 05:07:58 +03:00
# ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Held by swap_cgroup_swapon()/swap_cgroup_swapoff() while they set up or tear down an entry. */
static DEFINE_MUTEX ( swap_cgroup_mutex ) ;
/*
 * Book-keeping for one swap type: an array of pages that together hold one
 * struct swap_cgroup record per swap offset.
 */
struct swap_cgroup_ctrl {
struct page * * map ; /* array of 'length' page pointers holding the records */
unsigned long length ; /* number of entries in map */
2010-03-15 07:34:57 +03:00
spinlock_t lock ; /* taken around id updates in swap_cgroup_record()/swap_cgroup_cmpxchg() */
2009-01-08 05:07:58 +03:00
} ;
/* one control entry per swap type, indexed by swp_type() */
struct swap_cgroup_ctrl swap_cgroup_ctrl [ MAX_SWAPFILES ] ;
/* Record kept per swap entry: the id of the owning mem_cgroup. */
struct swap_cgroup {
2009-04-03 03:57:45 +04:00
unsigned short id ;
2009-01-08 05:07:58 +03:00
} ;
/* number of swap_cgroup records that fit in one page */
# define SC_PER_PAGE (PAGE_SIZE / sizeof(struct swap_cgroup))
/*
 * Mask giving a record's position inside its page. NOTE(review): this equals
 * offset % SC_PER_PAGE only while SC_PER_PAGE is a power of two.
 */
# define SC_POS_MASK (SC_PER_PAGE - 1)
/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache. At swap_free(), this is accessed directly from swap.
 *
 * This means,
 *  - we have no race in "exchange" when we're accessed via SwapCache because
 *    SwapCache (and its swp_entry) is under lock.
 *  - When called via swap_free(), there is no user of this entry and no race.
 * Then, by that reasoning, "exchange" would not need a lock.
 *
 * NOTE: despite the reasoning above, swap_cgroup_record() and
 * swap_cgroup_cmpxchg() do serialize their updates with the per-type
 * ctrl->lock spinlock.
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */
/*
* allocate buffer for swap_cgroup .
*/
static int swap_cgroup_prepare ( int type )
{
struct page * page ;
struct swap_cgroup_ctrl * ctrl ;
unsigned long idx , max ;
ctrl = & swap_cgroup_ctrl [ type ] ;
for ( idx = 0 ; idx < ctrl - > length ; idx + + ) {
page = alloc_page ( GFP_KERNEL | __GFP_ZERO ) ;
if ( ! page )
goto not_enough_page ;
ctrl - > map [ idx ] = page ;
}
return 0 ;
not_enough_page :
max = idx ;
for ( idx = 0 ; idx < max ; idx + + )
__free_page ( ctrl - > map [ idx ] ) ;
return - ENOMEM ;
}
2010-03-11 02:22:17 +03:00
/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * The stored id is replaced by @new only if it currently equals @old.
 * The compare and the store are done under the per-type ctrl->lock.
 *
 * Returns old id at success, 0 at failure.
 * (There is no mem_cgroup using 0 as its id)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
				   unsigned short old, unsigned short new)
{
	unsigned long offset = swp_offset(ent);
	struct swap_cgroup_ctrl *ctrl = &swap_cgroup_ctrl[swp_type(ent)];
	struct swap_cgroup *slot;
	unsigned long flags;
	unsigned short seen;

	/* pick the page holding the record, then the record within it */
	slot = page_address(ctrl->map[offset / SC_PER_PAGE]);
	slot += offset & SC_POS_MASK;

	spin_lock_irqsave(&ctrl->lock, flags);
	seen = slot->id;
	if (seen == old)
		slot->id = new;
	else
		seen = 0;
	spin_unlock_irqrestore(&ctrl->lock, flags);

	return seen;
}
2009-01-08 05:07:58 +03:00
/**
* swap_cgroup_record - record mem_cgroup for this swp_entry .
* @ ent : swap entry to be recorded into
* @ mem : mem_cgroup to be recorded
*
2009-04-03 03:57:45 +04:00
* Returns old value at success , 0 at failure .
* ( Of course , old value can be 0. )
2009-01-08 05:07:58 +03:00
*/
2009-04-03 03:57:45 +04:00
unsigned short swap_cgroup_record ( swp_entry_t ent , unsigned short id )
2009-01-08 05:07:58 +03:00
{
int type = swp_type ( ent ) ;
unsigned long offset = swp_offset ( ent ) ;
unsigned long idx = offset / SC_PER_PAGE ;
unsigned long pos = offset & SC_POS_MASK ;
struct swap_cgroup_ctrl * ctrl ;
struct page * mappage ;
struct swap_cgroup * sc ;
2009-04-03 03:57:45 +04:00
unsigned short old ;
2010-03-15 07:34:57 +03:00
unsigned long flags ;
2009-01-08 05:07:58 +03:00
ctrl = & swap_cgroup_ctrl [ type ] ;
mappage = ctrl - > map [ idx ] ;
sc = page_address ( mappage ) ;
sc + = pos ;
2010-03-15 07:34:57 +03:00
spin_lock_irqsave ( & ctrl - > lock , flags ) ;
old = sc - > id ;
sc - > id = id ;
spin_unlock_irqrestore ( & ctrl - > lock , flags ) ;
2009-01-08 05:07:58 +03:00
return old ;
}
/**
* lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
* @ ent : swap entry to be looked up .
*
2009-04-03 03:57:45 +04:00
* Returns CSS ID of mem_cgroup at success . 0 at failure . ( 0 is invalid ID )
2009-01-08 05:07:58 +03:00
*/
2009-04-03 03:57:45 +04:00
unsigned short lookup_swap_cgroup ( swp_entry_t ent )
2009-01-08 05:07:58 +03:00
{
int type = swp_type ( ent ) ;
unsigned long offset = swp_offset ( ent ) ;
unsigned long idx = offset / SC_PER_PAGE ;
unsigned long pos = offset & SC_POS_MASK ;
struct swap_cgroup_ctrl * ctrl ;
struct page * mappage ;
struct swap_cgroup * sc ;
2009-04-03 03:57:45 +04:00
unsigned short ret ;
2009-01-08 05:07:58 +03:00
ctrl = & swap_cgroup_ctrl [ type ] ;
mappage = ctrl - > map [ idx ] ;
sc = page_address ( mappage ) ;
sc + = pos ;
2009-04-03 03:57:45 +04:00
ret = sc - > id ;
2009-01-08 05:07:58 +03:00
return ret ;
}
int swap_cgroup_swapon ( int type , unsigned long max_pages )
{
void * array ;
unsigned long array_size ;
unsigned long length ;
struct swap_cgroup_ctrl * ctrl ;
if ( ! do_swap_account )
return 0 ;
length = ( ( max_pages / SC_PER_PAGE ) + 1 ) ;
array_size = length * sizeof ( void * ) ;
array = vmalloc ( array_size ) ;
if ( ! array )
goto nomem ;
memset ( array , 0 , array_size ) ;
ctrl = & swap_cgroup_ctrl [ type ] ;
mutex_lock ( & swap_cgroup_mutex ) ;
ctrl - > length = length ;
ctrl - > map = array ;
2010-03-15 07:34:57 +03:00
spin_lock_init ( & ctrl - > lock ) ;
2009-01-08 05:07:58 +03:00
if ( swap_cgroup_prepare ( type ) ) {
/* memory shortage */
ctrl - > map = NULL ;
ctrl - > length = 0 ;
vfree ( array ) ;
mutex_unlock ( & swap_cgroup_mutex ) ;
goto nomem ;
}
mutex_unlock ( & swap_cgroup_mutex ) ;
return 0 ;
nomem :
printk ( KERN_INFO " couldn't allocate enough memory for swap_cgroup. \n " ) ;
printk ( KERN_INFO
" swap_cgroup can be disabled by noswapaccount boot option \n " ) ;
return - ENOMEM ;
}
void swap_cgroup_swapoff ( int type )
{
int i ;
struct swap_cgroup_ctrl * ctrl ;
if ( ! do_swap_account )
return ;
mutex_lock ( & swap_cgroup_mutex ) ;
ctrl = & swap_cgroup_ctrl [ type ] ;
if ( ctrl - > map ) {
for ( i = 0 ; i < ctrl - > length ; i + + ) {
struct page * page = ctrl - > map [ i ] ;
if ( page )
__free_page ( page ) ;
}
vfree ( ctrl - > map ) ;
ctrl - > map = NULL ;
ctrl - > length = 0 ;
}
mutex_unlock ( & swap_cgroup_mutex ) ;
}
# endif