#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	pgdat->node_page_cgroup = NULL;
}
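
/*
 * FLATMEM variant: page_cgroup structs live in a single per-node array,
 * indexed by the page's offset from the node's first pfn.
 */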
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long offset;
	struct page_cgroup *base;

	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
#ifdef CONFIG_DEBUG_VM
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_cgroup arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (unlikely(!base))
		return NULL;
#endif
	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
	return base + offset;
}
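
/*
 * Allocate one page_cgroup per page spanned by this node from bootmem.
 * Returns 0 on success (including empty nodes), -ENOMEM on failure.
 */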
static int __init alloc_node_page_cgroup(int nid)
{
	struct page_cgroup *base;
	unsigned long table_size;
	unsigned long nr_pages;

	nr_pages = NODE_DATA(nid)->node_spanned_pages;
	if (!nr_pages)
		return 0;

	table_size = sizeof(struct page_cgroup) * nr_pages;

	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
	if (!base)
		return -ENOMEM;
	NODE_DATA(nid)->node_page_cgroup = base;
	total_usage += table_size;
	return 0;
}
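
/*
 * Early init for the !SPARSEMEM case: build the per-node page_cgroup
 * arrays for every online node, or panic if that cannot be done.
 */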
void __init page_cgroup_init_flatmem(void)
{
	int nid, fail;

	if (mem_cgroup_disabled())
		return;

	for_each_online_node(nid) {
		fail = alloc_node_page_cgroup(nid);
		if (fail)
			goto fail;
	}
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
	" don't want memory cgroups\n");
	return;
fail:
	printk(KERN_CRIT "allocation of page_cgroup failed.\n");
	printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

#else /* CONFIG_FLAT_NODE_MEM_MAP */
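
/*
 * SPARSEMEM variant: each mem_section carries its own page_cgroup array,
 * stored pre-biased by the section's first pfn so it can be indexed by
 * pfn directly.
 */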
struct page_cgroup *lookup_page_cgroup(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	struct mem_section *section = __pfn_to_section(pfn);
#ifdef CONFIG_DEBUG_VM
	/*
	 * The sanity checks the page allocator does upon freeing a
	 * page can reach here before the page_cgroup arrays are
	 * allocated when feeding a range of pages to the allocator
	 * for the first time during bootup or memory hotplug.
	 */
	if (!section->page_cgroup)
		return NULL;
#endif
	return section->page_cgroup + pfn;
}
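
/*
 * Allocate a section's page_cgroup table, preferring node-local pages and
 * falling back to vmalloc space when contiguous pages are not available.
 */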
static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
	gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
	void *addr = NULL;

	addr = alloc_pages_exact_nid(nid, size, flags);
	if (addr) {
		kmemleak_alloc(addr, size, 1, flags);
		return addr;
	}

	if (node_state(nid, N_HIGH_MEMORY))
		addr = vzalloc_node(size, nid);
	else
		addr = vzalloc(size);

	return addr;
}
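
/*
 * Set up the page_cgroup table for the section containing @pfn, unless it
 * is already present. The table pointer is stored biased by the section's
 * first pfn (see lookup_page_cgroup()).
 */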
static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
	struct mem_section *section;
	struct page_cgroup *base;
	unsigned long table_size;

	section = __pfn_to_section(pfn);

	if (section->page_cgroup)
		return 0;

	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
	base = alloc_page_cgroup(table_size, nid);

	/*
	 * The value stored in section->page_cgroup is (base - pfn)
	 * and it does not point to the memory block allocated above,
	 * causing kmemleak false positives.
	 */
	kmemleak_not_leak(base);

	if (!base) {
		printk(KERN_ERR "page cgroup allocation failure\n");
		return -ENOMEM;
	}

	/*
	 * The passed "pfn" may not be aligned to SECTION. For the calculation
	 * we need to apply a mask.
	 */
	pfn &= PAGE_SECTION_MASK;
	section->page_cgroup = base - pfn;
	total_usage += table_size;
	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
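
/*
 * Free a table allocated by alloc_page_cgroup(), using the matching
 * release path for the vmalloc and page-allocator cases.
 */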
static void free_page_cgroup(void *addr)
{
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t table_size =
			sizeof(struct page_cgroup) * PAGES_PER_SECTION;

		BUG_ON(PageReserved(page));
		free_pages_exact(addr, table_size);
	}
}
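
/*
 * Tear down the page_cgroup table of the section containing @pfn, if any.
 */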
void __free_page_cgroup(unsigned long pfn)
{
	struct mem_section *ms;
	struct page_cgroup *base;

	ms = __pfn_to_section(pfn);
	if (!ms || !ms->page_cgroup)
		return;
	base = ms->page_cgroup + pfn;
	free_page_cgroup(base);
	ms->page_cgroup = NULL;
}
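
/*
 * Memory hotplug: allocate page_cgroup tables for every present section in
 * the range being onlined, rolling back all of them on failure.
 */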
int __meminit online_page_cgroup(unsigned long start_pfn,
			unsigned long nr_pages,
			int nid)
{
	unsigned long start, end, pfn;
	int fail = 0;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	if (nid == -1) {
		/*
		 * In this case, "nid" already exists and contains valid memory.
		 * "start_pfn" passed to us is a pfn which is an arg for
		 * online_pages(), and start_pfn should exist.
		 */
		nid = pfn_to_nid(start_pfn);
		VM_BUG_ON(!node_state(nid, N_ONLINE));
	}

	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
		if (!pfn_present(pfn))
			continue;
		fail = init_section_page_cgroup(pfn, nid);
	}
	if (!fail)
		return 0;

	/* rollback */
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);

	return -ENOMEM;
}
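
/*
 * Memory hotplug: drop the page_cgroup tables covering the offlined range.
 */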
int __meminit offline_page_cgroup(unsigned long start_pfn,
		unsigned long nr_pages, int nid)
{
	unsigned long start, end, pfn;

	start = SECTION_ALIGN_DOWN(start_pfn);
	end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		__free_page_cgroup(pfn);
	return 0;
}
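
/*
 * Memory hotplug notifier: build tables while going online, free them once
 * the range is offline; the other transitions need no action here.
 */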
static int __meminit page_cgroup_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;
	int ret = 0;

	switch (action) {
	case MEM_GOING_ONLINE:
		ret = online_page_cgroup(mn->start_pfn,
				   mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_OFFLINE:
		offline_page_cgroup(mn->start_pfn,
				mn->nr_pages, mn->status_change_nid);
		break;
	case MEM_CANCEL_ONLINE:
	case MEM_GOING_OFFLINE:
		break;
	case MEM_ONLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}

	return notifier_from_errno(ret);
}

#endif
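
/*
 * Early init for the SPARSEMEM case: walk every node with memory and
 * allocate a page_cgroup table for each of its valid sections, then
 * register the hotplug notifier. Panics if the tables cannot be allocated.
 */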
void __init page_cgroup_init(void)
{
	unsigned long pfn;
	int nid;

	if (mem_cgroup_disabled())
		return;

	for_each_node_state(nid, N_HIGH_MEMORY) {
		unsigned long start_pfn, end_pfn;

		start_pfn = node_start_pfn(nid);
		end_pfn = node_end_pfn(nid);
		/*
		 * start_pfn and end_pfn may not be aligned to SECTION and the
		 * page->flags of out of node pages are not initialized. So we
		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
		 */
		for (pfn = start_pfn;
		     pfn < end_pfn;
		     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

			if (!pfn_valid(pfn))
				continue;
			/*
			 * Nodes' pfns can overlap.
			 * We know some architectures can have a node layout such as
			 * -------------pfn-------------->
			 * N0 | N1 | N2 | N0 | N1 | N2 | ....
			 */
			if (pfn_to_nid(pfn) != nid)
				continue;
			if (init_section_page_cgroup(pfn, nid))
				goto oom;
		}
	}
	hotplug_memory_notifier(page_cgroup_callback, 0);
	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
			 "don't want memory cgroups\n");
	return;
oom:
	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
	panic("Out of memory");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
	return;
}

#endif

#ifdef CONFIG_MEMCG_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);

struct swap_cgroup_ctrl {
	struct page **map;
	unsigned long length;
	spinlock_t lock;
};

static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
	unsigned short id;
};
#define SC_PER_PAGE	(PAGE_SIZE / sizeof(struct swap_cgroup))

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
 * against SwapCache. At swap_free(), this is accessed directly from swap.
 *
 * This means,
 *  - we have no race in "exchange" when we're accessed via SwapCache because
 *    SwapCache (and its swp_entry) is under lock.
 *  - When called via swap_free(), there is no user of this entry and no race.
 * Then, we don't need lock around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
	struct page *page;
	struct swap_cgroup_ctrl *ctrl;
	unsigned long idx, max;

	ctrl = &swap_cgroup_ctrl[type];

	for (idx = 0; idx < ctrl->length; idx++) {
		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			goto not_enough_page;
		ctrl->map[idx] = page;
	}
	return 0;
not_enough_page:
	max = idx;
	for (idx = 0; idx < max; idx++)
		__free_page(ctrl->map[idx]);

	return -ENOMEM;
}
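
/*
 * Map a swap entry to its swap_cgroup slot; optionally also return the
 * per-swapfile control structure through @ctrlp.
 */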
static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
					struct swap_cgroup_ctrl **ctrlp)
{
	pgoff_t offset = swp_offset(ent);
	struct swap_cgroup_ctrl *ctrl;
	struct page *mappage;
	struct swap_cgroup *sc;

	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
	if (ctrlp)
		*ctrlp = ctrl;

	mappage = ctrl->map[offset / SC_PER_PAGE];
	sc = page_address(mappage);
	return sc + offset % SC_PER_PAGE;
}

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns old id at success, 0 at failure.
 * (There is no mem_cgroup using 0 as its id)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
					unsigned short old, unsigned short new)
{
	struct swap_cgroup_ctrl *ctrl;
	struct swap_cgroup *sc;
	unsigned long flags;
	unsigned short retval;

	sc = lookup_swap_cgroup(ent, &ctrl);

	spin_lock_irqsave(&ctrl->lock, flags);
	retval = sc->id;
	if (retval == old)
		sc->id = new;
	else
		retval = 0;
	spin_unlock_irqrestore(&ctrl->lock, flags);
	return retval;
}

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup to be recorded
 *
 * Returns old value at success, 0 at failure.
 * (Of course, old value can be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
	struct swap_cgroup_ctrl *ctrl;
	struct swap_cgroup *sc;
	unsigned short old;
	unsigned long flags;

	sc = lookup_swap_cgroup(ent, &ctrl);

	spin_lock_irqsave(&ctrl->lock, flags);
	old = sc->id;
	sc->id = id;
	spin_unlock_irqrestore(&ctrl->lock, flags);

	return old;
}

/**
 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
 */
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
	return lookup_swap_cgroup(ent, NULL)->id;
}
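
/*
 * Called at swapon: size and allocate the swap_cgroup buffer for this swap
 * type, one unsigned short id per swap slot.
 */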
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
	void *array;
	unsigned long array_size;
	unsigned long length;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return 0;

	length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
	array_size = length * sizeof(void *);

	array = vzalloc(array_size);
	if (!array)
		goto nomem;

	ctrl = &swap_cgroup_ctrl[type];
	mutex_lock(&swap_cgroup_mutex);
	ctrl->length = length;
	ctrl->map = array;
	spin_lock_init(&ctrl->lock);
	if (swap_cgroup_prepare(type)) {
		/* memory shortage */
		ctrl->map = NULL;
		ctrl->length = 0;
		mutex_unlock(&swap_cgroup_mutex);
		vfree(array);
		goto nomem;
	}
	mutex_unlock(&swap_cgroup_mutex);

	return 0;
nomem:
	printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
	printk(KERN_INFO
		"swap_cgroup can be disabled by swapaccount=0 boot option\n");
	return -ENOMEM;
}
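
/*
 * Called at swapoff: detach and free this swap type's swap_cgroup buffer.
 */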
void swap_cgroup_swapoff(int type)
{
	struct page **map;
	unsigned long i, length;
	struct swap_cgroup_ctrl *ctrl;

	if (!do_swap_account)
		return;

	mutex_lock(&swap_cgroup_mutex);
	ctrl = &swap_cgroup_ctrl[type];
	map = ctrl->map;
	length = ctrl->length;
	ctrl->map = NULL;
	ctrl->length = 0;
	mutex_unlock(&swap_cgroup_mutex);

	if (map) {
		for (i = 0; i < length; i++) {
			struct page *page = map[i];
			if (page)
				__free_page(page);
		}
		vfree(map);
	}
}

#endif