2011-02-24 14:43:05 +01:00
/*
 * bootmem - A boot-time physical memory allocator and configurator
 *
 *  Copyright (C) 1999 Ingo Molnar
 *                1999 Kanoj Sarcar, SGI
 *                2008 Johannes Weiner
 *
 * Access to this subsystem has to be serialized externally (which is true
 * for the boot process anyway).
 */
# include <linux/init.h>
# include <linux/pfn.h>
# include <linux/slab.h>
# include <linux/bootmem.h>
2011-10-16 02:01:52 -04:00
# include <linux/export.h>
2011-02-24 14:43:05 +01:00
# include <linux/kmemleak.h>
# include <linux/range.h>
# include <linux/memblock.h>
# include <asm/bug.h>
# include <asm/io.h>
# include <asm/processor.h>
# include "internal.h"
2011-02-24 14:43:06 +01:00
/*
 * The single pglist_data instance, defined here only when
 * CONFIG_NEED_MULTIPLE_NODES is not set, and exported via EXPORT_SYMBOL.
 */
# ifndef CONFIG_NEED_MULTIPLE_NODES
struct pglist_data __refdata contig_page_data ;
EXPORT_SYMBOL ( contig_page_data ) ;
# endif
2011-02-24 14:43:05 +01:00
/*
 * Page frame number bounds.  Within this file only max_low_pfn is read:
 * __free_memory_core() uses it as the upper cap of the range it releases.
 * NOTE(review): all three are presumably filled in by architecture setup
 * code -- confirm against the callers.
 */
unsigned long max_low_pfn ;
unsigned long min_low_pfn ;
unsigned long max_pfn ;
2011-02-24 14:43:06 +01:00
/*
 * __alloc_memory_core_early - carve a zeroed block out of memblock
 * @nid: node id handed to memblock_find_in_range_node()
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: start of the range searched by memblock_find_in_range_node()
 * @limit: end of the searched range; clamped to memblock.current_limit
 *         when it exceeds that value
 *
 * Returns the virtual address of the zeroed block, or NULL when no range
 * was found or the range could not be reserved.
 */
static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
					u64 goal, u64 limit)
{
	void *ptr;
	u64 addr;

	if (limit > memblock.current_limit)
		limit = memblock.current_limit;

	addr = memblock_find_in_range_node(goal, limit, size, align, nid);
	if (!addr)
		return NULL;

	/*
	 * Reserve before touching the memory.  If the reservation cannot be
	 * recorded, memblock still considers the range free, so it must not
	 * be handed out (a later search could return it a second time).
	 */
	if (memblock_reserve(addr, size))
		return NULL;

	ptr = phys_to_virt(addr);
	memset(ptr, 0, size);
	/*
	 * The min_count is set to 0 so that bootmem allocated blocks
	 * are never reported as leaks.
	 */
	kmemleak_alloc(ptr, size, 0, 0);
	return ptr;
}
2011-02-24 14:43:05 +01:00
/**
 * free_bootmem_late - hand a physical range straight to the page allocator
 * @addr: starting address of the range
 * @size: size of the range in bytes
 *
 * Intended for the window in which the bootmem allocator has already been
 * torn down while system initialization is still running.  No bootmem
 * metadata is touched (it no longer exists); each whole page frame inside
 * the range is released individually and totalram_pages is bumped once per
 * released frame.
 */
void __init free_bootmem_late(unsigned long addr, unsigned long size)
{
	/* Only whole frames: round the start up and the end down. */
	unsigned long pfn = PFN_UP(addr);
	unsigned long last = PFN_DOWN(addr + size);

	kmemleak_free_part(__va(addr), size);

	while (pfn < last) {
		__free_pages_bootmem(pfn_to_page(pfn), 0);
		totalram_pages++;
		pfn++;
	}
}
/*
 * __free_pages_memory - release the page frames [start, end)
 * @start: first page frame number to release
 * @end: page frame number one past the last frame to release
 *
 * Frames ahead of the first and behind the last BITS_PER_LONG-aligned
 * boundary are released one at a time (order 0).  The aligned middle is
 * released in chunks of BITS_PER_LONG frames, i.e. at order
 * ilog2(BITS_PER_LONG).
 */
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
	const unsigned long mask = BITS_PER_LONG - 1;
	const int chunk_order = ilog2(BITS_PER_LONG);
	unsigned long head_end = (start + mask) & ~mask;
	unsigned long tail_start = end & ~mask;
	unsigned long pfn;

	/*
	 * No aligned chunk fits into the range: stretch the order-0 head
	 * over the whole range and leave the middle and tail empty.
	 */
	if (tail_start <= head_end) {
		head_end = end;
		tail_start = end;
	}

	/* Unaligned head, frame by frame. */
	for (pfn = start; pfn < head_end; pfn++)
		__free_pages_bootmem(pfn_to_page(pfn), 0);

	/* Aligned middle, one BITS_PER_LONG-sized chunk per call. */
	for (pfn = head_end; pfn < tail_start; pfn += BITS_PER_LONG)
		__free_pages_bootmem(pfn_to_page(pfn), chunk_order);

	/* Unaligned tail, frame by frame. */
	for (pfn = tail_start; pfn < end; pfn++)
		__free_pages_bootmem(pfn_to_page(pfn), 0);
}
2012-07-11 14:02:56 -07:00
/*
 * __free_memory_core - release the whole page frames of a physical range
 * @start: physical start address of the range
 * @end: physical end address of the range
 *
 * The range is narrowed to whole page frames (start rounded up, end rounded
 * down) and its end is capped at max_low_pfn before the frames are passed
 * to __free_pages_memory().
 *
 * Returns the number of page frames in the narrowed range, or 0 when the
 * narrowed range is inverted.
 */
static unsigned long __init __free_memory_core(phys_addr_t start,
					       phys_addr_t end)
{
	unsigned long first_pfn = PFN_UP(start);
	unsigned long last_pfn = min_t(unsigned long,
				       PFN_DOWN(end), max_low_pfn);
	unsigned long nr_pages = 0;

	if (first_pfn <= last_pfn) {
		__free_pages_memory(first_pfn, last_pfn);

		/*
		 * zone->present_pages is fixed up as memory is freed to the
		 * buddy system, so that it reflects the pages the buddy
		 * system actually manages.
		 */
		fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
					 first_pfn, last_pfn);

		nr_pages = last_pfn - first_pfn;
	}

	return nr_pages;
}
2011-07-12 11:16:01 +02:00
/**
 * free_low_memory_core_early - release all free memblock ranges
 * @nodeid: not consulted by this function; the walk below always uses
 *          MAX_NUMNODES
 *
 * Resets the zones' present-page accounting, then passes every free memory
 * range to __free_memory_core().  The range reported by
 * get_allocated_memblock_reserved_regions_info() is released as well when
 * its size is non-zero.
 *
 * Returns the sum of the page counts reported by __free_memory_core().
 */
unsigned long __init free_low_memory_core_early(int nodeid)
{
	phys_addr_t range_start, range_end, resv_size;
	unsigned long freed = 0;
	u64 idx;

	/* present_pages is rebuilt while the ranges below are released. */
	reset_zone_present_pages();

	for_each_free_mem_range(idx, MAX_NUMNODES, &range_start, &range_end,
				NULL)
		freed += __free_memory_core(range_start, range_end);

	/* free range that is used for reserved array if we allocate it */
	resv_size = get_allocated_memblock_reserved_regions_info(&range_start);
	if (resv_size)
		freed += __free_memory_core(range_start,
					    range_start + resv_size);

	return freed;
}
/**
 * free_all_bootmem_node - register a node's bootmem info
 * @pgdat: node to be registered
 *
 * Only calls register_page_bootmem_info_node() for @pgdat.  No pages are
 * released here, so the return value is always 0.
 */
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
	register_page_bootmem_info_node(pgdat);

	/* free_low_memory_core_early(MAX_NUMNODES) will be called later */
	return 0;
}
/**
 * free_all_bootmem - release free pages to the buddy allocator
 *
 * Returns the page count reported by free_low_memory_core_early().
 */
unsigned long __init free_all_bootmem(void)
{
	unsigned long released;

	/*
	 * MAX_NUMNODES is passed rather than NODE_DATA(0)->node_id: when
	 * Node0 has no RAM installed, low RAM will be on Node1.
	 */
	released = free_low_memory_core_early(MAX_NUMNODES);

	return released;
}
/**
 * free_bootmem_node - mark a page range as usable
 * @pgdat: node the range resides on (not consulted by this implementation)
 * @physaddr: starting address of the range
 * @size: size of the range in bytes
 *
 * Drops the range from kmemleak tracking and returns it to memblock with
 * memblock_free().
 *
 * Partial pages will be considered reserved and left as they are.
 *
 * The range must reside completely on the specified node.
 */
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
			      unsigned long size)
{
	void *virt = __va(physaddr);

	kmemleak_free_part(virt, size);
	memblock_free(physaddr, size);
}
/**
 * free_bootmem - mark a page range as usable
 * @addr: starting address of the range
 * @size: size of the range in bytes
 *
 * Drops the range from kmemleak tracking and returns it to memblock with
 * memblock_free().
 *
 * Partial pages will be considered reserved and left as they are.
 *
 * The range must be contiguous but may span node boundaries.
 */
void __init free_bootmem(unsigned long addr, unsigned long size)
{
	void *virt = __va(addr);

	kmemleak_free_part(virt, size);
	memblock_free(addr, size);
}
/*
 * ___alloc_bootmem_nopanic - allocate boot memory from any node, no panic
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 * @limit: upper bound handed to __alloc_memory_core_early()
 *
 * When the slab allocator is already available this warns once and serves
 * the request from kzalloc() instead.  Otherwise the request is tried with
 * @goal and, if that fails and @goal was non-zero, once more with a goal
 * of 0.
 *
 * Returns NULL on failure.
 */
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
					      unsigned long align,
					      unsigned long goal,
					      unsigned long limit)
{
	void *mem;

	if (WARN_ON_ONCE(slab_is_available()))
		return kzalloc(size, GFP_NOWAIT);

	for (;;) {
		mem = __alloc_memory_core_early(MAX_NUMNODES, size, align,
						goal, limit);
		/* Done on success, or when there is no goal left to drop. */
		if (mem || !goal)
			return mem;

		goal = 0;
	}
}
/**
 * __alloc_bootmem_nopanic - allocate boot memory without panicking
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * The goal is dropped if it can not be satisfied and the allocation is
 * retried with a goal of 0.
 *
 * Allocation may happen on any node in the system.
 *
 * Returns NULL on failure.
 */
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
				      unsigned long goal)
{
	/* -1UL: the largest unsigned long is passed as the limit. */
	return ___alloc_bootmem_nopanic(size, align, goal, -1UL);
}
/*
 * ___alloc_bootmem - allocate boot memory from any node or panic
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 * @limit: upper bound passed through to ___alloc_bootmem_nopanic()
 *
 * Wraps ___alloc_bootmem_nopanic(); a failed request is logged at
 * KERN_ALERT level and followed by panic().
 */
static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
				      unsigned long goal, unsigned long limit)
{
	void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);

	if (!mem) {
		/*
		 * Whoops, we cannot satisfy the allocation request.
		 */
		printk(KERN_ALERT " bootmem alloc of %lu bytes failed! \n ", size);
		panic(" Out of memory ");
	}

	return mem;
}
/**
 * __alloc_bootmem - allocate boot memory
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * The goal is dropped if it can not be satisfied and the allocation is
 * retried with a goal of 0.
 *
 * Allocation may happen on any node in the system.
 *
 * The function panics if the request can not be satisfied.
 */
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
			      unsigned long goal)
{
	/* -1UL: the largest unsigned long is passed as the limit. */
	return ___alloc_bootmem(size, align, goal, -1UL);
}
2012-07-11 14:02:53 -07:00
/*
 * ___alloc_bootmem_node_nopanic - node-preferring boot allocation, no panic
 * @pgdat: node to try first
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 * @limit: upper bound handed to __alloc_memory_core_early()
 *
 * Each round tries @pgdat's node first and then MAX_NUMNODES.  If both
 * attempts fail and @goal is non-zero, the goal is dropped to 0 and one
 * more round is made.
 *
 * Returns NULL on failure.
 */
void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
					    unsigned long size,
					    unsigned long align,
					    unsigned long goal,
					    unsigned long limit)
{
	void *mem;

	for (;;) {
		mem = __alloc_memory_core_early(pgdat->node_id, size, align,
						goal, limit);
		if (!mem)
			mem = __alloc_memory_core_early(MAX_NUMNODES, size,
							align, goal, limit);

		/* Done on success, or when there is no goal left to drop. */
		if (mem || !goal)
			return mem;

		goal = 0;
	}
}
/**
 * __alloc_bootmem_node_nopanic - allocate boot memory from a specific node
 * @pgdat: node to allocate from
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * When the slab allocator is already available this warns once and serves
 * the request from kzalloc_node() instead.  Otherwise the request goes to
 * ___alloc_bootmem_node_nopanic() with a limit of 0.
 * NOTE(review): a limit of 0 presumably means "no explicit limit" -- confirm
 * against memblock_find_in_range_node().
 *
 * Returns NULL on failure.
 */
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
					   unsigned long align, unsigned long goal)
{
	void *mem;

	if (WARN_ON_ONCE(slab_is_available()))
		mem = kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
	else
		mem = ___alloc_bootmem_node_nopanic(pgdat, size, align,
						    goal, 0);

	return mem;
}
/*
 * ___alloc_bootmem_node - node-preferring boot allocation that panics
 * @pgdat: node to try first
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 * @limit: upper bound passed through to ___alloc_bootmem_node_nopanic()
 *
 * Wraps ___alloc_bootmem_node_nopanic(); a failed request is logged at
 * KERN_ALERT level and followed by panic().
 */
void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
				    unsigned long align, unsigned long goal,
				    unsigned long limit)
{
	void *mem = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
						  limit);

	if (!mem) {
		printk(KERN_ALERT " bootmem alloc of %lu bytes failed! \n ", size);
		panic(" Out of memory ");
	}

	return mem;
}
2011-02-24 14:43:05 +01:00
/**
 * __alloc_bootmem_node - allocate boot memory from a specific node
 * @pgdat: node to allocate from
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * When the slab allocator is already available this warns once and serves
 * the request from kzalloc_node() instead.
 *
 * The goal is dropped if it can not be satisfied and the allocation is
 * retried with a goal of 0.
 *
 * Allocation may fall back to any node in the system if the specified node
 * can not hold the requested memory.
 *
 * The function panics if the request can not be satisfied.
 */
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
				   unsigned long align, unsigned long goal)
{
	void *mem;

	if (WARN_ON_ONCE(slab_is_available()))
		mem = kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
	else
		mem = ___alloc_bootmem_node(pgdat, size, align, goal, 0);

	return mem;
}
/**
 * __alloc_bootmem_node_high - allocate boot memory from a specific node
 * @pgdat: node to allocate from
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * In this implementation a plain pass-through to __alloc_bootmem_node();
 * all arguments are forwarded unchanged.
 */
void * __init __alloc_bootmem_node_high ( pg_data_t * pgdat , unsigned long size ,
unsigned long align , unsigned long goal )
{
return __alloc_bootmem_node ( pgdat , size , align , goal ) ;
}
/*
 * Default limit used by the __alloc_bootmem_low*() helpers in this file,
 * applied only when ARCH_LOW_ADDRESS_LIMIT has not been defined already.
 */
# ifndef ARCH_LOW_ADDRESS_LIMIT
# define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
# endif
/**
 * __alloc_bootmem_low - allocate low boot memory
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * Forwards to ___alloc_bootmem() with ARCH_LOW_ADDRESS_LIMIT as the limit.
 *
 * The goal is dropped if it can not be satisfied and the allocation is
 * retried with a goal of 0.
 *
 * Allocation may happen on any node in the system.
 *
 * The function panics if the request can not be satisfied.
 */
void * __init __alloc_bootmem_low ( unsigned long size , unsigned long align ,
unsigned long goal )
{
return ___alloc_bootmem ( size , align , goal , ARCH_LOW_ADDRESS_LIMIT ) ;
}
/**
 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
 * @pgdat: node to allocate from
 * @size: size of the request in bytes
 * @align: alignment of the region
 * @goal: preferred starting address of the region
 *
 * When the slab allocator is already available this warns once and serves
 * the request from kzalloc_node() instead.  Otherwise the request is
 * limited by ARCH_LOW_ADDRESS_LIMIT.
 *
 * The goal is dropped if it can not be satisfied and the allocation is
 * retried with a goal of 0.
 *
 * Allocation may fall back to any node in the system if the specified node
 * can not hold the requested memory.
 *
 * The function panics if the request can not be satisfied.
 */
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
				       unsigned long align, unsigned long goal)
{
	void *mem;

	if (WARN_ON_ONCE(slab_is_available()))
		mem = kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
	else
		mem = ___alloc_bootmem_node(pgdat, size, align, goal,
					    ARCH_LOW_ADDRESS_LIMIT);

	return mem;
}