/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/acpi.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/acpi.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

nodemask_t numa_nodes_parsed __initdata;

struct memnode memnode;

static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

static struct numa_meminfo numa_meminfo __initdata;

static int numa_distance_cnt;
static u8 *numa_distance;

/*
 * Given a shift value, try to populate memnodemap[]
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
{
	unsigned long addr, end;
	int i, res = -1;

	memset(memnodemap, 0xff, sizeof(s16) * memnodemapsize);
	for (i = 0; i < mi->nr_blks; i++) {
		addr = mi->blk[i].start;
		end = mi->blk[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= memnodemapsize)
			return 0;
		do {
			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
				return -1;
			memnodemap[addr >> shift] = mi->blk[i].nid;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}

static int __init allocate_cachealigned_memnodemap(void)
{
	unsigned long addr;

	memnodemap = memnode.embedded_map;
	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
		return 0;

	addr = 0x8000;
	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
	nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
				      nodemap_size, L1_CACHE_BYTES);
	if (nodemap_addr == MEMBLOCK_ERROR) {
		printk(KERN_ERR
		       "NUMA: Unable to allocate Memory to Node hash map\n");
		nodemap_addr = nodemap_size = 0;
		return -1;
	}
	memnodemap = phys_to_virt(nodemap_addr);
	memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");

	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
	       nodemap_addr, nodemap_addr + nodemap_size);
	return 0;
}

/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
{
	int i, nodes_used = 0;
	unsigned long start, end;
	unsigned long bitfield = 0, memtop = 0;

	for (i = 0; i < mi->nr_blks; i++) {
		start = mi->blk[i].start;
		end = mi->blk[i].end;
		if (start >= end)
			continue;
		bitfield |= start;
		nodes_used++;
		if (end > memtop)
			memtop = end;
	}
	if (nodes_used <= 1)
		i = 63;
	else
		i = find_first_bit(&bitfield, sizeof(unsigned long) * 8);
	memnodemapsize = (memtop >> i) + 1;
	return i;
}

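/*
 * Worked example (editor's illustration, not part of the build): with
 * node 0 covering [0, 4G) and node 1 covering [4G, 8G), the only
 * non-zero block start is 0x100000000, whose lowest set bit is bit 32,
 * so the shift is 32 and each memnodemap[] entry covers 4G:
 *
 *	shift = extract_lsb_from_nodes(mi);		returns 32
 *	memnodemapsize == (0x200000000UL >> 32) + 1 == 3
 *	memnodemap[0] == 0 and memnodemap[1] == 1 after population
 */
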
static int __init compute_hash_shift(const struct numa_meminfo *mi)
{
	int shift;

	shift = extract_lsb_from_nodes(mi);
	if (allocate_cachealigned_memnodemap())
		return -1;
	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

	if (populate_memnodemap(mi, shift) != 1) {
		printk(KERN_INFO "Your memory is not aligned you need to "
		       "rebuild your kernel with a bigger NODEMAPSIZE "
		       "shift=%d\n", shift);
		return -1;
	}
	return shift;
}

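/*
 * With the shift in place, resolving an address to its node is a single
 * array lookup.  A minimal sketch of the consumer side (the real
 * phys_to_nid() lives in <asm/mmzone_64.h>; this is illustration only):
 *
 *	static inline int example_phys_to_nid(unsigned long addr)
 *	{
 *		return memnodemap[addr >> memnode_shift];
 *	}
 */
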
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}

static void *__init early_node_mem(int nodeid, unsigned long start,
				   unsigned long end, unsigned long size,
				   unsigned long align)
{
	unsigned long mem;

	/*
	 * Put it as high as possible; something else will want to go
	 * alongside NODE_DATA.
	 */
	if (start < (MAX_DMA_PFN << PAGE_SHIFT))
		start = MAX_DMA_PFN << PAGE_SHIFT;
	if (start < (MAX_DMA32_PFN << PAGE_SHIFT) &&
	    end > (MAX_DMA32_PFN << PAGE_SHIFT))
		start = MAX_DMA32_PFN << PAGE_SHIFT;
	mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
	if (mem != MEMBLOCK_ERROR)
		return __va(mem);

	/* extend the search scope */
	end = max_pfn_mapped << PAGE_SHIFT;
	start = MAX_DMA_PFN << PAGE_SHIFT;
	mem = memblock_find_in_range(start, end, size, align);
	if (mem != MEMBLOCK_ERROR)
		return __va(mem);

	printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
	       size, nodeid);

	return NULL;
}

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
			   nid, start, end);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

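/*
 * Usage sketch (invented values): an early detection path such as an
 * ACPI SRAT parser records each range it finds and marks the node as
 * parsed, e.g. for a node 1 spanning [4G, 8G):
 *
 *	node_set(1, numa_nodes_parsed);
 *	if (numa_add_memblk(1, 0x100000000ULL, 0x200000000ULL) < 0)
 *		pr_err("NUMA: failed to record memblk\n");
 */
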
/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, last_pfn, nodedata_phys;
	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	int nid;

	if (!end)
		return;

	/*
	 * Don't confuse VM with a node that doesn't have the
	 * minimum amount of memory:
	 */
	if (end && (end - start) < NODE_MIN_SIZE)
		return;

	start = roundup(start, ZONE_ALIGN);

	printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
	       start, end);

	start_pfn = start >> PAGE_SHIFT;
	last_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
					   SMP_CACHE_BYTES);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);
	memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
	printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
	       nodedata_phys + pgdat_size - 1);
	nid = phys_to_nid(nodedata_phys);
	if (nid != nodeid)
		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->node_id = nodeid;
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;

	node_set_online(nodeid);
}

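/*
 * For illustration (invented span): a node covering physical [4G, 8G)
 * ends up with its pgdat describing 4G worth of 4K pages:
 *
 *	NODE_DATA(nid)->node_start_pfn	   == 0x100000
 *	NODE_DATA(nid)->node_spanned_pages == 0x100000
 */
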
/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = (u64)max_pfn << PAGE_SHIFT;
	int i, j, k;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* make sure all blocks are inside the limits */
		bi->start = max(bi->start, low);
		bi->end = min(bi->end, high);

		/* and there's no empty block */
		if (bi->start == bi->end) {
			numa_remove_memblk_from(i--, mi);
			continue;
		}

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			unsigned long start, end;

			/*
			 * See whether there are overlapping blocks.  Whine
			 * about but allow overlaps of the same nid.  They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
					       bi->nid, bi->start, bi->end,
					       bj->nid, bj->start, bj->end);
					return -EINVAL;
				}
				pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
					   bi->nid, bi->start, bi->end,
					   bj->start, bj->end);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = max(min(bi->start, bj->start), low);
			end = min(max(bi->end, bj->end), high);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
			       bi->nid, bi->start, bi->end, bj->start, bj->end,
			       start, end);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}

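/*
 * Merge sketch (invented values, not part of the build): two blocks of
 * the same node separated by a hole that no other node claims collapse
 * into a single block:
 *
 *	struct numa_meminfo mi = { };
 *	numa_add_memblk_to(0, 0x0ULL,        0x80000000ULL,  &mi);
 *	numa_add_memblk_to(0, 0xc0000000ULL, 0x100000000ULL, &mi);
 *	numa_cleanup_meminfo(&mi);
 *	mi.nr_blks is now 1 and mi.blk[0] spans [0, 4G)
 */
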
/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_x86_free_range(__pa(numa_distance),
					__pa(numa_distance) + size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}

static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
				      size, PAGE_SIZE);
	if (phys == MEMBLOCK_ERROR) {
		pr_warning("NUMA: Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}
	memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}

/**
 * numa_set_distance - Set NUMA distance from one NUMA to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance.  If distance table
 * doesn't exist, one which is large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node at the time of
 * table creation or @distance doesn't make sense, the call is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
		printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
			    from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}

int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

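/*
 * The distance table is a flat cnt x cnt byte matrix indexed as
 * [from * cnt + to].  Sketch with two nodes (default SLIT-style values,
 * not from any particular machine):
 *
 *	numa_set_distance(0, 1, 20);
 *	numa_set_distance(1, 0, 20);
 *	__node_distance(0, 0) returns LOCAL_DISTANCE (10)
 *	__node_distance(0, 1) returns 20
 */
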
/*
 * Sanity check to catch more bad NUMA configurations (they are amazingly
 * common).  Make sure the nodes cover all memory.
 */
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
	unsigned long numaram, e820ram;
	int i;

	numaram = 0;
	for (i = 0; i < mi->nr_blks; i++) {
		unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
		unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
		numaram += e - s;
		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
		if ((long)numaram < 0)
			numaram = 0;
	}

	e820ram = max_pfn - (memblock_x86_hole_size(0,
					max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
	if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
		printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
		       (numaram << PAGE_SHIFT) >> 20,
		       (e820ram << PAGE_SHIFT) >> 20);
		return false;
	}
	return true;
}

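/*
 * Worked example of the slack check (invented numbers): with 4K pages,
 * 1 << (20 - PAGE_SHIFT) == 256 pages == 1MB.  Given 4096MB of e820 RAM
 * and NUMA blocks covering only 4095MB, the 1MB shortfall meets the
 * threshold and the configuration is rejected; any shortfall strictly
 * below 1MB is tolerated.
 */
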
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	memnode_shift = compute_hash_shift(mi);
	if (memnode_shift < 0) {
		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
		return -EINVAL;
	}

	for (i = 0; i < mi->nr_blks; i++)
		memblock_x86_register_active_regions(mi->blk[i].nid,
					mi->blk[i].start >> PAGE_SHIFT,
					mi->blk[i].end >> PAGE_SHIFT);

	/* for out of order entries */
	sort_node_map();
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = (u64)max_pfn << PAGE_SHIFT;
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		if (start < end)
			setup_node_bootmem(nid, start, end);
	}

	return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory.  This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       0LU, max_pfn << PAGE_SHIFT);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);

	return 0;
}

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	remove_all_active_ranges();
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;
	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();
	return 0;
}

void __init initmem_init(void)
{
	int ret;

	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		ret = numa_init(x86_acpi_numa_init);
		if (!ret)
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		ret = numa_init(amd_numa_init);
		if (!ret)
			return;
#endif
	}

	numa_init(dummy_numa_init);
}

unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long pages = 0;
	int i;

	for_each_online_node(i)
		pages += free_all_bootmem_node(NODE_DATA(i));

	pages += free_all_memory_core_early(MAX_NUMNODES);

	return pages;
}

int __cpuinit numa_cpu_node(int cpu)
{
	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}