2005-04-16 15:20:36 -07:00
/*
* pSeries NUMA support
*
* Copyright ( C ) 2002 Anton Blanchard < anton @ au . ibm . com > , IBM
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
# include <linux/threads.h>
# include <linux/bootmem.h>
# include <linux/init.h>
# include <linux/mm.h>
# include <linux/mmzone.h>
# include <linux/module.h>
# include <linux/nodemask.h>
# include <linux/cpu.h>
# include <linux/notifier.h>
2005-11-11 14:22:35 +11:00
# include <asm/sparsemem.h>
2005-04-16 15:20:36 -07:00
# include <asm/lmb.h>
2005-10-31 13:07:02 +11:00
# include <asm/system.h>
2005-11-07 13:18:13 +11:00
# include <asm/smp.h>
2005-04-16 15:20:36 -07:00
static int numa_enabled = 1 ;
static int numa_debug ;
/*
 * Debug trace helper: prints at KERN_INFO level only when numa_debug is set.
 *
 * The body is wrapped in do { } while (0) so that "dbg(...);" expands to
 * exactly one statement and cannot capture a following "else" when used
 * inside an unbraced if/else.
 */
#define dbg(args...) \
	do { \
		if (numa_debug) \
			printk(KERN_INFO args); \
	} while (0)
2005-11-11 14:22:35 +11:00
/* cpu -> node id; written by map_cpu_to_node(). */
int numa_cpu_lookup_table [ NR_CPUS ] ;
2005-04-16 15:20:36 -07:00
/* node id -> mask of cpus on that node; updated by map_cpu_to_node() and unmap_cpu_from_node(). */
cpumask_t numa_cpumask_lookup_table [ MAX_NUMNODES ] ;
/* Per-node pglist_data pointers (presumably what NODE_DATA() indexes -- confirm in the mmzone header). */
struct pglist_data * node_data [ MAX_NUMNODES ] ;
2005-11-11 14:22:35 +11:00
EXPORT_SYMBOL ( numa_cpu_lookup_table ) ;
EXPORT_SYMBOL ( numa_cpumask_lookup_table ) ;
EXPORT_SYMBOL ( node_data ) ;
/* Per-node bootmem state; do_init_bootmem() points NODE_DATA(nid)->bdata at these entries. */
static bootmem_data_t __initdata plat_node_bdata [ MAX_NUMNODES ] ;
2005-04-16 15:20:36 -07:00
/*
 * Index into the "ibm,associativity" array that holds the node id.
 * Set from find_min_common_depth() in parse_numa_properties(); negative
 * means no usable NUMA information.
 */
static int min_common_depth ;
2005-12-05 12:06:42 -08:00
/* Cell counts for the address/size parts of memory ranges; filled in by get_n_mem_cells(). */
static int n_mem_addr_cells , n_mem_size_cells ;
2005-04-16 15:20:36 -07:00
/*
2005-11-11 14:22:35 +11:00
* We need somewhere to store start / end / node for each region until we have
2005-04-16 15:20:36 -07:00
* allocated the real node_data structures .
*/
2005-11-11 14:22:35 +11:00
/* Capacity of init_node_data[]; add_region() keeps the last slot empty as a terminator. */
# define MAX_REGIONS (MAX_LMB_REGIONS*2)
2005-04-16 15:20:36 -07:00
/*
 * Boot-time table of memory regions.  Walkers stop at the first entry
 * whose end_pfn is zero, so a zero end_pfn marks the end of the table.
 */
static struct {
2005-11-11 14:22:35 +11:00
/* First page frame number of the region. */
unsigned long start_pfn ;
/* One past the last page frame number of the region (0 = unused slot). */
unsigned long end_pfn ;
/* Node id the region belongs to. */
int nid ;
} init_node_data [ MAX_REGIONS ] __initdata ;
2005-04-16 15:20:36 -07:00
2005-11-11 14:22:35 +11:00
/*
 * Look up the node id of a page frame in the boot-time region table.
 *
 * @pfn: page frame number to look up.
 *
 * Returns the nid of the first region with start_pfn <= pfn < end_pfn,
 * or -1 if no region contains @pfn.
 */
int __init early_pfn_to_nid(unsigned long pfn)
{
	unsigned int idx = 0;

	/* The table ends at the first entry whose end_pfn is zero. */
	while (init_node_data[idx].end_pfn != 0) {
		if (pfn >= init_node_data[idx].start_pfn &&
		    pfn < init_node_data[idx].end_pfn)
			return init_node_data[idx].nid;
		idx++;
	}

	return -1;
}
/*
 * Record a memory region for a node in the boot-time region table.
 *
 * @nid:       node id the region belongs to.
 * @start_pfn: first page frame number of the region.
 * @pages:     number of pages in the region.
 *
 * If the new range directly follows or directly precedes an existing
 * region of the same node, that region is grown instead of using a new
 * slot.  When the table is full the region is dropped with a warning.
 */
void __init add_region(unsigned int nid, unsigned long start_pfn,
		       unsigned long pages)
{
	unsigned int slot;

	dbg(" add_region nid %d start_pfn 0x%lx pages 0x%lx \n ",
	    nid, start_pfn, pages);

	for (slot = 0; init_node_data[slot].end_pfn != 0; slot++) {
		if (init_node_data[slot].nid == nid) {
			/* New range starts where this one ends: extend upwards. */
			if (start_pfn == init_node_data[slot].end_pfn) {
				init_node_data[slot].end_pfn += pages;
				return;
			}

			/* New range ends where this one starts: extend downwards. */
			if ((start_pfn + pages) == init_node_data[slot].start_pfn) {
				init_node_data[slot].start_pfn -= pages;
				return;
			}
		}
	}

	/*
	 * The final slot must stay zeroed: every walker of this table uses
	 * a zero end_pfn as the terminator.
	 */
	if (slot >= (MAX_REGIONS - 1)) {
		printk(KERN_ERR " WARNING: too many memory regions in "
				" numa code, truncating \n ");
		return;
	}

	init_node_data[slot].nid = nid;
	init_node_data[slot].start_pfn = start_pfn;
	init_node_data[slot].end_pfn = start_pfn + pages;
}
/*
 * Summarise the regions recorded for one node.
 *
 * @nid:           node id to summarise.
 * @start_pfn:     out: lowest start_pfn of the node's regions.
 * @end_pfn:       out: highest end_pfn of the node's regions.
 * @pages_present: out: sum of the sizes of the node's regions.
 *
 * All three outputs are 0 when the node has no region.  The page count
 * is only meaningful if init_node_data has no overlapping regions.
 */
void __init get_region(unsigned int nid, unsigned long *start_pfn,
		       unsigned long *end_pfn, unsigned long *pages_present)
{
	unsigned long lowest = -1UL;
	unsigned long highest = 0;
	unsigned long present = 0;
	unsigned int idx;

	for (idx = 0; init_node_data[idx].end_pfn != 0; idx++) {
		unsigned long first = init_node_data[idx].start_pfn;
		unsigned long limit = init_node_data[idx].end_pfn;

		if (init_node_data[idx].nid != nid)
			continue;

		present += limit - first;
		if (first < lowest)
			lowest = first;
		if (limit > highest)
			highest = limit;
	}

	/* No matching region: report the start as 0 rather than -1UL. */
	if (lowest == -1UL)
		lowest = 0;

	*start_pfn = lowest;
	*end_pfn = highest;
	*pages_present = present;
}
2005-04-16 15:20:36 -07:00
2006-03-20 18:35:15 -06:00
/*
 * Record that @cpu belongs to @node in both lookup tables: the per-cpu
 * node table and the per-node cpumask.
 */
static void __cpuinit map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg(" adding cpu %d to node %d \n ", cpu, node);

	/* Already present in the node's mask: nothing more to do. */
	if (cpu_isset(cpu, numa_cpumask_lookup_table[node]))
		return;

	cpu_set(cpu, numa_cpumask_lookup_table[node]);
}
# ifdef CONFIG_HOTPLUG_CPU
/*
 * Remove @cpu from the cpumask of the node it is currently mapped to.
 * Logs an error if the cpu was not set in that node's mask.
 */
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg(" removing cpu %lu from node %d \n ", cpu, node);

	if (!cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		printk(KERN_ERR " WARNING: cpu %lu not found in node %d \n ",
		       cpu, node);
		return;
	}

	cpu_clear(cpu, numa_cpumask_lookup_table[node]);
}
# endif /* CONFIG_HOTPLUG_CPU */
2006-03-20 18:35:15 -06:00
/*
 * Find the device tree node of type "cpu" that describes logical cpu @cpu.
 *
 * A node matches when the cpu's hardware id appears in its
 * "ibm,ppc-interrupt-server#s" property; nodes without a usable
 * interrupt-server list are matched on the first cell of "reg" instead.
 *
 * Returns the matching node as handed out by of_find_node_by_type()
 * (callers in this file release it with of_node_put()), or NULL if no
 * node matches.
 */
static struct device_node * __cpuinit find_cpu_node(unsigned int cpu)
{
	unsigned int hw_cpuid = get_hard_smp_processor_id(cpu);
	struct device_node *cpu_node = NULL;
	const unsigned int *interrupt_server, *reg;
	int len;

	while ((cpu_node = of_find_node_by_type(cpu_node, " cpu ")) != NULL) {
		/*
		 * Nothing visible here guarantees that get_property() writes
		 * the length when the property is missing, so start from a
		 * known value and only do arithmetic on it after the pointer
		 * has been checked.
		 */
		len = 0;

		/* Try interrupt server first */
		interrupt_server = get_property(cpu_node,
					" ibm,ppc-interrupt-server#s ", &len);
		if (interrupt_server)
			len = len / sizeof(u32);	/* bytes -> cells */

		if (interrupt_server && (len > 0)) {
			while (len--) {
				if (interrupt_server[len] == hw_cpuid)
					return cpu_node;
			}
		} else {
			len = 0;
			reg = get_property(cpu_node, " reg ", &len);
			if (reg && (len > 0) && (reg[0] == hw_cpuid))
				return cpu_node;
		}
	}

	return NULL;
}
/* must hold reference to node during call */
2006-07-12 15:35:54 +10:00
static const int * of_get_associativity ( struct device_node * dev )
2005-04-16 15:20:36 -07:00
{
2006-07-12 15:35:54 +10:00
return get_property ( dev , " ibm,associativity " , NULL ) ;
2005-04-16 15:20:36 -07:00
}
2006-03-20 18:36:45 -06:00
/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
* info is found .
*/
2006-05-01 12:16:12 -07:00
static int of_node_to_nid_single ( struct device_node * device )
2005-04-16 15:20:36 -07:00
{
2006-03-20 18:36:45 -06:00
int nid = - 1 ;
2006-07-12 15:35:54 +10:00
const unsigned int * tmp ;
2005-04-16 15:20:36 -07:00
if ( min_common_depth = = - 1 )
2006-03-20 18:36:45 -06:00
goto out ;
2005-04-16 15:20:36 -07:00
tmp = of_get_associativity ( device ) ;
2006-03-20 18:36:45 -06:00
if ( ! tmp )
goto out ;
if ( tmp [ 0 ] > = min_common_depth )
2006-03-20 18:35:45 -06:00
nid = tmp [ min_common_depth ] ;
2006-03-20 18:36:15 -06:00
/* POWER4 LPAR uses 0xffff as invalid node */
2006-03-20 18:36:45 -06:00
if ( nid = = 0xffff | | nid > = MAX_NUMNODES )
nid = - 1 ;
out :
2006-03-20 18:35:45 -06:00
return nid ;
2005-04-16 15:20:36 -07:00
}
2006-05-01 12:16:12 -07:00
/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid ( struct device_node * device )
{
struct device_node * tmp ;
int nid = - 1 ;
of_node_get ( device ) ;
while ( device ) {
nid = of_node_to_nid_single ( device ) ;
if ( nid ! = - 1 )
break ;
tmp = device ;
device = of_get_parent ( tmp ) ;
of_node_put ( tmp ) ;
}
of_node_put ( device ) ;
return nid ;
}
EXPORT_SYMBOL_GPL ( of_node_to_nid ) ;
2005-04-16 15:20:36 -07:00
/*
* In theory , the " ibm,associativity " property may contain multiple
* associativity lists because a resource may be multiply connected
* into the machine . This resource then has different associativity
* characteristics relative to its multiple connections . We ignore
* this for now . We also assume that all cpu and memory sets have
* their distances represented at a common level . This won ' t be
* true for hierarchical NUMA .
*
* In any case the ibm , associativity - reference - points should give
* the correct depth for a normal NUMA system .
*
* - Dave Hansen < haveblue @ us . ibm . com >
*/
/*
 * Read the associativity depth to use for NUMA from the /rtas node.
 *
 * Returns the second cell of "ibm,associativity-reference-points", or
 * -1 if the /rtas node or the property is missing, or if the property
 * is too short to contain a second cell.
 */
static int __init find_min_common_depth(void)
{
	int depth;
	const unsigned int *ref_points;
	struct device_node *rtas_root;
	unsigned int len = 0;

	rtas_root = of_find_node_by_path(" /rtas ");
	if (!rtas_root)
		return -1;

	/*
	 * this property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = get_property(rtas_root,
			" ibm,associativity-reference-points ", &len);

	/*
	 * len is a byte count: require room for both cells before reading
	 * the second one, and test the pointer before trusting len.
	 */
	if (ref_points && (len >= 2 * sizeof(*ref_points))) {
		depth = ref_points[1];
	} else {
		dbg(" NUMA: ibm,associativity-reference-points not found. \n ");
		depth = -1;
	}

	of_node_put(rtas_root);
	return depth;
}
2005-11-30 13:47:23 -08:00
/*
 * Read the address and size cell counts from the first node of type
 * "memory" that the lookup returns.
 *
 * @n_addr_cells: out: result of prom_n_addr_cells() for that node.
 * @n_size_cells: out: result of prom_n_size_cells() for that node.
 *
 * Panics if there is no memory node at all.
 */
static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *mem_node = of_find_node_by_type(NULL, " memory ");

	if (mem_node == NULL)
		panic(" numa.c: No memory nodes found! ");

	*n_addr_cells = prom_n_addr_cells(mem_node);
	*n_size_cells = prom_n_size_cells(mem_node);

	of_node_put(mem_node);
}
2006-07-12 15:35:54 +10:00
/*
 * Decode @n consecutive 32-bit cells into one value, first cell most
 * significant, and advance *@buf past the cells that were consumed.
 *
 * @n:   number of cells to read.
 * @buf: in/out cursor into the cell array.
 *
 * Returns the accumulated value (0 when @n is 0).
 */
static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
{
	const unsigned int *cell = *buf;
	unsigned long value = 0;

	for (; n != 0; n--)
		value = (value << 32) | *cell++;

	*buf = cell;
	return value;
}
/*
* Figure out to which domain a cpu belongs and stick it there .
* Return the id of the domain used .
*/
2006-03-20 18:35:15 -06:00
static int __cpuinit numa_setup_cpu ( unsigned long lcpu )
2005-04-16 15:20:36 -07:00
{
2006-03-20 18:35:45 -06:00
int nid = 0 ;
2005-04-16 15:20:36 -07:00
struct device_node * cpu = find_cpu_node ( lcpu ) ;
if ( ! cpu ) {
WARN_ON ( 1 ) ;
goto out ;
}
2006-05-01 12:16:12 -07:00
nid = of_node_to_nid_single ( cpu ) ;
2005-04-16 15:20:36 -07:00
2006-03-20 18:36:45 -06:00
if ( nid < 0 | | ! node_online ( nid ) )
nid = any_online_node ( NODE_MASK_ALL ) ;
2005-04-16 15:20:36 -07:00
out :
2006-03-20 18:35:45 -06:00
map_cpu_to_node ( lcpu , nid ) ;
2005-04-16 15:20:36 -07:00
of_node_put ( cpu ) ;
2006-03-20 18:35:45 -06:00
return nid ;
2005-04-16 15:20:36 -07:00
}
2006-06-27 02:54:09 -07:00
/*
 * CPU notifier callback that keeps the cpu <-> node tables up to date.
 *
 * @nfb:    notifier block this callback was registered with (unused).
 * @action: cpu notifier event.
 * @hcpu:   logical cpu number, carried in the pointer value.
 *
 * CPU_UP_PREPARE maps the cpu to its node.  With CONFIG_HOTPLUG_CPU,
 * CPU_DEAD and CPU_UP_CANCELED remove it again.
 *
 * Returns NOTIFY_OK for the events handled here and NOTIFY_DONE for
 * everything else.
 */
static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
				       unsigned long action,
				       void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		unmap_cpu_from_node(lcpu);
		/* Must be set before the break, otherwise it is never executed. */
		ret = NOTIFY_OK;
		break;
#endif
	}

	return ret;
}
/*
* Check and possibly modify a memory region to enforce the memory limit .
*
* Returns the size the region should have to enforce the memory limit .
* This will either be the original value of size , a truncated value ,
* or zero . If the returned value of size is 0 the region should be
* discarded as it lies wholy above the memory limit .
*/
2005-11-11 14:22:35 +11:00
static unsigned long __init numa_enforce_memory_limit ( unsigned long start ,
unsigned long size )
2005-04-16 15:20:36 -07:00
{
/*
* We use lmb_end_of_DRAM ( ) in here instead of memory_limit because
* we ' ve already adjusted it for the limit and it takes care of
* having memory holes below the limit .
*/
if ( ! memory_limit )
return size ;
if ( start + size < = lmb_end_of_DRAM ( ) )
return size ;
if ( start > = lmb_end_of_DRAM ( ) )
return 0 ;
return lmb_end_of_DRAM ( ) - start ;
}
/*
 * Build the boot-time NUMA layout from the device tree.
 *
 * Marks online every node referenced by a present cpu or by a memory
 * range, and records each memory range (after applying the memory
 * limit) with add_region().
 *
 * Returns 0 on success, or a negative value when NUMA was disabled by
 * the user or no associativity depth could be determined.  The caller
 * then falls back to setup_nonnuma().
 */
static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING " NUMA disabled by user \n ");
		return -1;
	}

	min_common_depth = find_min_common_depth();
	if (min_common_depth < 0)
		return min_common_depth;

	dbg(" NUMA associativity depth for CPU/Memory: %d \n ", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now.  This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		int nid;

		cpu = find_cpu_node(i);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);

	memory = NULL;
	while ((memory = of_find_node_by_type(memory, " memory ")) != NULL) {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = get_property(memory,
					   " linux,usable-memory ", &len);
		if (!memcell_buf || len == 0)
			memcell_buf = get_property(memory, " reg ", &len);
		if (!memcell_buf || len == 0)
			continue;

		/* ranges in cell: len is in bytes, each cell is 4 bytes */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		/*
		 * Decode exactly "ranges" entries.  A property too short to
		 * hold a single range gives ranges == 0 and is skipped
		 * rather than read past its end.
		 */
		for (; ranges > 0; ranges--) {
			/* these are order-sensitive, and modify the buffer pointer */
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			/*
			 * Assumption: either all memory nodes or none will
			 * have associativity properties.  If none, then
			 * everything goes to default_nid.
			 */
			nid = of_node_to_nid_single(memory);
			if (nid < 0)
				nid = default_nid;
			node_set_online(nid);

			/* A size of 0 means the range is entirely above the limit. */
			size = numa_enforce_memory_limit(start, size);
			if (!size)
				continue;

			add_region(nid, start >> PAGE_SHIFT,
				   size >> PAGE_SHIFT);
		}
	}

	return 0;
}
/*
 * Fallback layout used when no NUMA information is available: every
 * region in lmb.memory is recorded on node 0, and node 0 is marked
 * online.
 */
static void __init setup_nonnuma(void)
{
	unsigned long dram_top = lmb_end_of_DRAM();
	unsigned long ram_bytes = lmb_phys_mem_size();
	unsigned int idx = 0;

	printk(KERN_DEBUG " Top of RAM: 0x%lx, Total RAM: 0x%lx \n ",
	       dram_top, ram_bytes);
	printk(KERN_DEBUG " Memory hole size: %ldMB \n ",
	       (dram_top - ram_bytes) >> 20);

	while (idx < lmb.memory.cnt) {
		add_region(0, lmb.memory.region[idx].base >> PAGE_SHIFT,
			   lmb_size_pages(&lmb.memory, idx));
		++idx;
	}

	node_set_online(0);
}
2005-12-13 06:56:47 +11:00
/*
 * Print, for each online node, the cpus in its cpumask as a list of
 * ranges.  Does nothing when NUMA is disabled or no associativity depth
 * was found.
 */
void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, run;

	if (!numa_enabled || min_common_depth == -1)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG " Node %d CPUs: ", node);

		/*
		 * Every cpu number is visited, rather than using a cpu
		 * iterator, so that holes in the cpumap end a range.
		 * "run" is the length of the current run of set cpus.
		 */
		run = 0;
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			if (!cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
				/* A run longer than one cpu is closed with its last member. */
				if (run > 1)
					printk(" -%u ", cpu - 1);
				run = 0;
				continue;
			}

			if (run == 0)
				printk(" %u ", cpu);
			++run;
		}

		/* A run still open at the end of the cpu space. */
		if (run > 1)
			printk(" -%u ", NR_CPUS - 1);
		printk(" \n ");
	}
}
/*
 * Print, for each online node, the address ranges that belong to it.
 * Memory is sampled in steps of (1 << SECTION_SIZE_BITS) bytes up to
 * lmb_end_of_DRAM().  Does nothing when NUMA is disabled or no
 * associativity depth was found.
 */
static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int run;

	if (!numa_enabled || min_common_depth == -1)
		return;

	for_each_online_node(node) {
		unsigned long addr;

		printk(KERN_DEBUG " Node %d Memory: ", node);

		run = 0;
		for (addr = 0; addr < lmb_end_of_DRAM();
		     addr += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(addr >> PAGE_SHIFT) != node) {
				/* Leaving this node's memory: close the open range. */
				if (run > 0)
					printk(" -0x%lx ", addr);
				run = 0;
				continue;
			}

			if (run == 0)
				printk(" 0x%lx ", addr);
			++run;
		}

		/* A range still open when the scan reached its end. */
		if (run > 0)
			printk(" -0x%lx ", addr);
		printk(" \n ");
	}
}
/*
* Allocate some memory , satisfying the lmb or bootmem allocator where
* required . nid is the preferred node and end is the physical address of
* the highest address in the node .
*
* Returns the physical address of the memory .
*/
2005-11-11 14:22:35 +11:00
static void __init * careful_allocation ( int nid , unsigned long size ,
unsigned long align ,
unsigned long end_pfn )
2005-04-16 15:20:36 -07:00
{
2005-11-11 14:22:35 +11:00
int new_nid ;
2006-01-25 21:31:28 +13:00
unsigned long ret = __lmb_alloc_base ( size , align , end_pfn < < PAGE_SHIFT ) ;
2005-04-16 15:20:36 -07:00
/* retry over all memory */
if ( ! ret )
2006-01-25 21:31:28 +13:00
ret = __lmb_alloc_base ( size , align , lmb_end_of_DRAM ( ) ) ;
2005-04-16 15:20:36 -07:00
if ( ! ret )
panic ( " numa.c: cannot allocate %lu bytes on node %d " ,
size , nid ) ;
/*
* If the memory came from a previously allocated node , we must
* retry with the bootmem allocator .
*/
2005-11-11 14:22:35 +11:00
new_nid = early_pfn_to_nid ( ret > > PAGE_SHIFT ) ;
if ( new_nid < nid ) {
ret = ( unsigned long ) __alloc_bootmem_node ( NODE_DATA ( new_nid ) ,
2005-04-16 15:20:36 -07:00
size , align , 0 ) ;
if ( ! ret )
panic ( " numa.c: cannot allocate %lu bytes on node %d " ,
2005-11-11 14:22:35 +11:00
size , new_nid ) ;
2005-04-16 15:20:36 -07:00
2005-11-11 14:22:35 +11:00
ret = __pa ( ret ) ;
2005-04-16 15:20:36 -07:00
dbg ( " alloc_bootmem %lx %lx \n " , ret , size ) ;
}
2005-11-11 14:22:35 +11:00
return ( void * ) ret ;
2005-04-16 15:20:36 -07:00
}
2006-06-27 02:54:09 -07:00
/*
 * CPU notifier that routes cpu events to cpu_numa_callback().
 * Registered by do_init_bootmem() via register_cpu_notifier().
 */
static struct notifier_block __cpuinitdata ppc64_numa_nb = {
. notifier_call = cpu_numa_callback ,
. priority = 1 /* Must run before sched domains notifier. */
} ;
2005-04-16 15:20:36 -07:00
/*
 * Set up the bootmem allocator for every online node.
 *
 * Steps, in order: set the global pfn limits from lmb_end_of_DRAM();
 * build the region table (parse_numa_properties(), falling back to
 * setup_nonnuma() when it returns non-zero); register the cpu notifier
 * and map the boot cpu; then, per online node, allocate and initialise
 * its pglist_data and bootmem bitmap, free the node's regions into
 * bootmem, re-reserve the parts covered by lmb.reserved, and report the
 * regions to memory_present().
 */
void __init do_init_bootmem ( void )
{
int nid ;
2005-11-11 14:22:35 +11:00
unsigned int i ;
2005-04-16 15:20:36 -07:00
min_low_pfn = 0 ;
max_low_pfn = lmb_end_of_DRAM ( ) > > PAGE_SHIFT ;
max_pfn = max_low_pfn ;
/* A non-zero return means no usable NUMA layout: put everything on node 0. */
if ( parse_numa_properties ( ) )
setup_nonnuma ( ) ;
else
2005-12-13 06:56:47 +11:00
dump_numa_memory_topology ( ) ;
2005-04-16 15:20:36 -07:00
register_cpu_notifier ( & ppc64_numa_nb ) ;
2006-03-20 18:37:15 -06:00
/* Invoke the callback directly so the boot cpu is mapped to its node now. */
cpu_numa_callback ( & ppc64_numa_nb , CPU_UP_PREPARE ,
( void * ) ( unsigned long ) boot_cpuid ) ;
2005-04-16 15:20:36 -07:00
for_each_online_node ( nid ) {
2005-11-11 14:22:35 +11:00
unsigned long start_pfn , end_pfn , pages_present ;
2005-04-16 15:20:36 -07:00
unsigned long bootmem_paddr ;
unsigned long bootmap_pages ;
2005-11-11 14:22:35 +11:00
get_region ( nid , & start_pfn , & end_pfn , & pages_present ) ;
2005-04-16 15:20:36 -07:00
/* Allocate the node structure node local if possible */
2005-11-11 14:22:35 +11:00
NODE_DATA ( nid ) = careful_allocation ( nid ,
2005-04-16 15:20:36 -07:00
sizeof ( struct pglist_data ) ,
2005-11-11 14:22:35 +11:00
SMP_CACHE_BYTES , end_pfn ) ;
/* careful_allocation() hands back a physical address; convert it before use. */
NODE_DATA ( nid ) = __va ( NODE_DATA ( nid ) ) ;
2005-04-16 15:20:36 -07:00
memset ( NODE_DATA ( nid ) , 0 , sizeof ( struct pglist_data ) ) ;
dbg ( " node %d \n " , nid ) ;
dbg ( " NODE_DATA() = %p \n " , NODE_DATA ( nid ) ) ;
NODE_DATA ( nid ) - > bdata = & plat_node_bdata [ nid ] ;
2005-11-11 14:22:35 +11:00
NODE_DATA ( nid ) - > node_start_pfn = start_pfn ;
NODE_DATA ( nid ) - > node_spanned_pages = end_pfn - start_pfn ;
2005-04-16 15:20:36 -07:00
/* A node that spans no pages gets its pglist_data but no bootmem setup. */
if ( NODE_DATA ( nid ) - > node_spanned_pages = = 0 )
continue ;
2005-11-11 14:22:35 +11:00
dbg ( " start_paddr = %lx \n " , start_pfn < < PAGE_SHIFT ) ;
dbg ( " end_paddr = %lx \n " , end_pfn < < PAGE_SHIFT ) ;
2005-04-16 15:20:36 -07:00
2005-11-11 14:22:35 +11:00
/* Allocate and zero the bootmem bitmap covering the node's pfn span. */
bootmap_pages = bootmem_bootmap_pages ( end_pfn - start_pfn ) ;
bootmem_paddr = ( unsigned long ) careful_allocation ( nid ,
bootmap_pages < < PAGE_SHIFT ,
PAGE_SIZE , end_pfn ) ;
memset ( __va ( bootmem_paddr ) , 0 , bootmap_pages < < PAGE_SHIFT ) ;
2005-04-16 15:20:36 -07:00
dbg ( " bootmap_paddr = %lx \n " , bootmem_paddr ) ;
init_bootmem_node ( NODE_DATA ( nid ) , bootmem_paddr > > PAGE_SHIFT ,
2005-11-11 14:22:35 +11:00
start_pfn , end_pfn ) ;
2005-04-16 15:20:36 -07:00
2005-11-11 14:22:35 +11:00
/* Add free regions on this node */
for ( i = 0 ; init_node_data [ i ] . end_pfn ; i + + ) {
unsigned long start , end ;
2005-04-16 15:20:36 -07:00
2005-11-11 14:22:35 +11:00
if ( init_node_data [ i ] . nid ! = nid )
2005-04-16 15:20:36 -07:00
continue ;
2005-11-11 14:22:35 +11:00
start = init_node_data [ i ] . start_pfn < < PAGE_SHIFT ;
end = init_node_data [ i ] . end_pfn < < PAGE_SHIFT ;
2005-04-16 15:20:36 -07:00
2005-11-11 14:22:35 +11:00
dbg ( " free_bootmem %lx %lx \n " , start , end - start ) ;
free_bootmem_node ( NODE_DATA ( nid ) , start , end - start ) ;
2005-04-16 15:20:36 -07:00
}
2005-11-11 14:22:35 +11:00
/* Mark reserved regions on this node */
2005-04-16 15:20:36 -07:00
for ( i = 0 ; i < lmb . reserved . cnt ; i + + ) {
2005-08-03 20:21:26 +10:00
unsigned long physbase = lmb . reserved . region [ i ] . base ;
2005-04-16 15:20:36 -07:00
unsigned long size = lmb . reserved . region [ i ] . size ;
2005-11-11 14:22:35 +11:00
unsigned long start_paddr = start_pfn < < PAGE_SHIFT ;
unsigned long end_paddr = end_pfn < < PAGE_SHIFT ;
2005-04-16 15:20:36 -07:00
2005-11-11 14:22:35 +11:00
/* Skip reserved ranges whose first and last byte both lie outside this node. */
if ( early_pfn_to_nid ( physbase > > PAGE_SHIFT ) ! = nid & &
early_pfn_to_nid ( ( physbase + size - 1 ) > > PAGE_SHIFT ) ! = nid )
2005-04-16 15:20:36 -07:00
continue ;
if ( physbase < end_paddr & &
( physbase + size ) > start_paddr ) {
/* overlaps */
/* Clip the reserved range to [start_paddr, end_paddr) before reserving it. */
if ( physbase < start_paddr ) {
size - = start_paddr - physbase ;
physbase = start_paddr ;
}
if ( size > end_paddr - physbase )
size = end_paddr - physbase ;
dbg ( " reserve_bootmem %lx %lx \n " , physbase ,
size ) ;
reserve_bootmem_node ( NODE_DATA ( nid ) , physbase ,
size ) ;
}
}
2005-09-03 15:54:26 -07:00
2005-11-11 14:22:35 +11:00
/* Add regions into sparsemem */
for ( i = 0 ; init_node_data [ i ] . end_pfn ; i + + ) {
unsigned long start , end ;
if ( init_node_data [ i ] . nid ! = nid )
2005-09-03 15:54:26 -07:00
continue ;
2005-11-11 14:22:35 +11:00
start = init_node_data [ i ] . start_pfn ;
end = init_node_data [ i ] . end_pfn ;
2005-09-03 15:54:26 -07:00
2005-11-11 14:22:35 +11:00
memory_present ( nid , start , end ) ;
2005-09-03 15:54:26 -07:00
}
2005-04-16 15:20:36 -07:00
}
}
/*
 * Initialise the zone data of every online node.  All of a node's pages
 * are placed in ZONE_DMA.  The hole size is the node's pfn span minus
 * the pages actually present.
 */
void __init paging_init(void)
{
	unsigned long zone_pages[MAX_NR_ZONES];
	unsigned long hole_pages[MAX_NR_ZONES];
	int nid;

	memset(zone_pages, 0, sizeof(zone_pages));
	memset(hole_pages, 0, sizeof(hole_pages));

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn, pages_present;
		unsigned long spanned;

		get_region(nid, &start_pfn, &end_pfn, &pages_present);

		spanned = end_pfn - start_pfn;
		zone_pages[ZONE_DMA] = spanned;
		hole_pages[ZONE_DMA] = spanned - pages_present;

		dbg(" free_area_init node %d %lx %lx (hole: %lx) \n ", nid,
		    zone_pages[ZONE_DMA], start_pfn, hole_pages[ZONE_DMA]);

		free_area_init_node(nid, NODE_DATA(nid), zone_pages,
				    start_pfn, hole_pages);
	}
}
/*
 * Handler for the "numa" early parameter.
 *
 * @p: parameter value, may be NULL.
 *
 * Disables NUMA when the value contains the off keyword and enables
 * debug tracing when it contains the debug keyword.  Always returns 0.
 */
static int __init early_numa(char *p)
{
	if (p) {
		if (strstr(p, " off "))
			numa_enabled = 0;

		if (strstr(p, " debug "))
			numa_debug = 1;
	}

	return 0;
}
early_param ( " numa " , early_numa ) ;
2005-12-05 12:06:42 -08:00
# ifdef CONFIG_MEMORY_HOTPLUG
/*
* Find the node associated with a hot added memory section . Section
* corresponds to a SPARSEMEM section , not an LMB . It is assumed that
* sections are fully contained within a single LMB .
*/
int hot_add_scn_to_nid ( unsigned long scn_addr )
{
struct device_node * memory = NULL ;
2005-12-16 14:30:35 -08:00
nodemask_t nodes ;
2006-03-20 18:36:45 -06:00
int default_nid = any_online_node ( NODE_MASK_ALL ) ;
2006-03-24 02:34:46 -08:00
int nid ;
2005-12-05 12:06:42 -08:00
if ( ! numa_enabled | | ( min_common_depth < 0 ) )
2006-03-20 18:36:45 -06:00
return default_nid ;
2005-12-05 12:06:42 -08:00
while ( ( memory = of_find_node_by_type ( memory , " memory " ) ) ! = NULL ) {
unsigned long start , size ;
2005-12-16 14:30:35 -08:00
int ranges ;
2006-07-12 15:35:54 +10:00
const unsigned int * memcell_buf ;
2005-12-05 12:06:42 -08:00
unsigned int len ;
2006-07-12 15:35:54 +10:00
memcell_buf = get_property ( memory , " reg " , & len ) ;
2005-12-05 12:06:42 -08:00
if ( ! memcell_buf | | len < = 0 )
continue ;
2005-12-13 18:01:21 +11:00
/* ranges in cell */
ranges = ( len > > 2 ) / ( n_mem_addr_cells + n_mem_size_cells ) ;
2005-12-05 12:06:42 -08:00
ha_new_range :
start = read_n_cells ( n_mem_addr_cells , & memcell_buf ) ;
size = read_n_cells ( n_mem_size_cells , & memcell_buf ) ;
2006-05-01 12:16:12 -07:00
nid = of_node_to_nid_single ( memory ) ;
2005-12-05 12:06:42 -08:00
/* Domains not present at boot default to 0 */
2006-03-20 18:36:45 -06:00
if ( nid < 0 | | ! node_online ( nid ) )
nid = default_nid ;
2005-12-05 12:06:42 -08:00
if ( ( scn_addr > = start ) & & ( scn_addr < ( start + size ) ) ) {
of_node_put ( memory ) ;
2006-03-20 18:35:45 -06:00
goto got_nid ;
2005-12-05 12:06:42 -08:00
}
if ( - - ranges ) /* process all ranges in cell */
goto ha_new_range ;
}
BUG ( ) ; /* section address should be found above */
2006-03-24 02:34:46 -08:00
return 0 ;
2005-12-16 14:30:35 -08:00
/* Temporary code to ensure that returned node is not empty */
2006-03-20 18:35:45 -06:00
got_nid :
2005-12-16 14:30:35 -08:00
nodes_setall ( nodes ) ;
2006-03-20 18:35:45 -06:00
while ( NODE_DATA ( nid ) - > node_spanned_pages = = 0 ) {
node_clear ( nid , nodes ) ;
nid = any_online_node ( nodes ) ;
2005-12-16 14:30:35 -08:00
}
2006-03-20 18:35:45 -06:00
return nid ;
2005-12-05 12:06:42 -08:00
}
# endif /* CONFIG_MEMORY_HOTPLUG */