/*
 * Copyright (c) 2000, 2003 Silicon Graphics, Inc.  All rights reserved.
 * Copyright (c) 2001 Intel Corp.
 * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
 * Copyright (c) 2002 NEC Corp.
 * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
 * Copyright (c) 2004 Silicon Graphics, Inc
 *	Russ Anderson <rja@sgi.com>
 *	Jesse Barnes <jbarnes@sgi.com>
 *	Jack Steiner <steiner@sgi.com>
 */

/*
 * Platform initialization for Discontig Memory
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bootmem.h>
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/nodemask.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/meminit.h>
#include <asm/numa.h>
#include <asm/sections.h>
/*
 * Track per-node information needed to setup the boot memory allocator, the
 * per-node areas, and the real VM.
 */
struct early_node_data {
        struct ia64_node_data *node_data;
        unsigned long pernode_addr;
        unsigned long pernode_size;
        struct bootmem_data bootmem_data;
        unsigned long num_physpages;
        unsigned long num_dma_physpages;
        unsigned long min_pfn;
        unsigned long max_pfn;
};

static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
static nodemask_t memory_less_mask __initdata;

static pg_data_t *pgdat_list[MAX_NUMNODES];
/*
 * To prevent cache aliasing effects, align per-node structures so that they
 * start at addresses that are strided by node number.
 */
#define MAX_NODE_ALIGN_OFFSET	(32 * 1024 * 1024)
#define NODEDATA_ALIGN(addr, node)					\
	((((addr) + 1024*1024-1) & ~(1024*1024-1)) +			\
	     (((node)*PERCPU_PAGE_SIZE) & (MAX_NODE_ALIGN_OFFSET - 1)))
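/*
 * Example (assuming the usual 64KB ia64 PERCPU_PAGE_SIZE): a range starting
 * at 0x4080000 is rounded up to the 1MB boundary 0x4100000; node 0 then gets
 * no extra offset, node 1 gets +64KB, node 2 gets +128KB, and so on, wrapping
 * every 32MB.  Hot per-node structures on different nodes therefore start at
 * different cache indexes.
 */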
/**
 * build_node_maps - callback to setup bootmem structs for each node
 * @start: physical start of range
 * @len: length of range
 * @node: node where this range resides
 *
 * We allocate a struct bootmem_data for each piece of memory that we wish to
 * treat as a virtually contiguous block (i.e. each node). Each such block
 * must start on an %IA64_GRANULE_SIZE boundary, so we round the address down
 * if necessary.  Any non-existent pages will simply be part of the virtual
 * memmap.  We also update min_low_pfn and max_low_pfn here as we receive
 * memory ranges from the caller.
 */
static int __init build_node_maps(unsigned long start, unsigned long len,
                                  int node)
{
        unsigned long cstart, epfn, end = start + len;
        struct bootmem_data *bdp = &mem_data[node].bootmem_data;

        epfn = GRANULEROUNDUP(end) >> PAGE_SHIFT;
        cstart = GRANULEROUNDDOWN(start);

        if (!bdp->node_low_pfn) {
                bdp->node_boot_start = cstart;
                bdp->node_low_pfn = epfn;
        } else {
                bdp->node_boot_start = min(cstart, bdp->node_boot_start);
                bdp->node_low_pfn = max(epfn, bdp->node_low_pfn);
        }

        min_low_pfn = min(min_low_pfn, bdp->node_boot_start >> PAGE_SHIFT);
        max_low_pfn = max(max_low_pfn, bdp->node_low_pfn);

        return 0;
}
/**
 * early_nr_cpus_node - return number of cpus on a given node
 * @node: node to check
 *
 * Count the number of cpus on @node.  We can't use nr_cpus_node() yet because
 * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
 * called yet.  Note that node 0 will also count all non-existent cpus.
 */
static int __meminit early_nr_cpus_node(int node)
{
        int cpu, n = 0;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (node == node_cpuid[cpu].nid)
                        n++;

        return n;
}
/**
 * compute_pernodesize - compute size of pernode data
 * @node: the node id.
 */
static unsigned long __meminit compute_pernodesize(int node)
{
        unsigned long pernodesize = 0, cpus;

        cpus = early_nr_cpus_node(node);
        pernodesize += PERCPU_PAGE_SIZE * cpus;
        pernodesize += node * L1_CACHE_BYTES;
        pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
        pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
        pernodesize = PAGE_ALIGN(pernodesize);
        return pernodesize;
}
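/*
 * Note: the size computed by compute_pernodesize() is meant to cover
 * everything fill_pernode() lays out in the per-node area (the per-cpu
 * regions, the node-strided cache pad, the pg_data_t and the ia64_node_data);
 * if the two drifted apart, the space checked in find_pernode_space() and
 * reserved in reserve_pernode_space() could end up too small.
 */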
/**
 * per_cpu_node_setup - setup per-cpu areas on each node
 * @cpu_data: per-cpu area on this node
 * @node: node to setup
 *
 * Copy the static per-cpu data into the region we just set aside and then
 * setup __per_cpu_offset for each CPU on this node.  Return a pointer to
 * the end of the area.
 */
static void *per_cpu_node_setup(void *cpu_data, int node)
{
#ifdef CONFIG_SMP
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (node == node_cpuid[cpu].nid) {
                        memcpy(__va(cpu_data), __phys_per_cpu_start,
                               __per_cpu_end - __per_cpu_start);
                        __per_cpu_offset[cpu] = (char *)__va(cpu_data) -
                                __per_cpu_start;
                        cpu_data += PERCPU_PAGE_SIZE;
                }
        }
#endif
        return cpu_data;
}
/**
 * fill_pernode - initialize pernode data.
 * @node: the node id.
 * @pernode: physical address of pernode data
 * @pernodesize: size of the pernode data
 */
static void __init fill_pernode(int node, unsigned long pernode,
        unsigned long pernodesize)
{
        void *cpu_data;
        int cpus = early_nr_cpus_node(node);
        struct bootmem_data *bdp = &mem_data[node].bootmem_data;

        mem_data[node].pernode_addr = pernode;
        mem_data[node].pernode_size = pernodesize;
        memset(__va(pernode), 0, pernodesize);

        cpu_data = (void *)pernode;
        pernode += PERCPU_PAGE_SIZE * cpus;
        pernode += node * L1_CACHE_BYTES;

        pgdat_list[node] = __va(pernode);
        pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));

        mem_data[node].node_data = __va(pernode);
        pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));

        pgdat_list[node]->bdata = bdp;
        pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));

        cpu_data = per_cpu_node_setup(cpu_data, node);

        return;
}
/**
 * find_pernode_space - allocate memory for memory map and per-node structures
 * @start: physical start of range
 * @len: length of range
 * @node: node where this range resides
 *
 * This routine reserves space for the per-cpu data struct, the list of
 * pg_data_ts and the per-node data struct.  Each node will have something like
 * the following in the first chunk of addr. space large enough to hold it.
 *
 *    ________________________
 *   |                        |
 *   |~~~~~~~~~~~~~~~~~~~~~~~~| <-- NODEDATA_ALIGN(start, node) for the first
 *   |    PERCPU_PAGE_SIZE *  |     start and length big enough
 *   |    cpus_on_this_node   | Node 0 will also have entries for all non-existent cpus.
 *   |------------------------|
 *   |   local pg_data_t *    |
 *   |------------------------|
 *   |  local ia64_node_data  |
 *   |------------------------|
 *   |          ???           |
 *   |________________________|
 *
 * Once this space has been set aside, the bootmem maps are initialized.  We
 * could probably move the allocation of the per-cpu and ia64_node_data space
 * outside of this function and use alloc_bootmem_node(), but doing it here
 * is straightforward and we get the alignments we want so...
 */
static int __init find_pernode_space(unsigned long start, unsigned long len,
                                     int node)
{
        unsigned long epfn;
        unsigned long pernodesize = 0, pernode, pages, mapsize;
        struct bootmem_data *bdp = &mem_data[node].bootmem_data;

        epfn = (start + len) >> PAGE_SHIFT;

        pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
        mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;

        /*
         * Make sure this memory falls within this node's usable memory
         * since we may have thrown some away in build_maps().
         */
        if (start < bdp->node_boot_start || epfn > bdp->node_low_pfn)
                return 0;

        /* Don't setup this node's local space twice... */
        if (mem_data[node].pernode_addr)
                return 0;

        /*
         * Calculate total size needed, incl. what's necessary
         * for good alignment and alias prevention.
         */
        pernodesize = compute_pernodesize(node);
        pernode = NODEDATA_ALIGN(start, node);

        /* Is this range big enough for what we want to store here? */
        if (start + len > (pernode + pernodesize + mapsize))
                fill_pernode(node, pernode, pernodesize);

        return 0;
}
/**
 * free_node_bootmem - free bootmem allocator memory for use
 * @start: physical start of range
 * @len: length of range
 * @node: node where this range resides
 *
 * Simply calls the bootmem allocator to free the specified range from
 * the given pg_data_t's bdata struct.  After this function has been called
 * for all the entries in the EFI memory map, the bootmem allocator will
 * be ready to service allocation requests.
 */
static int __init free_node_bootmem(unsigned long start, unsigned long len,
                                    int node)
{
        free_bootmem_node(pgdat_list[node], start, len);

        return 0;
}
/**
 * reserve_pernode_space - reserve memory for per-node space
 *
 * Reserve the space used by the bootmem maps & per-node space in the boot
 * allocator so that when we actually create the real mem maps we don't
 * use their memory.
 */
static void __init reserve_pernode_space(void)
{
        unsigned long base, size, pages;
        struct bootmem_data *bdp;
        int node;

        for_each_online_node(node) {
                pg_data_t *pdp = pgdat_list[node];

                if (node_isset(node, memory_less_mask))
                        continue;

                bdp = pdp->bdata;

                /* First the bootmem_map itself */
                pages = bdp->node_low_pfn - (bdp->node_boot_start >> PAGE_SHIFT);
                size = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
                base = __pa(bdp->node_bootmem_map);
                reserve_bootmem_node(pdp, base, size);

                /* Now the per-node space */
                size = mem_data[node].pernode_size;
                base = __pa(mem_data[node].pernode_addr);
                reserve_bootmem_node(pdp, base, size);
        }
}
static void __meminit scatter_node_data(void)
{
        pg_data_t **dst;
        int node;

        /*
         * for_each_online_node() can't be used here.
         * node_online_map is not set for hot-added nodes at this time,
         * because we are halfway through initialization of the new node's
         * structures.  If for_each_online_node() is used, a new node's
         * pg_data_ptrs will not be initialized. Instead of using it,
         * pgdat_list[] is checked.
         */
        for_each_node(node) {
                if (pgdat_list[node]) {
                        dst = LOCAL_DATA_ADDR(pgdat_list[node])->pg_data_ptrs;
                        memcpy(dst, pgdat_list, sizeof(pgdat_list));
                }
}
}
/**
 * initialize_pernode_data - fixup per-cpu & per-node pointers
 *
 * Each node's per-node area has a copy of the global pg_data_t list, so
 * we copy that to each node here, as well as setting the per-cpu pointer
 * to the local node data structure.  The active_cpus field of the per-node
 * structure gets setup by the platform_cpu_init() function later.
 */
static void __init initialize_pernode_data(void)
{
        int cpu, node;

        scatter_node_data();

#ifdef CONFIG_SMP
        /* Set the node_data pointer for each per-cpu struct */
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                node = node_cpuid[cpu].nid;
                per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
        }
#else
        {
                struct cpuinfo_ia64 *cpu0_cpu_info;
                cpu = 0;
                node = node_cpuid[cpu].nid;
                cpu0_cpu_info = (struct cpuinfo_ia64 *)(__phys_per_cpu_start +
                        ((char *)&per_cpu__cpu_info - __per_cpu_start));
                cpu0_cpu_info->node_data = mem_data[node].node_data;
        }
#endif /* CONFIG_SMP */
}
/**
 * memory_less_node_alloc - attempt to allocate memory on the best NUMA SLIT
 *	node, but fall back to any other node when __alloc_bootmem_node()
 *	fails for the best node.
 * @nid: node id
 * @pernodesize: size of this node's pernode data
 */
static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
{
        void *ptr = NULL;
        u8 best = 0xff;
        int bestnode = -1, node, anynode = 0;

        for_each_online_node(node) {
                if (node_isset(node, memory_less_mask))
                        continue;
                else if (node_distance(nid, node) < best) {
                        best = node_distance(nid, node);
                        bestnode = node;
                }
                anynode = node;
        }

        if (bestnode == -1)
                bestnode = anynode;

        ptr = __alloc_bootmem_node(pgdat_list[bestnode], pernodesize,
                                   PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));

        return ptr;
}
/**
 * memory_less_nodes - allocate and initialize pernode information for
 *	CPU-only (memoryless) nodes.
 */
static void __init memory_less_nodes(void)
{
        unsigned long pernodesize;
        void *pernode;
        int node;

        for_each_node_mask(node, memory_less_mask) {
                pernodesize = compute_pernodesize(node);
                pernode = memory_less_node_alloc(node, pernodesize);
                fill_pernode(node, __pa(pernode), pernodesize);
        }

        return;
}
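/*
 * After memory_less_nodes() runs, every online node, including CPU-only
 * ones, has a valid pgdat_list[] entry and ia64_node_data; for a memoryless
 * node these structures simply live in a nearby node's memory.
 */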
#ifdef CONFIG_SPARSEMEM
/**
 * register_sparse_mem - notify SPARSEMEM that this memory range exists.
 * @start: physical start of range
 * @end: physical end of range
 * @arg: unused
 *
 * Simply calls SPARSEMEM to register memory section(s).
 */
static int __init register_sparse_mem(unsigned long start, unsigned long end,
        void *arg)
{
        int nid;

        start = __pa(start) >> PAGE_SHIFT;
        end = __pa(end) >> PAGE_SHIFT;
        nid = early_pfn_to_nid(start);
        memory_present(nid, start, end);

        return 0;
}

static void __init arch_sparse_init(void)
{
        efi_memmap_walk(register_sparse_mem, NULL);
        sparse_init();
}
#else
#define arch_sparse_init() do {} while (0)
#endif
/**
 * find_memory - walk the EFI memory map and setup the bootmem allocator
 *
 * Called early in boot to setup the bootmem allocator, and to
 * allocate the per-cpu and per-node structures.
 */
void __init find_memory(void)
{
        int node;

        reserve_memory();

        if (num_online_nodes() == 0) {
                printk(KERN_ERR "node info missing!\n");
                node_set_online(0);
        }

        nodes_or(memory_less_mask, memory_less_mask, node_online_map);
        min_low_pfn = -1;
        max_low_pfn = 0;

        /* These actually end up getting called by call_pernode_memory() */
        efi_memmap_walk(filter_rsvd_memory, build_node_maps);
        efi_memmap_walk(filter_rsvd_memory, find_pernode_space);

        for_each_online_node(node)
                if (mem_data[node].bootmem_data.node_low_pfn) {
                        node_clear(node, memory_less_mask);
                        mem_data[node].min_pfn = ~0UL;
                }
        /*
         * Initialize the boot memory maps in reverse order since that's
         * what the bootmem allocator expects
         */
        for (node = MAX_NUMNODES - 1; node >= 0; node--) {
                unsigned long pernode, pernodesize, map;
                struct bootmem_data *bdp;

                if (!node_online(node))
                        continue;
                else if (node_isset(node, memory_less_mask))
                        continue;

                bdp = &mem_data[node].bootmem_data;
                pernode = mem_data[node].pernode_addr;
                pernodesize = mem_data[node].pernode_size;
                map = pernode + pernodesize;

                init_bootmem_node(pgdat_list[node],
                                  map >> PAGE_SHIFT,
                                  bdp->node_boot_start >> PAGE_SHIFT,
                                  bdp->node_low_pfn);
        }

        efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);

        reserve_pernode_space();
        memory_less_nodes();
        initialize_pernode_data();

        max_pfn = max_low_pfn;

        find_initrd();
}
#ifdef CONFIG_SMP
/**
 * per_cpu_init - setup per-cpu variables
 *
 * find_pernode_space() does most of this already, we just need to set
 * local_per_cpu_offset
 */
void __cpuinit *per_cpu_init(void)
{
        int cpu;
        static int first_time = 1;

        if (smp_processor_id() != 0)
                return __per_cpu_start + __per_cpu_offset[smp_processor_id()];

        if (first_time) {
                first_time = 0;
                for (cpu = 0; cpu < NR_CPUS; cpu++)
                        per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
        }

        return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
}
#endif /* CONFIG_SMP */
/**
 * show_mem - give short summary of memory stats
 *
 * Shows a simple page count of reserved and used pages in the system.
 * For discontig machines, it does this on a per-pgdat basis.
 */
void show_mem(void)
{
        int i, total_reserved = 0;
        int total_shared = 0, total_cached = 0;
        unsigned long total_present = 0;
        pg_data_t *pgdat;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n",
               nr_swap_pages << (PAGE_SHIFT - 10));
        for_each_online_pgdat(pgdat) {
                unsigned long present;
                unsigned long flags;
                int shared = 0, cached = 0, reserved = 0;

                printk(KERN_INFO "Node ID: %d\n", pgdat->node_id);
                pgdat_resize_lock(pgdat, &flags);
                present = pgdat->node_present_pages;
                for (i = 0; i < pgdat->node_spanned_pages; i++) {
                        struct page *page;
                        if (pfn_valid(pgdat->node_start_pfn + i))
                                page = pfn_to_page(pgdat->node_start_pfn + i);
                        else {
                                i = vmemmap_find_next_valid_pfn(pgdat->node_id,
                                        i) - 1;
                                continue;
                        }
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
                pgdat_resize_unlock(pgdat, &flags);
                total_present += present;
                total_reserved += reserved;
                total_cached += cached;
                total_shared += shared;
                printk(KERN_INFO "\t%ld pages of RAM\n", present);
                printk(KERN_INFO "\t%d reserved pages\n", reserved);
                printk(KERN_INFO "\t%d pages shared\n", shared);
                printk(KERN_INFO "\t%d pages swap cached\n", cached);
        }
        printk(KERN_INFO "%ld pages of RAM\n", total_present);
        printk(KERN_INFO "%d reserved pages\n", total_reserved);
        printk(KERN_INFO "%d pages shared\n", total_shared);
        printk(KERN_INFO "%d pages swap cached\n", total_cached);
        printk(KERN_INFO "Total of %ld pages in page table cache\n",
               pgtable_quicklist_total_size());
        printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages());
}
/**
 * call_pernode_memory - use SRAT to call callback functions with node info
 * @start: physical start of range
 * @len: length of range
 * @arg: function to call for each range
 *
 * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
 * out to which node a block of memory belongs.  Ignore memory that we cannot
 * identify, and split blocks that run across multiple nodes.
 *
 * Take this opportunity to round the start address up and the end address
 * down to page boundaries.
 */
void call_pernode_memory(unsigned long start, unsigned long len, void *arg)
{
        unsigned long rs, re, end = start + len;
        void (*func)(unsigned long, unsigned long, int);
        int i;

        start = PAGE_ALIGN(start);
        end &= PAGE_MASK;
        if (start >= end)
                return;

        func = arg;

        if (!num_node_memblks) {
                /* No SRAT table, so assume one node (node 0) */
                if (start < end)
                        (*func)(start, end - start, 0);
                return;
        }

        for (i = 0; i < num_node_memblks; i++) {
                rs = max(start, node_memblk[i].start_paddr);
                re = min(end, node_memblk[i].start_paddr +
                         node_memblk[i].size);

                if (rs < re)
                        (*func)(rs, re - rs, node_memblk[i].nid);

                if (re == end)
                        break;
        }
}
/**
 * count_node_pages - callback to build per-node memory info structures
 * @start: physical start of range
 * @len: length of range
 * @node: node where this range resides
 *
 * Each node has its own number of physical pages, DMAable pages, start, and
 * end page frame number.  This routine will be called by call_pernode_memory()
 * for each piece of usable memory and will setup these values for each node.
 * Very similar to build_maps().
 */
static __init int count_node_pages(unsigned long start, unsigned long len, int node)
{
        unsigned long end = start + len;

        mem_data[node].num_physpages += len >> PAGE_SHIFT;
        if (start <= __pa(MAX_DMA_ADDRESS))
                mem_data[node].num_dma_physpages +=
                        (min(end, __pa(MAX_DMA_ADDRESS)) - start) >> PAGE_SHIFT;
        start = GRANULEROUNDDOWN(start);
        start = ORDERROUNDDOWN(start);
        end = GRANULEROUNDUP(end);
        mem_data[node].max_pfn = max(mem_data[node].max_pfn,
                                     end >> PAGE_SHIFT);
        mem_data[node].min_pfn = min(mem_data[node].min_pfn,
                                     start >> PAGE_SHIFT);

        return 0;
}
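/*
 * The min_pfn/max_pfn values above are rounded out to granule (and, for the
 * start, MAX_ORDER) boundaries so that each node's mem_map covers any
 * partially-populated granules; the pages that don't actually exist are
 * later reported to free_area_init_node() via zholes_size in paging_init().
 */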
/**
 * paging_init - setup page tables
 *
 * paging_init() sets up the page tables for each node of the system and frees
 * the bootmem allocator memory for general use.
 */
void __init paging_init(void)
{
        unsigned long max_dma;
        unsigned long zones_size[MAX_NR_ZONES];
        unsigned long zholes_size[MAX_NR_ZONES];
        unsigned long pfn_offset = 0;
        int node;

        max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;

        arch_sparse_init();

        efi_memmap_walk(filter_rsvd_memory, count_node_pages);

#ifdef CONFIG_VIRTUAL_MEM_MAP
        vmalloc_end -= PAGE_ALIGN(ALIGN(max_low_pfn, MAX_ORDER_NR_PAGES) *
                sizeof(struct page));
        vmem_map = (struct page *) vmalloc_end;
        efi_memmap_walk(create_mem_map_page_table, NULL);
        printk("Virtual mem_map starts at 0x%p\n", vmem_map);
#endif

        for_each_online_node(node) {
                memset(zones_size, 0, sizeof(zones_size));
                memset(zholes_size, 0, sizeof(zholes_size));

                num_physpages += mem_data[node].num_physpages;

                if (mem_data[node].min_pfn >= max_dma) {
                        /* All of this node's memory is above ZONE_DMA */
                        zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
                                mem_data[node].min_pfn;
                        zholes_size[ZONE_NORMAL] = mem_data[node].max_pfn -
                                mem_data[node].min_pfn -
                                mem_data[node].num_physpages;
                } else if (mem_data[node].max_pfn < max_dma) {
                        /* All of this node's memory is in ZONE_DMA */
                        zones_size[ZONE_DMA] = mem_data[node].max_pfn -
                                mem_data[node].min_pfn;
                        zholes_size[ZONE_DMA] = mem_data[node].max_pfn -
                                mem_data[node].min_pfn -
                                mem_data[node].num_dma_physpages;
                } else {
                        /* This node has memory in both zones */
                        zones_size[ZONE_DMA] = max_dma -
                                mem_data[node].min_pfn;
                        zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
                                mem_data[node].num_dma_physpages;
                        zones_size[ZONE_NORMAL] = mem_data[node].max_pfn -
                                max_dma;
                        zholes_size[ZONE_NORMAL] = zones_size[ZONE_NORMAL] -
                                (mem_data[node].num_physpages -
                                 mem_data[node].num_dma_physpages);
                }

                pfn_offset = mem_data[node].min_pfn;

#ifdef CONFIG_VIRTUAL_MEM_MAP
                NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
#endif
                free_area_init_node(node, NODE_DATA(node), zones_size,
                                    pfn_offset, zholes_size);
        }

        zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
}
pg_data_t *arch_alloc_nodedata(int nid)
{
        unsigned long size = compute_pernodesize(nid);

        return kzalloc(size, GFP_KERNEL);
}

void arch_free_nodedata(pg_data_t *pgdat)
{
        kfree(pgdat);
}
void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
{
        pgdat_list[update_node] = update_pgdat;
        scatter_node_data();
}
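/*
 * Hot-add usage, roughly: the memory hotplug core obtains a new node's pgdat
 * with arch_alloc_nodedata(), initializes it, and then publishes it with
 * arch_refresh_nodedata() so that every node's cached pg_data_ptrs[] (and
 * hence NODE_DATA()) can see it.  The node is set online only after its
 * pgdat is fully initialized, so callers are expected to check node_online()
 * before touching NODE_DATA() for a hot-added node.
 */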