2005-04-17 02:20:36 +04:00
/*
 *  linux/mm/bootmem.c
 *
 *  Copyright (C) 1999 Ingo Molnar
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *
 *  Simple boot-time physical memory area allocator and
 *  free memory collector. It's used to deal with reserved
 *  system memory and memory holes as well.
 */
# include <linux/init.h>
2006-09-26 10:31:07 +04:00
# include <linux/pfn.h>
2005-04-17 02:20:36 +04:00
# include <linux/bootmem.h>
# include <linux/module.h>
2006-09-26 10:31:06 +04:00
# include <asm/bug.h>
2005-04-17 02:20:36 +04:00
# include <asm/io.h>
2006-09-26 10:31:33 +04:00
# include <asm/processor.h>
2006-09-26 10:31:06 +04:00
2005-04-17 02:20:36 +04:00
# include "internal.h"
/*
 * Access to this subsystem has to be serialized externally (this is
 * true for the boot process anyway).
 *
 * max_low_pfn and min_low_pfn are assigned by init_bootmem() from its
 * 'pages' and 'start' arguments respectively.
 */
unsigned long max_low_pfn ;
unsigned long min_low_pfn ;
unsigned long max_pfn ;
2006-03-27 13:15:58 +04:00
/*
 * All bootmem_data_t descriptors registered through link_bootmem(),
 * kept sorted by ascending node_boot_start.
 */
static LIST_HEAD ( bdata_list ) ;
2005-06-26 01:58:18 +04:00
# ifdef CONFIG_CRASH_DUMP
/*
 * If we have booted due to a crash, max_pfn will be a very low value. We need
 * to know the amount of memory that the previous kernel used.
 */
unsigned long saved_max_pfn ;
# endif
2005-04-17 02:20:36 +04:00
/* return the number of _pages_ that will be allocated for the boot bitmap */
2006-09-26 10:31:08 +04:00
unsigned long __init bootmem_bootmap_pages ( unsigned long pages )
2005-04-17 02:20:36 +04:00
{
unsigned long mapsize ;
mapsize = ( pages + 7 ) / 8 ;
mapsize = ( mapsize + ~ PAGE_MASK ) & PAGE_MASK ;
mapsize > > = PAGE_SHIFT ;
return mapsize ;
}
2006-09-26 10:31:08 +04:00
2006-03-27 13:15:58 +04:00
/*
* link bdata in order
*/
2006-09-26 10:31:04 +04:00
static void __init link_bootmem ( bootmem_data_t * bdata )
2006-03-27 13:15:58 +04:00
{
bootmem_data_t * ent ;
2006-09-26 10:31:08 +04:00
2006-03-27 13:15:58 +04:00
if ( list_empty ( & bdata_list ) ) {
list_add ( & bdata - > list , & bdata_list ) ;
return ;
}
/* insert in order */
list_for_each_entry ( ent , & bdata_list , list ) {
if ( bdata - > node_boot_start < ent - > node_boot_start ) {
list_add_tail ( & bdata - > list , & ent - > list ) ;
return ;
}
}
list_add_tail ( & bdata - > list , & bdata_list ) ;
}
2006-09-26 10:31:07 +04:00
/*
 * get_mapsize - size in bytes of the boot bitmap of an initialised bdata
 * @bdata: node descriptor with node_boot_start and node_low_pfn set
 *
 * One bit per page frame between the node's first PFN and node_low_pfn,
 * rounded up to whole bytes and then to a multiple of sizeof(long).
 */
static unsigned long __init get_mapsize(bootmem_data_t *bdata)
{
	unsigned long first_pfn = PFN_DOWN(bdata->node_boot_start);
	unsigned long nr_pages = bdata->node_low_pfn - first_pfn;

	return ALIGN((nr_pages + 7) / 8, sizeof(long));
}
2005-04-17 02:20:36 +04:00
/*
* Called once to set up the allocator itself .
*/
2006-09-26 10:31:08 +04:00
static unsigned long __init init_bootmem_core ( pg_data_t * pgdat ,
2005-04-17 02:20:36 +04:00
unsigned long mapstart , unsigned long start , unsigned long end )
{
bootmem_data_t * bdata = pgdat - > bdata ;
2006-09-26 10:31:07 +04:00
unsigned long mapsize ;
2005-04-17 02:20:36 +04:00
2006-09-26 10:31:07 +04:00
bdata - > node_bootmem_map = phys_to_virt ( PFN_PHYS ( mapstart ) ) ;
bdata - > node_boot_start = PFN_PHYS ( start ) ;
2005-04-17 02:20:36 +04:00
bdata - > node_low_pfn = end ;
2006-03-27 13:15:58 +04:00
link_bootmem ( bdata ) ;
2005-04-17 02:20:36 +04:00
/*
* Initially all pages are reserved - setup_arch ( ) has to
* register free RAM areas explicitly .
*/
2006-09-26 10:31:07 +04:00
mapsize = get_mapsize ( bdata ) ;
2005-04-17 02:20:36 +04:00
memset ( bdata - > node_bootmem_map , 0xff , mapsize ) ;
return mapsize ;
}
/*
* Marks a particular physical memory range as unallocatable . Usable RAM
* might be used for boot - time allocations - or it might get added
* to the free page pool later on .
*/
2008-03-18 22:49:12 +03:00
static int __init can_reserve_bootmem_core ( bootmem_data_t * bdata ,
2008-02-07 11:15:17 +03:00
unsigned long addr , unsigned long size , int flags )
2005-04-17 02:20:36 +04:00
{
2006-09-26 10:31:07 +04:00
unsigned long sidx , eidx ;
2005-04-17 02:20:36 +04:00
unsigned long i ;
2008-03-18 22:49:12 +03:00
BUG_ON ( ! size ) ;
/* out of range, don't hold other */
if ( addr + size < bdata - > node_boot_start | |
PFN_DOWN ( addr ) > bdata - > node_low_pfn )
return 0 ;
2006-09-26 10:31:07 +04:00
2005-04-17 02:20:36 +04:00
/*
2008-03-18 22:49:12 +03:00
* Round up to index to the range .
2005-04-17 02:20:36 +04:00
*/
2008-03-18 22:49:12 +03:00
if ( addr > bdata - > node_boot_start )
sidx = PFN_DOWN ( addr - bdata - > node_boot_start ) ;
else
sidx = 0 ;
eidx = PFN_UP ( addr + size - bdata - > node_boot_start ) ;
if ( eidx > bdata - > node_low_pfn - PFN_DOWN ( bdata - > node_boot_start ) )
eidx = bdata - > node_low_pfn - PFN_DOWN ( bdata - > node_boot_start ) ;
for ( i = sidx ; i < eidx ; i + + ) {
if ( test_bit ( i , bdata - > node_bootmem_map ) ) {
if ( flags & BOOTMEM_EXCLUSIVE )
return - EBUSY ;
}
}
return 0 ;
}
/*
 * reserve_bootmem_core - mark a physical memory range as unallocatable
 * @bdata: node descriptor whose bitmap is updated
 * @addr:  physical start address of the range
 * @size:  length of the range in bytes, must not be zero
 * @flags: not read by this function; exclusivity is checked separately
 *         by can_reserve_bootmem_core()
 *
 * The part of the range that falls into this node is marked reserved in
 * the boot bitmap.  Partially covered pages are reserved completely: the
 * start is rounded down and the end is rounded up to page boundaries.
 * A range that does not touch the node is silently ignored.
 */
static void __init reserve_bootmem_core(bootmem_data_t *bdata,
	unsigned long addr, unsigned long size, int flags)
{
	unsigned long sidx, eidx, max_idx;
	unsigned long i;

	BUG_ON(!size);

	/* out of range */
	if (addr + size < bdata->node_boot_start ||
	    PFN_DOWN(addr) > bdata->node_low_pfn)
		return;

	/* clip the range to the node and turn it into bitmap indexes */
	if (addr > bdata->node_boot_start)
		sidx = PFN_DOWN(addr - bdata->node_boot_start);
	else
		sidx = 0;

	max_idx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
	eidx = PFN_UP(addr + size - bdata->node_boot_start);
	if (eidx > max_idx)
		eidx = max_idx;

	for (i = sidx; i < eidx; i++) {
		if (test_and_set_bit(i, bdata->node_bootmem_map)) {
#ifdef CONFIG_DEBUG_BOOTMEM
			/*
			 * 'i' is an index relative to the node start, so the
			 * node base has to be added to report the physical
			 * address of the page.
			 */
			printk("hm, page %08lx reserved twice.\n",
				bdata->node_boot_start + PFN_PHYS(i));
#endif
		}
	}
}
2006-09-26 10:31:05 +04:00
/*
 * free_bootmem_core - mark a physical memory range as free on a node
 * @bdata: node descriptor whose bitmap is updated
 * @addr:  physical start address of the range
 * @size:  length of the range in bytes, must not be zero
 *
 * Only pages that lie completely inside the range are freed: the start
 * is rounded up and the end is rounded down, so partially free pages
 * stay reserved.  A range that does not touch the node is ignored.
 * Freeing a page that is not currently reserved triggers BUG().
 */
static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
				     unsigned long size)
{
	unsigned long base_pfn = PFN_DOWN(bdata->node_boot_start);
	unsigned long node_pages = bdata->node_low_pfn - base_pfn;
	unsigned long first, last, idx;

	BUG_ON(!size);

	/* range lies outside this node */
	if (addr + size < bdata->node_boot_start ||
	    PFN_DOWN(addr) > bdata->node_low_pfn)
		return;

	/* pull the last_success hint back if memory below it is released */
	if (addr >= bdata->node_boot_start && addr < bdata->last_success)
		bdata->last_success = addr;

	/* clip the range to the node and turn it into bitmap indexes */
	first = PFN_UP(addr) > base_pfn ? PFN_UP(addr) - base_pfn : 0;
	last = PFN_DOWN(addr + size - bdata->node_boot_start);
	if (last > node_pages)
		last = node_pages;

	for (idx = first; idx < last; idx++) {
		if (unlikely(!test_and_clear_bit(idx, bdata->node_bootmem_map)))
			BUG();
	}
}
/*
* We ' merge ' subsequent allocations to save space . We might ' lose '
* some fraction of a page if allocations cannot be satisfied due to
* size constraints on boxes where there is physical RAM space
* fragmentation - in these cases ( mostly large memory boxes ) this
* is not a problem .
*
* On low memory boxes we get it right in 100 % of the cases .
*
* alignment has to be a power of 2 value .
*
* NOTE : This function is _not_ reentrant .
*/
2006-03-25 18:31:10 +03:00
void * __init
2005-04-17 02:20:36 +04:00
__alloc_bootmem_core ( struct bootmem_data * bdata , unsigned long size ,
2005-10-20 02:52:18 +04:00
unsigned long align , unsigned long goal , unsigned long limit )
2005-04-17 02:20:36 +04:00
{
2008-03-18 22:44:48 +03:00
unsigned long areasize , preferred ;
2006-09-26 10:31:07 +04:00
unsigned long i , start = 0 , incr , eidx , end_pfn ;
2005-04-17 02:20:36 +04:00
void * ret ;
2008-03-18 22:44:48 +03:00
unsigned long node_boot_start ;
void * node_bootmem_map ;
2005-04-17 02:20:36 +04:00
2006-09-26 10:31:08 +04:00
if ( ! size ) {
2005-04-17 02:20:36 +04:00
printk ( " __alloc_bootmem_core(): zero-sized request \n " ) ;
BUG ( ) ;
}
BUG_ON ( align & ( align - 1 ) ) ;
2006-12-07 07:32:41 +03:00
/* on nodes without memory - bootmem_map is NULL */
if ( ! bdata - > node_bootmem_map )
return NULL ;
2008-03-18 22:44:48 +03:00
/* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */
node_boot_start = bdata - > node_boot_start ;
node_bootmem_map = bdata - > node_bootmem_map ;
if ( align ) {
node_boot_start = ALIGN ( bdata - > node_boot_start , align ) ;
if ( node_boot_start > bdata - > node_boot_start )
node_bootmem_map = ( unsigned long * ) bdata - > node_bootmem_map +
PFN_DOWN ( node_boot_start - bdata - > node_boot_start ) / BITS_PER_LONG ;
}
if ( limit & & node_boot_start > = limit )
return NULL ;
2006-09-26 10:31:07 +04:00
end_pfn = bdata - > node_low_pfn ;
limit = PFN_DOWN ( limit ) ;
2005-10-20 02:52:18 +04:00
if ( limit & & end_pfn > limit )
end_pfn = limit ;
2008-03-18 22:44:48 +03:00
eidx = end_pfn - PFN_DOWN ( node_boot_start ) ;
2005-04-17 02:20:36 +04:00
/*
* We try to allocate bootmem pages above ' goal '
* first , then we try to allocate lower pages .
*/
2008-03-11 09:23:42 +03:00
preferred = 0 ;
if ( goal & & PFN_DOWN ( goal ) < end_pfn ) {
2008-03-18 22:44:48 +03:00
if ( goal > node_boot_start )
preferred = goal - node_boot_start ;
2005-04-17 02:20:36 +04:00
2008-03-18 22:44:48 +03:00
if ( bdata - > last_success > node_boot_start & &
bdata - > last_success - node_boot_start > = preferred )
2005-10-20 02:52:18 +04:00
if ( ! limit | | ( limit & & limit > bdata - > last_success ) )
2008-03-18 22:44:48 +03:00
preferred = bdata - > last_success - node_boot_start ;
2008-03-11 09:23:42 +03:00
}
2005-04-17 02:20:36 +04:00
2008-03-18 22:44:48 +03:00
preferred = PFN_DOWN ( ALIGN ( preferred , align ) ) ;
2006-09-26 10:31:07 +04:00
areasize = ( size + PAGE_SIZE - 1 ) / PAGE_SIZE ;
2005-04-17 02:20:36 +04:00
incr = align > > PAGE_SHIFT ? : 1 ;
restart_scan :
2008-03-11 09:23:42 +03:00
for ( i = preferred ; i < eidx ; ) {
2005-04-17 02:20:36 +04:00
unsigned long j ;
2008-03-11 09:23:42 +03:00
2008-03-18 22:44:48 +03:00
i = find_next_zero_bit ( node_bootmem_map , eidx , i ) ;
2005-04-17 02:20:36 +04:00
i = ALIGN ( i , incr ) ;
2005-12-12 11:37:39 +03:00
if ( i > = eidx )
break ;
2008-03-18 22:44:48 +03:00
if ( test_bit ( i , node_bootmem_map ) ) {
2008-03-11 09:23:42 +03:00
i + = incr ;
2005-04-17 02:20:36 +04:00
continue ;
2008-03-11 09:23:42 +03:00
}
2005-04-17 02:20:36 +04:00
for ( j = i + 1 ; j < i + areasize ; + + j ) {
if ( j > = eidx )
goto fail_block ;
2008-03-18 22:44:48 +03:00
if ( test_bit ( j , node_bootmem_map ) )
2005-04-17 02:20:36 +04:00
goto fail_block ;
}
start = i ;
goto found ;
fail_block :
i = ALIGN ( j , incr ) ;
2008-03-11 09:23:42 +03:00
if ( i = = j )
i + = incr ;
2005-04-17 02:20:36 +04:00
}
2008-03-18 22:44:48 +03:00
if ( preferred > 0 ) {
preferred = 0 ;
2005-04-17 02:20:36 +04:00
goto restart_scan ;
}
return NULL ;
found :
2008-03-18 22:44:48 +03:00
bdata - > last_success = PFN_PHYS ( start ) + node_boot_start ;
2005-04-17 02:20:36 +04:00
BUG_ON ( start > = eidx ) ;
/*
* Is the next page of the previous allocation - end the start
* of this allocation ' s buffer ? If yes then we can ' merge '
* the previous partial page with this allocation .
*/
if ( align < PAGE_SIZE & &
bdata - > last_offset & & bdata - > last_pos + 1 = = start ) {
2008-03-18 22:44:48 +03:00
unsigned long offset , remaining_size ;
2005-06-26 01:59:00 +04:00
offset = ALIGN ( bdata - > last_offset , align ) ;
2005-04-17 02:20:36 +04:00
BUG_ON ( offset > PAGE_SIZE ) ;
2006-09-26 10:31:08 +04:00
remaining_size = PAGE_SIZE - offset ;
2005-04-17 02:20:36 +04:00
if ( size < remaining_size ) {
areasize = 0 ;
/* last_pos unchanged */
2006-09-26 10:31:08 +04:00
bdata - > last_offset = offset + size ;
ret = phys_to_virt ( bdata - > last_pos * PAGE_SIZE +
2008-03-18 22:44:48 +03:00
offset + node_boot_start ) ;
2005-04-17 02:20:36 +04:00
} else {
remaining_size = size - remaining_size ;
2006-09-26 10:31:08 +04:00
areasize = ( remaining_size + PAGE_SIZE - 1 ) / PAGE_SIZE ;
ret = phys_to_virt ( bdata - > last_pos * PAGE_SIZE +
2008-03-18 22:44:48 +03:00
offset + node_boot_start ) ;
2006-09-26 10:31:08 +04:00
bdata - > last_pos = start + areasize - 1 ;
2005-04-17 02:20:36 +04:00
bdata - > last_offset = remaining_size ;
}
bdata - > last_offset & = ~ PAGE_MASK ;
} else {
bdata - > last_pos = start + areasize - 1 ;
bdata - > last_offset = size & ~ PAGE_MASK ;
2008-03-18 22:44:48 +03:00
ret = phys_to_virt ( start * PAGE_SIZE + node_boot_start ) ;
2005-04-17 02:20:36 +04:00
}
/*
* Reserve the area now :
*/
2006-09-26 10:31:08 +04:00
for ( i = start ; i < start + areasize ; i + + )
2008-03-18 22:44:48 +03:00
if ( unlikely ( test_and_set_bit ( i , node_bootmem_map ) ) )
2005-04-17 02:20:36 +04:00
BUG ( ) ;
memset ( ret , 0 , size ) ;
return ret ;
}
/*
 * free_all_bootmem_core - hand all free pages of a node to the page allocator
 * @pgdat: node to release
 *
 * Every page whose bit is clear in the boot bitmap is passed to
 * __free_pages_bootmem().  Afterwards the pages holding the bitmap itself
 * are released too and node_bootmem_map is set to NULL.
 *
 * Returns the number of pages released, bitmap pages included.
 */
static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
{
	bootmem_data_t *bdata = pgdat->bdata;
	unsigned long *map;
	unsigned long pfn, nr_pages, map_pages;
	unsigned long i, total = 0;
	struct page *page;
	int gofast = 0;

	BUG_ON(!bdata->node_bootmem_map);

	/* first extant page of the node */
	pfn = PFN_DOWN(bdata->node_boot_start);
	nr_pages = bdata->node_low_pfn - pfn;
	map = bdata->node_bootmem_map;

	/*
	 * Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned.
	 * NOTE(review): ffs() presumably takes an int, so the upper bits of
	 * node_boot_start would not take part in this test on 64-bit -
	 * confirm.
	 */
	if (bdata->node_boot_start == 0 ||
	    ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
		gofast = 1;

	/* one bitmap word, i.e. BITS_PER_LONG page frames, per iteration */
	for (i = 0; i < nr_pages; pfn += BITS_PER_LONG) {
		unsigned long free_bits = ~map[i / BITS_PER_LONG];
		unsigned long mask;

		/* every page of this word is reserved */
		if (!free_bits) {
			i += BITS_PER_LONG;
			continue;
		}

		/* whole word free: release it as one higher-order block */
		if (gofast && free_bits == ~0UL) {
			__free_pages_bootmem(pfn_to_page(pfn),
					     ffs(BITS_PER_LONG) - 1);
			total += BITS_PER_LONG;
			i += BITS_PER_LONG;
			continue;
		}

		/* mixed word: release the free pages one at a time */
		page = pfn_to_page(pfn);
		for (mask = 1; mask && i < nr_pages; mask <<= 1, page++, i++) {
			if (!(free_bits & mask))
				continue;
			__free_pages_bootmem(page, 0);
			total++;
		}
	}

	/*
	 * Now free the allocator bitmap itself, it's not
	 * needed anymore:
	 */
	page = virt_to_page(bdata->node_bootmem_map);
	map_pages = (get_mapsize(bdata) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < map_pages; i++, page++) {
		__free_pages_bootmem(page, 0);
		total++;
	}
	bdata->node_bootmem_map = NULL;

	return total;
}
2006-09-26 10:31:08 +04:00
/*
 * init_bootmem_node - set up the boot allocator for one node
 * @pgdat:    node to initialise
 * @freepfn:  PFN at which the boot bitmap is placed
 * @startpfn: first PFN of the node
 * @endpfn:   PFN stored as the node's node_low_pfn
 *
 * Returns the size of the boot bitmap in bytes.
 */
unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
				       unsigned long startpfn,
				       unsigned long endpfn)
{
	unsigned long map_bytes;

	map_bytes = init_bootmem_core(pgdat, freepfn, startpfn, endpfn);
	return map_bytes;
}
2008-06-21 21:01:02 +04:00
/*
 * reserve_bootmem_node - reserve a physical memory range on one node
 * @pgdat:    node the range belongs to
 * @physaddr: physical start address of the range
 * @size:     length of the range in bytes
 * @flags:    BOOTMEM_EXCLUSIVE to fail if part of the range is reserved
 *
 * Returns 0 on success.  If can_reserve_bootmem_core() rejects the
 * request, -ENOMEM is returned and the bitmap is left untouched.
 */
int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
				unsigned long size, int flags)
{
	bootmem_data_t *bdata = pgdat->bdata;

	if (can_reserve_bootmem_core(bdata, physaddr, size, flags) < 0)
		return -ENOMEM;

	reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
	return 0;
}
2006-09-26 10:31:08 +04:00
/*
 * free_bootmem_node - mark a physical memory range as free on one node
 * @pgdat:    node the range belongs to
 * @physaddr: physical start address of the range
 * @size:     length of the range in bytes
 */
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
			      unsigned long size)
{
	bootmem_data_t *bdata = pgdat->bdata;

	free_bootmem_core(bdata, physaddr, size);
}
2006-09-26 10:31:08 +04:00
/*
 * free_all_bootmem_node - release all free bootmem pages of one node
 * @pgdat: node to release
 *
 * register_page_bootmem_info_node() is called for the node before its
 * pages are released.
 *
 * Returns the page count reported by free_all_bootmem_core().
 */
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
	unsigned long freed;

	register_page_bootmem_info_node(pgdat);
	freed = free_all_bootmem_core(pgdat);

	return freed;
}
2006-09-26 10:31:08 +04:00
/*
 * init_bootmem - set up the boot allocator on node 0
 * @start: PFN at which the boot bitmap is placed
 * @pages: PFN stored as the node's node_low_pfn; the node starts at PFN 0
 *
 * Also records @start in min_low_pfn and @pages in max_low_pfn.
 *
 * Returns the size of the boot bitmap in bytes.
 */
unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
{
	min_low_pfn = start;
	max_low_pfn = pages;

	return init_bootmem_core(NODE_DATA(0), start, 0, pages);
}
# ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
2008-02-07 11:15:17 +03:00
/*
 * reserve_bootmem - reserve a range in every registered bootmem area.
 * @addr:  start of the range, forwarded unchanged to the core helpers
 * @size:  length of the range, forwarded unchanged to the core helpers
 * @flags: forwarded unchanged to the core helpers
 *
 * Works in two passes over bdata_list so that a refusal leaves nothing
 * half-reserved: first every entry is asked via can_reserve_bootmem_core(),
 * and only if none of them returns a negative value is
 * reserve_bootmem_core() run on every entry.
 *
 * Returns 0 on success, or the first negative value returned by
 * can_reserve_bootmem_core().
 */
int __init reserve_bootmem ( unsigned long addr , unsigned long size ,
int flags )
2005-04-17 02:20:36 +04:00
{
2008-03-18 22:49:12 +03:00
bootmem_data_t * bdata ;
int ret ;
/* Pass 1: check only; bail out before anything has been modified. */
list_for_each_entry ( bdata , & bdata_list , list ) {
ret = can_reserve_bootmem_core ( bdata , addr , size , flags ) ;
if ( ret < 0 )
return ret ;
}
/* Pass 2: every entry accepted the request, so perform the reservation. */
list_for_each_entry ( bdata , & bdata_list , list )
reserve_bootmem_core ( bdata , addr , size , flags ) ;
return 0 ;
2005-04-17 02:20:36 +04:00
}
# endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
2006-09-26 10:31:08 +04:00
/*
 * free_bootmem - release a range in every registered bootmem area.
 * @addr: start of the range, forwarded unchanged to free_bootmem_core()
 * @size: length of the range, forwarded unchanged to free_bootmem_core()
 *
 * Calls free_bootmem_core() once for each entry on bdata_list with the same
 * @addr and @size.
 * NOTE(review): presumably free_bootmem_core() ignores or clips ranges that
 * do not belong to the given bdata - confirm in its definition.
 */
void __init free_bootmem ( unsigned long addr , unsigned long size )
2005-04-17 02:20:36 +04:00
{
2008-03-24 22:29:45 +03:00
bootmem_data_t * bdata ;
list_for_each_entry ( bdata , & bdata_list , list )
free_bootmem_core ( bdata , addr , size ) ;
2005-04-17 02:20:36 +04:00
}
2006-09-26 10:31:08 +04:00
/*
 * free_all_bootmem - run free_all_bootmem_core() on node 0.
 *
 * Returns whatever free_all_bootmem_core(NODE_DATA(0)) returns.
 * NOTE(review): presumably that is the number of pages released - confirm
 * against free_all_bootmem_core().
 */
unsigned long __init free_all_bootmem ( void )
2005-04-17 02:20:36 +04:00
{
2006-09-26 10:31:08 +04:00
return free_all_bootmem_core ( NODE_DATA ( 0 ) ) ;
2005-04-17 02:20:36 +04:00
}
2006-09-26 10:31:05 +04:00
/*
 * __alloc_bootmem_nopanic - try each bootmem area in turn, never panic.
 * @size:  forwarded to __alloc_bootmem_core()
 * @align: forwarded to __alloc_bootmem_core()
 * @goal:  forwarded to __alloc_bootmem_core()
 *
 * Walks bdata_list in list order and returns the first non-NULL pointer
 * produced by __alloc_bootmem_core(). The last argument (the limit) is
 * passed as 0.
 * NOTE(review): 0 presumably means "no upper address limit" - confirm in
 * __alloc_bootmem_core().
 *
 * Returns NULL if no area could satisfy the request; the caller must handle
 * that case.
 */
void * __init __alloc_bootmem_nopanic ( unsigned long size , unsigned long align ,
unsigned long goal )
2005-04-17 02:20:36 +04:00
{
2006-03-27 13:15:58 +04:00
bootmem_data_t * bdata ;
2005-04-17 02:20:36 +04:00
void * ptr ;
2006-09-26 10:31:08 +04:00
list_for_each_entry ( bdata , & bdata_list , list ) {
ptr = __alloc_bootmem_core ( bdata , size , align , goal , 0 ) ;
if ( ptr )
return ptr ;
}
2006-04-07 21:49:21 +04:00
return NULL ;
}
2005-04-17 02:20:36 +04:00
2006-09-26 10:31:05 +04:00
/*
 * __alloc_bootmem - allocate boot memory from any area, panic on failure.
 * @size:  forwarded to __alloc_bootmem_nopanic()
 * @align: forwarded to __alloc_bootmem_nopanic()
 * @goal:  forwarded to __alloc_bootmem_nopanic()
 *
 * Thin wrapper around __alloc_bootmem_nopanic(): a non-NULL result is
 * returned as is; a NULL result is logged at KERN_ALERT and followed by
 * panic().
 */
void * __init __alloc_bootmem ( unsigned long size , unsigned long align ,
unsigned long goal )
2006-04-07 21:49:21 +04:00
{
void * mem = __alloc_bootmem_nopanic ( size , align , goal ) ;
2006-09-26 10:31:08 +04:00
2006-04-07 21:49:21 +04:00
if ( mem )
return mem ;
2005-04-17 02:20:36 +04:00
/*
* Whoops , we cannot satisfy the allocation request .
*/
printk ( KERN_ALERT " bootmem alloc of %lu bytes failed! \n " , size ) ;
panic ( " Out of memory " ) ;
/* NOTE(review): presumably unreachable, since panic() is not expected to return. */
return NULL ;
}
2005-10-20 02:52:18 +04:00
2006-09-26 10:31:05 +04:00
/*
 * __alloc_bootmem_node - allocate boot memory, preferring one node.
 * @pgdat: node whose bdata is tried first
 * @size:  forwarded to the allocator
 * @align: forwarded to the allocator
 * @goal:  forwarded to the allocator
 *
 * First tries __alloc_bootmem_core() on @pgdat's own bdata, with 0 as the
 * limit argument. If that yields NULL, falls back to __alloc_bootmem(),
 * which searches every registered area and panics if none can satisfy the
 * request.
 */
void * __init __alloc_bootmem_node ( pg_data_t * pgdat , unsigned long size ,
unsigned long align , unsigned long goal )
2005-04-17 02:20:36 +04:00
{
void * ptr ;
2006-01-06 11:11:01 +03:00
ptr = __alloc_bootmem_core ( pgdat - > bdata , size , align , goal , 0 ) ;
2005-04-17 02:20:36 +04:00
if ( ptr )
2006-09-26 10:31:08 +04:00
return ptr ;
2005-04-17 02:20:36 +04:00
2006-01-06 11:11:01 +03:00
/* Preferred node failed: fall back to the any-node, panic-on-failure path. */
return __alloc_bootmem ( size , align , goal ) ;
2005-04-17 02:20:36 +04:00
}
2008-04-28 13:13:32 +04:00
# ifdef CONFIG_SPARSEMEM
/*
 * alloc_bootmem_section - allocate boot memory that lies inside one section.
 * @size:       number of bytes requested
 * @section_nr: number of the section the allocation must fall in
 *
 * Asks __alloc_bootmem_core() for SMP_CACHE_BYTES-aligned memory from the
 * bdata of the node that early_pfn_to_nid() reports for the section's first
 * pfn, with the section start as the goal and the section's last byte as the
 * limit. The result is then checked: if either end of the returned range
 * maps to a different section, a warning is logged, the memory is handed
 * back with free_bootmem_core() and NULL is returned.
 *
 * Returns the allocated pointer, or NULL if the allocation failed or did not
 * land entirely in @section_nr.
 */
void * __init alloc_bootmem_section(unsigned long size,
				    unsigned long section_nr)
{
	void *ptr;
	unsigned long limit, goal, start_nr, end_nr, pfn;
	struct pglist_data *pgdat;

	pfn = section_nr_to_pfn(section_nr);
	goal = PFN_PHYS(pfn);
	/* Last byte of this section: one below the next section's start. */
	limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1;
	pgdat = NODE_DATA(early_pfn_to_nid(pfn));

	ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
				   limit);
	if (!ptr)
		return NULL;

	start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr)));
	/*
	 * NOTE(review): __pa(ptr) + size is one past the last byte, so an
	 * allocation ending exactly on the section boundary would be treated
	 * as crossing it. Left unchanged; confirm against how
	 * __alloc_bootmem_core() applies the limit before tightening this.
	 */
	end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size));
	if (start_nr != section_nr || end_nr != section_nr) {
		/* section_nr is unsigned long, so it is printed with %lu. */
		printk(KERN_WARNING "alloc_bootmem failed on section %lu.\n",
		       section_nr);
		free_bootmem_core(pgdat->bdata, __pa(ptr), size);
		ptr = NULL;
	}

	return ptr;
}
# endif
2006-09-26 10:31:33 +04:00
/*
 * Fallback value for ARCH_LOW_ADDRESS_LIMIT, used only if nothing included
 * above has already defined it. The low-memory allocators below pass it to
 * __alloc_bootmem_core() as the limit argument.
 */
# ifndef ARCH_LOW_ADDRESS_LIMIT
# define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
# endif
2006-01-06 11:11:01 +03:00
2006-09-26 10:31:05 +04:00
/*
 * __alloc_bootmem_low - allocate boot memory under ARCH_LOW_ADDRESS_LIMIT,
 * panic on failure.
 * @size:  forwarded to __alloc_bootmem_core()
 * @align: forwarded to __alloc_bootmem_core()
 * @goal:  forwarded to __alloc_bootmem_core()
 *
 * Walks bdata_list in list order, passing ARCH_LOW_ADDRESS_LIMIT as the
 * limit argument, and returns the first non-NULL result. If every area
 * fails, the failure is logged at KERN_ALERT and panic() is called.
 */
void * __init __alloc_bootmem_low ( unsigned long size , unsigned long align ,
unsigned long goal )
2006-01-06 11:11:01 +03:00
{
2006-03-27 13:15:58 +04:00
bootmem_data_t * bdata ;
2006-01-06 11:11:01 +03:00
void * ptr ;
2006-09-26 10:31:08 +04:00
list_for_each_entry ( bdata , & bdata_list , list ) {
2006-09-26 10:31:33 +04:00
ptr = __alloc_bootmem_core ( bdata , size , align , goal ,
ARCH_LOW_ADDRESS_LIMIT ) ;
2006-09-26 10:31:08 +04:00
if ( ptr )
return ptr ;
}
2006-01-06 11:11:01 +03:00
/*
* Whoops , we cannot satisfy the allocation request .
*/
printk ( KERN_ALERT " low bootmem alloc of %lu bytes failed! \n " , size ) ;
panic ( " Out of low memory " ) ;
/* NOTE(review): presumably unreachable, since panic() is not expected to return. */
return NULL ;
}
/*
 * __alloc_bootmem_low_node - allocate low boot memory from one node only.
 * @pgdat: node whose bdata is used
 * @size:  forwarded to __alloc_bootmem_core()
 * @align: forwarded to __alloc_bootmem_core()
 * @goal:  forwarded to __alloc_bootmem_core()
 *
 * Single attempt on @pgdat's bdata with ARCH_LOW_ADDRESS_LIMIT as the limit
 * argument. Unlike __alloc_bootmem_low() there is no fallback to other areas
 * and no panic here: the result of __alloc_bootmem_core() is returned
 * directly.
 * NOTE(review): that result is presumably NULL on failure, as the other
 * callers in this file test it - callers must check.
 */
void * __init __alloc_bootmem_low_node ( pg_data_t * pgdat , unsigned long size ,
unsigned long align , unsigned long goal )
{
2006-09-26 10:31:33 +04:00
return __alloc_bootmem_core ( pgdat - > bdata , size , align , goal ,
ARCH_LOW_ADDRESS_LIMIT ) ;
2006-01-06 11:11:01 +03:00
}