2005-04-17 02:20:36 +04:00
/*
* linux / mm / page_alloc . c
*
* Manages the free list , the system allocates free pages here .
* Note that kmalloc ( ) lives in slab . c
*
* Copyright ( C ) 1991 , 1992 , 1993 , 1994 Linus Torvalds
* Swap reorganised 29.12 .95 , Stephen Tweedie
* Support of BIGMEM added by Gerhard Wichert , Siemens AG , July 1999
* Reshaped it to be a zoned allocator , Ingo Molnar , Red Hat , 1999
* Discontiguous memory support , Kanoj Sarcar , SGI , Nov 1999
* Zone balancing , Kanoj Sarcar , SGI , Jan 2000
* Per cpu hot / cold page lists , bulk allocation , Martin J . Bligh , Sept 2002
* ( lots of bits borrowed from Ingo Molnar & Andrew Morton )
*/
# include <linux/config.h>
# include <linux/stddef.h>
# include <linux/mm.h>
# include <linux/swap.h>
# include <linux/interrupt.h>
# include <linux/pagemap.h>
# include <linux/bootmem.h>
# include <linux/compiler.h>
2005-09-13 12:25:16 +04:00
# include <linux/kernel.h>
2005-04-17 02:20:36 +04:00
# include <linux/module.h>
# include <linux/suspend.h>
# include <linux/pagevec.h>
# include <linux/blkdev.h>
# include <linux/slab.h>
# include <linux/notifier.h>
# include <linux/topology.h>
# include <linux/sysctl.h>
# include <linux/cpu.h>
# include <linux/cpuset.h>
2005-10-30 04:16:53 +03:00
# include <linux/memory_hotplug.h>
2005-04-17 02:20:36 +04:00
# include <linux/nodemask.h>
# include <linux/vmalloc.h>
# include <asm/tlbflush.h>
# include "internal.h"
/*
* MCD - HACK : Find somewhere to initialize this EARLY , or make this
* initializer cleaner
*/
2005-09-07 02:16:33 +04:00
nodemask_t node_online_map __read_mostly = { { [ 0 ] = 1UL } } ;
2005-03-24 05:00:00 +03:00
EXPORT_SYMBOL ( node_online_map ) ;
2005-09-07 02:16:33 +04:00
nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL ;
2005-03-24 05:00:00 +03:00
EXPORT_SYMBOL ( node_possible_map ) ;
2005-09-07 02:16:33 +04:00
struct pglist_data * pgdat_list __read_mostly ;
2005-09-07 02:17:45 +04:00
unsigned long totalram_pages __read_mostly ;
unsigned long totalhigh_pages __read_mostly ;
2005-04-17 02:20:36 +04:00
long nr_swap_pages ;
/*
* results with 256 , 32 in the lowmem_reserve sysctl :
* 1 G machine - > ( 16 M dma , 800 M - 16 M normal , 1 G - 800 M high )
* 1 G machine - > ( 16 M dma , 784 M normal , 224 M high )
* NORMAL allocation will leave 784 M / 256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224 M / 32 of ram reserved in ZONE_NORMAL
* HIGHMEM allocation will ( 224 M + 784 M ) / 256 of ram reserved in ZONE_DMA
*/
int sysctl_lowmem_reserve_ratio [ MAX_NR_ZONES - 1 ] = { 256 , 32 } ;
EXPORT_SYMBOL ( totalram_pages ) ;
/*
* Used by page_zone ( ) to look up the address of the struct zone whose
* id is encoded in the upper bits of page - > flags
*/
2005-09-07 02:16:33 +04:00
struct zone * zone_table [ 1 < < ZONETABLE_SHIFT ] __read_mostly ;
2005-04-17 02:20:36 +04:00
EXPORT_SYMBOL ( zone_table ) ;
static char * zone_names [ MAX_NR_ZONES ] = { " DMA " , " Normal " , " HighMem " } ;
int min_free_kbytes = 1024 ;
unsigned long __initdata nr_kernel_pages ;
unsigned long __initdata nr_all_pages ;
2005-10-30 04:16:52 +03:00
static int page_outside_zone_boundaries ( struct zone * zone , struct page * page )
2005-04-17 02:20:36 +04:00
{
2005-10-30 04:16:53 +03:00
int ret = 0 ;
unsigned seq ;
unsigned long pfn = page_to_pfn ( page ) ;
2005-10-30 04:16:52 +03:00
2005-10-30 04:16:53 +03:00
do {
seq = zone_span_seqbegin ( zone ) ;
if ( pfn > = zone - > zone_start_pfn + zone - > spanned_pages )
ret = 1 ;
else if ( pfn < zone - > zone_start_pfn )
ret = 1 ;
} while ( zone_span_seqretry ( zone , seq ) ) ;
return ret ;
2005-10-30 04:16:52 +03:00
}
/*
 * Returns 1 when page_zone(@page) is @zone, 0 otherwise.  With
 * CONFIG_HOLES_IN_ZONE the page's pfn must also pass pfn_valid() first.
 */
static int page_is_consistent(struct zone *zone, struct page *page)
{
#ifdef CONFIG_HOLES_IN_ZONE
	if (!pfn_valid(page_to_pfn(page)))
		return 0;
#endif
	return zone == page_zone(page);
}
/*
 * Temporary debugging check for pages not lying within a given zone.
 *
 * Returns 1 if @page is outside @zone's pfn span or fails
 * page_is_consistent(), 0 otherwise.
 */
static int bad_range(struct zone *zone, struct page *page)
{
	return page_outside_zone_boundaries(zone, page) ||
	       !page_is_consistent(zone, page);
}
static void bad_page ( const char * function , struct page * page )
{
printk ( KERN_EMERG " Bad page state at %s (in process '%s', page %p) \n " ,
function , current - > comm , page ) ;
printk ( KERN_EMERG " flags:0x%0*lx mapping:%p mapcount:%d count:%d \n " ,
( int ) ( 2 * sizeof ( page_flags_t ) ) , ( unsigned long ) page - > flags ,
page - > mapping , page_mapcount ( page ) , page_count ( page ) ) ;
printk ( KERN_EMERG " Backtrace: \n " ) ;
dump_stack ( ) ;
printk ( KERN_EMERG " Trying to fix it up, but a reboot is needed \n " ) ;
2005-06-22 04:15:08 +04:00
page - > flags & = ~ ( 1 < < PG_lru |
1 < < PG_private |
2005-04-17 02:20:36 +04:00
1 < < PG_locked |
1 < < PG_active |
1 < < PG_dirty |
2005-06-22 04:15:08 +04:00
1 < < PG_reclaim |
1 < < PG_slab |
2005-04-17 02:20:36 +04:00
1 < < PG_swapcache |
2005-10-30 04:16:12 +03:00
1 < < PG_writeback |
1 < < PG_reserved ) ;
2005-04-17 02:20:36 +04:00
set_page_count ( page , 0 ) ;
reset_page_mapcount ( page ) ;
page - > mapping = NULL ;
2005-09-13 12:25:16 +04:00
add_taint ( TAINT_BAD_PAGE ) ;
2005-04-17 02:20:36 +04:00
}
# ifndef CONFIG_HUGETLB_PAGE
# define prep_compound_page(page, order) do { } while (0)
# define destroy_compound_page(page, order) do { } while (0)
# else
/*
* Higher - order pages are called " compound pages " . They are structured thusly :
*
* The first PAGE_SIZE page is called the " head page " .
*
* The remaining PAGE_SIZE pages are called " tail pages " .
*
* All pages have PG_compound set . All pages have their - > private pointing at
* the head page ( even the head page has this ) .
*
* The first tail page ' s - > mapping , if non - zero , holds the address of the
* compound page ' s put_page ( ) function .
*
* The order of the allocation is stored in the first tail page ' s - > index
* This is only for debug at present . This usage means that zero - order pages
* may not be compound .
*/
static void prep_compound_page ( struct page * page , unsigned long order )
{
int i ;
int nr_pages = 1 < < order ;
page [ 1 ] . mapping = NULL ;
page [ 1 ] . index = order ;
for ( i = 0 ; i < nr_pages ; i + + ) {
struct page * p = page + i ;
SetPageCompound ( p ) ;
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:40 +03:00
set_page_private ( p , ( unsigned long ) page ) ;
2005-04-17 02:20:36 +04:00
}
}
static void destroy_compound_page ( struct page * page , unsigned long order )
{
int i ;
int nr_pages = 1 < < order ;
if ( ! PageCompound ( page ) )
return ;
if ( page [ 1 ] . index ! = order )
bad_page ( __FUNCTION__ , page ) ;
for ( i = 0 ; i < nr_pages ; i + + ) {
struct page * p = page + i ;
if ( ! PageCompound ( p ) )
bad_page ( __FUNCTION__ , page ) ;
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:40 +03:00
if ( page_private ( p ) ! = ( unsigned long ) page )
2005-04-17 02:20:36 +04:00
bad_page ( __FUNCTION__ , page ) ;
ClearPageCompound ( p ) ;
}
}
# endif /* CONFIG_HUGETLB_PAGE */
/*
* function for dealing with page ' s order in buddy system .
* zone - > lock is already acquired when we use these .
* So , we don ' t need atomic page - > flags operations here .
*/
/* Read back the value set_page_order() stored in page_private(@page). */
static inline unsigned long page_order(struct page *page)
{
	return page_private(page);
}
/*
 * Record @order in page_private(@page) and set PG_private with the
 * non-atomic __SetPagePrivate().
 */
static inline void set_page_order(struct page *page, int order)
{
	set_page_private(page, order);
	__SetPagePrivate(page);
}
/*
 * Clear PG_private with the non-atomic __ClearPagePrivate() and reset
 * page_private(@page) to 0.
 */
static inline void rmv_page_order(struct page *page)
{
	__ClearPagePrivate(page);
	set_page_private(page, 0);
}
/*
* Locate the struct page for both the matching buddy in our
* pair ( buddy1 ) and the combined O ( n + 1 ) page they form ( page ) .
*
* 1 ) Any buddy B1 will have an order O twin B2 which satisfies
* the following equation :
* B2 = B1 ^ ( 1 < < O )
* For example , if the starting buddy ( buddy2 ) is # 8 its order
* 1 buddy is # 10 :
* B2 = 8 ^ ( 1 < < 1 ) = 8 ^ 2 = 10
*
* 2 ) Any buddy B will have an order O + 1 parent P which
* satisfies the following equation :
* P = B & ~ ( 1 < < O )
*
* Assumption : * _mem_map is contiguous at least up to MAX_ORDER
*/
/*
 * Return the struct page of the order-@order buddy of @page.
 *
 * @page:     page whose buddy is wanted
 * @page_idx: index of @page; the buddy's index is page_idx ^ (1 << order)
 * @order:    order of the pair
 *
 * The buddy bit is built as unsigned long so the XOR is carried out in the
 * same type as the index instead of going through a signed int shift.
 */
static inline struct page *
__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
{
	unsigned long buddy_idx = page_idx ^ (1UL << order);

	return page + (buddy_idx - page_idx);
}
/*
 * Return the index of the order O+1 block that contains the order-@order
 * block at @page_idx, i.e. @page_idx with bit @order cleared.
 *
 * The mask is built from 1UL so it has the width of the index; an int
 * shift would rely on sign extension of ~(1 << order) and is undefined
 * for order 31.
 */
static inline unsigned long
__find_combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}
/*
 * This function checks whether a page is free && is the buddy.
 * We can coalesce a page and its buddy if
 * (a) the buddy is free &&
 * (b) the buddy is on the buddy system &&
 * (c) a page and its buddy have the same order.
 * For recording a page's order, we use page_private(page) and PG_private.
 *
 */
/*
 * Returns 1 only if @page has PG_private set, its recorded order equals
 * @order and its reference count is zero; 0 otherwise.
 */
static inline int page_is_buddy(struct page *page, int order)
{
	if (!PagePrivate(page))
		return 0;
	if (page_order(page) != order)
		return 0;
	return page_count(page) == 0;
}
/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PG_Private. Page's
 * order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */
static inline void __free_pages_bulk ( struct page * page ,
struct zone * zone , unsigned int order )
{
unsigned long page_idx ;
int order_size = 1 < < order ;
if ( unlikely ( order ) )
destroy_compound_page ( page , order ) ;
page_idx = page_to_pfn ( page ) & ( ( 1 < < MAX_ORDER ) - 1 ) ;
BUG_ON ( page_idx & ( order_size - 1 ) ) ;
BUG_ON ( bad_range ( zone , page ) ) ;
zone - > free_pages + = order_size ;
while ( order < MAX_ORDER - 1 ) {
unsigned long combined_idx ;
struct free_area * area ;
struct page * buddy ;
combined_idx = __find_combined_index ( page_idx , order ) ;
buddy = __page_find_buddy ( page , page_idx , order ) ;
if ( bad_range ( zone , buddy ) )
break ;
if ( ! page_is_buddy ( buddy , order ) )
break ; /* Move the buddy up one level. */
list_del ( & buddy - > lru ) ;
area = zone - > free_area + order ;
area - > nr_free - - ;
rmv_page_order ( buddy ) ;
page = page + ( combined_idx - page_idx ) ;
page_idx = combined_idx ;
order + + ;
}
set_page_order ( page , order ) ;
list_add ( & page - > lru , & zone - > free_area [ order ] . free_list ) ;
zone - > free_area [ order ] . nr_free + + ;
}
static inline void free_pages_check ( const char * function , struct page * page )
{
if ( page_mapcount ( page ) | |
page - > mapping ! = NULL | |
page_count ( page ) ! = 0 | |
( page - > flags & (
1 < < PG_lru |
1 < < PG_private |
1 < < PG_locked |
1 < < PG_active |
1 < < PG_reclaim |
1 < < PG_slab |
1 < < PG_swapcache |
2005-10-30 04:16:12 +03:00
1 < < PG_writeback |
1 < < PG_reserved ) ) )
2005-04-17 02:20:36 +04:00
bad_page ( function , page ) ;
if ( PageDirty ( page ) )
2005-09-04 02:54:50 +04:00
__ClearPageDirty ( page ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * Frees a list of pages.
 * Assumes all pages on list are in same zone, and of same order.
 * count is the number of pages to free.
 *
 * If the zone was previously in an "all pages pinned" state then look to
 * see if this freeing clears that state.
 *
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
static int
free_pages_bulk ( struct zone * zone , int count ,
struct list_head * list , unsigned int order )
{
unsigned long flags ;
struct page * page = NULL ;
int ret = 0 ;
spin_lock_irqsave ( & zone - > lock , flags ) ;
zone - > all_unreclaimable = 0 ;
zone - > pages_scanned = 0 ;
while ( ! list_empty ( list ) & & count - - ) {
page = list_entry ( list - > prev , struct page , lru ) ;
/* have to delete it as __free_pages_bulk list manipulates */
list_del ( & page - > lru ) ;
__free_pages_bulk ( page , zone , order ) ;
ret + + ;
}
spin_unlock_irqrestore ( & zone - > lock , flags ) ;
return ret ;
}
/*
 * Free the 1 << @order block starting at @page straight to the buddy
 * lists: account it in pgfree, run free_pages_check() on every page,
 * unmap it via kernel_map_pages() and hand it to free_pages_bulk().
 * Without CONFIG_MMU, __put_page() is first called on every page after
 * the first.
 */
void __free_pages_ok(struct page *page, unsigned int order)
{
	LIST_HEAD(list);
	const int nr_pages = 1 << order;
	int i;

	arch_free_page(page, order);
	mod_page_state(pgfree, nr_pages);

#ifndef CONFIG_MMU
	/* Loop body never runs for order 0 (nr_pages == 1). */
	for (i = 1; i < nr_pages; ++i)
		__put_page(page + i);
#endif

	for (i = 0; i < nr_pages; ++i)
		free_pages_check(__FUNCTION__, page + i);

	list_add(&page->lru, &list);
	kernel_map_pages(page, nr_pages, 0);
	free_pages_bulk(page_zone(page), 1, &list, order);
}
/*
* The order of subdivision here is critical for the IO subsystem .
* Please do not alter this order without good reasons and regression
* testing . Specifically , as large blocks of memory are subdivided ,
* the order in which smaller blocks are delivered depends on the order
* they ' re subdivided in this function . This is the primary factor
* influencing the order in which pages are delivered to the IO
* subsystem according to empirical testing , and this is also justified
* by considering the behavior of a buddy system containing a single
* large block of memory acted on by a series of small allocations .
* This behavior is a critical factor in sglist merging ' s success .
*
* - - wli
*/
static inline struct page *
expand ( struct zone * zone , struct page * page ,
int low , int high , struct free_area * area )
{
unsigned long size = 1 < < high ;
while ( high > low ) {
area - - ;
high - - ;
size > > = 1 ;
BUG_ON ( bad_range ( zone , & page [ size ] ) ) ;
list_add ( & page [ size ] . lru , & area - > free_list ) ;
area - > nr_free + + ;
set_page_order ( & page [ size ] , high ) ;
}
return page ;
}
void set_page_refs ( struct page * page , int order )
{
# ifdef CONFIG_MMU
set_page_count ( page , 1 ) ;
# else
int i ;
/*
* We need to reference all the pages for this order , otherwise if
* anyone accesses one of the pages with ( get / put ) it will be freed .
* - eg : access_process_vm ( )
*/
for ( i = 0 ; i < ( 1 < < order ) ; i + + )
set_page_count ( page + i , 1 ) ;
# endif /* CONFIG_MMU */
}
/*
* This page is about to be returned from the page allocator
*/
static void prep_new_page ( struct page * page , int order )
{
2005-06-22 04:15:08 +04:00
if ( page_mapcount ( page ) | |
page - > mapping ! = NULL | |
page_count ( page ) ! = 0 | |
( page - > flags & (
1 < < PG_lru |
2005-04-17 02:20:36 +04:00
1 < < PG_private |
1 < < PG_locked |
1 < < PG_active |
1 < < PG_dirty |
1 < < PG_reclaim |
2005-06-22 04:15:08 +04:00
1 < < PG_slab |
2005-04-17 02:20:36 +04:00
1 < < PG_swapcache |
2005-10-30 04:16:12 +03:00
1 < < PG_writeback |
1 < < PG_reserved ) ) )
2005-04-17 02:20:36 +04:00
bad_page ( __FUNCTION__ , page ) ;
page - > flags & = ~ ( 1 < < PG_uptodate | 1 < < PG_error |
1 < < PG_referenced | 1 < < PG_arch_1 |
1 < < PG_checked | 1 < < PG_mappedtodisk ) ;
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:40 +03:00
set_page_private ( page , 0 ) ;
2005-04-17 02:20:36 +04:00
set_page_refs ( page , order ) ;
kernel_map_pages ( page , 1 < < order , 1 ) ;
}
/*
* Do the hard work of removing an element from the buddy allocator .
* Call me with the zone - > lock already held .
*/
static struct page * __rmqueue ( struct zone * zone , unsigned int order )
{
struct free_area * area ;
unsigned int current_order ;
struct page * page ;
for ( current_order = order ; current_order < MAX_ORDER ; + + current_order ) {
area = zone - > free_area + current_order ;
if ( list_empty ( & area - > free_list ) )
continue ;
page = list_entry ( area - > free_list . next , struct page , lru ) ;
list_del ( & page - > lru ) ;
rmv_page_order ( page ) ;
area - > nr_free - - ;
zone - > free_pages - = 1UL < < order ;
return expand ( zone , page , order , current_order , area ) ;
}
return NULL ;
}
/*
* Obtain a specified number of elements from the buddy allocator , all under
* a single hold of the lock , for efficiency . Add them to the supplied list .
* Returns the number of new pages which were placed at * list .
*/
static int rmqueue_bulk ( struct zone * zone , unsigned int order ,
unsigned long count , struct list_head * list )
{
unsigned long flags ;
int i ;
int allocated = 0 ;
struct page * page ;
spin_lock_irqsave ( & zone - > lock , flags ) ;
for ( i = 0 ; i < count ; + + i ) {
page = __rmqueue ( zone , order ) ;
if ( page = = NULL )
break ;
allocated + + ;
list_add_tail ( & page - > lru , list ) ;
}
spin_unlock_irqrestore ( & zone - > lock , flags ) ;
return allocated ;
}
2005-06-22 04:14:57 +04:00
# ifdef CONFIG_NUMA
/*
 * Called from the slab reaper to drain remote pagesets.
 *
 * With local interrupts disabled, walks every zone that is not on the
 * current node and returns the pages held on this CPU's per-cpu lists
 * for that zone to the buddy allocator.
 */
void drain_remote_pages(void)
{
	struct zone *zone;
	int i;
	unsigned long flags;

	local_irq_save(flags);
	for_each_zone(zone) {
		struct per_cpu_pageset *pset;

		/* Do not drain local pagesets */
		if (zone->zone_pgdat->node_id == numa_node_id())
			continue;

		/* Same accessor the other per-cpu pageset users here rely on. */
		pset = zone_pcp(zone, smp_processor_id());
		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp;

			pcp = &pset->pcp[i];
			if (pcp->count)
				pcp->count -= free_pages_bulk(zone, pcp->count,
							      &pcp->list, 0);
		}
	}
	local_irq_restore(flags);
}
# endif
2005-04-17 02:20:36 +04:00
# if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
/*
 * Return every page on @cpu's per-cpu lists, in every zone, to the buddy
 * allocator.  drain_local_pages() calls this with local interrupts
 * disabled.
 */
static void __drain_pages(unsigned int cpu)
{
	struct zone *zone;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
		int i;

		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp = &pset->pcp[i];

			pcp->count -= free_pages_bulk(zone, pcp->count,
						      &pcp->list, 0);
		}
	}
}
# endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
# ifdef CONFIG_PM
/*
 * Under zone->lock, clear the NosaveFree flag on every page in @zone's
 * span and then set it on every page of every block currently on the
 * zone's free lists.  Does nothing for a zone with no spanned pages.
 */
void mark_free_pages(struct zone *zone)
{
	unsigned long flags;
	unsigned long offset;
	struct list_head *pos;
	int order;

	if (!zone->spanned_pages)
		return;

	spin_lock_irqsave(&zone->lock, flags);

	for (offset = 0; offset < zone->spanned_pages; ++offset)
		ClearPageNosaveFree(pfn_to_page(offset + zone->zone_start_pfn));

	for (order = MAX_ORDER - 1; order >= 0; --order) {
		list_for_each(pos, &zone->free_area[order].free_list) {
			struct page *head = list_entry(pos, struct page, lru);
			unsigned long start_pfn = page_to_pfn(head);
			unsigned long i;

			for (i = 0; i < (1 << order); i++)
				SetPageNosaveFree(pfn_to_page(start_pfn + i));
		}
	}

	spin_unlock_irqrestore(&zone->lock, flags);
}
/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 * Runs __drain_pages() for the current CPU with local interrupts off.
 */
void drain_local_pages(void)
{
	unsigned long irq_state;

	local_irq_save(irq_state);
	__drain_pages(smp_processor_id());
	local_irq_restore(irq_state);
}
# endif /* CONFIG_PM */
/*
 * Bump the per-cpu NUMA counters after a page was taken from zone @z for
 * a request that used @zonelist (called at got_pg in __alloc_pages()).
 *
 *  - numa_hit:     @z is on the same node as the first zone of @zonelist.
 *  - numa_miss:    it is not; the first zone's pageset for this CPU gets
 *                  numa_foreign incremented as well.
 *  - local_node /
 *    other_node:   whether @z's node is the node this CPU is running on.
 *
 * Counters are updated with local interrupts disabled.  Without
 * CONFIG_NUMA the function body is empty.
 */
static void zone_statistics(struct zonelist *zonelist, struct zone *z)
{
#ifdef CONFIG_NUMA
	struct zone *preferred = zonelist->zones[0];
	struct per_cpu_pageset *pset;
	unsigned long irq_state;
	int this_cpu;

	local_irq_save(irq_state);
	this_cpu = smp_processor_id();
	pset = zone_pcp(z, this_cpu);

	if (z->zone_pgdat != preferred->zone_pgdat) {
		pset->numa_miss++;
		zone_pcp(preferred, this_cpu)->numa_foreign++;
	} else {
		pset->numa_hit++;
	}

	if (z->zone_pgdat != NODE_DATA(numa_node_id()))
		pset->other_node++;
	else
		pset->local_node++;

	local_irq_restore(irq_state);
#endif
}
/*
 * Free a 0-order page.
 *
 * After the arch/debug hooks and free_pages_check(), the page is added to
 * the head of one of the calling CPU's per-cpu lists for its zone; @cold
 * is used directly as the index into the pageset's pcp[] array.  When the
 * list count reaches pcp->high, free_pages_bulk() is asked to release
 * pcp->batch pages and its return value is subtracted from the count.
 */
static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
static void fastcall free_hot_cold_page(struct page *page, int cold)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *pcp;
	unsigned long irq_state;
	int cpu;

	arch_free_page(page, 0);

	kernel_map_pages(page, 1, 0);
	inc_page_state(pgfree);
	if (PageAnon(page))
		page->mapping = NULL;
	free_pages_check(__FUNCTION__, page);

	/* get_cpu()/put_cpu() bracket the use of this CPU's pageset */
	cpu = get_cpu();
	pcp = &zone_pcp(zone, cpu)->pcp[cold];

	local_irq_save(irq_state);
	list_add(&page->lru, &pcp->list);
	if (++pcp->count >= pcp->high)
		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
	local_irq_restore(irq_state);

	put_cpu();
}
/* Free an order-0 page through free_hot_cold_page() with cold == 0. */
void fastcall free_hot_page(struct page *page)
{
	free_hot_cold_page(page, 0);
}
/* Free an order-0 page through free_hot_cold_page() with cold == 1. */
void fastcall free_cold_page(struct page *page)
{
	free_hot_cold_page(page, 1);
}
2005-10-07 10:46:04 +04:00
/*
 * Zero the 1 << order pages starting at @page with clear_highpage().
 *
 * BUGs when @gfp_flags has __GFP_HIGHMEM set without __GFP_WAIT.
 */
static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
{
	struct page *cur;
	struct page *end;

	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);

	end = page + (1 << order);
	for (cur = page; cur < end; cur++)
		clear_highpage(cur);
}
/*
* Really , prep_compound_page ( ) should be called from __rmqueue_bulk ( ) . But
* we cheat by calling it from here , in the order > 0 path . Saves a branch
* or two .
*/
static struct page *
2005-10-07 10:46:04 +04:00
buffered_rmqueue ( struct zone * zone , int order , gfp_t gfp_flags )
2005-04-17 02:20:36 +04:00
{
unsigned long flags ;
struct page * page = NULL ;
int cold = ! ! ( gfp_flags & __GFP_COLD ) ;
if ( order = = 0 ) {
struct per_cpu_pages * pcp ;
2005-06-22 04:14:47 +04:00
pcp = & zone_pcp ( zone , get_cpu ( ) ) - > pcp [ cold ] ;
2005-04-17 02:20:36 +04:00
local_irq_save ( flags ) ;
if ( pcp - > count < = pcp - > low )
pcp - > count + = rmqueue_bulk ( zone , 0 ,
pcp - > batch , & pcp - > list ) ;
if ( pcp - > count ) {
page = list_entry ( pcp - > list . next , struct page , lru ) ;
list_del ( & page - > lru ) ;
pcp - > count - - ;
}
local_irq_restore ( flags ) ;
put_cpu ( ) ;
}
if ( page = = NULL ) {
spin_lock_irqsave ( & zone - > lock , flags ) ;
page = __rmqueue ( zone , order ) ;
spin_unlock_irqrestore ( & zone - > lock , flags ) ;
}
if ( page ! = NULL ) {
BUG_ON ( bad_range ( zone , page ) ) ;
mod_page_state_zone ( zone , pgalloc , 1 < < order ) ;
prep_new_page ( page , order ) ;
if ( gfp_flags & __GFP_ZERO )
prep_zero_page ( page , order , gfp_flags ) ;
if ( order & & ( gfp_flags & __GFP_COMP ) )
prep_compound_page ( page , order ) ;
}
return page ;
}
/*
 * Return 1 if free pages are above 'mark'.  This takes into account the
 * order of the allocation.
 *
 * @mark is first lowered by half when @gfp_high is set and then by a
 * further quarter of what remains when @can_try_harder is set.  The zone's
 * free page count, less the pages the request itself would consume, must
 * exceed that threshold plus lowmem_reserve[@classzone_idx].  For order > 0
 * the check is then repeated once per lower order: the pages sitting in
 * that order's free lists are discounted and the threshold is halved.
 */
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int can_try_harder, gfp_t gfp_high)
{
	long threshold = mark;
	/* This may come out negative - that's OK */
	long avail = z->free_pages - (1 << order) + 1;
	int cur;

	if (gfp_high)
		threshold -= threshold / 2;
	if (can_try_harder)
		threshold -= threshold / 4;

	/*
	 * NOTE(review): if lowmem_reserve[] has an unsigned type, this
	 * comparison is carried out unsigned and a negative 'avail' is not
	 * caught here (the loop below would still catch it) - confirm
	 * against the struct zone declaration.
	 */
	if (avail <= threshold + z->lowmem_reserve[classzone_idx])
		return 0;

	for (cur = 0; cur < order; cur++) {
		/* At the next order, this order's pages become unavailable */
		avail -= z->free_area[cur].nr_free << cur;

		/* Require fewer higher order pages to be free */
		threshold >>= 1;

		if (avail <= threshold)
			return 0;
	}
	return 1;
}
2005-06-22 04:14:41 +04:00
static inline int
2005-10-07 10:46:04 +04:00
should_reclaim_zone ( struct zone * z , gfp_t gfp_mask )
2005-06-22 04:14:41 +04:00
{
if ( ! z - > reclaim_pages )
return 0 ;
2005-06-22 04:14:42 +04:00
if ( gfp_mask & __GFP_NORECLAIM )
return 0 ;
2005-06-22 04:14:41 +04:00
return 1 ;
}
2005-04-17 02:20:36 +04:00
/*
* This is the ' heart ' of the zoned buddy allocator .
*/
struct page * fastcall
2005-10-07 10:46:04 +04:00
__alloc_pages ( gfp_t gfp_mask , unsigned int order ,
2005-04-17 02:20:36 +04:00
struct zonelist * zonelist )
{
2005-10-21 11:22:44 +04:00
const gfp_t wait = gfp_mask & __GFP_WAIT ;
2005-04-17 02:20:36 +04:00
struct zone * * zones , * z ;
struct page * page ;
struct reclaim_state reclaim_state ;
struct task_struct * p = current ;
int i ;
int classzone_idx ;
int do_retry ;
int can_try_harder ;
int did_some_progress ;
might_sleep_if ( wait ) ;
/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim , or is the caller has realtime scheduling
* policy
*/
can_try_harder = ( unlikely ( rt_task ( p ) ) & & ! in_interrupt ( ) ) | | ! wait ;
zones = zonelist - > zones ; /* the list of zones suitable for gfp_mask */
if ( unlikely ( zones [ 0 ] = = NULL ) ) {
/* Should this ever happen?? */
return NULL ;
}
classzone_idx = zone_idx ( zones [ 0 ] ) ;
2005-06-22 04:14:41 +04:00
restart :
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 02:18:12 +04:00
/*
* Go through the zonelist once , looking for a zone with enough free .
* See also cpuset_zone_allowed ( ) comment in kernel / cpuset . c .
*/
2005-04-17 02:20:36 +04:00
for ( i = 0 ; ( z = zones [ i ] ) ! = NULL ; i + + ) {
2005-06-22 04:14:41 +04:00
int do_reclaim = should_reclaim_zone ( z , gfp_mask ) ;
2005-04-17 02:20:36 +04:00
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 02:18:12 +04:00
if ( ! cpuset_zone_allowed ( z , __GFP_HARDWALL ) )
2005-04-17 02:20:36 +04:00
continue ;
2005-06-22 04:14:41 +04:00
/*
* If the zone is to attempt early page reclaim then this loop
* will try to reclaim pages and check the watermark a second
* time before giving up and falling back to the next zone .
*/
zone_reclaim_retry :
if ( ! zone_watermark_ok ( z , order , z - > pages_low ,
classzone_idx , 0 , 0 ) ) {
if ( ! do_reclaim )
continue ;
else {
zone_reclaim ( z , gfp_mask , order ) ;
/* Only try reclaim once */
do_reclaim = 0 ;
goto zone_reclaim_retry ;
}
}
2005-04-17 02:20:36 +04:00
page = buffered_rmqueue ( z , order , gfp_mask ) ;
if ( page )
goto got_pg ;
}
for ( i = 0 ; ( z = zones [ i ] ) ! = NULL ; i + + )
wakeup_kswapd ( z , order ) ;
/*
* Go through the zonelist again . Let __GFP_HIGH and allocations
* coming from realtime tasks to go deeper into reserves
*
* This is the last chance , in general , before the goto nopage .
* Ignore cpuset if GFP_ATOMIC ( ! wait ) rather than fail alloc .
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 02:18:12 +04:00
* See also cpuset_zone_allowed ( ) comment in kernel / cpuset . c .
2005-04-17 02:20:36 +04:00
*/
for ( i = 0 ; ( z = zones [ i ] ) ! = NULL ; i + + ) {
if ( ! zone_watermark_ok ( z , order , z - > pages_min ,
classzone_idx , can_try_harder ,
gfp_mask & __GFP_HIGH ) )
continue ;
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 02:18:12 +04:00
if ( wait & & ! cpuset_zone_allowed ( z , gfp_mask ) )
2005-04-17 02:20:36 +04:00
continue ;
page = buffered_rmqueue ( z , order , gfp_mask ) ;
if ( page )
goto got_pg ;
}
/* This allocation should allow future memory freeing. */
2005-05-01 19:58:36 +04:00
if ( ( ( p - > flags & PF_MEMALLOC ) | | unlikely ( test_thread_flag ( TIF_MEMDIE ) ) )
& & ! in_interrupt ( ) ) {
if ( ! ( gfp_mask & __GFP_NOMEMALLOC ) ) {
2005-11-14 03:06:41 +03:00
nofail_alloc :
2005-05-01 19:58:36 +04:00
/* go through the zonelist yet again, ignoring mins */
for ( i = 0 ; ( z = zones [ i ] ) ! = NULL ; i + + ) {
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 02:18:12 +04:00
if ( ! cpuset_zone_allowed ( z , gfp_mask ) )
2005-05-01 19:58:36 +04:00
continue ;
page = buffered_rmqueue ( z , order , gfp_mask ) ;
if ( page )
goto got_pg ;
}
2005-11-14 03:06:41 +03:00
if ( gfp_mask & __GFP_NOFAIL ) {
blk_congestion_wait ( WRITE , HZ / 50 ) ;
goto nofail_alloc ;
}
2005-04-17 02:20:36 +04:00
}
goto nopage ;
}
/* Atomic allocations - we can't balance anything */
if ( ! wait )
goto nopage ;
rebalance :
cond_resched ( ) ;
/* We now go into synchronous reclaim */
p - > flags | = PF_MEMALLOC ;
reclaim_state . reclaimed_slab = 0 ;
p - > reclaim_state = & reclaim_state ;
2005-06-22 04:14:53 +04:00
did_some_progress = try_to_free_pages ( zones , gfp_mask ) ;
2005-04-17 02:20:36 +04:00
p - > reclaim_state = NULL ;
p - > flags & = ~ PF_MEMALLOC ;
cond_resched ( ) ;
if ( likely ( did_some_progress ) ) {
for ( i = 0 ; ( z = zones [ i ] ) ! = NULL ; i + + ) {
if ( ! zone_watermark_ok ( z , order , z - > pages_min ,
classzone_idx , can_try_harder ,
gfp_mask & __GFP_HIGH ) )
continue ;
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 02:18:12 +04:00
if ( ! cpuset_zone_allowed ( z , gfp_mask ) )
2005-04-17 02:20:36 +04:00
continue ;
page = buffered_rmqueue ( z , order , gfp_mask ) ;
if ( page )
goto got_pg ;
}
} else if ( ( gfp_mask & __GFP_FS ) & & ! ( gfp_mask & __GFP_NORETRY ) ) {
/*
* Go through the zonelist yet one more time , keep
* very high watermark here , this is only to catch
* a parallel oom killing , we must fail if we ' re still
* under heavy pressure .
*/
for ( i = 0 ; ( z = zones [ i ] ) ! = NULL ; i + + ) {
if ( ! zone_watermark_ok ( z , order , z - > pages_high ,
classzone_idx , 0 , 0 ) )
continue ;
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution. With this patch, there are now the following four layers of
memory placement available:
1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
3) The current tasks cpuset (GFP_USER allocations constrained to here), and
4) Specific node placement, using mbind and set_mempolicy.
These nest - each layer is a subset (same or within) of the previous.
Layer (2) above is new, with this patch. The call used to check whether a
zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is
extended to take a gfp_mask argument, and its logic is extended, in the case
that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset
hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if
placement is allowed. The definition of GFP_USER, which used to be identical
to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous
cpuset_gfp_hardwall_flag patch.
GFP_ATOMIC and GFP_KERNEL allocations will stay within the current tasks
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer, if need be.
The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such. Swapper and oom_kill activity is also constrained to Layer (2). A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset. Heavy use of kernel memory for i/o caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.
Like Dinakar's patch earlier to enable administering sched domains using the
cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag
that had previously done nothing much useful other than restrict what cpuset
configurations were allowed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-07 02:18:12 +04:00
if ( ! cpuset_zone_allowed ( z , __GFP_HARDWALL ) )
2005-04-17 02:20:36 +04:00
continue ;
page = buffered_rmqueue ( z , order , gfp_mask ) ;
if ( page )
goto got_pg ;
}
2005-07-08 04:56:04 +04:00
out_of_memory ( gfp_mask , order ) ;
2005-04-17 02:20:36 +04:00
goto restart ;
}
/*
* Don ' t let big - order allocations loop unless the caller explicitly
* requests that . Wait for some write requests to complete then retry .
*
* In this implementation , __GFP_REPEAT means __GFP_NOFAIL for order
* < = 3 , but that may not be true in other implementations .
*/
do_retry = 0 ;
if ( ! ( gfp_mask & __GFP_NORETRY ) ) {
if ( ( order < = 3 ) | | ( gfp_mask & __GFP_REPEAT ) )
do_retry = 1 ;
if ( gfp_mask & __GFP_NOFAIL )
do_retry = 1 ;
}
if ( do_retry ) {
blk_congestion_wait ( WRITE , HZ / 50 ) ;
goto rebalance ;
}
nopage :
if ( ! ( gfp_mask & __GFP_NOWARN ) & & printk_ratelimit ( ) ) {
printk ( KERN_WARNING " %s: page allocation failure. "
" order:%d, mode:0x%x \n " ,
p - > comm , order , gfp_mask ) ;
dump_stack ( ) ;
2005-06-22 04:14:56 +04:00
show_mem ( ) ;
2005-04-17 02:20:36 +04:00
}
return NULL ;
got_pg :
zone_statistics ( zonelist , z ) ;
return page ;
}
EXPORT_SYMBOL ( __alloc_pages ) ;
/*
* Common helper functions .
*/
2005-10-07 10:46:04 +04:00
/*
 * Allocate pages with alloc_pages(@gfp_mask, @order) and return the
 * page_address() of the result as an unsigned long, or 0 when the
 * allocation failed.
 */
fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page = alloc_pages(gfp_mask, order);

	return page ? (unsigned long)page_address(page) : 0;
}
EXPORT_SYMBOL(__get_free_pages);
2005-10-07 10:46:04 +04:00
/*
 * Allocate a single page with __GFP_ZERO added to @gfp_mask and return its
 * page_address() as an unsigned long, or 0 on failure.
 *
 * BUGs if @gfp_mask contains __GFP_HIGHMEM.
 */
fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
{
	struct page *page;

	/*
	 * get_zeroed_page() returns a 32-bit address, which cannot represent
	 * a highmem page
	 */
	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
	if (!page)
		return 0;

	return (unsigned long)page_address(page);
}
EXPORT_SYMBOL(get_zeroed_page);
void __pagevec_free ( struct pagevec * pvec )
{
int i = pagevec_count ( pvec ) ;
while ( - - i > = 0 )
free_hot_cold_page ( pvec - > pages [ i ] , pvec - > cold ) ;
}
/*
 * Call put_page_testzero() on @page and, only when it returns true, free
 * the page: order 0 goes through free_hot_page(), any other order through
 * __free_pages_ok().
 */
fastcall void __free_pages(struct page *page, unsigned int order)
{
	if (!put_page_testzero(page))
		return;

	if (order != 0)
		__free_pages_ok(page, order);
	else
		free_hot_page(page);
}
EXPORT_SYMBOL(__free_pages);
/*
 * Address-based front end to __free_pages().  An @addr of 0 is silently
 * ignored; any other value must satisfy virt_addr_valid() or the kernel
 * BUGs.
 */
fastcall void free_pages(unsigned long addr, unsigned int order)
{
	if (addr == 0)
		return;

	BUG_ON(!virt_addr_valid((void *)addr));
	__free_pages(virt_to_page((void *)addr), order);
}
EXPORT_SYMBOL(free_pages);
/*
* Total amount of free ( allocatable ) RAM :
*/
unsigned int nr_free_pages ( void )
{
unsigned int sum = 0 ;
struct zone * zone ;
for_each_zone ( zone )
sum + = zone - > free_pages ;
return sum ;
}
EXPORT_SYMBOL ( nr_free_pages ) ;
#ifdef CONFIG_NUMA
/* Sum of free_pages over all MAX_NR_ZONES entries of @pgdat->node_zones. */
unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
{
	struct zone *zone = pgdat->node_zones;
	struct zone *end = zone + MAX_NR_ZONES;
	unsigned int total = 0;

	for (; zone < end; zone++)
		total += zone->free_pages;

	return total;
}
#endif
static unsigned int nr_free_zone_pages ( int offset )
{
2005-07-30 09:59:18 +04:00
/* Just pick one node, since fallback list is circular */
pg_data_t * pgdat = NODE_DATA ( numa_node_id ( ) ) ;
2005-04-17 02:20:36 +04:00
unsigned int sum = 0 ;
2005-07-30 09:59:18 +04:00
struct zonelist * zonelist = pgdat - > node_zonelists + offset ;
struct zone * * zonep = zonelist - > zones ;
struct zone * zone ;
2005-04-17 02:20:36 +04:00
2005-07-30 09:59:18 +04:00
for ( zone = * zonep + + ; zone ; zone = * zonep + + ) {
unsigned long size = zone - > present_pages ;
unsigned long high = zone - > pages_high ;
if ( size > high )
sum + = size - high ;
2005-04-17 02:20:36 +04:00
}
return sum ;
}
/*
* Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
*/
unsigned int nr_free_buffer_pages ( void )
{
2005-10-21 10:55:38 +04:00
return nr_free_zone_pages ( gfp_zone ( GFP_USER ) ) ;
2005-04-17 02:20:36 +04:00
}
/*
* Amount of free RAM allocatable within all zones
*/
unsigned int nr_free_pagecache_pages ( void )
{
2005-10-21 10:55:38 +04:00
return nr_free_zone_pages ( gfp_zone ( GFP_HIGHUSER ) ) ;
2005-04-17 02:20:36 +04:00
}
# ifdef CONFIG_HIGHMEM
unsigned int nr_free_highpages ( void )
{
pg_data_t * pgdat ;
unsigned int pages = 0 ;
for_each_pgdat ( pgdat )
pages + = pgdat - > node_zones [ ZONE_HIGHMEM ] . free_pages ;
return pages ;
}
# endif
#ifdef CONFIG_NUMA
/* Print the id of the node that owns @zone; used as a prefix by show_free_areas(). */
static void show_node(struct zone *zone)
{
	int nid = zone->zone_pgdat->node_id;

	printk(" Node %d ", nid);
}
#else
/* Without NUMA there is no node prefix to print. */
#define show_node(zone) do { } while (0)
#endif
/*
* Accumulate the page_state information across all CPUs .
* The result is unavoidably approximate - it can change
* during and after execution of this function .
*/
/* Per-CPU struct page_state counters, zero-initialised. */
static DEFINE_PER_CPU(struct page_state, page_states) = {0};

/* Global atomic counter, initialised to zero and exported to modules. */
atomic_t nr_pagecache = ATOMIC_INIT(0);
EXPORT_SYMBOL(nr_pagecache);

#ifdef CONFIG_SMP
/* Per-CPU long, SMP builds only; presumably a local delta for nr_pagecache. */
DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
#endif
2005-09-04 02:55:11 +04:00
/*
 * Sum the first @nr unsigned-long sized fields of the per-CPU page_states
 * into @ret, for every CPU that is set in *@cpumask and also online.
 *
 * @ret is cleared first.  Note that *@cpumask is modified in place: it is
 * ANDed with cpu_online_map before the walk.
 */
void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
{
	int cpu;

	memset(ret, 0, sizeof(*ret));
	cpus_and(*cpumask, *cpumask, cpu_online_map);

	for (cpu = first_cpu(*cpumask); cpu < NR_CPUS; ) {
		unsigned long *src, *dst, idx;

		src = (unsigned long *)&per_cpu(page_states, cpu);

		/* Step to the next CPU now so its counters can be prefetched. */
		cpu = next_cpu(cpu, *cpumask);
		if (cpu < NR_CPUS)
			prefetch(&per_cpu(page_states, cpu));

		dst = (unsigned long *)ret;
		for (idx = 0; idx < nr; idx++)
			*dst++ += *src++;
	}
}
2005-09-04 02:55:11 +04:00
/*
 * Fill @ret with the page_state fields up to and including
 * GET_PAGE_STATE_LAST, summed over the CPUs returned by
 * node_to_cpumask(@node).
 */
void get_page_state_node(struct page_state *ret, int node)
{
	cpumask_t mask = node_to_cpumask(node);
	int nr = offsetof(struct page_state, GET_PAGE_STATE_LAST) /
			sizeof(unsigned long);

	/* +1 so the GET_PAGE_STATE_LAST field itself is included. */
	__get_page_state(ret, nr + 1, &mask);
}
2005-04-17 02:20:36 +04:00
void get_page_state ( struct page_state * ret )
{
int nr ;
2005-09-04 02:55:11 +04:00
cpumask_t mask = CPU_MASK_ALL ;
2005-04-17 02:20:36 +04:00
nr = offsetof ( struct page_state , GET_PAGE_STATE_LAST ) ;
nr / = sizeof ( unsigned long ) ;
2005-09-04 02:55:11 +04:00
__get_page_state ( ret , nr + 1 , & mask ) ;
2005-04-17 02:20:36 +04:00
}
void get_full_page_state ( struct page_state * ret )
{
2005-09-04 02:55:11 +04:00
cpumask_t mask = CPU_MASK_ALL ;
__get_page_state ( ret , sizeof ( * ret ) / sizeof ( unsigned long ) , & mask ) ;
2005-04-17 02:20:36 +04:00
}
2005-06-22 04:14:55 +04:00
unsigned long __read_page_state ( unsigned long offset )
2005-04-17 02:20:36 +04:00
{
unsigned long ret = 0 ;
int cpu ;
for_each_online_cpu ( cpu ) {
unsigned long in ;
in = ( unsigned long ) & per_cpu ( page_states , cpu ) + offset ;
ret + = * ( ( unsigned long * ) in ) ;
}
return ret ;
}
2005-06-22 04:14:54 +04:00
void __mod_page_state ( unsigned long offset , unsigned long delta )
2005-04-17 02:20:36 +04:00
{
unsigned long flags ;
void * ptr ;
local_irq_save ( flags ) ;
ptr = & __get_cpu_var ( page_states ) ;
* ( unsigned long * ) ( ptr + offset ) + = delta ;
local_irq_restore ( flags ) ;
}
EXPORT_SYMBOL ( __mod_page_state ) ;
void __get_zone_counts ( unsigned long * active , unsigned long * inactive ,
unsigned long * free , struct pglist_data * pgdat )
{
struct zone * zones = pgdat - > node_zones ;
int i ;
* active = 0 ;
* inactive = 0 ;
* free = 0 ;
for ( i = 0 ; i < MAX_NR_ZONES ; i + + ) {
* active + = zones [ i ] . nr_active ;
* inactive + = zones [ i ] . nr_inactive ;
* free + = zones [ i ] . free_pages ;
}
}
/*
 * System-wide variant of __get_zone_counts(): accumulates the per-node
 * active / inactive / free totals over every node visited by
 * for_each_pgdat().
 */
void get_zone_counts(unsigned long *active,
		unsigned long *inactive, unsigned long *free)
{
	struct pglist_data *pgdat;

	*active = 0;
	*inactive = 0;
	*free = 0;

	for_each_pgdat(pgdat) {
		unsigned long node_active, node_inactive, node_free;

		__get_zone_counts(&node_active, &node_inactive, &node_free, pgdat);
		*active += node_active;
		*inactive += node_inactive;
		*free += node_free;
	}
}
void si_meminfo ( struct sysinfo * val )
{
val - > totalram = totalram_pages ;
val - > sharedram = 0 ;
val - > freeram = nr_free_pages ( ) ;
val - > bufferram = nr_blockdev_pages ( ) ;
# ifdef CONFIG_HIGHMEM
val - > totalhigh = totalhigh_pages ;
val - > freehigh = nr_free_highpages ( ) ;
# else
val - > totalhigh = 0 ;
val - > freehigh = 0 ;
# endif
val - > mem_unit = PAGE_SIZE ;
}
EXPORT_SYMBOL ( si_meminfo ) ;
# ifdef CONFIG_NUMA
void si_meminfo_node ( struct sysinfo * val , int nid )
{
pg_data_t * pgdat = NODE_DATA ( nid ) ;
val - > totalram = pgdat - > node_present_pages ;
val - > freeram = nr_free_pages_pgdat ( pgdat ) ;
val - > totalhigh = pgdat - > node_zones [ ZONE_HIGHMEM ] . present_pages ;
val - > freehigh = pgdat - > node_zones [ ZONE_HIGHMEM ] . free_pages ;
val - > mem_unit = PAGE_SIZE ;
}
# endif
# define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Show free area list ( used inside shift_scroll - lock stuff )
* We also calculate the percentage fragmentation . We do this by counting the
* memory on each free list with the exception of the first item on the list .
*/
void show_free_areas ( void )
{
struct page_state ps ;
int cpu , temperature ;
unsigned long active ;
unsigned long inactive ;
unsigned long free ;
struct zone * zone ;
for_each_zone ( zone ) {
show_node ( zone ) ;
printk ( " %s per-cpu: " , zone - > name ) ;
if ( ! zone - > present_pages ) {
printk ( " empty \n " ) ;
continue ;
} else
printk ( " \n " ) ;
2005-11-10 23:45:56 +03:00
for_each_online_cpu ( cpu ) {
2005-04-17 02:20:36 +04:00
struct per_cpu_pageset * pageset ;
2005-06-22 04:14:47 +04:00
pageset = zone_pcp ( zone , cpu ) ;
2005-04-17 02:20:36 +04:00
for ( temperature = 0 ; temperature < 2 ; temperature + + )
2005-06-22 04:14:57 +04:00
printk ( " cpu %d %s: low %d, high %d, batch %d used:%d \n " ,
2005-04-17 02:20:36 +04:00
cpu ,
temperature ? " cold " : " hot " ,
pageset - > pcp [ temperature ] . low ,
pageset - > pcp [ temperature ] . high ,
2005-06-22 04:14:57 +04:00
pageset - > pcp [ temperature ] . batch ,
pageset - > pcp [ temperature ] . count ) ;
2005-04-17 02:20:36 +04:00
}
}
get_page_state ( & ps ) ;
get_zone_counts ( & active , & inactive , & free ) ;
2005-06-22 04:15:14 +04:00
printk ( " Free pages: %11ukB (%ukB HighMem) \n " ,
2005-04-17 02:20:36 +04:00
K ( nr_free_pages ( ) ) ,
K ( nr_free_highpages ( ) ) ) ;
printk ( " Active:%lu inactive:%lu dirty:%lu writeback:%lu "
" unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu \n " ,
active ,
inactive ,
ps . nr_dirty ,
ps . nr_writeback ,
ps . nr_unstable ,
nr_free_pages ( ) ,
ps . nr_slab ,
ps . nr_mapped ,
ps . nr_page_table_pages ) ;
for_each_zone ( zone ) {
int i ;
show_node ( zone ) ;
printk ( " %s "
" free:%lukB "
" min:%lukB "
" low:%lukB "
" high:%lukB "
" active:%lukB "
" inactive:%lukB "
" present:%lukB "
" pages_scanned:%lu "
" all_unreclaimable? %s "
" \n " ,
zone - > name ,
K ( zone - > free_pages ) ,
K ( zone - > pages_min ) ,
K ( zone - > pages_low ) ,
K ( zone - > pages_high ) ,
K ( zone - > nr_active ) ,
K ( zone - > nr_inactive ) ,
K ( zone - > present_pages ) ,
zone - > pages_scanned ,
( zone - > all_unreclaimable ? " yes " : " no " )
) ;
printk ( " lowmem_reserve[]: " ) ;
for ( i = 0 ; i < MAX_NR_ZONES ; i + + )
printk ( " %lu " , zone - > lowmem_reserve [ i ] ) ;
printk ( " \n " ) ;
}
for_each_zone ( zone ) {
unsigned long nr , flags , order , total = 0 ;
show_node ( zone ) ;
printk ( " %s: " , zone - > name ) ;
if ( ! zone - > present_pages ) {
printk ( " empty \n " ) ;
continue ;
}
spin_lock_irqsave ( & zone - > lock , flags ) ;
for ( order = 0 ; order < MAX_ORDER ; order + + ) {
nr = zone - > free_area [ order ] . nr_free ;
total + = nr < < order ;
printk ( " %lu*%lukB " , nr , K ( 1UL ) < < order ) ;
}
spin_unlock_irqrestore ( & zone - > lock , flags ) ;
printk ( " = %lukB \n " , K ( total ) ) ;
}
show_swap_cache_info ( ) ;
}
/*
 * Builds allocation fallback zone lists.
 *
 * Appends the populated zones of @pgdat to @zonelist starting at slot @j,
 * beginning with zone type @k and falling through to every lower type
 * (HIGHMEM -> NORMAL -> DMA).  Zones with no present pages are skipped.
 * Returns the next free slot index; the caller terminates the list.
 */
static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
{
	struct zone *zone;

	switch (k) {
	default:
		BUG();
		/* fall through */
	case ZONE_HIGHMEM:
		zone = pgdat->node_zones + ZONE_HIGHMEM;
		if (zone->present_pages) {
#ifndef CONFIG_HIGHMEM
			/* a populated highmem zone without highmem support */
			BUG();
#endif
			zonelist->zones[j++] = zone;
		}
		/* fall through */
	case ZONE_NORMAL:
		zone = pgdat->node_zones + ZONE_NORMAL;
		if (zone->present_pages)
			zonelist->zones[j++] = zone;
		/* fall through */
	case ZONE_DMA:
		zone = pgdat->node_zones + ZONE_DMA;
		if (zone->present_pages)
			zonelist->zones[j++] = zone;
	}

	return j;
}
2005-10-21 11:22:44 +04:00
/*
 * Map a zonelist index (a combination of __GFP_DMA / __GFP_HIGHMEM bits)
 * to the highest zone type it may use.  __GFP_DMA takes precedence over
 * __GFP_HIGHMEM; with neither bit set the answer is ZONE_NORMAL.
 */
static inline int highest_zone(int zone_bits)
{
	if (zone_bits & (__force int)__GFP_DMA)
		return ZONE_DMA;
	if (zone_bits & (__force int)__GFP_HIGHMEM)
		return ZONE_HIGHMEM;
	return ZONE_NORMAL;
}
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_NUMA
#define MAX_NODE_LOAD (num_online_nodes())
static int __initdata node_load[MAX_NUMNODES];
/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list.  The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 *
 * Candidates are visited in cyclic node-id order starting at @node, and only
 * online nodes are considered.  Walking the full id space (rather than
 * taking the id modulo the number of online nodes) keeps the search correct
 * when the online node ids are not contiguous.  On equal scores the first
 * candidate visited wins.
 *
 * The chosen node is recorded in @used_node_mask.
 * It returns -1 if no node is found.
 */
static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
{
	int i, n, val;
	int min_val = INT_MAX;
	int best_node = -1;

	/* Use the local node if we haven't already */
	if (!node_isset(node, *used_node_mask)) {
		node_set(node, *used_node_mask);
		return node;
	}

	for (i = 0; i < MAX_NUMNODES; i++) {
		cpumask_t tmp;

		/* Start from local node */
		n = (node + i) % MAX_NUMNODES;

		/* Offline nodes can never be part of a fallback list */
		if (!node_online(n))
			continue;

		/* Don't want a node to appear more than once */
		if (node_isset(n, *used_node_mask))
			continue;

		/* Use the distance array to find the distance */
		val = node_distance(node, n);

		/* Give preference to headless and unused nodes */
		tmp = node_to_cpumask(n);
		if (!cpus_empty(tmp))
			val += PENALTY_FOR_NODE_WITH_CPUS;

		/* Slight preference for less loaded node */
		val *= (MAX_NODE_LOAD * MAX_NUMNODES);
		val += node_load[n];

		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}

	if (best_node >= 0)
		node_set(best_node, *used_node_mask);

	return best_node;
}
/*
 * NUMA version: build every zonelist of @pgdat.  Nodes are appended in the
 * order chosen by find_next_best_node(), and for each node the zones
 * allowed by the zonelist's type are added by build_zonelists_node().
 */
static void __init build_zonelists(pg_data_t *pgdat)
{
	int type, slot, top_zone, node, local_node;
	int prev_node, load;
	struct zonelist *zonelist;
	nodemask_t used_mask;

	/* initialize zonelists */
	for (type = 0; type < GFP_ZONETYPES; type++)
		pgdat->node_zonelists[type].zones[0] = NULL;

	/* NUMA-aware ordering of nodes */
	local_node = pgdat->node_id;
	load = num_online_nodes();
	prev_node = local_node;
	nodes_clear(used_mask);

	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
		/*
		 * We don't want to pressure a particular node.
		 * So adding penalty to the first node in same
		 * distance group to make it round-robin.
		 */
		if (node_distance(local_node, node) !=
				node_distance(local_node, prev_node))
			node_load[node] += load;
		prev_node = node;
		load--;

		for (type = 0; type < GFP_ZONETYPES; type++) {
			zonelist = &pgdat->node_zonelists[type];

			/* find the current end of the list */
			slot = 0;
			while (zonelist->zones[slot] != NULL)
				slot++;

			top_zone = highest_zone(type);
			slot = build_zonelists_node(NODE_DATA(node), zonelist,
							slot, top_zone);
			zonelist->zones[slot] = NULL;
		}
	}
}
# else /* CONFIG_NUMA */
/*
 * Non-NUMA version: build every zonelist of @pgdat from the local node's
 * zones followed by the zones of all other online nodes.
 */
static void __init build_zonelists(pg_data_t *pgdat)
{
	int type, slot, top_zone, node, local_node;

	local_node = pgdat->node_id;

	for (type = 0; type < GFP_ZONETYPES; type++) {
		struct zonelist *zonelist = &pgdat->node_zonelists[type];

		top_zone = highest_zone(type);

		/* local zones come first */
		slot = build_zonelists_node(pgdat, zonelist, 0, top_zone);

		/*
		 * Now we build the zonelist so that it contains the zones
		 * of all the other nodes.
		 * We don't want to pressure a particular node, so when
		 * building the zones for node N, we make sure that the
		 * zones coming right after the local ones are those from
		 * node N+1 (modulo N)
		 */
		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
			if (node_online(node))
				slot = build_zonelists_node(NODE_DATA(node),
						zonelist, slot, top_zone);
		}
		for (node = 0; node < local_node; node++) {
			if (node_online(node))
				slot = build_zonelists_node(NODE_DATA(node),
						zonelist, slot, top_zone);
		}

		zonelist->zones[slot] = NULL;
	}
}
# endif /* CONFIG_NUMA */
/*
 * Build the zonelists of every online node, report how many nodes were
 * processed, then call cpuset_init_current_mems_allowed().
 */
void __init build_all_zonelists(void)
{
	int nid;

	for_each_online_node(nid) {
		build_zonelists(NODE_DATA(nid));
	}

	printk(" Built %i zonelists \n ", num_online_nodes());
	cpuset_init_current_mems_allowed();
}
/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE	256

/*
 * Return the number of wait queues for a zone of @pages pages: the
 * smallest power of two that is >= pages / PAGES_PER_WAITQUEUE, clamped
 * to the range [4, 4096].
 */
static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long wanted = pages / PAGES_PER_WAITQUEUE;
	unsigned long size;

	for (size = 1; size < wanted; size <<= 1)
		;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	if (size > 4096UL)
		size = 4096UL;
	if (size < 4UL)
		size = 4UL;

	return size;
}
/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 *
 * Implemented as ffz() on the complement of @size.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	unsigned long inverted = ~size;

	return ffz(inverted);
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Record the node's spanned page count (sum of @zones_size) and present
 * page count (spanned minus the holes in @zholes_size, which may be NULL)
 * in @pgdat, and log the present total.
 */
static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long spanned = 0;
	unsigned long present;
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		spanned += zones_size[i];
	pgdat->node_spanned_pages = spanned;

	present = spanned;
	if (zholes_size) {
		for (i = 0; i < MAX_NR_ZONES; i++)
			present -= zholes_size[i];
	}
	pgdat->node_present_pages = present;

	printk(KERN_DEBUG " On node %d totalpages: %lu \n ", pgdat->node_id, present);
}
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem ( ) once the early boot process is
* done . Non - atomic initialization , single - pass .
*/
2005-10-30 04:16:54 +03:00
void __devinit memmap_init_zone ( unsigned long size , int nid , unsigned long zone ,
2005-04-17 02:20:36 +04:00
unsigned long start_pfn )
{
struct page * page ;
2005-06-23 11:08:00 +04:00
unsigned long end_pfn = start_pfn + size ;
unsigned long pfn ;
2005-04-17 02:20:36 +04:00
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:07:54 +04:00
for ( pfn = start_pfn ; pfn < end_pfn ; pfn + + , page + + ) {
if ( ! early_pfn_valid ( pfn ) )
continue ;
[PATCH] sparsemem swiss cheese numa layouts
The part of the sparsemem patch which modifies memmap_init_zone() has recently
become a problem. It changes behavior so that there is a call to
pfn_to_page() for each individual page inside of a node's range:
node_start_pfn through node_end_pfn. It used to simply do this once, at the
beginning of the node, but having sparsemem's non-contiguous mem_map[]s inside
of a node made it necessary to change.
Mike Kravetz recently wrote a patch which made the NUMA code accept some new
kinds of layouts. The system's memory was laid out like this, with node 0's
memory in two pieces: one before and one after node 1's memory:
Node 0: +++++ +++++
Node 1: +++++
Previous behavior before Mike's patch was to assign nodes like this:
Node 0: 00000 XXXXX
Node 1: 11111
Where the 'X' areas were simply thrown away. The new behavior was to make the
pg_data_t span node 0 across all of its areas, including areas that are really
node 1's: Node 0: 000000000000000 Node 1: 11111
This wastes a little bit of mem_map space, but ends up being OK, and more
fully utilizes the system's memory. memmap_init_zone() initializes all of the
"struct page"s for node 0, even for the "hole", but those never get used,
because there is no pfn_to_page() that resolves to those pages. However, only
calling pfn_to_page() once, memmap_init_zone() always uses the pages that were
allocated for node0->node_mem_map because:
struct page *start = pfn_to_page(start_pfn);
// effectively start = &node->node_mem_map[0]
for (page = start; page < (start + size); page++) {
init_page_here();...
page++;
}
Slow, and wasteful, but generally harmless.
But, modify that to call pfn_to_page() for each loop iteration (like sparsemem
does):
for (pfn = start_pfn; pfn < < (start_pfn + size); pfn++++) {
page = pfn_to_page(pfn);
}
And you end up trying to initialize node 1's pages too early, along with bogus
data from node 0. This patch checks for those weird layouts and declines to
touch the pages, making the more frequent pfn_to_page() calls OK to do.
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:07:59 +04:00
if ( ! early_pfn_in_nid ( pfn , nid ) )
continue ;
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:07:54 +04:00
page = pfn_to_page ( pfn ) ;
set_page_links ( page , zone , nid , pfn ) ;
2005-10-30 04:16:12 +03:00
set_page_count ( page , 1 ) ;
2005-04-17 02:20:36 +04:00
reset_page_mapcount ( page ) ;
SetPageReserved ( page ) ;
INIT_LIST_HEAD ( & page - > lru ) ;
# ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if ( ! is_highmem_idx ( zone ) )
2005-06-28 01:36:28 +04:00
set_page_address ( page , __va ( pfn < < PAGE_SHIFT ) ) ;
2005-04-17 02:20:36 +04:00
# endif
}
}
void zone_init_free_lists ( struct pglist_data * pgdat , struct zone * zone ,
unsigned long size )
{
int order ;
for ( order = 0 ; order < MAX_ORDER ; order + + ) {
INIT_LIST_HEAD ( & zone - > free_area [ order ] . free_list ) ;
zone - > free_area [ order ] . nr_free = 0 ;
}
}
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.
A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA. When producing this patch, it became apparent in that NUMA
and DISCONTIG are often confused.
Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous. It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.
Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory. This is what allows the mem_map[]
to be chopped up.
In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags. Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations. However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions. Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.
One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags. It might provide
speed increases on certain platforms and will be stored there if there is
room. But, if out of room, an alternate (theoretically slower) mechanism is
used.
This patch introduces CONFIG_FLATMEM. It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-23 11:07:54 +04:00
/*
 * Index into zone_table[]: @x (a node id or a section number, see
 * zonetable_add()) in the high bits, the zone number in the low
 * ZONES_SHIFT bits.  Arguments are parenthesised so that expressions can
 * be passed safely.
 */
#define ZONETABLE_INDEX(x, zone_nr)	(((x) << ZONES_SHIFT) | (zone_nr))

/*
 * Register @zone in zone_table[].
 *
 * When FLAGS_HAS_NODE is set a single entry keyed by (@nid, @zid) is
 * written.  Otherwise one entry is written per memory section, keyed by
 * (section number, @zid), for every section from the one containing @pfn
 * through the one containing @pfn + @size.
 *
 * NOTE(review): the section loop is inclusive of the section holding
 * pfn + size, i.e. one pfn past the range - confirm this is intended.
 */
void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
		unsigned long size)
{
	unsigned long snum = pfn_to_section_nr(pfn);
	unsigned long end = pfn_to_section_nr(pfn + size);

	if (FLAGS_HAS_NODE)
		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
	else
		for (; snum <= end; snum++)
			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
}
2005-04-17 02:20:36 +04:00
#ifndef __HAVE_ARCH_MEMMAP_INIT
/*
 * Default memmap_init(): forwards its four arguments to memmap_init_zone()
 * unless __HAVE_ARCH_MEMMAP_INIT is defined.
 */
#define memmap_init(size, nid, zone, start_pfn) \
	memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
2005-06-22 04:14:47 +04:00
/*
 * Pick the per-cpu-pages batch size for @zone.
 */
static int __devinit zone_batchsize(struct zone *zone)
{
	int batch;
	int order;

	/*
	 * The per-cpu-pages pools are set to around 1000th of the
	 * size of the zone.  But no more than 1/2 of a meg.
	 *
	 * OK, so we don't know how big the cache is.  So guess.
	 */
	batch = zone->present_pages / 1024;
	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;

	/* We effectively *= 4 below */
	batch /= 4;
	if (batch < 1)
		batch = 1;

	/*
	 * We will be trying to allocate bigger chunks of contiguous
	 * memory of the order of fls(batch).  This should result in
	 * better cache coloring.
	 *
	 * A sanity check also to ensure that batch is still in limits.
	 *
	 * NOTE(review): the fallback below assigns a value built from
	 * PAGE_SHIFT and MAX_ORDER (shift counts) directly as a page
	 * count - confirm that this is what was intended.
	 */
	order = fls(batch + batch / 2);
	batch = (1 << order);
	if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
		batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT) / 2);

	return batch;
}
2005-06-22 04:15:00 +04:00
inline void setup_pageset ( struct per_cpu_pageset * p , unsigned long batch )
{
struct per_cpu_pages * pcp ;
2005-10-26 12:58:59 +04:00
memset ( p , 0 , sizeof ( * p ) ) ;
2005-06-22 04:15:00 +04:00
pcp = & p - > pcp [ 0 ] ; /* hot */
pcp - > count = 0 ;
2005-10-30 04:15:48 +03:00
pcp - > low = 0 ;
2005-06-22 04:15:00 +04:00
pcp - > high = 6 * batch ;
pcp - > batch = max ( 1UL , 1 * batch ) ;
INIT_LIST_HEAD ( & pcp - > list ) ;
pcp = & p - > pcp [ 1 ] ; /* cold*/
pcp - > count = 0 ;
pcp - > low = 0 ;
pcp - > high = 2 * batch ;
2005-10-30 04:15:48 +03:00
pcp - > batch = max ( 1UL , batch / 2 ) ;
2005-06-22 04:15:00 +04:00
INIT_LIST_HEAD ( & pcp - > list ) ;
}
2005-06-22 04:14:47 +04:00
# ifdef CONFIG_NUMA
/*
2005-06-22 04:15:00 +04:00
* Boot pageset table . One per cpu which is going to be used for all
* zones and all nodes . The parameters will be set in such a way
* that an item put on a list will immediately be handed over to
* the buddy list . This is safe since pageset manipulation is done
* with interrupts disabled .
*
* Some NUMA counter updates may also be caught by the boot pagesets .
2005-06-23 07:26:07 +04:00
*
* The boot_pagesets must be kept even after bootup is complete for
* unused processors and / or zones . They do play a role for bootstrapping
* hotplugged processors .
*
* zoneinfo_show ( ) and maybe other functions do
* not check if the processor is online before following the pageset pointer .
* Other parts of the kernel may not check if the zone is available .
2005-06-22 04:15:00 +04:00
*/
static struct per_cpu_pageset
2005-06-23 07:26:07 +04:00
boot_pageset [ NR_CPUS ] ;
2005-06-22 04:15:00 +04:00
/*
 * Dynamically allocate memory for the
 * per cpu pageset array in struct zone.
 *
 * For each zone a pageset is allocated on @cpu's node, stored in
 * zone->pageset[@cpu] and initialised with the zone's batch size.
 * Returns 0 on success.  On allocation failure the pagesets already
 * allocated for earlier zones are freed, their slots set to NULL, and
 * -ENOMEM is returned.
 */
static int __devinit process_zones(int cpu)
{
	struct zone *zone, *dzone;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset;

		pset = kmalloc_node(sizeof(struct per_cpu_pageset),
					GFP_KERNEL, cpu_to_node(cpu));
		zone->pageset[cpu] = pset;
		if (pset == NULL)
			goto bad;

		setup_pageset(pset, zone_batchsize(zone));
	}

	return 0;

bad:
	/* Undo the allocations made for the zones before the failing one. */
	for_each_zone(dzone) {
		if (dzone == zone)
			break;
		kfree(dzone->pageset[cpu]);
		dzone->pageset[cpu] = NULL;
	}
	return -ENOMEM;
}
static inline void free_zone_pagesets ( int cpu )
{
# ifdef CONFIG_NUMA
struct zone * zone ;
for_each_zone ( zone ) {
struct per_cpu_pageset * pset = zone_pcp ( zone , cpu ) ;
zone_pcp ( zone , cpu ) = NULL ;
kfree ( pset ) ;
}
# endif
}
/*
 * CPU notifier callback for the per-cpu pagesets.
 *
 * CPU_UP_PREPARE: allocate the pagesets of the CPU (@hcpu); NOTIFY_BAD is
 *                 returned if that fails.
 * CPU_DEAD:       (CONFIG_HOTPLUG_CPU only) free the CPU's pagesets.
 * Every other case returns NOTIFY_OK without doing anything.
 */
static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	int cpu = (long)hcpu;

	if (action == CPU_UP_PREPARE)
		return process_zones(cpu) ? NOTIFY_BAD : NOTIFY_OK;

#ifdef CONFIG_HOTPLUG_CPU
	if (action == CPU_DEAD)
		free_zone_pagesets(cpu);
#endif

	return NOTIFY_OK;
}
static struct notifier_block pageset_notifier =
{ & pageset_cpuup_callback , NULL , 0 } ;
/*
 * Allocate the per-cpu pagesets of the CPU this runs on and register the
 * notifier that does the same for every CPU that is brought up later.
 * Failure to allocate here is fatal (BUG).
 */
void __init setup_per_cpu_pageset(void)
{
	int err;

	/* Initialize per_cpu_pageset for cpu 0.
	 * A cpuup callback will do this for every cpu
	 * as it comes online
	 */
	err = process_zones(smp_processor_id());
	BUG_ON(err);
	register_cpu_notifier(&pageset_notifier);
}
# endif
2005-10-30 04:16:50 +03:00
static __devinit
void zone_wait_table_init ( struct zone * zone , unsigned long zone_size_pages )
{
int i ;
struct pglist_data * pgdat = zone - > zone_pgdat ;
/*
* The per - page waitqueue mechanism uses hashed waitqueues
* per zone .
*/
zone - > wait_table_size = wait_table_size ( zone_size_pages ) ;
zone - > wait_table_bits = wait_table_bits ( zone - > wait_table_size ) ;
zone - > wait_table = ( wait_queue_head_t * )
alloc_bootmem_node ( pgdat , zone - > wait_table_size
* sizeof ( wait_queue_head_t ) ) ;
for ( i = 0 ; i < zone - > wait_table_size ; + + i )
init_waitqueue_head ( zone - > wait_table + i ) ;
}
/*
 * Initialise the per-cpu pagesets of @zone for all NR_CPUS slots and log
 * the zone's present page count together with its batch size.
 *
 * On CONFIG_NUMA every slot is pointed at the static boot_pageset[] entry
 * of that CPU, set up with a batch of 0.  Otherwise the zone's own pageset
 * is set up with the batch size computed for the zone.
 */
static __devinit void zone_pcp_init(struct zone *zone)
{
	unsigned long batch = zone_batchsize(zone);
	int cpu = 0;

	while (cpu < NR_CPUS) {
#ifdef CONFIG_NUMA
		/* Early boot. Slab allocator not functional yet */
		struct per_cpu_pageset *boot = &boot_pageset[cpu];

		zone->pageset[cpu] = boot;
		setup_pageset(boot, 0);
#else
		setup_pageset(zone_pcp(zone, cpu), batch);
#endif
		cpu++;
	}

	printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu \n ",
			zone->name, zone->present_pages, batch);
}
static __devinit void init_currently_empty_zone ( struct zone * zone ,
unsigned long zone_start_pfn , unsigned long size )
{
struct pglist_data * pgdat = zone - > zone_pgdat ;
zone_wait_table_init ( zone , size ) ;
pgdat - > nr_zones = zone_idx ( zone ) + 1 ;
zone - > zone_mem_map = pfn_to_page ( zone_start_pfn ) ;
zone - > zone_start_pfn = zone_start_pfn ;
memmap_init ( size , pgdat - > node_id , zone_idx ( zone ) , zone_start_pfn ) ;
zone_init_free_lists ( pgdat , zone , zone - > spanned_pages ) ;
}
2005-04-17 02:20:36 +04:00
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 *
 * @pgdat:       node whose zones are initialised
 * @zones_size:  spanned size of every zone, in pages, indexed by zone
 * @zholes_size: optional per-zone page counts that are subtracted from the
 *               spanned size to obtain the present size; may be NULL
 *
 * The global nr_kernel_pages / nr_all_pages counters are accumulated here.
 * Zones with a zero span still get their bookkeeping fields initialised,
 * but are not added to the zone table and consume no pfn range.
 */
static void __init free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long zi;
	int nid = pgdat->node_id;
	unsigned long next_pfn = pgdat->node_start_pfn;

	pgdat_resize_init(pgdat);
	pgdat->nr_zones = 0;
	init_waitqueue_head(&pgdat->kswapd_wait);
	pgdat->kswapd_max_order = 0;

	for (zi = 0; zi < MAX_NR_ZONES; zi++) {
		struct zone *zone = &pgdat->node_zones[zi];
		unsigned long spanned = zones_size[zi];
		unsigned long present = spanned;

		if (zholes_size)
			present -= zholes_size[zi];

		/* Only the DMA and NORMAL zones count as kernel pages. */
		if (zi == ZONE_DMA || zi == ZONE_NORMAL)
			nr_kernel_pages += present;
		nr_all_pages += present;

		zone->spanned_pages = spanned;
		zone->present_pages = present;
		zone->name = zone_names[zi];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone_seqlock_init(zone);
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;

		zone->prev_priority = DEF_PRIORITY;
		zone->temp_priority = DEF_PRIORITY;

		zone_pcp_init(zone);

		INIT_LIST_HEAD(&zone->active_list);
		INIT_LIST_HEAD(&zone->inactive_list);
		zone->nr_scan_active = 0;
		zone->nr_scan_inactive = 0;
		zone->nr_active = 0;
		zone->nr_inactive = 0;
		atomic_set(&zone->reclaim_in_progress, 0);

		if (spanned) {
			zonetable_add(zone, nid, zi, next_pfn, spanned);
			init_currently_empty_zone(zone, next_pfn, spanned);
			next_pfn += spanned;
		}
	}
}
/*
 * Make sure @pgdat has a node_mem_map when the configuration uses a flat
 * per-node mem_map (CONFIG_FLAT_NODE_MEM_MAP).
 *
 * Nodes that span no pages are skipped.  A map that is already present is
 * kept (ia64 gets its own node_mem_map, before this, without bootmem).
 * Otherwise room for node_spanned_pages + 1 struct page entries is obtained
 * from alloc_remap(), falling back to the node's boot memory.
 *
 * On CONFIG_FLATMEM the global mem_map is additionally set from node 0.
 */
static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
	/* Skip empty nodes */
	if (pgdat->node_spanned_pages == 0)
		return;

#ifdef CONFIG_FLAT_NODE_MEM_MAP
	if (pgdat->node_mem_map == NULL) {
		unsigned long bytes;
		struct page *map;

		bytes = (pgdat->node_spanned_pages + 1) * sizeof(struct page);

		map = alloc_remap(pgdat->node_id, bytes);
		if (map == NULL)
			map = alloc_bootmem_node(pgdat, bytes);

		pgdat->node_mem_map = map;
	}
#ifdef CONFIG_FLATMEM
	/*
	 * With no DISCONTIG, the global mem_map is just set as node 0's
	 */
	if (pgdat == NODE_DATA(0))
		mem_map = NODE_DATA(0)->node_mem_map;
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
/*
 * Initialise the memory bookkeeping of one node.
 *
 * @nid:            id stored in the node
 * @pgdat:          node descriptor to fill in
 * @zones_size:     per-zone spanned sizes, in pages
 * @node_start_pfn: first page frame number of the node
 * @zholes_size:    optional per-zone hole sizes, passed through to the helpers
 *
 * Records the node id and start pfn, then runs the three setup stages in
 * order: page totals, node mem_map allocation, zone initialisation.
 */
void __init free_area_init_node(int nid, struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long node_start_pfn,
		unsigned long *zholes_size)
{
	pgdat->node_id = nid;
	pgdat->node_start_pfn = node_start_pfn;

	calculate_zone_totalpages(pgdat, zones_size, zholes_size);

	alloc_node_mem_map(pgdat);

	free_area_init_core(pgdat, zones_size, zholes_size);
}
2005-06-23 11:07:47 +04:00
# ifndef CONFIG_NEED_MULTIPLE_NODES
2005-04-17 02:20:36 +04:00
static bootmem_data_t contig_bootmem_data ;
struct pglist_data contig_page_data = { . bdata = & contig_bootmem_data } ;
EXPORT_SYMBOL ( contig_page_data ) ;
2005-06-23 11:07:47 +04:00
# endif
2005-04-17 02:20:36 +04:00
/*
 * Initialise node 0 from @zones_size alone: the node starts at the page
 * frame of the physical address behind PAGE_OFFSET and no hole sizes are
 * supplied.
 */
void __init free_area_init(unsigned long *zones_size)
{
	unsigned long first_pfn = __pa(PAGE_OFFSET) >> PAGE_SHIFT;

	free_area_init_node(0, NODE_DATA(0), zones_size, first_pfn, NULL);
}
# ifdef CONFIG_PROC_FS
# include <linux/seq_file.h>
/*
 * seq_file ->start: return the node at position *@pos in pgdat_list, or
 * NULL when the list has fewer nodes than that.
 */
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *cursor = pgdat_list;
	loff_t remaining = *pos;

	while (cursor && remaining) {
		--remaining;
		cursor = cursor->pgdat_next;
	}

	return cursor;
}
/*
 * seq_file ->next: advance *@pos and return the node that follows @arg in
 * the node list.
 */
static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *current_node = arg;

	++*pos;

	return current_node->pgdat_next;
}
/* seq_file ->stop: the node iteration holds nothing that needs releasing. */
static void frag_stop(struct seq_file *m, void *arg)
{
}
/*
* This walks the free areas for each zone .
*/
static int frag_show ( struct seq_file * m , void * arg )
{
pg_data_t * pgdat = ( pg_data_t * ) arg ;
struct zone * zone ;
struct zone * node_zones = pgdat - > node_zones ;
unsigned long flags ;
int order ;
for ( zone = node_zones ; zone - node_zones < MAX_NR_ZONES ; + + zone ) {
if ( ! zone - > present_pages )
continue ;
spin_lock_irqsave ( & zone - > lock , flags ) ;
seq_printf ( m , " Node %d, zone %8s " , pgdat - > node_id , zone - > name ) ;
for ( order = 0 ; order < MAX_ORDER ; + + order )
seq_printf ( m , " %6lu " , zone - > free_area [ order ] . nr_free ) ;
spin_unlock_irqrestore ( & zone - > lock , flags ) ;
seq_putc ( m , ' \n ' ) ;
}
return 0 ;
}
/* seq_file operations that emit one frag_show() record per node. */
struct seq_operations fragmentation_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= frag_show,
};
2005-06-22 04:14:38 +04:00
/*
* Output information about zones in @ pgdat .
*/
static int zoneinfo_show ( struct seq_file * m , void * arg )
{
pg_data_t * pgdat = arg ;
struct zone * zone ;
struct zone * node_zones = pgdat - > node_zones ;
unsigned long flags ;
for ( zone = node_zones ; zone - node_zones < MAX_NR_ZONES ; zone + + ) {
int i ;
if ( ! zone - > present_pages )
continue ;
spin_lock_irqsave ( & zone - > lock , flags ) ;
seq_printf ( m , " Node %d, zone %8s " , pgdat - > node_id , zone - > name ) ;
seq_printf ( m ,
" \n pages free %lu "
" \n min %lu "
" \n low %lu "
" \n high %lu "
" \n active %lu "
" \n inactive %lu "
" \n scanned %lu (a: %lu i: %lu) "
" \n spanned %lu "
" \n present %lu " ,
zone - > free_pages ,
zone - > pages_min ,
zone - > pages_low ,
zone - > pages_high ,
zone - > nr_active ,
zone - > nr_inactive ,
zone - > pages_scanned ,
zone - > nr_scan_active , zone - > nr_scan_inactive ,
zone - > spanned_pages ,
zone - > present_pages ) ;
seq_printf ( m ,
" \n protection: (%lu " ,
zone - > lowmem_reserve [ 0 ] ) ;
for ( i = 1 ; i < ARRAY_SIZE ( zone - > lowmem_reserve ) ; i + + )
seq_printf ( m , " , %lu " , zone - > lowmem_reserve [ i ] ) ;
seq_printf ( m ,
" ) "
" \n pagesets " ) ;
for ( i = 0 ; i < ARRAY_SIZE ( zone - > pageset ) ; i + + ) {
struct per_cpu_pageset * pageset ;
int j ;
2005-06-22 04:14:47 +04:00
pageset = zone_pcp ( zone , i ) ;
2005-06-22 04:14:38 +04:00
for ( j = 0 ; j < ARRAY_SIZE ( pageset - > pcp ) ; j + + ) {
if ( pageset - > pcp [ j ] . count )
break ;
}
if ( j = = ARRAY_SIZE ( pageset - > pcp ) )
continue ;
for ( j = 0 ; j < ARRAY_SIZE ( pageset - > pcp ) ; j + + ) {
seq_printf ( m ,
" \n cpu: %i pcp: %i "
" \n count: %i "
" \n low: %i "
" \n high: %i "
" \n batch: %i " ,
i , j ,
pageset - > pcp [ j ] . count ,
pageset - > pcp [ j ] . low ,
pageset - > pcp [ j ] . high ,
pageset - > pcp [ j ] . batch ) ;
}
# ifdef CONFIG_NUMA
seq_printf ( m ,
" \n numa_hit: %lu "
" \n numa_miss: %lu "
" \n numa_foreign: %lu "
" \n interleave_hit: %lu "
" \n local_node: %lu "
" \n other_node: %lu " ,
pageset - > numa_hit ,
pageset - > numa_miss ,
pageset - > numa_foreign ,
pageset - > interleave_hit ,
pageset - > local_node ,
pageset - > other_node ) ;
# endif
}
seq_printf ( m ,
" \n all_unreclaimable: %u "
" \n prev_priority: %i "
" \n temp_priority: %i "
" \n start_pfn: %lu " ,
zone - > all_unreclaimable ,
zone - > prev_priority ,
zone - > temp_priority ,
zone - > zone_start_pfn ) ;
spin_unlock_irqrestore ( & zone - > lock , flags ) ;
seq_putc ( m , ' \n ' ) ;
}
return 0 ;
}
/*
 * seq_file operations for the zone information output.  The iteration over
 * the nodes is shared with fragmentation_op; only ->show differs.
 */
struct seq_operations zoneinfo_op = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= zoneinfo_show,
};
2005-04-17 02:20:36 +04:00
/*
 * Labels for the vmstat output.  vmstat_show() prints entry N next to the
 * N-th unsigned long of the struct page_state snapshot, so the order here
 * has to follow the order of the words in that structure.
 */
static char *vmstat_text[] = {
	/* page counts */
	" nr_dirty ",
	" nr_writeback ",
	" nr_unstable ",
	" nr_page_table_pages ",
	" nr_mapped ",
	" nr_slab ",

	/* paging and swap traffic */
	" pgpgin ",
	" pgpgout ",
	" pswpin ",
	" pswpout ",

	/* allocation, freeing and faults */
	" pgalloc_high ",
	" pgalloc_normal ",
	" pgalloc_dma ",
	" pgfree ",
	" pgactivate ",
	" pgdeactivate ",
	" pgfault ",
	" pgmajfault ",

	/* reclaim, per zone type */
	" pgrefill_high ",
	" pgrefill_normal ",
	" pgrefill_dma ",
	" pgsteal_high ",
	" pgsteal_normal ",
	" pgsteal_dma ",
	" pgscan_kswapd_high ",
	" pgscan_kswapd_normal ",
	" pgscan_kswapd_dma ",
	" pgscan_direct_high ",
	" pgscan_direct_normal ",
	" pgscan_direct_dma ",

	/* remaining counters */
	" pginodesteal ",
	" slabs_scanned ",
	" kswapd_steal ",
	" kswapd_inodesteal ",
	" pageoutrun ",
	" allocstall ",
	" pgrotated ",
	" nr_bounce ",
};
/*
 * seq_file ->start for the vmstat output.
 *
 * Returns NULL once *@pos is past the last label.  Otherwise a snapshot of
 * the page state is allocated, stored in m->private (also when the
 * allocation failed, in which case ERR_PTR(-ENOMEM) is returned), filled
 * in, and a pointer to the word at index *@pos is returned.
 */
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	struct page_state *snap;
	unsigned long *words;

	if (*pos >= ARRAY_SIZE(vmstat_text))
		return NULL;

	snap = kmalloc(sizeof(*snap), GFP_KERNEL);
	m->private = snap;
	if (snap == NULL)
		return ERR_PTR(-ENOMEM);

	get_full_page_state(snap);
	/* sectors -> kbytes */
	snap->pgpgin /= 2;
	snap->pgpgout /= 2;

	words = (unsigned long *)snap;
	return words + *pos;
}
/*
 * seq_file ->next: advance *@pos and return the matching word of the
 * snapshot kept in m->private, or NULL after the last label.
 */
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	unsigned long *words = m->private;

	++*pos;
	if (*pos < ARRAY_SIZE(vmstat_text))
		return words + *pos;

	return NULL;
}
static int vmstat_show ( struct seq_file * m , void * arg )
{
unsigned long * l = arg ;
unsigned long off = l - ( unsigned long * ) m - > private ;
seq_printf ( m , " %s %lu \n " , vmstat_text [ off ] , * l ) ;
return 0 ;
}
static void vmstat_stop ( struct seq_file * m , void * arg )
{
kfree ( m - > private ) ;
m - > private = NULL ;
}
/* seq_file operations that emit one line per vmstat_text[] label. */
struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
# endif /* CONFIG_PROC_FS */
# ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU notifier callback of the page allocator.
 *
 * Only CPU_DEAD is acted upon.  The dead CPU's local pagecache count is
 * folded into the global counter, its pages are drained, and its
 * page_states words are added to those of the CPU running the callback and
 * then zeroed.  Always returns NOTIFY_OK.
 */
static int page_alloc_cpu_notify(struct notifier_block *self,
		unsigned long action, void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	long *local_count;
	unsigned long *from, *to;
	int i;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	/* Drain local pagecache count. */
	local_count = &per_cpu(nr_pagecache_local, cpu);
	atomic_add(*local_count, &nr_pagecache);
	*local_count = 0;

	local_irq_disable();
	__drain_pages(cpu);

	/* Add dead cpu's page_states to our own. */
	to = (unsigned long *)&__get_cpu_var(page_states);
	from = (unsigned long *)&per_cpu(page_states, cpu);
	for (i = 0; i < sizeof(struct page_state) / sizeof(unsigned long); i++) {
		to[i] += from[i];
		from[i] = 0;
	}
	local_irq_enable();

	return NOTIFY_OK;
}
# endif /* CONFIG_HOTPLUG_CPU */
/* Hook page_alloc_cpu_notify() up as a hot-plug CPU notifier, priority 0. */
void __init page_alloc_init(void)
{
	hotcpu_notifier(page_alloc_cpu_notify, 0);
}
/*
 * setup_per_zone_lowmem_reserve - called whenever
 *	sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
 *	has a correct pages reserved value, so an adequate number of
 *	pages are left in the zone after a successful __alloc_pages().
 *
 * For every zone j of every node, lowmem_reserve[j] of the zone itself is
 * cleared.  Each lower zone then reserves, against allocations aimed at
 * zone j, the present pages of all zones above it up to and including j,
 * divided by the lower zone's ratio.  Ratios below 1 are raised to 1 in
 * the sysctl array itself.
 */
static void setup_per_zone_lowmem_reserve(void)
{
	struct pglist_data *pgdat;
	int upper_idx, lower_idx;

	for_each_pgdat(pgdat) {
		for (upper_idx = 0; upper_idx < MAX_NR_ZONES; upper_idx++) {
			struct zone *upper = &pgdat->node_zones[upper_idx];
			unsigned long pages_above = upper->present_pages;

			upper->lowmem_reserve[upper_idx] = 0;

			lower_idx = upper_idx;
			while (--lower_idx >= 0) {
				struct zone *lower = &pgdat->node_zones[lower_idx];

				if (sysctl_lowmem_reserve_ratio[lower_idx] < 1)
					sysctl_lowmem_reserve_ratio[lower_idx] = 1;

				lower->lowmem_reserve[upper_idx] = pages_above /
					sysctl_lowmem_reserve_ratio[lower_idx];
				pages_above += lower->present_pages;
			}
		}
	}
}
/*
* setup_per_zone_pages_min - called when min_free_kbytes changes . Ensures
* that the pages_ { min , low , high } values for each zone are set correctly
* with respect to min_free_kbytes .
*/
2005-10-30 04:16:54 +03:00
void setup_per_zone_pages_min ( void )
2005-04-17 02:20:36 +04:00
{
unsigned long pages_min = min_free_kbytes > > ( PAGE_SHIFT - 10 ) ;
unsigned long lowmem_pages = 0 ;
struct zone * zone ;
unsigned long flags ;
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone ( zone ) {
if ( ! is_highmem ( zone ) )
lowmem_pages + = zone - > present_pages ;
}
for_each_zone ( zone ) {
spin_lock_irqsave ( & zone - > lru_lock , flags ) ;
if ( is_highmem ( zone ) ) {
/*
* Often , highmem doesn ' t need to reserve any pages .
* But the pages_min / low / high values are also used for
* batching up page reclaim activity so we need a
* decent value here .
*/
int min_pages ;
min_pages = zone - > present_pages / 1024 ;
if ( min_pages < SWAP_CLUSTER_MAX )
min_pages = SWAP_CLUSTER_MAX ;
if ( min_pages > 128 )
min_pages = 128 ;
zone - > pages_min = min_pages ;
} else {
2005-06-22 04:14:38 +04:00
/* if it's a lowmem zone, reserve a number of pages
2005-04-17 02:20:36 +04:00
* proportionate to the zone ' s size .
*/
2005-06-22 04:14:38 +04:00
zone - > pages_min = ( pages_min * zone - > present_pages ) /
2005-04-17 02:20:36 +04:00
lowmem_pages ;
}
/*
* When interpreting these watermarks , just keep in mind that :
* zone - > pages_min = = ( zone - > pages_min * 4 ) / 4 ;
*/
zone - > pages_low = ( zone - > pages_min * 5 ) / 4 ;
zone - > pages_high = ( zone - > pages_min * 6 ) / 4 ;
spin_unlock_irqrestore ( & zone - > lru_lock , flags ) ;
}
}
/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min).  For large machines
 * we want it large (64MB max).  But it is not linear, because network
 * bandwidth does not increase linearly with machine size.  We use
 *
 *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 *	16MB:	512k
 *	32MB:	724k
 *	64MB:	1024k
 *	128MB:	1448k
 *	256MB:	2048k
 *	512MB:	2896k
 *	1024MB:	4096k
 *	2048MB:	5792k
 *	4096MB:	8192k
 *	8192MB:	11584k
 *	16384MB:	16384k
 *
 * Once the value is chosen, the per-zone watermarks and the lowmem
 * reserves are computed from it.  Always returns 0.
 */
static int __init init_per_zone_pages_min(void)
{
	unsigned long lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);

	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

	/* Clamp to the [128, 65536] kbyte window described above. */
	if (min_free_kbytes < 128)
		min_free_kbytes = 128;
	else if (min_free_kbytes > 65536)
		min_free_kbytes = 65536;

	setup_per_zone_pages_min();
	setup_per_zone_lowmem_reserve();

	return 0;
}
module_init(init_per_zone_pages_min)
/*
 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
 *	that the per-zone watermarks can be recomputed whenever
 *	min_free_kbytes changes.
 *
 * Returns the error reported by proc_dointvec() if the access failed.
 * The watermarks are recomputed only after a successful write, since a
 * read or a failed write leaves min_free_kbytes unchanged.  Returns 0 on
 * success.
 */
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec(table, write, file, buffer, length, ppos);
	if (rc)
		return rc;

	if (write)
		setup_per_zone_pages_min();

	return 0;
}
/*
 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
 *	proc_dointvec_minmax() so that setup_per_zone_lowmem_reserve() can be
 *	called whenever sysctl_lowmem_reserve_ratio changes.
 *
 * The reserve ratio has no relation to the pages_min watermarks; it only
 * makes sense as a function of the boot time zone sizes.
 *
 * Returns the error reported by proc_dointvec_minmax() if the access
 * failed.  The reserves are recomputed only after a successful write,
 * since a read or a failed write leaves the ratios unchanged.  Returns 0
 * on success.
 */
int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
	if (rc)
		return rc;

	if (write)
		setup_per_zone_lowmem_reserve();

	return 0;
}
/*
 * When non-zero, alloc_large_system_hash() obtains tables that are not
 * allocated early through __vmalloc() instead of __get_free_pages().
 */
__initdata int hashdist = HASHDIST_DEFAULT;

#ifdef CONFIG_NUMA
/*
 * Parse the "hashdist=" boot parameter into hashdist.
 * Returns 1 when a value was taken, 0 when no string was supplied.
 */
static int __init set_hashdist(char *str)
{
	char *rest;

	if (str == NULL)
		return 0;

	hashdist = simple_strtoul(str, &rest, 0);
	return 1;
}
__setup(" hashdist= ", set_hashdist);
#endif
/*
 * allocate a large system hash table from bootmem
 * - it is assumed that the hash table must contain an exact power-of-2
 *   quantity of entries
 * - limit is the number of hash buckets, not the total allocation size
 *
 * @tablename:   name used in the log line and in the panic message
 * @bucketsize:  size of one bucket in bytes
 * @numentries:  requested number of buckets; 0 derives it from memory size
 * @scale:       one bucket per 2^scale bytes when the count is derived
 * @flags:       HASH_HIGHMEM sizes from all pages instead of kernel pages,
 *               HASH_EARLY allocates from bootmem
 * @_hash_shift: if not NULL, receives log2 of the final bucket count
 * @_hash_mask:  if not NULL, receives the final bucket count minus one
 * @limit:       maximum number of buckets; 0 selects 1/16 of total memory
 *
 * Returns the table.  Panics if no allocation succeeds, even after the
 * size has been halved repeatedly.
 */
void *__init alloc_large_system_hash(const char *tablename,
		unsigned long bucketsize,
		unsigned long numentries,
		int scale,
		int flags,
		unsigned int *_hash_shift,
		unsigned int *_hash_mask,
		unsigned long limit)
{
	unsigned long long max = limit;
	unsigned long log2qty, size;
	void *table = NULL;

	/* allow the kernel cmdline to have a say */
	if (numentries == 0) {
		if (flags & HASH_HIGHMEM)
			numentries = nr_all_pages;
		else
			numentries = nr_kernel_pages;

		/* round applicable memory size up to nearest megabyte */
		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
		numentries >>= 20 - PAGE_SHIFT;
		numentries <<= 20 - PAGE_SHIFT;

		/* limit to 1 bucket per 2^scale bytes of low memory */
		if (scale > PAGE_SHIFT)
			numentries >>= (scale - PAGE_SHIFT);
		else
			numentries <<= (PAGE_SHIFT - scale);
	}

	/*
	 * Move up to the next power of 2.
	 * NOTE(review): assuming long_log2() is floor(log2), a count that is
	 * already a power of 2 is doubled here - confirm that is intended.
	 */
	numentries = 1UL << (long_log2(numentries) + 1);

	/* limit allocation size to 1/16 total memory by default */
	if (max == 0) {
		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
		do_div(max, bucketsize);
	}

	if (numentries > max)
		numentries = max;

	log2qty = long_log2(numentries);

	/* Try the wanted size first and halve it for as long as that fails. */
	for (;;) {
		size = bucketsize << log2qty;

		if (flags & HASH_EARLY) {
			table = alloc_bootmem(size);
		} else if (hashdist) {
			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
		} else {
			unsigned long order = 0;

			/* smallest page order that covers size bytes */
			while (((1UL << order) << PAGE_SHIFT) < size)
				order++;
			table = (void *)__get_free_pages(GFP_ATOMIC, order);
		}

		if (table || size <= PAGE_SIZE || --log2qty == 0)
			break;
	}

	if (table == NULL)
		panic(" Failed to allocate %s hash table \n ", tablename);

	printk(" %s hash table entries: %d (order: %d, %lu bytes) \n ",
	       tablename,
	       (1U << log2qty),
	       long_log2(size) - PAGE_SHIFT,
	       size);

	if (_hash_shift)
		*_hash_shift = log2qty;
	if (_hash_mask)
		*_hash_mask = (1 << log2qty) - 1;

	return table;
}