2005-04-16 15:20:36 -07:00
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
# include <linux/module.h>
# include <linux/mm.h>
# include <linux/kernel_stat.h>
# include <linux/swap.h>
# include <linux/init.h>
# include <linux/pagemap.h>
# include <linux/buffer_head.h>
# include <linux/backing-dev.h>
2006-01-06 00:10:55 -08:00
# include <linux/pagevec.h>
2006-03-22 00:09:12 -08:00
# include <linux/migrate.h>
2005-04-16 15:20:36 -07:00
# include <asm/pgtable.h>
/*
* swapper_space is a fiction , retained to simplify the path through
* vmscan ' s shrink_list , to make sync_page look nicer , and to allow
* future use of radix_tree tags in the swap cache .
*/
static struct address_space_operations swap_aops = {
. writepage = swap_writepage ,
. sync_page = block_sync_page ,
. set_page_dirty = __set_page_dirty_nobuffers ,
2006-02-01 03:05:41 -08:00
. migratepage = migrate_page ,
2005-04-16 15:20:36 -07:00
} ;
/*
 * Backing-device description referenced by swapper_space: flagged as
 * excluded from dirty accounting and from writeback, with its own
 * unplug callback.
 */
static struct backing_dev_info swap_backing_dev_info = {
	.unplug_io_fn	= swap_unplug_io_fn,
	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
};
/*
 * The address_space that holds every swap cache page.  Pages are kept
 * in page_tree, indexed by the value of their swap entry, and the tree
 * is protected by tree_lock.
 */
struct address_space swapper_space = {
	.a_ops			= &swap_aops,
	.backing_dev_info	= &swap_backing_dev_info,
	.page_tree		= RADIX_TREE_INIT(GFP_ATOMIC | __GFP_NOWARN),
	.tree_lock		= RW_LOCK_UNLOCKED,
	.i_mmap_nonlinear	= LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
};
/*
 * Event counters for the swap cache, reported by show_swap_cache_info().
 * They are bumped with plain increments through INC_CACHE_INFO().
 */
static struct {
	unsigned long add_total;	/* pages successfully added */
	unsigned long del_total;	/* pages removed from the cache */
	unsigned long find_success;	/* lookups that found a page */
	unsigned long find_total;	/* lookups attempted */
	unsigned long noent_race;	/* add lost: swap_duplicate() failed */
	unsigned long exist_race;	/* add lost: insert returned -EEXIST */
} swap_cache_info;

/* Increment the swap_cache_info counter named by @x. */
#define INC_CACHE_INFO(x)		\
	do {				\
		swap_cache_info.x++;	\
	} while (0)
void show_swap_cache_info ( void )
{
printk ( " Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu \n " ,
swap_cache_info . add_total , swap_cache_info . del_total ,
swap_cache_info . find_success , swap_cache_info . find_total ,
swap_cache_info . noent_race , swap_cache_info . exist_race ) ;
printk ( " Free swap = %lukB \n " , nr_swap_pages < < ( PAGE_SHIFT - 10 ) ) ;
printk ( " Total swap = %lukB \n " , total_swap_pages < < ( PAGE_SHIFT - 10 ) ) ;
}
/*
* __add_to_swap_cache resembles add_to_page_cache on swapper_space ,
* but sets SwapCache flag and private instead of mapping and index .
*/
2005-09-10 00:26:46 -07:00
static int __add_to_swap_cache ( struct page * page , swp_entry_t entry ,
2005-10-07 07:46:04 +01:00
gfp_t gfp_mask )
2005-04-16 15:20:36 -07:00
{
int error ;
BUG_ON ( PageSwapCache ( page ) ) ;
BUG_ON ( PagePrivate ( page ) ) ;
error = radix_tree_preload ( gfp_mask ) ;
if ( ! error ) {
write_lock_irq ( & swapper_space . tree_lock ) ;
error = radix_tree_insert ( & swapper_space . page_tree ,
entry . val , page ) ;
if ( ! error ) {
page_cache_get ( page ) ;
SetPageLocked ( page ) ;
SetPageSwapCache ( page ) ;
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
set_page_private ( page , entry . val ) ;
2005-04-16 15:20:36 -07:00
total_swapcache_pages + + ;
pagecache_acct ( 1 ) ;
}
write_unlock_irq ( & swapper_space . tree_lock ) ;
radix_tree_preload_end ( ) ;
}
return error ;
}
/*
 * Take an extra reference on @entry and insert @page into the swap
 * cache under it, using GFP_KERNEL for the radix-tree preload.
 *
 * Returns 0 on success, -ENOENT if swap_duplicate() refuses the entry,
 * or the error from __add_to_swap_cache() (in which case the extra
 * swap reference is dropped again).
 */
static int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int ret;

	if (!swap_duplicate(entry)) {
		INC_CACHE_INFO(noent_race);
		return -ENOENT;
	}

	ret = __add_to_swap_cache(page, entry, GFP_KERNEL);
	/*
	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
	 */
	if (!ret) {
		INC_CACHE_INFO(add_total);
		return 0;
	}

	/* Insertion failed: give back the reference taken above. */
	swap_free(entry);
	if (ret == -EEXIST)
		INC_CACHE_INFO(exist_race);
	return ret;
}
/*
* This must be called only on pages that have
* been verified to be in the swap cache .
*/
void __delete_from_swap_cache ( struct page * page )
{
BUG_ON ( ! PageLocked ( page ) ) ;
BUG_ON ( ! PageSwapCache ( page ) ) ;
BUG_ON ( PageWriteback ( page ) ) ;
2005-09-03 15:54:43 -07:00
BUG_ON ( PagePrivate ( page ) ) ;
2005-04-16 15:20:36 -07:00
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
radix_tree_delete ( & swapper_space . page_tree , page_private ( page ) ) ;
set_page_private ( page , 0 ) ;
2005-04-16 15:20:36 -07:00
ClearPageSwapCache ( page ) ;
total_swapcache_pages - - ;
pagecache_acct ( - 1 ) ;
INC_CACHE_INFO ( del_total ) ;
}
/**
* add_to_swap - allocate swap space for a page
* @ page : page we want to move to swap
*
* Allocate swap space for the page and add the page to the
* swap cache . Caller needs to hold the page lock .
*/
2006-01-08 01:00:53 -08:00
int add_to_swap ( struct page * page , gfp_t gfp_mask )
2005-04-16 15:20:36 -07:00
{
swp_entry_t entry ;
int err ;
2006-04-01 01:25:12 +02:00
BUG_ON ( ! PageLocked ( page ) ) ;
2005-04-16 15:20:36 -07:00
for ( ; ; ) {
entry = get_swap_page ( ) ;
if ( ! entry . val )
return 0 ;
2005-05-01 08:58:37 -07:00
/*
* Radix - tree node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator . __GFP_NOMEMALLOC
* stops emergency reserves from being allocated .
2005-04-16 15:20:36 -07:00
*
2005-05-01 08:58:37 -07:00
* TODO : this could cause a theoretical memory reclaim
* deadlock in the swap out path .
2005-04-16 15:20:36 -07:00
*/
/*
* Add it to the swap cache and mark it dirty
*/
2005-05-01 08:58:37 -07:00
err = __add_to_swap_cache ( page , entry ,
2006-01-08 01:00:53 -08:00
gfp_mask | __GFP_NOMEMALLOC | __GFP_NOWARN ) ;
2005-04-16 15:20:36 -07:00
switch ( err ) {
case 0 : /* Success */
SetPageUptodate ( page ) ;
SetPageDirty ( page ) ;
INC_CACHE_INFO ( add_total ) ;
return 1 ;
case - EEXIST :
/* Raced with "speculative" read_swap_cache_async */
INC_CACHE_INFO ( exist_race ) ;
swap_free ( entry ) ;
continue ;
default :
/* -ENOMEM radix-tree allocation failure */
swap_free ( entry ) ;
return 0 ;
}
}
}
/*
* This must be called only on pages that have
* been verified to be in the swap cache and locked .
* It will never put the page into the free list ,
* the caller has a reference on the page .
*/
void delete_from_swap_cache ( struct page * page )
{
swp_entry_t entry ;
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
entry . val = page_private ( page ) ;
2005-04-16 15:20:36 -07:00
write_lock_irq ( & swapper_space . tree_lock ) ;
__delete_from_swap_cache ( page ) ;
write_unlock_irq ( & swapper_space . tree_lock ) ;
swap_free ( entry ) ;
page_cache_release ( page ) ;
}
/*
 * Strange swizzling function only for use by shmem_writepage.
 *
 * Inserts @page into the swap cache under @entry with GFP_ATOMIC and,
 * if that works, removes it from the page cache, drops the page cache
 * reference, takes an extra reference on the swap entry and marks the
 * page dirty.
 *
 * Returns 0 on success or the error from __add_to_swap_cache().
 */
int move_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);

	if (err) {
		if (err == -EEXIST)
			INC_CACHE_INFO(exist_race);
		return err;
	}

	remove_from_page_cache(page);
	page_cache_release(page);	/* pagecache ref */
	if (!swap_duplicate(entry))
		BUG();
	SetPageDirty(page);
	INC_CACHE_INFO(add_total);
	return err;
}
/*
 * Strange swizzling function for shmem_getpage (and shmem_unuse).
 *
 * Adds @page to @mapping at @index with GFP_ATOMIC; only when that
 * succeeds is the page taken out of the swap cache and re-dirtied.
 *
 * Returns 0 on success or the error from add_to_page_cache().
 */
int move_from_swap_cache(struct page *page, unsigned long index,
			 struct address_space *mapping)
{
	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);

	if (err)
		return err;

	delete_from_swap_cache(page);
	/* shift page from clean_pages to dirty_pages list */
	ClearPageDirty(page);
	set_page_dirty(page);
	return err;
}
/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It is OK to test PageSwapCache without the page lock here: per the
 * original note (Marcelo), the state is rechecked _with_ the lock held
 * further down the exclusive-swap-page removal path.
 *
 * Nothing is done when the page is not in the swap cache or when its
 * lock is already taken.
 */
static inline void free_swap_cache(struct page *page)
{
	if (!PageSwapCache(page))
		return;

	/* Non-blocking lock attempt: skip the page if it is already locked. */
	if (TestSetPageLocked(page))
		return;

	remove_exclusive_swap_page(page);
	unlock_page(page);
}
/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 *
 * The swap cache is dealt with first; the caller's reference is
 * dropped afterwards.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);

	page_cache_release(page);
}
/*
* Passed an array of pages , drop them all from swapcache and then release
* them . They are removed from the LRU and freed if this is their last use .
*/
void free_pages_and_swap_cache ( struct page * * pages , int nr )
{
struct page * * pagep = pages ;
lru_add_drain ( ) ;
while ( nr ) {
2006-01-06 00:10:55 -08:00
int todo = min ( nr , PAGEVEC_SIZE ) ;
2005-04-16 15:20:36 -07:00
int i ;
for ( i = 0 ; i < todo ; i + + )
free_swap_cache ( pagep [ i ] ) ;
release_pages ( pagep , todo , 0 ) ;
pagep + = todo ;
nr - = todo ;
}
}
/*
 * Lookup a swap entry in the swap cache.  A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 *
 * Every call counts towards find_total; hits also count towards
 * find_success.  Returns the page, or NULL when nothing is cached for
 * @entry.
 */
struct page *lookup_swap_cache(swp_entry_t entry)
{
	struct page *page = find_get_page(&swapper_space, entry.val);

	if (page)
		INC_CACHE_INFO(find_success);
	INC_CACHE_INFO(find_total);

	return page;
}
/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 *
 * @entry: swap entry to bring in
 * @vma:   passed to alloc_page_vma() when a new page is needed
 * @addr:  passed to alloc_page_vma() together with @vma
 *
 * Returns the page already in the swap cache, or a newly allocated
 * page on which the read has been started, or NULL on failure.
 */
struct page *read_swap_cache_async(swp_entry_t entry,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;

	for (;;) {
		int err;

		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(&swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.  It is allocated
		 * once and reused if the loop has to go round again.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Associate the page with swap entry in the swap cache.
		 * May fail (-ENOENT) if swap entry has been freed since
		 * our caller observed it.  May fail (-EEXIST) if there
		 * is already a page associated with this entry in the
		 * swap cache: added by a racing read_swap_cache_async,
		 * or by try_to_swap_out (or shmem_writepage) re-using
		 * the just freed swap entry for an existing page.
		 * May fail (-ENOMEM) if radix-tree node allocation failed.
		 */
		err = add_to_swap_cache(new_page, entry);
		if (!err) {
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_active(new_page);
			swap_readpage(NULL, new_page);
			return new_page;
		}

		/* Only -EEXIST (and similar) is worth another lookup. */
		if (err == -ENOENT || err == -ENOMEM)
			break;
	}

	/* The spare page, if any, was never inserted: drop it. */
	if (new_page)
		page_cache_release(new_page);
	return found_page;
}