2005-04-16 15:20:36 -07:00
/*
* linux / mm / swap . c
*
* Copyright ( C ) 1991 , 1992 , 1993 , 1994 Linus Torvalds
*/
/*
* This file contains the default values for the operation of the
* Linux VM subsystem . Fine - tuning documentation can be found in
* Documentation / sysctl / vm . txt .
* Started 18.12 .91
* Swap aging added 23.2 .95 , Stephen Tweedie .
* Buffermem limits added 12.3 .98 , Rik van Riel .
*/
# include <linux/mm.h>
# include <linux/sched.h>
# include <linux/kernel_stat.h>
# include <linux/swap.h>
# include <linux/mman.h>
# include <linux/pagemap.h>
# include <linux/pagevec.h>
# include <linux/init.h>
# include <linux/module.h>
# include <linux/mm_inline.h>
# include <linux/buffer_head.h> /* for try_to_release_page() */
# include <linux/module.h>
# include <linux/percpu_counter.h>
# include <linux/percpu.h>
# include <linux/cpu.h>
# include <linux/notifier.h>
# include <linux/init.h>
/*
 * How many pages do we try to swap or page in/out together?
 *
 * Initialised by swap_setup(): 2 when its megabyte estimate of physical
 * memory is below 16, otherwise 3.
 */
int page_cluster ;
void put_page ( struct page * page )
{
if ( unlikely ( PageCompound ( page ) ) ) {
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:40 -07:00
page = ( struct page * ) page_private ( page ) ;
2005-04-16 15:20:36 -07:00
if ( put_page_testzero ( page ) ) {
void ( * dtor ) ( struct page * page ) ;
dtor = ( void ( * ) ( struct page * ) ) page [ 1 ] . mapping ;
( * dtor ) ( page ) ;
}
return ;
}
2005-10-29 18:16:12 -07:00
if ( put_page_testzero ( page ) )
2005-04-16 15:20:36 -07:00
__page_cache_release ( page ) ;
}
EXPORT_SYMBOL ( put_page ) ;
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim . If it still appears to be reclaimable , move it to the tail of the
* inactive list . The page still has PageWriteback set , which will pin it .
*
* We don ' t expect many pages to come through here , so don ' t bother batching
* things up .
*
* To avoid placing the page at the tail of the LRU while PG_writeback is still
* set , this function will clear PG_writeback before performing the page
* motion . Do that inside the lru lock because once PG_writeback is cleared
* we may not touch the page .
*
* Returns zero if it cleared PG_writeback .
*/
int rotate_reclaimable_page ( struct page * page )
{
struct zone * zone ;
unsigned long flags ;
if ( PageLocked ( page ) )
return 1 ;
if ( PageDirty ( page ) )
return 1 ;
if ( PageActive ( page ) )
return 1 ;
if ( ! PageLRU ( page ) )
return 1 ;
zone = page_zone ( page ) ;
spin_lock_irqsave ( & zone - > lru_lock , flags ) ;
if ( PageLRU ( page ) & & ! PageActive ( page ) ) {
list_del ( & page - > lru ) ;
list_add_tail ( & page - > lru , & zone - > inactive_list ) ;
inc_page_state ( pgrotated ) ;
}
if ( ! test_clear_page_writeback ( page ) )
BUG ( ) ;
spin_unlock_irqrestore ( & zone - > lru_lock , flags ) ;
return 0 ;
}
/*
 * Move an inactive LRU page onto its zone's active list and account the
 * activation.  Pages that are not on the LRU, or are already active, are
 * left untouched.
 *
 * FIXME: speed this up?
 */
void fastcall activate_page(struct page *page)
{
	struct zone *lru_zone = page_zone(page);

	spin_lock_irq(&lru_zone->lru_lock);
	if (!PageLRU(page) || PageActive(page))
		goto unlock;

	del_page_from_inactive_list(lru_zone, page);
	SetPageActive(page);
	add_page_to_active_list(lru_zone, page);
	inc_page_state(pgactivate);
unlock:
	spin_unlock_irq(&lru_zone->lru_lock);
}
/*
 * Record that a page has been used.  The state transitions are:
 *
 *   inactive, unreferenced  ->  inactive, referenced
 *   inactive, referenced    ->  active, unreferenced
 *   active, unreferenced    ->  active, referenced
 */
void fastcall mark_page_accessed(struct page *page)
{
	int promote = !PageActive(page) && PageReferenced(page) &&
		      PageLRU(page);

	if (promote) {
		/* Second touch of an inactive LRU page: promote it. */
		activate_page(page);
		ClearPageReferenced(page);
		return;
	}

	if (!PageReferenced(page))
		SetPageReferenced(page);
}
EXPORT_SYMBOL(mark_page_accessed);
/*
 * Per-CPU batches of pages waiting to be added to the inactive and active
 * LRU lists respectively.
 */
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };

/**
 * lru_cache_add - add a page to the page lists
 * @page: the page to add
 *
 * Takes an extra reference on @page and queues it on this CPU's
 * lru_add_pvecs batch.  When pagevec_add() returns zero (presumably meaning
 * the batch has no room left), the whole batch is handed to
 * __pagevec_lru_add(), which puts the pages on the inactive list and drops
 * the references taken here.
 */
void fastcall lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}
/*
 * Same as lru_cache_add(), but the page is queued on this CPU's
 * lru_add_active_pvecs batch and flushed with __pagevec_lru_add_active().
 */
void fastcall lru_cache_add_active(struct page *page)
{
	struct pagevec *active_batch = &get_cpu_var(lru_add_active_pvecs);

	/* The batch holds its own reference until it is flushed. */
	page_cache_get(page);
	if (pagevec_add(active_batch, page) == 0)
		__pagevec_lru_add_active(active_batch);
	put_cpu_var(lru_add_active_pvecs);
}
2006-01-06 00:11:14 -08:00
static void __lru_add_drain ( int cpu )
2005-04-16 15:20:36 -07:00
{
2006-01-06 00:11:14 -08:00
struct pagevec * pvec = & per_cpu ( lru_add_pvecs , cpu ) ;
2005-04-16 15:20:36 -07:00
2006-01-06 00:11:14 -08:00
/* CPU is dead, so no locking needed. */
2005-04-16 15:20:36 -07:00
if ( pagevec_count ( pvec ) )
__pagevec_lru_add ( pvec ) ;
2006-01-06 00:11:14 -08:00
pvec = & per_cpu ( lru_add_active_pvecs , cpu ) ;
2005-04-16 15:20:36 -07:00
if ( pagevec_count ( pvec ) )
__pagevec_lru_add_active ( pvec ) ;
2006-01-06 00:11:14 -08:00
}
/*
 * Flush the current CPU's pending LRU-addition batches.  get_cpu()/put_cpu()
 * bracket the drain so that it runs against one CPU's pagevecs throughout.
 */
void lru_add_drain(void)
{
	__lru_add_drain(get_cpu());
	put_cpu();
}
/*
 * VM activity almost never comes through here - pages are normally freed
 * via pagevecs - but networking does use this path.
 *
 * Takes the page off the LRU if it is on it, then frees it unless its
 * reference count is found to be non-zero under the lru lock.
 */
void fastcall __page_cache_release(struct page *page)
{
	struct zone *lru_zone = page_zone(page);
	unsigned long irq_flags;
	int unreferenced;

	spin_lock_irqsave(&lru_zone->lru_lock, irq_flags);
	if (TestClearPageLRU(page))
		del_page_from_lru(lru_zone, page);
	/* Sample the count while the lru lock is still held. */
	unreferenced = (page_count(page) == 0);
	spin_unlock_irqrestore(&lru_zone->lru_lock, irq_flags);

	if (unreferenced)
		free_hot_page(page);
}
EXPORT_SYMBOL(__page_cache_release);
/*
* Batched page_cache_release ( ) . Decrement the reference count on all the
* passed pages . If it fell to zero then remove the page from the LRU and
* free it .
*
* Avoid taking zone - > lru_lock if possible , but if it is taken , retain it
* for the remainder of the operation .
*
* The locking in this function is against shrink_cache ( ) : we recheck the
* page count inside the lock to see whether shrink_cache grabbed the page
* via the LRU . If it did , give up : shrink_cache will free it .
*/
void release_pages ( struct page * * pages , int nr , int cold )
{
int i ;
struct pagevec pages_to_free ;
struct zone * zone = NULL ;
pagevec_init ( & pages_to_free , cold ) ;
for ( i = 0 ; i < nr ; i + + ) {
struct page * page = pages [ i ] ;
struct zone * pagezone ;
2005-10-29 18:16:12 -07:00
if ( ! put_page_testzero ( page ) )
2005-04-16 15:20:36 -07:00
continue ;
pagezone = page_zone ( page ) ;
if ( pagezone ! = zone ) {
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
zone = pagezone ;
spin_lock_irq ( & zone - > lru_lock ) ;
}
if ( TestClearPageLRU ( page ) )
del_page_from_lru ( zone , page ) ;
if ( page_count ( page ) = = 0 ) {
if ( ! pagevec_add ( & pages_to_free , page ) ) {
spin_unlock_irq ( & zone - > lru_lock ) ;
__pagevec_free ( & pages_to_free ) ;
pagevec_reinit ( & pages_to_free ) ;
zone = NULL ; /* No lock is held */
}
}
}
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
pagevec_free ( & pages_to_free ) ;
}
/*
* The pages which we ' re about to release may be in the deferred lru - addition
* queues . That would prevent them from really being freed right now . That ' s
* OK from a correctness point of view but is inefficient - those pages may be
* cache - warm and we want to give them back to the page allocator ASAP .
*
* So __pagevec_release ( ) will drain those queues here . __pagevec_lru_add ( )
* and __pagevec_lru_add_active ( ) call release_pages ( ) directly to avoid
* mutual recursion .
*/
void __pagevec_release ( struct pagevec * pvec )
{
lru_add_drain ( ) ;
release_pages ( pvec - > pages , pagevec_count ( pvec ) , pvec - > cold ) ;
pagevec_reinit ( pvec ) ;
}
2005-11-01 10:22:55 -08:00
EXPORT_SYMBOL ( __pagevec_release ) ;
2005-04-16 15:20:36 -07:00
/*
* pagevec_release ( ) for pages which are known to not be on the LRU
*
* This function reinitialises the caller ' s pagevec .
*/
void __pagevec_release_nonlru ( struct pagevec * pvec )
{
int i ;
struct pagevec pages_to_free ;
pagevec_init ( & pages_to_free , pvec - > cold ) ;
for ( i = 0 ; i < pagevec_count ( pvec ) ; i + + ) {
struct page * page = pvec - > pages [ i ] ;
BUG_ON ( PageLRU ( page ) ) ;
if ( put_page_testzero ( page ) )
pagevec_add ( & pages_to_free , page ) ;
}
pagevec_free ( & pages_to_free ) ;
pagevec_reinit ( pvec ) ;
}
/*
* Add the passed pages to the LRU , then drop the caller ' s refcount
* on them . Reinitialises the caller ' s pagevec .
*/
void __pagevec_lru_add ( struct pagevec * pvec )
{
int i ;
struct zone * zone = NULL ;
for ( i = 0 ; i < pagevec_count ( pvec ) ; i + + ) {
struct page * page = pvec - > pages [ i ] ;
struct zone * pagezone = page_zone ( page ) ;
if ( pagezone ! = zone ) {
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
zone = pagezone ;
spin_lock_irq ( & zone - > lru_lock ) ;
}
if ( TestSetPageLRU ( page ) )
BUG ( ) ;
add_page_to_inactive_list ( zone , page ) ;
}
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
release_pages ( pvec - > pages , pvec - > nr , pvec - > cold ) ;
pagevec_reinit ( pvec ) ;
}
EXPORT_SYMBOL ( __pagevec_lru_add ) ;
void __pagevec_lru_add_active ( struct pagevec * pvec )
{
int i ;
struct zone * zone = NULL ;
for ( i = 0 ; i < pagevec_count ( pvec ) ; i + + ) {
struct page * page = pvec - > pages [ i ] ;
struct zone * pagezone = page_zone ( page ) ;
if ( pagezone ! = zone ) {
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
zone = pagezone ;
spin_lock_irq ( & zone - > lru_lock ) ;
}
if ( TestSetPageLRU ( page ) )
BUG ( ) ;
if ( TestSetPageActive ( page ) )
BUG ( ) ;
add_page_to_active_list ( zone , page ) ;
}
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
release_pages ( pvec - > pages , pvec - > nr , pvec - > cold ) ;
pagevec_reinit ( pvec ) ;
}
/*
* Try to drop buffers from the pages in a pagevec
*/
void pagevec_strip ( struct pagevec * pvec )
{
int i ;
for ( i = 0 ; i < pagevec_count ( pvec ) ; i + + ) {
struct page * page = pvec - > pages [ i ] ;
if ( PagePrivate ( page ) & & ! TestSetPageLocked ( page ) ) {
try_to_release_page ( page , 0 ) ;
unlock_page ( page ) ;
}
}
}
/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t start, unsigned nr_pages)
{
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup);
/*
 * Tagged counterpart of pagevec_lookup(): fill @pvec with up to @nr_pages
 * pages obtained from find_get_pages_tag() for @mapping and @tag.
 *
 * @index is passed by pointer straight through to find_get_pages_tag()
 * (presumably so the search position can be advanced for the next call -
 * confirm against find_get_pages_tag()).
 *
 * Returns the number of pages now held in @pvec.
 */
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
		pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag,
					nr_pages, pvec->pages);
	return pagevec_count(pvec);
}
EXPORT_SYMBOL(pagevec_lookup_tag);
# ifdef CONFIG_SMP
/*
 * A small amount of inaccuracy is accepted so that the shared counter does
 * not ping-pong between CPUs.
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

/* Per-CPU delta not yet folded into vm_committed_space. */
static DEFINE_PER_CPU(long, committed_space) = 0;

/*
 * Account @pages (which may be negative) of committed memory.  The change is
 * accumulated in this CPU's committed_space and only pushed into the global
 * vm_committed_space once it strays beyond +/- ACCT_THRESHOLD.
 */
void vm_acct_memory(long pages)
{
	long *cpu_delta;

	preempt_disable();
	cpu_delta = &__get_cpu_var(committed_space);
	*cpu_delta += pages;
	if (*cpu_delta < -ACCT_THRESHOLD || *cpu_delta > ACCT_THRESHOLD) {
		atomic_add(*cpu_delta, &vm_committed_space);
		*cpu_delta = 0;
	}
	preempt_enable();
}
# ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug notifier.  On CPU_DEAD, drop the CPU's cached committed space
 * back into the central pool and flush its pending LRU-addition batches.
 * All other actions are ignored.
 *
 * @hcpu carries the CPU number, cast to a pointer.  Always returns
 * NOTIFY_OK.
 */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long *committed;

	committed = &per_cpu(committed_space, (long)hcpu);
	if (action == CPU_DEAD) {
		atomic_add(*committed, &vm_committed_space);
		*committed = 0;
		__lru_add_drain((long)hcpu);
	}
	return NOTIFY_OK;
}
# endif /* CONFIG_HOTPLUG_CPU */
# endif /* CONFIG_SMP */
# ifdef CONFIG_SMP
void percpu_counter_mod ( struct percpu_counter * fbc , long amount )
{
long count ;
long * pcount ;
int cpu = get_cpu ( ) ;
pcount = per_cpu_ptr ( fbc - > counters , cpu ) ;
count = * pcount + amount ;
if ( count > = FBC_BATCH | | count < = - FBC_BATCH ) {
spin_lock ( & fbc - > lock ) ;
fbc - > count + = count ;
spin_unlock ( & fbc - > lock ) ;
count = 0 ;
}
* pcount = count ;
put_cpu ( ) ;
}
EXPORT_SYMBOL ( percpu_counter_mod ) ;
# endif
/*
 * One-time initialisation of the swap system: choose page_cluster from the
 * amount of physical memory and register the CPU hotplug callback.
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

	/*
	 * Small-memory machines get a smaller cluster.  Other parts of the
	 * system currently mean we _really_ don't want to cluster much more
	 * than this.
	 */
	page_cluster = (megs < 16) ? 2 : 3;

	hotcpu_notifier(cpu_swap_callback, 0);
}