2005-04-17 02:20:36 +04:00
/*
 * linux/mm/swap.c
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 */
/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */
# include <linux/mm.h>
# include <linux/sched.h>
# include <linux/kernel_stat.h>
# include <linux/swap.h>
# include <linux/mman.h>
# include <linux/pagemap.h>
# include <linux/pagevec.h>
# include <linux/init.h>
# include <linux/module.h>
# include <linux/mm_inline.h>
# include <linux/buffer_head.h> /* for try_to_release_page() */
# include <linux/module.h>
# include <linux/percpu_counter.h>
# include <linux/percpu.h>
# include <linux/cpu.h>
# include <linux/notifier.h>
# include <linux/init.h>
/*
 * How many pages do we try to swap or page in/out together?
 *
 * swap_setup() initialises this to 2 when num_physpages >> (20 - PAGE_SHIFT)
 * (its "megs" value) is below 16, and to 3 otherwise.
 */
int page_cluster ;
2006-02-07 23:58:52 +03:00
static void put_compound_page ( struct page * page )
2005-04-17 02:20:36 +04:00
{
2006-02-07 23:58:52 +03:00
page = ( struct page * ) page_private ( page ) ;
if ( put_page_testzero ( page ) ) {
void ( * dtor ) ( struct page * page ) ;
2005-04-17 02:20:36 +04:00
[PATCH] compound page: use page[1].lru
If a compound page has its own put_page_testzero destructor (the only current
example is free_huge_page), that is noted in page[1].mapping of the compound
page. But that's rather a poor place to keep it: functions which call
set_page_dirty_lock after get_user_pages (e.g. Infiniband's
__ib_umem_release) ought to be checking first, otherwise set_page_dirty is
liable to crash on what's not the address of a struct address_space.
And now I'm about to make that worse: it turns out that every compound page
needs a destructor, so we can no longer rely on hugetlb pages going their own
special way, to avoid further problems of page->mapping reuse. For example,
not many people know that: on 50% of i386 -Os builds, the first tail page of a
compound page purports to be PageAnon (when its destructor has an odd
address), which surprises page_add_file_rmap.
Keep the compound page destructor in page[1].lru.next instead. And to free up
the common pairing of mapping and index, also move compound page order from
index to lru.prev. Slab reuses page->lru too: but if we ever need slab to use
compound pages, it can easily stack its use above this.
(akpm: decoded version of the above: the tail pages of a compound page now
have ->mapping==NULL, so there's no need for the set_page_dirty[_lock]()
caller to check that they're not compund pages before doing the dirty).
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-02-15 00:52:58 +03:00
dtor = ( void ( * ) ( struct page * ) ) page [ 1 ] . lru . next ;
2006-02-07 23:58:52 +03:00
( * dtor ) ( page ) ;
2005-04-17 02:20:36 +04:00
}
2006-02-07 23:58:52 +03:00
}
/*
 * Drop a reference on @page.
 *
 * Compound pages are handed to put_compound_page().  For any other page the
 * count is decremented with put_page_testzero() and, if it reached zero,
 * the page is passed to __page_cache_release().
 */
void put_page(struct page *page)
{
	if (unlikely(PageCompound(page))) {
		put_compound_page(page);
		return;
	}

	if (put_page_testzero(page))
		__page_cache_release(page);
}
EXPORT_SYMBOL ( put_page ) ;
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim . If it still appears to be reclaimable , move it to the tail of the
* inactive list . The page still has PageWriteback set , which will pin it .
*
* We don ' t expect many pages to come through here , so don ' t bother batching
* things up .
*
* To avoid placing the page at the tail of the LRU while PG_writeback is still
* set , this function will clear PG_writeback before performing the page
* motion . Do that inside the lru lock because once PG_writeback is cleared
* we may not touch the page .
*
* Returns zero if it cleared PG_writeback .
*/
int rotate_reclaimable_page ( struct page * page )
{
struct zone * zone ;
unsigned long flags ;
if ( PageLocked ( page ) )
return 1 ;
if ( PageDirty ( page ) )
return 1 ;
if ( PageActive ( page ) )
return 1 ;
if ( ! PageLRU ( page ) )
return 1 ;
zone = page_zone ( page ) ;
spin_lock_irqsave ( & zone - > lru_lock , flags ) ;
if ( PageLRU ( page ) & & ! PageActive ( page ) ) {
list_del ( & page - > lru ) ;
list_add_tail ( & page - > lru , & zone - > inactive_list ) ;
inc_page_state ( pgrotated ) ;
}
if ( ! test_clear_page_writeback ( page ) )
BUG ( ) ;
spin_unlock_irqrestore ( & zone - > lru_lock , flags ) ;
return 0 ;
}
/*
 * Move @page from the inactive list to the active list of its zone.
 *
 * Nothing is done unless the page is on the LRU and not already active.
 * The move is performed under zone->lru_lock and counted in the
 * pgactivate statistic.
 *
 * FIXME: speed this up?
 */
void fastcall activate_page(struct page *page)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (!PageLRU(page) || PageActive(page))
		goto unlock;

	del_page_from_inactive_list(zone, page);
	SetPageActive(page);
	add_page_to_active_list(zone, page);
	inc_page_state(pgactivate);

unlock:
	spin_unlock_irq(&zone->lru_lock);
}
/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced	->	inactive,referenced
 * inactive,referenced		->	active,unreferenced
 * active,unreferenced		->	active,referenced
 *
 * A page that is already active and referenced is left alone.
 */
void fastcall mark_page_accessed(struct page *page)
{
	/* Second touch of an inactive LRU page: promote it. */
	int promote = !PageActive(page) && PageReferenced(page) &&
		      PageLRU(page);

	if (promote) {
		activate_page(page);
		ClearPageReferenced(page);
	} else if (!PageReferenced(page)) {
		SetPageReferenced(page);
	}
}
EXPORT_SYMBOL ( mark_page_accessed ) ;
/*
 * Per-CPU batches of pages waiting to be put on an LRU list:
 * lru_add_pvecs is flushed by __pagevec_lru_add() (inactive list) and
 * lru_add_active_pvecs by __pagevec_lru_add_active() (active list).
 */
static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };

/**
 * lru_cache_add - add a page to the page lists
 * @page: the page to add
 *
 * Calls page_cache_get() on @page and queues it in this CPU's
 * lru_add_pvecs.  When pagevec_add() returns zero (presumably meaning the
 * pagevec has no room left), the whole batch is flushed with
 * __pagevec_lru_add().
 */
void fastcall lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

	page_cache_get(page);
	if (!pagevec_add(pvec, page))
		__pagevec_lru_add(pvec);
	put_cpu_var(lru_add_pvecs);
}
/*
 * Like lru_cache_add(), but queue @page in this CPU's lru_add_active_pvecs,
 * which is flushed with __pagevec_lru_add_active().
 */
void fastcall lru_cache_add_active(struct page *page)
{
	struct pagevec *batch;

	batch = &get_cpu_var(lru_add_active_pvecs);
	page_cache_get(page);
	if (pagevec_add(batch, page) == 0)
		__pagevec_lru_add_active(batch);
	put_cpu_var(lru_add_active_pvecs);
}
2006-01-06 11:11:14 +03:00
static void __lru_add_drain ( int cpu )
2005-04-17 02:20:36 +04:00
{
2006-01-06 11:11:14 +03:00
struct pagevec * pvec = & per_cpu ( lru_add_pvecs , cpu ) ;
2005-04-17 02:20:36 +04:00
2006-01-06 11:11:14 +03:00
/* CPU is dead, so no locking needed. */
2005-04-17 02:20:36 +04:00
if ( pagevec_count ( pvec ) )
__pagevec_lru_add ( pvec ) ;
2006-01-06 11:11:14 +03:00
pvec = & per_cpu ( lru_add_active_pvecs , cpu ) ;
2005-04-17 02:20:36 +04:00
if ( pagevec_count ( pvec ) )
__pagevec_lru_add_active ( pvec ) ;
2006-01-06 11:11:14 +03:00
}
/*
 * Flush the current CPU's deferred LRU-addition pagevecs.  The CPU number
 * is obtained with get_cpu() and released with put_cpu() once the drain
 * has finished.
 */
void lru_add_drain(void)
{
	int cpu = get_cpu();

	__lru_add_drain(cpu);
	put_cpu();
}
2006-01-19 04:42:27 +03:00
#ifdef CONFIG_NUMA
/* Adapter so that lru_add_drain() can be used as a per-CPU work function. */
static void lru_add_drain_per_cpu(void *dummy)
{
	lru_add_drain();
}

/*
 * Run lru_add_drain() on every CPU via schedule_on_each_cpu().
 *
 * Returns 0 for success; otherwise whatever schedule_on_each_cpu() returned.
 */
int lru_add_drain_all(void)
{
	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
}

#else

/*
 * Without CONFIG_NUMA only the current CPU is drained.
 *
 * Returns 0 for success (always).
 */
int lru_add_drain_all(void)
{
	lru_add_drain();
	return 0;
}
#endif
2005-04-17 02:20:36 +04:00
/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 *
 * Under zone->lru_lock the page is taken off the LRU (if it was on it) and
 * its count is re-checked; the page is handed to free_hot_page() only if
 * the count is still zero at that point.
 */
void fastcall __page_cache_release(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;
	int still_in_use;

	spin_lock_irqsave(&zone->lru_lock, flags);
	if (TestClearPageLRU(page))
		del_page_from_lru(zone, page);
	still_in_use = (page_count(page) != 0);
	spin_unlock_irqrestore(&zone->lru_lock, flags);

	if (!still_in_use)
		free_hot_page(page);
}
EXPORT_SYMBOL ( __page_cache_release ) ;
/*
* Batched page_cache_release ( ) . Decrement the reference count on all the
* passed pages . If it fell to zero then remove the page from the LRU and
* free it .
*
* Avoid taking zone - > lru_lock if possible , but if it is taken , retain it
* for the remainder of the operation .
*
* The locking in this function is against shrink_cache ( ) : we recheck the
* page count inside the lock to see whether shrink_cache grabbed the page
* via the LRU . If it did , give up : shrink_cache will free it .
*/
void release_pages ( struct page * * pages , int nr , int cold )
{
int i ;
struct pagevec pages_to_free ;
struct zone * zone = NULL ;
pagevec_init ( & pages_to_free , cold ) ;
for ( i = 0 ; i < nr ; i + + ) {
struct page * page = pages [ i ] ;
struct zone * pagezone ;
2006-02-07 23:58:52 +03:00
if ( unlikely ( PageCompound ( page ) ) ) {
if ( zone ) {
spin_unlock_irq ( & zone - > lru_lock ) ;
zone = NULL ;
}
put_compound_page ( page ) ;
continue ;
}
2005-10-30 04:16:12 +03:00
if ( ! put_page_testzero ( page ) )
2005-04-17 02:20:36 +04:00
continue ;
pagezone = page_zone ( page ) ;
if ( pagezone ! = zone ) {
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
zone = pagezone ;
spin_lock_irq ( & zone - > lru_lock ) ;
}
if ( TestClearPageLRU ( page ) )
del_page_from_lru ( zone , page ) ;
if ( page_count ( page ) = = 0 ) {
if ( ! pagevec_add ( & pages_to_free , page ) ) {
spin_unlock_irq ( & zone - > lru_lock ) ;
__pagevec_free ( & pages_to_free ) ;
pagevec_reinit ( & pages_to_free ) ;
zone = NULL ; /* No lock is held */
}
}
}
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
pagevec_free ( & pages_to_free ) ;
}
/*
* The pages which we ' re about to release may be in the deferred lru - addition
* queues . That would prevent them from really being freed right now . That ' s
* OK from a correctness point of view but is inefficient - those pages may be
* cache - warm and we want to give them back to the page allocator ASAP .
*
* So __pagevec_release ( ) will drain those queues here . __pagevec_lru_add ( )
* and __pagevec_lru_add_active ( ) call release_pages ( ) directly to avoid
* mutual recursion .
*/
void __pagevec_release ( struct pagevec * pvec )
{
lru_add_drain ( ) ;
release_pages ( pvec - > pages , pagevec_count ( pvec ) , pvec - > cold ) ;
pagevec_reinit ( pvec ) ;
}
2005-11-01 21:22:55 +03:00
EXPORT_SYMBOL ( __pagevec_release ) ;
2005-04-17 02:20:36 +04:00
/*
* pagevec_release ( ) for pages which are known to not be on the LRU
*
* This function reinitialises the caller ' s pagevec .
*/
void __pagevec_release_nonlru ( struct pagevec * pvec )
{
int i ;
struct pagevec pages_to_free ;
pagevec_init ( & pages_to_free , pvec - > cold ) ;
for ( i = 0 ; i < pagevec_count ( pvec ) ; i + + ) {
struct page * page = pvec - > pages [ i ] ;
BUG_ON ( PageLRU ( page ) ) ;
if ( put_page_testzero ( page ) )
pagevec_add ( & pages_to_free , page ) ;
}
pagevec_free ( & pages_to_free ) ;
pagevec_reinit ( pvec ) ;
}
/*
* Add the passed pages to the LRU , then drop the caller ' s refcount
* on them . Reinitialises the caller ' s pagevec .
*/
void __pagevec_lru_add ( struct pagevec * pvec )
{
int i ;
struct zone * zone = NULL ;
for ( i = 0 ; i < pagevec_count ( pvec ) ; i + + ) {
struct page * page = pvec - > pages [ i ] ;
struct zone * pagezone = page_zone ( page ) ;
if ( pagezone ! = zone ) {
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
zone = pagezone ;
spin_lock_irq ( & zone - > lru_lock ) ;
}
if ( TestSetPageLRU ( page ) )
BUG ( ) ;
add_page_to_inactive_list ( zone , page ) ;
}
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
release_pages ( pvec - > pages , pvec - > nr , pvec - > cold ) ;
pagevec_reinit ( pvec ) ;
}
EXPORT_SYMBOL ( __pagevec_lru_add ) ;
void __pagevec_lru_add_active ( struct pagevec * pvec )
{
int i ;
struct zone * zone = NULL ;
for ( i = 0 ; i < pagevec_count ( pvec ) ; i + + ) {
struct page * page = pvec - > pages [ i ] ;
struct zone * pagezone = page_zone ( page ) ;
if ( pagezone ! = zone ) {
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
zone = pagezone ;
spin_lock_irq ( & zone - > lru_lock ) ;
}
if ( TestSetPageLRU ( page ) )
BUG ( ) ;
if ( TestSetPageActive ( page ) )
BUG ( ) ;
add_page_to_active_list ( zone , page ) ;
}
if ( zone )
spin_unlock_irq ( & zone - > lru_lock ) ;
release_pages ( pvec - > pages , pvec - > nr , pvec - > cold ) ;
pagevec_reinit ( pvec ) ;
}
/*
* Try to drop buffers from the pages in a pagevec
*/
void pagevec_strip ( struct pagevec * pvec )
{
int i ;
for ( i = 0 ; i < pagevec_count ( pvec ) ; i + + ) {
struct page * page = pvec - > pages [ i ] ;
if ( PagePrivate ( page ) & & ! TestSetPageLocked ( page ) ) {
try_to_release_page ( page , 0 ) ;
unlock_page ( page ) ;
}
}
}
/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:	Where the resulting pages are placed
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages
 * pages in the mapping.  The pages are placed in @pvec.  pagevec_lookup()
 * takes a reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
			pgoff_t start, unsigned nr_pages)
{
	/* find_get_pages() fills pvec->pages and reports how many it stored. */
	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);

	return pagevec_count(pvec);
}
2006-01-11 12:47:41 +03:00
EXPORT_SYMBOL ( pagevec_lookup ) ;
2005-04-17 02:20:36 +04:00
/*
 * Tagged variant of pagevec_lookup(): fill @pvec with up to @nr_pages pages
 * from @mapping using find_get_pages_tag() with the given @tag.
 *
 * @index is passed by pointer to find_get_pages_tag() (presumably so the
 * search position can be advanced for the next call - confirm against
 * find_get_pages_tag()).
 *
 * Returns the number of pages now in @pvec.
 */
unsigned pagevec_lookup_tag(struct pagevec *pvec,
			    struct address_space *mapping,
			    pgoff_t *index, int tag, unsigned nr_pages)
{
	pvec->nr = find_get_pages_tag(mapping, index, tag, nr_pages,
				      pvec->pages);

	return pagevec_count(pvec);
}
2005-11-01 21:22:55 +03:00
EXPORT_SYMBOL ( pagevec_lookup_tag ) ;
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter
 * between CPUs.
 */
#define ACCT_THRESHOLD	max(16, NR_CPUS * 2)

/* Per-CPU delta not yet folded into vm_committed_space. */
static DEFINE_PER_CPU(long, committed_space) = 0;

/*
 * Account @pages (which may be negative) of committed memory.
 *
 * The amount is accumulated in this CPU's committed_space and is only
 * folded into the global vm_committed_space once its magnitude exceeds
 * ACCT_THRESHOLD.  Preemption is disabled around the per-CPU update.
 */
void vm_acct_memory(long pages)
{
	long *delta;
	int flush;

	preempt_disable();

	delta = &__get_cpu_var(committed_space);
	*delta += pages;

	flush = (*delta > ACCT_THRESHOLD || *delta < -ACCT_THRESHOLD);
	if (flush) {
		atomic_add(*delta, &vm_committed_space);
		*delta = 0;
	}

	preempt_enable();
}
# ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug notifier callback.
 *
 * On CPU_DEAD, drop the CPU's cached committed space back into the central
 * pool (vm_committed_space) and flush that CPU's deferred LRU-addition
 * pagevecs.  Every other action is ignored.
 *
 * @nfb:    notifier block (unused)
 * @action: hotplug event
 * @hcpu:   the CPU number, carried in a pointer-sized value
 *
 * Always returns NOTIFY_OK.
 */
static int cpu_swap_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	long cpu = (long)hcpu;
	long *committed;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	committed = &per_cpu(committed_space, cpu);
	atomic_add(*committed, &vm_committed_space);
	*committed = 0;
	__lru_add_drain(cpu);

	return NOTIFY_OK;
}
# endif /* CONFIG_HOTPLUG_CPU */
# endif /* CONFIG_SMP */
# ifdef CONFIG_SMP
void percpu_counter_mod ( struct percpu_counter * fbc , long amount )
{
long count ;
long * pcount ;
int cpu = get_cpu ( ) ;
pcount = per_cpu_ptr ( fbc - > counters , cpu ) ;
count = * pcount + amount ;
if ( count > = FBC_BATCH | | count < = - FBC_BATCH ) {
spin_lock ( & fbc - > lock ) ;
fbc - > count + = count ;
2006-03-08 08:55:31 +03:00
* pcount = 0 ;
2005-04-17 02:20:36 +04:00
spin_unlock ( & fbc - > lock ) ;
2006-03-08 08:55:31 +03:00
} else {
* pcount = count ;
2005-04-17 02:20:36 +04:00
}
put_cpu ( ) ;
}
EXPORT_SYMBOL ( percpu_counter_mod ) ;
2006-03-08 08:55:31 +03:00
/*
* Add up all the per - cpu counts , return the result . This is a more accurate
* but much slower version of percpu_counter_read_positive ( )
*/
long percpu_counter_sum ( struct percpu_counter * fbc )
{
long ret ;
int cpu ;
spin_lock ( & fbc - > lock ) ;
ret = fbc - > count ;
for_each_cpu ( cpu ) {
long * pcount = per_cpu_ptr ( fbc - > counters , cpu ) ;
ret + = * pcount ;
}
spin_unlock ( & fbc - > lock ) ;
return ret < 0 ? 0 : ret ;
}
EXPORT_SYMBOL ( percpu_counter_sum ) ;
2005-04-17 02:20:36 +04:00
# endif
/*
 * Perform any setup for the swap system: choose page_cluster from the
 * amount of physical memory and register cpu_swap_callback() as a CPU
 * hotplug notifier.
 */
void __init swap_setup(void)
{
	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

	/*
	 * Use a smaller cluster for small-memory machines.
	 *
	 * Right now other parts of the system means that we
	 * _really_ don't want to cluster much more.
	 */
	page_cluster = (megs < 16) ? 2 : 3;

	hotcpu_notifier(cpu_swap_callback, 0);
}