/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004
 */
/*
 * Lock ordering in mm:
 *
 * inode->i_mutex (while writing or truncating, not reading or faulting)
 * inode->i_alloc_sem
 *
 * When a page fault occurs in writing from user to file, down_read
 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
 * taken together; in truncation, i_mutex is taken outermost.
 *
 * mm->mmap_sem
 * page->flags PG_locked (lock_page)
 * mapping->i_mmap_lock
 * anon_vma->lock
 * mm->page_table_lock or pte_lock
 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
 * swap_lock (in swap_duplicate, swap_info_get)
 * mmlist_lock (in mmput, drain_mmlist and others)
 * mapping->private_lock (in __set_page_dirty_buffers)
 * inode_lock (in set_page_dirty's __mark_inode_dirty)
 * sb_lock (within inode_lock in fs/fs-writeback.c)
 * mapping->tree_lock (widely used, in set_page_dirty,
 * in arch-dependent flush_dcache_mmap_lock,
 * within inode_lock in __sync_single_inode)
 */
# include <linux/mm.h>
# include <linux/pagemap.h>
# include <linux/swap.h>
# include <linux/swapops.h>
# include <linux/slab.h>
# include <linux/init.h>
# include <linux/rmap.h>
# include <linux/rcupdate.h>
# include <asm/tlbflush.h>
//#define RMAP_DEBUG /* can be enabled only for debugging */
/* Slab cache sized for struct anon_vma; set up by anon_vma_init(). */
kmem_cache_t * anon_vma_cachep ;
/*
 * Debug-only consistency check.  When RMAP_DEBUG is defined, walk the
 * vma list hanging off find_vma's anon_vma and BUG if find_vma is not on
 * it, or if more than 100000 entries are walked.  Without RMAP_DEBUG the
 * body is empty.
 */
static inline void validate_anon_vma(struct vm_area_struct *find_vma)
{
#ifdef RMAP_DEBUG
	struct anon_vma *anon_vma = find_vma->anon_vma;
	struct vm_area_struct *cur;
	unsigned int walked = 0;
	int seen = 0;

	list_for_each_entry(cur, &anon_vma->head, anon_vma_node) {
		walked++;
		BUG_ON(walked > 100000);
		seen |= (cur == find_vma);
	}
	BUG_ON(!seen);
#endif
}
/* This must be called under the mmap_sem. */
int anon_vma_prepare ( struct vm_area_struct * vma )
{
struct anon_vma * anon_vma = vma - > anon_vma ;
might_sleep ( ) ;
if ( unlikely ( ! anon_vma ) ) {
struct mm_struct * mm = vma - > vm_mm ;
struct anon_vma * allocated , * locked ;
anon_vma = find_mergeable_anon_vma ( vma ) ;
if ( anon_vma ) {
allocated = NULL ;
locked = anon_vma ;
spin_lock ( & locked - > lock ) ;
} else {
anon_vma = anon_vma_alloc ( ) ;
if ( unlikely ( ! anon_vma ) )
return - ENOMEM ;
allocated = anon_vma ;
locked = NULL ;
}
/* page_table_lock to protect against threads */
spin_lock ( & mm - > page_table_lock ) ;
if ( likely ( ! vma - > anon_vma ) ) {
vma - > anon_vma = anon_vma ;
list_add ( & vma - > anon_vma_node , & anon_vma - > head ) ;
allocated = NULL ;
}
spin_unlock ( & mm - > page_table_lock ) ;
if ( locked )
spin_unlock ( & locked - > lock ) ;
if ( unlikely ( allocated ) )
anon_vma_free ( allocated ) ;
}
return 0 ;
}
void __anon_vma_merge ( struct vm_area_struct * vma , struct vm_area_struct * next )
{
BUG_ON ( vma - > anon_vma ! = next - > anon_vma ) ;
list_del ( & next - > anon_vma_node ) ;
}
void __anon_vma_link ( struct vm_area_struct * vma )
{
struct anon_vma * anon_vma = vma - > anon_vma ;
if ( anon_vma ) {
list_add ( & vma - > anon_vma_node , & anon_vma - > head ) ;
validate_anon_vma ( vma ) ;
}
}
void anon_vma_link ( struct vm_area_struct * vma )
{
struct anon_vma * anon_vma = vma - > anon_vma ;
if ( anon_vma ) {
spin_lock ( & anon_vma - > lock ) ;
list_add ( & vma - > anon_vma_node , & anon_vma - > head ) ;
validate_anon_vma ( vma ) ;
spin_unlock ( & anon_vma - > lock ) ;
}
}
void anon_vma_unlink ( struct vm_area_struct * vma )
{
struct anon_vma * anon_vma = vma - > anon_vma ;
int empty ;
if ( ! anon_vma )
return ;
spin_lock ( & anon_vma - > lock ) ;
validate_anon_vma ( vma ) ;
list_del ( & vma - > anon_vma_node ) ;
/* We must garbage collect the anon_vma if it's empty */
empty = list_empty ( & anon_vma - > head ) ;
spin_unlock ( & anon_vma - > lock ) ;
if ( empty )
anon_vma_free ( anon_vma ) ;
}
/*
 * Slab constructor for struct anon_vma: initialise the lock and the vma
 * list head.  Only acts when SLAB_CTOR_CONSTRUCTOR is set in flags and
 * SLAB_CTOR_VERIFY is clear.
 */
static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
{
	struct anon_vma *anon_vma = data;

	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) !=
						SLAB_CTOR_CONSTRUCTOR)
		return;

	spin_lock_init(&anon_vma->lock);
	INIT_LIST_HEAD(&anon_vma->head);
}
/*
 * Create the slab cache for struct anon_vma objects, with anon_vma_ctor
 * as constructor and no destructor.  The cache is created with
 * SLAB_DESTROY_BY_RCU | SLAB_PANIC.
 */
void __init anon_vma_init(void)
{
	/* The cache name must be the bare identifier, without padding blanks. */
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_DESTROY_BY_RCU | SLAB_PANIC, anon_vma_ctor, NULL);
}
/*
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky : page_lock_anon_vma rely on RCU to guard against the races .
*/
static struct anon_vma * page_lock_anon_vma ( struct page * page )
{
struct anon_vma * anon_vma = NULL ;
unsigned long anon_mapping ;
rcu_read_lock ( ) ;
anon_mapping = ( unsigned long ) page - > mapping ;
if ( ! ( anon_mapping & PAGE_MAPPING_ANON ) )
goto out ;
if ( ! page_mapped ( page ) )
goto out ;
anon_vma = ( struct anon_vma * ) ( anon_mapping - PAGE_MAPPING_ANON ) ;
spin_lock ( & anon_vma - > lock ) ;
out :
rcu_read_unlock ( ) ;
return anon_vma ;
}
/*
* At what user virtual address is page expected in vma ?
*/
static inline unsigned long
vma_address ( struct page * page , struct vm_area_struct * vma )
{
pgoff_t pgoff = page - > index < < ( PAGE_CACHE_SHIFT - PAGE_SHIFT ) ;
unsigned long address ;
address = vma - > vm_start + ( ( pgoff - vma - > vm_pgoff ) < < PAGE_SHIFT ) ;
if ( unlikely ( address < vma - > vm_start | | address > = vma - > vm_end ) ) {
/* page should be within any vma from prio_tree_next */
BUG_ON ( ! PageAnon ( page ) ) ;
return - EFAULT ;
}
return address ;
}
/*
* At what user virtual address is page expected in vma ? checking that the
[PATCH] unpaged: anon in VM_UNPAGED
copy_one_pte needs to copy the anonymous COWed pages in a VM_UNPAGED area,
zap_pte_range needs to free them, do_wp_page needs to COW them: just like
ordinary pages, not like the unpaged.
But recognizing them is a little subtle: because PageReserved is no longer a
condition for remap_pfn_range, we can now mmap all of /dev/mem (whether the
distro permits, and whether it's advisable on this or that architecture, is
another matter). So if we can see a PageAnon, it may not be ours to mess with
(or may be ours from elsewhere in the address space). I suspect there's an
entertaining insoluble self-referential problem here, but the page_is_anon
function does a good practical job, and MAP_PRIVATE PROT_WRITE VM_UNPAGED will
always be an odd choice.
In updating the comment on page_address_in_vma, noticed a potential NULL
dereference, in a path we don't actually take, but fixed it.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-22 08:32:18 +03:00
* page matches the vma : currently only used on anon pages , by unuse_vma ;
2005-04-17 02:20:36 +04:00
*/
unsigned long page_address_in_vma ( struct page * page , struct vm_area_struct * vma )
{
if ( PageAnon ( page ) ) {
if ( ( void * ) vma - > anon_vma ! =
( void * ) page - > mapping - PAGE_MAPPING_ANON )
return - EFAULT ;
} else if ( page - > mapping & & ! ( vma - > vm_flags & VM_NONLINEAR ) ) {
[PATCH] unpaged: anon in VM_UNPAGED
copy_one_pte needs to copy the anonymous COWed pages in a VM_UNPAGED area,
zap_pte_range needs to free them, do_wp_page needs to COW them: just like
ordinary pages, not like the unpaged.
But recognizing them is a little subtle: because PageReserved is no longer a
condition for remap_pfn_range, we can now mmap all of /dev/mem (whether the
distro permits, and whether it's advisable on this or that architecture, is
another matter). So if we can see a PageAnon, it may not be ours to mess with
(or may be ours from elsewhere in the address space). I suspect there's an
entertaining insoluble self-referential problem here, but the page_is_anon
function does a good practical job, and MAP_PRIVATE PROT_WRITE VM_UNPAGED will
always be an odd choice.
In updating the comment on page_address_in_vma, noticed a potential NULL
dereference, in a path we don't actually take, but fixed it.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-22 08:32:18 +03:00
if ( ! vma - > vm_file | |
vma - > vm_file - > f_mapping ! = page - > mapping )
2005-04-17 02:20:36 +04:00
return - EFAULT ;
} else
return - EFAULT ;
return vma_address ( page , vma ) ;
}
2005-05-01 19:58:36 +04:00
/*
* Check that @ page is mapped at @ address into @ mm .
*
2005-10-30 04:16:41 +03:00
* On success returns with pte mapped and locked .
2005-05-01 19:58:36 +04:00
*/
2005-06-24 09:05:25 +04:00
pte_t * page_check_address ( struct page * page , struct mm_struct * mm ,
2005-10-30 04:16:31 +03:00
unsigned long address , spinlock_t * * ptlp )
2005-05-01 19:58:36 +04:00
{
pgd_t * pgd ;
pud_t * pud ;
pmd_t * pmd ;
pte_t * pte ;
2005-10-30 04:16:31 +03:00
spinlock_t * ptl ;
2005-05-01 19:58:36 +04:00
pgd = pgd_offset ( mm , address ) ;
2005-10-30 04:16:31 +03:00
if ( ! pgd_present ( * pgd ) )
return NULL ;
pud = pud_offset ( pgd , address ) ;
if ( ! pud_present ( * pud ) )
return NULL ;
pmd = pmd_offset ( pud , address ) ;
if ( ! pmd_present ( * pmd ) )
return NULL ;
pte = pte_offset_map ( pmd , address ) ;
/* Make a quick check before getting the lock */
if ( ! pte_present ( * pte ) ) {
pte_unmap ( pte ) ;
return NULL ;
}
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:40 +03:00
ptl = pte_lockptr ( mm , pmd ) ;
2005-10-30 04:16:31 +03:00
spin_lock ( ptl ) ;
if ( pte_present ( * pte ) & & page_to_pfn ( page ) = = pte_pfn ( * pte ) ) {
* ptlp = ptl ;
return pte ;
2005-05-01 19:58:36 +04:00
}
2005-10-30 04:16:31 +03:00
pte_unmap_unlock ( pte , ptl ) ;
return NULL ;
2005-05-01 19:58:36 +04:00
}
2005-04-17 02:20:36 +04:00
/*
* Subfunctions of page_referenced : page_referenced_one called
* repeatedly from either page_referenced_anon or page_referenced_file .
*/
static int page_referenced_one ( struct page * page ,
2005-11-29 00:44:07 +03:00
struct vm_area_struct * vma , unsigned int * mapcount )
2005-04-17 02:20:36 +04:00
{
struct mm_struct * mm = vma - > vm_mm ;
unsigned long address ;
pte_t * pte ;
2005-10-30 04:16:31 +03:00
spinlock_t * ptl ;
2005-04-17 02:20:36 +04:00
int referenced = 0 ;
address = vma_address ( page , vma ) ;
if ( address = = - EFAULT )
goto out ;
2005-10-30 04:16:31 +03:00
pte = page_check_address ( page , mm , address , & ptl ) ;
if ( ! pte )
goto out ;
2005-04-17 02:20:36 +04:00
2005-10-30 04:16:31 +03:00
if ( ptep_clear_flush_young ( vma , address , pte ) )
referenced + + ;
2005-04-17 02:20:36 +04:00
2005-10-30 04:16:31 +03:00
/* Pretend the page is referenced if the task has the
swap token and is in the middle of a page fault . */
2005-11-29 00:44:07 +03:00
if ( mm ! = current - > mm & & has_swap_token ( mm ) & &
2005-10-30 04:16:31 +03:00
rwsem_is_locked ( & mm - > mmap_sem ) )
referenced + + ;
( * mapcount ) - - ;
pte_unmap_unlock ( pte , ptl ) ;
2005-04-17 02:20:36 +04:00
out :
return referenced ;
}
2005-11-29 00:44:07 +03:00
static int page_referenced_anon ( struct page * page )
2005-04-17 02:20:36 +04:00
{
unsigned int mapcount ;
struct anon_vma * anon_vma ;
struct vm_area_struct * vma ;
int referenced = 0 ;
anon_vma = page_lock_anon_vma ( page ) ;
if ( ! anon_vma )
return referenced ;
mapcount = page_mapcount ( page ) ;
list_for_each_entry ( vma , & anon_vma - > head , anon_vma_node ) {
2005-11-29 00:44:07 +03:00
referenced + = page_referenced_one ( page , vma , & mapcount ) ;
2005-04-17 02:20:36 +04:00
if ( ! mapcount )
break ;
}
spin_unlock ( & anon_vma - > lock ) ;
return referenced ;
}
/**
* page_referenced_file - referenced check for object - based rmap
* @ page : the page we ' re checking references on .
*
* For an object - based mapped page , find all the places it is mapped and
* check / clear the referenced flag . This is done by following the page - > mapping
* pointer , then walking the chain of vmas it holds . It returns the number
* of references it found .
*
* This function is only called from page_referenced for object - based pages .
*/
2005-11-29 00:44:07 +03:00
static int page_referenced_file ( struct page * page )
2005-04-17 02:20:36 +04:00
{
unsigned int mapcount ;
struct address_space * mapping = page - > mapping ;
pgoff_t pgoff = page - > index < < ( PAGE_CACHE_SHIFT - PAGE_SHIFT ) ;
struct vm_area_struct * vma ;
struct prio_tree_iter iter ;
int referenced = 0 ;
/*
* The caller ' s checks on page - > mapping and ! PageAnon have made
* sure that this is a file page : the check for page - > mapping
* excludes the case just before it gets set on an anon page .
*/
BUG_ON ( PageAnon ( page ) ) ;
/*
* The page lock not only makes sure that page - > mapping cannot
* suddenly be NULLified by truncation , it makes sure that the
* structure at mapping cannot be freed and reused yet ,
* so we can safely take mapping - > i_mmap_lock .
*/
BUG_ON ( ! PageLocked ( page ) ) ;
spin_lock ( & mapping - > i_mmap_lock ) ;
/*
* i_mmap_lock does not stabilize mapcount at all , but mapcount
* is more likely to be accurate if we note it after spinning .
*/
mapcount = page_mapcount ( page ) ;
vma_prio_tree_foreach ( vma , & iter , & mapping - > i_mmap , pgoff , pgoff ) {
if ( ( vma - > vm_flags & ( VM_LOCKED | VM_MAYSHARE ) )
= = ( VM_LOCKED | VM_MAYSHARE ) ) {
referenced + + ;
break ;
}
2005-11-29 00:44:07 +03:00
referenced + = page_referenced_one ( page , vma , & mapcount ) ;
2005-04-17 02:20:36 +04:00
if ( ! mapcount )
break ;
}
spin_unlock ( & mapping - > i_mmap_lock ) ;
return referenced ;
}
/**
* page_referenced - test if the page was referenced
* @ page : the page to test
* @ is_locked : caller holds lock on the page
*
* Quick test_and_clear_referenced for all mappings to a page ,
* returns the number of ptes which referenced the page .
*/
2005-11-29 00:44:07 +03:00
int page_referenced ( struct page * page , int is_locked )
2005-04-17 02:20:36 +04:00
{
int referenced = 0 ;
if ( page_test_and_clear_young ( page ) )
referenced + + ;
if ( TestClearPageReferenced ( page ) )
referenced + + ;
if ( page_mapped ( page ) & & page - > mapping ) {
if ( PageAnon ( page ) )
2005-11-29 00:44:07 +03:00
referenced + = page_referenced_anon ( page ) ;
2005-04-17 02:20:36 +04:00
else if ( is_locked )
2005-11-29 00:44:07 +03:00
referenced + = page_referenced_file ( page ) ;
2005-04-17 02:20:36 +04:00
else if ( TestSetPageLocked ( page ) )
referenced + + ;
else {
if ( page - > mapping )
2005-11-29 00:44:07 +03:00
referenced + = page_referenced_file ( page ) ;
2005-04-17 02:20:36 +04:00
unlock_page ( page ) ;
}
}
return referenced ;
}
2006-01-06 11:11:12 +03:00
/**
* page_set_anon_rmap - setup new anonymous rmap
* @ page : the page to add the mapping to
* @ vma : the vm area in which the mapping is added
* @ address : the user virtual address mapped
*/
static void __page_set_anon_rmap ( struct page * page ,
struct vm_area_struct * vma , unsigned long address )
{
struct anon_vma * anon_vma = vma - > anon_vma ;
BUG_ON ( ! anon_vma ) ;
anon_vma = ( void * ) anon_vma + PAGE_MAPPING_ANON ;
page - > mapping = ( struct address_space * ) anon_vma ;
page - > index = linear_page_index ( vma , address ) ;
2006-01-06 11:11:20 +03:00
/*
* nr_mapped state can be updated without turning off
* interrupts because it is not modified via interrupt .
*/
__inc_page_state ( nr_mapped ) ;
2006-01-06 11:11:12 +03:00
}
2005-04-17 02:20:36 +04:00
/**
* page_add_anon_rmap - add pte mapping to an anonymous page
* @ page : the page to add the mapping to
* @ vma : the vm area in which the mapping is added
* @ address : the user virtual address mapped
*
2005-10-30 04:16:41 +03:00
* The caller needs to hold the pte lock .
2005-04-17 02:20:36 +04:00
*/
void page_add_anon_rmap ( struct page * page ,
struct vm_area_struct * vma , unsigned long address )
{
2006-01-06 11:11:12 +03:00
if ( atomic_inc_and_test ( & page - > _mapcount ) )
__page_set_anon_rmap ( page , vma , address ) ;
2005-04-17 02:20:36 +04:00
/* else checking page index and mapping is racy */
}
2006-01-06 11:11:12 +03:00
/*
* page_add_new_anon_rmap - add pte mapping to a new anonymous page
* @ page : the page to add the mapping to
* @ vma : the vm area in which the mapping is added
* @ address : the user virtual address mapped
*
* Same as page_add_anon_rmap but must only be called on * new * pages .
* This means the inc - and - test can be bypassed .
*/
void page_add_new_anon_rmap ( struct page * page ,
struct vm_area_struct * vma , unsigned long address )
{
atomic_set ( & page - > _mapcount , 0 ) ; /* elevate count by 1 (starts at -1) */
__page_set_anon_rmap ( page , vma , address ) ;
}
2005-04-17 02:20:36 +04:00
/**
* page_add_file_rmap - add pte mapping to a file page
* @ page : the page to add the mapping to
*
2005-10-30 04:16:41 +03:00
* The caller needs to hold the pte lock .
2005-04-17 02:20:36 +04:00
*/
void page_add_file_rmap ( struct page * page )
{
BUG_ON ( PageAnon ( page ) ) ;
2005-10-30 04:16:12 +03:00
BUG_ON ( ! pfn_valid ( page_to_pfn ( page ) ) ) ;
2005-04-17 02:20:36 +04:00
if ( atomic_inc_and_test ( & page - > _mapcount ) )
2006-01-06 11:11:20 +03:00
__inc_page_state ( nr_mapped ) ;
2005-04-17 02:20:36 +04:00
}
/**
* page_remove_rmap - take down pte mapping from a page
* @ page : page to remove mapping from
*
2005-10-30 04:16:41 +03:00
* The caller needs to hold the pte lock .
2005-04-17 02:20:36 +04:00
*/
void page_remove_rmap ( struct page * page )
{
if ( atomic_add_negative ( - 1 , & page - > _mapcount ) ) {
2006-01-08 12:01:00 +03:00
if ( page_mapcount ( page ) < 0 ) {
printk ( KERN_EMERG " Eeek! page_mapcount(page) went negative! (%d) \n " , page_mapcount ( page ) ) ;
printk ( KERN_EMERG " page->flags = %lx \n " , page - > flags ) ;
printk ( KERN_EMERG " page->count = %x \n " , page_count ( page ) ) ;
printk ( KERN_EMERG " page->mapping = %p \n " , page - > mapping ) ;
}
2005-04-17 02:20:36 +04:00
BUG_ON ( page_mapcount ( page ) < 0 ) ;
/*
* It would be tidy to reset the PageAnon mapping here ,
* but that might overwrite a racing page_add_anon_rmap
* which increments mapcount after us but sets mapping
* before us : so leave the reset to free_hot_cold_page ,
* and remember that it ' s only reliable while mapped .
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache .
*/
if ( page_test_and_clear_dirty ( page ) )
set_page_dirty ( page ) ;
2006-01-06 11:11:20 +03:00
__dec_page_state ( nr_mapped ) ;
2005-04-17 02:20:36 +04:00
}
}
/*
* Subfunctions of try_to_unmap : try_to_unmap_one called
* repeatedly from either try_to_unmap_anon or try_to_unmap_file .
*/
static int try_to_unmap_one ( struct page * page , struct vm_area_struct * vma )
{
struct mm_struct * mm = vma - > vm_mm ;
unsigned long address ;
pte_t * pte ;
pte_t pteval ;
2005-10-30 04:16:31 +03:00
spinlock_t * ptl ;
2005-04-17 02:20:36 +04:00
int ret = SWAP_AGAIN ;
address = vma_address ( page , vma ) ;
if ( address = = - EFAULT )
goto out ;
2005-10-30 04:16:31 +03:00
pte = page_check_address ( page , mm , address , & ptl ) ;
if ( ! pte )
2005-05-01 19:58:36 +04:00
goto out ;
2005-04-17 02:20:36 +04:00
/*
* If the page is mlock ( ) d , we cannot swap it out .
* If it ' s recently referenced ( perhaps page_referenced
* skipped over this mm ) then we should reactivate it .
*/
2005-11-22 08:32:16 +03:00
if ( ( vma - > vm_flags & VM_LOCKED ) | |
2005-04-17 02:20:36 +04:00
ptep_clear_flush_young ( vma , address , pte ) ) {
ret = SWAP_FAIL ;
goto out_unmap ;
}
/* Nuke the page table entry. */
flush_cache_page ( vma , address , page_to_pfn ( page ) ) ;
pteval = ptep_clear_flush ( vma , address , pte ) ;
/* Move the dirty bit to the physical page now the pte is gone. */
if ( pte_dirty ( pteval ) )
set_page_dirty ( page ) ;
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
/* Update high watermark before we lower rss */
update_hiwater_rss ( mm ) ;
2005-04-17 02:20:36 +04:00
if ( PageAnon ( page ) ) {
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:40 +03:00
swp_entry_t entry = { . val = page_private ( page ) } ;
2005-04-17 02:20:36 +04:00
/*
* Store the swap location in the pte .
* See handle_pte_fault ( ) . . .
*/
BUG_ON ( ! PageSwapCache ( page ) ) ;
swap_duplicate ( entry ) ;
if ( list_empty ( & mm - > mmlist ) ) {
spin_lock ( & mmlist_lock ) ;
2005-10-30 04:16:41 +03:00
if ( list_empty ( & mm - > mmlist ) )
list_add ( & mm - > mmlist , & init_mm . mmlist ) ;
2005-04-17 02:20:36 +04:00
spin_unlock ( & mmlist_lock ) ;
}
set_pte_at ( mm , address , pte , swp_entry_to_pte ( entry ) ) ;
BUG_ON ( pte_file ( * pte ) ) ;
dec_mm_counter ( mm , anon_rss ) ;
2005-10-30 04:16:05 +03:00
} else
dec_mm_counter ( mm , file_rss ) ;
2005-04-17 02:20:36 +04:00
page_remove_rmap ( page ) ;
page_cache_release ( page ) ;
out_unmap :
2005-10-30 04:16:31 +03:00
pte_unmap_unlock ( pte , ptl ) ;
2005-04-17 02:20:36 +04:00
out :
return ret ;
}
/*
* objrmap doesn ' t work for nonlinear VMAs because the assumption that
* offset - into - file correlates with offset - into - virtual - addresses does not hold .
* Consequently , given a particular page and its - > index , we cannot locate the
* ptes which are mapping that page without an exhaustive linear search .
*
* So what this code does is a mini " virtual scan " of each nonlinear VMA which
* maps the file to which the target page belongs . The - > vm_private_data field
* holds the current cursor into that scan . Successive searches will circulate
* around the vma ' s virtual address space .
*
* So as more replacement pressure is applied to the pages in a nonlinear VMA ,
* more scanning pressure is placed against them as well . Eventually pages
* will become fully unmapped and are eligible for eviction .
*
* For very sparsely populated VMAs this is a little inefficient - chances are
* there there won ' t be many ptes located within the scan cluster . In this case
* maybe we could scan further - to the end of the pte page , perhaps .
*/
# define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
# define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
static void try_to_unmap_cluster ( unsigned long cursor ,
unsigned int * mapcount , struct vm_area_struct * vma )
{
struct mm_struct * mm = vma - > vm_mm ;
pgd_t * pgd ;
pud_t * pud ;
pmd_t * pmd ;
2005-10-30 04:16:31 +03:00
pte_t * pte ;
2005-04-17 02:20:36 +04:00
pte_t pteval ;
2005-10-30 04:16:31 +03:00
spinlock_t * ptl ;
2005-04-17 02:20:36 +04:00
struct page * page ;
unsigned long address ;
unsigned long end ;
address = ( vma - > vm_start + cursor ) & CLUSTER_MASK ;
end = address + CLUSTER_SIZE ;
if ( address < vma - > vm_start )
address = vma - > vm_start ;
if ( end > vma - > vm_end )
end = vma - > vm_end ;
pgd = pgd_offset ( mm , address ) ;
if ( ! pgd_present ( * pgd ) )
2005-10-30 04:16:31 +03:00
return ;
2005-04-17 02:20:36 +04:00
pud = pud_offset ( pgd , address ) ;
if ( ! pud_present ( * pud ) )
2005-10-30 04:16:31 +03:00
return ;
2005-04-17 02:20:36 +04:00
pmd = pmd_offset ( pud , address ) ;
if ( ! pmd_present ( * pmd ) )
2005-10-30 04:16:31 +03:00
return ;
pte = pte_offset_map_lock ( mm , pmd , address , & ptl ) ;
2005-04-17 02:20:36 +04:00
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
/* Update high watermark before we lower rss */
update_hiwater_rss ( mm ) ;
2005-10-30 04:16:31 +03:00
for ( ; address < end ; pte + + , address + = PAGE_SIZE ) {
2005-04-17 02:20:36 +04:00
if ( ! pte_present ( * pte ) )
continue ;
2005-11-29 01:34:23 +03:00
page = vm_normal_page ( vma , address , * pte ) ;
BUG_ON ( ! page | | PageAnon ( page ) ) ;
2005-04-17 02:20:36 +04:00
if ( ptep_clear_flush_young ( vma , address , pte ) )
continue ;
/* Nuke the page table entry. */
2005-11-29 22:45:26 +03:00
flush_cache_page ( vma , address , pte_pfn ( * pte ) ) ;
2005-04-17 02:20:36 +04:00
pteval = ptep_clear_flush ( vma , address , pte ) ;
/* If nonlinear, store the file page offset in the pte. */
if ( page - > index ! = linear_page_index ( vma , address ) )
set_pte_at ( mm , address , pte , pgoff_to_pte ( page - > index ) ) ;
/* Move the dirty bit to the physical page now the pte is gone. */
if ( pte_dirty ( pteval ) )
set_page_dirty ( page ) ;
page_remove_rmap ( page ) ;
page_cache_release ( page ) ;
2005-10-30 04:16:05 +03:00
dec_mm_counter ( mm , file_rss ) ;
2005-04-17 02:20:36 +04:00
( * mapcount ) - - ;
}
2005-10-30 04:16:31 +03:00
pte_unmap_unlock ( pte - 1 , ptl ) ;
2005-04-17 02:20:36 +04:00
}
static int try_to_unmap_anon ( struct page * page )
{
struct anon_vma * anon_vma ;
struct vm_area_struct * vma ;
int ret = SWAP_AGAIN ;
anon_vma = page_lock_anon_vma ( page ) ;
if ( ! anon_vma )
return ret ;
list_for_each_entry ( vma , & anon_vma - > head , anon_vma_node ) {
ret = try_to_unmap_one ( page , vma ) ;
if ( ret = = SWAP_FAIL | | ! page_mapped ( page ) )
break ;
}
spin_unlock ( & anon_vma - > lock ) ;
return ret ;
}
/**
 * try_to_unmap_file - unmap file page using the object-based rmap method
 * @page: the page to unmap
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the address_space struct it points to.
 *
 * The vmas found through the mapping's prio tree are tried first, one by one,
 * with try_to_unmap_one().  If the page is still mapped afterwards and the
 * mapping has nonlinear vmas, those vmas are swept cluster by cluster with
 * try_to_unmap_cluster(), without looking for this particular page.
 *
 * This function is only called from try_to_unmap for object-based pages.
 *
 * Returns SWAP_FAIL if try_to_unmap_one() reported it or if no non-VM_LOCKED
 * nonlinear vma was available to sweep; otherwise the last value returned by
 * try_to_unmap_one(), or SWAP_AGAIN if it was never called.
 */
static int try_to_unmap_file ( struct page * page )
{
struct address_space * mapping = page - > mapping ;
/* Shift page->index by the difference between the two page shifts. */
pgoff_t pgoff = page - > index < < ( PAGE_CACHE_SHIFT - PAGE_SHIFT ) ;
struct vm_area_struct * vma ;
struct prio_tree_iter iter ;
int ret = SWAP_AGAIN ;
/* Scratch value: a vma's saved sweep position, or its size in bytes. */
unsigned long cursor ;
/* Largest saved cursor and largest size seen among eligible nonlinear vmas. */
unsigned long max_nl_cursor = 0 ;
unsigned long max_nl_size = 0 ;
/* Budget for the nonlinear sweep, seeded from page_mapcount() below. */
unsigned int mapcount ;
/* i_mmap_lock is held for the whole walk and dropped at "out". */
spin_lock ( & mapping - > i_mmap_lock ) ;
/* Pass 1: vmas in the prio tree looked up with the range [pgoff, pgoff]. */
vma_prio_tree_foreach ( vma , & iter , & mapping - > i_mmap , pgoff , pgoff ) {
ret = try_to_unmap_one ( page , vma ) ;
if ( ret = = SWAP_FAIL | | ! page_mapped ( page ) )
goto out ;
}
/* No nonlinear vmas: nothing more can be tried. */
if ( list_empty ( & mapping - > i_mmap_nonlinear ) )
goto out ;
/*
 * Survey the nonlinear vmas, skipping VM_LOCKED ones: record the
 * furthest cursor stored in vm_private_data and the largest vma size.
 */
list_for_each_entry ( vma , & mapping - > i_mmap_nonlinear ,
shared . vm_set . list ) {
2005-11-22 08:32:16 +03:00
if ( vma - > vm_flags & VM_LOCKED )
2005-04-17 02:20:36 +04:00
continue ;
cursor = ( unsigned long ) vma - > vm_private_data ;
if ( cursor > max_nl_cursor )
max_nl_cursor = cursor ;
cursor = vma - > vm_end - vma - > vm_start ;
if ( cursor > max_nl_size )
max_nl_size = cursor ;
}
/* max_nl_size stays 0 only when every nonlinear vma was skipped above. */
if ( max_nl_size = = 0 ) { /* any nonlinears locked or reserved */
ret = SWAP_FAIL ;
goto out ;
}
/*
 * We don't try to search for this page in the nonlinear vmas,
 * and page_referenced wouldn't have found it anyway.  Instead
 * just walk the nonlinear vmas trying to age and unmap some.
 * The mapcount of the page we came in with is irrelevant,
 * but even so use it as a guide to how hard we should try?
 */
mapcount = page_mapcount ( page ) ;
if ( ! mapcount )
goto out ;
cond_resched_lock ( & mapping - > i_mmap_lock ) ;
/*
 * Presumably rounds the sweep limit up to a whole number of clusters
 * (assumes CLUSTER_MASK is ~(CLUSTER_SIZE - 1) -- TODO confirm).
 */
max_nl_size = ( max_nl_size + CLUSTER_SIZE - 1 ) & CLUSTER_MASK ;
/* With no saved progress anywhere, start the target at the first cluster. */
if ( max_nl_cursor = = 0 )
max_nl_cursor = CLUSTER_SIZE ;
/*
 * Pass 2: raise the target cursor one cluster per outer iteration and
 * bring every eligible vma's own cursor up to that target.
 */
do {
list_for_each_entry ( vma , & mapping - > i_mmap_nonlinear ,
shared . vm_set . list ) {
2005-11-22 08:32:16 +03:00
if ( vma - > vm_flags & VM_LOCKED )
2005-04-17 02:20:36 +04:00
continue ;
cursor = ( unsigned long ) vma - > vm_private_data ;
/* Sweep this vma until it reaches the target or its own end. */
2005-09-04 02:54:43 +04:00
while ( cursor < max_nl_cursor & &
2005-04-17 02:20:36 +04:00
cursor < vma - > vm_end - vma - > vm_start ) {
try_to_unmap_cluster ( cursor , & mapcount , vma ) ;
cursor + = CLUSTER_SIZE ;
/* Save progress in the vma so a later call resumes from here. */
vma - > vm_private_data = ( void * ) cursor ;
/*
 * mapcount is unsigned; the cast to int also catches a count
 * that has been decremented past zero.
 */
if ( ( int ) mapcount < = 0 )
goto out ;
}
/* Mark this vma as caught up with the current target. */
vma - > vm_private_data = ( void * ) max_nl_cursor ;
}
cond_resched_lock ( & mapping - > i_mmap_lock ) ;
max_nl_cursor + = CLUSTER_SIZE ;
} while ( max_nl_cursor < = max_nl_size ) ;
/*
 * Don't loop forever (perhaps all the remaining pages are
 * in locked vmas).  Reset the cursor on every nonlinear vma in
 * the list (the loop below applies no VM_LOCKED filter), now
 * forgetting on which ones it had fallen behind.
 */
2005-11-22 08:32:16 +03:00
list_for_each_entry ( vma , & mapping - > i_mmap_nonlinear , shared . vm_set . list )
vma - > vm_private_data = NULL ;
2005-04-17 02:20:36 +04:00
out :
spin_unlock ( & mapping - > i_mmap_lock ) ;
return ret ;
}
/**
 * try_to_unmap - try to remove all page table mappings to a page
 * @page: the page to get unmapped
 *
 * Tries to remove all the page table entries which are mapping this
 * page, used in the pageout path.  Caller must hold the page lock
 * (enforced with BUG_ON).  Anonymous pages are handled by
 * try_to_unmap_anon(), all others by try_to_unmap_file().
 *
 * Return values are:
 *
 * SWAP_SUCCESS	- we succeeded in removing all mappings
 * SWAP_AGAIN	- we missed a mapping, try again later
 * SWAP_FAIL	- the page is unswappable
 */
int try_to_unmap(struct page *page)
{
	int outcome;

	BUG_ON(!PageLocked(page));

	outcome = PageAnon(page) ? try_to_unmap_anon(page)
				 : try_to_unmap_file(page);

	/* Whatever the helper reported, an unmapped page counts as success. */
	return page_mapped(page) ? outcome : SWAP_SUCCESS;
}
2005-05-01 19:58:36 +04:00