/*
 * linux/mm/fremap.c
 *
 * Explicit pagetable population and nonlinear (random) mappings support.
 *
 * started by Ingo Molnar, Copyright (C) 2002, 2003
 */

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/syscalls.h>

#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
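
/*
 * Tear down one pte: a present page is flushed from the cache/TLB,
 * unmapped and released; a swap entry is freed; a file pte is simply
 * cleared.  Returns nonzero only when a present page was removed, so
 * the caller knows whether the file_rss counter needs adjusting.
 */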
static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, pte_t *ptep)
{
        pte_t pte = *ptep;
        struct page *page = NULL;

        if (pte_present(pte)) {
                unsigned long pfn = pte_pfn(pte);
                flush_cache_page(vma, addr, pfn);
                pte = ptep_clear_flush(vma, addr, ptep);
                if (unlikely(!pfn_valid(pfn))) {
                        print_bad_pte(vma, pte, addr);
                        goto out;
                }
                page = pfn_to_page(pfn);
                if (pte_dirty(pte))
                        set_page_dirty(page);
                page_remove_rmap(page);
                page_cache_release(page);
        } else {
                if (!pte_file(pte))
                        free_swap_and_cache(pte_to_swp_entry(pte));
                pte_clear(mm, addr, ptep);
        }
out:
        return !!page;
}

/*
 * Install a file page to a given virtual memory address, release any
 * previously existing mapping.
 */
int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long addr, struct page *page, pgprot_t prot)
{
        struct inode *inode;
        pgoff_t size;
        int err = -ENOMEM;
        pte_t *pte;
        pmd_t *pmd;
        pud_t *pud;
        pgd_t *pgd;
        pte_t pte_val;
        spinlock_t *ptl;

        BUG_ON(vma->vm_flags & VM_RESERVED);

        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
        if (!pud)
                goto out;
        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                goto out;
        pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                goto out;

        /*
         * This page may have been truncated. Tell the
         * caller about it.
         */
        err = -EINVAL;
        inode = vma->vm_file->f_mapping->host;
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (!page->mapping || page->index >= size)
                goto unlock;

        err = -ENOMEM;
        if (page_mapcount(page) > INT_MAX/2)
                goto unlock;

        if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
                inc_mm_counter(mm, file_rss);

        flush_icache_page(vma, page);
        set_pte_at(mm, addr, pte, mk_pte(page, prot));
        page_add_file_rmap(page);
        pte_val = *pte;
        update_mmu_cache(vma, addr, pte_val);
        err = 0;
unlock:
        pte_unmap_unlock(pte, ptl);
out:
        return err;
}
EXPORT_SYMBOL(install_page);
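
/*
 * Illustrative sketch (hypothetical caller, not the in-tree code): a
 * filesystem's ->populate method is expected to look the page up in the
 * page cache and hand it to install_page(), roughly:
 *
 *      page = find_get_page(mapping, pgoff);
 *      if (page)
 *              err = install_page(mm, vma, addr, page, prot);
 *
 * install_page() takes its own page-table lock via pte_alloc_map_lock();
 * the caller only needs mmap_sem, as arranged by sys_remap_file_pages()
 * below.
 */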

/*
 * Install a file pte to a given virtual memory address, release any
 * previously existing mapping.
 */
int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long addr, unsigned long pgoff, pgprot_t prot)
{
        int err = -ENOMEM;
        pte_t *pte;
        pmd_t *pmd;
        pud_t *pud;
        pgd_t *pgd;
        pte_t pte_val;
        spinlock_t *ptl;

        BUG_ON(vma->vm_flags & VM_RESERVED);

        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
        if (!pud)
                goto out;
        pmd = pmd_alloc(mm, pud, addr);
        if (!pmd)
                goto out;
        pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                goto out;

        if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
                update_hiwater_rss(mm);
                dec_mm_counter(mm, file_rss);
        }

        set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
        pte_val = *pte;
        update_mmu_cache(vma, addr, pte_val);
        pte_unmap_unlock(pte, ptl);
        err = 0;
out:
        return err;
}
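
/*
 * Unlike install_page(), install_file_pte() maps no page at all: it
 * encodes the file offset in the pte itself via pgoff_to_pte(), so a
 * later fault can recover the offset and read in the right page.  This
 * is what keeps nonlinear mappings correct across pageout and the
 * MAP_NONBLOCK (no-IO) populate path.
 */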

/***
 * sys_remap_file_pages - remap arbitrary pages of a shared backing store
 *                        file within an existing vma.
 * @start: start of the remapped virtual memory range
 * @size: size of the remapped virtual memory range
 * @prot: new protection bits of the range
 * @pgoff: to be mapped page of the backing store file
 * @flags: 0 or MAP_NONBLOCK - the latter will cause no IO.
 *
 * this syscall works purely via pagetables, so it's the most efficient
 * way to map the same (large) file into a given virtual window. Unlike
 * mmap()/mremap() it does not create any new vmas. The new mappings are
 * also safe across swapout.
 *
 * NOTE: the 'prot' parameter right now is ignored, and the vma's default
 * protection is used. Arbitrary protections might be implemented in the
 * future.
 */
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
        unsigned long __prot, unsigned long pgoff, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct address_space *mapping;
        unsigned long end = start + size;
        struct vm_area_struct *vma;
        int err = -EINVAL;
        int has_write_lock = 0;

        if (__prot)
                return err;
        /*
         * Sanitize the syscall parameters:
         */
        start = start & PAGE_MASK;
        size = size & PAGE_MASK;

        /* Does the address range wrap, or is the span zero-sized? */
        if (start + size <= start)
                return err;

        /* Can we represent this offset inside this architecture's pte's? */
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
        if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
                return err;
#endif

        /* We need down_write() to change vma->vm_flags. */
        down_read(&mm->mmap_sem);
retry:
        vma = find_vma(mm, start);

        /*
         * Make sure the vma is shared, that it supports prefaulting,
         * and that the remapped range is valid and fully within
         * the single existing vma.  vm_private_data is used as a
         * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED
         * or VM_LOCKED, but VM_LOCKED could be revoked later on).
         */
        if (vma && (vma->vm_flags & VM_SHARED) &&
                (!vma->vm_private_data ||
                        (vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
                vma->vm_ops && vma->vm_ops->populate &&
                        end > start && start >= vma->vm_start &&
                                end <= vma->vm_end) {

                /* Must set VM_NONLINEAR before any pages are populated. */
                if (pgoff != linear_page_index(vma, start) &&
                    !(vma->vm_flags & VM_NONLINEAR)) {
                        if (!has_write_lock) {
                                up_read(&mm->mmap_sem);
                                down_write(&mm->mmap_sem);
                                has_write_lock = 1;
                                goto retry;
                        }
                        mapping = vma->vm_file->f_mapping;
                        spin_lock(&mapping->i_mmap_lock);
                        flush_dcache_mmap_lock(mapping);
                        vma->vm_flags |= VM_NONLINEAR;
                        vma_prio_tree_remove(vma, &mapping->i_mmap);
                        vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
                        flush_dcache_mmap_unlock(mapping);
                        spin_unlock(&mapping->i_mmap_lock);
                }

                err = vma->vm_ops->populate(vma, start, size,
                                            vma->vm_page_prot,
                                            pgoff, flags & MAP_NONBLOCK);

                /*
                 * We can't clear VM_NONLINEAR because we'd have to do
                 * it after ->populate completes, and that would prevent
                 * downgrading the lock.  (Locks can't be upgraded).
                 */
        }
        if (likely(!has_write_lock))
                up_read(&mm->mmap_sem);
        else
                up_write(&mm->mmap_sem);

        return err;
}
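
/*
 * Userspace sketch (hypothetical values, not part of this file): rebind
 * the first page of an existing MAP_SHARED window to file page 3 without
 * creating a new vma.  The glibc wrapper is declared in <sys/mman.h>
 * (with _GNU_SOURCE); prot must currently be 0 and pgoff is in pages.
 *
 *      int fd = open("data", O_RDWR);          // file assumed >= 4 pages
 *      long psz = sysconf(_SC_PAGESIZE);
 *      char *win = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
 *                       MAP_SHARED, fd, 0);
 *      remap_file_pages(win, psz, 0, 3, 0);    // window page 0 -> file page 3
 */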