/*
 *  linux/mm/nommu.c
 *
 *  Replacement code for mm functions to support CPU's that don't
 *  have any form of memory management unit (thus no virtual memory).
 *
 *  See Documentation/nommu-mmap.txt
 *
 *  Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
 *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
 *  Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
 *  Copyright (c) 2002      Greg Ungerer <gerg@snapgear.com>
 *  Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
 */

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/tracehook.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>

#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include "internal.h"

static inline __attribute__((format(printf, 1, 2)))
void no_printk(const char *fmt, ...)
{
}

#if 0
#define kenter(FMT, ...) \
	printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) \
	printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) \
	printk(KERN_DEBUG "xxx" FMT "yyy\n", ##__VA_ARGS__)
#else
#define kenter(FMT, ...) \
	no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
#define kleave(FMT, ...) \
	no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
#define kdebug(FMT, ...) \
	no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
#endif
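
#if 0
/*
 * Illustrative sketch of how the debug macros above are used: kenter() logs
 * function entry with the argument format, kleave() logs exit, and kdebug()
 * logs a plain message.  example_trace() is a hypothetical function, not part
 * of the nommu code; the real callers are functions such as
 * __put_nommu_region() later in this file.
 */
static void example_trace(unsigned long start, unsigned long len)
{
	kenter("%lx,%lx", start, len);
	kdebug("- working on %lx", start);
	kleave(" = 0");
}
#endif
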
void *high_memory;
struct page *mem_map;
unsigned long max_mapnr;
unsigned long num_physpages;
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
int heap_stack_gap = 0;

atomic_long_t mmap_pages_allocated;

EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(num_physpages);

/* list of mapped, potentially shareable regions */
static struct kmem_cache *vm_region_jar;
struct rb_root nommu_region_tree = RB_ROOT;
DECLARE_RWSEM(nommu_region_sem);

struct vm_operations_struct generic_file_vm_ops = {
};

/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page.  Ugly, but necessary.
 */
int vmtruncate(struct inode *inode, loff_t offset)
{
	struct address_space *mapping = inode->i_mapping;
	unsigned long limit;

	if (inode->i_size < offset)
		goto do_expand;
	i_size_write(inode, offset);

	truncate_inode_pages(mapping, offset);
	goto out_truncate;

do_expand:
	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
	if (limit != RLIM_INFINITY && offset > limit)
		goto out_sig;
	if (offset > inode->i_sb->s_maxbytes)
		goto out;
	i_size_write(inode, offset);

out_truncate:
	if (inode->i_op->truncate)
		inode->i_op->truncate(inode);
	return 0;
out_sig:
	send_sig(SIGXFSZ, current, 0);
out:
	return -EFBIG;
}

EXPORT_SYMBOL(vmtruncate);

/*
 * Return the total memory allocated for this pointer, not
 * just what the caller asked for.
 *
 * Doesn't have to be accurate, i.e. may have races.
 */
unsigned int kobjsize(const void *objp)
{
	struct page *page;

	/*
	 * If the object we have should not have ksize performed on it,
	 * return size of 0
	 */
	if (!objp || !virt_addr_valid(objp))
		return 0;

	page = virt_to_head_page(objp);

	/*
	 * If the allocator sets PageSlab, we know the pointer came from
	 * kmalloc().
	 */
	if (PageSlab(page))
		return ksize(objp);

	/*
	 * If it's not a compound page, see if we have a matching VMA
	 * region. This test is intentionally done in reverse order,
	 * so if there's no VMA, we still fall through and hand back
	 * PAGE_SIZE for 0-order pages.
	 */
	if (!PageCompound(page)) {
		struct vm_area_struct *vma;

		vma = find_vma(current->mm, (unsigned long)objp);
		if (vma)
			return vma->vm_end - vma->vm_start;
	}

	/*
	 * The ksize() function is only guaranteed to work for pointers
	 * returned by kmalloc(). So handle arbitrary pointers here.
	 */
	return PAGE_SIZE << compound_order(page);
}
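
#if 0
/*
 * Illustrative sketch (hypothetical, not part of the nommu API): kobjsize()
 * reports the whole allocation backing a pointer.  For a slab object it
 * defers to ksize(); for anything else it reports the size of the compound
 * page, or a single page for 0-order allocations.
 */
static void example_kobjsize(void)
{
	void *p = kmalloc(13, GFP_KERNEL);

	if (p) {
		/* at least ksize(p): the full slab object size, not 13 */
		kdebug("kmalloc obj spans %u bytes", kobjsize(p));
		kfree(p);
	}
}
#endif
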
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int nr_pages, int flags,
		     struct page **pages, struct vm_area_struct **vmas)
{
	struct vm_area_struct *vma;
	unsigned long vm_flags;
	int i;
	int write = !!(flags & GUP_FLAGS_WRITE);
	int force = !!(flags & GUP_FLAGS_FORCE);
	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);

	/* calculate required read or write permissions.
	 * - if 'force' is set, we only require the "MAY" flags.
	 */
	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

	for (i = 0; i < nr_pages; i++) {
		vma = find_vma(mm, start);
		if (!vma)
			goto finish_or_fault;

		/* protect what we can, including chardevs */
		if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
		    (!ignore && !(vm_flags & vma->vm_flags)))
			goto finish_or_fault;

		if (pages) {
			pages[i] = virt_to_page(start);
			if (pages[i])
				page_cache_get(pages[i]);
		}
		if (vmas)
			vmas[i] = vma;
		start += PAGE_SIZE;
	}

	return i;

finish_or_fault:
	return i ? : -EFAULT;
}

/*
 * get a list of pages in an address range belonging to the specified process
 * and indicate the VMA that covers each page
 * - this is potentially dodgy as we may end up incrementing the page count of
 *   a slab page or a secondary page from a compound page
 * - don't permit access to VMAs that don't support it, such as I/O mappings
 */
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		   unsigned long start, int nr_pages, int write, int force,
		   struct page **pages, struct vm_area_struct **vmas)
{
	int flags = 0;

	if (write)
		flags |= GUP_FLAGS_WRITE;
	if (force)
		flags |= GUP_FLAGS_FORCE;

	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
}
EXPORT_SYMBOL(get_user_pages);
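
#if 0
/*
 * Illustrative sketch of a get_user_pages() caller (hypothetical; real users
 * live elsewhere in the kernel).  Under NOMMU the page is simply looked up
 * with virt_to_page(), but the reference taken on it must still be dropped
 * with page_cache_release() when the caller is done with it.
 */
static int example_pin_user_page(unsigned long uaddr)
{
	struct page *page;
	int ret;

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, uaddr, 1, 0, 0, &page, NULL);
	up_read(&current->mm->mmap_sem);

	if (ret < 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... access the page here ... */
	page_cache_release(page);
	return 0;
}
#endif
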
/**
 * follow_pfn - look up PFN at a user virtual address
 * @vma: memory mapping
 * @address: user virtual address
 * @pfn: location to store found PFN
 *
 * Only IO mappings and raw PFN mappings are allowed.
 *
 * Returns zero and the pfn at @pfn on success, -ve otherwise.
 */
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
	unsigned long *pfn)
{
	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
		return -EINVAL;

	*pfn = address >> PAGE_SHIFT;
	return 0;
}
EXPORT_SYMBOL(follow_pfn);
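
#if 0
/*
 * Illustrative sketch of a follow_pfn() caller (hypothetical).  Only VMAs
 * marked VM_IO or VM_PFNMAP are accepted; on NOMMU the PFN is derived
 * directly from the address since there is no page-table translation.
 */
static int example_lookup_pfn(struct vm_area_struct *vma, unsigned long addr)
{
	unsigned long pfn;
	int ret;

	ret = follow_pfn(vma, addr, &pfn);
	if (ret < 0)
		return ret;	/* not an IO or raw PFN mapping */

	kdebug("addr %lx -> pfn %lx", addr, pfn);
	return 0;
}
#endif
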
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;

void vfree(const void *addr)
{
	kfree(addr);
}
EXPORT_SYMBOL(vfree);

void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
	/*
	 * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
	 * returns only a logical address.
	 */
	return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
}
EXPORT_SYMBOL(__vmalloc);

void *vmalloc_user(unsigned long size)
{
	void *ret;

	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
			PAGE_KERNEL);
	if (ret) {
		struct vm_area_struct *vma;

		down_write(&current->mm->mmap_sem);
		vma = find_vma(current->mm, (unsigned long)ret);
		if (vma)
			vma->vm_flags |= VM_USERMAP;
		up_write(&current->mm->mmap_sem);
	}

	return ret;
}
EXPORT_SYMBOL(vmalloc_user);

struct page *vmalloc_to_page(const void *addr)
{
	return virt_to_page(addr);
}
EXPORT_SYMBOL(vmalloc_to_page);

unsigned long vmalloc_to_pfn(const void *addr)
{
	return page_to_pfn(virt_to_page(addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
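
#if 0
/*
 * Illustrative sketch (hypothetical): because __vmalloc() above is just a
 * kmalloc() wrapper on NOMMU, a vmalloc'd buffer is physically contiguous,
 * and vmalloc_to_page()/vmalloc_to_pfn() reduce to virt_to_page() and
 * page_to_pfn() on the same address.
 */
static void example_vmalloc_roundtrip(void)
{
	void *buf = vmalloc(2 * PAGE_SIZE);

	if (buf) {
		struct page *page = vmalloc_to_page(buf);

		kdebug("buf %p starts at pfn %lx", buf, page_to_pfn(page));
		vfree(buf);
	}
}
#endif
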
long vread(char *buf, char *addr, unsigned long count)
{
	memcpy(buf, addr, count);
	return count;
}

long vwrite(char *buf, char *addr, unsigned long count)
{
	/* Don't allow overflow */
	if ((unsigned long) addr + count < count)
		count = -(unsigned long) addr;

	memcpy(addr, buf, count);
	return (count);
}

/*
 *	vmalloc  -  allocate virtually contiguous memory
 *
 *	@size:		allocation size
 *
 *	Allocate enough pages to cover @size from the page level
 *	allocator and map them into contiguous kernel virtual space.
 *
 *	For tight control over page level allocator and protection flags
 *	use __vmalloc() instead.
 */
void *vmalloc(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}
EXPORT_SYMBOL(vmalloc);

void *vmalloc_node(unsigned long size, int node)
{
	return vmalloc(size);
}
EXPORT_SYMBOL(vmalloc_node);

#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/**
 *	vmalloc_exec  -  allocate virtually contiguous, executable memory
 *	@size:		allocation size
 *
 *	Kernel-internal function to allocate enough pages to cover @size from
 *	the page level allocator and map them into contiguous and
 *	executable kernel virtual space.
 *
 *	For tight control over page level allocator and protection flags
 *	use __vmalloc() instead.
 */
void *vmalloc_exec(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
}

/**
 * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
 *	@size:		allocation size
 *
 *	Allocate enough 32bit PA addressable pages to cover @size from the
 *	page level allocator and map them into contiguous kernel virtual space.
 */
void *vmalloc_32(unsigned long size)
{
	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}
EXPORT_SYMBOL(vmalloc_32);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 *	@size:		allocation size
 *
 * The resulting memory area is 32bit addressable and zeroed so it can be
 * mapped to userspace without leaking data.
 *
 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
 * remap_vmalloc_range() are permissible.
 */
void *vmalloc_32_user(unsigned long size)
{
	/*
	 * We'll have to sort out the ZONE_DMA bits for 64-bit,
	 * but for now this can simply use vmalloc_user() directly.
	 */
	return vmalloc_user(size);
}
EXPORT_SYMBOL(vmalloc_32_user);

void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
{
	BUG();
	return NULL;
}
EXPORT_SYMBOL(vmap);

void vunmap(const void *addr)
{
	BUG();
}
EXPORT_SYMBOL(vunmap);

void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
{
	BUG();
	return NULL;
}
EXPORT_SYMBOL(vm_map_ram);

void vm_unmap_ram(const void *mem, unsigned int count)
{
	BUG();
}
EXPORT_SYMBOL(vm_unmap_ram);

void vm_unmap_aliases(void)
{
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/*
 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
 * have one.
 */
void __attribute__((weak)) vmalloc_sync_all(void)
{
}

int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
		   struct page *page)
{
	return -EINVAL;
}
EXPORT_SYMBOL(vm_insert_page);

/*
 *  sys_brk() for the most part doesn't need the global kernel
 *  lock, except when an application is doing something nasty
 *  like trying to un-brk an area that has already been mapped
 *  to a regular file.  in this case, the unmapping will need
 *  to invoke file system routines that need the global lock.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
	struct mm_struct *mm = current->mm;

	if (brk < mm->start_brk || brk > mm->context.end_brk)
		return mm->brk;

	if (mm->brk == brk)
		return mm->brk;

	/*
	 * Always allow shrinking brk
	 */
	if (brk <= mm->brk) {
		mm->brk = brk;
		return brk;
	}

	/*
	 * Ok, looks good - let it rip.
	 */
	return mm->brk = brk;
}

/*
 * initialise the VMA and region record slabs
 */
void __init mmap_init(void)
{
	int ret;

	ret = percpu_counter_init(&vm_committed_as, 0);
	VM_BUG_ON(ret);
	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
}

/*
 * validate the region tree
 * - the caller must hold the region lock
 */
#ifdef CONFIG_DEBUG_NOMMU_REGIONS
static noinline void validate_nommu_regions(void)
{
	struct vm_region *region, *last;
	struct rb_node *p, *lastp;

	lastp = rb_first(&nommu_region_tree);
	if (!lastp)
		return;

	last = rb_entry(lastp, struct vm_region, vm_rb);
	BUG_ON(unlikely(last->vm_end <= last->vm_start));
	BUG_ON(unlikely(last->vm_top < last->vm_end));

	while ((p = rb_next(lastp))) {
		region = rb_entry(p, struct vm_region, vm_rb);
		last = rb_entry(lastp, struct vm_region, vm_rb);

		BUG_ON(unlikely(region->vm_end <= region->vm_start));
		BUG_ON(unlikely(region->vm_top < region->vm_end));
		BUG_ON(unlikely(region->vm_start < last->vm_top));

		lastp = p;
	}
}
#else
static void validate_nommu_regions(void)
{
}
#endif

/*
 * add a region into the global tree
 */
static void add_nommu_region(struct vm_region *region)
{
	struct vm_region *pregion;
	struct rb_node **p, *parent;

	validate_nommu_regions();

	parent = NULL;
	p = &nommu_region_tree.rb_node;
	while (*p) {
		parent = *p;
		pregion = rb_entry(parent, struct vm_region, vm_rb);
		if (region->vm_start < pregion->vm_start)
			p = &(*p)->rb_left;
		else if (region->vm_start > pregion->vm_start)
			p = &(*p)->rb_right;
		else if (pregion == region)
			return;
		else
			BUG();
	}

	rb_link_node(&region->vm_rb, parent, p);
	rb_insert_color(&region->vm_rb, &nommu_region_tree);

	validate_nommu_regions();
}
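
#if 0
/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * region tree built by add_nommu_region() is keyed on vm_start, so a region
 * can be found again with an ordinary rbtree descent.  A real caller would
 * need to hold nommu_region_sem.
 */
static struct vm_region *example_lookup_nommu_region(unsigned long start)
{
	struct rb_node *n = nommu_region_tree.rb_node;
	struct vm_region *region;

	while (n) {
		region = rb_entry(n, struct vm_region, vm_rb);
		if (start < region->vm_start)
			n = n->rb_left;
		else if (start > region->vm_start)
			n = n->rb_right;
		else
			return region;
	}
	return NULL;
}
#endif
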
/*
 * delete a region from the global tree
 */
static void delete_nommu_region(struct vm_region *region)
{
	BUG_ON(!nommu_region_tree.rb_node);

	validate_nommu_regions();
	rb_erase(&region->vm_rb, &nommu_region_tree);
	validate_nommu_regions();
}

/*
 * free a contiguous series of pages
 */
static void free_page_series(unsigned long from, unsigned long to)
{
	for (; from < to; from += PAGE_SIZE) {
		struct page *page = virt_to_page(from);

		kdebug("- free %lx", from);
		atomic_long_dec(&mmap_pages_allocated);
		if (page_count(page) != 1)
			kdebug("free page %p: refcount not one: %d",
			       page, page_count(page));
		put_page(page);
	}
}

/*
 * release a reference to a region
 * - the caller must hold the region semaphore for writing, which this releases
 * - the region may not have been added to the tree yet, in which case vm_top
 *   will equal vm_start
 */
static void __put_nommu_region(struct vm_region *region)
	__releases(nommu_region_sem)
{
	kenter("%p{%d}", region, atomic_read(&region->vm_usage));

	BUG_ON(!nommu_region_tree.rb_node);

	if (atomic_dec_and_test(&region->vm_usage)) {
		if (region->vm_top > region->vm_start)
			delete_nommu_region(region);
		up_write(&nommu_region_sem);

		if (region->vm_file)
			fput(region->vm_file);

		/* IO memory and memory shared directly out of the pagecache
		 * from ramfs/tmpfs mustn't be released here */
		if (region->vm_flags & VM_MAPPED_COPY) {
			kdebug("free series");
			free_page_series(region->vm_start, region->vm_top);
		}
		kmem_cache_free(vm_region_jar, region);
	} else {
		up_write(&nommu_region_sem);
	}
}

/*
 * release a reference to a region
 */
static void put_nommu_region(struct vm_region *region)
{
	down_write(&nommu_region_sem);
	__put_nommu_region(region);
}

/*
 * add a VMA into a process's mm_struct in the appropriate place in the list
 * and tree and add to the address space's page tree also if not an anonymous
 * page
 * - should be called with mm->mmap_sem held writelocked
 */
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
	struct vm_area_struct *pvma, **pp;
	struct address_space *mapping;
	struct rb_node **p, *parent;

	kenter(",%p", vma);

	BUG_ON(!vma->vm_region);

	mm->map_count++;
	vma->vm_mm = mm;

	/* add the VMA to the mapping */
	if (vma->vm_file) {
		mapping = vma->vm_file->f_mapping;

		flush_dcache_mmap_lock(mapping);
		vma_prio_tree_insert(vma, &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
	}

	/* add the VMA to the tree */
	parent = NULL;
	p = &mm->mm_rb.rb_node;
	while (*p) {
		parent = *p;
		pvma = rb_entry(parent, struct vm_area_struct, vm_rb);

		/* sort by: start addr, end addr, VMA struct addr in that order
		 * (the latter is necessary as we may get identical VMAs) */
		if (vma->vm_start < pvma->vm_start)
			p = &(*p)->rb_left;
		else if (vma->vm_start > pvma->vm_start)
			p = &(*p)->rb_right;
		else if (vma->vm_end < pvma->vm_end)
			p = &(*p)->rb_left;
		else if (vma->vm_end > pvma->vm_end)
			p = &(*p)->rb_right;
		else if (vma < pvma)
			p = &(*p)->rb_left;
		else if (vma > pvma)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&vma->vm_rb, parent, p);
	rb_insert_color(&vma->vm_rb, &mm->mm_rb);

	/* add VMA to the VMA list also */
	for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) {
		if (pvma->vm_start > vma->vm_start)
			break;
		if (pvma->vm_start < vma->vm_start)
			continue;
		if (pvma->vm_end < vma->vm_end)
			break;
	}

	vma->vm_next = *pp;
	*pp = vma;
}
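
#if 0
/*
 * Illustrative sketch (hypothetical comparator, unused): the ordering used
 * when inserting VMAs above, expressed as a stand-alone function.  It simply
 * mirrors the start/end/struct-address sort order described in the comment
 * in add_vma_to_mm().
 */
static int example_vma_cmp(struct vm_area_struct *a, struct vm_area_struct *b)
{
	if (a->vm_start != b->vm_start)
		return a->vm_start < b->vm_start ? -1 : 1;
	if (a->vm_end != b->vm_end)
		return a->vm_end < b->vm_end ? -1 : 1;
	if (a != b)
		return a < b ? -1 : 1;
	return 0;
}
#endif
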
/*
 * delete a VMA from its owning mm_struct and address space
 */
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
	struct vm_area_struct **pp;
	struct address_space *mapping;
	struct mm_struct *mm = vma->vm_mm;

	kenter("%p", vma);

	mm->map_count--;
	if (mm->mmap_cache == vma)
		mm->mmap_cache = NULL;

	/* remove the VMA from the mapping */
	if (vma->vm_file) {
		mapping = vma->vm_file->f_mapping;

		flush_dcache_mmap_lock(mapping);
		vma_prio_tree_remove(vma, &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
	}

	/* remove from the MM's tree and list */
	rb_erase(&vma->vm_rb, &mm->mm_rb);
	for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) {
		if (*pp == vma) {
			*pp = vma->vm_next;
			break;
		}
	}

	vma->vm_mm = NULL;
}

/*
 * destroy a VMA record
 */
static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
	kenter("%p", vma);
	if (vma->vm_ops && vma->vm_ops->close)
		vma->vm_ops->close(vma);
	if (vma->vm_file) {
		fput(vma->vm_file);
		if (vma->vm_flags & VM_EXECUTABLE)
			removed_exe_file_vma(mm);
	}
	put_nommu_region(vma->vm_region);
	kmem_cache_free(vm_area_cachep, vma);
}

/*
 * look up the first VMA in which addr resides, NULL if none
 * - should be called with mm->mmap_sem at least held readlocked
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	struct rb_node *n = mm->mm_rb.rb_node;

	/* check the cache first */
	vma = mm->mmap_cache;
	if (vma && vma->vm_start <= addr && vma->vm_end > addr)
		return vma;

	/* trawl the tree (there may be multiple mappings in which addr
	 * resides) */
	for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
		vma = rb_entry(n, struct vm_area_struct, vm_rb);
		if (vma->vm_start > addr)
			return NULL;
		if (vma->vm_end > addr) {
			mm->mmap_cache = vma;
			return vma;
		}
	}

	return NULL;
}
EXPORT_SYMBOL(find_vma);
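
#if 0
/*
 * Illustrative sketch of a find_vma() caller (hypothetical).  The lookup must
 * be done with mm->mmap_sem held at least for reading, and the returned VMA
 * is only known to satisfy vm_end > addr, so vm_start still has to be
 * checked if the address must lie inside the mapping.
 */
static int example_addr_is_mapped(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	int mapped = 0;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);
	if (vma && vma->vm_start <= addr)
		mapped = 1;
	up_read(&mm->mmap_sem);

	return mapped;
}
#endif
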
/*
 * find a VMA
 * - we don't extend stack VMAs under NOMMU conditions
 */
struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
	return find_vma(mm, addr);
}

/*
 * expand a stack to a given address
 * - not supported under NOMMU conditions
 */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
	return -ENOMEM;
}

/*
 * look up the first VMA that exactly matches addr
 * - should be called with mm->mmap_sem at least held readlocked
 */
static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
					     unsigned long addr,
					     unsigned long len)
{
	struct vm_area_struct *vma;
	struct rb_node *n = mm->mm_rb.rb_node;
	unsigned long end = addr + len;

	/* check the cache first */
	vma = mm->mmap_cache;
	if (vma && vma->vm_start == addr && vma->vm_end == end)
		return vma;

	/* trawl the tree (there may be multiple mappings in which addr
	 * resides) */
	for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
		vma = rb_entry(n, struct vm_area_struct, vm_rb);
		if (vma->vm_start < addr)
			continue;
		if (vma->vm_start > addr)
			return NULL;
		if (vma->vm_end == end) {
			mm->mmap_cache = vma;
			return vma;
		}
	}

	return NULL;
}
/*
* determine whether a mapping should be permitted and , if so , what sort of
* mapping we ' re capable of supporting
*/
static int validate_mmap_request ( struct file * file ,
unsigned long addr ,
unsigned long len ,
unsigned long prot ,
unsigned long flags ,
unsigned long pgoff ,
unsigned long * _capabilities )
{
2009-01-08 15:04:47 +03:00
unsigned long capabilities , rlen ;
2005-04-17 02:20:36 +04:00
unsigned long reqprot = prot ;
int ret ;
/* do the simple checks first */
if ( flags & MAP_FIXED | | addr ) {
printk ( KERN_DEBUG
" %d: Can't do fixed-address/overlay mmap of RAM \n " ,
current - > pid ) ;
return - EINVAL ;
}
if ( ( flags & MAP_TYPE ) ! = MAP_PRIVATE & &
( flags & MAP_TYPE ) ! = MAP_SHARED )
return - EINVAL ;
2006-12-06 05:02:59 +03:00
if ( ! len )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
2006-12-06 05:02:59 +03:00
/* Careful about overflows.. */
2009-01-08 15:04:47 +03:00
rlen = PAGE_ALIGN ( len ) ;
if ( ! rlen | | rlen > TASK_SIZE )
2006-12-06 05:02:59 +03:00
return - ENOMEM ;
2005-04-17 02:20:36 +04:00
/* offset overflow? */
2009-01-08 15:04:47 +03:00
if ( ( pgoff + ( rlen > > PAGE_SHIFT ) ) < pgoff )
2006-12-06 05:02:59 +03:00
return - EOVERFLOW ;
2005-04-17 02:20:36 +04:00
if ( file ) {
/* validate file mapping requests */
struct address_space * mapping ;
/* files must support mmap */
if ( ! file - > f_op | | ! file - > f_op - > mmap )
return - ENODEV ;
/* work out if what we've got could possibly be shared
* - we support chardevs that provide their own " memory "
* - we support files / blockdevs that are memory backed
*/
mapping = file - > f_mapping ;
if ( ! mapping )
2006-12-08 13:37:21 +03:00
mapping = file - > f_path . dentry - > d_inode - > i_mapping ;
2005-04-17 02:20:36 +04:00
capabilities = 0 ;
if ( mapping & & mapping - > backing_dev_info )
capabilities = mapping - > backing_dev_info - > capabilities ;
if ( ! capabilities ) {
/* no explicit capabilities set, so assume some
* defaults */
2006-12-08 13:37:21 +03:00
switch ( file - > f_path . dentry - > d_inode - > i_mode & S_IFMT ) {
2005-04-17 02:20:36 +04:00
case S_IFREG :
case S_IFBLK :
capabilities = BDI_CAP_MAP_COPY ;
break ;
case S_IFCHR :
capabilities =
BDI_CAP_MAP_DIRECT |
BDI_CAP_READ_MAP |
BDI_CAP_WRITE_MAP ;
break ;
default :
return - EINVAL ;
}
}
		/* eliminate any capabilities that we can't support on this
		 * device */
		if (!file->f_op->get_unmapped_area)
			capabilities &= ~BDI_CAP_MAP_DIRECT;
		if (!file->f_op->read)
			capabilities &= ~BDI_CAP_MAP_COPY;

		/* The file shall have been opened with read permission. */
		if (!(file->f_mode & FMODE_READ))
			return -EACCES;

		if (flags & MAP_SHARED) {
			/* do checks for writing, appending and locking */
			if ((prot & PROT_WRITE) &&
			    !(file->f_mode & FMODE_WRITE))
				return -EACCES;

			if (IS_APPEND(file->f_path.dentry->d_inode) &&
			    (file->f_mode & FMODE_WRITE))
				return -EACCES;

			if (locks_verify_locked(file->f_path.dentry->d_inode))
				return -EAGAIN;

			if (!(capabilities & BDI_CAP_MAP_DIRECT))
				return -ENODEV;

			if (((prot & PROT_READ)  && !(capabilities & BDI_CAP_READ_MAP))  ||
			    ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
			    ((prot & PROT_EXEC)  && !(capabilities & BDI_CAP_EXEC_MAP))
			    ) {
				printk("MAP_SHARED not completely supported on !MMU\n");
				return -EINVAL;
			}

			/* we mustn't privatise shared mappings */
			capabilities &= ~BDI_CAP_MAP_COPY;
		}
		else {
			/* we're going to read the file into private memory we
			 * allocate */
			if (!(capabilities & BDI_CAP_MAP_COPY))
				return -ENODEV;

			/* we don't permit a private writable mapping to be
			 * shared with the backing device */
			if (prot & PROT_WRITE)
				capabilities &= ~BDI_CAP_MAP_DIRECT;
		}
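		/* Example (illustrative): on no-MMU,
		 *	mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0)
		 * on a regular file of an ordinary (non memory-backed)
		 * filesystem fails above with -ENODEV, because such a file
		 * only offers BDI_CAP_MAP_COPY and a shared mapping needs
		 * BDI_CAP_MAP_DIRECT; the equivalent MAP_PRIVATE request
		 * succeeds via the private-copy path instead. */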
		/* handle executable mappings and implied executable
		 * mappings */
		if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
			if (prot & PROT_EXEC)
				return -EPERM;
		}
		else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
			/* handle implication of PROT_EXEC by PROT_READ */
			if (current->personality & READ_IMPLIES_EXEC) {
				if (capabilities & BDI_CAP_EXEC_MAP)
					prot |= PROT_EXEC;
			}
		}
		else if ((prot & PROT_READ) &&
			 (prot & PROT_EXEC) &&
			 !(capabilities & BDI_CAP_EXEC_MAP)
			 ) {
			/* backing file is not executable, try to copy */
			capabilities &= ~BDI_CAP_MAP_DIRECT;
		}
	}
	else {
		/* anonymous mappings are always memory backed and can be
		 * privately mapped
		 */
		capabilities = BDI_CAP_MAP_COPY;

		/* handle PROT_EXEC implication by PROT_READ */
		if ((prot & PROT_READ) &&
		    (current->personality & READ_IMPLIES_EXEC))
			prot |= PROT_EXEC;
	}
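	/* Note (added): with the READ_IMPLIES_EXEC personality bit set, a
	 * plain PROT_READ request ends up with PROT_EXEC added as well (for
	 * file mappings only where the backing object supports executable
	 * mappings), mirroring what the MMU implementation does. */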
	/* allow the security API to have its say */
	ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
	if (ret < 0)
		return ret;

	/* looks okay */
	*_capabilities = capabilities;
	return 0;
}
/*
 * we've determined that we can make the mapping, now translate what we
 * now know into VMA flags
 */
static unsigned long determine_vm_flags(struct file *file,
					unsigned long prot,
					unsigned long flags,
					unsigned long capabilities)
{
	unsigned long vm_flags;

	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
	vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
	/* vm_flags |= mm->def_flags; */

	if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
		/* attempt to share read-only copies of mapped file chunks */
		if (file && !(prot & PROT_WRITE))
			vm_flags |= VM_MAYSHARE;
	}
	else {
		/* overlay a shareable mapping on the backing device or inode
		 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
		 * romfs/cramfs */
		if (flags & MAP_SHARED)
			vm_flags |= VM_MAYSHARE | VM_SHARED;
		else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0)
			vm_flags |= VM_MAYSHARE;
	}
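	/* Note on the test above (added): BDI_CAP_VMFLAGS is the mask of
	 * BDI_CAP_{READ,WRITE,EXEC}_MAP, whose values are chosen to match
	 * VM_MAY{READ,WRITE,EXEC}; the expression therefore checks that every
	 * VM_MAY* right we carry is backed by a matching capability before a
	 * non-MAP_SHARED mapping is marked as potentially shareable. */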
	/* refuse to let anyone share private mappings with this process if
	 * it's being traced - otherwise breakpoints set in it may interfere
	 * with another untraced process
	 */
	if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
		vm_flags &= ~VM_MAYSHARE;

	return vm_flags;
}
/*
 * set up a shared mapping on a file (the driver or filesystem provides and
 * pins the storage)
 */
static int do_mmap_shared_file(struct vm_area_struct *vma)
{
	int ret;

	ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
	if (ret == 0) {
		vma->vm_region->vm_top = vma->vm_region->vm_end;
		return ret;
	}
	if (ret != -ENOSYS)
		return ret;

	/* getting an ENOSYS error indicates that direct mmap isn't
	 * possible (as opposed to tried but failed) so we'll fall
	 * through to making a private copy of the data and mapping
	 * that if we can */
	return -ENODEV;
}
/*
 * set up a private mapping or an anonymous shared mapping
 */
static int do_mmap_private(struct vm_area_struct *vma,
			   struct vm_region *region,
			   unsigned long len)
{
	struct page *pages;
	unsigned long total, point, n, rlen;
	void *base;
	int ret, order;

	/* invoke the file's mapping function so that it can keep track of
	 * shared mappings on devices or memory
	 * - VM_MAYSHARE will be set if it may attempt to share
	 */
	if (vma->vm_file) {
		ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
		if (ret == 0) {
			/* shouldn't return success if we're not sharing */
			BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
			vma->vm_region->vm_top = vma->vm_region->vm_end;
			return ret;
		}
		if (ret != -ENOSYS)
			return ret;

		/* getting an ENOSYS error indicates that direct mmap isn't
		 * possible (as opposed to tried but failed) so we'll try to
		 * make a private copy of the data and map that instead */
	}
	rlen = PAGE_ALIGN(len);

	/* allocate some memory to hold the mapping
	 * - note that this may not return a page-aligned address if the object
	 *   we're allocating is smaller than a page
	 */
	order = get_order(rlen);
	kdebug("alloc order %d for %lx", order, len);

	pages = alloc_pages(GFP_KERNEL, order);
	if (!pages)
		goto enomem;

	total = 1 << order;
	atomic_long_add(total, &mmap_pages_allocated);

	point = rlen >> PAGE_SHIFT;

	/* we allocated a power-of-2 sized page set, so we may want to trim off
	 * the excess */
	if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
		while (total > point) {
			order = ilog2(total - point);
			n = 1 << order;
			kdebug("shave %lu/%lu @%lu", n, total - point, total);
			atomic_long_sub(n, &mmap_pages_allocated);
			total -= n;
			set_page_refcounted(pages + total);
			__free_pages(pages + total, order);
		}
	}

	for (point = 1; point < total; point++)
		set_page_refcounted(&pages[point]);
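	/* Worked example (illustrative, assuming trimming is enabled via
	 * sysctl_nr_trim_pages): a five-page request gives order = 3, so
	 * eight pages are allocated; point = 5, and the trimming loop above
	 * first shaves a two-page block (ilog2(3) = 1) and then a single
	 * page, leaving exactly the five pages actually needed. */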
	base = page_address(pages);
	region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
	region->vm_start = (unsigned long) base;
	region->vm_end   = region->vm_start + rlen;
	region->vm_top   = region->vm_start + (total << PAGE_SHIFT);

	vma->vm_start = region->vm_start;
	vma->vm_end   = region->vm_start + len;

	if (vma->vm_file) {
		/* read the contents of a file into the copy */
		mm_segment_t old_fs;
		loff_t fpos;

		fpos = vma->vm_pgoff;
		fpos <<= PAGE_SHIFT;

		old_fs = get_fs();
		set_fs(KERNEL_DS);
		ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos);
		set_fs(old_fs);

		if (ret < 0)
			goto error_free;

		/* clear the last little bit */
		if (ret < rlen)
			memset(base + ret, 0, rlen - ret);

	} else {
		/* if it's an anonymous mapping, then just clear it */
		memset(base, 0, rlen);
	}
	return 0;

error_free:
	free_page_series(region->vm_start, region->vm_end);
	region->vm_start = vma->vm_start = 0;
	region->vm_end   = vma->vm_end = 0;
	region->vm_top   = 0;
	return ret;

enomem:
	printk("Allocation of length %lu from process %d (%s) failed\n",
	       len, current->pid, current->comm);
	show_free_areas();
	return -ENOMEM;
}
/*
 * handle mapping creation for uClinux
 */
unsigned long do_mmap_pgoff(struct file *file,
			    unsigned long addr,
			    unsigned long len,
			    unsigned long prot,
			    unsigned long flags,
			    unsigned long pgoff)
{
	struct vm_area_struct *vma;
	struct vm_region *region;
	struct rb_node *rb;
	unsigned long capabilities, vm_flags, result;
	int ret;

	kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);

	if (!(flags & MAP_FIXED))
		addr = round_hint_to_min(addr);

	/* decide whether we should attempt the mapping, and if so what sort of
	 * mapping */
	ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
				    &capabilities);
	if (ret < 0) {
		kleave(" = %d [val]", ret);
		return ret;
	}
	/* we've determined that we can make the mapping, now translate what we
	 * now know into VMA flags */
	vm_flags = determine_vm_flags(file, prot, flags, capabilities);

	/* we're going to need to record the mapping */
	region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
	if (!region)
		goto error_getting_region;

	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma)
		goto error_getting_vma;

	atomic_set(&region->vm_usage, 1);
	region->vm_flags = vm_flags;
	region->vm_pgoff = pgoff;

	INIT_LIST_HEAD(&vma->anon_vma_node);
	vma->vm_flags = vm_flags;
	vma->vm_pgoff = pgoff;

	if (file) {
		region->vm_file = file;
		get_file(file);
		vma->vm_file = file;
		get_file(file);
		if (vm_flags & VM_EXECUTABLE) {
			added_exe_file_vma(current->mm);
			vma->vm_mm = current->mm;
		}
	}
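	/* Note (added): two file references are taken above - one owned by
	 * the region and one by the VMA - and each is dropped separately on
	 * the error paths below. */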
	down_write(&nommu_region_sem);

	/* if we want to share, we need to check for regions created by other
	 * mmap() calls that overlap with our proposed mapping
	 * - we can only share with a superset match on most regular files
	 * - shared mappings on character devices and memory backed files are
	 *   permitted to overlap inexactly as far as we are concerned; in
	 *   these cases, sharing is handled in the driver or filesystem rather
	 *   than here
	 */
	if (vm_flags & VM_MAYSHARE) {
		struct vm_region *pregion;
		unsigned long pglen, rpglen, pgend, rpgend, start;

		pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		pgend = pgoff + pglen;

		for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
			pregion = rb_entry(rb, struct vm_region, vm_rb);

			if (!(pregion->vm_flags & VM_MAYSHARE))
				continue;

			/* search for overlapping mappings on the same file */
			if (pregion->vm_file->f_path.dentry->d_inode !=
			    file->f_path.dentry->d_inode)
				continue;

			if (pregion->vm_pgoff >= pgend)
				continue;

			rpglen = pregion->vm_end - pregion->vm_start;
			rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
			rpgend = pregion->vm_pgoff + rpglen;
			if (pgoff >= rpgend)
				continue;

			/* handle inexactly overlapping matches between
			 * mappings */
			if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
			    !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
				/* new mapping is not a subset of the region */
				if (!(capabilities & BDI_CAP_MAP_DIRECT))
					goto sharing_violation;
				continue;
			}
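			/* Illustrative example: an existing shareable region
			 * covering file pages 0-7 can be reused by a new
			 * request for pages 2-5 (a subset), whereas a request
			 * for pages 6-9 overlaps only partially and is either
			 * retried against later regions (if the backing
			 * device can map directly) or reported as a sharing
			 * violation. */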
			/* we've found a region we can share */
			atomic_inc(&pregion->vm_usage);
			vma->vm_region = pregion;
			start = pregion->vm_start;
			start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
			vma->vm_start = start;
			vma->vm_end = start + len;

			if (pregion->vm_flags & VM_MAPPED_COPY) {
				kdebug("share copy");
				vma->vm_flags |= VM_MAPPED_COPY;
			} else {
				kdebug("share mmap");
				ret = do_mmap_shared_file(vma);
				if (ret < 0) {
					vma->vm_region = NULL;
					vma->vm_start = 0;
					vma->vm_end = 0;
					atomic_dec(&pregion->vm_usage);
					pregion = NULL;
					goto error_just_free;
				}
			}
			fput(region->vm_file);
			kmem_cache_free(vm_region_jar, region);
			region = pregion;
			result = start;
			goto share;
		}
		/* obtain the address at which to make a shared mapping
		 * - this is the hook for quasi-memory character devices to
		 *   tell us the location of a shared mapping
		 */
		if (file && file->f_op->get_unmapped_area) {
			addr = file->f_op->get_unmapped_area(file, addr, len,
							     pgoff, flags);
			if (IS_ERR((void *) addr)) {
				ret = addr;
				if (ret != (unsigned long) -ENOSYS)
					goto error_just_free;

				/* the driver refused to tell us where to site
				 * the mapping so we'll have to attempt to copy
				 * it */
				ret = (unsigned long) -ENODEV;
				if (!(capabilities & BDI_CAP_MAP_COPY))
					goto error_just_free;

				capabilities &= ~BDI_CAP_MAP_DIRECT;
			} else {
				vma->vm_start = region->vm_start = addr;
				vma->vm_end = region->vm_end = addr + len;
			}
		}
	}
	vma->vm_region = region;
	add_nommu_region(region);

	/* set up the mapping */
	if (file && vma->vm_flags & VM_SHARED)
		ret = do_mmap_shared_file(vma);
	else
		ret = do_mmap_private(vma, region, len);
	if (ret < 0)
		goto error_put_region;

	/* okay... we have a mapping; now we have to register it */
	result = vma->vm_start;

	current->mm->total_vm += len >> PAGE_SHIFT;

share:
	add_vma_to_mm(current->mm, vma);

	up_write(&nommu_region_sem);
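	/* Note (added): for executable mappings the instruction cache is
	 * flushed over the new range below, since the data just copied or
	 * mapped in may still be stale in the I-cache on architectures with
	 * separate instruction and data caches. */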
	if (prot & PROT_EXEC)
		flush_icache_range(result, result + len);

	kleave(" = %lx", result);
	return result;
error_put_region:
	__put_nommu_region(region);
	if (vma) {
		if (vma->vm_file) {
			fput(vma->vm_file);
			if (vma->vm_flags & VM_EXECUTABLE)
				removed_exe_file_vma(vma->vm_mm);
		}
		kmem_cache_free(vm_area_cachep, vma);
	}
	kleave(" = %d [pr]", ret);
	return ret;
error_just_free:
	up_write(&nommu_region_sem);
error:
	fput(region->vm_file);
	kmem_cache_free(vm_region_jar, region);
	fput(vma->vm_file);
	if (vma->vm_flags & VM_EXECUTABLE)
		removed_exe_file_vma(vma->vm_mm);
	kmem_cache_free(vm_area_cachep, vma);
	kleave(" = %d", ret);
	return ret;

sharing_violation:
	up_write(&nommu_region_sem);
	printk(KERN_WARNING "Attempt to share mismatched mappings\n");
	ret = -EINVAL;
	goto error;

error_getting_vma:
	kmem_cache_free(vm_region_jar, region);
	printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
	       " from process %d failed\n",
	       len, current->pid);
	show_free_areas();
	return -ENOMEM;

error_getting_region:
	printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
	       " from process %d failed\n",
	       len, current->pid);
	show_free_areas();
	return -ENOMEM;
}
EXPORT_SYMBOL(do_mmap_pgoff);
/*
 * split a vma into two pieces at address 'addr', a new vma is allocated either
 * for the first part or the tail.
 */
int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
	      unsigned long addr, int new_below)
{
	struct vm_area_struct *new;
	struct vm_region *region;
	unsigned long npages;

	kenter("");

	/* we're only permitted to split anonymous regions that have a single
	 * owner */
	if (vma->vm_file ||
	    atomic_read(&vma->vm_region->vm_usage) != 1)
		return -ENOMEM;

	if (mm->map_count >= sysctl_max_map_count)
		return -ENOMEM;

	region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
	if (!new) {
		kmem_cache_free(vm_region_jar, region);
		return -ENOMEM;
	}

	/* most fields are the same, copy all, and then fixup */
	*new = *vma;
	*region = *vma->vm_region;
	new->vm_region = region;

	npages = (addr - vma->vm_start) >> PAGE_SHIFT;

	if (new_below) {
		region->vm_top = region->vm_end = new->vm_end = addr;
	} else {
		region->vm_start = new->vm_start = addr;
		region->vm_pgoff = new->vm_pgoff += npages;
	}
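	/* Illustrative note: with new_below set, 'new' becomes the lower
	 * piece (ending at addr) and the original VMA is trimmed below to
	 * start at addr; otherwise 'new' covers [addr, old end) and the
	 * original keeps the lower part. In either case the upper piece has
	 * its page offset advanced by 'npages' so it still refers to the
	 * correct part of the backing region. */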
	if (new->vm_ops && new->vm_ops->open)
		new->vm_ops->open(new);

	delete_vma_from_mm(vma);
	down_write(&nommu_region_sem);
	delete_nommu_region(vma->vm_region);
	if (new_below) {
		vma->vm_region->vm_start = vma->vm_start = addr;
		vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
	} else {
		vma->vm_region->vm_end = vma->vm_end = addr;
		vma->vm_region->vm_top = addr;
	}
	add_nommu_region(vma->vm_region);
	add_nommu_region(new->vm_region);
	up_write(&nommu_region_sem);
	add_vma_to_mm(mm, vma);
	add_vma_to_mm(mm, new);
	return 0;
}
/*
 * shrink a VMA by removing the specified chunk from either the beginning or
 * the end
 */
static int shrink_vma(struct mm_struct *mm,
		      struct vm_area_struct *vma,
		      unsigned long from, unsigned long to)
{
	struct vm_region *region;

	kenter("");

	/* adjust the VMA's pointers, which may reposition it in the MM's tree
	 * and list */
	delete_vma_from_mm(vma);
	if (from > vma->vm_start)
		vma->vm_end = from;
	else
		vma->vm_start = to;
	add_vma_to_mm(mm, vma);

	/* cut the backing region down to size */
	region = vma->vm_region;
	BUG_ON(atomic_read(&region->vm_usage) != 1);

	down_write(&nommu_region_sem);
	delete_nommu_region(region);
	if (from > region->vm_start) {
		to = region->vm_top;
		region->vm_top = region->vm_end = from;
	} else {
		region->vm_start = to;
	}
	add_nommu_region(region);
	up_write(&nommu_region_sem);

	free_page_series(from, to);
	return 0;
}
/*
 * release a mapping
 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
 *   VMA, though it need not cover the whole VMA
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
	struct vm_area_struct *vma;
	struct rb_node *rb;
	unsigned long end = start + len;
	int ret;

	kenter(",%lx,%zx", start, len);

	if (len == 0)
		return -EINVAL;
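	/* Note (added): broadly speaking, a no-MMU munmap() must either
	 * remove a whole file-backed VMA exactly, or remove a page-aligned
	 * chunk from a single anonymous VMA (splitting it first if the chunk
	 * lies in the middle); ranges spanning several VMAs, which an MMU
	 * kernel would accept, are rejected below. */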
	/* find the first potentially overlapping VMA */
	vma = find_vma(mm, start);
	if (!vma) {
		static int limit = 0;
		if (limit < 5) {
			printk(KERN_WARNING
			       "munmap of memory not mmapped by process %d"
			       " (%s): 0x%lx-0x%lx\n",
			       current->pid, current->comm,
			       start, start + len - 1);
			limit++;
		}
		return -EINVAL;
	}
	/* we're allowed to split an anonymous VMA but not a file-backed one */
	if (vma->vm_file) {
		do {
			if (start > vma->vm_start) {
				kleave(" = -EINVAL [miss]");
				return -EINVAL;
			}
			if (end == vma->vm_end)
				goto erase_whole_vma;
			rb = rb_next(&vma->vm_rb);
			vma = rb_entry(rb, struct vm_area_struct, vm_rb);
		} while (rb);
		kleave(" = -EINVAL [split file]");
		return -EINVAL;
	} else {
		/* the chunk must be a subset of the VMA found */
		if (start == vma->vm_start && end == vma->vm_end)
			goto erase_whole_vma;
		if (start < vma->vm_start || end > vma->vm_end) {
			kleave(" = -EINVAL [superset]");
			return -EINVAL;
		}
		if (start & ~PAGE_MASK) {
			kleave(" = -EINVAL [unaligned start]");
			return -EINVAL;
		}
		if (end != vma->vm_end && end & ~PAGE_MASK) {
			kleave(" = -EINVAL [unaligned split]");
			return -EINVAL;
		}
		if (start != vma->vm_start && end != vma->vm_end) {
			ret = split_vma(mm, vma, start, 1);
			if (ret < 0) {
				kleave(" = %d [split]", ret);
				return ret;
			}
		}
		return shrink_vma(mm, vma, start, end);
	}

erase_whole_vma:
	delete_vma_from_mm(vma);
	delete_vma(mm, vma);
	kleave(" = 0");
	return 0;
}
EXPORT_SYMBOL(do_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
	int ret;
	struct mm_struct *mm = current->mm;

	down_write(&mm->mmap_sem);
	ret = do_munmap(mm, addr, len);
	up_write(&mm->mmap_sem);
	return ret;
}
/*
 * release all the mappings made in a process's VM space
 */
void exit_mmap(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	if (!mm)
		return;

	kenter("");

	mm->total_vm = 0;

	while ((vma = mm->mmap)) {
		mm->mmap = vma->vm_next;
		delete_vma_from_mm(vma);
		delete_vma(mm, vma);
	}

	kleave("");
}
unsigned long do_brk(unsigned long addr, unsigned long len)
{
	return -ENOMEM;
}
/*
 * expand (or shrink) an existing mapping, potentially moving it at the same
 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * under NOMMU conditions, we only permit changing a mapping's size, and only
 * as long as it stays within the region allocated by do_mmap_private() and the
 * block is not shareable
 *
 * MREMAP_FIXED is not supported under NOMMU conditions
 */
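/*
 * Illustrative example (editorial note): a private anonymous mapping created
 * with len = 5000 is backed by a two-page (8192 byte) region, so
 *	mremap(addr, 5000, 8000, 0)
 * succeeds by simply moving vma->vm_end, whereas growing it beyond 8192
 * bytes, passing MREMAP_FIXED with a different address, or resizing a
 * shareable mapping all fail.
 */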
unsigned long do_mremap(unsigned long addr,
			unsigned long old_len, unsigned long new_len,
			unsigned long flags, unsigned long new_addr)
{
	struct vm_area_struct *vma;

	/* insanity checks first */
	if (old_len == 0 || new_len == 0)
		return (unsigned long) -EINVAL;

	if (addr & ~PAGE_MASK)
		return -EINVAL;

	if (flags & MREMAP_FIXED && new_addr != addr)
		return (unsigned long) -EINVAL;

	vma = find_vma_exact(current->mm, addr, old_len);
	if (!vma)
		return (unsigned long) -EINVAL;

	if (vma->vm_end != vma->vm_start + old_len)
		return (unsigned long) -EFAULT;

	if (vma->vm_flags & VM_MAYSHARE)
		return (unsigned long) -EPERM;

	if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
		return (unsigned long) -ENOMEM;

	/* all checks complete - do it */
	vma->vm_end = vma->vm_start + new_len;
	return vma->vm_start;
}
EXPORT_SYMBOL(do_mremap);
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	unsigned long ret;

	down_write(&current->mm->mmap_sem);
	ret = do_mremap(addr, old_len, new_len, flags, new_addr);
	up_write(&current->mm->mmap_sem);
	return ret;
}
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	return NULL;
}

int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
		    unsigned long to, unsigned long size, pgprot_t prot)
{
	vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
	return 0;
}
EXPORT_SYMBOL(remap_pfn_range);

int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
			unsigned long pgoff)
{
	unsigned int size = vma->vm_end - vma->vm_start;

	if (!(vma->vm_flags & VM_USERMAP))
		return -EINVAL;

	vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
	vma->vm_end = vma->vm_start + size;

	return 0;
}
EXPORT_SYMBOL(remap_vmalloc_range);
void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
{
}

unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
	unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return -ENOMEM;
}

void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
}

void unmap_mapping_range(struct address_space *mapping,
			 loff_t const holebegin, loff_t const holelen,
			 int even_cows)
{
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * ask for an unmapped area at which to create a mapping on a file
 */
unsigned long get_unmapped_area(struct file *file, unsigned long addr,
				unsigned long len, unsigned long pgoff,
				unsigned long flags)
{
	unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
				  unsigned long, unsigned long);

	get_area = current->mm->get_unmapped_area;
	if (file && file->f_op && file->f_op->get_unmapped_area)
		get_area = file->f_op->get_unmapped_area;

	if (!get_area)
		return -ENOSYS;

	return get_area(file, addr, len, pgoff, flags);
}
EXPORT_SYMBOL(get_unmapped_area);
/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
	unsigned long free, allowed;

	vm_acct_memory(pages);

	/*
	 * Sometimes we want to use more memory than we have
	 */
	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
		return 0;

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		unsigned long n;

		free = global_page_state(NR_FILE_PAGES);
		free += nr_swap_pages;

		/*
		 * Any slabs which are created with the
		 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
		 * which are reclaimable, under pressure.  The dentry
		 * cache and most inode caches should fall into this
		 */
		free += global_page_state(NR_SLAB_RECLAIMABLE);

		/*
		 * Leave the last 3% for root
		 */
		if (!cap_sys_admin)
			free -= free / 32;
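		/* (Note, added: the "3%" reserved for root is computed as
		 * free / 32, i.e. roughly 3.1%; the same approximation is
		 * used for the other "3%" adjustments below.) */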
		if (free > pages)
			return 0;

		/*
		 * nr_free_pages() is very expensive on large systems,
		 * only call if we're about to fail.
		 */
		n = nr_free_pages();

		/*
		 * Leave reserved pages. The pages are not for anonymous pages.
		 */
		if (n <= totalreserve_pages)
			goto error;
		else
			n -= totalreserve_pages;

		/*
		 * Leave the last 3% for root
		 */
		if (!cap_sys_admin)
			n -= n / 32;
		free += n;

		if (free > pages)
			return 0;

		goto error;
	}

	allowed = totalram_pages * sysctl_overcommit_ratio / 100;
	/*
	 * Leave the last 3% for root
	 */
	if (!cap_sys_admin)
		allowed -= allowed / 32;
	allowed += total_swap_pages;

	/* Don't let a single process grow too big:
	   leave 3% of the size of this process for other processes */
	if (mm)
		allowed -= mm->total_vm / 32;

	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
		return 0;

error:
	vm_unacct_memory(pages);

	return -ENOMEM;
}
int in_gate_area_no_task(unsigned long addr)
{
	return 0;
}

int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}
EXPORT_SYMBOL(filemap_fault);
/*
 * Access another process' address space.
 * - source/target buffer must be kernel space
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm;

	if (addr + len < addr)
		return 0;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	down_read(&mm->mmap_sem);

	/* the access must start within one of the target process's mappings */
	vma = find_vma(mm, addr);
	if (vma) {
		/* don't overrun this mapping */
		if (addr + len >= vma->vm_end)
			len = vma->vm_end - addr;

		/* only read or write mappings where it is permitted */
		if (write && vma->vm_flags & VM_MAYWRITE)
			len -= copy_to_user((void *) addr, buf, len);
		else if (!write && vma->vm_flags & VM_MAYREAD)
			len -= copy_from_user(buf, (void *) addr, len);
		else
			len = 0;
	} else {
		len = 0;
	}

	up_read(&mm->mmap_sem);
	mmput(mm);
	return len;
}