2005-04-17 02:20:36 +04:00
/*
* linux / mm / nommu . c
*
* Replacement code for mm functions to support CPU ' s that don ' t
* have any form of memory management unit ( thus no virtual memory ) .
*
* See Documentation / nommu - mmap . txt
*
2009-01-08 15:04:47 +03:00
* Copyright ( c ) 2004 - 2008 David Howells < dhowells @ redhat . com >
2005-04-17 02:20:36 +04:00
* Copyright ( c ) 2000 - 2003 David McCullough < davidm @ snapgear . com >
* Copyright ( c ) 2000 - 2001 D Jeff Dionne < jeff @ uClinux . org >
* Copyright ( c ) 2002 Greg Ungerer < gerg @ snapgear . com >
2009-01-21 11:45:47 +03:00
* Copyright ( c ) 2007 - 2009 Paul Mundt < lethal @ linux - sh . org >
2005-04-17 02:20:36 +04:00
*/
2007-10-29 16:15:39 +03:00
# include <linux/module.h>
2005-04-17 02:20:36 +04:00
# include <linux/mm.h>
# include <linux/mman.h>
# include <linux/swap.h>
# include <linux/file.h>
# include <linux/highmem.h>
# include <linux/pagemap.h>
# include <linux/slab.h>
# include <linux/vmalloc.h>
2008-07-26 06:45:50 +04:00
# include <linux/tracehook.h>
2005-04-17 02:20:36 +04:00
# include <linux/blkdev.h>
# include <linux/backing-dev.h>
# include <linux/mount.h>
# include <linux/personality.h>
# include <linux/security.h>
# include <linux/syscalls.h>
# include <asm/uaccess.h>
# include <asm/tlb.h>
# include <asm/tlbflush.h>
2009-09-22 04:03:57 +04:00
# include <asm/mmu_context.h>
2009-01-08 15:04:47 +03:00
# include "internal.h"
/*
 * Function entry/exit/debug tracing helpers.
 *
 * The "#if 0" arm routes them to printk(); as written that arm is compiled
 * out and the "#else" arm, which routes them to no_printk(), is the one in
 * effect.  Note that kdebug() wraps the message in "xxx"/"yyy" markers only
 * in the printk() arm.
 * NOTE(review): no_printk() presumably discards the message while still
 * letting the compiler check the format arguments - confirm against its
 * definition.
 */
#if 0
# define kenter(FMT, ...) \
printk ( KERN_DEBUG " ==> %s( " FMT " ) \n " , __func__ , # # __VA_ARGS__ )
# define kleave(FMT, ...) \
printk ( KERN_DEBUG " <== %s() " FMT " \n " , __func__ , # # __VA_ARGS__ )
# define kdebug(FMT, ...) \
printk ( KERN_DEBUG " xxx " FMT " yyy \n " , # # __VA_ARGS__ )
# else
# define kenter(FMT, ...) \
no_printk ( KERN_DEBUG " ==> %s( " FMT " ) \n " , __func__ , # # __VA_ARGS__ )
# define kleave(FMT, ...) \
no_printk ( KERN_DEBUG " <== %s() " FMT " \n " , __func__ , # # __VA_ARGS__ )
# define kdebug(FMT, ...) \
no_printk ( KERN_DEBUG FMT " \n " , # # __VA_ARGS__ )
# endif
2005-04-17 02:20:36 +04:00
void * high_memory ;
struct page * mem_map ;
unsigned long max_mapnr ;
unsigned long num_physpages ;
2009-09-23 20:05:53 +04:00
unsigned long highest_memmap_pfn ;
2009-05-01 02:08:51 +04:00
struct percpu_counter vm_committed_as ;
2005-04-17 02:20:36 +04:00
int sysctl_overcommit_memory = OVERCOMMIT_GUESS ; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50 ; /* default is 50% */
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT ;
2009-05-07 03:03:05 +04:00
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS ;
2005-04-17 02:20:36 +04:00
int heap_stack_gap = 0 ;
2009-04-03 03:56:32 +04:00
atomic_long_t mmap_pages_allocated ;
2009-01-08 15:04:47 +03:00
2005-04-17 02:20:36 +04:00
EXPORT_SYMBOL ( mem_map ) ;
2007-04-12 10:28:47 +04:00
EXPORT_SYMBOL ( num_physpages ) ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
/* list of mapped, potentially shareable regions */
static struct kmem_cache * vm_region_jar ;
struct rb_root nommu_region_tree = RB_ROOT ;
DECLARE_RWSEM ( nommu_region_sem ) ;
2005-04-17 02:20:36 +04:00
2009-09-27 22:29:37 +04:00
const struct vm_operations_struct generic_file_vm_ops = {
2005-04-17 02:20:36 +04:00
} ;
/*
* Return the total memory allocated for this pointer , not
* just what the caller asked for .
*
* Doesn ' t have to be accurate , i . e . may have races .
*/
unsigned int kobjsize ( const void * objp )
{
struct page * page ;
2008-04-28 13:13:38 +04:00
/*
* If the object we have should not have ksize performed on it ,
* return size of 0
*/
2008-06-12 11:29:55 +04:00
if ( ! objp | | ! virt_addr_valid ( objp ) )
2008-06-06 09:46:08 +04:00
return 0 ;
page = virt_to_head_page ( objp ) ;
/*
* If the allocator sets PageSlab , we know the pointer came from
* kmalloc ( ) .
*/
2005-04-17 02:20:36 +04:00
if ( PageSlab ( page ) )
return ksize ( objp ) ;
2009-01-08 15:04:48 +03:00
/*
* If it ' s not a compound page , see if we have a matching VMA
* region . This test is intentionally done in reverse order ,
* so if there ' s no VMA , we still fall through and hand back
* PAGE_SIZE for 0 - order pages .
*/
if ( ! PageCompound ( page ) ) {
struct vm_area_struct * vma ;
vma = find_vma ( current - > mm , ( unsigned long ) objp ) ;
if ( vma )
return vma - > vm_end - vma - > vm_start ;
}
2008-06-06 09:46:08 +04:00
/*
* The ksize ( ) function is only guaranteed to work for pointers
2008-06-12 11:29:55 +04:00
* returned by kmalloc ( ) . So handle arbitrary pointers here .
2008-06-06 09:46:08 +04:00
*/
2008-06-12 11:29:55 +04:00
return PAGE_SIZE < < compound_order ( page ) ;
2005-04-17 02:20:36 +04:00
}
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
The new munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
because the current get_user_pages() can't grab PROT_NONE pages and
therefore PROT_NONE pages can't be munlocked.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
int __get_user_pages ( struct task_struct * tsk , struct mm_struct * mm ,
2009-09-23 20:05:53 +04:00
unsigned long start , int nr_pages , unsigned int foll_flags ,
2009-06-25 13:58:55 +04:00
struct page * * pages , struct vm_area_struct * * vmas )
2005-04-17 02:20:36 +04:00
{
2006-09-27 12:50:17 +04:00
struct vm_area_struct * vma ;
2006-09-27 12:50:18 +04:00
unsigned long vm_flags ;
int i ;
/* calculate required read or write permissions.
2009-09-22 04:03:31 +04:00
* If FOLL_FORCE is set , we only require the " MAY " flags .
2006-09-27 12:50:18 +04:00
*/
2009-09-22 04:03:31 +04:00
vm_flags = ( foll_flags & FOLL_WRITE ) ?
( VM_WRITE | VM_MAYWRITE ) : ( VM_READ | VM_MAYREAD ) ;
vm_flags & = ( foll_flags & FOLL_FORCE ) ?
( VM_MAYREAD | VM_MAYWRITE ) : ( VM_READ | VM_WRITE ) ;
2005-04-17 02:20:36 +04:00
2009-06-25 13:58:55 +04:00
for ( i = 0 ; i < nr_pages ; i + + ) {
2010-03-25 19:48:38 +03:00
vma = find_vma ( mm , start ) ;
2006-09-27 12:50:18 +04:00
if ( ! vma )
goto finish_or_fault ;
/* protect what we can, including chardevs */
2009-09-22 04:03:24 +04:00
if ( ( vma - > vm_flags & ( VM_IO | VM_PFNMAP ) ) | |
! ( vm_flags & vma - > vm_flags ) )
2006-09-27 12:50:18 +04:00
goto finish_or_fault ;
2006-09-27 12:50:17 +04:00
2005-04-17 02:20:36 +04:00
if ( pages ) {
pages [ i ] = virt_to_page ( start ) ;
if ( pages [ i ] )
page_cache_get ( pages [ i ] ) ;
}
if ( vmas )
2006-09-27 12:50:17 +04:00
vmas [ i ] = vma ;
2010-03-25 19:48:44 +03:00
start = ( start + PAGE_SIZE ) & PAGE_MASK ;
2005-04-17 02:20:36 +04:00
}
2006-09-27 12:50:18 +04:00
return i ;
finish_or_fault :
return i ? : - EFAULT ;
2005-04-17 02:20:36 +04:00
}
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
The new munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
because the current get_user_pages() can't grab PROT_NONE pages and
therefore PROT_NONE pages can't be munlocked.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
/*
* get a list of pages in an address range belonging to the specified process
* and indicate the VMA that covers each page
* - this is potentially dodgy as we may end incrementing the page count of a
* slab page or a secondary page from a compound page
* - don ' t permit access to VMAs that don ' t support it , such as I / O mappings
*/
int get_user_pages ( struct task_struct * tsk , struct mm_struct * mm ,
2009-06-25 13:58:55 +04:00
unsigned long start , int nr_pages , int write , int force ,
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
struct page * * pages , struct vm_area_struct * * vmas )
{
int flags = 0 ;
if ( write )
2009-09-22 04:03:31 +04:00
flags | = FOLL_WRITE ;
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
if ( force )
2009-09-22 04:03:31 +04:00
flags | = FOLL_FORCE ;
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
2009-06-25 13:58:55 +04:00
return __get_user_pages ( tsk , mm , start , nr_pages , flags , pages , vmas ) ;
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
}
2005-09-12 05:18:10 +04:00
EXPORT_SYMBOL ( get_user_pages ) ;
2009-06-25 23:31:57 +04:00
/**
* follow_pfn - look up PFN at a user virtual address
* @ vma : memory mapping
* @ address : user virtual address
* @ pfn : location to store found PFN
*
* Only IO mappings and raw PFN mappings are allowed .
*
* Returns zero and the pfn at @ pfn on success , - ve otherwise .
*/
int follow_pfn ( struct vm_area_struct * vma , unsigned long address ,
unsigned long * pfn )
{
if ( ! ( vma - > vm_flags & ( VM_IO | VM_PFNMAP ) ) )
return - EINVAL ;
* pfn = address > > PAGE_SHIFT ;
return 0 ;
}
EXPORT_SYMBOL ( follow_pfn ) ;
2005-04-17 02:20:36 +04:00
/*
 * Lock and list head for vm_struct records.
 * NOTE(review): nothing in the visible part of this file adds entries to
 * the list - confirm whether other code relies on these symbols.
 */
DEFINE_RWLOCK ( vmlist_lock ) ;
struct vm_struct * vmlist ;
2008-02-05 09:28:32 +03:00
/*
 * Release memory obtained from the vmalloc() family.  Here this is a thin
 * wrapper that hands @addr straight to kfree().
 */
void vfree(const void *addr)
{
	kfree(addr);
}
EXPORT_SYMBOL(vfree);
2005-04-17 02:20:36 +04:00
2005-10-07 10:46:04 +04:00
/*
 * Allocate @size bytes through kmalloc().
 *
 * @gfp_mask is adjusted before use: __GFP_COMP is added, and
 * __GFP_HIGHMEM is stripped because kmalloc() returns only a logical
 * address and so cannot be asked for highmem.  @prot is not used.
 */
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
	gfp_t flags = gfp_mask;

	flags |= __GFP_COMP;
	flags &= ~__GFP_HIGHMEM;

	return kmalloc(size, flags);
}
EXPORT_SYMBOL(__vmalloc);
2005-04-17 02:20:36 +04:00
2008-02-05 09:29:59 +03:00
void * vmalloc_user ( unsigned long size )
{
void * ret ;
ret = __vmalloc ( size , GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO ,
PAGE_KERNEL ) ;
if ( ret ) {
struct vm_area_struct * vma ;
down_write ( & current - > mm - > mmap_sem ) ;
vma = find_vma ( current - > mm , ( unsigned long ) ret ) ;
if ( vma )
vma - > vm_flags | = VM_USERMAP ;
up_write ( & current - > mm - > mmap_sem ) ;
}
return ret ;
}
EXPORT_SYMBOL ( vmalloc_user ) ;
2008-02-05 09:28:32 +03:00
/*
 * Map an address from the vmalloc() family to its struct page; here this
 * is just virt_to_page().
 */
struct page *vmalloc_to_page(const void *addr)
{
	struct page *page = virt_to_page(addr);

	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
2005-04-17 02:20:36 +04:00
2008-02-05 09:28:32 +03:00
/*
 * Map an address from the vmalloc() family to its page frame number by
 * way of virt_to_page() and page_to_pfn().
 */
unsigned long vmalloc_to_pfn(const void *addr)
{
	struct page *page = virt_to_page(addr);

	return page_to_pfn(page);
}
EXPORT_SYMBOL(vmalloc_to_pfn);
2005-04-17 02:20:36 +04:00
/*
 * Copy @count bytes from @addr into @buf with a plain memcpy().
 *
 * @count is first trimmed so that neither @buf + @count nor @addr + @count
 * wraps past the top of the address space, mirroring the overflow guard in
 * vwrite().
 *
 * Returns the number of bytes actually copied.
 */
long vread(char *buf, char *addr, unsigned long count)
{
	/* Don't allow overflow */
	if ((unsigned long)buf + count < count)
		count = -(unsigned long)buf;
	if ((unsigned long)addr + count < count)
		count = -(unsigned long)addr;

	memcpy(buf, addr, count);
	return count;
}
/*
 * Copy @count bytes from @buf to @addr with a plain memcpy().
 *
 * If @addr + @count would wrap past the top of the address space, @count
 * is trimmed to the distance from @addr to that top.
 *
 * Returns the number of bytes actually copied.
 */
long vwrite(char *buf, char *addr, unsigned long count)
{
	unsigned long dest = (unsigned long)addr;

	/* Don't allow overflow */
	if (dest + count < count)
		count = -dest;

	memcpy(addr, buf, count);
	return count;
}
/*
* vmalloc - allocate virtually continguos memory
*
* @ size : allocation size
*
* Allocate enough pages to cover @ size from the page level
* allocator and map them into continguos kernel virtual space .
*
2006-10-04 01:21:02 +04:00
* For tight control over page level allocator and protection flags
2005-04-17 02:20:36 +04:00
* use __vmalloc ( ) instead .
*/
void * vmalloc ( unsigned long size )
{
return __vmalloc ( size , GFP_KERNEL | __GFP_HIGHMEM , PAGE_KERNEL ) ;
}
2006-03-01 03:59:18 +03:00
EXPORT_SYMBOL ( vmalloc ) ;
/*
 * Node-aware variant of vmalloc().  @node is ignored here; the request is
 * passed to vmalloc() as-is.
 */
void *vmalloc_node(unsigned long size, int node)
{
	void *mem = vmalloc(size);

	return mem;
}
EXPORT_SYMBOL(vmalloc_node);
2005-04-17 02:20:36 +04:00
2008-08-04 11:01:47 +04:00
# ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
# endif
/**
* vmalloc_exec - allocate virtually contiguous , executable memory
* @ size : allocation size
*
* Kernel - internal function to allocate enough pages to cover @ size
* the page level allocator and map them into contiguous and
* executable kernel virtual space .
*
* For tight control over page level allocator and protection flags
* use __vmalloc ( ) instead .
*/
void * vmalloc_exec ( unsigned long size )
{
return __vmalloc ( size , GFP_KERNEL | __GFP_HIGHMEM , PAGE_KERNEL_EXEC ) ;
}
2007-07-21 15:37:25 +04:00
/**
 * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
 *	@size:		allocation size
 *
 *	Forwards to __vmalloc() with plain GFP_KERNEL (no __GFP_HIGHMEM)
 *	and PAGE_KERNEL protection.
 */
void *vmalloc_32(unsigned long size)
{
	void *mem = __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);

	return mem;
}
EXPORT_SYMBOL(vmalloc_32);
/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 *	@size:		allocation size
 *
 * The resulting memory area is meant to be 32bit addressable and zeroed
 * so it can be mapped to userspace without leaking data.
 *
 * This currently delegates entirely to vmalloc_user(); the ZONE_DMA
 * handling needed for 64-bit has not been sorted out yet.
 */
void *vmalloc_32_user(unsigned long size)
{
	void *mem = vmalloc_user(size);

	return mem;
}
EXPORT_SYMBOL(vmalloc_32_user);
2005-04-17 02:20:36 +04:00
/*
 * Not supported here: any call hits BUG().  The NULL return only exists
 * to satisfy the function's return type.
 */
void *vmap(struct page **pages, unsigned int count, unsigned long flags,
	   pgprot_t prot)
{
	BUG();
	return NULL;
}
EXPORT_SYMBOL(vmap);
2005-04-17 02:20:36 +04:00
2008-02-05 09:28:32 +03:00
/*
 * Not supported here: any call hits BUG().
 */
void vunmap(const void *addr)
{
	BUG();
}
EXPORT_SYMBOL(vunmap);
2005-04-17 02:20:36 +04:00
2009-01-21 11:45:47 +03:00
/*
 * Not supported here: any call hits BUG().  The NULL return only exists
 * to satisfy the function's return type.
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node,
		 pgprot_t prot)
{
	BUG();
	return NULL;
}
EXPORT_SYMBOL(vm_map_ram);
/*
 * Not supported here: any call hits BUG().
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
	BUG();
}
EXPORT_SYMBOL(vm_unmap_ram);
/*
 * Deliberate no-op: this implementation has nothing to do here.
 */
void vm_unmap_aliases(void)
{
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
2007-05-08 11:27:03 +04:00
/*
 * Default, do-nothing vmalloc_sync_all().  It is declared weak so that an
 * architecture which provides its own version overrides this stub.
 */
void __attribute__((weak)) vmalloc_sync_all(void)
{
}
2007-07-21 15:37:25 +04:00
/*
 * Inserting a page into a VMA is not supported here; every call fails
 * with -EINVAL regardless of its arguments.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
		   struct page *page)
{
	return -EINVAL;
}
EXPORT_SYMBOL(vm_insert_page);
2005-04-17 02:20:36 +04:00
/*
* sys_brk ( ) for the most part doesn ' t need the global kernel
* lock , except when an application is doing something nasty
* like trying to un - brk an area that has already been mapped
* to a regular file . in this case , the unmapping will need
* to invoke file system routines that need the global lock .
*/
2009-01-14 16:14:15 +03:00
SYSCALL_DEFINE1 ( brk , unsigned long , brk )
2005-04-17 02:20:36 +04:00
{
struct mm_struct * mm = current - > mm ;
if ( brk < mm - > start_brk | | brk > mm - > context . end_brk )
return mm - > brk ;
if ( mm - > brk = = brk )
return mm - > brk ;
/*
* Always allow shrinking brk
*/
if ( brk < = mm - > brk ) {
mm - > brk = brk ;
return brk ;
}
/*
* Ok , looks good - let it rip .
*/
NOMMU: Avoiding duplicate icache flushes of shared maps
When working with FDPIC, there are many shared mappings of read-only
code regions between applications (the C library, applet packages like
busybox, etc.), but the current do_mmap_pgoff() function will issue an
icache flush whenever a VMA is added to an MM instead of only doing it
when the map is initially created.
The flush can instead be done when a region is first mmapped PROT_EXEC.
Note that we may not rely on the first mapping of a region being
executable - it's possible for it to be PROT_READ only, so we have to
remember whether we've flushed the region or not, and then flush the
entire region when a bit of it is made executable.
However, this also affects the brk area. That will no longer be
executable. We can mprotect() it to PROT_EXEC on MPU-mode kernels, but
for NOMMU mode kernels, when it increases the brk allocation, making
sys_brk() flush the extra from the icache should suffice. The brk area
probably isn't used by NOMMU programs since the brk area can only use up
the leavings from the stack allocation, where the stack allocation is
larger than requested.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-06 20:23:23 +03:00
flush_icache_range ( mm - > brk , brk ) ;
2005-04-17 02:20:36 +04:00
return mm - > brk = brk ;
}
2009-01-08 15:04:47 +03:00
/*
 * initialise the VMA and region record slabs
 * - also sets up the vm_committed_as per-CPU counter, starting from zero
 */
void __init mmap_init(void)
{
	int err = percpu_counter_init(&vm_committed_as, 0);

	VM_BUG_ON(err);

	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
}
2006-09-27 12:50:20 +04:00
/*
 * validate the region tree
 * - the caller must hold the region lock
 * - walks the tree in order and BUGs if any region is empty or inverted,
 *   has vm_top below vm_end, or starts below its predecessor's vm_top
 * - compiles to an empty function unless CONFIG_DEBUG_NOMMU_REGIONS is set
 */
#ifdef CONFIG_DEBUG_NOMMU_REGIONS
static noinline void validate_nommu_regions(void)
{
	struct vm_region *prev, *cur;
	struct rb_node *node;

	node = rb_first(&nommu_region_tree);
	if (!node)
		return;

	/* the first region only has to be self-consistent */
	prev = rb_entry(node, struct vm_region, vm_rb);
	BUG_ON(unlikely(prev->vm_end <= prev->vm_start));
	BUG_ON(unlikely(prev->vm_top < prev->vm_end));

	/* every later region must also sit above its predecessor */
	for (node = rb_next(node); node; node = rb_next(node)) {
		cur = rb_entry(node, struct vm_region, vm_rb);

		BUG_ON(unlikely(cur->vm_end <= cur->vm_start));
		BUG_ON(unlikely(cur->vm_top < cur->vm_end));
		BUG_ON(unlikely(cur->vm_start < prev->vm_top));

		prev = cur;
	}
}
#else
static void validate_nommu_regions(void)
{
}
#endif
2006-09-27 12:50:20 +04:00
/*
2009-01-08 15:04:47 +03:00
* add a region into the global tree
2006-09-27 12:50:20 +04:00
*/
2009-01-08 15:04:47 +03:00
static void add_nommu_region ( struct vm_region * region )
2006-09-27 12:50:20 +04:00
{
2009-01-08 15:04:47 +03:00
struct vm_region * pregion ;
struct rb_node * * p , * parent ;
2006-09-27 12:50:20 +04:00
2009-01-08 15:04:47 +03:00
validate_nommu_regions ( ) ;
parent = NULL ;
p = & nommu_region_tree . rb_node ;
while ( * p ) {
parent = * p ;
pregion = rb_entry ( parent , struct vm_region , vm_rb ) ;
if ( region - > vm_start < pregion - > vm_start )
p = & ( * p ) - > rb_left ;
else if ( region - > vm_start > pregion - > vm_start )
p = & ( * p ) - > rb_right ;
else if ( pregion = = region )
return ;
else
BUG ( ) ;
2006-09-27 12:50:20 +04:00
}
2009-01-08 15:04:47 +03:00
rb_link_node ( & region - > vm_rb , parent , p ) ;
rb_insert_color ( & region - > vm_rb , & nommu_region_tree ) ;
2006-09-27 12:50:20 +04:00
2009-01-08 15:04:47 +03:00
validate_nommu_regions ( ) ;
2006-09-27 12:50:20 +04:00
}
[PATCH] NOMMU: Make futexes work under NOMMU conditions
Make futexes work under NOMMU conditions.
This can be tested by running this in one shell:
#define SYSERROR(X, Y) \
do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0)
int main()
{
int shmid, tmp, *f, n;
shmid = shmget(23, 4, IPC_CREAT|0666);
SYSERROR(shmid, "shmget");
f = shmat(shmid, NULL, 0);
SYSERROR(f, "shmat");
n = *f;
printf("WAIT: %p{%x}\n", f, n);
tmp = futex(f, FUTEX_WAIT, n, NULL, NULL, 0);
SYSERROR(tmp, "futex");
printf("WAITED: %d\n", tmp);
tmp = shmdt(f);
SYSERROR(tmp, "shmdt");
exit(0);
}
And then this in the other shell:
#define SYSERROR(X, Y) \
do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0)
int main()
{
int shmid, tmp, *f;
shmid = shmget(23, 4, IPC_CREAT|0666);
SYSERROR(shmid, "shmget");
f = shmat(shmid, NULL, 0);
SYSERROR(f, "shmat");
(*f)++;
printf("WAKE: %p{%x}\n", f, *f);
tmp = futex(f, FUTEX_WAKE, 1, NULL, NULL, 0);
SYSERROR(tmp, "futex");
printf("WOKE: %d\n", tmp);
tmp = shmdt(f);
SYSERROR(tmp, "shmdt");
exit(0);
}
The first program will set up a SYSV IPC SHM segment and wait on a futex in it
for the number at the start to change. The program will increment that number
and wake the first program up. This leads to output of the form:
SHELL 1 SHELL 2
======================= =======================
# /dowait
WAIT: 0xc32ac000{0}
# /dowake
WAKE: 0xc32ac000{1}
WAITED: 0 WOKE: 1
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 12:50:22 +04:00
/*
2009-01-08 15:04:47 +03:00
* delete a region from the global tree
[PATCH] NOMMU: Make futexes work under NOMMU conditions
Make futexes work under NOMMU conditions.
This can be tested by running this in one shell:
#define SYSERROR(X, Y) \
do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0)
int main()
{
int shmid, tmp, *f, n;
shmid = shmget(23, 4, IPC_CREAT|0666);
SYSERROR(shmid, "shmget");
f = shmat(shmid, NULL, 0);
SYSERROR(f, "shmat");
n = *f;
printf("WAIT: %p{%x}\n", f, n);
tmp = futex(f, FUTEX_WAIT, n, NULL, NULL, 0);
SYSERROR(tmp, "futex");
printf("WAITED: %d\n", tmp);
tmp = shmdt(f);
SYSERROR(tmp, "shmdt");
exit(0);
}
And then this in the other shell:
#define SYSERROR(X, Y) \
do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0)
int main()
{
int shmid, tmp, *f;
shmid = shmget(23, 4, IPC_CREAT|0666);
SYSERROR(shmid, "shmget");
f = shmat(shmid, NULL, 0);
SYSERROR(f, "shmat");
(*f)++;
printf("WAKE: %p{%x}\n", f, *f);
tmp = futex(f, FUTEX_WAKE, 1, NULL, NULL, 0);
SYSERROR(tmp, "futex");
printf("WOKE: %d\n", tmp);
tmp = shmdt(f);
SYSERROR(tmp, "shmdt");
exit(0);
}
The first program will set up a SYSV IPC SHM segment and wait on a futex in it
for the number at the start to change. The program will increment that number
and wake the first program up. This leads to output of the form:
SHELL 1 SHELL 2
======================= =======================
# /dowait
WAIT: 0xc32ac000{0}
# /dowake
WAKE: 0xc32ac000{1}
WAITED: 0 WOKE: 1
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 12:50:22 +04:00
*/
2009-01-08 15:04:47 +03:00
static void delete_nommu_region ( struct vm_region * region )
[PATCH] NOMMU: Make futexes work under NOMMU conditions
Make futexes work under NOMMU conditions.
This can be tested by running this in one shell:
#define SYSERROR(X, Y) \
do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0)
int main()
{
int shmid, tmp, *f, n;
shmid = shmget(23, 4, IPC_CREAT|0666);
SYSERROR(shmid, "shmget");
f = shmat(shmid, NULL, 0);
SYSERROR(f, "shmat");
n = *f;
printf("WAIT: %p{%x}\n", f, n);
tmp = futex(f, FUTEX_WAIT, n, NULL, NULL, 0);
SYSERROR(tmp, "futex");
printf("WAITED: %d\n", tmp);
tmp = shmdt(f);
SYSERROR(tmp, "shmdt");
exit(0);
}
And then this in the other shell:
#define SYSERROR(X, Y) \
do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0)
int main()
{
int shmid, tmp, *f;
shmid = shmget(23, 4, IPC_CREAT|0666);
SYSERROR(shmid, "shmget");
f = shmat(shmid, NULL, 0);
SYSERROR(f, "shmat");
(*f)++;
printf("WAKE: %p{%x}\n", f, *f);
tmp = futex(f, FUTEX_WAKE, 1, NULL, NULL, 0);
SYSERROR(tmp, "futex");
printf("WOKE: %d\n", tmp);
tmp = shmdt(f);
SYSERROR(tmp, "shmdt");
exit(0);
}
The first program will set up a SYSV IPC SHM segment and wait on a futex in it
for the number at the start to change. The program will increment that number
and wake the first program up. This leads to output of the form:
SHELL 1 SHELL 2
======================= =======================
# /dowait
WAIT: 0xc32ac000{0}
# /dowake
WAKE: 0xc32ac000{1}
WAITED: 0 WOKE: 1
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 12:50:22 +04:00
{
2009-01-08 15:04:47 +03:00
BUG_ON ( ! nommu_region_tree . rb_node ) ;
[PATCH] NOMMU: Make futexes work under NOMMU conditions
Make futexes work under NOMMU conditions.
This can be tested by running this in one shell:
#define SYSERROR(X, Y) \
do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0)
int main()
{
int shmid, tmp, *f, n;
shmid = shmget(23, 4, IPC_CREAT|0666);
SYSERROR(shmid, "shmget");
f = shmat(shmid, NULL, 0);
SYSERROR(f, "shmat");
n = *f;
printf("WAIT: %p{%x}\n", f, n);
tmp = futex(f, FUTEX_WAIT, n, NULL, NULL, 0);
SYSERROR(tmp, "futex");
printf("WAITED: %d\n", tmp);
tmp = shmdt(f);
SYSERROR(tmp, "shmdt");
exit(0);
}
And then this in the other shell:
#define SYSERROR(X, Y) \
do { if ((long)(X) == -1L) { perror(Y); exit(1); }} while(0)
int main()
{
int shmid, tmp, *f;
shmid = shmget(23, 4, IPC_CREAT|0666);
SYSERROR(shmid, "shmget");
f = shmat(shmid, NULL, 0);
SYSERROR(f, "shmat");
(*f)++;
printf("WAKE: %p{%x}\n", f, *f);
tmp = futex(f, FUTEX_WAKE, 1, NULL, NULL, 0);
SYSERROR(tmp, "futex");
printf("WOKE: %d\n", tmp);
tmp = shmdt(f);
SYSERROR(tmp, "shmdt");
exit(0);
}
The first program will set up a SYSV IPC SHM segment and wait on a futex in it
for the number at the start to change. The program will increment that number
and wake the first program up. This leads to output of the form:
SHELL 1 SHELL 2
======================= =======================
# /dowait
WAIT: 0xc32ac000{0}
# /dowake
WAKE: 0xc32ac000{1}
WAITED: 0 WOKE: 1
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-27 12:50:22 +04:00
2009-01-08 15:04:47 +03:00
validate_nommu_regions ( ) ;
rb_erase ( & region - > vm_rb , & nommu_region_tree ) ;
validate_nommu_regions ( ) ;
2007-07-16 10:38:28 +04:00
}
2006-09-27 12:50:21 +04:00
/*
2009-01-08 15:04:47 +03:00
* free a contiguous series of pages
2006-09-27 12:50:21 +04:00
*/
2009-01-08 15:04:47 +03:00
static void free_page_series ( unsigned long from , unsigned long to )
2006-09-27 12:50:21 +04:00
{
2009-01-08 15:04:47 +03:00
for ( ; from < to ; from + = PAGE_SIZE ) {
struct page * page = virt_to_page ( from ) ;
kdebug ( " - free %lx " , from ) ;
2009-04-03 03:56:32 +04:00
atomic_long_dec ( & mmap_pages_allocated ) ;
2009-01-08 15:04:47 +03:00
if ( page_count ( page ) ! = 1 )
2009-04-03 03:56:32 +04:00
kdebug ( " free page %p: refcount not one: %d " ,
page , page_count ( page ) ) ;
2009-01-08 15:04:47 +03:00
put_page ( page ) ;
2006-09-27 12:50:21 +04:00
}
}
2006-09-27 12:50:20 +04:00
/*
2009-01-08 15:04:47 +03:00
* release a reference to a region
2009-04-03 03:56:32 +04:00
* - the caller must hold the region semaphore for writing , which this releases
2009-01-08 15:04:47 +03:00
* - the region may not have been added to the tree yet , in which case vm_top
2009-01-08 15:04:47 +03:00
* will equal vm_start
2006-09-27 12:50:20 +04:00
*/
2009-01-08 15:04:47 +03:00
static void __put_nommu_region ( struct vm_region * region )
__releases ( nommu_region_sem )
2005-04-17 02:20:36 +04:00
{
2010-01-16 04:01:33 +03:00
kenter ( " %p{%d} " , region , region - > vm_usage ) ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
BUG_ON ( ! nommu_region_tree . rb_node ) ;
2005-04-17 02:20:36 +04:00
2010-01-16 04:01:33 +03:00
if ( - - region - > vm_usage = = 0 ) {
2009-01-08 15:04:47 +03:00
if ( region - > vm_top > region - > vm_start )
2009-01-08 15:04:47 +03:00
delete_nommu_region ( region ) ;
up_write ( & nommu_region_sem ) ;
if ( region - > vm_file )
fput ( region - > vm_file ) ;
/* IO memory and memory shared directly out of the pagecache
* from ramfs / tmpfs mustn ' t be released here */
if ( region - > vm_flags & VM_MAPPED_COPY ) {
kdebug ( " free series " ) ;
2009-01-08 15:04:47 +03:00
free_page_series ( region - > vm_start , region - > vm_top ) ;
2009-01-08 15:04:47 +03:00
}
kmem_cache_free ( vm_region_jar , region ) ;
} else {
up_write ( & nommu_region_sem ) ;
2005-04-17 02:20:36 +04:00
}
2009-01-08 15:04:47 +03:00
}
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
/*
 * release a reference to a region
 * - takes the region semaphore for writing; __put_nommu_region() is the
 *   one that releases it again
 */
static void put_nommu_region(struct vm_region *region)
{
	down_write(&nommu_region_sem);
	__put_nommu_region(region);
}
2009-09-22 04:03:57 +04:00
/*
 * update protection on a vma
 * - applies @flags to every page from the page-aligned start of the VMA up
 *   to vm_end, then has the protections updated for the owning mm
 * - does nothing unless CONFIG_MPU is set
 */
static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
{
#ifdef CONFIG_MPU
	struct mm_struct *mm = vma->vm_mm;
	/* addresses are unsigned; a signed cursor would rely on
	 * implementation-defined conversion for the upper half of the
	 * address space */
	unsigned long addr;

	for (addr = vma->vm_start & PAGE_MASK;
	     addr < vma->vm_end;
	     addr += PAGE_SIZE)
		protect_page(mm, addr, flags);

	update_protections(mm);
#endif
}
2006-09-27 12:50:20 +04:00
/*
2009-01-08 15:04:47 +03:00
* add a VMA into a process ' s mm_struct in the appropriate place in the list
* and tree and add to the address space ' s page tree also if not an anonymous
* page
* - should be called with mm - > mmap_sem held writelocked
2006-09-27 12:50:20 +04:00
*/
2009-01-08 15:04:47 +03:00
static void add_vma_to_mm ( struct mm_struct * mm , struct vm_area_struct * vma )
2005-04-17 02:20:36 +04:00
{
2009-01-08 15:04:47 +03:00
struct vm_area_struct * pvma , * * pp ;
2005-04-17 02:20:36 +04:00
struct address_space * mapping ;
2009-01-08 15:04:47 +03:00
struct rb_node * * p , * parent ;
kenter ( " ,%p " , vma ) ;
BUG_ON ( ! vma - > vm_region ) ;
mm - > map_count + + ;
vma - > vm_mm = mm ;
2005-04-17 02:20:36 +04:00
2009-09-22 04:03:57 +04:00
protect_vma ( vma , vma - > vm_flags ) ;
2005-04-17 02:20:36 +04:00
/* add the VMA to the mapping */
if ( vma - > vm_file ) {
mapping = vma - > vm_file - > f_mapping ;
flush_dcache_mmap_lock ( mapping ) ;
vma_prio_tree_insert ( vma , & mapping - > i_mmap ) ;
flush_dcache_mmap_unlock ( mapping ) ;
}
2009-01-08 15:04:47 +03:00
/* add the VMA to the tree */
parent = NULL ;
p = & mm - > mm_rb . rb_node ;
2005-04-17 02:20:36 +04:00
while ( * p ) {
parent = * p ;
pvma = rb_entry ( parent , struct vm_area_struct , vm_rb ) ;
2009-01-08 15:04:47 +03:00
/* sort by: start addr, end addr, VMA struct addr in that order
* ( the latter is necessary as we may get identical VMAs ) */
if ( vma - > vm_start < pvma - > vm_start )
2005-04-17 02:20:36 +04:00
p = & ( * p ) - > rb_left ;
2009-01-08 15:04:47 +03:00
else if ( vma - > vm_start > pvma - > vm_start )
2005-04-17 02:20:36 +04:00
p = & ( * p ) - > rb_right ;
2009-01-08 15:04:47 +03:00
else if ( vma - > vm_end < pvma - > vm_end )
p = & ( * p ) - > rb_left ;
else if ( vma - > vm_end > pvma - > vm_end )
p = & ( * p ) - > rb_right ;
else if ( vma < pvma )
p = & ( * p ) - > rb_left ;
else if ( vma > pvma )
p = & ( * p ) - > rb_right ;
else
BUG ( ) ;
2005-04-17 02:20:36 +04:00
}
rb_link_node ( & vma - > vm_rb , parent , p ) ;
2009-01-08 15:04:47 +03:00
rb_insert_color ( & vma - > vm_rb , & mm - > mm_rb ) ;
/* add VMA to the VMA list also */
for ( pp = & mm - > mmap ; ( pvma = * pp ) ; pp = & ( * pp ) - > vm_next ) {
if ( pvma - > vm_start > vma - > vm_start )
break ;
if ( pvma - > vm_start < vma - > vm_start )
continue ;
if ( pvma - > vm_end < vma - > vm_end )
break ;
}
vma - > vm_next = * pp ;
* pp = vma ;
2005-04-17 02:20:36 +04:00
}
2006-09-27 12:50:20 +04:00
/*
2009-01-08 15:04:47 +03:00
* delete a VMA from its owning mm_struct and address space
2006-09-27 12:50:20 +04:00
*/
2009-01-08 15:04:47 +03:00
static void delete_vma_from_mm ( struct vm_area_struct * vma )
2005-04-17 02:20:36 +04:00
{
2009-01-08 15:04:47 +03:00
struct vm_area_struct * * pp ;
2005-04-17 02:20:36 +04:00
struct address_space * mapping ;
2009-01-08 15:04:47 +03:00
struct mm_struct * mm = vma - > vm_mm ;
kenter ( " %p " , vma ) ;
2009-09-22 04:03:57 +04:00
protect_vma ( vma , 0 ) ;
2009-01-08 15:04:47 +03:00
mm - > map_count - - ;
if ( mm - > mmap_cache = = vma )
mm - > mmap_cache = NULL ;
2005-04-17 02:20:36 +04:00
/* remove the VMA from the mapping */
if ( vma - > vm_file ) {
mapping = vma - > vm_file - > f_mapping ;
flush_dcache_mmap_lock ( mapping ) ;
vma_prio_tree_remove ( vma , & mapping - > i_mmap ) ;
flush_dcache_mmap_unlock ( mapping ) ;
}
2009-01-08 15:04:47 +03:00
/* remove from the MM's tree and list */
rb_erase ( & vma - > vm_rb , & mm - > mm_rb ) ;
for ( pp = & mm - > mmap ; * pp ; pp = & ( * pp ) - > vm_next ) {
if ( * pp = = vma ) {
* pp = vma - > vm_next ;
break ;
}
}
vma - > vm_mm = NULL ;
}
/*
* destroy a VMA record
*/
static void delete_vma ( struct mm_struct * mm , struct vm_area_struct * vma )
{
kenter ( " %p " , vma ) ;
if ( vma - > vm_ops & & vma - > vm_ops - > close )
vma - > vm_ops - > close ( vma ) ;
if ( vma - > vm_file ) {
fput ( vma - > vm_file ) ;
if ( vma - > vm_flags & VM_EXECUTABLE )
removed_exe_file_vma ( mm ) ;
}
put_nommu_region ( vma - > vm_region ) ;
kmem_cache_free ( vm_area_cachep , vma ) ;
}
/*
* look up the first VMA in which addr resides , NULL if none
* - should be called with mm - > mmap_sem at least held readlocked
*/
struct vm_area_struct * find_vma ( struct mm_struct * mm , unsigned long addr )
{
struct vm_area_struct * vma ;
struct rb_node * n = mm - > mm_rb . rb_node ;
/* check the cache first */
vma = mm - > mmap_cache ;
if ( vma & & vma - > vm_start < = addr & & vma - > vm_end > addr )
return vma ;
/* trawl the tree (there may be multiple mappings in which addr
* resides ) */
for ( n = rb_first ( & mm - > mm_rb ) ; n ; n = rb_next ( n ) ) {
vma = rb_entry ( n , struct vm_area_struct , vm_rb ) ;
if ( vma - > vm_start > addr )
return NULL ;
if ( vma - > vm_end > addr ) {
mm - > mmap_cache = vma ;
return vma ;
}
}
return NULL ;
}
EXPORT_SYMBOL ( find_vma ) ;
/*
 * find a VMA
 * - we don't extend stack VMAs under NOMMU conditions, so this is just a
 *   plain find_vma() lookup
 */
struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
	return find_vma(mm, addr);
}
/*
 * expand a stack to a given address
 * - not supported under NOMMU conditions: always fails with -ENOMEM and
 *   looks at neither argument
 */
int expand_stack(struct vm_area_struct *vma, unsigned long address)
{
	return -ENOMEM;
}
/*
* look up the first VMA exactly that exactly matches addr
* - should be called with mm - > mmap_sem at least held readlocked
*/
static struct vm_area_struct * find_vma_exact ( struct mm_struct * mm ,
unsigned long addr ,
unsigned long len )
{
struct vm_area_struct * vma ;
struct rb_node * n = mm - > mm_rb . rb_node ;
unsigned long end = addr + len ;
/* check the cache first */
vma = mm - > mmap_cache ;
if ( vma & & vma - > vm_start = = addr & & vma - > vm_end = = end )
return vma ;
/* trawl the tree (there may be multiple mappings in which addr
* resides ) */
for ( n = rb_first ( & mm - > mm_rb ) ; n ; n = rb_next ( n ) ) {
vma = rb_entry ( n , struct vm_area_struct , vm_rb ) ;
if ( vma - > vm_start < addr )
continue ;
if ( vma - > vm_start > addr )
return NULL ;
if ( vma - > vm_end = = end ) {
mm - > mmap_cache = vma ;
return vma ;
}
}
return NULL ;
2005-04-17 02:20:36 +04:00
}
/*
* determine whether a mapping should be permitted and , if so , what sort of
* mapping we ' re capable of supporting
*/
static int validate_mmap_request ( struct file * file ,
unsigned long addr ,
unsigned long len ,
unsigned long prot ,
unsigned long flags ,
unsigned long pgoff ,
unsigned long * _capabilities )
{
2009-01-08 15:04:47 +03:00
unsigned long capabilities , rlen ;
2005-04-17 02:20:36 +04:00
unsigned long reqprot = prot ;
int ret ;
/* do the simple checks first */
2009-09-24 15:33:48 +04:00
if ( flags & MAP_FIXED ) {
2005-04-17 02:20:36 +04:00
printk ( KERN_DEBUG
" %d: Can't do fixed-address/overlay mmap of RAM \n " ,
current - > pid ) ;
return - EINVAL ;
}
if ( ( flags & MAP_TYPE ) ! = MAP_PRIVATE & &
( flags & MAP_TYPE ) ! = MAP_SHARED )
return - EINVAL ;
2006-12-06 05:02:59 +03:00
if ( ! len )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
2006-12-06 05:02:59 +03:00
/* Careful about overflows.. */
2009-01-08 15:04:47 +03:00
rlen = PAGE_ALIGN ( len ) ;
if ( ! rlen | | rlen > TASK_SIZE )
2006-12-06 05:02:59 +03:00
return - ENOMEM ;
2005-04-17 02:20:36 +04:00
/* offset overflow? */
2009-01-08 15:04:47 +03:00
if ( ( pgoff + ( rlen > > PAGE_SHIFT ) ) < pgoff )
2006-12-06 05:02:59 +03:00
return - EOVERFLOW ;
2005-04-17 02:20:36 +04:00
if ( file ) {
/* validate file mapping requests */
struct address_space * mapping ;
/* files must support mmap */
if ( ! file - > f_op | | ! file - > f_op - > mmap )
return - ENODEV ;
/* work out if what we've got could possibly be shared
* - we support chardevs that provide their own " memory "
* - we support files / blockdevs that are memory backed
*/
mapping = file - > f_mapping ;
if ( ! mapping )
2006-12-08 13:37:21 +03:00
mapping = file - > f_path . dentry - > d_inode - > i_mapping ;
2005-04-17 02:20:36 +04:00
capabilities = 0 ;
if ( mapping & & mapping - > backing_dev_info )
capabilities = mapping - > backing_dev_info - > capabilities ;
if ( ! capabilities ) {
/* no explicit capabilities set, so assume some
* defaults */
2006-12-08 13:37:21 +03:00
switch ( file - > f_path . dentry - > d_inode - > i_mode & S_IFMT ) {
2005-04-17 02:20:36 +04:00
case S_IFREG :
case S_IFBLK :
capabilities = BDI_CAP_MAP_COPY ;
break ;
case S_IFCHR :
capabilities =
BDI_CAP_MAP_DIRECT |
BDI_CAP_READ_MAP |
BDI_CAP_WRITE_MAP ;
break ;
default :
return - EINVAL ;
}
}
/* eliminate any capabilities that we can't support on this
* device */
if ( ! file - > f_op - > get_unmapped_area )
capabilities & = ~ BDI_CAP_MAP_DIRECT ;
if ( ! file - > f_op - > read )
capabilities & = ~ BDI_CAP_MAP_COPY ;
2009-08-19 01:11:17 +04:00
/* The file shall have been opened with read permission. */
if ( ! ( file - > f_mode & FMODE_READ ) )
return - EACCES ;
2005-04-17 02:20:36 +04:00
if ( flags & MAP_SHARED ) {
/* do checks for writing, appending and locking */
if ( ( prot & PROT_WRITE ) & &
! ( file - > f_mode & FMODE_WRITE ) )
return - EACCES ;
2006-12-08 13:37:21 +03:00
if ( IS_APPEND ( file - > f_path . dentry - > d_inode ) & &
2005-04-17 02:20:36 +04:00
( file - > f_mode & FMODE_WRITE ) )
return - EACCES ;
2006-12-08 13:37:21 +03:00
if ( locks_verify_locked ( file - > f_path . dentry - > d_inode ) )
2005-04-17 02:20:36 +04:00
return - EAGAIN ;
if ( ! ( capabilities & BDI_CAP_MAP_DIRECT ) )
return - ENODEV ;
/* we mustn't privatise shared mappings */
capabilities & = ~ BDI_CAP_MAP_COPY ;
}
else {
/* we're going to read the file into private memory we
* allocate */
if ( ! ( capabilities & BDI_CAP_MAP_COPY ) )
return - ENODEV ;
/* we don't permit a private writable mapping to be
* shared with the backing device */
if ( prot & PROT_WRITE )
capabilities & = ~ BDI_CAP_MAP_DIRECT ;
}
2010-05-26 10:43:00 +04:00
if ( capabilities & BDI_CAP_MAP_DIRECT ) {
if ( ( ( prot & PROT_READ ) & & ! ( capabilities & BDI_CAP_READ_MAP ) ) | |
( ( prot & PROT_WRITE ) & & ! ( capabilities & BDI_CAP_WRITE_MAP ) ) | |
( ( prot & PROT_EXEC ) & & ! ( capabilities & BDI_CAP_EXEC_MAP ) )
) {
capabilities & = ~ BDI_CAP_MAP_DIRECT ;
if ( flags & MAP_SHARED ) {
printk ( KERN_WARNING
" MAP_SHARED not completely supported on !MMU \n " ) ;
return - EINVAL ;
}
}
}
2005-04-17 02:20:36 +04:00
/* handle executable mappings and implied executable
* mappings */
2006-12-08 13:37:21 +03:00
if ( file - > f_path . mnt - > mnt_flags & MNT_NOEXEC ) {
2005-04-17 02:20:36 +04:00
if ( prot & PROT_EXEC )
return - EPERM ;
}
else if ( ( prot & PROT_READ ) & & ! ( prot & PROT_EXEC ) ) {
/* handle implication of PROT_EXEC by PROT_READ */
if ( current - > personality & READ_IMPLIES_EXEC ) {
if ( capabilities & BDI_CAP_EXEC_MAP )
prot | = PROT_EXEC ;
}
}
else if ( ( prot & PROT_READ ) & &
( prot & PROT_EXEC ) & &
! ( capabilities & BDI_CAP_EXEC_MAP )
) {
/* backing file is not executable, try to copy */
capabilities & = ~ BDI_CAP_MAP_DIRECT ;
}
}
else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
capabilities = BDI_CAP_MAP_COPY ;
/* handle PROT_EXEC implication by PROT_READ */
if ( ( prot & PROT_READ ) & &
( current - > personality & READ_IMPLIES_EXEC ) )
prot | = PROT_EXEC ;
}
/* allow the security API to have its say */
2007-06-28 23:55:21 +04:00
ret = security_file_mmap ( file , reqprot , prot , flags , addr , 0 ) ;
2005-04-17 02:20:36 +04:00
if ( ret < 0 )
return ret ;
/* looks okay */
* _capabilities = capabilities ;
return 0 ;
}
/*
* we ' ve determined that we can make the mapping , now translate what we
* now know into VMA flags
*/
static unsigned long determine_vm_flags ( struct file * file ,
unsigned long prot ,
unsigned long flags ,
unsigned long capabilities )
{
unsigned long vm_flags ;
vm_flags = calc_vm_prot_bits ( prot ) | calc_vm_flag_bits ( flags ) ;
/* vm_flags |= mm->def_flags; */
if ( ! ( capabilities & BDI_CAP_MAP_DIRECT ) ) {
/* attempt to share read-only copies of mapped file chunks */
2010-05-26 10:43:00 +04:00
vm_flags | = VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC ;
2005-04-17 02:20:36 +04:00
if ( file & & ! ( prot & PROT_WRITE ) )
vm_flags | = VM_MAYSHARE ;
2010-05-26 10:43:00 +04:00
} else {
2005-04-17 02:20:36 +04:00
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs , ramfs / tmpfs / shmfs and
* romfs / cramfs */
2010-05-26 10:43:00 +04:00
vm_flags | = VM_MAYSHARE | ( capabilities & BDI_CAP_VMFLAGS ) ;
2005-04-17 02:20:36 +04:00
if ( flags & MAP_SHARED )
2010-05-26 10:43:00 +04:00
vm_flags | = VM_SHARED ;
2005-04-17 02:20:36 +04:00
}
/* refuse to let anyone share private mappings with this process if
* it ' s being traced - otherwise breakpoints set in it may interfere
* with another untraced process
*/
2008-07-26 06:45:50 +04:00
if ( ( flags & MAP_PRIVATE ) & & tracehook_expect_breakpoints ( current ) )
2005-04-17 02:20:36 +04:00
vm_flags & = ~ VM_MAYSHARE ;
return vm_flags ;
}
/*
2009-01-08 15:04:47 +03:00
* set up a shared mapping on a file ( the driver or filesystem provides and
* pins the storage )
2005-04-17 02:20:36 +04:00
*/
2009-01-08 15:04:47 +03:00
static int do_mmap_shared_file ( struct vm_area_struct * vma )
2005-04-17 02:20:36 +04:00
{
int ret ;
ret = vma - > vm_file - > f_op - > mmap ( vma - > vm_file , vma ) ;
2009-01-08 15:04:47 +03:00
if ( ret = = 0 ) {
vma - > vm_region - > vm_top = vma - > vm_region - > vm_end ;
NOMMU: Fix MAP_PRIVATE mmap() of objects where the data can be mapped directly
Fix MAP_PRIVATE mmap() of files and devices where the data in the backing store
might be mapped directly. Use the BDI_CAP_MAP_DIRECT capability flag to govern
whether or not we should be trying to map a file directly. This can be used to
determine whether or not a region has been filled in at the point where we call
do_mmap_shared() or do_mmap_private().
The BDI_CAP_MAP_DIRECT capability flag is cleared by validate_mmap_request() if
there's any reason we can't use it. It's also cleared in do_mmap_pgoff() if
f_op->get_unmapped_area() fails.
Without this fix, attempting to run a program from a RomFS image on a
non-mappable MTD partition results in a BUG as the kernel attempts XIP, and
this can be caught in gdb:
Program received signal SIGABRT, Aborted.
0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
(gdb) bt
#0 0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
#1 0xc005f168 in do_mmap_pgoff (file=0xc31a6620, addr=<value optimized out>, len=3808, prot=3, flags=6146, pgoff=0) at mm/nommu.c:1373
#2 0xc00a96b8 in elf_fdpic_map_file (params=0xc33fbbec, file=0xc31a6620, mm=0xc31bef60, what=0xc0213144 "executable") at mm.h:1145
#3 0xc00aa8b4 in load_elf_fdpic_binary (bprm=0xc316cb00, regs=<value optimized out>) at fs/binfmt_elf_fdpic.c:343
#4 0xc006b588 in search_binary_handler (bprm=0x6, regs=0xc33fbce0) at fs/exec.c:1234
#5 0xc006c648 in do_execve (filename=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460, regs=0xc33fbce0) at fs/exec.c:1356
#6 0xc0008cf0 in sys_execve (name=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460) at arch/frv/kernel/process.c:263
#7 0xc00075dc in __syscall_call () at arch/frv/kernel/entry.S:897
Note that this fix does the following commit differently:
commit a190887b58c32d19c2eee007c5eb8faa970a69ba
Author: David Howells <dhowells@redhat.com>
Date: Sat Sep 5 11:17:07 2009 -0700
nommu: fix error handling in do_mmap_pgoff()
Reported-by: Graff Yang <graff.yang@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-24 18:13:10 +04:00
return 0 ;
2009-01-08 15:04:47 +03:00
}
2005-04-17 02:20:36 +04:00
if ( ret ! = - ENOSYS )
return ret ;
2010-03-23 23:35:21 +03:00
/* getting -ENOSYS indicates that direct mmap isn't possible (as
* opposed to tried but failed ) so we can only give a suitable error as
* it ' s not possible to make a private copy if MAP_SHARED was given */
2005-04-17 02:20:36 +04:00
return - ENODEV ;
}
/*
* set up a private mapping or an anonymous shared mapping
*/
2009-01-08 15:04:47 +03:00
static int do_mmap_private ( struct vm_area_struct * vma ,
struct vm_region * region ,
NOMMU: Fix MAP_PRIVATE mmap() of objects where the data can be mapped directly
Fix MAP_PRIVATE mmap() of files and devices where the data in the backing store
might be mapped directly. Use the BDI_CAP_MAP_DIRECT capability flag to govern
whether or not we should be trying to map a file directly. This can be used to
determine whether or not a region has been filled in at the point where we call
do_mmap_shared() or do_mmap_private().
The BDI_CAP_MAP_DIRECT capability flag is cleared by validate_mmap_request() if
there's any reason we can't use it. It's also cleared in do_mmap_pgoff() if
f_op->get_unmapped_area() fails.
Without this fix, attempting to run a program from a RomFS image on a
non-mappable MTD partition results in a BUG as the kernel attempts XIP, and
this can be caught in gdb:
Program received signal SIGABRT, Aborted.
0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
(gdb) bt
#0 0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
#1 0xc005f168 in do_mmap_pgoff (file=0xc31a6620, addr=<value optimized out>, len=3808, prot=3, flags=6146, pgoff=0) at mm/nommu.c:1373
#2 0xc00a96b8 in elf_fdpic_map_file (params=0xc33fbbec, file=0xc31a6620, mm=0xc31bef60, what=0xc0213144 "executable") at mm.h:1145
#3 0xc00aa8b4 in load_elf_fdpic_binary (bprm=0xc316cb00, regs=<value optimized out>) at fs/binfmt_elf_fdpic.c:343
#4 0xc006b588 in search_binary_handler (bprm=0x6, regs=0xc33fbce0) at fs/exec.c:1234
#5 0xc006c648 in do_execve (filename=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460, regs=0xc33fbce0) at fs/exec.c:1356
#6 0xc0008cf0 in sys_execve (name=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460) at arch/frv/kernel/process.c:263
#7 0xc00075dc in __syscall_call () at arch/frv/kernel/entry.S:897
Note that this fix does the following commit differently:
commit a190887b58c32d19c2eee007c5eb8faa970a69ba
Author: David Howells <dhowells@redhat.com>
Date: Sat Sep 5 11:17:07 2009 -0700
nommu: fix error handling in do_mmap_pgoff()
Reported-by: Graff Yang <graff.yang@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-24 18:13:10 +04:00
unsigned long len ,
unsigned long capabilities )
2005-04-17 02:20:36 +04:00
{
2009-01-08 15:04:47 +03:00
struct page * pages ;
unsigned long total , point , n , rlen ;
2005-04-17 02:20:36 +04:00
void * base ;
2009-01-08 15:04:47 +03:00
int ret , order ;
2005-04-17 02:20:36 +04:00
/* invoke the file's mapping function so that it can keep track of
* shared mappings on devices or memory
* - VM_MAYSHARE will be set if it may attempt to share
*/
NOMMU: Fix MAP_PRIVATE mmap() of objects where the data can be mapped directly
Fix MAP_PRIVATE mmap() of files and devices where the data in the backing store
might be mapped directly. Use the BDI_CAP_MAP_DIRECT capability flag to govern
whether or not we should be trying to map a file directly. This can be used to
determine whether or not a region has been filled in at the point where we call
do_mmap_shared() or do_mmap_private().
The BDI_CAP_MAP_DIRECT capability flag is cleared by validate_mmap_request() if
there's any reason we can't use it. It's also cleared in do_mmap_pgoff() if
f_op->get_unmapped_area() fails.
Without this fix, attempting to run a program from a RomFS image on a
non-mappable MTD partition results in a BUG as the kernel attempts XIP, and
this can be caught in gdb:
Program received signal SIGABRT, Aborted.
0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
(gdb) bt
#0 0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
#1 0xc005f168 in do_mmap_pgoff (file=0xc31a6620, addr=<value optimized out>, len=3808, prot=3, flags=6146, pgoff=0) at mm/nommu.c:1373
#2 0xc00a96b8 in elf_fdpic_map_file (params=0xc33fbbec, file=0xc31a6620, mm=0xc31bef60, what=0xc0213144 "executable") at mm.h:1145
#3 0xc00aa8b4 in load_elf_fdpic_binary (bprm=0xc316cb00, regs=<value optimized out>) at fs/binfmt_elf_fdpic.c:343
#4 0xc006b588 in search_binary_handler (bprm=0x6, regs=0xc33fbce0) at fs/exec.c:1234
#5 0xc006c648 in do_execve (filename=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460, regs=0xc33fbce0) at fs/exec.c:1356
#6 0xc0008cf0 in sys_execve (name=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460) at arch/frv/kernel/process.c:263
#7 0xc00075dc in __syscall_call () at arch/frv/kernel/entry.S:897
Note that this fix does the following commit differently:
commit a190887b58c32d19c2eee007c5eb8faa970a69ba
Author: David Howells <dhowells@redhat.com>
Date: Sat Sep 5 11:17:07 2009 -0700
nommu: fix error handling in do_mmap_pgoff()
Reported-by: Graff Yang <graff.yang@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-24 18:13:10 +04:00
if ( capabilities & BDI_CAP_MAP_DIRECT ) {
2005-04-17 02:20:36 +04:00
ret = vma - > vm_file - > f_op - > mmap ( vma - > vm_file , vma ) ;
2009-01-08 15:04:47 +03:00
if ( ret = = 0 ) {
2005-04-17 02:20:36 +04:00
/* shouldn't return success if we're not sharing */
2009-01-08 15:04:47 +03:00
BUG_ON ( ! ( vma - > vm_flags & VM_MAYSHARE ) ) ;
vma - > vm_region - > vm_top = vma - > vm_region - > vm_end ;
NOMMU: Fix MAP_PRIVATE mmap() of objects where the data can be mapped directly
Fix MAP_PRIVATE mmap() of files and devices where the data in the backing store
might be mapped directly. Use the BDI_CAP_MAP_DIRECT capability flag to govern
whether or not we should be trying to map a file directly. This can be used to
determine whether or not a region has been filled in at the point where we call
do_mmap_shared() or do_mmap_private().
The BDI_CAP_MAP_DIRECT capability flag is cleared by validate_mmap_request() if
there's any reason we can't use it. It's also cleared in do_mmap_pgoff() if
f_op->get_unmapped_area() fails.
Without this fix, attempting to run a program from a RomFS image on a
non-mappable MTD partition results in a BUG as the kernel attempts XIP, and
this can be caught in gdb:
Program received signal SIGABRT, Aborted.
0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
(gdb) bt
#0 0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
#1 0xc005f168 in do_mmap_pgoff (file=0xc31a6620, addr=<value optimized out>, len=3808, prot=3, flags=6146, pgoff=0) at mm/nommu.c:1373
#2 0xc00a96b8 in elf_fdpic_map_file (params=0xc33fbbec, file=0xc31a6620, mm=0xc31bef60, what=0xc0213144 "executable") at mm.h:1145
#3 0xc00aa8b4 in load_elf_fdpic_binary (bprm=0xc316cb00, regs=<value optimized out>) at fs/binfmt_elf_fdpic.c:343
#4 0xc006b588 in search_binary_handler (bprm=0x6, regs=0xc33fbce0) at fs/exec.c:1234
#5 0xc006c648 in do_execve (filename=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460, regs=0xc33fbce0) at fs/exec.c:1356
#6 0xc0008cf0 in sys_execve (name=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460) at arch/frv/kernel/process.c:263
#7 0xc00075dc in __syscall_call () at arch/frv/kernel/entry.S:897
Note that this fix does the following commit differently:
commit a190887b58c32d19c2eee007c5eb8faa970a69ba
Author: David Howells <dhowells@redhat.com>
Date: Sat Sep 5 11:17:07 2009 -0700
nommu: fix error handling in do_mmap_pgoff()
Reported-by: Graff Yang <graff.yang@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-24 18:13:10 +04:00
return 0 ;
2005-04-17 02:20:36 +04:00
}
2009-01-08 15:04:47 +03:00
if ( ret ! = - ENOSYS )
return ret ;
2005-04-17 02:20:36 +04:00
/* getting an ENOSYS error indicates that direct mmap isn't
* possible ( as opposed to tried but failed ) so we ' ll try to
* make a private copy of the data and map that instead */
}
2009-01-08 15:04:47 +03:00
rlen = PAGE_ALIGN ( len ) ;
2005-04-17 02:20:36 +04:00
/* allocate some memory to hold the mapping
* - note that this may not return a page - aligned address if the object
* we ' re allocating is smaller than a page
*/
2009-01-08 15:04:47 +03:00
order = get_order ( rlen ) ;
kdebug ( " alloc order %d for %lx " , order , len ) ;
pages = alloc_pages ( GFP_KERNEL , order ) ;
if ( ! pages )
2005-04-17 02:20:36 +04:00
goto enomem ;
2009-01-08 15:04:47 +03:00
total = 1 < < order ;
2009-04-03 03:56:32 +04:00
atomic_long_add ( total , & mmap_pages_allocated ) ;
2009-01-08 15:04:47 +03:00
point = rlen > > PAGE_SHIFT ;
2009-01-08 15:04:47 +03:00
/* we allocated a power-of-2 sized page set, so we may want to trim off
* the excess */
if ( sysctl_nr_trim_pages & & total - point > = sysctl_nr_trim_pages ) {
while ( total > point ) {
order = ilog2 ( total - point ) ;
n = 1 < < order ;
kdebug ( " shave %lu/%lu @%lu " , n , total - point , total ) ;
2009-04-03 03:56:32 +04:00
atomic_long_sub ( n , & mmap_pages_allocated ) ;
2009-01-08 15:04:47 +03:00
total - = n ;
set_page_refcounted ( pages + total ) ;
__free_pages ( pages + total , order ) ;
}
2009-01-08 15:04:47 +03:00
}
for ( point = 1 ; point < total ; point + + )
set_page_refcounted ( & pages [ point ] ) ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
base = page_address ( pages ) ;
region - > vm_flags = vma - > vm_flags | = VM_MAPPED_COPY ;
region - > vm_start = ( unsigned long ) base ;
region - > vm_end = region - > vm_start + rlen ;
2009-01-08 15:04:47 +03:00
region - > vm_top = region - > vm_start + ( total < < PAGE_SHIFT ) ;
2009-01-08 15:04:47 +03:00
vma - > vm_start = region - > vm_start ;
vma - > vm_end = region - > vm_start + len ;
2005-04-17 02:20:36 +04:00
if ( vma - > vm_file ) {
/* read the contents of a file into the copy */
mm_segment_t old_fs ;
loff_t fpos ;
fpos = vma - > vm_pgoff ;
fpos < < = PAGE_SHIFT ;
old_fs = get_fs ( ) ;
set_fs ( KERNEL_DS ) ;
2009-01-08 15:04:47 +03:00
ret = vma - > vm_file - > f_op - > read ( vma - > vm_file , base , rlen , & fpos ) ;
2005-04-17 02:20:36 +04:00
set_fs ( old_fs ) ;
if ( ret < 0 )
goto error_free ;
/* clear the last little bit */
2009-01-08 15:04:47 +03:00
if ( ret < rlen )
memset ( base + ret , 0 , rlen - ret ) ;
2005-04-17 02:20:36 +04:00
}
return 0 ;
error_free :
2009-01-08 15:04:47 +03:00
free_page_series ( region - > vm_start , region - > vm_end ) ;
region - > vm_start = vma - > vm_start = 0 ;
region - > vm_end = vma - > vm_end = 0 ;
2009-01-08 15:04:47 +03:00
region - > vm_top = 0 ;
2005-04-17 02:20:36 +04:00
return ret ;
enomem :
2009-01-13 10:30:22 +03:00
printk ( " Allocation of length %lu from process %d (%s) failed \n " ,
len , current - > pid , current - > comm ) ;
2005-04-17 02:20:36 +04:00
show_free_areas ( ) ;
return - ENOMEM ;
}
/*
* handle mapping creation for uClinux
*/
unsigned long do_mmap_pgoff ( struct file * file ,
unsigned long addr ,
unsigned long len ,
unsigned long prot ,
unsigned long flags ,
unsigned long pgoff )
{
2009-01-08 15:04:47 +03:00
struct vm_area_struct * vma ;
struct vm_region * region ;
2005-04-17 02:20:36 +04:00
struct rb_node * rb ;
2009-01-08 15:04:47 +03:00
unsigned long capabilities , vm_flags , result ;
2005-04-17 02:20:36 +04:00
int ret ;
2009-01-08 15:04:47 +03:00
kenter ( " ,%lx,%lx,%lx,%lx,%lx " , addr , len , prot , flags , pgoff ) ;
2005-04-17 02:20:36 +04:00
/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request ( file , addr , len , prot , flags , pgoff ,
& capabilities ) ;
2009-01-08 15:04:47 +03:00
if ( ret < 0 ) {
kleave ( " = %d [val] " , ret ) ;
2005-04-17 02:20:36 +04:00
return ret ;
2009-01-08 15:04:47 +03:00
}
2005-04-17 02:20:36 +04:00
2009-09-24 15:33:48 +04:00
/* we ignore the address hint */
addr = 0 ;
2005-04-17 02:20:36 +04:00
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
vm_flags = determine_vm_flags ( file , prot , flags , capabilities ) ;
2009-01-08 15:04:47 +03:00
/* we're going to need to record the mapping */
region = kmem_cache_zalloc ( vm_region_jar , GFP_KERNEL ) ;
if ( ! region )
goto error_getting_region ;
vma = kmem_cache_zalloc ( vm_area_cachep , GFP_KERNEL ) ;
if ( ! vma )
goto error_getting_vma ;
2005-04-17 02:20:36 +04:00
2010-01-16 04:01:33 +03:00
region - > vm_usage = 1 ;
2009-01-08 15:04:47 +03:00
region - > vm_flags = vm_flags ;
region - > vm_pgoff = pgoff ;
mm: change anon_vma linking to fix multi-process server scalability issue
The old anon_vma code can lead to scalability issues with heavily forking
workloads. Specifically, each anon_vma will be shared between the parent
process and all its child processes.
In a workload with 1000 child processes and a VMA with 1000 anonymous
pages per process that get COWed, this leads to a system with a million
anonymous pages in the same anon_vma, each of which is mapped in just one
of the 1000 processes. However, the current rmap code needs to walk them
all, leading to O(N) scanning complexity for each page.
This can result in systems where one CPU is walking the page tables of
1000 processes in page_referenced_one, while all other CPUs are stuck on
the anon_vma lock. This leads to catastrophic failure for a benchmark
like AIM7, where the total number of processes can reach in the tens of
thousands. Real workloads are still a factor 10 less process intensive
than AIM7, but they are catching up.
This patch changes the way anon_vmas and VMAs are linked, which allows us
to associate multiple anon_vmas with a VMA. At fork time, each child
process gets its own anon_vmas, in which its COWed pages will be
instantiated. The parents' anon_vma is also linked to the VMA, because
non-COWed pages could be present in any of the children.
This reduces rmap scanning complexity to O(1) for the pages of the 1000
child processes, with O(N) complexity for at most 1/N pages in the system.
This reduces the average scanning cost in heavily forking workloads from
O(N) to 2.
The only real complexity in this patch stems from the fact that linking a
VMA to anon_vmas now involves memory allocations. This means vma_adjust
can fail, if it needs to attach a VMA to anon_vma structures. This in
turn means error handling needs to be added to the calling functions.
A second source of complexity is that, because there can be multiple
anon_vmas, the anon_vma linking in vma_adjust can no longer be done under
"the" anon_vma lock. To prevent the rmap code from walking up an
incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit
flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h
to make sure it is impossible to compile a kernel that needs both symbolic
values for the same bitflag.
Some test results:
Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test
box with 16GB RAM and not quite enough IO), the system ends up running
>99% in system time, with every CPU on the same anon_vma lock in the
pageout code.
With these changes, AIM7 hits the cross-over point around 29.7k users.
This happens with ~99% IO wait time, there never seems to be any spike in
system time. The anon_vma lock contention appears to be resolved.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-06 00:42:07 +03:00
INIT_LIST_HEAD ( & vma - > anon_vma_chain ) ;
2009-01-08 15:04:47 +03:00
vma - > vm_flags = vm_flags ;
vma - > vm_pgoff = pgoff ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
if ( file ) {
region - > vm_file = file ;
get_file ( file ) ;
vma - > vm_file = file ;
get_file ( file ) ;
if ( vm_flags & VM_EXECUTABLE ) {
added_exe_file_vma ( current - > mm ) ;
vma - > vm_mm = current - > mm ;
}
}
down_write ( & nommu_region_sem ) ;
/* if we want to share, we need to check for regions created by other
2005-04-17 02:20:36 +04:00
* mmap ( ) calls that overlap with our proposed mapping
2009-01-08 15:04:47 +03:00
* - we can only share with a superset match on most regular files
2005-04-17 02:20:36 +04:00
* - shared mappings on character devices and memory backed files are
* permitted to overlap inexactly as far as we are concerned for in
* these cases , sharing is handled in the driver or filesystem rather
* than here
*/
if ( vm_flags & VM_MAYSHARE ) {
2009-01-08 15:04:47 +03:00
struct vm_region * pregion ;
unsigned long pglen , rpglen , pgend , rpgend , start ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
pglen = ( len + PAGE_SIZE - 1 ) > > PAGE_SHIFT ;
pgend = pgoff + pglen ;
2007-03-22 11:11:24 +03:00
2009-01-08 15:04:47 +03:00
for ( rb = rb_first ( & nommu_region_tree ) ; rb ; rb = rb_next ( rb ) ) {
pregion = rb_entry ( rb , struct vm_region , vm_rb ) ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
if ( ! ( pregion - > vm_flags & VM_MAYSHARE ) )
2005-04-17 02:20:36 +04:00
continue ;
/* search for overlapping mappings on the same file */
2009-01-08 15:04:47 +03:00
if ( pregion - > vm_file - > f_path . dentry - > d_inode ! =
file - > f_path . dentry - > d_inode )
2005-04-17 02:20:36 +04:00
continue ;
2009-01-08 15:04:47 +03:00
if ( pregion - > vm_pgoff > = pgend )
2005-04-17 02:20:36 +04:00
continue ;
2009-01-08 15:04:47 +03:00
rpglen = pregion - > vm_end - pregion - > vm_start ;
rpglen = ( rpglen + PAGE_SIZE - 1 ) > > PAGE_SHIFT ;
rpgend = pregion - > vm_pgoff + rpglen ;
if ( pgoff > = rpgend )
2005-04-17 02:20:36 +04:00
continue ;
2009-01-08 15:04:47 +03:00
/* handle inexactly overlapping matches between
* mappings */
if ( ( pregion - > vm_pgoff ! = pgoff | | rpglen ! = pglen ) & &
! ( pgoff > = pregion - > vm_pgoff & & pgend < = rpgend ) ) {
/* new mapping is not a subset of the region */
2005-04-17 02:20:36 +04:00
if ( ! ( capabilities & BDI_CAP_MAP_DIRECT ) )
goto sharing_violation ;
continue ;
}
2009-01-08 15:04:47 +03:00
/* we've found a region we can share */
2010-01-16 04:01:33 +03:00
pregion - > vm_usage + + ;
2009-01-08 15:04:47 +03:00
vma - > vm_region = pregion ;
start = pregion - > vm_start ;
start + = ( pgoff - pregion - > vm_pgoff ) < < PAGE_SHIFT ;
vma - > vm_start = start ;
vma - > vm_end = start + len ;
if ( pregion - > vm_flags & VM_MAPPED_COPY ) {
kdebug ( " share copy " ) ;
vma - > vm_flags | = VM_MAPPED_COPY ;
} else {
kdebug ( " share mmap " ) ;
ret = do_mmap_shared_file ( vma ) ;
if ( ret < 0 ) {
vma - > vm_region = NULL ;
vma - > vm_start = 0 ;
vma - > vm_end = 0 ;
2010-01-16 04:01:33 +03:00
pregion - > vm_usage - - ;
2009-01-08 15:04:47 +03:00
pregion = NULL ;
goto error_just_free ;
}
}
fput ( region - > vm_file ) ;
kmem_cache_free ( vm_region_jar , region ) ;
region = pregion ;
result = start ;
goto share ;
2005-04-17 02:20:36 +04:00
}
/* obtain the address at which to make a shared mapping
* - this is the hook for quasi - memory character devices to
* tell us the location of a shared mapping
*/
NOMMU: Fix MAP_PRIVATE mmap() of objects where the data can be mapped directly
Fix MAP_PRIVATE mmap() of files and devices where the data in the backing store
might be mapped directly. Use the BDI_CAP_MAP_DIRECT capability flag to govern
whether or not we should be trying to map a file directly. This can be used to
determine whether or not a region has been filled in at the point where we call
do_mmap_shared() or do_mmap_private().
The BDI_CAP_MAP_DIRECT capability flag is cleared by validate_mmap_request() if
there's any reason we can't use it. It's also cleared in do_mmap_pgoff() if
f_op->get_unmapped_area() fails.
Without this fix, attempting to run a program from a RomFS image on a
non-mappable MTD partition results in a BUG as the kernel attempts XIP, and
this can be caught in gdb:
Program received signal SIGABRT, Aborted.
0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
(gdb) bt
#0 0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
#1 0xc005f168 in do_mmap_pgoff (file=0xc31a6620, addr=<value optimized out>, len=3808, prot=3, flags=6146, pgoff=0) at mm/nommu.c:1373
#2 0xc00a96b8 in elf_fdpic_map_file (params=0xc33fbbec, file=0xc31a6620, mm=0xc31bef60, what=0xc0213144 "executable") at mm.h:1145
#3 0xc00aa8b4 in load_elf_fdpic_binary (bprm=0xc316cb00, regs=<value optimized out>) at fs/binfmt_elf_fdpic.c:343
#4 0xc006b588 in search_binary_handler (bprm=0x6, regs=0xc33fbce0) at fs/exec.c:1234
#5 0xc006c648 in do_execve (filename=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460, regs=0xc33fbce0) at fs/exec.c:1356
#6 0xc0008cf0 in sys_execve (name=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460) at arch/frv/kernel/process.c:263
#7 0xc00075dc in __syscall_call () at arch/frv/kernel/entry.S:897
Note that this fix does the following commit differently:
commit a190887b58c32d19c2eee007c5eb8faa970a69ba
Author: David Howells <dhowells@redhat.com>
Date: Sat Sep 5 11:17:07 2009 -0700
nommu: fix error handling in do_mmap_pgoff()
Reported-by: Graff Yang <graff.yang@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-24 18:13:10 +04:00
if ( capabilities & BDI_CAP_MAP_DIRECT ) {
2005-04-17 02:20:36 +04:00
addr = file - > f_op - > get_unmapped_area ( file , addr , len ,
pgoff , flags ) ;
if ( IS_ERR ( ( void * ) addr ) ) {
ret = addr ;
if ( ret ! = ( unsigned long ) - ENOSYS )
2009-01-08 15:04:47 +03:00
goto error_just_free ;
2005-04-17 02:20:36 +04:00
/* the driver refused to tell us where to site
* the mapping so we ' ll have to attempt to copy
* it */
ret = ( unsigned long ) - ENODEV ;
if ( ! ( capabilities & BDI_CAP_MAP_COPY ) )
2009-01-08 15:04:47 +03:00
goto error_just_free ;
2005-04-17 02:20:36 +04:00
capabilities & = ~ BDI_CAP_MAP_DIRECT ;
2009-01-08 15:04:47 +03:00
} else {
vma - > vm_start = region - > vm_start = addr ;
vma - > vm_end = region - > vm_end = addr + len ;
2005-04-17 02:20:36 +04:00
}
}
}
2009-01-08 15:04:47 +03:00
vma - > vm_region = region ;
2005-04-17 02:20:36 +04:00
NOMMU: Fix MAP_PRIVATE mmap() of objects where the data can be mapped directly
Fix MAP_PRIVATE mmap() of files and devices where the data in the backing store
might be mapped directly. Use the BDI_CAP_MAP_DIRECT capability flag to govern
whether or not we should be trying to map a file directly. This can be used to
determine whether or not a region has been filled in at the point where we call
do_mmap_shared() or do_mmap_private().
The BDI_CAP_MAP_DIRECT capability flag is cleared by validate_mmap_request() if
there's any reason we can't use it. It's also cleared in do_mmap_pgoff() if
f_op->get_unmapped_area() fails.
Without this fix, attempting to run a program from a RomFS image on a
non-mappable MTD partition results in a BUG as the kernel attempts XIP, and
this can be caught in gdb:
Program received signal SIGABRT, Aborted.
0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
(gdb) bt
#0 0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
#1 0xc005f168 in do_mmap_pgoff (file=0xc31a6620, addr=<value optimized out>, len=3808, prot=3, flags=6146, pgoff=0) at mm/nommu.c:1373
#2 0xc00a96b8 in elf_fdpic_map_file (params=0xc33fbbec, file=0xc31a6620, mm=0xc31bef60, what=0xc0213144 "executable") at mm.h:1145
#3 0xc00aa8b4 in load_elf_fdpic_binary (bprm=0xc316cb00, regs=<value optimized out>) at fs/binfmt_elf_fdpic.c:343
#4 0xc006b588 in search_binary_handler (bprm=0x6, regs=0xc33fbce0) at fs/exec.c:1234
#5 0xc006c648 in do_execve (filename=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460, regs=0xc33fbce0) at fs/exec.c:1356
#6 0xc0008cf0 in sys_execve (name=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460) at arch/frv/kernel/process.c:263
#7 0xc00075dc in __syscall_call () at arch/frv/kernel/entry.S:897
Note that this fix does the following commit differently:
commit a190887b58c32d19c2eee007c5eb8faa970a69ba
Author: David Howells <dhowells@redhat.com>
Date: Sat Sep 5 11:17:07 2009 -0700
nommu: fix error handling in do_mmap_pgoff()
Reported-by: Graff Yang <graff.yang@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-24 18:13:10 +04:00
/* set up the mapping
* - the region is filled in if BDI_CAP_MAP_DIRECT is still set
*/
2005-04-17 02:20:36 +04:00
if ( file & & vma - > vm_flags & VM_SHARED )
2009-01-08 15:04:47 +03:00
ret = do_mmap_shared_file ( vma ) ;
2005-04-17 02:20:36 +04:00
else
NOMMU: Fix MAP_PRIVATE mmap() of objects where the data can be mapped directly
Fix MAP_PRIVATE mmap() of files and devices where the data in the backing store
might be mapped directly. Use the BDI_CAP_MAP_DIRECT capability flag to govern
whether or not we should be trying to map a file directly. This can be used to
determine whether or not a region has been filled in at the point where we call
do_mmap_shared() or do_mmap_private().
The BDI_CAP_MAP_DIRECT capability flag is cleared by validate_mmap_request() if
there's any reason we can't use it. It's also cleared in do_mmap_pgoff() if
f_op->get_unmapped_area() fails.
Without this fix, attempting to run a program from a RomFS image on a
non-mappable MTD partition results in a BUG as the kernel attempts XIP, and
this can be caught in gdb:
Program received signal SIGABRT, Aborted.
0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
(gdb) bt
#0 0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
#1 0xc005f168 in do_mmap_pgoff (file=0xc31a6620, addr=<value optimized out>, len=3808, prot=3, flags=6146, pgoff=0) at mm/nommu.c:1373
#2 0xc00a96b8 in elf_fdpic_map_file (params=0xc33fbbec, file=0xc31a6620, mm=0xc31bef60, what=0xc0213144 "executable") at mm.h:1145
#3 0xc00aa8b4 in load_elf_fdpic_binary (bprm=0xc316cb00, regs=<value optimized out>) at fs/binfmt_elf_fdpic.c:343
#4 0xc006b588 in search_binary_handler (bprm=0x6, regs=0xc33fbce0) at fs/exec.c:1234
#5 0xc006c648 in do_execve (filename=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460, regs=0xc33fbce0) at fs/exec.c:1356
#6 0xc0008cf0 in sys_execve (name=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460) at arch/frv/kernel/process.c:263
#7 0xc00075dc in __syscall_call () at arch/frv/kernel/entry.S:897
Note that this fix does the following commit differently:
commit a190887b58c32d19c2eee007c5eb8faa970a69ba
Author: David Howells <dhowells@redhat.com>
Date: Sat Sep 5 11:17:07 2009 -0700
nommu: fix error handling in do_mmap_pgoff()
Reported-by: Graff Yang <graff.yang@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-24 18:13:10 +04:00
ret = do_mmap_private ( vma , region , len , capabilities ) ;
2005-04-17 02:20:36 +04:00
if ( ret < 0 )
NOMMU: Fix MAP_PRIVATE mmap() of objects where the data can be mapped directly
Fix MAP_PRIVATE mmap() of files and devices where the data in the backing store
might be mapped directly. Use the BDI_CAP_MAP_DIRECT capability flag to govern
whether or not we should be trying to map a file directly. This can be used to
determine whether or not a region has been filled in at the point where we call
do_mmap_shared() or do_mmap_private().
The BDI_CAP_MAP_DIRECT capability flag is cleared by validate_mmap_request() if
there's any reason we can't use it. It's also cleared in do_mmap_pgoff() if
f_op->get_unmapped_area() fails.
Without this fix, attempting to run a program from a RomFS image on a
non-mappable MTD partition results in a BUG as the kernel attempts XIP, and
this can be caught in gdb:
Program received signal SIGABRT, Aborted.
0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
(gdb) bt
#0 0xc005dce8 in add_nommu_region (region=<value optimized out>) at mm/nommu.c:547
#1 0xc005f168 in do_mmap_pgoff (file=0xc31a6620, addr=<value optimized out>, len=3808, prot=3, flags=6146, pgoff=0) at mm/nommu.c:1373
#2 0xc00a96b8 in elf_fdpic_map_file (params=0xc33fbbec, file=0xc31a6620, mm=0xc31bef60, what=0xc0213144 "executable") at mm.h:1145
#3 0xc00aa8b4 in load_elf_fdpic_binary (bprm=0xc316cb00, regs=<value optimized out>) at fs/binfmt_elf_fdpic.c:343
#4 0xc006b588 in search_binary_handler (bprm=0x6, regs=0xc33fbce0) at fs/exec.c:1234
#5 0xc006c648 in do_execve (filename=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460, regs=0xc33fbce0) at fs/exec.c:1356
#6 0xc0008cf0 in sys_execve (name=<value optimized out>, argv=0xc3ad14cc, envp=0xc3ad1460) at arch/frv/kernel/process.c:263
#7 0xc00075dc in __syscall_call () at arch/frv/kernel/entry.S:897
Note that this fix does the following commit differently:
commit a190887b58c32d19c2eee007c5eb8faa970a69ba
Author: David Howells <dhowells@redhat.com>
Date: Sat Sep 5 11:17:07 2009 -0700
nommu: fix error handling in do_mmap_pgoff()
Reported-by: Graff Yang <graff.yang@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-24 18:13:10 +04:00
goto error_just_free ;
add_nommu_region ( region ) ;
2009-01-08 15:04:47 +03:00
2009-12-15 05:00:02 +03:00
/* clear anonymous mappings that don't ask for uninitialized data */
if ( ! vma - > vm_file & & ! ( flags & MAP_UNINITIALIZED ) )
memset ( ( void * ) region - > vm_start , 0 ,
region - > vm_end - region - > vm_start ) ;
2005-04-17 02:20:36 +04:00
/* okay... we have a mapping; now we have to register it */
2009-01-08 15:04:47 +03:00
result = vma - > vm_start ;
2005-04-17 02:20:36 +04:00
current - > mm - > total_vm + = len > > PAGE_SHIFT ;
2009-01-08 15:04:47 +03:00
share :
add_vma_to_mm ( current - > mm , vma ) ;
2005-04-17 02:20:36 +04:00
NOMMU: Avoiding duplicate icache flushes of shared maps
When working with FDPIC, there are many shared mappings of read-only
code regions between applications (the C library, applet packages like
busybox, etc.), but the current do_mmap_pgoff() function will issue an
icache flush whenever a VMA is added to an MM instead of only doing it
when the map is initially created.
The flush can instead be done when a region is first mmapped PROT_EXEC.
Note that we may not rely on the first mapping of a region being
executable - it's possible for it to be PROT_READ only, so we have to
remember whether we've flushed the region or not, and then flush the
entire region when a bit of it is made executable.
However, this also affects the brk area. That will no longer be
executable. We can mprotect() it to PROT_EXEC on MPU-mode kernels, but
for NOMMU mode kernels, when it increases the brk allocation, making
sys_brk() flush the extra from the icache should suffice. The brk area
probably isn't used by NOMMU programs since the brk area can only use up
the leavings from the stack allocation, where the stack allocation is
larger than requested.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-06 20:23:23 +03:00
/* we flush the region from the icache only when the first executable
* mapping of it is made */
if ( vma - > vm_flags & VM_EXEC & & ! region - > vm_icache_flushed ) {
flush_icache_range ( region - > vm_start , region - > vm_end ) ;
region - > vm_icache_flushed = true ;
}
2005-04-17 02:20:36 +04:00
NOMMU: Avoiding duplicate icache flushes of shared maps
When working with FDPIC, there are many shared mappings of read-only
code regions between applications (the C library, applet packages like
busybox, etc.), but the current do_mmap_pgoff() function will issue an
icache flush whenever a VMA is added to an MM instead of only doing it
when the map is initially created.
The flush can instead be done when a region is first mmapped PROT_EXEC.
Note that we may not rely on the first mapping of a region being
executable - it's possible for it to be PROT_READ only, so we have to
remember whether we've flushed the region or not, and then flush the
entire region when a bit of it is made executable.
However, this also affects the brk area. That will no longer be
executable. We can mprotect() it to PROT_EXEC on MPU-mode kernels, but
for NOMMU mode kernels, when it increases the brk allocation, making
sys_brk() flush the extra from the icache should suffice. The brk area
probably isn't used by NOMMU programs since the brk area can only use up
the leavings from the stack allocation, where the stack allocation is
larger than requested.
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-06 20:23:23 +03:00
up_write ( & nommu_region_sem ) ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
kleave ( " = %lx " , result ) ;
return result ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
error_just_free :
up_write ( & nommu_region_sem ) ;
error :
2009-10-30 16:13:26 +03:00
if ( region - > vm_file )
fput ( region - > vm_file ) ;
2009-01-08 15:04:47 +03:00
kmem_cache_free ( vm_region_jar , region ) ;
2009-10-30 16:13:26 +03:00
if ( vma - > vm_file )
fput ( vma - > vm_file ) ;
2009-01-08 15:04:47 +03:00
if ( vma - > vm_flags & VM_EXECUTABLE )
removed_exe_file_vma ( vma - > vm_mm ) ;
kmem_cache_free ( vm_area_cachep , vma ) ;
kleave ( " = %d " , ret ) ;
return ret ;
sharing_violation :
up_write ( & nommu_region_sem ) ;
printk ( KERN_WARNING " Attempt to share mismatched mappings \n " ) ;
ret = - EINVAL ;
goto error ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
error_getting_vma :
kmem_cache_free ( vm_region_jar , region ) ;
printk ( KERN_WARNING " Allocation of vma for %lu byte allocation "
" from process %d failed \n " ,
2005-04-17 02:20:36 +04:00
len , current - > pid ) ;
show_free_areas ( ) ;
return - ENOMEM ;
2009-01-08 15:04:47 +03:00
error_getting_region :
printk ( KERN_WARNING " Allocation of vm region for %lu byte allocation "
" from process %d failed \n " ,
2005-04-17 02:20:36 +04:00
len , current - > pid ) ;
show_free_areas ( ) ;
return - ENOMEM ;
}
2007-07-21 15:37:25 +04:00
EXPORT_SYMBOL ( do_mmap_pgoff ) ;
2005-04-17 02:20:36 +04:00
2009-12-30 23:17:34 +03:00
/*
 * mmap_pgoff system call
 * - unless MAP_ANONYMOUS is given, @fd is looked up and -EBADF is returned
 *   if it doesn't refer to an open file
 * - MAP_EXECUTABLE and MAP_DENYWRITE are stripped from @flags
 * - the mapping itself is made by do_mmap_pgoff() with the caller's mmap_sem
 *   held for writing, and its result is returned
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, pgoff)
{
	struct mm_struct *mm = current->mm;
	struct file *file = NULL;
	unsigned long retval;

	if (!(flags & MAP_ANONYMOUS)) {
		file = fget(fd);
		if (!file)
			return -EBADF;
	}

	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	down_write(&mm->mmap_sem);
	retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
	up_write(&mm->mmap_sem);

	/* drop the reference taken by fget() above */
	if (file)
		fput(file);

	return retval;
}
2010-03-11 02:21:15 +03:00
#ifdef __ARCH_WANT_SYS_OLD_MMAP
/* argument block that the old-style mmap() call copies in from userspace */
struct mmap_arg_struct {
	unsigned long addr;
	unsigned long len;
	unsigned long prot;
	unsigned long flags;
	unsigned long fd;
	unsigned long offset;
};

/*
 * old_mmap system call - all six mmap arguments arrive packed in a single
 * userspace structure
 * - returns -EFAULT if the structure cannot be copied in
 * - returns -EINVAL if the byte offset is not page aligned
 * - otherwise forwards to sys_mmap_pgoff() with the offset converted to pages
 */
SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
	struct mmap_arg_struct args;
	unsigned long pgoff;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	if (args.offset & ~PAGE_MASK)
		return -EINVAL;

	pgoff = args.offset >> PAGE_SHIFT;

	return sys_mmap_pgoff(args.addr, args.len, args.prot, args.flags,
			      args.fd, pgoff);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
2005-04-17 02:20:36 +04:00
/*
2009-01-08 15:04:47 +03:00
* split a vma into two pieces at address ' addr ' , a new vma is allocated either
* for the first part or the tail .
2005-04-17 02:20:36 +04:00
*/
2009-01-08 15:04:47 +03:00
int split_vma ( struct mm_struct * mm , struct vm_area_struct * vma ,
unsigned long addr , int new_below )
2005-04-17 02:20:36 +04:00
{
2009-01-08 15:04:47 +03:00
struct vm_area_struct * new ;
struct vm_region * region ;
unsigned long npages ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
kenter ( " " ) ;
2005-04-17 02:20:36 +04:00
2010-01-16 04:01:34 +03:00
/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region ) */
if ( vma - > vm_file )
2009-01-08 15:04:47 +03:00
return - ENOMEM ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
if ( mm - > map_count > = sysctl_max_map_count )
return - ENOMEM ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
region = kmem_cache_alloc ( vm_region_jar , GFP_KERNEL ) ;
if ( ! region )
return - ENOMEM ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
new = kmem_cache_alloc ( vm_area_cachep , GFP_KERNEL ) ;
if ( ! new ) {
kmem_cache_free ( vm_region_jar , region ) ;
return - ENOMEM ;
}
/* most fields are the same, copy all, and then fixup */
* new = * vma ;
* region = * vma - > vm_region ;
new - > vm_region = region ;
npages = ( addr - vma - > vm_start ) > > PAGE_SHIFT ;
if ( new_below ) {
2009-01-08 15:04:47 +03:00
region - > vm_top = region - > vm_end = new - > vm_end = addr ;
2009-01-08 15:04:47 +03:00
} else {
region - > vm_start = new - > vm_start = addr ;
region - > vm_pgoff = new - > vm_pgoff + = npages ;
2005-04-17 02:20:36 +04:00
}
2009-01-08 15:04:47 +03:00
if ( new - > vm_ops & & new - > vm_ops - > open )
new - > vm_ops - > open ( new ) ;
delete_vma_from_mm ( vma ) ;
down_write ( & nommu_region_sem ) ;
delete_nommu_region ( vma - > vm_region ) ;
if ( new_below ) {
vma - > vm_region - > vm_start = vma - > vm_start = addr ;
vma - > vm_region - > vm_pgoff = vma - > vm_pgoff + = npages ;
} else {
vma - > vm_region - > vm_end = vma - > vm_end = addr ;
2009-01-08 15:04:47 +03:00
vma - > vm_region - > vm_top = addr ;
2009-01-08 15:04:47 +03:00
}
add_nommu_region ( vma - > vm_region ) ;
add_nommu_region ( new - > vm_region ) ;
up_write ( & nommu_region_sem ) ;
add_vma_to_mm ( mm , vma ) ;
add_vma_to_mm ( mm , new ) ;
return 0 ;
2005-04-17 02:20:36 +04:00
}
2006-09-27 12:50:20 +04:00
/*
2009-01-08 15:04:47 +03:00
* shrink a VMA by removing the specified chunk from either the beginning or
* the end
2006-09-27 12:50:20 +04:00
*/
2009-01-08 15:04:47 +03:00
static int shrink_vma ( struct mm_struct * mm ,
struct vm_area_struct * vma ,
unsigned long from , unsigned long to )
2005-04-17 02:20:36 +04:00
{
2009-01-08 15:04:47 +03:00
struct vm_region * region ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
kenter ( " " ) ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
delete_vma_from_mm ( vma ) ;
if ( from > vma - > vm_start )
vma - > vm_end = from ;
else
vma - > vm_start = to ;
add_vma_to_mm ( mm , vma ) ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
/* cut the backing region down to size */
region = vma - > vm_region ;
2010-01-16 04:01:33 +03:00
BUG_ON ( region - > vm_usage ! = 1 ) ;
2009-01-08 15:04:47 +03:00
down_write ( & nommu_region_sem ) ;
delete_nommu_region ( region ) ;
2009-01-08 15:04:47 +03:00
if ( from > region - > vm_start ) {
to = region - > vm_top ;
region - > vm_top = region - > vm_end = from ;
} else {
2009-01-08 15:04:47 +03:00
region - > vm_start = to ;
2009-01-08 15:04:47 +03:00
}
2009-01-08 15:04:47 +03:00
add_nommu_region ( region ) ;
up_write ( & nommu_region_sem ) ;
free_page_series ( from , to ) ;
return 0 ;
}
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
/*
* release a mapping
* - under NOMMU conditions the chunk to be unmapped must be backed by a single
* VMA , though it need not cover the whole VMA
* - a file-backed VMA can only be released whole ; an anonymous VMA may also be
* trimmed at either end or have a hole punched in it
* - returns 0 on success , - EINVAL if the request is empty , hits no VMA or is
* not an acceptable range , or the error passed back by split_vma ( )
*/
int do_munmap ( struct mm_struct * mm , unsigned long start , size_t len )
{
struct vm_area_struct * vma ;
struct rb_node * rb ;
unsigned long end = start + len ;
int ret ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
kenter ( " ,%lx,%zx " , start , len ) ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
/* an empty range is rejected outright */
if ( len = = 0 )
return - EINVAL ;
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
2009-01-08 15:04:47 +03:00
/* find the first potentially overlapping VMA */
vma = find_vma ( mm , start ) ;
if ( ! vma ) {
2009-04-03 03:56:32 +04:00
static int limit = 0 ;
if ( limit < 5 ) {
printk ( KERN_WARNING
" munmap of memory not mmapped by process %d "
" (%s): 0x%lx-0x%lx \n " ,
current - > pid , current - > comm ,
start , start + len - 1 ) ;
limit + + ;
}
2009-01-08 15:04:47 +03:00
return - EINVAL ;
}
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
/* we're allowed to split an anonymous VMA but not a file-backed one */
if ( vma - > vm_file ) {
/* walk forward through the VMA tree looking for a VMA that ends
* exactly at the end of the request ; give up as soon as one starts
* below the requested start */
do {
if ( start > vma - > vm_start ) {
kleave ( " = -EINVAL [miss] " ) ;
return - EINVAL ;
}
if ( end = = vma - > vm_end )
goto erase_whole_vma ;
rb = rb_next ( & vma - > vm_rb ) ;
/* vma is only meaningful here if rb is non-NULL , which the loop
* condition checks before it is used again */
vma = rb_entry ( rb , struct vm_area_struct , vm_rb ) ;
} while ( rb ) ;
kleave ( " = -EINVAL [split file] " ) ;
return - EINVAL ;
} else {
/* the chunk must be a subset of the VMA found */
if ( start = = vma - > vm_start & & end = = vma - > vm_end )
goto erase_whole_vma ;
if ( start < vma - > vm_start | | end > vma - > vm_end ) {
kleave ( " = -EINVAL [superset] " ) ;
return - EINVAL ;
}
/* a partial unmap has to begin on a page boundary */
if ( start & ~ PAGE_MASK ) {
kleave ( " = -EINVAL [unaligned start] " ) ;
return - EINVAL ;
}
/* the end may only be unaligned if it coincides with the VMA end */
if ( end ! = vma - > vm_end & & end & ~ PAGE_MASK ) {
kleave ( " = -EINVAL [unaligned split] " ) ;
return - EINVAL ;
}
/* a hole in the middle : split off the part below start first , so
* that what remains in vma can be trimmed from its beginning */
if ( start ! = vma - > vm_start & & end ! = vma - > vm_end ) {
ret = split_vma ( mm , vma , start , 1 ) ;
if ( ret < 0 ) {
kleave ( " = %d [split] " , ret ) ;
return ret ;
}
}
return shrink_vma ( mm , vma , start , end ) ;
}
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
/* the request covers the VMA exactly : unlink it from the MM and destroy it */
erase_whole_vma :
delete_vma_from_mm ( vma ) ;
delete_vma ( mm , vma ) ;
kleave ( " = 0 " ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2007-07-21 15:37:25 +04:00
EXPORT_SYMBOL ( do_munmap ) ;
2005-04-17 02:20:36 +04:00
2009-01-14 16:14:15 +03:00
/*
 * munmap system call - release a mapping in the calling process
 * - do_munmap() is run against current->mm with mmap_sem held for writing and
 *   its result is returned unchanged
 */
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
	struct mm_struct *mm = current->mm;
	int error;

	down_write(&mm->mmap_sem);
	error = do_munmap(mm, addr, len);
	up_write(&mm->mmap_sem);

	return error;
}
/*
2009-01-08 15:04:47 +03:00
* release all the mappings made in a process ' s VM space
2006-09-27 12:50:20 +04:00
*/
2009-01-08 15:04:47 +03:00
void exit_mmap ( struct mm_struct * mm )
2005-04-17 02:20:36 +04:00
{
2009-01-08 15:04:47 +03:00
struct vm_area_struct * vma ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
if ( ! mm )
return ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
kenter ( " " ) ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
mm - > total_vm = 0 ;
2005-04-17 02:20:36 +04:00
2009-01-08 15:04:47 +03:00
while ( ( vma = mm - > mmap ) ) {
mm - > mmap = vma - > vm_next ;
delete_vma_from_mm ( vma ) ;
delete_vma ( mm , vma ) ;
2005-04-17 02:20:36 +04:00
}
2009-01-08 15:04:47 +03:00
kleave ( " " ) ;
2005-04-17 02:20:36 +04:00
}
/*
 * NOMMU stub: both arguments are ignored and the call always fails, returning
 * -ENOMEM converted to unsigned long
 */
unsigned long do_brk(unsigned long addr, unsigned long len)
{
	return (unsigned long) -ENOMEM;
}
/*
2006-09-27 12:50:21 +04:00
* expand ( or shrink ) an existing mapping , potentially moving it at the same
* time ( controlled by the MREMAP_MAYMOVE flag and available VM space )
2005-04-17 02:20:36 +04:00
*
2006-09-27 12:50:21 +04:00
* under NOMMU conditions , we only permit changing a mapping ' s size , and only
2009-01-08 15:04:47 +03:00
* as long as it stays within the region allocated by do_mmap_private ( ) and the
* block is not shareable
2005-04-17 02:20:36 +04:00
*
2006-09-27 12:50:21 +04:00
* MREMAP_FIXED is not supported under NOMMU conditions
2005-04-17 02:20:36 +04:00
*/
unsigned long do_mremap ( unsigned long addr ,
unsigned long old_len , unsigned long new_len ,
unsigned long flags , unsigned long new_addr )
{
2006-09-27 12:50:21 +04:00
struct vm_area_struct * vma ;
2005-04-17 02:20:36 +04:00
/* insanity checks first */
2009-01-08 15:04:47 +03:00
if ( old_len = = 0 | | new_len = = 0 )
2005-04-17 02:20:36 +04:00
return ( unsigned long ) - EINVAL ;
2009-01-08 15:04:47 +03:00
if ( addr & ~ PAGE_MASK )
return - EINVAL ;
2005-04-17 02:20:36 +04:00
if ( flags & MREMAP_FIXED & & new_addr ! = addr )
return ( unsigned long ) - EINVAL ;
2009-01-08 15:04:47 +03:00
vma = find_vma_exact ( current - > mm , addr , old_len ) ;
2006-09-27 12:50:21 +04:00
if ( ! vma )
return ( unsigned long ) - EINVAL ;
2005-04-17 02:20:36 +04:00
2006-09-27 12:50:21 +04:00
if ( vma - > vm_end ! = vma - > vm_start + old_len )
2005-04-17 02:20:36 +04:00
return ( unsigned long ) - EFAULT ;
2006-09-27 12:50:21 +04:00
if ( vma - > vm_flags & VM_MAYSHARE )
2005-04-17 02:20:36 +04:00
return ( unsigned long ) - EPERM ;
2009-01-08 15:04:47 +03:00
if ( new_len > vma - > vm_region - > vm_end - vma - > vm_region - > vm_start )
2005-04-17 02:20:36 +04:00
return ( unsigned long ) - ENOMEM ;
/* all checks complete - do it */
2006-09-27 12:50:21 +04:00
vma - > vm_end = vma - > vm_start + new_len ;
return vma - > vm_start ;
}
2007-07-21 15:37:25 +04:00
EXPORT_SYMBOL ( do_mremap ) ;
2006-09-27 12:50:21 +04:00
2009-01-14 16:14:15 +03:00
/*
 * mremap system call - resize a mapping in the calling process
 * - do_mremap() is run with current->mm's mmap_sem held for writing and its
 *   result is returned unchanged
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	struct mm_struct *mm = current->mm;
	unsigned long result;

	down_write(&mm->mmap_sem);
	result = do_mremap(addr, old_len, new_len, flags, new_addr);
	up_write(&mm->mmap_sem);

	return result;
}
2005-11-29 01:34:23 +03:00
/*
 * NOMMU stub: no page lookup is performed; all arguments are ignored and NULL
 * is always returned
 */
struct page *follow_page(struct vm_area_struct *vma,
			 unsigned long address,
			 unsigned int foll_flags)
{
	return NULL;
}
/*
 * NOMMU version: nothing is remapped; the VMA's start address is simply
 * rewritten to the byte address corresponding to its page offset
 * - 'from', 'to', 'size' and 'prot' are not used
 * - always returns 0
 */
int remap_pfn_range(struct vm_area_struct *vma,
		    unsigned long from, unsigned long to,
		    unsigned long size, pgprot_t prot)
{
	vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;

	return 0;
}
2006-07-14 11:24:09 +04:00
EXPORT_SYMBOL ( remap_pfn_range ) ;
2005-04-17 02:20:36 +04:00
2008-02-05 09:29:59 +03:00
int remap_vmalloc_range ( struct vm_area_struct * vma , void * addr ,
unsigned long pgoff )
{
unsigned int size = vma - > vm_end - vma - > vm_start ;
if ( ! ( vma - > vm_flags & VM_USERMAP ) )
return - EINVAL ;
vma - > vm_start = ( unsigned long ) ( addr + ( pgoff < < PAGE_SHIFT ) ) ;
vma - > vm_end = vma - > vm_start + size ;
return 0 ;
}
EXPORT_SYMBOL ( remap_vmalloc_range ) ;
2005-04-17 02:20:36 +04:00
/* NOMMU stub: intentionally does nothing; both arguments are ignored */
void swap_unplug_io_fn(struct backing_dev_info *bdi,
		       struct page *page)
{
}
/*
 * NOMMU stub: no search for an unmapped area is made; every argument is
 * ignored and -ENOMEM (converted to unsigned long) is always returned
 */
unsigned long arch_get_unmapped_area(struct file *file,
				     unsigned long addr,
				     unsigned long len,
				     unsigned long pgoff,
				     unsigned long flags)
{
	return (unsigned long) -ENOMEM;
}
2005-06-22 04:14:49 +04:00
/* NOMMU stub: intentionally does nothing; both arguments are ignored */
void arch_unmap_area(struct mm_struct *mm,
		     unsigned long addr)
{
}
/* NOMMU stub: intentionally does nothing; all arguments are ignored */
void unmap_mapping_range(struct address_space *mapping,
			 loff_t const holebegin,
			 loff_t const holelen,
			 int even_cows)
{
}
2006-07-14 11:24:09 +04:00
EXPORT_SYMBOL ( unmap_mapping_range ) ;
2005-04-17 02:20:36 +04:00
/*
* Check that a process has enough memory to allocate a new virtual
* mapping . 0 means there is enough memory for the allocation to
* succeed and - ENOMEM implies there is not .
*
* We currently support three overcommit policies , which are set via the
* vm . overcommit_memory sysctl . See Documentation / vm / overcommit - accounting
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox .
* Additional code 2002 Jul 20 by Robert Love .
*
* cap_sys_admin is 1 if the process has admin privileges , 0 otherwise .
*
* Note this is a helper function intended to be used by LSMs which
* wish to use this logic .
*/
2007-08-23 01:01:28 +04:00
int __vm_enough_memory ( struct mm_struct * mm , long pages , int cap_sys_admin )
2005-04-17 02:20:36 +04:00
{
unsigned long free , allowed ;
vm_acct_memory ( pages ) ;
/*
* Sometimes we want to use more memory than we have
*/
if ( sysctl_overcommit_memory = = OVERCOMMIT_ALWAYS )
return 0 ;
if ( sysctl_overcommit_memory = = OVERCOMMIT_GUESS ) {
unsigned long n ;
2006-06-30 12:55:35 +04:00
free = global_page_state ( NR_FILE_PAGES ) ;
2005-04-17 02:20:36 +04:00
free + = nr_swap_pages ;
/*
* Any slabs which are created with the
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
* which are reclaimable , under pressure . The dentry
* cache and most inode caches should fall into this
*/
2006-09-26 10:31:51 +04:00
free + = global_page_state ( NR_SLAB_RECLAIMABLE ) ;
2005-04-17 02:20:36 +04:00
/*
* Leave the last 3 % for root
*/
if ( ! cap_sys_admin )
free - = free / 32 ;
if ( free > pages )
return 0 ;
/*
* nr_free_pages ( ) is very expensive on large systems ,
* only call if we ' re about to fail .
*/
n = nr_free_pages ( ) ;
2006-04-11 09:53:01 +04:00
/*
* Leave reserved pages . The pages are not for anonymous pages .
*/
if ( n < = totalreserve_pages )
goto error ;
else
n - = totalreserve_pages ;
/*
* Leave the last 3 % for root
*/
2005-04-17 02:20:36 +04:00
if ( ! cap_sys_admin )
n - = n / 32 ;
free + = n ;
if ( free > pages )
return 0 ;
2006-04-11 09:53:01 +04:00
goto error ;
2005-04-17 02:20:36 +04:00
}
allowed = totalram_pages * sysctl_overcommit_ratio / 100 ;
/*
* Leave the last 3 % for root
*/
if ( ! cap_sys_admin )
allowed - = allowed / 32 ;
allowed + = total_swap_pages ;
/* Don't let a single process grow too big:
leave 3 % of the size of this process for other processes */
2008-10-30 00:01:20 +03:00
if ( mm )
allowed - = mm - > total_vm / 32 ;
2005-04-17 02:20:36 +04:00
2009-05-01 02:08:51 +04:00
if ( percpu_counter_read_positive ( & vm_committed_as ) < allowed )
2005-04-17 02:20:36 +04:00
return 0 ;
2009-05-01 02:08:51 +04:00
2006-04-11 09:53:01 +04:00
error :
2005-04-17 02:20:36 +04:00
vm_unacct_memory ( pages ) ;
return - ENOMEM ;
}
/*
 * NOMMU stub: the address is never examined and 0 is always returned
 */
int in_gate_area_no_task(unsigned long addr)
{
	(void) addr;

	return 0;
}
2006-01-06 11:11:42 +03:00
2007-07-19 12:47:03 +04:00
/*
 * NOMMU placeholder: this must never be called, so reaching it triggers
 * BUG(); the return statement only satisfies the function's signature
 */
int filemap_fault(struct vm_area_struct *vma,
		  struct vm_fault *vmf)
{
	BUG();

	return 0;
}
2007-07-21 15:37:25 +04:00
EXPORT_SYMBOL ( filemap_fault ) ;
2006-09-27 12:50:15 +04:00
/*
* Access another process ' address space .
* - source / target buffer must be kernel space
*/
int access_process_vm ( struct task_struct * tsk , unsigned long addr , void * buf , int len , int write )
{
struct vm_area_struct * vma ;
struct mm_struct * mm ;
if ( addr + len < addr )
return 0 ;
mm = get_task_mm ( tsk ) ;
if ( ! mm )
return 0 ;
down_read ( & mm - > mmap_sem ) ;
/* the access must start within one of the target process's mappings */
2006-09-27 12:50:16 +04:00
vma = find_vma ( mm , addr ) ;
if ( vma ) {
2006-09-27 12:50:15 +04:00
/* don't overrun this mapping */
if ( addr + len > = vma - > vm_end )
len = vma - > vm_end - addr ;
/* only read or write mappings where it is permitted */
2006-09-27 12:50:19 +04:00
if ( write & & vma - > vm_flags & VM_MAYWRITE )
2010-01-06 20:23:28 +03:00
copy_to_user_page ( vma , NULL , addr ,
( void * ) addr , buf , len ) ;
2006-09-27 12:50:19 +04:00
else if ( ! write & & vma - > vm_flags & VM_MAYREAD )
2010-01-06 20:23:28 +03:00
copy_from_user_page ( vma , NULL , addr ,
buf , ( void * ) addr , len ) ;
2006-09-27 12:50:15 +04:00
else
len = 0 ;
} else {
len = 0 ;
}
up_read ( & mm - > mmap_sem ) ;
mmput ( mm ) ;
return len ;
}
nommu: fix shared mmap after truncate shrinkage problems
Fix a problem in NOMMU mmap with ramfs whereby a shared mmap can happen
over the end of a truncation. The problem is that
ramfs_nommu_check_mappings() checks the reduced file size against the
VMA tree, but not against the vm_region tree.
The following sequence of events can cause the problem:
fd = open("/tmp/x", O_RDWR|O_TRUNC|O_CREAT, 0600);
ftruncate(fd, 32 * 1024);
a = mmap(NULL, 32 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
b = mmap(NULL, 16 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
munmap(a, 32 * 1024);
ftruncate(fd, 16 * 1024);
c = mmap(NULL, 32 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
Mapping 'a' creates a vm_region covering 32KB of the file. Mapping 'b'
sees that the vm_region from 'a' is covering the region it wants and so
shares it, pinning it in memory.
Mapping 'a' then goes away and the file is truncated to the end of VMA
'b'. However, the region allocated by 'a' is still in effect, and has
_not_ been reduced.
Mapping 'c' is then created, and because there's a vm_region covering the
desired region, get_unmapped_area() is _not_ called to repeat the check,
and the mapping is granted, even though the pages from the latter half of
the mapping have been discarded.
However:
d = mmap(NULL, 16 * 1024, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
Mapping 'd' should work, and should end up sharing the region allocated by
'a'.
To deal with this, we shrink the vm_region struct during the truncation,
lest do_mmap_pgoff() take it as licence to share the full region
automatically without calling the get_unmapped_area() file op again.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-16 04:01:39 +03:00
/**
 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
 * @inode: The inode to check
 * @size: The current filesize of the inode
 * @newsize: The proposed filesize of the inode
 *
 * Check the shared mappings on an inode on behalf of a shrinking truncate to
 * make sure that any outstanding VMAs aren't broken and then shrink the
 * vm_regions that extend beyond the new size so that do_mmap_pgoff() doesn't
 * automatically grant mappings that are too large.
 *
 * Returns 0 on success, or -ETXTBSY if a shared VMA overlaps the part of the
 * file that is about to be discarded.
 */
int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
				size_t newsize)
{
	struct prio_tree_iter iter;
	struct vm_area_struct *vma;
	struct vm_region *region;
	pgoff_t first_dead, last_dead;
	size_t region_len, region_file_end;

	/* page range of the file that the truncation will discard */
	first_dead = newsize >> PAGE_SHIFT;
	last_dead = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	down_write(&nommu_region_sem);

	/* search for VMAs that fall within the dead zone */
	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
			      first_dead, last_dead) {
		/* found one - only interested if it's shared out of the page
		 * cache */
		if (vma->vm_flags & VM_SHARED) {
			up_write(&nommu_region_sem);
			return -ETXTBSY; /* not quite true, but near enough */
		}
	}

	/* reduce any regions that overlap the dead zone - if in existence,
	 * these will be pointed to by VMAs that don't overlap the dead zone
	 *
	 * we don't check for any regions that start beyond the EOF as there
	 * shouldn't be any
	 */
	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
			      0, ULONG_MAX) {
		if (vma->vm_flags & VM_SHARED) {
			region = vma->vm_region;
			region_len = region->vm_top - region->vm_start;
			region_file_end = (region->vm_pgoff << PAGE_SHIFT) +
					  region_len;

			if (region_file_end > newsize) {
				/* pull the top down by the overhang, and the
				 * end with it if it now sticks out */
				region->vm_top -= region_file_end - newsize;
				if (region->vm_end > region->vm_top)
					region->vm_end = region->vm_top;
			}
		}
	}

	up_write(&nommu_region_sem);
	return 0;
}