2005-04-17 02:20:36 +04:00
/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@redhat.com>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */
# include <linux/mm.h>
# include <linux/hugetlb.h>
# include <linux/slab.h>
# include <linux/shm.h>
# include <linux/mman.h>
# include <linux/swap.h>
2006-01-11 23:17:46 +03:00
# include <linux/capability.h>
2005-04-17 02:20:36 +04:00
# include <linux/fs.h>
# include <linux/highmem.h>
# include <linux/security.h>
# include <linux/syscalls.h>
# include <asm/uaccess.h>
# include <asm/cacheflush.h>
# include <asm/tlbflush.h>
2005-10-30 04:16:00 +03:00
static pmd_t * get_old_pmd ( struct mm_struct * mm , unsigned long addr )
2005-04-17 02:20:36 +04:00
{
pgd_t * pgd ;
pud_t * pud ;
pmd_t * pmd ;
pgd = pgd_offset ( mm , addr ) ;
if ( pgd_none_or_clear_bad ( pgd ) )
return NULL ;
pud = pud_offset ( pgd , addr ) ;
if ( pud_none_or_clear_bad ( pud ) )
return NULL ;
pmd = pmd_offset ( pud , addr ) ;
if ( pmd_none_or_clear_bad ( pmd ) )
return NULL ;
2005-10-30 04:16:00 +03:00
return pmd ;
2005-04-17 02:20:36 +04:00
}
2005-10-30 04:16:00 +03:00
static pmd_t * alloc_new_pmd ( struct mm_struct * mm , unsigned long addr )
2005-04-17 02:20:36 +04:00
{
pgd_t * pgd ;
pud_t * pud ;
2005-10-30 04:16:23 +03:00
pmd_t * pmd ;
2005-04-17 02:20:36 +04:00
pgd = pgd_offset ( mm , addr ) ;
pud = pud_alloc ( mm , pgd , addr ) ;
if ( ! pud )
2005-10-30 04:16:23 +03:00
return NULL ;
2005-10-30 04:16:00 +03:00
2005-04-17 02:20:36 +04:00
pmd = pmd_alloc ( mm , pud , addr ) ;
2005-10-30 04:16:00 +03:00
if ( ! pmd )
2005-10-30 04:16:23 +03:00
return NULL ;
2005-10-30 04:16:00 +03:00
2005-10-30 04:16:22 +03:00
if ( ! pmd_present ( * pmd ) & & __pte_alloc ( mm , pmd , addr ) )
2005-10-30 04:16:23 +03:00
return NULL ;
2005-10-30 04:16:00 +03:00
return pmd ;
2005-04-17 02:20:36 +04:00
}
2005-10-30 04:16:00 +03:00
/*
 * Move the ptes for [old_addr, old_end) found under @old_pmd in @vma to
 * the slots starting at @new_addr under @new_pmd for @new_vma.
 *
 * Each pte that is not pte_none() is cleared and flushed at its old
 * address with ptep_clear_flush(), passed through move_pte() with the
 * destination vma's vm_page_prot, and installed at the new address with
 * set_pte_at().  Empty slots are skipped.
 */
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct address_space *file_mapping = NULL;
	pte_t *src_pte, *dst_pte;
	spinlock_t *src_ptl, *dst_ptl;

	if (vma->vm_file) {
		/*
		 * Subtle point from Rajesh Venkatasubramanian: before
		 * moving file-based ptes, we must lock vmtruncate out,
		 * since it might clean the dst vma before the src vma,
		 * and we propagate stale pages into the dst afterward.
		 */
		file_mapping = vma->vm_file->f_mapping;
		spin_lock(&file_mapping->i_mmap_lock);
		if (new_vma->vm_truncate_count &&
		    new_vma->vm_truncate_count != vma->vm_truncate_count)
			new_vma->vm_truncate_count = 0;
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_sem prevents deadlock.
	 */
	src_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &src_ptl);
	dst_pte = pte_offset_map_nested(new_pmd, new_addr);
	dst_ptl = pte_lockptr(mm, new_pmd);
	/* the destination lock is only taken when it is a different lock */
	if (dst_ptl != src_ptl)
		spin_lock_nested(dst_ptl, SINGLE_DEPTH_NESTING);
	arch_enter_lazy_mmu_mode();

	while (old_addr < old_end) {
		if (!pte_none(*src_pte)) {
			pte_t entry;

			entry = ptep_clear_flush(vma, old_addr, src_pte);
			entry = move_pte(entry, new_vma->vm_page_prot,
					 old_addr, new_addr);
			set_pte_at(mm, new_addr, dst_pte, entry);
		}

		src_pte++;
		old_addr += PAGE_SIZE;
		dst_pte++;
		new_addr += PAGE_SIZE;
	}

	arch_leave_lazy_mmu_mode();
	if (dst_ptl != src_ptl)
		spin_unlock(dst_ptl);
	/* both cursors are one past the last slot visited, hence the - 1 */
	pte_unmap_nested(dst_pte - 1);
	pte_unmap_unlock(src_pte - 1, src_ptl);
	if (file_mapping)
		spin_unlock(&file_mapping->i_mmap_lock);
}
2005-10-30 04:16:00 +03:00
# define LATENCY_LIMIT (64 * PAGE_SIZE)
2007-07-19 12:48:16 +04:00
unsigned long move_page_tables ( struct vm_area_struct * vma ,
2005-04-17 02:20:36 +04:00
unsigned long old_addr , struct vm_area_struct * new_vma ,
unsigned long new_addr , unsigned long len )
{
2005-10-30 04:16:00 +03:00
unsigned long extent , next , old_end ;
pmd_t * old_pmd , * new_pmd ;
2005-04-17 02:20:36 +04:00
2005-10-30 04:16:00 +03:00
old_end = old_addr + len ;
flush_cache_range ( vma , old_addr , old_end ) ;
2005-04-17 02:20:36 +04:00
2005-10-30 04:16:00 +03:00
for ( ; old_addr < old_end ; old_addr + = extent , new_addr + = extent ) {
2005-04-17 02:20:36 +04:00
cond_resched ( ) ;
2005-10-30 04:16:00 +03:00
next = ( old_addr + PMD_SIZE ) & PMD_MASK ;
if ( next - 1 > old_end )
next = old_end ;
extent = next - old_addr ;
old_pmd = get_old_pmd ( vma - > vm_mm , old_addr ) ;
if ( ! old_pmd )
continue ;
new_pmd = alloc_new_pmd ( vma - > vm_mm , new_addr ) ;
if ( ! new_pmd )
break ;
next = ( new_addr + PMD_SIZE ) & PMD_MASK ;
if ( extent > next - new_addr )
extent = next - new_addr ;
if ( extent > LATENCY_LIMIT )
extent = LATENCY_LIMIT ;
move_ptes ( vma , old_pmd , old_addr , old_addr + extent ,
new_vma , new_pmd , new_addr ) ;
2005-04-17 02:20:36 +04:00
}
2005-10-30 04:16:00 +03:00
return len + old_addr - old_end ; /* how much done */
2005-04-17 02:20:36 +04:00
}
static unsigned long move_vma ( struct vm_area_struct * vma ,
unsigned long old_addr , unsigned long old_len ,
unsigned long new_len , unsigned long new_addr )
{
struct mm_struct * mm = vma - > vm_mm ;
struct vm_area_struct * new_vma ;
unsigned long vm_flags = vma - > vm_flags ;
unsigned long new_pgoff ;
unsigned long moved_len ;
unsigned long excess = 0 ;
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
unsigned long hiwater_vm ;
2005-04-17 02:20:36 +04:00
int split = 0 ;
/*
* We ' d prefer to avoid failure later on in do_munmap :
* which may split one vma into three before unmapping .
*/
if ( mm - > map_count > = sysctl_max_map_count - 3 )
return - ENOMEM ;
new_pgoff = vma - > vm_pgoff + ( ( old_addr - vma - > vm_start ) > > PAGE_SHIFT ) ;
new_vma = copy_vma ( & vma , new_addr , new_len , new_pgoff ) ;
if ( ! new_vma )
return - ENOMEM ;
moved_len = move_page_tables ( vma , old_addr , new_vma , new_addr , old_len ) ;
if ( moved_len < old_len ) {
/*
* On error , move entries back from new area to old ,
* which will succeed since page tables still there ,
* and then proceed to unmap new area instead of old .
*/
move_page_tables ( new_vma , new_addr , vma , old_addr , moved_len ) ;
vma = new_vma ;
old_len = new_len ;
old_addr = new_addr ;
new_addr = - ENOMEM ;
}
/* Conceal VM_ACCOUNT so old reservation is not undone */
if ( vm_flags & VM_ACCOUNT ) {
vma - > vm_flags & = ~ VM_ACCOUNT ;
excess = vma - > vm_end - vma - > vm_start - old_len ;
if ( old_addr > vma - > vm_start & &
old_addr + old_len < vma - > vm_end )
split = 1 ;
}
2005-05-17 08:53:18 +04:00
/*
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
* If we failed to move page tables we still do total_vm increment
* since do_munmap ( ) will decrement it by old_len = = new_len .
*
* Since total_vm is about to be raised artificially high for a
* moment , we need to restore high watermark afterwards : if stats
* are taken meanwhile , total_vm and hiwater_vm appear too high .
* If this were a serious issue , we ' d add a flag to do_munmap ( ) .
2005-05-17 08:53:18 +04:00
*/
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
hiwater_vm = mm - > hiwater_vm ;
2005-05-17 08:53:18 +04:00
mm - > total_vm + = new_len > > PAGE_SHIFT ;
2005-10-30 04:15:56 +03:00
vm_stat_account ( mm , vma - > vm_flags , vma - > vm_file , new_len > > PAGE_SHIFT ) ;
2005-05-17 08:53:18 +04:00
2005-04-17 02:20:36 +04:00
if ( do_munmap ( mm , old_addr , old_len ) < 0 ) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory ( excess > > PAGE_SHIFT ) ;
excess = 0 ;
}
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
mm - > hiwater_vm = hiwater_vm ;
2005-04-17 02:20:36 +04:00
/* Restore VM_ACCOUNT if one or two pieces of vma left */
if ( excess ) {
vma - > vm_flags | = VM_ACCOUNT ;
if ( split )
vma - > vm_next - > vm_flags | = VM_ACCOUNT ;
}
if ( vm_flags & VM_LOCKED ) {
mm - > locked_vm + = new_len > > PAGE_SHIFT ;
if ( new_len > old_len )
make_pages_present ( new_addr + old_len ,
new_addr + new_len ) ;
}
return new_addr ;
}
/*
* Expand ( or shrink ) an existing mapping , potentially moving it at the
* same time ( controlled by the MREMAP_MAYMOVE flag and available VM space )
*
* MREMAP_FIXED option added 5 - Dec - 1999 by Benjamin LaHaise
* This option implies MREMAP_MAYMOVE .
*/
unsigned long do_mremap ( unsigned long addr ,
unsigned long old_len , unsigned long new_len ,
unsigned long flags , unsigned long new_addr )
{
2005-10-30 04:16:16 +03:00
struct mm_struct * mm = current - > mm ;
2005-04-17 02:20:36 +04:00
struct vm_area_struct * vma ;
unsigned long ret = - EINVAL ;
unsigned long charged = 0 ;
if ( flags & ~ ( MREMAP_FIXED | MREMAP_MAYMOVE ) )
goto out ;
if ( addr & ~ PAGE_MASK )
goto out ;
old_len = PAGE_ALIGN ( old_len ) ;
new_len = PAGE_ALIGN ( new_len ) ;
/*
* We allow a zero old - len as a special case
* for DOS - emu " duplicate shm area " thing . But
* a zero new - len is nonsensical .
*/
if ( ! new_len )
goto out ;
/* new_addr is only valid if MREMAP_FIXED is specified */
if ( flags & MREMAP_FIXED ) {
if ( new_addr & ~ PAGE_MASK )
goto out ;
if ( ! ( flags & MREMAP_MAYMOVE ) )
goto out ;
if ( new_len > TASK_SIZE | | new_addr > TASK_SIZE - new_len )
goto out ;
/* Check if the location we're moving into overlaps the
* old location at all , and fail if it does .
*/
if ( ( new_addr < = addr ) & & ( new_addr + new_len ) > addr )
goto out ;
if ( ( addr < = new_addr ) & & ( addr + old_len ) > new_addr )
goto out ;
2007-10-18 14:07:05 +04:00
ret = security_file_mmap ( NULL , 0 , 0 , 0 , new_addr , 1 ) ;
2007-06-28 23:55:21 +04:00
if ( ret )
goto out ;
2005-10-30 04:16:16 +03:00
ret = do_munmap ( mm , new_addr , new_len ) ;
2005-04-17 02:20:36 +04:00
if ( ret )
goto out ;
}
/*
* Always allow a shrinking remap : that just unmaps
* the unnecessary pages . .
* do_munmap does all the needed commit accounting
*/
if ( old_len > = new_len ) {
2005-10-30 04:16:16 +03:00
ret = do_munmap ( mm , addr + new_len , old_len - new_len ) ;
2005-04-17 02:20:36 +04:00
if ( ret & & old_len ! = new_len )
goto out ;
ret = addr ;
if ( ! ( flags & MREMAP_FIXED ) | | ( new_addr = = addr ) )
goto out ;
old_len = new_len ;
}
/*
* Ok , we need to grow . . or relocate .
*/
ret = - EFAULT ;
2005-10-30 04:16:16 +03:00
vma = find_vma ( mm , addr ) ;
2005-04-17 02:20:36 +04:00
if ( ! vma | | vma - > vm_start > addr )
goto out ;
if ( is_vm_hugetlb_page ( vma ) ) {
ret = - EINVAL ;
goto out ;
}
/* We can't remap across vm area boundaries */
if ( old_len > vma - > vm_end - addr )
goto out ;
2005-12-16 21:21:23 +03:00
if ( vma - > vm_flags & ( VM_DONTEXPAND | VM_PFNMAP ) ) {
2005-04-17 02:20:36 +04:00
if ( new_len > old_len )
goto out ;
}
if ( vma - > vm_flags & VM_LOCKED ) {
unsigned long locked , lock_limit ;
2005-10-30 04:16:16 +03:00
locked = mm - > locked_vm < < PAGE_SHIFT ;
2005-04-17 02:20:36 +04:00
lock_limit = current - > signal - > rlim [ RLIMIT_MEMLOCK ] . rlim_cur ;
locked + = new_len - old_len ;
ret = - EAGAIN ;
if ( locked > lock_limit & & ! capable ( CAP_IPC_LOCK ) )
goto out ;
}
2005-10-30 04:16:16 +03:00
if ( ! may_expand_vm ( mm , ( new_len - old_len ) > > PAGE_SHIFT ) ) {
2005-05-01 19:58:35 +04:00
ret = - ENOMEM ;
2005-04-17 02:20:36 +04:00
goto out ;
2005-05-01 19:58:35 +04:00
}
2005-04-17 02:20:36 +04:00
if ( vma - > vm_flags & VM_ACCOUNT ) {
charged = ( new_len - old_len ) > > PAGE_SHIFT ;
if ( security_vm_enough_memory ( charged ) )
goto out_nc ;
}
/* old_len exactly to the end of the area..
* And we ' re not relocating the area .
*/
if ( old_len = = vma - > vm_end - addr & &
! ( ( flags & MREMAP_FIXED ) & & ( addr ! = new_addr ) ) & &
( old_len ! = new_len | | ! ( flags & MREMAP_MAYMOVE ) ) ) {
unsigned long max_addr = TASK_SIZE ;
if ( vma - > vm_next )
max_addr = vma - > vm_next - > vm_start ;
/* can we just expand the current mapping? */
if ( max_addr - addr > = new_len ) {
int pages = ( new_len - old_len ) > > PAGE_SHIFT ;
vma_adjust ( vma , vma - > vm_start ,
addr + new_len , vma - > vm_pgoff , NULL ) ;
2005-10-30 04:16:16 +03:00
mm - > total_vm + = pages ;
vm_stat_account ( mm , vma - > vm_flags , vma - > vm_file , pages ) ;
2005-04-17 02:20:36 +04:00
if ( vma - > vm_flags & VM_LOCKED ) {
2005-10-30 04:16:16 +03:00
mm - > locked_vm + = pages ;
2005-04-17 02:20:36 +04:00
make_pages_present ( addr + old_len ,
addr + new_len ) ;
}
ret = addr ;
goto out ;
}
}
/*
* We weren ' t able to just expand or shrink the area ,
* we need to create a new one and move it . .
*/
ret = - ENOMEM ;
if ( flags & MREMAP_MAYMOVE ) {
if ( ! ( flags & MREMAP_FIXED ) ) {
unsigned long map_flags = 0 ;
if ( vma - > vm_flags & VM_MAYSHARE )
map_flags | = MAP_SHARED ;
new_addr = get_unmapped_area ( vma - > vm_file , 0 , new_len ,
vma - > vm_pgoff , map_flags ) ;
2007-06-28 23:55:21 +04:00
if ( new_addr & ~ PAGE_MASK ) {
ret = new_addr ;
goto out ;
}
2007-10-18 14:07:05 +04:00
ret = security_file_mmap ( NULL , 0 , 0 , 0 , new_addr , 1 ) ;
2007-06-28 23:55:21 +04:00
if ( ret )
2005-04-17 02:20:36 +04:00
goto out ;
}
ret = move_vma ( vma , addr , old_len , new_len , new_addr ) ;
}
out :
if ( ret & ~ PAGE_MASK )
vm_unacct_memory ( charged ) ;
out_nc :
return ret ;
}
asmlinkage unsigned long sys_mremap ( unsigned long addr ,
unsigned long old_len , unsigned long new_len ,
unsigned long flags , unsigned long new_addr )
{
unsigned long ret ;
down_write ( & current - > mm - > mmap_sem ) ;
ret = do_mremap ( addr , old_len , new_len , flags , new_addr ) ;
up_write ( & current - > mm - > mmap_sem ) ;
return ret ;
}