2005-04-17 02:20:36 +04:00
# include <linux/mm.h>
# include <linux/hugetlb.h>
# include <linux/mount.h>
# include <linux/seq_file.h>
2005-09-04 02:55:10 +04:00
# include <linux/highmem.h>
2007-05-08 11:26:04 +04:00
# include <linux/ptrace.h>
2005-09-04 02:54:45 +04:00
# include <linux/pagemap.h>
# include <linux/mempolicy.h>
2005-09-04 02:55:10 +04:00
2005-04-17 02:20:36 +04:00
# include <asm/elf.h>
# include <asm/uaccess.h>
2005-09-04 02:55:10 +04:00
# include <asm/tlbflush.h>
2005-04-17 02:20:36 +04:00
# include "internal.h"
/*
 * task_mem - append the Vm* memory summary of @mm to @buffer.
 *
 * Snapshots mm->total_vm and get_mm_rss(mm), raises each snapshot to the
 * stored high-water mark (mm->hiwater_vm / mm->hiwater_rss) when that is
 * larger, and formats VmPeak, VmSize, VmLck, VmHWM, VmRSS, VmData, VmStk,
 * VmExe, VmLib and VmPTE with a single sprintf().  Page counts are shifted
 * left by (PAGE_SHIFT - 10) and byte counts right by 10, matching the kB
 * unit in the format string.
 *
 * VmSize is printed as total_vm minus mm->reserved_vm; VmData is
 * total_vm - shared_vm - stack_vm; VmLib is exec_vm minus the text size.
 *
 * sprintf() is unbounded: nothing here checks how much room @buffer has.
 *
 * Returns @buffer advanced past the text that was written.
 */
char * task_mem ( struct mm_struct * mm , char * buffer )
{
unsigned long data , text , lib ;
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
unsigned long hiwater_vm , total_vm , hiwater_rss , total_rss ;
/*
* Note : to minimize their overhead , mm maintains hiwater_vm and
* hiwater_rss only when about to * lower * total_vm or rss . Any
* collector of these hiwater stats must therefore get total_vm
* and rss too , which will usually be the higher . Barriers ? not
* worth the effort , such snapshots can always be inconsistent .
*/
hiwater_vm = total_vm = mm - > total_vm ;
if ( hiwater_vm < mm - > hiwater_vm )
hiwater_vm = mm - > hiwater_vm ;
hiwater_rss = total_rss = get_mm_rss ( mm ) ;
if ( hiwater_rss < mm - > hiwater_rss )
hiwater_rss = mm - > hiwater_rss ;
2005-04-17 02:20:36 +04:00
data = mm - > total_vm - mm - > shared_vm - mm - > stack_vm ;
text = ( PAGE_ALIGN ( mm - > end_code ) - ( mm - > start_code & PAGE_MASK ) ) > > 10 ;
lib = ( mm - > exec_vm < < ( PAGE_SHIFT - 10 ) ) - text ;
buffer + = sprintf ( buffer ,
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
" VmPeak: \t %8lu kB \n "
2005-04-17 02:20:36 +04:00
" VmSize: \t %8lu kB \n "
" VmLck: \t %8lu kB \n "
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
" VmHWM: \t %8lu kB \n "
2005-04-17 02:20:36 +04:00
" VmRSS: \t %8lu kB \n "
" VmData: \t %8lu kB \n "
" VmStk: \t %8lu kB \n "
" VmExe: \t %8lu kB \n "
" VmLib: \t %8lu kB \n "
" VmPTE: \t %8lu kB \n " ,
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
hiwater_vm < < ( PAGE_SHIFT - 10 ) ,
( total_vm - mm - > reserved_vm ) < < ( PAGE_SHIFT - 10 ) ,
2005-04-17 02:20:36 +04:00
mm - > locked_vm < < ( PAGE_SHIFT - 10 ) ,
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
hiwater_rss < < ( PAGE_SHIFT - 10 ) ,
total_rss < < ( PAGE_SHIFT - 10 ) ,
2005-04-17 02:20:36 +04:00
data < < ( PAGE_SHIFT - 10 ) ,
mm - > stack_vm < < ( PAGE_SHIFT - 10 ) , text , lib ,
( PTRS_PER_PTE * sizeof ( pte_t ) * mm - > nr_ptes ) > > 10 ) ;
return buffer ;
}
unsigned long task_vsize ( struct mm_struct * mm )
{
return PAGE_SIZE * mm - > total_vm ;
}
int task_statm ( struct mm_struct * mm , int * shared , int * text ,
int * data , int * resident )
{
2005-10-30 04:16:05 +03:00
* shared = get_mm_counter ( mm , file_rss ) ;
2005-04-17 02:20:36 +04:00
* text = ( PAGE_ALIGN ( mm - > end_code ) - ( mm - > start_code & PAGE_MASK ) )
> > PAGE_SHIFT ;
* data = mm - > total_vm - mm - > shared_vm ;
2005-10-30 04:16:05 +03:00
* resident = * shared + get_mm_counter ( mm , anon_rss ) ;
2005-04-17 02:20:36 +04:00
return mm - > total_vm ;
}
int proc_exe_link ( struct inode * inode , struct dentry * * dentry , struct vfsmount * * mnt )
{
struct vm_area_struct * vma ;
int result = - ENOENT ;
2006-06-26 11:25:55 +04:00
struct task_struct * task = get_proc_task ( inode ) ;
struct mm_struct * mm = NULL ;
2005-04-17 02:20:36 +04:00
2006-06-26 11:25:55 +04:00
if ( task ) {
mm = get_task_mm ( task ) ;
put_task_struct ( task ) ;
}
2005-04-17 02:20:36 +04:00
if ( ! mm )
goto out ;
down_read ( & mm - > mmap_sem ) ;
vma = mm - > mmap ;
while ( vma ) {
if ( ( vma - > vm_flags & VM_EXECUTABLE ) & & vma - > vm_file )
break ;
vma = vma - > vm_next ;
}
if ( vma ) {
2006-12-08 13:36:36 +03:00
* mnt = mntget ( vma - > vm_file - > f_path . mnt ) ;
* dentry = dget ( vma - > vm_file - > f_path . dentry ) ;
2005-04-17 02:20:36 +04:00
result = 0 ;
}
up_read ( & mm - > mmap_sem ) ;
mmput ( mm ) ;
out :
return result ;
}
/*
 * pad_len_spaces - emit padding after @len characters already printed on
 * the current line of @m.
 *
 * The pad width is 25 + sizeof(void *) * 6 - @len, clamped to a minimum of
 * 1, and is passed as the field width of a %*c conversion that prints a
 * single space character.
 */
static void pad_len_spaces ( struct seq_file * m , int len )
{
len = 25 + sizeof ( void * ) * 6 - len ;
/* Always emit at least one column so the next field stays separated. */
if ( len < 1 )
len = 1 ;
seq_printf ( m , " %*c " , len , ' ' ) ;
}
2008-02-05 09:28:56 +03:00
/*
 * Proportional Set Size (PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it.  So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep accumulated division errors low, pss is kept as a 64-bit
 * fixed-point counter: each share is shifted left by PSS_SHIFT before
 * the division, so (pss >> PSS_SHIFT) would be the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 *	- 1M 3-user-pages add up to 8KB errors;
 *	- supports mapcount up to 2^24, or 16M;
 *	- supports PSS up to 2^52 bytes, or 4PB.
 */
# define PSS_SHIFT 12
2005-09-04 02:55:10 +04:00
struct mem_size_stats
{
unsigned long resident ;
unsigned long shared_clean ;
unsigned long shared_dirty ;
unsigned long private_clean ;
unsigned long private_dirty ;
2007-05-07 01:49:22 +04:00
unsigned long referenced ;
2008-02-05 09:28:56 +03:00
u64 pss ;
2005-09-04 02:55:10 +04:00
} ;
smaps: extract pmd walker from smaps code
Extracts the pmd walker from smaps-specific code in fs/proc/task_mmu.c.
The new struct pmd_walker includes the struct vm_area_struct of the memory to
walk over. Iteration begins at the vma->vm_start and completes at
vma->vm_end. A pointer to another data structure may be stored in the private
field such as struct mem_size_stats, which acts as the smaps accumulator. For
each pmd in the VMA, the action function is called with a pointer to its
struct vm_area_struct, a pointer to the pmd_t, its start and end addresses,
and the private field.
The interface for walking pmd's in a VMA for fs/proc/task_mmu.c is now:
void for_each_pmd(struct vm_area_struct *vma,
void (*action)(struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr,
unsigned long end,
void *private),
void *private);
Since the pmd walker is now extracted from the smaps code, smaps_one_pmd() is
invoked for each pmd in the VMA. Its behavior and efficiency are identical to
the existing implementation.
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-07 01:49:21 +04:00
/*
 * Description of one walk over the pmds of a vma.
 *
 * @vma:     the vma whose range is walked
 * @private: opaque pointer handed back to @action (for smaps this is a
 *           struct mem_size_stats used as the accumulator)
 * @action:  callback invoked per pmd with the vma, the pmd, the start and
 *           end addresses covered, and @private
 */
struct pmd_walker {
	struct vm_area_struct *vma;
	void *private;
	void (*action)(struct vm_area_struct *vma, pmd_t *pmd,
		       unsigned long addr, unsigned long end,
		       void *private);
};
2005-09-04 02:55:10 +04:00
/*
 * show_map_internal - print one vma (@v) to seq_file @m, optionally followed
 * by the statistics collected in @mss.
 *
 * The first output line holds the address range, the r/w/x and
 * shared/private flags, vm_pgoff << PAGE_SHIFT, the device major:minor and
 * the inode number, followed by a name: the file path for a file-backed
 * vma, otherwise arch_vma_name(), or a heap/stack marker chosen from
 * mm->start_brk, mm->brk and mm->start_stack, or a vdso marker when the vma
 * has no mm.  A vma that ends up with no name gets neither padding nor name.
 *
 * When @mss is non-NULL, Size, Rss, Pss, Shared_*, Private_* and Referenced
 * lines follow; the byte counters are shifted right by 10 and pss by
 * (10 + PSS_SHIFT).
 *
 * If the seq_file buffer did not fill up (m->count < m->size), m->version
 * is set to vma->vm_start, or to 0 for the gate vma.
 *
 * Returns -EACCES when maps_protect is set and ptrace_may_attach() fails,
 * 0 otherwise.
 */
static int show_map_internal ( struct seq_file * m , void * v , struct mem_size_stats * mss )
2005-04-17 02:20:36 +04:00
{
2006-06-26 11:25:55 +04:00
struct proc_maps_private * priv = m - > private ;
struct task_struct * task = priv - > task ;
2005-09-04 02:55:10 +04:00
struct vm_area_struct * vma = v ;
struct mm_struct * mm = vma - > vm_mm ;
struct file * file = vma - > vm_file ;
int flags = vma - > vm_flags ;
2005-04-17 02:20:36 +04:00
unsigned long ino = 0 ;
dev_t dev = 0 ;
int len ;
2007-05-08 11:26:04 +04:00
/* With maps_protect set, refuse readers that may not ptrace-attach to the task. */
if ( maps_protect & & ! ptrace_may_attach ( task ) )
return - EACCES ;
2005-04-17 02:20:36 +04:00
/* File-backed vma: report the device and inode number of the backing inode. */
if ( file ) {
2006-12-08 13:36:36 +03:00
struct inode * inode = vma - > vm_file - > f_path . dentry - > d_inode ;
2005-04-17 02:20:36 +04:00
dev = inode - > i_sb - > s_dev ;
ino = inode - > i_ino ;
}
/*
 * The trailing %n stores the number of characters written so far into
 * len; that count sizes the padding in front of the name below.
 *
 * NOTE(review): the offset vm_pgoff << PAGE_SHIFT is printed with %08lx,
 * so it is presumably computed in unsigned long; a large page offset
 * could wrap where long is 32 bits - confirm.
 */
seq_printf ( m , " %08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n " ,
2005-09-04 02:55:10 +04:00
vma - > vm_start ,
vma - > vm_end ,
2005-04-17 02:20:36 +04:00
flags & VM_READ ? ' r ' : ' - ' ,
flags & VM_WRITE ? ' w ' : ' - ' ,
flags & VM_EXEC ? ' x ' : ' - ' ,
flags & VM_MAYSHARE ? ' s ' : ' p ' ,
2005-09-04 02:55:10 +04:00
vma - > vm_pgoff < < PAGE_SHIFT ,
2005-04-17 02:20:36 +04:00
MAJOR ( dev ) , MINOR ( dev ) , ino , & len ) ;
/*
* Print the dentry name for named mappings , and a
* special [ heap ] marker for the heap :
*/
2005-09-04 02:55:10 +04:00
if ( file ) {
2005-04-17 02:20:36 +04:00
pad_len_spaces ( m , len ) ;
2006-12-08 13:36:36 +03:00
seq_path ( m , file - > f_path . mnt , file - > f_path . dentry , " \n " ) ;
2005-04-17 02:20:36 +04:00
} else {
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
const char * name = arch_vma_name ( vma ) ;
if ( ! name ) {
if ( mm ) {
if ( vma - > vm_start < = mm - > start_brk & &
2005-09-04 02:55:10 +04:00
vma - > vm_end > = mm - > brk ) {
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
name = " [heap] " ;
} else if ( vma - > vm_start < = mm - > start_stack & &
vma - > vm_end > = mm - > start_stack ) {
name = " [stack] " ;
2005-04-17 02:20:36 +04:00
}
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
} else {
name = " [vdso] " ;
2005-04-17 02:20:36 +04:00
}
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
}
if ( name ) {
2005-04-17 02:20:36 +04:00
pad_len_spaces ( m , len ) ;
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
seq_puts ( m , name ) ;
2005-04-17 02:20:36 +04:00
}
}
seq_putc ( m , ' \n ' ) ;
2005-09-04 02:55:10 +04:00
if ( mss )
seq_printf ( m ,
2007-05-07 01:49:22 +04:00
" Size: %8lu kB \n "
" Rss: %8lu kB \n "
2008-02-05 09:28:56 +03:00
" Pss: %8lu kB \n "
2007-05-07 01:49:22 +04:00
" Shared_Clean: %8lu kB \n "
" Shared_Dirty: %8lu kB \n "
" Private_Clean: %8lu kB \n "
" Private_Dirty: %8lu kB \n "
2007-05-07 01:49:24 +04:00
" Referenced: %8lu kB \n " ,
2005-09-04 02:55:10 +04:00
( vma - > vm_end - vma - > vm_start ) > > 10 ,
mss - > resident > > 10 ,
2008-02-05 09:28:56 +03:00
( unsigned long ) ( mss - > pss > > ( 10 + PSS_SHIFT ) ) ,
2005-09-04 02:55:10 +04:00
mss - > shared_clean > > 10 ,
mss - > shared_dirty > > 10 ,
mss - > private_clean > > 10 ,
2007-05-07 01:49:22 +04:00
mss - > private_dirty > > 10 ,
mss - > referenced > > 10 ) ;
2005-09-04 02:55:10 +04:00
if ( m - > count < m - > size ) /* vma is copied successfully */
m - > version = ( vma ! = get_gate_vma ( task ) ) ? vma - > vm_start : 0 ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2005-09-04 02:55:10 +04:00
static int show_map ( struct seq_file * m , void * v )
{
2005-11-14 03:07:20 +03:00
return show_map_internal ( m , v , NULL ) ;
2005-09-04 02:55:10 +04:00
}
2007-05-07 01:49:24 +04:00
static void smaps_pte_range ( struct vm_area_struct * vma , pmd_t * pmd ,
unsigned long addr , unsigned long end ,
void * private )
2005-09-04 02:55:10 +04:00
{
smaps: extract pmd walker from smaps code
Extracts the pmd walker from smaps-specific code in fs/proc/task_mmu.c.
The new struct pmd_walker includes the struct vm_area_struct of the memory to
walk over. Iteration begins at the vma->vm_start and completes at
vma->vm_end. A pointer to another data structure may be stored in the private
field such as struct mem_size_stats, which acts as the smaps accumulator. For
each pmd in the VMA, the action function is called with a pointer to its
struct vm_area_struct, a pointer to the pmd_t, its start and end addresses,
and the private field.
The interface for walking pmd's in a VMA for fs/proc/task_mmu.c is now:
void for_each_pmd(struct vm_area_struct *vma,
void (*action)(struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr,
unsigned long end,
void *private),
void *private);
Since the pmd walker is now extracted from the smaps code, smaps_one_pmd() is
invoked for each pmd in the VMA. Its behavior and efficiency is identical to
the existing implementation.
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-07 01:49:21 +04:00
struct mem_size_stats * mss = private ;
2005-09-04 02:55:10 +04:00
pte_t * pte , ptent ;
2005-10-30 04:16:27 +03:00
spinlock_t * ptl ;
2005-09-04 02:55:10 +04:00
struct page * page ;
2008-02-05 09:28:56 +03:00
int mapcount ;
2005-09-04 02:55:10 +04:00
2005-10-30 04:16:27 +03:00
pte = pte_offset_map_lock ( vma - > vm_mm , pmd , addr , & ptl ) ;
smaps: extract pmd walker from smaps code
Extracts the pmd walker from smaps-specific code in fs/proc/task_mmu.c.
The new struct pmd_walker includes the struct vm_area_struct of the memory to
walk over. Iteration begins at the vma->vm_start and completes at
vma->vm_end. A pointer to another data structure may be stored in the private
field such as struct mem_size_stats, which acts as the smaps accumulator. For
each pmd in the VMA, the action function is called with a pointer to its
struct vm_area_struct, a pointer to the pmd_t, its start and end addresses,
and the private field.
The interface for walking pmd's in a VMA for fs/proc/task_mmu.c is now:
void for_each_pmd(struct vm_area_struct *vma,
void (*action)(struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr,
unsigned long end,
void *private),
void *private);
Since the pmd walker is now extracted from the smaps code, smaps_one_pmd() is
invoked for each pmd in the VMA. Its behavior and efficiency are identical to
those of the existing implementation.
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-07 01:49:21 +04:00
for ( ; addr ! = end ; pte + + , addr + = PAGE_SIZE ) {
2005-09-04 02:55:10 +04:00
ptent = * pte ;
2005-10-30 04:16:27 +03:00
if ( ! pte_present ( ptent ) )
2005-09-04 02:55:10 +04:00
continue ;
mss - > resident + = PAGE_SIZE ;
2006-03-07 02:42:58 +03:00
page = vm_normal_page ( vma , addr , ptent ) ;
if ( ! page )
2005-09-04 02:55:10 +04:00
continue ;
2007-05-07 01:49:22 +04:00
/* Accumulate the size in pages that have been accessed. */
if ( pte_young ( ptent ) | | PageReferenced ( page ) )
mss - > referenced + = PAGE_SIZE ;
2008-02-05 09:28:56 +03:00
mapcount = page_mapcount ( page ) ;
if ( mapcount > = 2 ) {
2005-09-04 02:55:10 +04:00
if ( pte_dirty ( ptent ) )
mss - > shared_dirty + = PAGE_SIZE ;
else
mss - > shared_clean + = PAGE_SIZE ;
2008-02-05 09:28:56 +03:00
mss - > pss + = ( PAGE_SIZE < < PSS_SHIFT ) / mapcount ;
2005-09-04 02:55:10 +04:00
} else {
if ( pte_dirty ( ptent ) )
mss - > private_dirty + = PAGE_SIZE ;
else
mss - > private_clean + = PAGE_SIZE ;
2008-02-05 09:28:56 +03:00
mss - > pss + = ( PAGE_SIZE < < PSS_SHIFT ) ;
2005-09-04 02:55:10 +04:00
}
smaps: extract pmd walker from smaps code
Extracts the pmd walker from smaps-specific code in fs/proc/task_mmu.c.
The new struct pmd_walker includes the struct vm_area_struct of the memory to
walk over. Iteration begins at the vma->vm_start and completes at
vma->vm_end. A pointer to another data structure may be stored in the private
field such as struct mem_size_stats, which acts as the smaps accumulator. For
each pmd in the VMA, the action function is called with a pointer to its
struct vm_area_struct, a pointer to the pmd_t, its start and end addresses,
and the private field.
The interface for walking pmd's in a VMA for fs/proc/task_mmu.c is now:
void for_each_pmd(struct vm_area_struct *vma,
void (*action)(struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr,
unsigned long end,
void *private),
void *private);
Since the pmd walker is now extracted from the smaps code, smaps_one_pmd() is
invoked for each pmd in the VMA. Its behavior and efficiency are identical to
those of the existing implementation.
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Christoph Lameter <clameter@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-07 01:49:21 +04:00
}
2005-10-30 04:16:27 +03:00
pte_unmap_unlock ( pte - 1 , ptl ) ;
cond_resched ( ) ;
2005-09-04 02:55:10 +04:00
}
2007-05-07 01:49:24 +04:00
/*
 * clear_refs_pte_range - page-table callback that drops "recently used" state
 * @vma:     VMA that owns the range
 * @pmd:     pmd entry whose PTEs are visited
 * @addr:    first address of the range
 * @end:     address at which the walk stops (exclusive)
 * @private: unused by this callback
 *
 * Steps through [addr, end) one PAGE_SIZE at a time.  For every present PTE
 * for which vm_normal_page() returns a page, the young bit of the PTE is
 * test-and-cleared and the page's Referenced flag is cleared.  The walk runs
 * between pte_offset_map_lock() and pte_unmap_unlock(); cond_resched() is
 * called once the range has been processed.
 */
static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				 unsigned long addr, unsigned long end,
				 void *private)
{
	spinlock_t *ptl;
	pte_t *ptep;

	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);

	while (addr != end) {
		pte_t entry = *ptep;

		if (pte_present(entry)) {
			struct page *pg = vm_normal_page(vma, addr, entry);

			if (pg) {
				/* Clear accessed and referenced bits. */
				ptep_test_and_clear_young(vma, addr, ptep);
				ClearPageReferenced(pg);
			}
		}

		ptep++;
		addr += PAGE_SIZE;
	}

	/* ptep is one past the last entry visited, hence the "- 1". */
	pte_unmap_unlock(ptep - 1, ptl);
	cond_resched();
}
/*
 * walk_pmd_range - run the walker's action on each usable pmd under a pud
 * @walker: carries the vma, the callback and its private cookie
 * @pud:    pud entry whose pmds are visited
 * @addr:   first address of the range
 * @end:    address at which the walk stops (exclusive)
 *
 * The range is split at the boundaries computed by pmd_addr_end().  Entries
 * for which pmd_none_or_clear_bad() reports true are skipped; every other
 * entry is handed to walker->action together with its sub-range.
 */
static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud,
				  unsigned long addr, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, addr);

	while (addr != end) {
		unsigned long next = pmd_addr_end(addr, end);

		if (!pmd_none_or_clear_bad(pmd))
			walker->action(walker->vma, pmd, addr, next,
				       walker->private);

		pmd++;
		addr = next;
	}
}
2007-05-07 01:49:24 +04:00
/*
 * walk_pud_range - descend from a pgd into each usable pud
 * @walker: carries the vma, the callback and its private cookie
 * @pgd:    pgd entry whose puds are visited
 * @addr:   first address of the range
 * @end:    address at which the walk stops (exclusive)
 *
 * The range is split at the boundaries computed by pud_addr_end().  Entries
 * for which pud_none_or_clear_bad() reports true are skipped; every other
 * entry is passed down to walk_pmd_range() with its sub-range.
 */
static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd,
				  unsigned long addr, unsigned long end)
{
	pud_t *pud = pud_offset(pgd, addr);

	while (addr != end) {
		unsigned long next = pud_addr_end(addr, end);

		if (!pud_none_or_clear_bad(pud))
			walk_pmd_range(walker, pud, addr, next);

		pud++;
		addr = next;
	}
}
2007-05-07 01:49:24 +04:00
/*
 * walk_page_range - walk the page tables of a VMA with a callback
 * @vma:     VMA to walk; the range covered is [vma->vm_start, vma->vm_end)
 * @action:  callback invoked for every bottom-level (PTE) page table, with
 *           the vma, the pmd entry, the start and end of the sub-range and
 *           @private
 * @private: private data passed to the callback function
 *
 * Starts at the pgd for vma->vm_start in vma->vm_mm and walks downwards via
 * walk_pud_range().  Top-level entries for which pgd_none_or_clear_bad()
 * reports true are skipped.
 */
static inline void walk_page_range(struct vm_area_struct *vma,
				   void (*action)(struct vm_area_struct *,
						  pmd_t *, unsigned long,
						  unsigned long, void *),
				   void *private)
{
	struct pmd_walker walker = {
		.vma		= vma,
		.action		= action,
		.private	= private,
	};
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);

	while (addr != end) {
		unsigned long next = pgd_addr_end(addr, end);

		if (!pgd_none_or_clear_bad(pgd))
			walk_pud_range(&walker, pgd, addr, next);

		pgd++;
		addr = next;
	}
}
static int show_smap ( struct seq_file * m , void * v )
{
struct vm_area_struct * vma = v ;
struct mem_size_stats mss ;
memset ( & mss , 0 , sizeof mss ) ;
2006-03-07 02:42:57 +03:00
if ( vma - > vm_mm & & ! is_vm_hugetlb_page ( vma ) )
2007-05-07 01:49:24 +04:00
walk_page_range ( vma , smaps_pte_range , & mss ) ;
2005-09-04 02:55:10 +04:00
return show_map_internal ( m , v , & mss ) ;
}
2007-05-07 01:49:24 +04:00
void clear_refs_smap ( struct mm_struct * mm )
{
struct vm_area_struct * vma ;
down_read ( & mm - > mmap_sem ) ;
for ( vma = mm - > mmap ; vma ; vma = vma - > vm_next )
if ( vma - > vm_mm & & ! is_vm_hugetlb_page ( vma ) )
walk_page_range ( vma , clear_refs_pte_range , NULL ) ;
flush_tlb_mm ( mm ) ;
up_read ( & mm - > mmap_sem ) ;
}
2005-04-17 02:20:36 +04:00
/*
 * m_start - seq_file ->start handler shared by the maps-style files
 * @m:   seq_file; m->private is the struct proc_maps_private and m->version
 *       holds the last address remembered from a previous pass
 * @pos: index of the element the caller wants to resume from
 *
 * Returns the vma to show first, the gate vma once the regular vmas are
 * exhausted, or NULL when there is nothing (more) to show.  NULL is also
 * returned when m->version is -1UL, when the task cannot be looked up, or
 * when mm_for_maps() yields no mm.
 *
 * When a regular vma is returned the mm obtained from mm_for_maps() is left
 * as is.  On the end-of-list path this function calls up_read() on
 * mm->mmap_sem and mmput() itself; this implies mm_for_maps() returns with
 * the semaphore read-held and a reference taken - TODO confirm against
 * mm_for_maps().
 */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = m->version;
	struct vm_area_struct *gate = NULL;
	struct vm_area_struct *vma = NULL;
	struct mm_struct *mm;
	loff_t idx = *pos;

	/* Clear the per syscall fields in priv */
	priv->task = NULL;
	priv->tail_vma = NULL;

	/*
	 * We remember last_addr rather than next_addr to hit with
	 * mmap_cache most of the time. We have zero last_addr at
	 * the beginning and also after lseek. We will have -1 last_addr
	 * after the end of the vmas.
	 */
	if (last_addr == -1UL)
		return NULL;

	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
	if (!priv->task)
		return NULL;

	mm = mm_for_maps(priv->task);
	if (!mm)
		return NULL;

	gate = get_gate_vma(priv->task);
	priv->tail_vma = gate;

	/* Start with last addr hint */
	if (last_addr)
		vma = find_vma(mm, last_addr);

	if (vma) {
		/* The hinted vma was already shown; continue with its successor. */
		vma = vma->vm_next;
	} else if ((unsigned long)idx < mm->map_count) {
		/*
		 * The vma index is within range: do a sequential scan
		 * until the requested index.
		 */
		vma = mm->mmap;
		while (idx-- && vma)
			vma = vma->vm_next;
	} else if (idx != mm->map_count) {
		/* After gate vma */
		gate = NULL;
	}

	if (vma)
		return vma;

	/* End of vmas has been reached */
	if (gate)
		m->version = 0;
	else
		m->version = -1UL;

	up_read(&mm->mmap_sem);
	mmput(mm);
	return gate;
}
2006-06-26 11:25:55 +04:00
static void vma_stop ( struct proc_maps_private * priv , struct vm_area_struct * vma )
2005-04-17 02:20:36 +04:00
{
2006-06-26 11:25:55 +04:00
if ( vma & & vma ! = priv - > tail_vma ) {
2005-09-04 02:55:10 +04:00
struct mm_struct * mm = vma - > vm_mm ;
2005-04-17 02:20:36 +04:00
up_read ( & mm - > mmap_sem ) ;
mmput ( mm ) ;
}
}
/*
 * m_next - seq_file ->next handler shared by the maps-style files
 * @m:   seq_file; m->private is the struct proc_maps_private
 * @v:   element that was just shown
 * @pos: position counter, always incremented by one
 *
 * Returns the following vma on the list while there is one.  When the list
 * is exhausted, vma_stop() is called for the current element and
 * priv->tail_vma is returned, unless the current element already was the
 * tail vma, in which case the iteration ends with NULL.
 */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *tail = priv->tail_vma;
	struct vm_area_struct *cur = v;

	(*pos)++;

	if (cur && cur != tail && cur->vm_next)
		return cur->vm_next;

	vma_stop(priv, cur);

	if (cur == tail)
		return NULL;
	return tail;
}
2006-06-26 11:25:55 +04:00
static void m_stop ( struct seq_file * m , void * v )
{
struct proc_maps_private * priv = m - > private ;
struct vm_area_struct * vma = v ;
vma_stop ( priv , vma ) ;
if ( priv - > task )
put_task_struct ( priv - > task ) ;
}
2006-06-26 11:25:48 +04:00
static struct seq_operations proc_pid_maps_op = {
2005-04-17 02:20:36 +04:00
. start = m_start ,
. next = m_next ,
. stop = m_stop ,
. show = show_map
} ;
2005-09-04 02:54:45 +04:00
2006-06-26 11:25:48 +04:00
static struct seq_operations proc_pid_smaps_op = {
2005-09-04 02:55:10 +04:00
. start = m_start ,
. next = m_next ,
. stop = m_stop ,
. show = show_smap
} ;
2006-06-26 11:25:48 +04:00
static int do_maps_open ( struct inode * inode , struct file * file ,
struct seq_operations * ops )
{
2006-06-26 11:25:55 +04:00
struct proc_maps_private * priv ;
int ret = - ENOMEM ;
priv = kzalloc ( sizeof ( * priv ) , GFP_KERNEL ) ;
if ( priv ) {
2006-06-26 11:25:56 +04:00
priv - > pid = proc_pid ( inode ) ;
2006-06-26 11:25:55 +04:00
ret = seq_open ( file , ops ) ;
if ( ! ret ) {
struct seq_file * m = file - > private_data ;
m - > private = priv ;
} else {
kfree ( priv ) ;
}
2006-06-26 11:25:48 +04:00
}
return ret ;
}
/* ->open for the maps file: do_maps_open() with the maps iterator. */
static int maps_open(struct inode *inode, struct file *file)
{
	struct seq_operations *ops = &proc_pid_maps_op;

	return do_maps_open(inode, file, ops);
}
2007-02-12 11:55:34 +03:00
const struct file_operations proc_maps_operations = {
2006-06-26 11:25:48 +04:00
. open = maps_open ,
. read = seq_read ,
. llseek = seq_lseek ,
2006-06-26 11:25:55 +04:00
. release = seq_release_private ,
2006-06-26 11:25:48 +04:00
} ;
2005-09-04 02:54:45 +04:00
# ifdef CONFIG_NUMA
2006-01-08 12:01:02 +03:00
extern int show_numa_map ( struct seq_file * m , void * v ) ;
2005-09-04 02:54:45 +04:00
2007-05-08 11:26:04 +04:00
static int show_numa_map_checked ( struct seq_file * m , void * v )
{
struct proc_maps_private * priv = m - > private ;
struct task_struct * task = priv - > task ;
if ( maps_protect & & ! ptrace_may_attach ( task ) )
return - EACCES ;
return show_numa_map ( m , v ) ;
}
2006-06-26 11:25:48 +04:00
static struct seq_operations proc_pid_numa_maps_op = {
2006-01-08 12:01:02 +03:00
. start = m_start ,
. next = m_next ,
. stop = m_stop ,
2007-05-08 11:26:04 +04:00
. show = show_numa_map_checked
2005-09-04 02:54:45 +04:00
} ;
2006-06-26 11:25:48 +04:00
/* ->open for the numa_maps file: do_maps_open() with the numa_maps iterator. */
static int numa_maps_open(struct inode *inode, struct file *file)
{
	struct seq_operations *ops = &proc_pid_numa_maps_op;

	return do_maps_open(inode, file, ops);
}
2007-02-12 11:55:34 +03:00
const struct file_operations proc_numa_maps_operations = {
2006-06-26 11:25:48 +04:00
. open = numa_maps_open ,
. read = seq_read ,
. llseek = seq_lseek ,
2006-06-26 11:25:55 +04:00
. release = seq_release_private ,
2006-06-26 11:25:48 +04:00
} ;
2005-09-04 02:54:45 +04:00
# endif
2006-06-26 11:25:48 +04:00
/* ->open for the smaps file: do_maps_open() with the smaps iterator. */
static int smaps_open(struct inode *inode, struct file *file)
{
	struct seq_operations *ops = &proc_pid_smaps_op;

	return do_maps_open(inode, file, ops);
}
2007-02-12 11:55:34 +03:00
const struct file_operations proc_smaps_operations = {
2006-06-26 11:25:48 +04:00
. open = smaps_open ,
. read = seq_read ,
. llseek = seq_lseek ,
2006-06-26 11:25:55 +04:00
. release = seq_release_private ,
2006-06-26 11:25:48 +04:00
} ;