2005-04-17 02:20:36 +04:00
# include <linux/mm.h>
# include <linux/hugetlb.h>
# include <linux/mount.h>
# include <linux/seq_file.h>
2005-09-04 02:55:10 +04:00
# include <linux/highmem.h>
2005-09-04 02:54:45 +04:00
# include <linux/pagemap.h>
# include <linux/mempolicy.h>
2005-09-04 02:55:10 +04:00
2005-04-17 02:20:36 +04:00
# include <asm/elf.h>
# include <asm/uaccess.h>
2005-09-04 02:55:10 +04:00
# include <asm/tlbflush.h>
2005-04-17 02:20:36 +04:00
# include "internal.h"
char * task_mem ( struct mm_struct * mm , char * buffer )
{
unsigned long data , text , lib ;
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
unsigned long hiwater_vm , total_vm , hiwater_rss , total_rss ;
/*
* Note : to minimize their overhead , mm maintains hiwater_vm and
* hiwater_rss only when about to * lower * total_vm or rss . Any
* collector of these hiwater stats must therefore get total_vm
* and rss too , which will usually be the higher . Barriers ? not
* worth the effort , such snapshots can always be inconsistent .
*/
hiwater_vm = total_vm = mm - > total_vm ;
if ( hiwater_vm < mm - > hiwater_vm )
hiwater_vm = mm - > hiwater_vm ;
hiwater_rss = total_rss = get_mm_rss ( mm ) ;
if ( hiwater_rss < mm - > hiwater_rss )
hiwater_rss = mm - > hiwater_rss ;
2005-04-17 02:20:36 +04:00
data = mm - > total_vm - mm - > shared_vm - mm - > stack_vm ;
text = ( PAGE_ALIGN ( mm - > end_code ) - ( mm - > start_code & PAGE_MASK ) ) > > 10 ;
lib = ( mm - > exec_vm < < ( PAGE_SHIFT - 10 ) ) - text ;
buffer + = sprintf ( buffer ,
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
" VmPeak: \t %8lu kB \n "
2005-04-17 02:20:36 +04:00
" VmSize: \t %8lu kB \n "
" VmLck: \t %8lu kB \n "
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
" VmHWM: \t %8lu kB \n "
2005-04-17 02:20:36 +04:00
" VmRSS: \t %8lu kB \n "
" VmData: \t %8lu kB \n "
" VmStk: \t %8lu kB \n "
" VmExe: \t %8lu kB \n "
" VmLib: \t %8lu kB \n "
" VmPTE: \t %8lu kB \n " ,
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
hiwater_vm < < ( PAGE_SHIFT - 10 ) ,
( total_vm - mm - > reserved_vm ) < < ( PAGE_SHIFT - 10 ) ,
2005-04-17 02:20:36 +04:00
mm - > locked_vm < < ( PAGE_SHIFT - 10 ) ,
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-30 04:16:18 +03:00
hiwater_rss < < ( PAGE_SHIFT - 10 ) ,
total_rss < < ( PAGE_SHIFT - 10 ) ,
2005-04-17 02:20:36 +04:00
data < < ( PAGE_SHIFT - 10 ) ,
mm - > stack_vm < < ( PAGE_SHIFT - 10 ) , text , lib ,
( PTRS_PER_PTE * sizeof ( pte_t ) * mm - > nr_ptes ) > > 10 ) ;
return buffer ;
}
unsigned long task_vsize ( struct mm_struct * mm )
{
return PAGE_SIZE * mm - > total_vm ;
}
int task_statm ( struct mm_struct * mm , int * shared , int * text ,
int * data , int * resident )
{
2005-10-30 04:16:05 +03:00
* shared = get_mm_counter ( mm , file_rss ) ;
2005-04-17 02:20:36 +04:00
* text = ( PAGE_ALIGN ( mm - > end_code ) - ( mm - > start_code & PAGE_MASK ) )
> > PAGE_SHIFT ;
* data = mm - > total_vm - mm - > shared_vm ;
2005-10-30 04:16:05 +03:00
* resident = * shared + get_mm_counter ( mm , anon_rss ) ;
2005-04-17 02:20:36 +04:00
return mm - > total_vm ;
}
int proc_exe_link ( struct inode * inode , struct dentry * * dentry , struct vfsmount * * mnt )
{
struct vm_area_struct * vma ;
int result = - ENOENT ;
2006-06-26 11:25:55 +04:00
struct task_struct * task = get_proc_task ( inode ) ;
struct mm_struct * mm = NULL ;
2005-04-17 02:20:36 +04:00
2006-06-26 11:25:55 +04:00
if ( task ) {
mm = get_task_mm ( task ) ;
put_task_struct ( task ) ;
}
2005-04-17 02:20:36 +04:00
if ( ! mm )
goto out ;
down_read ( & mm - > mmap_sem ) ;
vma = mm - > mmap ;
while ( vma ) {
if ( ( vma - > vm_flags & VM_EXECUTABLE ) & & vma - > vm_file )
break ;
vma = vma - > vm_next ;
}
if ( vma ) {
* mnt = mntget ( vma - > vm_file - > f_vfsmnt ) ;
* dentry = dget ( vma - > vm_file - > f_dentry ) ;
result = 0 ;
}
up_read ( & mm - > mmap_sem ) ;
mmput ( mm ) ;
out :
return result ;
}
/*
 * pad_len_spaces - pad a maps line out to the name column.
 * @m:   seq_file being written
 * @len: number of characters already emitted on the current line
 *
 * Writes enough spaces to bring the line to 25 + 6 * sizeof(void *)
 * characters, and always at least one space.
 */
static void pad_len_spaces(struct seq_file *m, int len)
{
	len = 25 + sizeof(void *) * 6 - len;
	if (len < 1)
		len = 1;
	/* "%*c" right-justifies a single blank in a field of width len. */
	seq_printf(m, "%*c", len, ' ');
}
2005-09-04 02:55:10 +04:00
/*
 * Per-vma byte totals gathered by the smaps page-table walk.  Each
 * counter grows in PAGE_SIZE steps.
 */
struct mem_size_stats {
	unsigned long resident;		/* every present pte */
	unsigned long shared_clean;	/* mapcount >= 2, pte not dirty */
	unsigned long shared_dirty;	/* mapcount >= 2, pte dirty */
	unsigned long private_clean;	/* mapcount < 2, pte not dirty */
	unsigned long private_dirty;	/* mapcount < 2, pte dirty */
};
static int show_map_internal ( struct seq_file * m , void * v , struct mem_size_stats * mss )
2005-04-17 02:20:36 +04:00
{
2006-06-26 11:25:55 +04:00
struct proc_maps_private * priv = m - > private ;
struct task_struct * task = priv - > task ;
2005-09-04 02:55:10 +04:00
struct vm_area_struct * vma = v ;
struct mm_struct * mm = vma - > vm_mm ;
struct file * file = vma - > vm_file ;
int flags = vma - > vm_flags ;
2005-04-17 02:20:36 +04:00
unsigned long ino = 0 ;
dev_t dev = 0 ;
int len ;
if ( file ) {
2005-09-04 02:55:10 +04:00
struct inode * inode = vma - > vm_file - > f_dentry - > d_inode ;
2005-04-17 02:20:36 +04:00
dev = inode - > i_sb - > s_dev ;
ino = inode - > i_ino ;
}
seq_printf ( m , " %08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n " ,
2005-09-04 02:55:10 +04:00
vma - > vm_start ,
vma - > vm_end ,
2005-04-17 02:20:36 +04:00
flags & VM_READ ? ' r ' : ' - ' ,
flags & VM_WRITE ? ' w ' : ' - ' ,
flags & VM_EXEC ? ' x ' : ' - ' ,
flags & VM_MAYSHARE ? ' s ' : ' p ' ,
2005-09-04 02:55:10 +04:00
vma - > vm_pgoff < < PAGE_SHIFT ,
2005-04-17 02:20:36 +04:00
MAJOR ( dev ) , MINOR ( dev ) , ino , & len ) ;
/*
* Print the dentry name for named mappings , and a
* special [ heap ] marker for the heap :
*/
2005-09-04 02:55:10 +04:00
if ( file ) {
2005-04-17 02:20:36 +04:00
pad_len_spaces ( m , len ) ;
2005-09-04 02:55:10 +04:00
seq_path ( m , file - > f_vfsmnt , file - > f_dentry , " \n " ) ;
2005-04-17 02:20:36 +04:00
} else {
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
const char * name = arch_vma_name ( vma ) ;
if ( ! name ) {
if ( mm ) {
if ( vma - > vm_start < = mm - > start_brk & &
2005-09-04 02:55:10 +04:00
vma - > vm_end > = mm - > brk ) {
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
name = " [heap] " ;
} else if ( vma - > vm_start < = mm - > start_stack & &
vma - > vm_end > = mm - > start_stack ) {
name = " [stack] " ;
2005-04-17 02:20:36 +04:00
}
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
} else {
name = " [vdso] " ;
2005-04-17 02:20:36 +04:00
}
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
}
if ( name ) {
2005-04-17 02:20:36 +04:00
pad_len_spaces ( m , len ) ;
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 13:53:50 +04:00
seq_puts ( m , name ) ;
2005-04-17 02:20:36 +04:00
}
}
seq_putc ( m , ' \n ' ) ;
2005-09-04 02:55:10 +04:00
if ( mss )
seq_printf ( m ,
" Size: %8lu kB \n "
" Rss: %8lu kB \n "
" Shared_Clean: %8lu kB \n "
" Shared_Dirty: %8lu kB \n "
" Private_Clean: %8lu kB \n "
" Private_Dirty: %8lu kB \n " ,
( vma - > vm_end - vma - > vm_start ) > > 10 ,
mss - > resident > > 10 ,
mss - > shared_clean > > 10 ,
mss - > shared_dirty > > 10 ,
mss - > private_clean > > 10 ,
mss - > private_dirty > > 10 ) ;
if ( m - > count < m - > size ) /* vma is copied successfully */
m - > version = ( vma ! = get_gate_vma ( task ) ) ? vma - > vm_start : 0 ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2005-09-04 02:55:10 +04:00
static int show_map ( struct seq_file * m , void * v )
{
2005-11-14 03:07:20 +03:00
return show_map_internal ( m , v , NULL ) ;
2005-09-04 02:55:10 +04:00
}
/*
 * smaps_pte_range - account the ptes of one pmd entry into @mss.
 * @vma:  vma being walked
 * @pmd:  pmd entry covering [@addr, @end)
 * @addr: first address to examine
 * @end:  address at which to stop (exclusive)
 * @mss:  totals to update
 *
 * Every present pte adds PAGE_SIZE to mss->resident.  When
 * vm_normal_page() yields a page, PAGE_SIZE is also added to one of the
 * shared/private, clean/dirty counters: "shared" means
 * page_mapcount() >= 2, "dirty" is taken from the pte.
 *
 * The ptes are obtained with pte_offset_map_lock() and released with
 * pte_unmap_unlock(); cond_resched() is called before returning.
 */
static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mem_size_stats *mss)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		pte_t entry = *pte;
		unsigned long *bucket;
		struct page *page;

		if (!pte_present(entry))
			continue;
		mss->resident += PAGE_SIZE;

		page = vm_normal_page(vma, addr, entry);
		if (!page)
			continue;

		/* Pick the one counter this page belongs to, then bump it. */
		if (page_mapcount(page) >= 2)
			bucket = pte_dirty(entry) ? &mss->shared_dirty
						  : &mss->shared_clean;
		else
			bucket = pte_dirty(entry) ? &mss->private_dirty
						  : &mss->private_clean;
		*bucket += PAGE_SIZE;
	} while (pte++, addr += PAGE_SIZE, addr != end);

	/* pte was advanced one past the last entry examined. */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
}
/*
 * smaps_pmd_range - walk the pmd entries under @pud for [@addr, @end).
 *
 * Entries for which pmd_none_or_clear_bad() is true are skipped; every
 * other entry is passed to smaps_pte_range() with its sub-range.
 */
static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				struct mem_size_stats *mss)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long stop;

	do {
		stop = pmd_addr_end(addr, end);
		if (!pmd_none_or_clear_bad(pmd))
			smaps_pte_range(vma, pmd, addr, stop, mss);
	} while (pmd++, addr = stop, addr != end);
}
/*
 * smaps_pud_range - walk the pud entries under @pgd for [@addr, @end).
 *
 * Entries for which pud_none_or_clear_bad() is true are skipped; every
 * other entry is passed to smaps_pmd_range() with its sub-range.
 */
static inline void smaps_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				struct mem_size_stats *mss)
{
	pud_t *pud = pud_offset(pgd, addr);
	unsigned long stop;

	do {
		stop = pud_addr_end(addr, end);
		if (!pud_none_or_clear_bad(pud))
			smaps_pmd_range(vma, pud, addr, stop, mss);
	} while (pud++, addr = stop, addr != end);
}
static inline void smaps_pgd_range ( struct vm_area_struct * vma ,
unsigned long addr , unsigned long end ,
struct mem_size_stats * mss )
{
pgd_t * pgd ;
unsigned long next ;
pgd = pgd_offset ( vma - > vm_mm , addr ) ;
do {
next = pgd_addr_end ( addr , end ) ;
if ( pgd_none_or_clear_bad ( pgd ) )
continue ;
smaps_pud_range ( vma , pgd , addr , next , mss ) ;
} while ( pgd + + , addr = next , addr ! = end ) ;
}
static int show_smap ( struct seq_file * m , void * v )
{
struct vm_area_struct * vma = v ;
struct mem_size_stats mss ;
memset ( & mss , 0 , sizeof mss ) ;
2006-03-07 02:42:57 +03:00
if ( vma - > vm_mm & & ! is_vm_hugetlb_page ( vma ) )
2005-09-04 02:55:10 +04:00
smaps_pgd_range ( vma , vma - > vm_start , vma - > vm_end , & mss ) ;
return show_map_internal ( m , v , & mss ) ;
}
2005-04-17 02:20:36 +04:00
/*
 * m_start - seq_file ->start callback shared by the maps-style files.
 * @m:   seq_file; m->private is the proc_maps_private, m->version the
 *       resume hint left by the previous read
 * @pos: index of the record to start from
 *
 * Returns the vma to show first, the gate vma once the regular vmas are
 * exhausted, or NULL when there is nothing (more) to show.
 *
 * When a regular vma is returned, the mm reference and mmap_sem (read)
 * taken here stay held and are released by vma_stop().  When the gate
 * vma or NULL is returned they have already been dropped.  The task
 * reference stored in priv->task is released by m_stop().
 */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = m->version;
	struct vm_area_struct *vma, *tail_vma = NULL;
	struct mm_struct *mm;
	loff_t l = *pos;

	/* Clear the per syscall fields in priv */
	priv->task = NULL;
	priv->tail_vma = NULL;

	/*
	 * We remember last_addr rather than next_addr to hit with
	 * mmap_cache most of the time.  We have zero last_addr at
	 * the beginning and also after lseek.  We will have -1 last_addr
	 * after the end of the vmas.
	 */
	if (last_addr == -1UL)
		return NULL;

	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
	if (!priv->task)
		return NULL;

	mm = get_task_mm(priv->task);
	if (!mm)
		return NULL;

	priv->tail_vma = tail_vma = get_gate_vma(priv->task);
	down_read(&mm->mmap_sem);

	/* Start with last addr hint */
	vma = last_addr ? find_vma(mm, last_addr) : NULL;
	if (vma) {
		vma = vma->vm_next;
	} else if ((unsigned long)l < mm->map_count) {
		/*
		 * The index is within the vma list: do a sequential scan
		 * up to it.
		 */
		for (vma = mm->mmap; l-- && vma; )
			vma = vma->vm_next;
	} else if (l != mm->map_count) {
		tail_vma = NULL;	/* After gate vma */
	}

	if (vma)
		return vma;

	/* End of vmas has been reached */
	m->version = (tail_vma != NULL) ? 0 : -1UL;
	up_read(&mm->mmap_sem);
	mmput(mm);
	return tail_vma;
}
2006-06-26 11:25:55 +04:00
static void vma_stop ( struct proc_maps_private * priv , struct vm_area_struct * vma )
2005-04-17 02:20:36 +04:00
{
2006-06-26 11:25:55 +04:00
if ( vma & & vma ! = priv - > tail_vma ) {
2005-09-04 02:55:10 +04:00
struct mm_struct * mm = vma - > vm_mm ;
2005-04-17 02:20:36 +04:00
up_read ( & mm - > mmap_sem ) ;
mmput ( mm ) ;
}
}
/*
 * m_next - seq_file ->next callback shared by the maps-style files.
 * @m:   seq_file; m->private is the proc_maps_private
 * @v:   vma that was just shown
 * @pos: record index, incremented unconditionally
 *
 * Returns the following vma in the list when there is one.  At the end
 * of the list the regular-vma resources are released via vma_stop() and
 * the gate vma is returned; after the gate vma, NULL is returned.
 */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *cur = v;
	struct vm_area_struct *gate = priv->tail_vma;

	++*pos;

	if (cur && cur != gate && cur->vm_next)
		return cur->vm_next;

	vma_stop(priv, cur);
	if (cur == gate)
		return NULL;
	return gate;
}
2006-06-26 11:25:55 +04:00
static void m_stop ( struct seq_file * m , void * v )
{
struct proc_maps_private * priv = m - > private ;
struct vm_area_struct * vma = v ;
vma_stop ( priv , vma ) ;
if ( priv - > task )
put_task_struct ( priv - > task ) ;
}
2006-06-26 11:25:48 +04:00
static struct seq_operations proc_pid_maps_op = {
2005-04-17 02:20:36 +04:00
. start = m_start ,
. next = m_next ,
. stop = m_stop ,
. show = show_map
} ;
2005-09-04 02:54:45 +04:00
2006-06-26 11:25:48 +04:00
static struct seq_operations proc_pid_smaps_op = {
2005-09-04 02:55:10 +04:00
. start = m_start ,
. next = m_next ,
. stop = m_stop ,
. show = show_smap
} ;
2006-06-26 11:25:48 +04:00
static int do_maps_open ( struct inode * inode , struct file * file ,
struct seq_operations * ops )
{
2006-06-26 11:25:55 +04:00
struct proc_maps_private * priv ;
int ret = - ENOMEM ;
priv = kzalloc ( sizeof ( * priv ) , GFP_KERNEL ) ;
if ( priv ) {
2006-06-26 11:25:56 +04:00
priv - > pid = proc_pid ( inode ) ;
2006-06-26 11:25:55 +04:00
ret = seq_open ( file , ops ) ;
if ( ! ret ) {
struct seq_file * m = file - > private_data ;
m - > private = priv ;
} else {
kfree ( priv ) ;
}
2006-06-26 11:25:48 +04:00
}
return ret ;
}
/* ->open for the maps file: do_maps_open() with the maps iterator. */
static int maps_open(struct inode *inode, struct file *file)
{
	struct seq_operations *ops = &proc_pid_maps_op;

	return do_maps_open(inode, file, ops);
}
struct file_operations proc_maps_operations = {
. open = maps_open ,
. read = seq_read ,
. llseek = seq_lseek ,
2006-06-26 11:25:55 +04:00
. release = seq_release_private ,
2006-06-26 11:25:48 +04:00
} ;
2005-09-04 02:54:45 +04:00
# ifdef CONFIG_NUMA
2006-01-08 12:01:02 +03:00
extern int show_numa_map ( struct seq_file * m , void * v ) ;
2005-09-04 02:54:45 +04:00
2006-06-26 11:25:48 +04:00
static struct seq_operations proc_pid_numa_maps_op = {
2006-01-08 12:01:02 +03:00
. start = m_start ,
. next = m_next ,
. stop = m_stop ,
. show = show_numa_map
2005-09-04 02:54:45 +04:00
} ;
2006-06-26 11:25:48 +04:00
/* ->open for the numa_maps file: do_maps_open() with the NUMA iterator. */
static int numa_maps_open(struct inode *inode, struct file *file)
{
	struct seq_operations *ops = &proc_pid_numa_maps_op;

	return do_maps_open(inode, file, ops);
}
struct file_operations proc_numa_maps_operations = {
. open = numa_maps_open ,
. read = seq_read ,
. llseek = seq_lseek ,
2006-06-26 11:25:55 +04:00
. release = seq_release_private ,
2006-06-26 11:25:48 +04:00
} ;
2005-09-04 02:54:45 +04:00
# endif
2006-06-26 11:25:48 +04:00
/* ->open for the smaps file: do_maps_open() with the smaps iterator. */
static int smaps_open(struct inode *inode, struct file *file)
{
	struct seq_operations *ops = &proc_pid_smaps_op;

	return do_maps_open(inode, file, ops);
}
struct file_operations proc_smaps_operations = {
. open = smaps_open ,
. read = seq_read ,
. llseek = seq_lseek ,
2006-06-26 11:25:55 +04:00
. release = seq_release_private ,
2006-06-26 11:25:48 +04:00
} ;