2005-04-16 15:20:36 -07:00
# include <linux/mm.h>
# include <linux/hugetlb.h>
# include <linux/mount.h>
# include <linux/seq_file.h>
2005-09-03 15:55:10 -07:00
# include <linux/highmem.h>
2007-05-08 00:26:04 -07:00
# include <linux/ptrace.h>
2005-09-03 15:54:45 -07:00
# include <linux/pagemap.h>
2008-02-04 22:29:04 -08:00
# include <linux/ptrace.h>
2005-09-03 15:54:45 -07:00
# include <linux/mempolicy.h>
2008-02-04 22:29:04 -08:00
# include <linux/swap.h>
# include <linux/swapops.h>
2008-02-08 04:18:33 -08:00
# include <linux/seq_file.h>
2005-09-03 15:55:10 -07:00
2005-04-16 15:20:36 -07:00
# include <asm/elf.h>
# include <asm/uaccess.h>
2005-09-03 15:55:10 -07:00
# include <asm/tlbflush.h>
2005-04-16 15:20:36 -07:00
# include "internal.h"
2008-02-08 04:18:33 -08:00
void task_mem ( struct seq_file * m , struct mm_struct * mm )
2005-04-16 15:20:36 -07:00
{
unsigned long data , text , lib ;
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:18 -07:00
unsigned long hiwater_vm , total_vm , hiwater_rss , total_rss ;
/*
* Note : to minimize their overhead , mm maintains hiwater_vm and
* hiwater_rss only when about to * lower * total_vm or rss . Any
* collector of these hiwater stats must therefore get total_vm
* and rss too , which will usually be the higher . Barriers ? not
* worth the effort , such snapshots can always be inconsistent .
*/
hiwater_vm = total_vm = mm - > total_vm ;
if ( hiwater_vm < mm - > hiwater_vm )
hiwater_vm = mm - > hiwater_vm ;
hiwater_rss = total_rss = get_mm_rss ( mm ) ;
if ( hiwater_rss < mm - > hiwater_rss )
hiwater_rss = mm - > hiwater_rss ;
2005-04-16 15:20:36 -07:00
data = mm - > total_vm - mm - > shared_vm - mm - > stack_vm ;
text = ( PAGE_ALIGN ( mm - > end_code ) - ( mm - > start_code & PAGE_MASK ) ) > > 10 ;
lib = ( mm - > exec_vm < < ( PAGE_SHIFT - 10 ) ) - text ;
2008-02-08 04:18:33 -08:00
seq_printf ( m ,
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:18 -07:00
" VmPeak: \t %8lu kB \n "
2005-04-16 15:20:36 -07:00
" VmSize: \t %8lu kB \n "
" VmLck: \t %8lu kB \n "
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:18 -07:00
" VmHWM: \t %8lu kB \n "
2005-04-16 15:20:36 -07:00
" VmRSS: \t %8lu kB \n "
" VmData: \t %8lu kB \n "
" VmStk: \t %8lu kB \n "
" VmExe: \t %8lu kB \n "
" VmLib: \t %8lu kB \n "
" VmPTE: \t %8lu kB \n " ,
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:18 -07:00
hiwater_vm < < ( PAGE_SHIFT - 10 ) ,
( total_vm - mm - > reserved_vm ) < < ( PAGE_SHIFT - 10 ) ,
2005-04-16 15:20:36 -07:00
mm - > locked_vm < < ( PAGE_SHIFT - 10 ) ,
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-10-29 18:16:18 -07:00
hiwater_rss < < ( PAGE_SHIFT - 10 ) ,
total_rss < < ( PAGE_SHIFT - 10 ) ,
2005-04-16 15:20:36 -07:00
data < < ( PAGE_SHIFT - 10 ) ,
mm - > stack_vm < < ( PAGE_SHIFT - 10 ) , text , lib ,
( PTRS_PER_PTE * sizeof ( pte_t ) * mm - > nr_ptes ) > > 10 ) ;
}
unsigned long task_vsize ( struct mm_struct * mm )
{
return PAGE_SIZE * mm - > total_vm ;
}
int task_statm ( struct mm_struct * mm , int * shared , int * text ,
int * data , int * resident )
{
2005-10-29 18:16:05 -07:00
* shared = get_mm_counter ( mm , file_rss ) ;
2005-04-16 15:20:36 -07:00
* text = ( PAGE_ALIGN ( mm - > end_code ) - ( mm - > start_code & PAGE_MASK ) )
> > PAGE_SHIFT ;
* data = mm - > total_vm - mm - > shared_vm ;
2005-10-29 18:16:05 -07:00
* resident = * shared + get_mm_counter ( mm , anon_rss ) ;
2005-04-16 15:20:36 -07:00
return mm - > total_vm ;
}
2008-02-14 19:38:35 -08:00
int proc_exe_link ( struct inode * inode , struct path * path )
2005-04-16 15:20:36 -07:00
{
struct vm_area_struct * vma ;
int result = - ENOENT ;
2006-06-26 00:25:55 -07:00
struct task_struct * task = get_proc_task ( inode ) ;
struct mm_struct * mm = NULL ;
2005-04-16 15:20:36 -07:00
2006-06-26 00:25:55 -07:00
if ( task ) {
mm = get_task_mm ( task ) ;
put_task_struct ( task ) ;
}
2005-04-16 15:20:36 -07:00
if ( ! mm )
goto out ;
down_read ( & mm - > mmap_sem ) ;
vma = mm - > mmap ;
while ( vma ) {
if ( ( vma - > vm_flags & VM_EXECUTABLE ) & & vma - > vm_file )
break ;
vma = vma - > vm_next ;
}
if ( vma ) {
2008-02-14 19:38:35 -08:00
* path = vma - > vm_file - > f_path ;
path_get ( & vma - > vm_file - > f_path ) ;
2005-04-16 15:20:36 -07:00
result = 0 ;
}
up_read ( & mm - > mmap_sem ) ;
mmput ( mm ) ;
out :
return result ;
}
/*
 * pad_len_spaces - pad a maps line out to the name column.
 * @m:   seq_file being written
 * @len: number of characters already emitted on the current line
 *
 * The name column starts at 25 + sizeof(void *) * 6 characters.  At least
 * one space is always written, so a name never abuts the preceding field.
 */
static void pad_len_spaces(struct seq_file *m, int len)
{
	len = 25 + sizeof(void *) * 6 - len;
	if (len < 1)
		len = 1;
	/*
	 * "%*c" with a space argument prints exactly len characters; any
	 * extra blanks inside the format would shift the column.
	 */
	seq_printf(m, "%*c", len, ' ');
}
2008-02-04 22:29:03 -08:00
static void vma_stop ( struct proc_maps_private * priv , struct vm_area_struct * vma )
{
if ( vma & & vma ! = priv - > tail_vma ) {
struct mm_struct * mm = vma - > vm_mm ;
up_read ( & mm - > mmap_sem ) ;
mmput ( mm ) ;
}
}
2008-02-04 22:28:56 -08:00
2008-02-04 22:29:03 -08:00
/*
 * m_start - seq_file ->start: position the vma walk for this read.
 * @m:   seq_file; m->private is the proc_maps_private, m->version the
 *       address hint left by the previous show
 * @pos: index of the entry to start from
 *
 * Returns the vma to show first, the tail (gate) vma once the regular
 * vmas are exhausted, or NULL when there is nothing (more) to show.
 *
 * When a regular vma is returned, mmap_sem and the mm reference obtained
 * via mm_for_maps() are still held (NOTE(review): mm_for_maps() is
 * presumably what takes them, since the end-of-list path below releases
 * both - confirm against its definition).  priv->task stays referenced
 * until m_stop().
 */
static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = m->version;
	struct vm_area_struct *vma, *gate = NULL;
	struct mm_struct *mm;
	loff_t index = *pos;

	/* Clear the per syscall fields in priv */
	priv->task = NULL;
	priv->tail_vma = NULL;

	/*
	 * We remember last_addr rather than next_addr to hit with
	 * mmap_cache most of the time.  We have zero last_addr at
	 * the beginning and also after lseek.  We will have -1 last_addr
	 * after the end of the vmas.
	 */
	if (last_addr == -1UL)
		return NULL;

	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
	if (!priv->task)
		return NULL;

	mm = mm_for_maps(priv->task);
	if (!mm)
		return NULL;

	gate = get_gate_vma(priv->task);
	priv->tail_vma = gate;

	/* Start with last addr hint */
	vma = find_vma(mm, last_addr);
	if (last_addr && vma) {
		vma = vma->vm_next;
	} else if ((unsigned long)index < mm->map_count) {
		/*
		 * The vma index is within range: do a sequential scan
		 * until that index.
		 */
		for (vma = mm->mmap; index-- && vma; )
			vma = vma->vm_next;
	} else {
		vma = NULL;
		if (index != mm->map_count)
			gate = NULL;	/* After gate vma */
	}

	if (vma)
		return vma;

	/* End of vmas has been reached */
	m->version = (gate != NULL) ? 0 : -1UL;
	up_read(&mm->mmap_sem);
	mmput(mm);
	return gate;
}
/*
 * m_next - seq_file ->next: advance the vma walk by one entry.
 * @m:   seq_file; m->private is the proc_maps_private
 * @v:   vma that was just shown
 * @pos: entry index, incremented unconditionally
 *
 * Returns the following regular vma if there is one.  Otherwise the
 * walk state is released via vma_stop() and the tail vma is returned,
 * or NULL if the tail vma was the entry just shown.
 */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *gate = priv->tail_vma;
	struct vm_area_struct *cur = v;

	++*pos;

	if (cur && cur != gate && cur->vm_next)
		return cur->vm_next;

	vma_stop(priv, cur);
	if (cur == gate)
		return NULL;
	return gate;
}
static void m_stop ( struct seq_file * m , void * v )
{
struct proc_maps_private * priv = m - > private ;
struct vm_area_struct * vma = v ;
vma_stop ( priv , vma ) ;
if ( priv - > task )
put_task_struct ( priv - > task ) ;
}
static int do_maps_open ( struct inode * inode , struct file * file ,
2008-02-08 04:21:19 -08:00
const struct seq_operations * ops )
2008-02-04 22:29:03 -08:00
{
struct proc_maps_private * priv ;
int ret = - ENOMEM ;
priv = kzalloc ( sizeof ( * priv ) , GFP_KERNEL ) ;
if ( priv ) {
priv - > pid = proc_pid ( inode ) ;
ret = seq_open ( file , ops ) ;
if ( ! ret ) {
struct seq_file * m = file - > private_data ;
m - > private = priv ;
} else {
kfree ( priv ) ;
}
}
return ret ;
}
2005-09-03 15:55:10 -07:00
2008-02-04 22:29:02 -08:00
static int show_map ( struct seq_file * m , void * v )
2005-04-16 15:20:36 -07:00
{
2006-06-26 00:25:55 -07:00
struct proc_maps_private * priv = m - > private ;
struct task_struct * task = priv - > task ;
2005-09-03 15:55:10 -07:00
struct vm_area_struct * vma = v ;
struct mm_struct * mm = vma - > vm_mm ;
struct file * file = vma - > vm_file ;
int flags = vma - > vm_flags ;
2005-04-16 15:20:36 -07:00
unsigned long ino = 0 ;
dev_t dev = 0 ;
int len ;
2007-05-08 00:26:04 -07:00
if ( maps_protect & & ! ptrace_may_attach ( task ) )
return - EACCES ;
2005-04-16 15:20:36 -07:00
if ( file ) {
2006-12-08 02:36:36 -08:00
struct inode * inode = vma - > vm_file - > f_path . dentry - > d_inode ;
2005-04-16 15:20:36 -07:00
dev = inode - > i_sb - > s_dev ;
ino = inode - > i_ino ;
}
seq_printf ( m , " %08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n " ,
2005-09-03 15:55:10 -07:00
vma - > vm_start ,
vma - > vm_end ,
2005-04-16 15:20:36 -07:00
flags & VM_READ ? ' r ' : ' - ' ,
flags & VM_WRITE ? ' w ' : ' - ' ,
flags & VM_EXEC ? ' x ' : ' - ' ,
flags & VM_MAYSHARE ? ' s ' : ' p ' ,
2005-09-03 15:55:10 -07:00
vma - > vm_pgoff < < PAGE_SHIFT ,
2005-04-16 15:20:36 -07:00
MAJOR ( dev ) , MINOR ( dev ) , ino , & len ) ;
/*
* Print the dentry name for named mappings , and a
* special [ heap ] marker for the heap :
*/
2005-09-03 15:55:10 -07:00
if ( file ) {
2005-04-16 15:20:36 -07:00
pad_len_spaces ( m , len ) ;
2008-02-14 19:38:43 -08:00
seq_path ( m , & file - > f_path , " \n " ) ;
2005-04-16 15:20:36 -07:00
} else {
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:50 -07:00
const char * name = arch_vma_name ( vma ) ;
if ( ! name ) {
if ( mm ) {
if ( vma - > vm_start < = mm - > start_brk & &
2005-09-03 15:55:10 -07:00
vma - > vm_end > = mm - > brk ) {
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:50 -07:00
name = " [heap] " ;
} else if ( vma - > vm_start < = mm - > start_stack & &
vma - > vm_end > = mm - > start_stack ) {
name = " [stack] " ;
2005-04-16 15:20:36 -07:00
}
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:50 -07:00
} else {
name = " [vdso] " ;
2005-04-16 15:20:36 -07:00
}
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:50 -07:00
}
if ( name ) {
2005-04-16 15:20:36 -07:00
pad_len_spaces ( m , len ) ;
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma
Move the i386 VDSO down into a vma and thus randomize it.
Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.
It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).
There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO. Newer
distributions (using glibc 2.3.3 or later) can turn this option off. Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.
There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.
(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)
This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.
[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:50 -07:00
seq_puts ( m , name ) ;
2005-04-16 15:20:36 -07:00
}
}
seq_putc ( m , ' \n ' ) ;
2005-09-03 15:55:10 -07:00
if ( m - > count < m - > size ) /* vma is copied successfully */
m - > version = ( vma ! = get_gate_vma ( task ) ) ? vma - > vm_start : 0 ;
2005-04-16 15:20:36 -07:00
return 0 ;
}
2008-02-08 04:21:19 -08:00
static const struct seq_operations proc_pid_maps_op = {
2008-02-04 22:29:03 -08:00
. start = m_start ,
. next = m_next ,
. stop = m_stop ,
. show = show_map
} ;
/*
 * maps_open - ->open for the maps file.
 *
 * Delegates to do_maps_open() with the maps iterator and returns its
 * result unchanged.
 */
static int maps_open(struct inode *inode, struct file *file)
{
	const struct seq_operations *ops = &proc_pid_maps_op;

	return do_maps_open(inode, file, ops);
}
/*
 * File operations for the maps file.  Reading and seeking go through the
 * generic seq_file helpers; release uses seq_release_private, matching
 * the private data attached in do_maps_open().
 */
const struct file_operations proc_maps_operations = {
	.open		= maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};
/*
 * Proportional Set Size (PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it.  So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep accumulated division errors low, PSS is accumulated in a
 * 64-bit fixed-point counter with PSS_SHIFT fractional bits, so
 * (pss >> PSS_SHIFT) is the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 *	- 1M 3-user-pages add up to 8KB errors;
 *	- supports mapcount up to 2^24, or 16M;
 *	- supports PSS up to 2^52 bytes, or 4PB.
 */
# define PSS_SHIFT 12
2008-02-04 22:29:07 -08:00
# ifdef CONFIG_PROC_PAGE_MONITOR
2008-04-28 02:12:55 -07:00
struct mem_size_stats {
2008-02-04 22:29:03 -08:00
struct vm_area_struct * vma ;
unsigned long resident ;
unsigned long shared_clean ;
unsigned long shared_dirty ;
unsigned long private_clean ;
unsigned long private_dirty ;
unsigned long referenced ;
2008-04-28 02:12:55 -07:00
unsigned long swap ;
2008-02-04 22:29:03 -08:00
u64 pss ;
} ;
2008-02-04 22:29:01 -08:00
/*
 * smaps_pte_range - pmd_entry callback that gathers the smaps counters.
 * @pmd:     pmd whose ptes cover [addr, end)
 * @addr:    first virtual address to examine
 * @end:     address just past the last page to examine
 * @private: the struct mem_size_stats being filled in
 *
 * Steps through the ptes one page at a time between
 * pte_offset_map_lock() and pte_unmap_unlock(), adding PAGE_SIZE to each
 * counter the page qualifies for.  Always returns 0.
 */
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   void *private)
{
	struct mem_size_stats *stats = private;
	struct vm_area_struct *vma = stats->vma;
	spinlock_t *lock;
	pte_t *ptep;

	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &lock);
	for (; addr != end; ptep++, addr += PAGE_SIZE) {
		pte_t entry = *ptep;
		struct page *pg;
		int sharers;

		if (is_swap_pte(entry)) {
			stats->swap += PAGE_SIZE;
			continue;
		}

		if (!pte_present(entry))
			continue;

		stats->resident += PAGE_SIZE;

		/* No page from vm_normal_page(): counted as resident only. */
		pg = vm_normal_page(vma, addr, entry);
		if (!pg)
			continue;

		/* Accumulate the size of pages that have been accessed. */
		if (pte_young(entry) || PageReferenced(pg))
			stats->referenced += PAGE_SIZE;

		sharers = page_mapcount(pg);
		if (sharers < 2) {
			/* Sole mapper: the whole page goes to our pss. */
			if (pte_dirty(entry))
				stats->private_dirty += PAGE_SIZE;
			else
				stats->private_clean += PAGE_SIZE;
			stats->pss += (PAGE_SIZE << PSS_SHIFT);
			continue;
		}

		/* Shared page: charge only our 1/mapcount share to pss. */
		if (pte_dirty(entry))
			stats->shared_dirty += PAGE_SIZE;
		else
			stats->shared_clean += PAGE_SIZE;
		stats->pss += (PAGE_SIZE << PSS_SHIFT) / sharers;
	}
	/* The loop leaves ptep one past the last entry it visited. */
	pte_unmap_unlock(ptep - 1, lock);
	cond_resched();

	return 0;
}
2008-02-04 22:29:01 -08:00
/* Page-table walker used by show_smap(): only a pmd-level callback. */
static struct mm_walk smaps_walk = {
	.pmd_entry = smaps_pte_range,
};
2005-09-03 15:55:10 -07:00
/*
 * show_smap - ->show handler of proc_pid_smaps_op.
 * @m: seq_file receiving the output
 * @v: the vm_area_struct to report on
 *
 * Gathers the counters for the vma with smaps_walk, lets show_map() emit
 * its output first, then appends the counters converted to kB.
 * Returns a non-zero value from show_map() unchanged, otherwise 0.
 */
static int show_smap ( struct seq_file * m , void * v )
{
struct vm_area_struct * vma = v ;
struct mem_size_stats mss ;
2008-02-04 22:29:02 -08:00
int ret ;
2005-09-03 15:55:10 -07:00
memset ( & mss , 0 , sizeof mss ) ;
2008-02-04 22:29:01 -08:00
mss . vma = vma ;
2006-03-06 15:42:57 -08:00
/* Counters stay zero for vmas without an mm and for hugetlb vmas. */
if ( vma - > vm_mm & & ! is_vm_hugetlb_page ( vma ) )
2008-02-04 22:29:01 -08:00
walk_page_range ( vma - > vm_mm , vma - > vm_start , vma - > vm_end ,
& smaps_walk , & mss ) ;
2008-02-04 22:29:02 -08:00
ret = show_map ( m , v ) ;
if ( ret )
return ret ;
/* Byte counts become kB via >> 10; pss also drops its PSS_SHIFT fraction. */
seq_printf ( m ,
" Size: %8lu kB \n "
" Rss: %8lu kB \n "
" Pss: %8lu kB \n "
" Shared_Clean: %8lu kB \n "
" Shared_Dirty: %8lu kB \n "
" Private_Clean: %8lu kB \n "
" Private_Dirty: %8lu kB \n "
2008-04-28 02:12:55 -07:00
" Referenced: %8lu kB \n "
" Swap: %8lu kB \n " ,
2008-02-04 22:29:02 -08:00
( vma - > vm_end - vma - > vm_start ) > > 10 ,
mss . resident > > 10 ,
( unsigned long ) ( mss . pss > > ( 10 + PSS_SHIFT ) ) ,
mss . shared_clean > > 10 ,
mss . shared_dirty > > 10 ,
mss . private_clean > > 10 ,
mss . private_dirty > > 10 ,
2008-04-28 02:12:55 -07:00
mss . referenced > > 10 ,
mss . swap > > 10 ) ;
2008-02-04 22:29:02 -08:00
/* ret is 0 here: the non-zero case returned above. */
return ret ;
2005-09-03 15:55:10 -07:00
}
2008-02-08 04:21:19 -08:00
static const struct seq_operations proc_pid_smaps_op = {
2008-02-04 22:29:03 -08:00
. start = m_start ,
. next = m_next ,
. stop = m_stop ,
. show = show_smap
} ;
/* ->open for the smaps file: hand off to do_maps_open() with the smaps ops. */
static int smaps_open(struct inode *inode, struct file *file)
{
	const struct seq_operations *ops = &proc_pid_smaps_op;

	return do_maps_open(inode, file, ops);
}
/* File operations for the smaps file; reading goes through seq_file. */
const struct file_operations proc_smaps_operations = {
	.open		= smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};
/*
 * clear_refs_pte_range - pmd_entry callback that clears reference state.
 * @pmd:     pmd whose ptes cover [addr, end)
 * @addr:    first virtual address to process
 * @end:     address just past the last page to process
 * @private: the vm_area_struct being walked
 *
 * For every present pte that vm_normal_page() resolves to a page, clears
 * the pte's young bit and the page's referenced flag.  Always returns 0.
 */
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, void *private)
{
	struct vm_area_struct *vma = private;
	spinlock_t *lock;
	pte_t *ptep;

	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &lock);
	for (; addr != end; ptep++, addr += PAGE_SIZE) {
		pte_t entry = *ptep;
		struct page *pg = NULL;

		if (pte_present(entry))
			pg = vm_normal_page(vma, addr, entry);

		if (pg) {
			/* Clear accessed and referenced bits. */
			ptep_test_and_clear_young(vma, addr, ptep);
			ClearPageReferenced(pg);
		}
	}
	/* The loop leaves ptep one past the last entry it visited. */
	pte_unmap_unlock(ptep - 1, lock);
	cond_resched();

	return 0;
}
2008-02-04 22:29:01 -08:00
/* Page-table walker used by clear_refs_write(): only a pmd-level callback. */
static struct mm_walk clear_refs_walk = {
	.pmd_entry = clear_refs_pte_range,
};
2008-02-04 22:29:03 -08:00
/*
 * clear_refs_write - ->write handler of proc_clear_refs_operations.
 * @file:  file being written; its inode identifies the target task
 * @buf:   user buffer holding a number, optionally followed by a newline
 * @count: number of bytes offered by the caller
 * @ppos:  file position (not used)
 *
 * At most PROC_NUMBUF - 1 bytes are looked at.  A value that parses to
 * zero is rejected with -EINVAL.  Otherwise every non-hugetlb vma of the
 * task is walked with clear_refs_walk and the mm's TLB is flushed.
 *
 * Returns the number of bytes consumed by the parse, -EFAULT if the user
 * buffer cannot be read, -ESRCH if the task is gone, or -EIO if nothing
 * was consumed.
 */
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	char buffer[PROC_NUMBUF];
	char *end;
	struct task_struct *task;
	struct mm_struct *mm;
	struct vm_area_struct *vma;

	/* Zero-fill so the copied text is always NUL-terminated. */
	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;

	if (simple_strtol(buffer, &end, 0) == 0)
		return -EINVAL;
	if (*end == '\n')
		end++;

	task = get_proc_task(file->f_path.dentry->d_inode);
	if (!task)
		return -ESRCH;

	mm = get_task_mm(task);
	if (mm) {
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (is_vm_hugetlb_page(vma))
				continue;
			walk_page_range(mm, vma->vm_start, vma->vm_end,
					&clear_refs_walk, vma);
		}
		flush_tlb_mm(mm);
		up_read(&mm->mmap_sem);
		mmput(mm);
	}
	put_task_struct(task);

	if (end == buffer)
		return -EIO;

	return end - buffer;
}
2008-02-04 22:29:03 -08:00
/* File operations for the clear_refs file: write-only. */
const struct file_operations proc_clear_refs_operations = {
	.write	= clear_refs_write,
};
2008-02-04 22:29:04 -08:00
/* Output cursor into the user buffer being filled by a pagemap read. */
struct pagemapread {
	char __user *out;	/* next byte to be written */
	char __user *end;	/* one past the last byte of the buffer */
};
2008-03-21 18:46:59 -05:00
/*
 * Layout of one 64-bit pagemap entry, from the top bit down:
 *   PM_STATUS_BITS status bits, PM_PSHIFT_BITS page-shift bits, and the
 *   remaining low bits for the page frame field.
 *
 * All masks and status values are built from unsigned 64-bit constants:
 * the status field includes bit 63, and shifting a signed value into the
 * sign bit is undefined.
 */
#define PM_ENTRY_BYTES      sizeof(u64)
#define PM_STATUS_BITS      3
#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
#define PM_STATUS_MASK      (((1ULL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
#define PM_PSHIFT_BITS      6
#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK      (((1ULL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK      ((1ULL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)

#define PM_PRESENT          PM_STATUS(4ULL)	/* bit 63 */
#define PM_SWAP             PM_STATUS(2ULL)	/* bit 62 */
#define PM_NOT_PRESENT      PM_PSHIFT(PAGE_SHIFT)
2008-02-04 22:29:04 -08:00
# define PM_END_OF_BUFFER 1
/*
 * add_to_pagemap - append one pagemap entry to the user buffer.
 * @addr: virtual address the entry describes (not used here)
 * @pfn:  encoded 64-bit pagemap entry
 * @pm:   output cursor; pm->out is advanced past the bytes written
 *
 * Returns 0 when the entry was stored and room remains for more,
 * PM_END_OF_BUFFER once the buffer has been filled (the final entry is
 * truncated to the bytes that fit), or -EFAULT if the copy faults.
 */
static int add_to_pagemap(unsigned long addr, u64 pfn,
			  struct pagemapread *pm)
{
	size_t len = PM_ENTRY_BYTES;
	int full = 0;

	/*
	 * Make sure there's room in the buffer for more than this entry.
	 * Otherwise this is the last store: copy only the part that fits.
	 */
	if (pm->out + PM_ENTRY_BYTES >= pm->end) {
		len = pm->end - pm->out;
		full = 1;
	}

	/*
	 * pm->out is a char pointer, so put_user() would take the store
	 * size from the pointee type and write a single byte of the entry.
	 * Copy explicitly so that every byte of the entry reaches the user.
	 */
	if (copy_to_user(pm->out, &pfn, len))
		return -EFAULT;
	pm->out += len;

	return full ? PM_END_OF_BUFFER : 0;
}
/*
 * pagemap_pte_hole - pte_hole callback of pagemap_walk.
 * @start:   first address of the range
 * @end:     address just past the range
 * @private: the struct pagemapread output cursor
 *
 * Emits one PM_NOT_PRESENT entry per page in [start, end).  Stops at and
 * returns the first non-zero result of add_to_pagemap(); otherwise 0.
 */
static int pagemap_pte_hole(unsigned long start, unsigned long end,
			    void *private)
{
	struct pagemapread *pm = private;
	unsigned long va;

	for (va = start; va < end; va += PAGE_SIZE) {
		int rc = add_to_pagemap(va, PM_NOT_PRESENT, pm);

		if (rc)
			return rc;
	}

	return 0;
}
2008-04-28 02:12:11 -07:00
/*
 * Pack a swap pte for a pagemap entry: swap type in the low bits, swap
 * offset shifted up by MAX_SWAPFILES_SHIFT above it.
 */
static u64 swap_pte_to_pagemap_entry(pte_t pte)
{
	swp_entry_t entry = pte_to_swp_entry(pte);

	return swp_type(entry) |
	       (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
}
/*
 * pagemap_pte_range - pmd_entry callback of pagemap_walk.
 * @pmd:     pmd whose ptes cover [addr, end)
 * @addr:    first virtual address to report
 * @end:     address just past the last page to report
 * @private: the struct pagemapread output cursor
 *
 * Emits one entry per page: swap ptes carry the packed swap entry with
 * PM_SWAP, present ptes carry the pfn with PM_PRESENT, everything else is
 * PM_NOT_PRESENT.  Returns the first non-zero result of add_to_pagemap()
 * immediately, otherwise 0.
 */
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     void *private)
{
	struct pagemapread *pm = private;
	pte_t *ptep;
	int rc;

	for (; addr != end; addr += PAGE_SIZE) {
		u64 entry;

		ptep = pte_offset_map(pmd, addr);
		if (is_swap_pte(*ptep))
			entry = PM_PFRAME(swap_pte_to_pagemap_entry(*ptep))
				| PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
		else if (pte_present(*ptep))
			entry = PM_PFRAME(pte_pfn(*ptep))
				| PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
		else
			entry = PM_NOT_PRESENT;
		/* unmap so we're not in atomic when we copy to userspace */
		pte_unmap(ptep);

		rc = add_to_pagemap(addr, entry, pm);
		if (rc)
			return rc;
	}

	cond_resched();

	return 0;
}
/* Page-table walker used by pagemap_read(): ptes and holes both emit entries. */
static struct mm_walk pagemap_walk = {
	.pmd_entry	= pagemap_pte_range,
	.pte_hole	= pagemap_pte_hole,
};
/*
* / proc / pid / pagemap - an array mapping virtual pages to pfns
*
2008-03-21 18:46:59 -05:00
* For each page in the address space , this file contains one 64 - bit entry
* consisting of the following :
*
* Bits 0-54  page frame number (PFN) if present
* Bits 0-4   swap type if swapped
* Bits 5-54  swap offset if swapped
* Bits 55 - 60 page shift ( page size = 1 < < page shift )
* Bit 61 reserved for future use
* Bit 62 page swapped
* Bit 63 page present
*
* If the page is not present but in swap , then the PFN contains an
* encoding of the swap file number and the page ' s offset into the
* swap . Unmapped pages return a null PFN . This allows determining
2008-02-04 22:29:04 -08:00
* precisely which pages are mapped ( or in swap ) and comparing mapped
* pages between processes .
*
* Efficient users of this interface will use / proc / pid / maps to
* determine which areas of memory are actually mapped and llseek to
* skip over unmapped regions .
*/
/*
 * pagemap_read - ->read handler of proc_pagemap_operations.
 * @file:  file being read; its inode identifies the target task
 * @buf:   user buffer to fill with pagemap entries
 * @count: size of @buf in bytes
 * @ppos:  file position; must be a multiple of PM_ENTRY_BYTES
 *
 * The pages behind @buf are pinned with get_user_pages() before the
 * target's page tables are walked from the address selected by @ppos,
 * and are marked dirty and released again on the way out.
 *
 * Returns the number of bytes produced (advancing @ppos by the same
 * amount), 0 if the task has no mm, or a negative errno:
 * -ESRCH (no task), -EACCES / -EIO (ptrace_may_attach() refused, before /
 * after pinning), -EINVAL (unaligned position), -ENOMEM, -EFAULT (buffer
 * could not be pinned completely), or an error from the walk.
 */
static ssize_t pagemap_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
	struct page **pages, *page;
	unsigned long uaddr, uend;
	unsigned long src, svpfn, start_vaddr, end_vaddr;
	struct mm_struct *mm;
	struct pagemapread pm;
	int pagecount;
	int ret;

	if (!task) {
		ret = -ESRCH;
		goto out;
	}

	if (!ptrace_may_attach(task)) {
		ret = -EACCES;
		goto out_task;
	}

	/* file position must be aligned */
	if (*ppos % PM_ENTRY_BYTES) {
		ret = -EINVAL;
		goto out_task;
	}

	mm = get_task_mm(task);
	if (!mm) {
		ret = 0;
		goto out_task;
	}

	/* Page span covering [buf, buf + count). */
	uaddr = (unsigned long)buf & PAGE_MASK;
	uend = (unsigned long)(buf + count);
	pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;

	pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto out_mm;
	}

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, uaddr, pagecount,
			     1, 0, pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (ret < 0)
		goto out_free;

	if (ret != pagecount) {
		/* Only release what was actually pinned. */
		pagecount = ret;
		ret = -EFAULT;
		goto out_pages;
	}

	pm.out = buf;
	pm.end = buf + count;

	if (!ptrace_may_attach(task)) {
		ret = -EIO;
		goto out_pages;
	}

	src = *ppos;
	svpfn = src / PM_ENTRY_BYTES;
	start_vaddr = svpfn << PAGE_SHIFT;
	end_vaddr = TASK_SIZE_OF(task);

	/* watch out for wraparound */
	if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
		start_vaddr = end_vaddr;

	/*
	 * The odds are that this will stop walking way before end_vaddr,
	 * because the length of the user buffer is tracked in "pm", and
	 * the walk will stop when we hit the end of the buffer.
	 */
	ret = walk_page_range(mm, start_vaddr, end_vaddr,
			      &pagemap_walk, &pm);
	if (ret == PM_END_OF_BUFFER)
		ret = 0;

	/* Advance by whatever was produced, even if the walk failed. */
	*ppos += pm.out - buf;
	if (!ret)
		ret = pm.out - buf;

out_pages:
	while (pagecount) {
		page = pages[--pagecount];
		if (!PageReserved(page))
			SetPageDirty(page);
		page_cache_release(page);
	}
out_free:
	kfree(pages);
out_mm:
	mmput(mm);
out_task:
	put_task_struct(task);
out:
	return ret;
}
/* File operations for the pagemap file: seekable, read-only. */
const struct file_operations proc_pagemap_operations = {
	.llseek	= mem_lseek,	/* borrow this */
	.read	= pagemap_read,
};
2008-02-04 22:29:07 -08:00
# endif /* CONFIG_PROC_PAGE_MONITOR */
2008-02-04 22:29:04 -08:00
2005-09-03 15:54:45 -07:00
# ifdef CONFIG_NUMA
2006-01-08 01:01:02 -08:00
/*
 * Called by show_numa_map_checked() once its access check has passed.
 * NOTE(review): no definition is visible in this part of the file;
 * presumably it emits the numa_maps record for one vma -- confirm there.
 */
extern int show_numa_map ( struct seq_file * m , void * v ) ;
2005-09-03 15:54:45 -07:00
2007-05-08 00:26:04 -07:00
static int show_numa_map_checked ( struct seq_file * m , void * v )
{
struct proc_maps_private * priv = m - > private ;
struct task_struct * task = priv - > task ;
if ( maps_protect & & ! ptrace_may_attach ( task ) )
return - EACCES ;
return show_numa_map ( m , v ) ;
}
2008-02-08 04:21:19 -08:00
static const struct seq_operations proc_pid_numa_maps_op = {
2006-01-08 01:01:02 -08:00
. start = m_start ,
. next = m_next ,
. stop = m_stop ,
2007-05-08 00:26:04 -07:00
. show = show_numa_map_checked
2005-09-03 15:54:45 -07:00
} ;
2006-06-26 00:25:48 -07:00
/* ->open for the numa_maps file: hand off to do_maps_open() with the numa ops. */
static int numa_maps_open(struct inode *inode, struct file *file)
{
	const struct seq_operations *ops = &proc_pid_numa_maps_op;

	return do_maps_open(inode, file, ops);
}
2007-02-12 00:55:34 -08:00
const struct file_operations proc_numa_maps_operations = {
2006-06-26 00:25:48 -07:00
. open = numa_maps_open ,
. read = seq_read ,
. llseek = seq_lseek ,
2006-06-26 00:25:55 -07:00
. release = seq_release_private ,
2006-06-26 00:25:48 -07:00
} ;
2005-09-03 15:54:45 -07:00
# endif