2005-04-16 15:20:36 -07:00
/* binfmt_elf_fdpic.c: FDPIC ELF binary format
*
2006-07-10 04:44:53 -07:00
* Copyright ( C ) 2003 , 2004 , 2006 Red Hat , Inc . All Rights Reserved .
2005-04-16 15:20:36 -07:00
* Written by David Howells ( dhowells @ redhat . com )
* Derived from binfmt_elf . c
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation ; either version
* 2 of the License , or ( at your option ) any later version .
*/
# include <linux/module.h>
# include <linux/fs.h>
# include <linux/stat.h>
# include <linux/sched.h>
# include <linux/mm.h>
# include <linux/mman.h>
# include <linux/errno.h>
# include <linux/signal.h>
# include <linux/binfmts.h>
# include <linux/string.h>
# include <linux/file.h>
# include <linux/fcntl.h>
# include <linux/slab.h>
2006-07-10 04:44:55 -07:00
# include <linux/pagemap.h>
2008-10-15 22:04:16 -07:00
# include <linux/security.h>
2005-04-16 15:20:36 -07:00
# include <linux/highmem.h>
2006-07-10 04:44:55 -07:00
# include <linux/highuid.h>
2005-04-16 15:20:36 -07:00
# include <linux/personality.h>
# include <linux/ptrace.h>
# include <linux/init.h>
# include <linux/elf.h>
# include <linux/elf-fdpic.h>
# include <linux/elfcore.h>
2010-03-05 13:44:06 -08:00
# include <linux/coredump.h>
2005-04-16 15:20:36 -07:00
# include <asm/uaccess.h>
# include <asm/param.h>
# include <asm/pgalloc.h>
typedef char * elf_caddr_t ;
/*
 * kdebug() - loader trace output.
 *
 * Compiled out by default: with the "#if 0" below the macro expands to an
 * empty statement and its arguments are not evaluated.  Changing the
 * condition to 1 routes the message to printk() with an "FDPIC " prefix and
 * a trailing newline.
 */
#if 0
# define kdebug(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ )
# else
# define kdebug(fmt, ...) do {} while(0)
# endif
2006-07-10 04:44:55 -07:00
/*
 * kdcore() - second trace macro with the same shape as kdebug(): disabled
 * (empty statement) unless the "#if 0" below is flipped, in which case it
 * prints through printk() with an "FDPIC " prefix.
 * NOTE(review): presumably intended for the core-dump path; no caller is
 * visible in this part of the file - confirm.
 */
#if 0
# define kdcore(fmt, ...) printk("FDPIC "fmt"\n" ,##__VA_ARGS__ )
# else
# define kdcore(fmt, ...) do {} while(0)
# endif
2005-04-16 15:20:36 -07:00
MODULE_LICENSE ( " GPL " ) ;
2012-10-20 22:00:48 -04:00
static int load_elf_fdpic_binary ( struct linux_binprm * ) ;
2006-07-10 04:44:53 -07:00
static int elf_fdpic_fetch_phdrs ( struct elf_fdpic_params * , struct file * ) ;
static int elf_fdpic_map_file ( struct elf_fdpic_params * , struct file * ,
struct mm_struct * , const char * ) ;
2005-04-16 15:20:36 -07:00
2006-07-10 04:44:53 -07:00
static int create_elf_fdpic_tables ( struct linux_binprm * , struct mm_struct * ,
struct elf_fdpic_params * ,
struct elf_fdpic_params * ) ;
2005-04-16 15:20:36 -07:00
# ifndef CONFIG_MMU
2006-07-10 04:44:53 -07:00
static int elf_fdpic_transfer_args_to_stack ( struct linux_binprm * ,
unsigned long * ) ;
static int elf_fdpic_map_file_constdisp_on_uclinux ( struct elf_fdpic_params * ,
struct file * ,
struct mm_struct * ) ;
2005-04-16 15:20:36 -07:00
# endif
2006-07-10 04:44:53 -07:00
static int elf_fdpic_map_file_by_direct_mmap ( struct elf_fdpic_params * ,
struct file * , struct mm_struct * ) ;
2005-04-16 15:20:36 -07:00
2009-12-15 16:47:37 -08:00
# ifdef CONFIG_ELF_CORE
2009-12-17 15:27:16 -08:00
static int elf_fdpic_core_dump ( struct coredump_params * cprm ) ;
2006-07-10 04:44:55 -07:00
# endif
2005-04-16 15:20:36 -07:00
static struct linux_binfmt elf_fdpic_format = {
. module = THIS_MODULE ,
. load_binary = load_elf_fdpic_binary ,
2009-12-15 16:47:37 -08:00
# ifdef CONFIG_ELF_CORE
2006-07-10 04:44:55 -07:00
. core_dump = elf_fdpic_core_dump ,
# endif
2005-04-16 15:20:36 -07:00
. min_coredump = ELF_EXEC_PAGESIZE ,
} ;
2006-07-10 04:44:53 -07:00
/*
 * Initcall (hooked up through core_initcall()): make the FDPIC ELF loader
 * known to the binfmt core.  Always reports success.
 */
static int __init init_elf_fdpic_binfmt(void)
{
	register_binfmt(&elf_fdpic_format);

	return 0;
}
2005-04-16 15:20:36 -07:00
2006-07-10 04:44:53 -07:00
/*
 * Module exit hook: withdraw the FDPIC ELF loader from the binfmt core.
 */
static void __exit exit_elf_fdpic_binfmt(void)
{
	unregister_binfmt(&elf_fdpic_format);
}
2006-07-10 04:44:55 -07:00
core_initcall ( init_elf_fdpic_binfmt ) ;
2006-07-10 04:44:53 -07:00
module_exit ( exit_elf_fdpic_binfmt ) ;
2005-04-16 15:20:36 -07:00
static int is_elf_fdpic ( struct elfhdr * hdr , struct file * file )
{
if ( memcmp ( hdr - > e_ident , ELFMAG , SELFMAG ) ! = 0 )
return 0 ;
if ( hdr - > e_type ! = ET_EXEC & & hdr - > e_type ! = ET_DYN )
return 0 ;
if ( ! elf_check_arch ( hdr ) | | ! elf_check_fdpic ( hdr ) )
return 0 ;
2013-09-22 16:27:52 -04:00
if ( ! file - > f_op - > mmap )
2005-04-16 15:20:36 -07:00
return 0 ;
return 1 ;
}
/*****************************************************************************/
/*
* read the program headers table into memory
*/
2006-07-10 04:44:53 -07:00
static int elf_fdpic_fetch_phdrs ( struct elf_fdpic_params * params ,
struct file * file )
2005-04-16 15:20:36 -07:00
{
struct elf32_phdr * phdr ;
unsigned long size ;
int retval , loop ;
if ( params - > hdr . e_phentsize ! = sizeof ( struct elf_phdr ) )
return - ENOMEM ;
if ( params - > hdr . e_phnum > 65536U / sizeof ( struct elf_phdr ) )
return - ENOMEM ;
size = params - > hdr . e_phnum * sizeof ( struct elf_phdr ) ;
params - > phdrs = kmalloc ( size , GFP_KERNEL ) ;
if ( ! params - > phdrs )
return - ENOMEM ;
2006-07-10 04:44:53 -07:00
retval = kernel_read ( file , params - > hdr . e_phoff ,
( char * ) params - > phdrs , size ) ;
2008-04-29 00:59:34 -07:00
if ( unlikely ( retval ! = size ) )
return retval < 0 ? retval : - ENOEXEC ;
2005-04-16 15:20:36 -07:00
/* determine stack size for this binary */
phdr = params - > phdrs ;
for ( loop = 0 ; loop < params - > hdr . e_phnum ; loop + + , phdr + + ) {
if ( phdr - > p_type ! = PT_GNU_STACK )
continue ;
if ( phdr - > p_flags & PF_X )
params - > flags | = ELF_FDPIC_FLAG_EXEC_STACK ;
else
params - > flags | = ELF_FDPIC_FLAG_NOEXEC_STACK ;
params - > stack_size = phdr - > p_memsz ;
break ;
}
return 0 ;
2006-07-10 04:44:53 -07:00
}
2005-04-16 15:20:36 -07:00
/*****************************************************************************/
/*
* load an fdpic binary into various bits of memory
*/
2012-10-20 22:00:48 -04:00
static int load_elf_fdpic_binary ( struct linux_binprm * bprm )
2005-04-16 15:20:36 -07:00
{
struct elf_fdpic_params exec_params , interp_params ;
2012-10-20 22:00:48 -04:00
struct pt_regs * regs = current_pt_regs ( ) ;
2005-04-16 15:20:36 -07:00
struct elf_phdr * phdr ;
2006-07-10 04:44:53 -07:00
unsigned long stack_size , entryaddr ;
# ifdef ELF_FDPIC_PLAT_INIT
unsigned long dynaddr ;
2010-01-06 17:23:17 +00:00
# endif
# ifndef CONFIG_MMU
unsigned long stack_prot ;
2006-07-10 04:44:53 -07:00
# endif
2005-04-16 15:20:36 -07:00
struct file * interpreter = NULL ; /* to shut gcc up */
char * interpreter_name = NULL ;
int executable_stack ;
int retval , i ;
2007-03-23 00:10:00 -07:00
kdebug ( " ____ LOAD %d ____ " , current - > pid ) ;
2005-04-16 15:20:36 -07:00
memset ( & exec_params , 0 , sizeof ( exec_params ) ) ;
memset ( & interp_params , 0 , sizeof ( interp_params ) ) ;
exec_params . hdr = * ( struct elfhdr * ) bprm - > buf ;
exec_params . flags = ELF_FDPIC_FLAG_PRESENT | ELF_FDPIC_FLAG_EXECUTABLE ;
/* check that this is a binary we know how to deal with */
retval = - ENOEXEC ;
if ( ! is_elf_fdpic ( & exec_params . hdr , bprm - > file ) )
goto error ;
/* read the program header table */
retval = elf_fdpic_fetch_phdrs ( & exec_params , bprm - > file ) ;
if ( retval < 0 )
goto error ;
/* scan for a program header that specifies an interpreter */
phdr = exec_params . phdrs ;
for ( i = 0 ; i < exec_params . hdr . e_phnum ; i + + , phdr + + ) {
switch ( phdr - > p_type ) {
case PT_INTERP :
retval = - ENOMEM ;
if ( phdr - > p_filesz > PATH_MAX )
goto error ;
retval = - ENOENT ;
if ( phdr - > p_filesz < 2 )
goto error ;
/* read the name of the interpreter into memory */
2006-01-09 20:54:45 -08:00
interpreter_name = kmalloc ( phdr - > p_filesz , GFP_KERNEL ) ;
2005-04-16 15:20:36 -07:00
if ( ! interpreter_name )
goto error ;
retval = kernel_read ( bprm - > file ,
phdr - > p_offset ,
interpreter_name ,
phdr - > p_filesz ) ;
2008-04-29 00:59:34 -07:00
if ( unlikely ( retval ! = phdr - > p_filesz ) ) {
if ( retval > = 0 )
retval = - ENOEXEC ;
2005-04-16 15:20:36 -07:00
goto error ;
2008-04-29 00:59:34 -07:00
}
2005-04-16 15:20:36 -07:00
retval = - ENOENT ;
if ( interpreter_name [ phdr - > p_filesz - 1 ] ! = ' \0 ' )
goto error ;
kdebug ( " Using ELF interpreter %s " , interpreter_name ) ;
/* replace the program with the interpreter */
interpreter = open_exec ( interpreter_name ) ;
retval = PTR_ERR ( interpreter ) ;
if ( IS_ERR ( interpreter ) ) {
interpreter = NULL ;
goto error ;
}
2007-01-26 00:57:16 -08:00
/*
* If the binary is not readable then enforce
* mm - > dumpable = 0 regardless of the interpreter ' s
* permissions .
*/
2011-06-19 12:49:47 -04:00
would_dump ( bprm , interpreter ) ;
2007-01-26 00:57:16 -08:00
2006-07-10 04:44:53 -07:00
retval = kernel_read ( interpreter , 0 , bprm - > buf ,
BINPRM_BUF_SIZE ) ;
2008-04-29 00:59:34 -07:00
if ( unlikely ( retval ! = BINPRM_BUF_SIZE ) ) {
if ( retval > = 0 )
retval = - ENOEXEC ;
2005-04-16 15:20:36 -07:00
goto error ;
2008-04-29 00:59:34 -07:00
}
2005-04-16 15:20:36 -07:00
interp_params . hdr = * ( ( struct elfhdr * ) bprm - > buf ) ;
break ;
case PT_LOAD :
# ifdef CONFIG_MMU
if ( exec_params . load_addr = = 0 )
exec_params . load_addr = phdr - > p_vaddr ;
# endif
break ;
}
}
if ( elf_check_const_displacement ( & exec_params . hdr ) )
exec_params . flags | = ELF_FDPIC_FLAG_CONSTDISP ;
/* perform insanity checks on the interpreter */
if ( interpreter_name ) {
retval = - ELIBBAD ;
if ( ! is_elf_fdpic ( & interp_params . hdr , interpreter ) )
goto error ;
interp_params . flags = ELF_FDPIC_FLAG_PRESENT ;
/* read the interpreter's program header table */
retval = elf_fdpic_fetch_phdrs ( & interp_params , interpreter ) ;
if ( retval < 0 )
goto error ;
}
stack_size = exec_params . stack_size ;
if ( exec_params . flags & ELF_FDPIC_FLAG_EXEC_STACK )
executable_stack = EXSTACK_ENABLE_X ;
else if ( exec_params . flags & ELF_FDPIC_FLAG_NOEXEC_STACK )
executable_stack = EXSTACK_DISABLE_X ;
else
executable_stack = EXSTACK_DEFAULT ;
2009-09-23 15:57:06 -07:00
if ( stack_size = = 0 ) {
stack_size = interp_params . stack_size ;
if ( interp_params . flags & ELF_FDPIC_FLAG_EXEC_STACK )
executable_stack = EXSTACK_ENABLE_X ;
else if ( interp_params . flags & ELF_FDPIC_FLAG_NOEXEC_STACK )
executable_stack = EXSTACK_DISABLE_X ;
else
executable_stack = EXSTACK_DEFAULT ;
}
2005-04-16 15:20:36 -07:00
retval = - ENOEXEC ;
if ( stack_size = = 0 )
goto error ;
if ( elf_check_const_displacement ( & interp_params . hdr ) )
interp_params . flags | = ELF_FDPIC_FLAG_CONSTDISP ;
/* flush all traces of the currently running executable */
retval = flush_old_exec ( bprm ) ;
if ( retval )
goto error ;
/* there's now no turning back... the old userspace image is dead,
* defunct , deceased , etc . after this point we have to exit via
* error_kill */
set_personality ( PER_LINUX_FDPIC ) ;
2010-01-06 17:23:17 +00:00
if ( elf_read_implies_exec ( & exec_params . hdr , executable_stack ) )
current - > personality | = READ_IMPLIES_EXEC ;
Split 'flush_old_exec' into two functions
'flush_old_exec()' is the point of no return when doing an execve(), and
it is pretty badly misnamed. It doesn't just flush the old executable
environment, it also starts up the new one.
Which is very inconvenient for things like setting up the new
personality, because we want the new personality to affect the starting
of the new environment, but at the same time we do _not_ want the new
personality to take effect if flushing the old one fails.
As a result, the x86-64 '32-bit' personality is actually done using this
insane "I'm going to change the ABI, but I haven't done it yet" bit
(TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the
personality, but just the "pending" bit, so that "flush_thread()" can do
the actual personality magic.
This patch in no way changes any of that insanity, but it does split the
'flush_old_exec()' function up into a preparatory part that can fail
(still called flush_old_exec()), and a new part that will actually set
up the new exec environment (setup_new_exec()). All callers are changed
to trivially comply with the new world order.
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-28 22:14:42 -08:00
setup_new_exec ( bprm ) ;
2005-04-16 15:20:36 -07:00
set_binfmt ( & elf_fdpic_format ) ;
current - > mm - > start_code = 0 ;
current - > mm - > end_code = 0 ;
current - > mm - > start_stack = 0 ;
current - > mm - > start_data = 0 ;
current - > mm - > end_data = 0 ;
current - > mm - > context . exec_fdpic_loadmap = 0 ;
current - > mm - > context . interp_fdpic_loadmap = 0 ;
# ifdef CONFIG_MMU
elf_fdpic_arch_lay_out_mm ( & exec_params ,
& interp_params ,
& current - > mm - > start_stack ,
& current - > mm - > start_brk ) ;
2006-07-10 04:44:53 -07:00
retval = setup_arg_pages ( bprm , current - > mm - > start_stack ,
executable_stack ) ;
2005-04-16 15:20:36 -07:00
if ( retval < 0 ) {
send_sig ( SIGKILL , current , 0 ) ;
goto error_kill ;
}
# endif
/* load the executable and interpreter into memory */
2006-07-10 04:44:53 -07:00
retval = elf_fdpic_map_file ( & exec_params , bprm - > file , current - > mm ,
" executable " ) ;
2005-04-16 15:20:36 -07:00
if ( retval < 0 )
goto error_kill ;
if ( interpreter_name ) {
retval = elf_fdpic_map_file ( & interp_params , interpreter ,
current - > mm , " interpreter " ) ;
if ( retval < 0 ) {
printk ( KERN_ERR " Unable to load interpreter \n " ) ;
goto error_kill ;
}
allow_write_access ( interpreter ) ;
fput ( interpreter ) ;
interpreter = NULL ;
}
# ifdef CONFIG_MMU
if ( ! current - > mm - > start_brk )
current - > mm - > start_brk = current - > mm - > end_data ;
2006-07-10 04:44:53 -07:00
current - > mm - > brk = current - > mm - > start_brk =
PAGE_ALIGN ( current - > mm - > start_brk ) ;
2005-04-16 15:20:36 -07:00
# else
/* create a stack and brk area big enough for everyone
* - the brk heap starts at the bottom and works up
* - the stack starts at the top and works down
*/
stack_size = ( stack_size + PAGE_SIZE - 1 ) & PAGE_MASK ;
if ( stack_size < PAGE_SIZE * 2 )
stack_size = PAGE_SIZE * 2 ;
2010-01-06 17:23:17 +00:00
stack_prot = PROT_READ | PROT_WRITE ;
if ( executable_stack = = EXSTACK_ENABLE_X | |
( executable_stack = = EXSTACK_DEFAULT & & VM_STACK_FLAGS & VM_EXEC ) )
stack_prot | = PROT_EXEC ;
2012-04-20 17:13:58 -07:00
current - > mm - > start_brk = vm_mmap ( NULL , 0 , stack_size , stack_prot ,
2009-12-14 18:00:02 -08:00
MAP_PRIVATE | MAP_ANONYMOUS |
MAP_UNINITIALIZED | MAP_GROWSDOWN ,
2005-04-16 15:20:36 -07:00
0 ) ;
2006-07-10 04:44:53 -07:00
if ( IS_ERR_VALUE ( current - > mm - > start_brk ) ) {
2005-04-16 15:20:36 -07:00
retval = current - > mm - > start_brk ;
current - > mm - > start_brk = 0 ;
goto error_kill ;
}
current - > mm - > brk = current - > mm - > start_brk ;
current - > mm - > context . end_brk = current - > mm - > start_brk ;
2006-07-10 04:44:53 -07:00
current - > mm - > context . end_brk + =
( stack_size > PAGE_SIZE ) ? ( stack_size - PAGE_SIZE ) : 0 ;
2005-04-16 15:20:36 -07:00
current - > mm - > start_stack = current - > mm - > start_brk + stack_size ;
# endif
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 10:39:24 +11:00
install_exec_creds ( bprm ) ;
2006-07-10 04:44:53 -07:00
if ( create_elf_fdpic_tables ( bprm , current - > mm ,
& exec_params , & interp_params ) < 0 )
2005-04-16 15:20:36 -07:00
goto error_kill ;
2006-07-10 04:44:53 -07:00
kdebug ( " - start_code %lx " , current - > mm - > start_code ) ;
kdebug ( " - end_code %lx " , current - > mm - > end_code ) ;
kdebug ( " - start_data %lx " , current - > mm - > start_data ) ;
kdebug ( " - end_data %lx " , current - > mm - > end_data ) ;
kdebug ( " - start_brk %lx " , current - > mm - > start_brk ) ;
kdebug ( " - brk %lx " , current - > mm - > brk ) ;
kdebug ( " - start_stack %lx " , current - > mm - > start_stack ) ;
2005-04-16 15:20:36 -07:00
# ifdef ELF_FDPIC_PLAT_INIT
/*
* The ABI may specify that certain registers be set up in special
* ways ( on i386 % edx is the address of a DT_FINI function , for
* example . This macro performs whatever initialization to
* the regs structure is required .
*/
2006-07-10 04:44:53 -07:00
dynaddr = interp_params . dynamic_addr ? : exec_params . dynamic_addr ;
ELF_FDPIC_PLAT_INIT ( regs , exec_params . map_addr , interp_params . map_addr ,
dynaddr ) ;
2005-04-16 15:20:36 -07:00
# endif
/* everything is now ready... get the userspace context ready to roll */
2006-07-10 04:44:53 -07:00
entryaddr = interp_params . entry_addr ? : exec_params . entry_addr ;
start_thread ( regs , entryaddr , current - > mm - > start_stack ) ;
2005-04-16 15:20:36 -07:00
retval = 0 ;
error :
if ( interpreter ) {
allow_write_access ( interpreter ) ;
fput ( interpreter ) ;
}
2005-11-07 01:01:34 -08:00
kfree ( interpreter_name ) ;
kfree ( exec_params . phdrs ) ;
kfree ( exec_params . loadmap ) ;
kfree ( interp_params . phdrs ) ;
kfree ( interp_params . loadmap ) ;
2005-04-16 15:20:36 -07:00
return retval ;
/* unrecoverable error - kill the process */
2006-07-10 04:44:53 -07:00
error_kill :
2005-04-16 15:20:36 -07:00
send_sig ( SIGSEGV , current , 0 ) ;
goto error ;
2006-07-10 04:44:53 -07:00
}
2005-04-16 15:20:36 -07:00
/*****************************************************************************/
2008-10-15 22:04:15 -07:00
# ifndef ELF_BASE_PLATFORM
/*
 * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
 * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
 * will be copied to the user stack in the same manner as AT_PLATFORM.
 *
 * When the arch does not define it, fall back to NULL here;
 * create_elf_fdpic_tables() only copies the string when it is non-NULL.
 */
# define ELF_BASE_PLATFORM NULL
# endif
2005-04-16 15:20:36 -07:00
/*
2008-10-15 22:04:15 -07:00
* present useful information to the program by shovelling it onto the new
* process ' s stack
2005-04-16 15:20:36 -07:00
*/
static int create_elf_fdpic_tables ( struct linux_binprm * bprm ,
struct mm_struct * mm ,
struct elf_fdpic_params * exec_params ,
struct elf_fdpic_params * interp_params )
{
2008-11-14 10:39:18 +11:00
const struct cred * cred = current_cred ( ) ;
2005-04-16 15:20:36 -07:00
unsigned long sp , csp , nitems ;
2006-06-23 02:04:05 -07:00
elf_caddr_t __user * argv , * envp ;
2005-04-16 15:20:36 -07:00
size_t platform_len = 0 , len ;
2008-10-15 22:04:15 -07:00
char * k_platform , * k_base_platform ;
char __user * u_platform , * u_base_platform , * p ;
2005-04-16 15:20:36 -07:00
int loop ;
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
int nr ; /* reset for each csp adjustment */
2005-04-16 15:20:36 -07:00
# ifdef CONFIG_MMU
2008-10-15 22:04:15 -07:00
/* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
* by the processes running on the same package . One thing we can do is
* to shuffle the initial stack for them , so we give the architecture
* an opportunity to do so here .
*/
sp = arch_align_stack ( bprm - > p ) ;
2005-04-16 15:20:36 -07:00
# else
sp = mm - > start_stack ;
/* stack the program arguments and environment */
if ( elf_fdpic_transfer_args_to_stack ( bprm , & sp ) < 0 )
return - EFAULT ;
# endif
2008-10-15 22:04:15 -07:00
/*
* If this architecture has a platform capability string , copy it
* to userspace . In some cases ( Sparc ) , this info is impossible
* for userspace to get any other way , in others ( i386 ) it is
* merely difficult .
*/
2005-04-16 15:20:36 -07:00
k_platform = ELF_PLATFORM ;
2006-07-10 04:44:50 -07:00
u_platform = NULL ;
2005-04-16 15:20:36 -07:00
if ( k_platform ) {
platform_len = strlen ( k_platform ) + 1 ;
sp - = platform_len ;
2006-06-23 02:04:05 -07:00
u_platform = ( char __user * ) sp ;
2005-04-16 15:20:36 -07:00
if ( __copy_to_user ( u_platform , k_platform , platform_len ) ! = 0 )
return - EFAULT ;
}
2008-10-15 22:04:15 -07:00
/*
* If this architecture has a " base " platform capability
* string , copy it to userspace .
*/
k_base_platform = ELF_BASE_PLATFORM ;
u_base_platform = NULL ;
if ( k_base_platform ) {
platform_len = strlen ( k_base_platform ) + 1 ;
sp - = platform_len ;
u_base_platform = ( char __user * ) sp ;
if ( __copy_to_user ( u_base_platform , k_base_platform , platform_len ) ! = 0 )
return - EFAULT ;
}
2005-04-16 15:20:36 -07:00
sp & = ~ 7UL ;
/* stack the load map(s) */
len = sizeof ( struct elf32_fdpic_loadmap ) ;
len + = sizeof ( struct elf32_fdpic_loadseg ) * exec_params - > loadmap - > nsegs ;
sp = ( sp - len ) & ~ 7UL ;
exec_params - > map_addr = sp ;
2006-06-23 02:04:05 -07:00
if ( copy_to_user ( ( void __user * ) sp , exec_params - > loadmap , len ) ! = 0 )
2005-04-16 15:20:36 -07:00
return - EFAULT ;
current - > mm - > context . exec_fdpic_loadmap = ( unsigned long ) sp ;
if ( interp_params - > loadmap ) {
len = sizeof ( struct elf32_fdpic_loadmap ) ;
2006-07-10 04:44:53 -07:00
len + = sizeof ( struct elf32_fdpic_loadseg ) *
interp_params - > loadmap - > nsegs ;
2005-04-16 15:20:36 -07:00
sp = ( sp - len ) & ~ 7UL ;
interp_params - > map_addr = sp ;
2006-07-10 04:44:53 -07:00
if ( copy_to_user ( ( void __user * ) sp , interp_params - > loadmap ,
len ) ! = 0 )
2005-04-16 15:20:36 -07:00
return - EFAULT ;
current - > mm - > context . interp_fdpic_loadmap = ( unsigned long ) sp ;
}
/* force 16 byte _final_ alignment here for generality */
2008-10-15 22:04:16 -07:00
# define DLINFO_ITEMS 15
2005-04-16 15:20:36 -07:00
2008-10-15 22:04:15 -07:00
nitems = 1 + DLINFO_ITEMS + ( k_platform ? 1 : 0 ) +
( k_base_platform ? 1 : 0 ) + AT_VECTOR_SIZE_ARCH ;
2005-04-16 15:20:36 -07:00
2008-10-15 22:04:16 -07:00
if ( bprm - > interp_flags & BINPRM_FLAGS_EXECFD )
nitems + + ;
2005-04-16 15:20:36 -07:00
csp = sp ;
sp - = nitems * 2 * sizeof ( unsigned long ) ;
sp - = ( bprm - > envc + 1 ) * sizeof ( char * ) ; /* envv[] */
sp - = ( bprm - > argc + 1 ) * sizeof ( char * ) ; /* argv[] */
sp - = 1 * sizeof ( unsigned long ) ; /* argc */
csp - = sp & 15UL ;
sp - = sp & 15UL ;
/* put the ELF interpreter info on the stack */
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
# define NEW_AUX_ENT(id, val) \
2006-07-10 04:44:53 -07:00
do { \
struct { unsigned long _id , _val ; } __user * ent ; \
\
ent = ( void __user * ) csp ; \
__put_user ( ( id ) , & ent [ nr ] . _id ) ; \
__put_user ( ( val ) , & ent [ nr ] . _val ) ; \
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only one of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
nr + + ; \
2005-04-16 15:20:36 -07:00
} while ( 0 )
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only one of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
nr = 0 ;
2005-04-16 15:20:36 -07:00
csp - = 2 * sizeof ( unsigned long ) ;
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only one of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
NEW_AUX_ENT ( AT_NULL , 0 ) ;
2005-04-16 15:20:36 -07:00
if ( k_platform ) {
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only one of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
nr = 0 ;
2005-04-16 15:20:36 -07:00
csp - = 2 * sizeof ( unsigned long ) ;
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only one of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
NEW_AUX_ENT ( AT_PLATFORM ,
2006-07-10 04:44:53 -07:00
( elf_addr_t ) ( unsigned long ) u_platform ) ;
2005-04-16 15:20:36 -07:00
}
2008-10-15 22:04:15 -07:00
if ( k_base_platform ) {
nr = 0 ;
csp - = 2 * sizeof ( unsigned long ) ;
NEW_AUX_ENT ( AT_BASE_PLATFORM ,
( elf_addr_t ) ( unsigned long ) u_base_platform ) ;
}
2008-10-15 22:04:16 -07:00
if ( bprm - > interp_flags & BINPRM_FLAGS_EXECFD ) {
nr = 0 ;
csp - = 2 * sizeof ( unsigned long ) ;
NEW_AUX_ENT ( AT_EXECFD , bprm - > interp_data ) ;
}
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only one of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
nr = 0 ;
2005-04-16 15:20:36 -07:00
csp - = DLINFO_ITEMS * 2 * sizeof ( unsigned long ) ;
2013-04-17 17:33:11 +00:00
NEW_AUX_ENT ( AT_HWCAP , ELF_HWCAP ) ;
# ifdef ELF_HWCAP2
NEW_AUX_ENT ( AT_HWCAP2 , ELF_HWCAP2 ) ;
# endif
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only one of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
NEW_AUX_ENT ( AT_PAGESZ , PAGE_SIZE ) ;
NEW_AUX_ENT ( AT_CLKTCK , CLOCKS_PER_SEC ) ;
NEW_AUX_ENT ( AT_PHDR , exec_params - > ph_addr ) ;
NEW_AUX_ENT ( AT_PHENT , sizeof ( struct elf_phdr ) ) ;
NEW_AUX_ENT ( AT_PHNUM , exec_params - > hdr . e_phnum ) ;
NEW_AUX_ENT ( AT_BASE , interp_params - > elfhdr_addr ) ;
NEW_AUX_ENT ( AT_FLAGS , 0 ) ;
NEW_AUX_ENT ( AT_ENTRY , exec_params - > entry_addr ) ;
2012-02-07 18:36:10 -08:00
NEW_AUX_ENT ( AT_UID , ( elf_addr_t ) from_kuid_munged ( cred - > user_ns , cred - > uid ) ) ;
NEW_AUX_ENT ( AT_EUID , ( elf_addr_t ) from_kuid_munged ( cred - > user_ns , cred - > euid ) ) ;
NEW_AUX_ENT ( AT_GID , ( elf_addr_t ) from_kgid_munged ( cred - > user_ns , cred - > gid ) ) ;
NEW_AUX_ENT ( AT_EGID , ( elf_addr_t ) from_kgid_munged ( cred - > user_ns , cred - > egid ) ) ;
2008-10-15 22:04:16 -07:00
NEW_AUX_ENT ( AT_SECURE , security_bprm_secureexec ( bprm ) ) ;
NEW_AUX_ENT ( AT_EXECFN , bprm - > exec ) ;
2005-04-16 15:20:36 -07:00
# ifdef ARCH_DLINFO
binfmt_elf_fdpic: Magical stack pointer index, for NEW_AUX_ENT compat.
While implementing binfmt_elf_fdpic on SH it quickly became apparent
that SH was the first platform to support both binfmt_elf_fdpic and
binfmt_elf, as well as the only one of the FDPIC platforms to make use of the
auxvt.
Currently binfmt_elf_fdpic uses a special version of NEW_AUX_ENT() where
the first argument is the entry displacement after csp has been adjusted,
being reset after each adjustment. As we have no ability to sort this out
through the platform's ARCH_DLINFO, this index needs to be managed
entirely in create_elf_fdpic_tables(). Presently none of the platforms
that set their own auxvt entries are able to do so through their
respective ARCH_DLINFOs when using binfmt_elf_fdpic.
In addition to this, binfmt_elf_fdpic has been looking at
DLINFO_ARCH_ITEMS for the number of architecture-specific entries in the
auxvt. This is legacy cruft, and is not defined by any platforms in-tree,
even those that make heavy use of the auxvt. AT_VECTOR_SIZE_ARCH is
always available, and contains the number that is of interest here, so we
switch to using that unconditionally as well.
As this has direct bearing on how much stack is used, platforms that have
configurable (or dynamically adjustable) NEW_AUX_ENT calls need to either
make AT_VECTOR_SIZE_ARCH more fine-grained, or leave it as a worst-case
and live with some lost stack space if those entries aren't pushed (some
platforms may also need to purposely sacrifice some space here for
alignment considerations, as noted in the code -- although not an issue
for any FDPIC-capable platform today).
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: David Howells <dhowells@redhat.com>
2008-05-19 13:34:45 +09:00
nr = 0 ;
csp - = AT_VECTOR_SIZE_ARCH * 2 * sizeof ( unsigned long ) ;
2005-04-16 15:20:36 -07:00
/* ARCH_DLINFO must come last so platform specific code can enforce
* special alignment requirements on the AUXV if necessary ( eg . PPC ) .
*/
ARCH_DLINFO ;
# endif
# undef NEW_AUX_ENT
/* allocate room for argv[] and envv[] */
csp - = ( bprm - > envc + 1 ) * sizeof ( elf_caddr_t ) ;
2006-06-23 02:04:05 -07:00
envp = ( elf_caddr_t __user * ) csp ;
2005-04-16 15:20:36 -07:00
csp - = ( bprm - > argc + 1 ) * sizeof ( elf_caddr_t ) ;
2006-06-23 02:04:05 -07:00
argv = ( elf_caddr_t __user * ) csp ;
2005-04-16 15:20:36 -07:00
/* stack argc */
csp - = sizeof ( unsigned long ) ;
2006-06-23 02:04:05 -07:00
__put_user ( bprm - > argc , ( unsigned long __user * ) csp ) ;
2005-04-16 15:20:36 -07:00
2006-03-24 18:38:48 +01:00
BUG_ON ( csp ! = sp ) ;
2005-04-16 15:20:36 -07:00
/* fill in the argv[] array */
# ifdef CONFIG_MMU
current - > mm - > arg_start = bprm - > p ;
# else
2006-07-10 04:44:53 -07:00
current - > mm - > arg_start = current - > mm - > start_stack -
( MAX_ARG_PAGES * PAGE_SIZE - bprm - > p ) ;
2005-04-16 15:20:36 -07:00
# endif
2006-06-23 02:04:05 -07:00
p = ( char __user * ) current - > mm - > arg_start ;
2005-04-16 15:20:36 -07:00
for ( loop = bprm - > argc ; loop > 0 ; loop - - ) {
__put_user ( ( elf_caddr_t ) p , argv + + ) ;
2007-07-19 01:48:16 -07:00
len = strnlen_user ( p , MAX_ARG_STRLEN ) ;
if ( ! len | | len > MAX_ARG_STRLEN )
2005-04-16 15:20:36 -07:00
return - EINVAL ;
p + = len ;
}
__put_user ( NULL , argv ) ;
current - > mm - > arg_end = ( unsigned long ) p ;
/* fill in the envv[] array */
current - > mm - > env_start = ( unsigned long ) p ;
for ( loop = bprm - > envc ; loop > 0 ; loop - - ) {
__put_user ( ( elf_caddr_t ) ( unsigned long ) p , envp + + ) ;
2007-07-19 01:48:16 -07:00
len = strnlen_user ( p , MAX_ARG_STRLEN ) ;
if ( ! len | | len > MAX_ARG_STRLEN )
2005-04-16 15:20:36 -07:00
return - EINVAL ;
p + = len ;
}
__put_user ( NULL , envp ) ;
current - > mm - > env_end = ( unsigned long ) p ;
mm - > start_stack = ( unsigned long ) sp ;
return 0 ;
2006-07-10 04:44:53 -07:00
}
2005-04-16 15:20:36 -07:00
/*****************************************************************************/
/*
* transfer the program arguments and environment from the holding pages onto
* the stack
*/
# ifndef CONFIG_MMU
2006-07-10 04:44:53 -07:00
static int elf_fdpic_transfer_args_to_stack ( struct linux_binprm * bprm ,
unsigned long * _sp )
2005-04-16 15:20:36 -07:00
{
unsigned long index , stop , sp ;
char * src ;
int ret = 0 ;
stop = bprm - > p > > PAGE_SHIFT ;
sp = * _sp ;
for ( index = MAX_ARG_PAGES - 1 ; index > = stop ; index - - ) {
src = kmap ( bprm - > page [ index ] ) ;
sp - = PAGE_SIZE ;
if ( copy_to_user ( ( void * ) sp , src , PAGE_SIZE ) ! = 0 )
ret = - EFAULT ;
kunmap ( bprm - > page [ index ] ) ;
if ( ret < 0 )
goto out ;
}
* _sp = ( * _sp - ( MAX_ARG_PAGES * PAGE_SIZE - bprm - > p ) ) & ~ 15 ;
2006-07-10 04:44:53 -07:00
out :
2005-04-16 15:20:36 -07:00
return ret ;
2006-07-10 04:44:53 -07:00
}
2005-04-16 15:20:36 -07:00
# endif
/*****************************************************************************/
/*
* load the appropriate binary image ( executable or interpreter ) into memory
* - we assume no MMU is available
* - if no other PIC bits are set in params - > hdr - > e_flags
* - we assume that the LOADable segments in the binary are independently relocatable
* - we assume R / O executable segments are shareable
* - else
* - we assume the loadable parts of the image to require fixed displacement
* - the image is not shareable
*/
static int elf_fdpic_map_file ( struct elf_fdpic_params * params ,
struct file * file ,
struct mm_struct * mm ,
const char * what )
{
struct elf32_fdpic_loadmap * loadmap ;
# ifdef CONFIG_MMU
struct elf32_fdpic_loadseg * mseg ;
# endif
struct elf32_fdpic_loadseg * seg ;
struct elf32_phdr * phdr ;
unsigned long load_addr , stop ;
unsigned nloads , tmp ;
size_t size ;
int loop , ret ;
/* allocate a load map table */
nloads = 0 ;
for ( loop = 0 ; loop < params - > hdr . e_phnum ; loop + + )
if ( params - > phdrs [ loop ] . p_type = = PT_LOAD )
nloads + + ;
if ( nloads = = 0 )
return - ELIBBAD ;
size = sizeof ( * loadmap ) + nloads * sizeof ( * seg ) ;
2006-12-12 20:07:35 +01:00
loadmap = kzalloc ( size , GFP_KERNEL ) ;
2005-04-16 15:20:36 -07:00
if ( ! loadmap )
return - ENOMEM ;
params - > loadmap = loadmap ;
loadmap - > version = ELF32_FDPIC_LOADMAP_VERSION ;
loadmap - > nsegs = nloads ;
load_addr = params - > load_addr ;
seg = loadmap - > segs ;
/* map the requested LOADs into the memory space */
switch ( params - > flags & ELF_FDPIC_FLAG_ARRANGEMENT ) {
case ELF_FDPIC_FLAG_CONSTDISP :
case ELF_FDPIC_FLAG_CONTIGUOUS :
# ifndef CONFIG_MMU
ret = elf_fdpic_map_file_constdisp_on_uclinux ( params , file , mm ) ;
if ( ret < 0 )
return ret ;
break ;
# endif
default :
ret = elf_fdpic_map_file_by_direct_mmap ( params , file , mm ) ;
if ( ret < 0 )
return ret ;
break ;
}
/* map the entry point */
if ( params - > hdr . e_entry ) {
seg = loadmap - > segs ;
for ( loop = loadmap - > nsegs ; loop > 0 ; loop - - , seg + + ) {
if ( params - > hdr . e_entry > = seg - > p_vaddr & &
2006-07-10 04:44:53 -07:00
params - > hdr . e_entry < seg - > p_vaddr + seg - > p_memsz ) {
2005-04-16 15:20:36 -07:00
params - > entry_addr =
2006-07-10 04:44:53 -07:00
( params - > hdr . e_entry - seg - > p_vaddr ) +
seg - > addr ;
2005-04-16 15:20:36 -07:00
break ;
}
}
}
/* determine where the program header table has wound up if mapped */
2006-07-10 04:44:53 -07:00
stop = params - > hdr . e_phoff ;
stop + = params - > hdr . e_phnum * sizeof ( struct elf_phdr ) ;
2005-04-16 15:20:36 -07:00
phdr = params - > phdrs ;
for ( loop = 0 ; loop < params - > hdr . e_phnum ; loop + + , phdr + + ) {
if ( phdr - > p_type ! = PT_LOAD )
continue ;
if ( phdr - > p_offset > params - > hdr . e_phoff | |
phdr - > p_offset + phdr - > p_filesz < stop )
continue ;
seg = loadmap - > segs ;
for ( loop = loadmap - > nsegs ; loop > 0 ; loop - - , seg + + ) {
if ( phdr - > p_vaddr > = seg - > p_vaddr & &
2006-07-10 04:44:53 -07:00
phdr - > p_vaddr + phdr - > p_filesz < =
seg - > p_vaddr + seg - > p_memsz ) {
params - > ph_addr =
( phdr - > p_vaddr - seg - > p_vaddr ) +
seg - > addr +
2005-04-16 15:20:36 -07:00
params - > hdr . e_phoff - phdr - > p_offset ;
break ;
}
}
break ;
}
/* determine where the dynamic section has wound up if there is one */
phdr = params - > phdrs ;
for ( loop = 0 ; loop < params - > hdr . e_phnum ; loop + + , phdr + + ) {
if ( phdr - > p_type ! = PT_DYNAMIC )
continue ;
seg = loadmap - > segs ;
for ( loop = loadmap - > nsegs ; loop > 0 ; loop - - , seg + + ) {
if ( phdr - > p_vaddr > = seg - > p_vaddr & &
2006-07-10 04:44:53 -07:00
phdr - > p_vaddr + phdr - > p_memsz < =
seg - > p_vaddr + seg - > p_memsz ) {
params - > dynamic_addr =
( phdr - > p_vaddr - seg - > p_vaddr ) +
seg - > addr ;
/* check the dynamic section contains at least
* one item , and that the last item is a NULL
* entry */
2005-04-16 15:20:36 -07:00
if ( phdr - > p_memsz = = 0 | |
phdr - > p_memsz % sizeof ( Elf32_Dyn ) ! = 0 )
goto dynamic_error ;
tmp = phdr - > p_memsz / sizeof ( Elf32_Dyn ) ;
2006-07-10 04:44:53 -07:00
if ( ( ( Elf32_Dyn * )
params - > dynamic_addr ) [ tmp - 1 ] . d_tag ! = 0 )
2005-04-16 15:20:36 -07:00
goto dynamic_error ;
break ;
}
}
break ;
}
/* now elide adjacent segments in the load map on MMU linux
2006-07-10 04:44:53 -07:00
* - on uClinux the holes between may actually be filled with system
* stuff or stuff from other processes
2005-04-16 15:20:36 -07:00
*/
# ifdef CONFIG_MMU
nloads = loadmap - > nsegs ;
mseg = loadmap - > segs ;
seg = mseg + 1 ;
for ( loop = 1 ; loop < nloads ; loop + + ) {
/* see if we have a candidate for merging */
if ( seg - > p_vaddr - mseg - > p_vaddr = = seg - > addr - mseg - > addr ) {
load_addr = PAGE_ALIGN ( mseg - > addr + mseg - > p_memsz ) ;
if ( load_addr = = ( seg - > addr & PAGE_MASK ) ) {
2006-07-10 04:44:53 -07:00
mseg - > p_memsz + =
load_addr -
( mseg - > addr + mseg - > p_memsz ) ;
2005-04-16 15:20:36 -07:00
mseg - > p_memsz + = seg - > addr & ~ PAGE_MASK ;
mseg - > p_memsz + = seg - > p_memsz ;
loadmap - > nsegs - - ;
continue ;
}
}
mseg + + ;
if ( mseg ! = seg )
* mseg = * seg ;
}
# endif
kdebug ( " Mapped Object [%s]: " , what ) ;
kdebug ( " - elfhdr : %lx " , params - > elfhdr_addr ) ;
kdebug ( " - entry : %lx " , params - > entry_addr ) ;
kdebug ( " - PHDR[] : %lx " , params - > ph_addr ) ;
kdebug ( " - DYNAMIC[]: %lx " , params - > dynamic_addr ) ;
seg = loadmap - > segs ;
for ( loop = 0 ; loop < loadmap - > nsegs ; loop + + , seg + + )
kdebug ( " - LOAD[%d] : %08x-%08x [va=%x ms=%x] " ,
loop ,
seg - > addr , seg - > addr + seg - > p_memsz - 1 ,
seg - > p_vaddr , seg - > p_memsz ) ;
return 0 ;
2006-07-10 04:44:53 -07:00
dynamic_error :
2005-04-16 15:20:36 -07:00
printk ( " ELF FDPIC %s with invalid DYNAMIC section (inode=%lu) \n " ,
2013-01-23 17:07:38 -05:00
what , file_inode ( file ) - > i_ino ) ;
2005-04-16 15:20:36 -07:00
return - ELIBBAD ;
2006-07-10 04:44:53 -07:00
}
2005-04-16 15:20:36 -07:00
/*****************************************************************************/
/*
* map a file with constant displacement under uClinux
*/
# ifndef CONFIG_MMU
2006-07-10 04:44:53 -07:00
static int elf_fdpic_map_file_constdisp_on_uclinux (
struct elf_fdpic_params * params ,
struct file * file ,
struct mm_struct * mm )
2005-04-16 15:20:36 -07:00
{
struct elf32_fdpic_loadseg * seg ;
struct elf32_phdr * phdr ;
unsigned long load_addr , base = ULONG_MAX , top = 0 , maddr = 0 , mflags ;
int loop , ret ;
load_addr = params - > load_addr ;
seg = params - > loadmap - > segs ;
2006-07-10 04:44:53 -07:00
/* determine the bounds of the contiguous overall allocation we must
* make */
2005-04-16 15:20:36 -07:00
phdr = params - > phdrs ;
for ( loop = 0 ; loop < params - > hdr . e_phnum ; loop + + , phdr + + ) {
if ( params - > phdrs [ loop ] . p_type ! = PT_LOAD )
continue ;
if ( base > phdr - > p_vaddr )
base = phdr - > p_vaddr ;
if ( top < phdr - > p_vaddr + phdr - > p_memsz )
top = phdr - > p_vaddr + phdr - > p_memsz ;
}
/* allocate one big anon block for everything */
mflags = MAP_PRIVATE ;
if ( params - > flags & ELF_FDPIC_FLAG_EXECUTABLE )
mflags | = MAP_EXECUTABLE ;
2012-04-20 17:13:58 -07:00
maddr = vm_mmap ( NULL , load_addr , top - base ,
2005-04-16 15:20:36 -07:00
PROT_READ | PROT_WRITE | PROT_EXEC , mflags , 0 ) ;
2006-07-10 04:44:53 -07:00
if ( IS_ERR_VALUE ( maddr ) )
2005-04-16 15:20:36 -07:00
return ( int ) maddr ;
if ( load_addr ! = 0 )
load_addr + = PAGE_ALIGN ( top - base ) ;
/* and then load the file segments into it */
phdr = params - > phdrs ;
for ( loop = 0 ; loop < params - > hdr . e_phnum ; loop + + , phdr + + ) {
if ( params - > phdrs [ loop ] . p_type ! = PT_LOAD )
continue ;
seg - > addr = maddr + ( phdr - > p_vaddr - base ) ;
seg - > p_vaddr = phdr - > p_vaddr ;
seg - > p_memsz = phdr - > p_memsz ;
2013-04-13 20:31:37 -04:00
ret = read_code ( file , seg - > addr , phdr - > p_offset ,
phdr - > p_filesz ) ;
2005-04-16 15:20:36 -07:00
if ( ret < 0 )
return ret ;
/* map the ELF header address if in this segment */
if ( phdr - > p_offset = = 0 )
params - > elfhdr_addr = seg - > addr ;
/* clear any space allocated but not loaded */
2009-04-02 16:58:28 -07:00
if ( phdr - > p_filesz < phdr - > p_memsz ) {
2010-06-01 14:10:47 +01:00
if ( clear_user ( ( void * ) ( seg - > addr + phdr - > p_filesz ) ,
phdr - > p_memsz - phdr - > p_filesz ) )
return - EFAULT ;
2009-04-02 16:58:28 -07:00
}
2005-04-16 15:20:36 -07:00
if ( mm ) {
if ( phdr - > p_flags & PF_X ) {
2007-03-23 00:10:00 -07:00
if ( ! mm - > start_code ) {
mm - > start_code = seg - > addr ;
mm - > end_code = seg - > addr +
phdr - > p_memsz ;
}
2006-07-10 04:44:53 -07:00
} else if ( ! mm - > start_data ) {
2005-04-16 15:20:36 -07:00
mm - > start_data = seg - > addr ;
mm - > end_data = seg - > addr + phdr - > p_memsz ;
}
}
seg + + ;
}
return 0 ;
2006-07-10 04:44:53 -07:00
}
2005-04-16 15:20:36 -07:00
# endif
/*****************************************************************************/
/*
* map a binary by direct mmap ( ) of the individual PT_LOAD segments
*/
static int elf_fdpic_map_file_by_direct_mmap ( struct elf_fdpic_params * params ,
struct file * file ,
struct mm_struct * mm )
{
struct elf32_fdpic_loadseg * seg ;
struct elf32_phdr * phdr ;
unsigned long load_addr , delta_vaddr ;
2010-06-01 14:10:47 +01:00
int loop , dvset ;
2005-04-16 15:20:36 -07:00
load_addr = params - > load_addr ;
delta_vaddr = 0 ;
dvset = 0 ;
seg = params - > loadmap - > segs ;
/* deal with each load segment separately */
phdr = params - > phdrs ;
for ( loop = 0 ; loop < params - > hdr . e_phnum ; loop + + , phdr + + ) {
unsigned long maddr , disp , excess , excess1 ;
int prot = 0 , flags ;
if ( phdr - > p_type ! = PT_LOAD )
continue ;
kdebug ( " [LOAD] va=%lx of=%lx fs=%lx ms=%lx " ,
( unsigned long ) phdr - > p_vaddr ,
( unsigned long ) phdr - > p_offset ,
( unsigned long ) phdr - > p_filesz ,
( unsigned long ) phdr - > p_memsz ) ;
/* determine the mapping parameters */
if ( phdr - > p_flags & PF_R ) prot | = PROT_READ ;
if ( phdr - > p_flags & PF_W ) prot | = PROT_WRITE ;
if ( phdr - > p_flags & PF_X ) prot | = PROT_EXEC ;
flags = MAP_PRIVATE | MAP_DENYWRITE ;
if ( params - > flags & ELF_FDPIC_FLAG_EXECUTABLE )
flags | = MAP_EXECUTABLE ;
maddr = 0 ;
switch ( params - > flags & ELF_FDPIC_FLAG_ARRANGEMENT ) {
case ELF_FDPIC_FLAG_INDEPENDENT :
/* PT_LOADs are independently locatable */
break ;
case ELF_FDPIC_FLAG_HONOURVADDR :
/* the specified virtual address must be honoured */
maddr = phdr - > p_vaddr ;
flags | = MAP_FIXED ;
break ;
case ELF_FDPIC_FLAG_CONSTDISP :
/* constant displacement
2006-07-10 04:44:53 -07:00
* - can be mapped anywhere , but must be mapped as a
* unit
2005-04-16 15:20:36 -07:00
*/
if ( ! dvset ) {
maddr = load_addr ;
delta_vaddr = phdr - > p_vaddr ;
dvset = 1 ;
2006-07-10 04:44:53 -07:00
} else {
2005-04-16 15:20:36 -07:00
maddr = load_addr + phdr - > p_vaddr - delta_vaddr ;
flags | = MAP_FIXED ;
}
break ;
case ELF_FDPIC_FLAG_CONTIGUOUS :
/* contiguity handled later */
break ;
default :
BUG ( ) ;
}
maddr & = PAGE_MASK ;
/* create the mapping */
disp = phdr - > p_vaddr & ~ PAGE_MASK ;
2012-04-20 17:13:58 -07:00
maddr = vm_mmap ( file , maddr , phdr - > p_memsz + disp , prot , flags ,
2005-04-16 15:20:36 -07:00
phdr - > p_offset - disp ) ;
kdebug ( " mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx " ,
2006-07-10 04:44:53 -07:00
loop , phdr - > p_memsz + disp , prot , flags ,
phdr - > p_offset - disp , maddr ) ;
2005-04-16 15:20:36 -07:00
2006-07-10 04:44:53 -07:00
if ( IS_ERR_VALUE ( maddr ) )
2005-04-16 15:20:36 -07:00
return ( int ) maddr ;
2006-07-10 04:44:53 -07:00
if ( ( params - > flags & ELF_FDPIC_FLAG_ARRANGEMENT ) = =
ELF_FDPIC_FLAG_CONTIGUOUS )
2005-04-16 15:20:36 -07:00
load_addr + = PAGE_ALIGN ( phdr - > p_memsz + disp ) ;
seg - > addr = maddr + disp ;
seg - > p_vaddr = phdr - > p_vaddr ;
seg - > p_memsz = phdr - > p_memsz ;
/* map the ELF header address if in this segment */
if ( phdr - > p_offset = = 0 )
params - > elfhdr_addr = seg - > addr ;
2006-07-10 04:44:53 -07:00
/* clear the bit between beginning of mapping and beginning of
* PT_LOAD */
2005-04-16 15:20:36 -07:00
if ( prot & PROT_WRITE & & disp > 0 ) {
kdebug ( " clear[%d] ad=%lx sz=%lx " , loop , maddr , disp ) ;
2010-06-01 14:10:47 +01:00
if ( clear_user ( ( void __user * ) maddr , disp ) )
return - EFAULT ;
2005-04-16 15:20:36 -07:00
maddr + = disp ;
}
/* clear any space allocated but not loaded
* - on uClinux we can just clear the lot
* - on MMU linux we ' ll get a SIGBUS beyond the last page
* extant in the file
*/
excess = phdr - > p_memsz - phdr - > p_filesz ;
excess1 = PAGE_SIZE - ( ( maddr + phdr - > p_filesz ) & ~ PAGE_MASK ) ;
# ifdef CONFIG_MMU
if ( excess > excess1 ) {
unsigned long xaddr = maddr + phdr - > p_filesz + excess1 ;
unsigned long xmaddr ;
flags | = MAP_FIXED | MAP_ANONYMOUS ;
2012-04-20 17:13:58 -07:00
xmaddr = vm_mmap ( NULL , xaddr , excess - excess1 ,
2006-07-10 04:44:53 -07:00
prot , flags , 0 ) ;
2005-04-16 15:20:36 -07:00
kdebug ( " mmap[%d] <anon> "
" ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx " ,
2006-07-10 04:44:53 -07:00
loop , xaddr , excess - excess1 , prot , flags ,
xmaddr ) ;
2005-04-16 15:20:36 -07:00
if ( xmaddr ! = xaddr )
return - ENOMEM ;
}
if ( prot & PROT_WRITE & & excess1 > 0 ) {
kdebug ( " clear[%d] ad=%lx sz=%lx " ,
loop , maddr + phdr - > p_filesz , excess1 ) ;
2010-06-01 14:10:47 +01:00
if ( clear_user ( ( void __user * ) maddr + phdr - > p_filesz ,
excess1 ) )
return - EFAULT ;
2005-04-16 15:20:36 -07:00
}
# else
if ( excess > 0 ) {
kdebug ( " clear[%d] ad=%lx sz=%lx " ,
loop , maddr + phdr - > p_filesz , excess ) ;
2010-06-01 14:10:47 +01:00
if ( clear_user ( ( void * ) maddr + phdr - > p_filesz , excess ) )
return - EFAULT ;
2005-04-16 15:20:36 -07:00
}
# endif
if ( mm ) {
if ( phdr - > p_flags & PF_X ) {
2007-03-23 00:10:00 -07:00
if ( ! mm - > start_code ) {
mm - > start_code = maddr ;
mm - > end_code = maddr + phdr - > p_memsz ;
}
2006-07-10 04:44:53 -07:00
} else if ( ! mm - > start_data ) {
2005-04-16 15:20:36 -07:00
mm - > start_data = maddr ;
mm - > end_data = maddr + phdr - > p_memsz ;
}
}
seg + + ;
}
return 0 ;
2006-07-10 04:44:53 -07:00
}
2006-07-10 04:44:55 -07:00
/*****************************************************************************/
/*
* ELF - FDPIC core dumper
*
* Modelled on fs / exec . c : aout_core_dump ( )
* Jeremy Fitzhardinge < jeremy @ sw . oz . au >
*
* Modelled on fs / binfmt_elf . c core dumper
*/
2009-12-15 16:47:37 -08:00
# ifdef CONFIG_ELF_CORE
2006-07-10 04:44:55 -07:00
/*
* Decide whether a segment is worth dumping ; default is yes to be
* sure ( missing info is worse than too much ; etc ) .
* Personally I ' d include everything , and use the coredump limit . . .
*
* I think we should skip something . But I am not sure how . H . J .
*/
2007-07-19 01:48:30 -07:00
static int maydump ( struct vm_area_struct * vma , unsigned long mm_flags )
2006-07-10 04:44:55 -07:00
{
2007-07-19 01:48:30 -07:00
int dump_ok ;
2006-07-10 04:44:55 -07:00
/* Do not dump I/O mapped devices or special mappings */
mm: kill vma flag VM_RESERVED and mm->reserved_vm counter
A long time ago, in v2.4, VM_RESERVED kept swapout process off VMA,
currently it lost original meaning but still has some effects:
| effect | alternative flags
-+------------------------+---------------------------------------------
1| account as reserved_vm | VM_IO
2| skip in core dump | VM_IO, VM_DONTDUMP
3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP
4| do not mlock | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP
This patch removes reserved_vm counter from mm_struct. Seems like nobody
cares about it, it does not exported into userspace directly, it only
reduces total_vm showed in proc.
Thus VM_RESERVED can be replaced with VM_IO or pair VM_DONTEXPAND | VM_DONTDUMP.
remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP.
remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP.
[akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-10-08 16:29:02 -07:00
if ( vma - > vm_flags & VM_IO ) {
2006-07-10 04:44:55 -07:00
kdcore ( " %08lx: %08lx: no (IO) " , vma - > vm_start , vma - > vm_flags ) ;
return 0 ;
}
/* If we may not read the contents, don't allow us to dump
* them either . " dump_write() " can ' t handle it anyway .
*/
if ( ! ( vma - > vm_flags & VM_READ ) ) {
kdcore ( " %08lx: %08lx: no (!read) " , vma - > vm_start , vma - > vm_flags ) ;
return 0 ;
}
2007-07-19 01:48:30 -07:00
/* By default, dump shared memory if mapped from an anonymous file. */
2006-07-10 04:44:55 -07:00
if ( vma - > vm_flags & VM_SHARED ) {
2013-01-23 17:07:38 -05:00
if ( file_inode ( vma - > vm_file ) - > i_nlink = = 0 ) {
2007-07-19 01:48:30 -07:00
dump_ok = test_bit ( MMF_DUMP_ANON_SHARED , & mm_flags ) ;
kdcore ( " %08lx: %08lx: %s (share) " , vma - > vm_start ,
vma - > vm_flags , dump_ok ? " yes " : " no " ) ;
return dump_ok ;
2006-07-10 04:44:55 -07:00
}
2007-07-19 01:48:30 -07:00
dump_ok = test_bit ( MMF_DUMP_MAPPED_SHARED , & mm_flags ) ;
kdcore ( " %08lx: %08lx: %s (share) " , vma - > vm_start ,
vma - > vm_flags , dump_ok ? " yes " : " no " ) ;
return dump_ok ;
2006-07-10 04:44:55 -07:00
}
# ifdef CONFIG_MMU
2007-07-19 01:48:30 -07:00
/* By default, if it hasn't been written to, don't write it out */
2006-07-10 04:44:55 -07:00
if ( ! vma - > anon_vma ) {
2007-07-19 01:48:30 -07:00
dump_ok = test_bit ( MMF_DUMP_MAPPED_PRIVATE , & mm_flags ) ;
kdcore ( " %08lx: %08lx: %s (!anon) " , vma - > vm_start ,
vma - > vm_flags , dump_ok ? " yes " : " no " ) ;
return dump_ok ;
2006-07-10 04:44:55 -07:00
}
# endif
2007-07-19 01:48:30 -07:00
dump_ok = test_bit ( MMF_DUMP_ANON_PRIVATE , & mm_flags ) ;
kdcore ( " %08lx: %08lx: %s " , vma - > vm_start , vma - > vm_flags ,
dump_ok ? " yes " : " no " ) ;
return dump_ok ;
2006-07-10 04:44:55 -07:00
}
/* An ELF note in memory */
/*
 * In-memory description of one ELF note before it is written out.
 * notesize() and writenote() derive the on-disk name size from
 * strlen(name) + 1, and the descriptor size from datasz.
 */
struct memelfnote
{
const char * name ; /* NUL-terminated note name; written including the NUL */
int type ; /* copied into the note header's n_type */
unsigned int datasz ; /* number of bytes at data; becomes n_descsz */
void * data ; /* note payload, datasz bytes long */
} ;
static int notesize ( struct memelfnote * en )
{
int sz ;
sz = sizeof ( struct elf_note ) ;
sz + = roundup ( strlen ( en - > name ) + 1 , 4 ) ;
sz + = roundup ( en - > datasz , 4 ) ;
return sz ;
}
/* #define DEBUG */
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In new dumping process, there are two cases according to whether or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 have already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
int main(int argc, char **argv)
{
int maps_num;
if (argc < 2) {
fprintf(stderr, "mkmmaps [number of maps to be created]\n");
exit(1);
}
if (sscanf(argv[1], "%d", &maps_num) == EOF) {
perror("sscanf");
exit(2);
}
if (maps_num < 0) {
fprintf(stderr, "%d is invalid\n", maps_num);
exit(3);
}
for (; maps_num > 0; --maps_num) {
if (MAP_FAILED == mmap((void *)NULL, (size_t) 1, PROT_READ,
MAP_SHARED | MAP_ANONYMOUS, (int) -1,
(off_t) NULL)) {
perror("mmap");
exit(4);
}
}
abort();
{
char buffer[128];
sprintf(buffer, "wc -l /proc/%u/maps", getpid());
system(buffer);
}
return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. The
only for binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unifies these three definitions for dump_seek() by applying
the past commits that have been applied only for binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
/*
 * Write @nr bytes starting at @addr to the core file and, on success,
 * advance the running file offset *@foffset by @nr.
 *
 * This macro expands in the caller's scope:
 *  - it uses a variable named `file`, which must be visible at the call site;
 *  - if dump_write() reports failure it makes the *calling function*
 *    return 0, so it may only be used in functions where that is the
 *    failure return value;
 *  - @nr is evaluated twice, so it must not have side effects.
 *
 * @foffset is parenthesised so that pointer expressions (e.g. `p + 1`)
 * are dereferenced as a whole.
 */
#define DUMP_WRITE(addr, nr, foffset)					\
	do {								\
		if (!dump_write(file, (addr), (nr)))			\
			return 0;					\
		*(foffset) += (nr);					\
	} while (0)
2006-07-10 04:44:55 -07:00
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In new dumping process, there are two cases according to whether or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 has already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
int main(int argc, char **argv)
{
int maps_num;
if (argc < 2) {
fprintf(stderr, "mkmmaps [number of maps to be created]\n");
exit(1);
}
if (sscanf(argv[1], "%d", &maps_num) == EOF) {
perror("sscanf");
exit(2);
}
if (maps_num < 0) {
fprintf(stderr, "%d is invalid\n", maps_num);
exit(3);
}
for (; maps_num > 0; --maps_num) {
if (MAP_FAILED == mmap((void *)NULL, (size_t) 1, PROT_READ,
MAP_SHARED | MAP_ANONYMOUS, (int) -1,
(off_t) NULL)) {
perror("mmap");
exit(4);
}
}
abort();
{
char buffer[128];
sprintf(buffer, "wc -l /proc/%u/maps", getpid());
system(buffer);
}
return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. The
only for binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unify these three definitions for dump_seek() by applying
the past commits that have been applied only for binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
/*
 * Pad the core file with zero bytes so that the running offset *@foffset
 * becomes a multiple of 4.  Returns 1 on success; DUMP_WRITE() makes this
 * function return 0 if the write fails.
 */
static int alignfile(struct file *file, loff_t *foffset)
{
	static const char zeroes[4] = { 0, };
	/* distance from the current offset up to the next 4-byte boundary */
	loff_t pad = roundup(*foffset, 4) - *foffset;

	DUMP_WRITE(zeroes, pad, foffset);
	return 1;
}
2006-07-10 04:44:55 -07:00
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In new dumping process, there are two cases according to whether or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 has already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
int main(int argc, char **argv)
{
int maps_num;
if (argc < 2) {
fprintf(stderr, "mkmmaps [number of maps to be created]\n");
exit(1);
}
if (sscanf(argv[1], "%d", &maps_num) == EOF) {
perror("sscanf");
exit(2);
}
if (maps_num < 0) {
fprintf(stderr, "%d is invalid\n", maps_num);
exit(3);
}
for (; maps_num > 0; --maps_num) {
if (MAP_FAILED == mmap((void *)NULL, (size_t) 1, PROT_READ,
MAP_SHARED | MAP_ANONYMOUS, (int) -1,
(off_t) NULL)) {
perror("mmap");
exit(4);
}
}
abort();
{
char buffer[128];
sprintf(buffer, "wc -l /proc/%u/maps", getpid());
system(buffer);
}
return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. The
only for binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unify these three definitions for dump_seek() by applying
the past commits that have been applied only for binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
static int writenote ( struct memelfnote * men , struct file * file ,
loff_t * foffset )
{
struct elf_note en ;
2006-07-10 04:44:55 -07:00
en . n_namesz = strlen ( men - > name ) + 1 ;
en . n_descsz = men - > datasz ;
en . n_type = men - > type ;
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In new dumping process, there are two cases according to whether or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 has already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
int main(int argc, char **argv)
{
int maps_num;
if (argc < 2) {
fprintf(stderr, "mkmmaps [number of maps to be created]\n");
exit(1);
}
if (sscanf(argv[1], "%d", &maps_num) == EOF) {
perror("sscanf");
exit(2);
}
if (maps_num < 0) {
fprintf(stderr, "%d is invalid\n", maps_num);
exit(3);
}
for (; maps_num > 0; --maps_num) {
if (MAP_FAILED == mmap((void *)NULL, (size_t) 1, PROT_READ,
MAP_SHARED | MAP_ANONYMOUS, (int) -1,
(off_t) NULL)) {
perror("mmap");
exit(4);
}
}
abort();
{
char buffer[128];
sprintf(buffer, "wc -l /proc/%u/maps", getpid());
system(buffer);
}
return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. Past fixes
to dump_seek() have been applied only for binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unifies these three definitions for dump_seek() by applying
the past commits that have been applied only for binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
DUMP_WRITE ( & en , sizeof ( en ) , foffset ) ;
DUMP_WRITE ( men - > name , en . n_namesz , foffset ) ;
if ( ! alignfile ( file , foffset ) )
return 0 ;
DUMP_WRITE ( men - > data , men - > datasz , foffset ) ;
if ( ! alignfile ( file , foffset ) )
return 0 ;
2006-07-10 04:44:55 -07:00
return 1 ;
}
# undef DUMP_WRITE
static inline void fill_elf_fdpic_header ( struct elfhdr * elf , int segs )
{
memcpy ( elf - > e_ident , ELFMAG , SELFMAG ) ;
elf - > e_ident [ EI_CLASS ] = ELF_CLASS ;
elf - > e_ident [ EI_DATA ] = ELF_DATA ;
elf - > e_ident [ EI_VERSION ] = EV_CURRENT ;
elf - > e_ident [ EI_OSABI ] = ELF_OSABI ;
memset ( elf - > e_ident + EI_PAD , 0 , EI_NIDENT - EI_PAD ) ;
elf - > e_type = ET_CORE ;
elf - > e_machine = ELF_ARCH ;
elf - > e_version = EV_CURRENT ;
elf - > e_entry = 0 ;
elf - > e_phoff = sizeof ( struct elfhdr ) ;
elf - > e_shoff = 0 ;
elf - > e_flags = ELF_FDPIC_CORE_EFLAGS ;
elf - > e_ehsize = sizeof ( struct elfhdr ) ;
elf - > e_phentsize = sizeof ( struct elf_phdr ) ;
elf - > e_phnum = segs ;
elf - > e_shentsize = 0 ;
elf - > e_shnum = 0 ;
elf - > e_shstrndx = 0 ;
return ;
}
/*
 * Describe the notes segment in @phdr: a PT_NOTE program header covering
 * @sz bytes of file data that start at file offset @offset.  The address,
 * memory size, flag and alignment fields are all set to zero.
 */
static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
{
	phdr->p_type = PT_NOTE;

	/* location and length of the notes within the core file */
	phdr->p_offset = offset;
	phdr->p_filesz = sz;

	/* everything else is cleared */
	phdr->p_vaddr = 0;
	phdr->p_paddr = 0;
	phdr->p_memsz = 0;
	phdr->p_flags = 0;
	phdr->p_align = 0;
}
static inline void fill_note ( struct memelfnote * note , const char * name , int type ,
unsigned int sz , void * data )
{
note - > name = name ;
note - > type = type ;
note - > datasz = sz ;
note - > data = data ;
return ;
}
/*
* fill up all the fields in prstatus from the given task struct , except
tree-wide: Assorted spelling fixes
In particular, several occurrences of funny versions of 'success',
'unknown', 'therefore', 'acknowledge', 'argument', 'achieve', 'address',
'beginning', 'desirable', 'separate' and 'necessary' are fixed.
Signed-off-by: Daniel Mack <daniel@caiaq.de>
Cc: Joe Perches <joe@perches.com>
Cc: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2010-02-03 08:01:28 +08:00
* registers which need to be filled up separately .
2006-07-10 04:44:55 -07:00
*/
static void fill_prstatus ( struct elf_prstatus * prstatus ,
struct task_struct * p , long signr )
{
prstatus - > pr_info . si_signo = prstatus - > pr_cursig = signr ;
prstatus - > pr_sigpend = p - > pending . signal . sig [ 0 ] ;
prstatus - > pr_sighold = p - > blocked . sig [ 0 ] ;
2009-06-17 16:27:38 -07:00
rcu_read_lock ( ) ;
prstatus - > pr_ppid = task_pid_vnr ( rcu_dereference ( p - > real_parent ) ) ;
rcu_read_unlock ( ) ;
2007-10-18 23:40:14 -07:00
prstatus - > pr_pid = task_pid_vnr ( p ) ;
prstatus - > pr_pgrp = task_pgrp_vnr ( p ) ;
prstatus - > pr_sid = task_session_vnr ( p ) ;
2006-07-10 04:44:55 -07:00
if ( thread_group_leader ( p ) ) {
2008-10-21 12:07:40 +09:00
struct task_cputime cputime ;
2006-07-10 04:44:55 -07:00
/*
2008-10-21 12:07:40 +09:00
* This is the record for the group leader . It shows the
* group - wide total , not its individual thread total .
2006-07-10 04:44:55 -07:00
*/
2008-10-21 12:07:40 +09:00
thread_group_cputime ( p , & cputime ) ;
cputime_to_timeval ( cputime . utime , & prstatus - > pr_utime ) ;
cputime_to_timeval ( cputime . stime , & prstatus - > pr_stime ) ;
2006-07-10 04:44:55 -07:00
} else {
2012-11-13 14:20:55 +01:00
cputime_t utime , stime ;
task_cputime ( p , & utime , & stime ) ;
cputime_to_timeval ( utime , & prstatus - > pr_utime ) ;
cputime_to_timeval ( stime , & prstatus - > pr_stime ) ;
2006-07-10 04:44:55 -07:00
}
cputime_to_timeval ( p - > signal - > cutime , & prstatus - > pr_cutime ) ;
cputime_to_timeval ( p - > signal - > cstime , & prstatus - > pr_cstime ) ;
prstatus - > pr_exec_fdpic_loadmap = p - > mm - > context . exec_fdpic_loadmap ;
prstatus - > pr_interp_fdpic_loadmap = p - > mm - > context . interp_fdpic_loadmap ;
}
/*
 * Fill in the process-info record psinfo for task p, taking the command
 * line from the argument area of mm.
 *
 * Returns 0 on success, or -EFAULT when the argument area cannot be copied
 * from user space (psinfo has already been zeroed at that point).
 */
static int fill_psinfo ( struct elf_prpsinfo * psinfo , struct task_struct * p ,
struct mm_struct * mm )
{
2008-11-14 10:39:19 +11:00
const struct cred * cred ;
2006-07-10 04:44:55 -07:00
unsigned int i , len ;
/* first copy the parameters from user space */
memset ( psinfo , 0 , sizeof ( struct elf_prpsinfo ) ) ;
/* clamp the copy so that one byte is left for the terminator */
len = mm - > arg_end - mm - > arg_start ;
if ( len > = ELF_PRARGSZ )
len = ELF_PRARGSZ - 1 ;
if ( copy_from_user ( & psinfo - > pr_psargs ,
( const char __user * ) mm - > arg_start , len ) )
return - EFAULT ;
/* zero bytes inside the copied area are overwritten, then the result is terminated */
for ( i = 0 ; i < len ; i + + )
if ( psinfo - > pr_psargs [ i ] = = 0 )
psinfo - > pr_psargs [ i ] = ' ' ;
psinfo - > pr_psargs [ len ] = 0 ;
2009-06-17 16:27:38 -07:00
/* real_parent is dereferenced inside an RCU read-side section */
rcu_read_lock ( ) ;
psinfo - > pr_ppid = task_pid_vnr ( rcu_dereference ( p - > real_parent ) ) ;
rcu_read_unlock ( ) ;
2007-10-18 23:40:14 -07:00
psinfo - > pr_pid = task_pid_vnr ( p ) ;
psinfo - > pr_pgrp = task_pgrp_vnr ( p ) ;
psinfo - > pr_sid = task_session_vnr ( p ) ;
2006-07-10 04:44:55 -07:00
/*
 * Map the task state to a small index: 0 when state is zero, otherwise
 * ffz(~state) + 1 (presumably one more than the lowest set state bit;
 * confirm against ffz()).  Indices above 5 get the fallback character
 * instead of an entry from the lookup string.
 */
i = p - > state ? ffz ( ~ p - > state ) + 1 : 0 ;
psinfo - > pr_state = i ;
psinfo - > pr_sname = ( i > 5 ) ? ' . ' : " RSDTZW " [ i ] ;
psinfo - > pr_zomb = psinfo - > pr_sname = = ' Z ' ;
psinfo - > pr_nice = task_nice ( p ) ;
psinfo - > pr_flag = p - > flags ;
2008-11-14 10:39:19 +11:00
/* the task's credentials are only read inside the RCU read-side section */
rcu_read_lock ( ) ;
cred = __task_cred ( p ) ;
2012-02-07 18:36:10 -08:00
SET_UID ( psinfo - > pr_uid , from_kuid_munged ( cred - > user_ns , cred - > uid ) ) ;
SET_GID ( psinfo - > pr_gid , from_kgid_munged ( cred - > user_ns , cred - > gid ) ) ;
2008-11-14 10:39:19 +11:00
rcu_read_unlock ( ) ;
2006-07-10 04:44:55 -07:00
/* NOTE(review): relies on the memset above for termination if comm fills pr_fname - confirm sizes */
strncpy ( psinfo - > pr_fname , p - > comm , sizeof ( psinfo - > pr_fname ) ) ;
return 0 ;
}
/* Here is the structure in which status of each thread is captured. */
struct elf_thread_status
{
/* links this entry into the list of threads built by the core dumper */
struct list_head list ;
struct elf_prstatus prstatus ; /* NT_PRSTATUS */
elf_fpregset_t fpu ; /* NT_PRFPREG */
/* the task whose state this entry captures */
struct task_struct * thread ;
# ifdef ELF_CORE_COPY_XFPREGS
2007-10-16 23:25:39 -07:00
elf_fpxregset_t xfpu ; /* ELF_CORE_XFPREG_TYPE */
2006-07-10 04:44:55 -07:00
# endif
/* note descriptors pointing at the payloads above, filled by elf_dump_thread_status() */
struct memelfnote notes [ 3 ] ;
/* number of notes that were filled in */
int num_notes ;
} ;
/*
* In order to add the specific thread information for the elf file format ,
* we need to keep a linked list of every thread ' s pr_status and then create
* a single section for them in the final core file .
*/
/*
 * Capture the status of the thread recorded in t->thread for signal signr
 * and build the note descriptors in t->notes[].
 *
 * Returns the sum of notesize() over every note that was filled in.
 */
static int elf_dump_thread_status ( long signr , struct elf_thread_status * t )
{
struct task_struct * p = t - > thread ;
int sz = 0 ;
t - > num_notes = 0 ;
/* slot 0: process status including the general registers, always present */
fill_prstatus ( & t - > prstatus , p , signr ) ;
elf_core_copy_task_regs ( p , & t - > prstatus . pr_reg ) ;
fill_note ( & t - > notes [ 0 ] , " CORE " , NT_PRSTATUS , sizeof ( t - > prstatus ) ,
& t - > prstatus ) ;
t - > num_notes + + ;
sz + = notesize ( & t - > notes [ 0 ] ) ;
/* slot 1: floating-point registers, only when the copy helper reports them valid */
t - > prstatus . pr_fpvalid = elf_core_copy_task_fpregs ( p , NULL , & t - > fpu ) ;
if ( t - > prstatus . pr_fpvalid ) {
fill_note ( & t - > notes [ 1 ] , " CORE " , NT_PRFPREG , sizeof ( t - > fpu ) ,
& t - > fpu ) ;
t - > num_notes + + ;
sz + = notesize ( & t - > notes [ 1 ] ) ;
}
# ifdef ELF_CORE_COPY_XFPREGS
/*
 * slot 2: extended floating-point registers.
 * NOTE(review): the slot index is fixed while num_notes is a running
 * count; if slot 1 was skipped, num_notes ends up as 2 with notes[1]
 * unfilled and notes[2] filled - confirm how consumers iterate notes[].
 */
if ( elf_core_copy_task_xfpregs ( p , & t - > xfpu ) ) {
2007-10-16 23:25:39 -07:00
fill_note ( & t - > notes [ 2 ] , " LINUX " , ELF_CORE_XFPREG_TYPE ,
sizeof ( t - > xfpu ) , & t - > xfpu ) ;
2006-07-10 04:44:55 -07:00
t - > num_notes + + ;
sz + = notesize ( & t - > notes [ 2 ] ) ;
}
# endif
return sz ;
}
2010-03-05 13:44:10 -08:00
/*
 * Set up ELF extended numbering.
 *
 * The ELF header @elf is pointed at a single section header located at file
 * offset @e_shoff, and that section header (@shdr4extnum) carries the real
 * program header count @segs in its sh_info field.
 */
static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
			     elf_addr_t e_shoff, int segs)
{
	/* start from an all-zero section header */
	memset(shdr4extnum, 0, sizeof(struct elf_shdr));

	/* the ELF header advertises exactly one section header */
	elf->e_shnum = 1;
	elf->e_shstrndx = SHN_UNDEF;
	elf->e_shentsize = sizeof(struct elf_shdr);
	elf->e_shoff = e_shoff;

	/* section 0 mirrors the header's values and holds the real count */
	shdr4extnum->sh_type = SHT_NULL;
	shdr4extnum->sh_info = segs;
	shdr4extnum->sh_link = elf->e_shstrndx;
	shdr4extnum->sh_size = elf->e_shnum;
}
2006-07-10 04:44:55 -07:00
/*
* dump the segments for an MMU process
*/
# ifdef CONFIG_MMU
2007-07-19 01:48:30 -07:00
/*
 * Write the contents of every dumpable VMA of the current process to @file,
 * one page at a time.
 *
 * @size:     running count of bytes charged to the core file; advanced by
 *            PAGE_SIZE for each page that has data
 * @limit:    upper bound for *@size
 * @mm_flags: passed through to maydump() to select which VMAs are dumped
 *
 * Pages for which get_dump_page() returns nothing are skipped in the file
 * with dump_seek() and are not charged to *@size.
 *
 * Returns 0 on success, -EFBIG when the limit would be exceeded or a seek
 * fails, and -EIO when a page cannot be written.
 */
static int elf_fdpic_dump_segments(struct file *file, size_t *size,
				   unsigned long *limit, unsigned long mm_flags)
{
	struct vm_area_struct *vma;

	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
		unsigned long addr;

		if (!maydump(vma, mm_flags))
			continue;

		addr = vma->vm_start;
		while (addr < vma->vm_end) {
			struct page *page = get_dump_page(addr);
			int err = 0;

			if (!page) {
				/* nothing to dump here: leave a hole */
				if (!dump_seek(file, PAGE_SIZE))
					err = -EFBIG;
			} else {
				void *kaddr = kmap(page);

				/* charge the page before deciding to write it */
				*size += PAGE_SIZE;
				if (*size > *limit)
					err = -EFBIG;
				else if (!dump_write(file, kaddr, PAGE_SIZE))
					err = -EIO;

				/* the page is always unmapped and released */
				kunmap(page);
				page_cache_release(page);
			}

			if (err)
				return err;

			addr += PAGE_SIZE;
		}
	}

	return 0;
}
# endif
/*
* dump the segments for a NOMMU process
*/
# ifndef CONFIG_MMU
2007-07-19 01:48:30 -07:00
/*
 * Write the contents of every dumpable VMA of the current process to @file,
 * one whole VMA per write.
 *
 * @size:     running count of bytes charged to the core file
 * @limit:    upper bound for *@size
 * @mm_flags: passed through to maydump() to select which VMAs are dumped
 *
 * Each VMA is charged with its full length (vm_end - vm_start), which is
 * the number of bytes handed to dump_write().  Charging only PAGE_SIZE per
 * VMA would let the dump grow past the limit for any VMA larger than a page.
 *
 * Returns 0 on success, -EFBIG when the limit would be exceeded and -EIO
 * when a VMA cannot be written.
 */
static int elf_fdpic_dump_segments(struct file *file, size_t *size,
				   unsigned long *limit, unsigned long mm_flags)
{
	struct vm_area_struct *vma;

	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
		unsigned long len;

		if (!maydump(vma, mm_flags))
			continue;

		/* account for exactly what is about to be written */
		len = vma->vm_end - vma->vm_start;
		*size += len;
		if (*size > *limit)
			return -EFBIG;

		if (!dump_write(file, (void *) vma->vm_start, len))
			return -EIO;
	}

	return 0;
}
# endif
2010-03-05 13:44:10 -08:00
static size_t elf_core_vma_data_size ( unsigned long mm_flags )
{
struct vm_area_struct * vma ;
size_t size = 0 ;
2010-03-24 17:02:28 +00:00
for ( vma = current - > mm - > mmap ; vma ; vma = vma - > vm_next )
2010-03-05 13:44:10 -08:00
if ( maydump ( vma , mm_flags ) )
size + = vma - > vm_end - vma - > vm_start ;
return size ;
}
2006-07-10 04:44:55 -07:00
/*
* Actual dumper
*
* This is a two - pass process ; first we find the offsets of the bits ,
* and then they are actually written out . If we run out of core limit
* we just truncate .
*/
2009-12-17 15:27:16 -08:00
static int elf_fdpic_core_dump ( struct coredump_params * cprm )
2006-07-10 04:44:55 -07:00
{
# define NUM_NOTES 6
int has_dumped = 0 ;
mm_segment_t fs ;
int segs ;
size_t size = 0 ;
int i ;
struct vm_area_struct * vma ;
struct elfhdr * elf = NULL ;
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In new dumping process, there are two cases according to whether or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 has already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
int main(int argc, char **argv)
{
int maps_num;
if (argc < 2) {
fprintf(stderr, "mkmmaps [number of maps to be created]\n");
exit(1);
}
if (sscanf(argv[1], "%d", &maps_num) == EOF) {
perror("sscanf");
exit(2);
}
if (maps_num < 0) {
fprintf(stderr, "%d is invalid\n", maps_num);
exit(3);
}
for (; maps_num > 0; --maps_num) {
if (MAP_FAILED == mmap((void *)NULL, (size_t) 1, PROT_READ,
MAP_SHARED | MAP_ANONYMOUS, (int) -1,
(off_t) NULL)) {
perror("mmap");
exit(4);
}
}
abort();
{
char buffer[128];
sprintf(buffer, "wc -l /proc/%u/maps", getpid());
system(buffer);
}
return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. Past fixes
to dump_seek() have been applied only for binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unifies these three definitions for dump_seek() by applying
the past commits that have been applied only for binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
loff_t offset = 0 , dataoff , foffset ;
2006-07-10 04:44:55 -07:00
int numnote ;
struct memelfnote * notes = NULL ;
struct elf_prstatus * prstatus = NULL ; /* NT_PRSTATUS */
struct elf_prpsinfo * psinfo = NULL ; /* NT_PRPSINFO */
LIST_HEAD ( thread_list ) ;
struct list_head * t ;
elf_fpregset_t * fpu = NULL ;
# ifdef ELF_CORE_COPY_XFPREGS
elf_fpxregset_t * xfpu = NULL ;
# endif
int thread_status_size = 0 ;
elf_addr_t * auxv ;
2010-03-05 13:44:09 -08:00
struct elf_phdr * phdr4note = NULL ;
2010-03-05 13:44:10 -08:00
struct elf_shdr * shdr4extnum = NULL ;
Elf_Half e_phnum ;
elf_addr_t e_shoff ;
2006-07-10 04:44:55 -07:00
/*
* We no longer stop all VM operations .
*
* This is because those processes that could possibly change map_count
* or the mmap / vma pages are now blocked in do_exit on current
* finishing this core dump .
*
* Only ptrace can touch these memory addresses , but it doesn ' t change
* the map_count or the pages allocated . So no possibility of crashing
* exists while dumping the mm - > vm_next areas to the core file .
*/
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc ( sizeof ( * elf ) , GFP_KERNEL ) ;
if ( ! elf )
goto cleanup ;
prstatus = kzalloc ( sizeof ( * prstatus ) , GFP_KERNEL ) ;
if ( ! prstatus )
goto cleanup ;
psinfo = kmalloc ( sizeof ( * psinfo ) , GFP_KERNEL ) ;
if ( ! psinfo )
goto cleanup ;
notes = kmalloc ( NUM_NOTES * sizeof ( struct memelfnote ) , GFP_KERNEL ) ;
if ( ! notes )
goto cleanup ;
fpu = kmalloc ( sizeof ( * fpu ) , GFP_KERNEL ) ;
if ( ! fpu )
goto cleanup ;
# ifdef ELF_CORE_COPY_XFPREGS
xfpu = kmalloc ( sizeof ( * xfpu ) , GFP_KERNEL ) ;
if ( ! xfpu )
goto cleanup ;
# endif
2012-10-04 17:15:29 -07:00
if ( cprm - > siginfo - > si_signo ) {
2008-07-25 01:47:45 -07:00
struct core_thread * ct ;
2006-07-10 04:44:55 -07:00
struct elf_thread_status * tmp ;
2008-07-25 01:47:45 -07:00
for ( ct = current - > mm - > core_state - > dumper . next ;
ct ; ct = ct - > next ) {
tmp = kzalloc ( sizeof ( * tmp ) , GFP_KERNEL ) ;
if ( ! tmp )
goto cleanup ;
tmp - > thread = ct - > task ;
list_add ( & tmp - > list , & thread_list ) ;
}
2006-07-10 04:44:55 -07:00
list_for_each ( t , & thread_list ) {
struct elf_thread_status * tmp ;
int sz ;
tmp = list_entry ( t , struct elf_thread_status , list ) ;
2012-10-04 17:15:29 -07:00
sz = elf_dump_thread_status ( cprm - > siginfo - > si_signo , tmp ) ;
2006-07-10 04:44:55 -07:00
thread_status_size + = sz ;
}
}
/* now collect the dump for the current */
2012-10-04 17:15:29 -07:00
fill_prstatus ( prstatus , current , cprm - > siginfo - > si_signo ) ;
2009-12-17 15:27:16 -08:00
elf_core_copy_regs ( & prstatus - > pr_reg , cprm - > regs ) ;
2006-07-10 04:44:55 -07:00
segs = current - > mm - > map_count ;
2010-03-05 13:44:07 -08:00
segs + = elf_core_extra_phdrs ( ) ;
2006-07-10 04:44:55 -07:00
2010-03-05 13:44:10 -08:00
/* for notes section */
segs + + ;
/* If segs > PN_XNUM(0xffff), then e_phnum overflows. To avoid
* this , kernel supports extended numbering . Have a look at
* include / linux / elf . h for further information . */
e_phnum = segs > PN_XNUM ? PN_XNUM : segs ;
2006-07-10 04:44:55 -07:00
/* Set up header */
2010-03-05 13:44:10 -08:00
fill_elf_fdpic_header ( elf , e_phnum ) ;
2006-07-10 04:44:55 -07:00
has_dumped = 1 ;
/*
* Set up the notes in similar form to SVR4 core dumps made
* with info from their / proc .
*/
fill_note ( notes + 0 , " CORE " , NT_PRSTATUS , sizeof ( * prstatus ) , prstatus ) ;
fill_psinfo ( psinfo , current - > group_leader , current - > mm ) ;
fill_note ( notes + 1 , " CORE " , NT_PRPSINFO , sizeof ( * psinfo ) , psinfo ) ;
numnote = 2 ;
auxv = ( elf_addr_t * ) current - > mm - > saved_auxv ;
i = 0 ;
do
i + = 2 ;
while ( auxv [ i - 2 ] ! = AT_NULL ) ;
fill_note ( & notes [ numnote + + ] , " CORE " , NT_AUXV ,
i * sizeof ( elf_addr_t ) , auxv ) ;
/* Try to dump the FPU. */
if ( ( prstatus - > pr_fpvalid =
2009-12-17 15:27:16 -08:00
elf_core_copy_task_fpregs ( current , cprm - > regs , fpu ) ) )
2006-07-10 04:44:55 -07:00
fill_note ( notes + numnote + + ,
" CORE " , NT_PRFPREG , sizeof ( * fpu ) , fpu ) ;
# ifdef ELF_CORE_COPY_XFPREGS
if ( elf_core_copy_task_xfpregs ( current , xfpu ) )
fill_note ( notes + numnote + + ,
2007-10-16 23:25:39 -07:00
" LINUX " , ELF_CORE_XFPREG_TYPE , sizeof ( * xfpu ) , xfpu ) ;
2006-07-10 04:44:55 -07:00
# endif
fs = get_fs ( ) ;
set_fs ( KERNEL_DS ) ;
offset + = sizeof ( * elf ) ; /* Elf header */
2010-03-05 13:44:10 -08:00
offset + = segs * sizeof ( struct elf_phdr ) ; /* Program headers */
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In new dumping process, there are two cases according to whether or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 has already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
int main(int argc, char **argv)
{
int maps_num;
if (argc < 2) {
fprintf(stderr, "mkmmaps [number of maps to be created]\n");
exit(1);
}
if (sscanf(argv[1], "%d", &maps_num) == EOF) {
perror("sscanf");
exit(2);
}
if (maps_num < 0) {
fprintf(stderr, "%d is invalid\n", maps_num);
exit(3);
}
for (; maps_num > 0; --maps_num) {
if (MAP_FAILED == mmap((void *)NULL, (size_t) 1, PROT_READ,
MAP_SHARED | MAP_ANONYMOUS, (int) -1,
(off_t) NULL)) {
perror("mmap");
exit(4);
}
}
abort();
{
char buffer[128];
sprintf(buffer, "wc -l /proc/%u/maps", getpid());
system(buffer);
}
return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. Past fixes
to dump_seek() have been applied only for binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unifies these three definitions for dump_seek() by applying
the past commits that have been applied only for binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
foffset = offset ;
2006-07-10 04:44:55 -07:00
/* Write notes phdr entry */
{
int sz = 0 ;
for ( i = 0 ; i < numnote ; i + + )
sz + = notesize ( notes + i ) ;
sz + = thread_status_size ;
2010-03-05 13:44:09 -08:00
phdr4note = kmalloc ( sizeof ( * phdr4note ) , GFP_KERNEL ) ;
if ( ! phdr4note )
2010-03-05 13:44:06 -08:00
goto end_coredump ;
2010-03-05 13:44:09 -08:00
fill_elf_note_phdr ( phdr4note , sz , offset ) ;
offset + = sz ;
2006-07-10 04:44:55 -07:00
}
/* Page-align dumped data */
dataoff = offset = roundup ( offset , ELF_EXEC_PAGESIZE ) ;
2010-03-05 13:44:12 -08:00
offset + = elf_core_vma_data_size ( cprm - > mm_flags ) ;
2010-03-05 13:44:10 -08:00
offset + = elf_core_extra_data_size ( ) ;
e_shoff = offset ;
if ( e_phnum = = PN_XNUM ) {
shdr4extnum = kmalloc ( sizeof ( * shdr4extnum ) , GFP_KERNEL ) ;
if ( ! shdr4extnum )
goto end_coredump ;
fill_extnum_info ( elf , shdr4extnum , e_shoff , segs ) ;
}
offset = dataoff ;
2010-03-05 13:44:09 -08:00
size + = sizeof ( * elf ) ;
if ( size > cprm - > limit | | ! dump_write ( cprm - > file , elf , sizeof ( * elf ) ) )
goto end_coredump ;
size + = sizeof ( * phdr4note ) ;
if ( size > cprm - > limit
| | ! dump_write ( cprm - > file , phdr4note , sizeof ( * phdr4note ) ) )
goto end_coredump ;
2006-07-10 04:44:55 -07:00
/* write program headers for segments dump */
2009-01-08 12:04:47 +00:00
for ( vma = current - > mm - > mmap ; vma ; vma = vma - > vm_next ) {
2006-07-10 04:44:55 -07:00
struct elf_phdr phdr ;
size_t sz ;
sz = vma - > vm_end - vma - > vm_start ;
phdr . p_type = PT_LOAD ;
phdr . p_offset = offset ;
phdr . p_vaddr = vma - > vm_start ;
phdr . p_paddr = 0 ;
2010-03-05 13:44:12 -08:00
phdr . p_filesz = maydump ( vma , cprm - > mm_flags ) ? sz : 0 ;
2006-07-10 04:44:55 -07:00
phdr . p_memsz = sz ;
offset + = phdr . p_filesz ;
phdr . p_flags = vma - > vm_flags & VM_READ ? PF_R : 0 ;
if ( vma - > vm_flags & VM_WRITE )
phdr . p_flags | = PF_W ;
if ( vma - > vm_flags & VM_EXEC )
phdr . p_flags | = PF_X ;
phdr . p_align = ELF_EXEC_PAGESIZE ;
2010-03-05 13:44:06 -08:00
size + = sizeof ( phdr ) ;
if ( size > cprm - > limit
| | ! dump_write ( cprm - > file , & phdr , sizeof ( phdr ) ) )
goto end_coredump ;
2006-07-10 04:44:55 -07:00
}
2010-03-05 13:44:07 -08:00
if ( ! elf_core_write_extra_phdrs ( cprm - > file , offset , & size , cprm - > limit ) )
goto end_coredump ;
2006-07-10 04:44:55 -07:00
/* write out the notes section */
for ( i = 0 ; i < numnote ; i + + )
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In the new dumping process, there are two cases according to whether or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 has already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
/*
 * mkmmaps: call mmap() the requested number of times, each time asking
 * for a one-byte anonymous shared read-only mapping, and then abort() so
 * that a corefile is generated for a process with many mmaps.
 *
 * argv[1] is the number of maps to be created.
 *
 * Exit status: 1 if the argument is missing, 2 if it is not a number,
 * 3 if it is negative, 4 if mmap() fails.
 */
int main(int argc, char **argv)
{
	int maps_num;

	if (argc < 2) {
		fprintf(stderr, "mkmmaps [number of maps to be created]\n");
		exit(1);
	}
	/*
	 * sscanf() returns 0, not EOF, when the argument does not start
	 * with a number, so comparing against EOF alone would leave
	 * maps_num uninitialized.  errno is not set for a matching
	 * failure, so report the bad argument directly instead of
	 * going through perror().
	 */
	if (sscanf(argv[1], "%d", &maps_num) != 1) {
		fprintf(stderr, "%s is not a number\n", argv[1]);
		exit(2);
	}
	if (maps_num < 0) {
		fprintf(stderr, "%d is invalid\n", maps_num);
		exit(3);
	}
	for (; maps_num > 0; --maps_num) {
		/* The offset is an integer, so pass 0 rather than NULL. */
		if (MAP_FAILED == mmap(NULL, (size_t) 1, PROT_READ,
				       MAP_SHARED | MAP_ANONYMOUS, -1,
				       (off_t) 0)) {
			perror("mmap");
			exit(4);
		}
	}
	abort();
	/*
	 * NOTE(review): everything below is unreachable because abort()
	 * does not return.  It looks like a leftover diagnostic that
	 * counted the lines of /proc/<pid>/maps; kept as in the original.
	 */
	{
		char buffer[128];
		sprintf(buffer, "wc -l /proc/%u/maps", getpid());
		system(buffer);
	}
	return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. The
past fixes to dump_seek() have been applied only to binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unifies these three definitions of dump_seek() by applying
the past commits that have been applied only to binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
if ( ! writenote ( notes + i , cprm - > file , & foffset ) )
2006-07-10 04:44:55 -07:00
goto end_coredump ;
/* write out the thread status notes section */
list_for_each ( t , & thread_list ) {
struct elf_thread_status * tmp =
list_entry ( t , struct elf_thread_status , list ) ;
for ( i = 0 ; i < tmp - > num_notes ; i + + )
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In the new dumping process, there are two cases according to whether or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 has already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
/*
 * mkmmaps: call mmap() the requested number of times, each time asking
 * for a one-byte anonymous shared read-only mapping, and then abort() so
 * that a corefile is generated for a process with many mmaps.
 *
 * argv[1] is the number of maps to be created.
 *
 * Exit status: 1 if the argument is missing, 2 if it is not a number,
 * 3 if it is negative, 4 if mmap() fails.
 */
int main(int argc, char **argv)
{
	int maps_num;

	if (argc < 2) {
		fprintf(stderr, "mkmmaps [number of maps to be created]\n");
		exit(1);
	}
	/*
	 * sscanf() returns 0, not EOF, when the argument does not start
	 * with a number, so comparing against EOF alone would leave
	 * maps_num uninitialized.  errno is not set for a matching
	 * failure, so report the bad argument directly instead of
	 * going through perror().
	 */
	if (sscanf(argv[1], "%d", &maps_num) != 1) {
		fprintf(stderr, "%s is not a number\n", argv[1]);
		exit(2);
	}
	if (maps_num < 0) {
		fprintf(stderr, "%d is invalid\n", maps_num);
		exit(3);
	}
	for (; maps_num > 0; --maps_num) {
		/* The offset is an integer, so pass 0 rather than NULL. */
		if (MAP_FAILED == mmap(NULL, (size_t) 1, PROT_READ,
				       MAP_SHARED | MAP_ANONYMOUS, -1,
				       (off_t) 0)) {
			perror("mmap");
			exit(4);
		}
	}
	abort();
	/*
	 * NOTE(review): everything below is unreachable because abort()
	 * does not return.  It looks like a leftover diagnostic that
	 * counted the lines of /proc/<pid>/maps; kept as in the original.
	 */
	{
		char buffer[128];
		sprintf(buffer, "wc -l /proc/%u/maps", getpid());
		system(buffer);
	}
	return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. The
past fixes to dump_seek() have been applied only to binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unifies these three definitions of dump_seek() by applying
the past commits that have been applied only to binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
if ( ! writenote ( & tmp - > notes [ i ] , cprm - > file , & foffset ) )
2006-07-10 04:44:55 -07:00
goto end_coredump ;
}
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In the new dumping process, there are two cases according to whether or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 has already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
/*
 * mkmmaps: call mmap() the requested number of times, each time asking
 * for a one-byte anonymous shared read-only mapping, and then abort() so
 * that a corefile is generated for a process with many mmaps.
 *
 * argv[1] is the number of maps to be created.
 *
 * Exit status: 1 if the argument is missing, 2 if it is not a number,
 * 3 if it is negative, 4 if mmap() fails.
 */
int main(int argc, char **argv)
{
	int maps_num;

	if (argc < 2) {
		fprintf(stderr, "mkmmaps [number of maps to be created]\n");
		exit(1);
	}
	/*
	 * sscanf() returns 0, not EOF, when the argument does not start
	 * with a number, so comparing against EOF alone would leave
	 * maps_num uninitialized.  errno is not set for a matching
	 * failure, so report the bad argument directly instead of
	 * going through perror().
	 */
	if (sscanf(argv[1], "%d", &maps_num) != 1) {
		fprintf(stderr, "%s is not a number\n", argv[1]);
		exit(2);
	}
	if (maps_num < 0) {
		fprintf(stderr, "%d is invalid\n", maps_num);
		exit(3);
	}
	for (; maps_num > 0; --maps_num) {
		/* The offset is an integer, so pass 0 rather than NULL. */
		if (MAP_FAILED == mmap(NULL, (size_t) 1, PROT_READ,
				       MAP_SHARED | MAP_ANONYMOUS, -1,
				       (off_t) 0)) {
			perror("mmap");
			exit(4);
		}
	}
	abort();
	/*
	 * NOTE(review): everything below is unreachable because abort()
	 * does not return.  It looks like a leftover diagnostic that
	 * counted the lines of /proc/<pid>/maps; kept as in the original.
	 */
	{
		char buffer[128];
		sprintf(buffer, "wc -l /proc/%u/maps", getpid());
		system(buffer);
	}
	return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. The
past fixes to dump_seek() have been applied only to binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unifies these three definitions of dump_seek() by applying
the past commits that have been applied only to binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
if ( ! dump_seek ( cprm - > file , dataoff - foffset ) )
2009-09-21 17:03:25 -07:00
goto end_coredump ;
2006-07-10 04:44:55 -07:00
2009-12-17 15:27:16 -08:00
if ( elf_fdpic_dump_segments ( cprm - > file , & size , & cprm - > limit ,
2010-03-05 13:44:12 -08:00
cprm - > mm_flags ) < 0 )
2006-07-10 04:44:55 -07:00
goto end_coredump ;
2010-03-05 13:44:07 -08:00
if ( ! elf_core_write_extra_data ( cprm - > file , & size , cprm - > limit ) )
goto end_coredump ;
2006-07-10 04:44:55 -07:00
2010-03-05 13:44:10 -08:00
if ( e_phnum = = PN_XNUM ) {
size + = sizeof ( * shdr4extnum ) ;
if ( size > cprm - > limit
| | ! dump_write ( cprm - > file , shdr4extnum ,
sizeof ( * shdr4extnum ) ) )
goto end_coredump ;
}
2010-01-04 15:42:14 +09:00
if ( cprm - > file - > f_pos ! = offset ) {
2006-07-10 04:44:55 -07:00
/* Sanity check */
printk ( KERN_WARNING
" elf_core_dump: file->f_pos (%lld) != offset (%lld) \n " ,
2010-01-04 15:42:14 +09:00
cprm - > file - > f_pos , offset ) ;
2006-07-10 04:44:55 -07:00
}
end_coredump :
set_fs ( fs ) ;
cleanup :
while ( ! list_empty ( & thread_list ) ) {
struct list_head * tmp = thread_list . next ;
list_del ( tmp ) ;
kfree ( list_entry ( tmp , struct elf_thread_status , list ) ) ;
}
2010-03-05 13:44:09 -08:00
kfree ( phdr4note ) ;
2006-07-10 04:44:55 -07:00
kfree ( elf ) ;
kfree ( prstatus ) ;
kfree ( psinfo ) ;
kfree ( notes ) ;
kfree ( fpu ) ;
2011-07-06 12:26:05 +01:00
kfree ( shdr4extnum ) ;
2006-07-10 04:44:55 -07:00
# ifdef ELF_CORE_COPY_XFPREGS
kfree ( xfpu ) ;
# endif
return has_dumped ;
# undef NUM_NOTES
}
2009-12-15 16:47:37 -08:00
# endif /* CONFIG_ELF_CORE */