2005-04-16 15:20:36 -07:00
/*
* linux / fs / binfmt_aout . c
*
* Copyright ( C ) 1991 , 1992 , 1996 Linus Torvalds
*/
# include <linux/module.h>
# include <linux/time.h>
# include <linux/kernel.h>
# include <linux/mm.h>
# include <linux/mman.h>
# include <linux/a.out.h>
# include <linux/errno.h>
# include <linux/signal.h>
# include <linux/string.h>
# include <linux/fs.h>
# include <linux/file.h>
# include <linux/stat.h>
# include <linux/fcntl.h>
# include <linux/ptrace.h>
# include <linux/user.h>
# include <linux/binfmts.h>
# include <linux/personality.h>
# include <linux/init.h>
2010-03-05 13:44:06 -08:00
# include <linux/coredump.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
The percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch a large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surroundings. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them, as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers, which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
# include <linux/slab.h>
2017-02-08 18:51:37 +01:00
# include <linux/sched/task_stack.h>
2005-04-16 15:20:36 -07:00
2016-12-24 11:46:01 -08:00
# include <linux/uaccess.h>
2005-04-16 15:20:36 -07:00
# include <asm/cacheflush.h>
2008-02-08 04:19:28 -08:00
# include <asm/a.out-core.h>
2005-04-16 15:20:36 -07:00
2012-10-20 22:00:48 -04:00
static int load_aout_binary ( struct linux_binprm * ) ;
2005-04-16 15:20:36 -07:00
static int load_aout_library ( struct file * ) ;
2012-10-04 17:15:23 -07:00
# ifdef CONFIG_COREDUMP
2005-04-16 15:20:36 -07:00
/*
* Routine writes a core dump image in the current directory .
* Currently only a stub - function .
*
* Note that setuid / setgid files won ' t make a core - dump if the uid / gid
* changed due to the set [ u | g ] id . It ' s enforced by the " current->mm->dumpable "
* field , which also makes sure the core - dumps won ' t be recursive if the
* dumping of the process results in another error . .
*/
2009-12-17 15:27:16 -08:00
static int aout_core_dump ( struct coredump_params * cprm )
2005-04-16 15:20:36 -07:00
{
mm_segment_t fs ;
int has_dumped = 0 ;
2010-03-23 13:35:15 -07:00
void __user * dump_start ;
int dump_size ;
2005-04-16 15:20:36 -07:00
struct user dump ;
2009-01-03 07:16:23 +00:00
# ifdef __alpha__
2010-03-23 13:35:15 -07:00
# define START_DATA(u) ((void __user *)u.start_data)
2009-01-03 07:16:23 +00:00
# else
2010-03-23 13:35:15 -07:00
# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
u . start_code ) )
2005-04-16 15:20:36 -07:00
# endif
2010-03-23 13:35:15 -07:00
# define START_STACK(u) ((void __user *)u.start_stack)
2005-04-16 15:20:36 -07:00
fs = get_fs ( ) ;
set_fs ( KERNEL_DS ) ;
has_dumped = 1 ;
strncpy ( dump . u_comm , current - > comm , sizeof ( dump . u_comm ) ) ;
2008-02-07 00:15:57 -08:00
dump . u_ar0 = offsetof ( struct user , regs ) ;
2012-10-04 17:15:29 -07:00
dump . signal = cprm - > siginfo - > si_signo ;
2009-12-17 15:27:16 -08:00
aout_dump_thread ( cprm - > regs , & dump ) ;
2005-04-16 15:20:36 -07:00
/* If the size of the dump file exceeds the rlimit, then see what would happen
if we wrote the stack , but not the data area . */
2009-12-17 15:27:16 -08:00
if ( ( dump . u_dsize + dump . u_ssize + 1 ) * PAGE_SIZE > cprm - > limit )
2005-04-16 15:20:36 -07:00
dump . u_dsize = 0 ;
/* Make sure we have enough room to write the stack and data areas. */
2009-12-17 15:27:16 -08:00
if ( ( dump . u_ssize + 1 ) * PAGE_SIZE > cprm - > limit )
2005-04-16 15:20:36 -07:00
dump . u_ssize = 0 ;
/* make sure we actually have a data and stack area to dump */
set_fs ( USER_DS ) ;
Remove 'type' argument from access_ok() function
Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument
of the user address range verification function since we got rid of the
old racy i386-only code to walk page tables by hand.
It existed because the original 80386 would not honor the write protect
bit when in kernel mode, so you had to do COW by hand before doing any
user access. But we haven't supported that in a long time, and these
days the 'type' argument is a purely historical artifact.
A discussion about extending 'user_access_begin()' to do the range
checking resulted this patch, because there is no way we're going to
move the old VERIFY_xyz interface to that model. And it's best done at
the end of the merge window when I've done most of my merges, so let's
just get this done once and for all.
This patch was mostly done with a sed-script, with manual fix-ups for
the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form.
There were a couple of notable cases:
- csky still had the old "verify_area()" name as an alias.
- the iter_iov code had magical hardcoded knowledge of the actual
values of VERIFY_{READ,WRITE} (not that they mattered, since nothing
really used it)
- microblaze used the type argument for a debug printout
but other than those oddities this should be a total no-op patch.
I tried to fix up all architectures, did fairly extensive grepping for
access_ok() uses, and the changes are trivial, but I may have missed
something. Any missed conversion should be trivially fixable, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-03 18:57:57 -08:00
if ( ! access_ok ( START_DATA ( dump ) , dump . u_dsize < < PAGE_SHIFT ) )
2005-04-16 15:20:36 -07:00
dump . u_dsize = 0 ;
Remove 'type' argument from access_ok() function
Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument
of the user address range verification function since we got rid of the
old racy i386-only code to walk page tables by hand.
It existed because the original 80386 would not honor the write protect
bit when in kernel mode, so you had to do COW by hand before doing any
user access. But we haven't supported that in a long time, and these
days the 'type' argument is a purely historical artifact.
A discussion about extending 'user_access_begin()' to do the range
checking resulted this patch, because there is no way we're going to
move the old VERIFY_xyz interface to that model. And it's best done at
the end of the merge window when I've done most of my merges, so let's
just get this done once and for all.
This patch was mostly done with a sed-script, with manual fix-ups for
the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form.
There were a couple of notable cases:
- csky still had the old "verify_area()" name as an alias.
- the iter_iov code had magical hardcoded knowledge of the actual
values of VERIFY_{READ,WRITE} (not that they mattered, since nothing
really used it)
- microblaze used the type argument for a debug printout
but other than those oddities this should be a total no-op patch.
I tried to fix up all architectures, did fairly extensive grepping for
access_ok() uses, and the changes are trivial, but I may have missed
something. Any missed conversion should be trivially fixable, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-03 18:57:57 -08:00
if ( ! access_ok ( START_STACK ( dump ) , dump . u_ssize < < PAGE_SHIFT ) )
2005-04-16 15:20:36 -07:00
dump . u_ssize = 0 ;
set_fs ( KERNEL_DS ) ;
/* struct user */
2013-10-07 07:22:01 -04:00
if ( ! dump_emit ( cprm , & dump , sizeof ( dump ) ) )
2010-03-05 13:44:06 -08:00
goto end_coredump ;
2005-04-16 15:20:36 -07:00
/* Now dump all of the user data. Include malloced stuff as well */
2013-10-08 09:26:08 -04:00
if ( ! dump_skip ( cprm , PAGE_SIZE - sizeof ( dump ) ) )
coredump: unify dump_seek() implementations for each binfmt_*.c
The current ELF dumper can produce broken corefiles if program headers
exceed 65535. In particular, the program in 64-bit environment often
demands more than 65535 mmaps. If you google max_map_count, then you can
find many users facing this problem.
Solaris has already dealt with this issue, and other OSes have also
adopted the same method as in Solaris. Currently, Sun's document and AMD
64 ABI include the description for the extension, where they call the
extension Extended Numbering. See Reference for further information.
I believe that linux kernel should adopt the same way as they did, so I've
written this patch.
I am also preparing for patches of GDB and binutils.
How to fix
==========
In new dumping process, there are two cases according to weather or
not the number of program headers is equal to or more than 65535.
- if less than 65535, the produced corefile format is exactly the same
as the ordinary one.
- if equal to or more than 65535, then e_phnum field is set to newly
introduced constant PN_XNUM(0xffff) and the actual number of program
headers is set to sh_info field of the section header at index 0.
Compatibility Concern
=====================
* As already mentioned in Summary, Sun and AMD64 has already adopted
this. See Reference.
* There are four combinations according to whether kernel and userland
tools are respectively modified or not. The next table summarizes
shortly for each combination.
---------------------------------------------
Original Kernel | Modified Kernel
---------------------------------------------
< 65535 | >= 65535 | < 65535 | >= 65535
-------------------------------------------------------------
Original Tools | OK | broken | OK | broken (#)
-------------------------------------------------------------
Modified Tools | OK | broken | OK | OK
-------------------------------------------------------------
Note that there is no case that `OK' changes to `broken'.
(#) Although this case remains broken, O-M behaves better than
O-O. That is, while in O-O case e_phnum field would be extremely
small due to integer overflow, in O-M case it is guaranteed to be at
least 65535 by being set to PN_XNUM(0xFFFF), much closer to the
actual correct value than the O-O case.
Test Program
============
Here is a test program mkmmaps.c that is useful to produce the
corefile with many mmaps. To use this, please take the following
steps:
$ ulimit -c unlimited
$ sysctl vm.max_map_count=70000 # default 65530 is too small
$ sysctl fs.file-max=70000
$ mkmmaps 65535
Then, the program will abort and a corefile will be generated.
If failed, there are two cases according to the error message
displayed.
* ``out of memory'' means vm.max_map_count is still smaller
* ``too many open files'' means fs.file-max is still smaller
So, please change it to a larger value, and then retry it.
mkmmaps.c
==
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
int main(int argc, char **argv)
{
int maps_num;
if (argc < 2) {
fprintf(stderr, "mkmmaps [number of maps to be created]\n");
exit(1);
}
if (sscanf(argv[1], "%d", &maps_num) == EOF) {
perror("sscanf");
exit(2);
}
if (maps_num < 0) {
fprintf(stderr, "%d is invalid\n", maps_num);
exit(3);
}
for (; maps_num > 0; --maps_num) {
if (MAP_FAILED == mmap((void *)NULL, (size_t) 1, PROT_READ,
MAP_SHARED | MAP_ANONYMOUS, (int) -1,
(off_t) NULL)) {
perror("mmap");
exit(4);
}
}
abort();
{
char buffer[128];
sprintf(buffer, "wc -l /proc/%u/maps", getpid());
system(buffer);
}
return 0;
}
Tested on i386, ia64 and um/sys-i386.
Built on sh4 (which covers fs/binfmt_elf_fdpic.c)
References
==========
- Sun microsystems: Linker and Libraries.
Part No: 817-1984-17, September 2008.
URL: http://docs.sun.com/app/docs/doc/817-1984
- System V ABI AMD64 Architecture Processor Supplement
Draft Version 0.99., May 11, 2009.
URL: http://www.x86-64.org/
This patch:
There are three different definitions for dump_seek() functions in
binfmt_aout.c, binfmt_elf.c and binfmt_elf_fdpic.c, respectively. The
only for binfmt_elf.c.
My next patch will move dump_seek() into a header file in order to share
the same implementations for dump_write() and dump_seek(). As the first
step, this patch unify these three definitions for dump_seek() by applying
the past commits that have been applied only for binfmt_elf.c.
Specifically, the modification made here is part of the following commits:
* d025c9db7f31fc0554ce7fb2dfc78d35a77f3487
* 7f14daa19ea36b200d237ad3ac5826ae25360461
This patch does not change a shape of corefiles.
Signed-off-by: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-05 13:44:05 -08:00
goto end_coredump ;
2005-04-16 15:20:36 -07:00
/* now we start writing out the user space info */
set_fs ( USER_DS ) ;
/* Dump the data area */
if ( dump . u_dsize ! = 0 ) {
dump_start = START_DATA ( dump ) ;
dump_size = dump . u_dsize < < PAGE_SHIFT ;
2013-10-07 07:22:01 -04:00
if ( ! dump_emit ( cprm , dump_start , dump_size ) )
2010-03-05 13:44:06 -08:00
goto end_coredump ;
2005-04-16 15:20:36 -07:00
}
/* Now prepare to dump the stack area */
if ( dump . u_ssize ! = 0 ) {
dump_start = START_STACK ( dump ) ;
dump_size = dump . u_ssize < < PAGE_SHIFT ;
2013-10-07 07:22:01 -04:00
if ( ! dump_emit ( cprm , dump_start , dump_size ) )
2010-03-05 13:44:06 -08:00
goto end_coredump ;
2005-04-16 15:20:36 -07:00
}
end_coredump :
set_fs ( fs ) ;
return has_dumped ;
}
2012-10-04 17:15:23 -07:00
# else
# define aout_core_dump NULL
# endif
/*
 * Binary format descriptor for a.out images: the entry points used for
 * executables, shared libraries and core dumps, plus the smallest core
 * file size (one page) this format will produce.
 */
static struct linux_binfmt aout_format = {
	.module		= THIS_MODULE,
	.load_binary	= load_aout_binary,
	.load_shlib	= load_aout_library,
	.core_dump	= aout_core_dump,
	.min_coredump	= PAGE_SIZE,
};
/*
 * True when x, converted to unsigned long, is at or above TASK_SIZE.
 * NOTE(review): no use of this macro is visible in this file - confirm
 * whether it is still needed.
 */
# define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
/*
 * Round both bounds up to a page boundary and, if the rounded range is
 * non-empty, have vm_brk() map it.
 *
 * Returns the vm_brk() result, or 0 when there is nothing to map.
 */
static int set_brk(unsigned long start, unsigned long end)
{
	unsigned long first = PAGE_ALIGN(start);
	unsigned long last = PAGE_ALIGN(end);

	if (last <= first)
		return 0;

	return vm_brk(first, last - first);
}
2005-04-16 15:20:36 -07:00
/*
 * Record the user address of @count consecutive NUL-terminated strings,
 * beginning at @str, in successive slots of @table, then close the table
 * with a NULL entry.
 *
 * Returns the address one past the terminator of the last string walked.
 * Results of put_user()/get_user() are not checked.
 */
static char __user *aout_fill_table(char __user * __user *table,
				    char __user *str, int count)
{
	while (count-- > 0) {
		char ch;

		put_user(str, table++);
		do {
			get_user(ch, str++);
		} while (ch);
	}
	put_user(NULL, table);

	return str;
}

/*
 * create_aout_tables() walks the arg- and env-strings that already sit in
 * the new user memory at @p, builds the argv[] and envp[] pointer tables
 * below them on the "stack", and records the string boundaries in
 * current->mm (arg_start, arg_end, env_start, env_end).
 *
 * Resulting layout, from the returned stack pointer upwards:
 *   argc, [argv, envp (not on alpha)], argv[0..argc], envp[0..envc],
 *   and on alpha a few tagged words (1001/1002/1003) above the tables.
 *
 * Returns the new stack pointer value.
 */
static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
{
	char __user * __user *arg_table;
	char __user * __user *env_table;
	unsigned long __user *sp;
	int nr_args = bprm->argc;
	int nr_envs = bprm->envc;

	/* Round the starting address down to pointer-size alignment. */
	sp = (void __user *)((unsigned long)p & -(unsigned long)sizeof(char *));

#ifdef __alpha__
	/* whee.. test-programs are so much fun. */
	put_user(0, --sp);
	put_user(0, --sp);
	if (bprm->loader) {
		put_user(0, --sp);
		put_user(1003, --sp);
		put_user(bprm->loader, --sp);
		put_user(1002, --sp);
	}
	put_user(bprm->exec, --sp);
	put_user(1001, --sp);
#endif

	/* Reserve the tables, each with one extra slot for the NULL end. */
	sp -= nr_envs + 1;
	env_table = (char __user * __user *)sp;
	sp -= nr_args + 1;
	arg_table = (char __user * __user *)sp;

#ifndef __alpha__
	put_user((unsigned long)env_table, --sp);
	put_user((unsigned long)arg_table, --sp);
#endif
	put_user(nr_args, --sp);

	current->mm->arg_start = (unsigned long)p;
	p = aout_fill_table(arg_table, p, nr_args);

	/* The environment strings begin right where the arguments end. */
	current->mm->arg_end = current->mm->env_start = (unsigned long)p;
	p = aout_fill_table(env_table, p, nr_envs);

	current->mm->env_end = (unsigned long)p;

	return sp;
}
/*
* These are the functions used to load a . out style executables and shared
* libraries . There is no binary dependent code anywhere else .
*/
2012-10-20 22:00:48 -04:00
static int load_aout_binary ( struct linux_binprm * bprm )
2005-04-16 15:20:36 -07:00
{
2012-10-20 22:00:48 -04:00
struct pt_regs * regs = current_pt_regs ( ) ;
2005-04-16 15:20:36 -07:00
struct exec ex ;
unsigned long error ;
unsigned long fd_offset ;
unsigned long rlim ;
int retval ;
ex = * ( ( struct exec * ) bprm - > buf ) ; /* exec-header */
if ( ( N_MAGIC ( ex ) ! = ZMAGIC & & N_MAGIC ( ex ) ! = OMAGIC & &
N_MAGIC ( ex ) ! = QMAGIC & & N_MAGIC ( ex ) ! = NMAGIC ) | |
N_TRSIZE ( ex ) | | N_DRSIZE ( ex ) | |
2013-01-23 17:07:38 -05:00
i_size_read ( file_inode ( bprm - > file ) ) < ex . a_text + ex . a_data + N_SYMSIZE ( ex ) + N_TXTOFF ( ex ) ) {
2005-04-16 15:20:36 -07:00
return - ENOEXEC ;
}
2006-09-29 01:59:33 -07:00
/*
* Requires a mmap handler . This prevents people from using a . out
* as part of an exploit attack against / proc - related vulnerabilities .
*/
2013-09-22 16:27:52 -04:00
if ( ! bprm - > file - > f_op - > mmap )
2006-09-29 01:59:33 -07:00
return - ENOEXEC ;
2005-04-16 15:20:36 -07:00
fd_offset = N_TXTOFF ( ex ) ;
/* Check initial limits. This avoids letting people circumvent
* size limits imposed on them by creating programs with large
* arrays in the data or bss .
*/
2010-03-05 13:42:42 -08:00
rlim = rlimit ( RLIMIT_DATA ) ;
2005-04-16 15:20:36 -07:00
if ( rlim > = RLIM_INFINITY )
rlim = ~ 0 ;
if ( ex . a_data + ex . a_bss > rlim )
return - ENOMEM ;
/* Flush all traces of the currently running executable */
retval = flush_old_exec ( bprm ) ;
if ( retval )
return retval ;
/* OK, This is the point of no return */
2009-01-03 07:16:23 +00:00
# ifdef __alpha__
2005-04-16 15:20:36 -07:00
SET_AOUT_PERSONALITY ( bprm , ex ) ;
# else
set_personality ( PER_LINUX ) ;
# endif
Split 'flush_old_exec' into two functions
'flush_old_exec()' is the point of no return when doing an execve(), and
it is pretty badly misnamed. It doesn't just flush the old executable
environment, it also starts up the new one.
Which is very inconvenient for things like setting up the new
personality, because we want the new personality to affect the starting
of the new environment, but at the same time we do _not_ want the new
personality to take effect if flushing the old one fails.
As a result, the x86-64 '32-bit' personality is actually done using this
insane "I'm going to change the ABI, but I haven't done it yet" bit
(TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the
personality, but just the "pending" bit, so that "flush_thread()" can do
the actual personality magic.
This patch in no way changes any of that insanity, but it does split the
'flush_old_exec()' function up into a preparatory part that can fail
(still called flush_old_exec()), and a new part that will actually set
up the new exec environment (setup_new_exec()). All callers are changed
to trivially comply with the new world order.
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-28 22:14:42 -08:00
setup_new_exec ( bprm ) ;
2005-04-16 15:20:36 -07:00
current - > mm - > end_code = ex . a_text +
( current - > mm - > start_code = N_TXTADDR ( ex ) ) ;
current - > mm - > end_data = ex . a_data +
( current - > mm - > start_data = N_DATADDR ( ex ) ) ;
current - > mm - > brk = ex . a_bss +
( current - > mm - > start_brk = N_BSSADDR ( ex ) ) ;
2012-03-05 06:38:42 +00:00
retval = setup_arg_pages ( bprm , STACK_TOP , EXSTACK_DEFAULT ) ;
2014-05-04 20:11:36 -04:00
if ( retval < 0 )
2012-03-05 06:38:42 +00:00
return retval ;
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 10:39:24 +11:00
install_exec_creds ( bprm ) ;
2005-04-16 15:20:36 -07:00
if ( N_MAGIC ( ex ) = = OMAGIC ) {
unsigned long text_addr , map_size ;
loff_t pos ;
text_addr = N_TXTADDR ( ex ) ;
2009-01-03 07:16:13 +00:00
# ifdef __alpha__
2005-04-16 15:20:36 -07:00
pos = fd_offset ;
map_size = ex . a_text + ex . a_data + PAGE_SIZE - 1 ;
# else
pos = 32 ;
map_size = ex . a_text + ex . a_data ;
# endif
2012-04-20 15:35:40 -07:00
error = vm_brk ( text_addr & PAGE_MASK , map_size ) ;
2016-05-27 15:57:31 -07:00
if ( error )
2005-04-16 15:20:36 -07:00
return error ;
2013-04-13 20:31:37 -04:00
error = read_code ( bprm - > file , text_addr , pos ,
ex . a_text + ex . a_data ) ;
2014-05-04 20:11:36 -04:00
if ( ( signed long ) error < 0 )
2005-04-16 15:20:36 -07:00
return error ;
} else {
if ( ( ex . a_text & 0xfff | | ex . a_data & 0xfff ) & &
2008-04-29 00:59:26 -07:00
( N_MAGIC ( ex ) ! = NMAGIC ) & & printk_ratelimit ( ) )
2005-04-16 15:20:36 -07:00
{
printk ( KERN_NOTICE " executable not page aligned \n " ) ;
}
2008-04-29 00:59:26 -07:00
if ( ( fd_offset & ~ PAGE_MASK ) ! = 0 & & printk_ratelimit ( ) )
2005-04-16 15:20:36 -07:00
{
printk ( KERN_WARNING
2014-10-21 20:11:25 -04:00
" fd_offset is not page aligned. Please convert program: %pD \n " ,
bprm - > file ) ;
2005-04-16 15:20:36 -07:00
}
if ( ! bprm - > file - > f_op - > mmap | | ( ( fd_offset & ~ PAGE_MASK ) ! = 0 ) ) {
2016-05-23 16:25:36 -07:00
error = vm_brk ( N_TXTADDR ( ex ) , ex . a_text + ex . a_data ) ;
2016-05-27 15:57:31 -07:00
if ( error )
2016-05-23 16:25:36 -07:00
return error ;
2013-04-13 20:31:37 -04:00
read_code ( bprm - > file , N_TXTADDR ( ex ) , fd_offset ,
ex . a_text + ex . a_data ) ;
2005-04-16 15:20:36 -07:00
goto beyond_if ;
}
2012-04-20 17:13:58 -07:00
error = vm_mmap ( bprm - > file , N_TXTADDR ( ex ) , ex . a_text ,
2005-04-16 15:20:36 -07:00
PROT_READ | PROT_EXEC ,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE ,
fd_offset ) ;
2014-05-04 20:11:36 -04:00
if ( error ! = N_TXTADDR ( ex ) )
2005-04-16 15:20:36 -07:00
return error ;
2012-04-20 17:13:58 -07:00
error = vm_mmap ( bprm - > file , N_DATADDR ( ex ) , ex . a_data ,
2005-04-16 15:20:36 -07:00
PROT_READ | PROT_WRITE | PROT_EXEC ,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE ,
fd_offset + ex . a_text ) ;
2014-05-04 20:11:36 -04:00
if ( error ! = N_DATADDR ( ex ) )
2005-04-16 15:20:36 -07:00
return error ;
}
beyond_if :
set_binfmt ( & aout_format ) ;
retval = set_brk ( current - > mm - > start_brk , current - > mm - > brk ) ;
2014-05-04 20:11:36 -04:00
if ( retval < 0 )
2005-04-16 15:20:36 -07:00
return retval ;
current - > mm - > start_stack =
( unsigned long ) create_aout_tables ( ( char __user * ) bprm - > p , bprm ) ;
# ifdef __alpha__
regs - > gp = ex . a_gpvalue ;
# endif
2018-04-10 16:34:57 -07:00
finalize_exec ( bprm ) ;
2005-04-16 15:20:36 -07:00
start_thread ( regs , ex . a_entry , current - > mm - > start_stack ) ;
return 0 ;
}
static int load_aout_library ( struct file * file )
{
struct inode * inode ;
unsigned long bss , start_addr , len ;
unsigned long error ;
int retval ;
struct exec ex ;
2017-09-01 17:39:13 +02:00
loff_t pos = 0 ;
2005-04-16 15:20:36 -07:00
2013-01-23 17:07:38 -05:00
inode = file_inode ( file ) ;
2005-04-16 15:20:36 -07:00
retval = - ENOEXEC ;
2017-09-01 17:39:13 +02:00
error = kernel_read ( file , & ex , sizeof ( ex ) , & pos ) ;
2005-04-16 15:20:36 -07:00
if ( error ! = sizeof ( ex ) )
goto out ;
/* We come in here for the regular a.out style of shared libraries */
if ( ( N_MAGIC ( ex ) ! = ZMAGIC & & N_MAGIC ( ex ) ! = QMAGIC ) | | N_TRSIZE ( ex ) | |
N_DRSIZE ( ex ) | | ( ( ex . a_entry & 0xfff ) & & N_MAGIC ( ex ) = = ZMAGIC ) | |
i_size_read ( inode ) < ex . a_text + ex . a_data + N_SYMSIZE ( ex ) + N_TXTOFF ( ex ) ) {
goto out ;
}
2006-09-29 01:59:33 -07:00
/*
* Requires a mmap handler . This prevents people from using a . out
* as part of an exploit attack against / proc - related vulnerabilities .
*/
2013-09-22 16:27:52 -04:00
if ( ! file - > f_op - > mmap )
2006-09-29 01:59:33 -07:00
goto out ;
2005-04-16 15:20:36 -07:00
if ( N_FLAGS ( ex ) )
goto out ;
/* For QMAGIC, the starting address is 0x20 into the page. We mask
this off to get the starting address for the page */
start_addr = ex . a_entry & 0xfffff000 ;
if ( ( N_TXTOFF ( ex ) & ~ PAGE_MASK ) ! = 0 ) {
2008-04-29 00:59:26 -07:00
if ( printk_ratelimit ( ) )
2005-04-16 15:20:36 -07:00
{
printk ( KERN_WARNING
2014-10-21 20:11:25 -04:00
" N_TXTOFF is not page aligned. Please convert library: %pD \n " ,
file ) ;
2005-04-16 15:20:36 -07:00
}
2016-05-23 16:25:36 -07:00
retval = vm_brk ( start_addr , ex . a_text + ex . a_data + ex . a_bss ) ;
2016-05-27 15:57:31 -07:00
if ( retval )
2016-05-23 16:25:36 -07:00
goto out ;
2013-04-13 20:31:37 -04:00
read_code ( file , start_addr , N_TXTOFF ( ex ) ,
ex . a_text + ex . a_data ) ;
2005-04-16 15:20:36 -07:00
retval = 0 ;
goto out ;
}
/* Now use mmap to map the library into memory. */
2012-04-20 17:13:58 -07:00
error = vm_mmap ( file , start_addr , ex . a_text + ex . a_data ,
2005-04-16 15:20:36 -07:00
PROT_READ | PROT_WRITE | PROT_EXEC ,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE ,
N_TXTOFF ( ex ) ) ;
retval = error ;
if ( error ! = start_addr )
goto out ;
len = PAGE_ALIGN ( ex . a_text + ex . a_data ) ;
bss = ex . a_text + ex . a_data + ex . a_bss ;
if ( bss > len ) {
2016-05-27 15:57:31 -07:00
retval = vm_brk ( start_addr + len , bss - len ) ;
if ( retval )
2005-04-16 15:20:36 -07:00
goto out ;
}
retval = 0 ;
out :
return retval ;
}
/*
 * Initcall: hand the a.out format descriptor to the binfmt core.
 * Always returns 0.
 */
static int __init init_aout_binfmt(void)
{
	register_binfmt(&aout_format);

	return 0;
}
/* Module exit: withdraw the a.out format descriptor from the binfmt core. */
static void __exit exit_aout_binfmt(void)
{
	unregister_binfmt(&aout_format);
}
/*
 * Hook the init/exit routines into the kernel and declare the license.
 * The license string must be exactly "GPL" (no surrounding blanks) to be
 * recognized as a GPL ident.
 */
core_initcall(init_aout_binfmt);
module_exit(exit_aout_binfmt);
MODULE_LICENSE("GPL");