2005-04-16 15:20:36 -07:00
/*
* linux / fs / exec . c
*
* Copyright ( C ) 1991 , 1992 Linus Torvalds
*/
/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
 * current->executable is only used by the procfs. This allows a dispatch
 * table to check for several different types of binary formats. We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */
# include <linux/slab.h>
# include <linux/file.h>
2008-04-24 07:44:08 -04:00
# include <linux/fdtable.h>
2008-07-25 01:45:43 -07:00
# include <linux/mm.h>
2005-04-16 15:20:36 -07:00
# include <linux/stat.h>
# include <linux/fcntl.h>
# include <linux/smp_lock.h>
2008-07-25 01:45:43 -07:00
# include <linux/swap.h>
2007-10-16 23:26:35 -07:00
# include <linux/string.h>
2005-04-16 15:20:36 -07:00
# include <linux/init.h>
2008-07-28 15:46:18 -07:00
# include <linux/pagemap.h>
2005-04-16 15:20:36 -07:00
# include <linux/highmem.h>
# include <linux/spinlock.h>
# include <linux/key.h>
# include <linux/personality.h>
# include <linux/binfmts.h>
# include <linux/utsname.h>
2006-12-08 02:38:01 -08:00
# include <linux/pid_namespace.h>
2005-04-16 15:20:36 -07:00
# include <linux/module.h>
# include <linux/namei.h>
# include <linux/proc_fs.h>
# include <linux/mount.h>
# include <linux/security.h>
# include <linux/syscalls.h>
2006-09-30 23:28:59 -07:00
# include <linux/tsacct_kern.h>
2005-11-07 00:59:16 -08:00
# include <linux/cn_proc.h>
2006-04-26 14:04:08 -04:00
# include <linux/audit.h>
2008-07-25 19:45:44 -07:00
# include <linux/tracehook.h>
2008-07-09 10:28:40 +02:00
# include <linux/kmod.h>
2005-04-16 15:20:36 -07:00
# include <asm/uaccess.h>
# include <asm/mmu_context.h>
2007-07-19 01:48:16 -07:00
# include <asm/tlb.h>
2005-04-16 15:20:36 -07:00
2008-06-16 12:11:54 +01:00
# ifdef __alpha__
/* for /sbin/loader handling in search_binary_handler() */
# include <linux/a.out.h>
# endif
2005-04-16 15:20:36 -07:00
/* NOTE(review): presumably selects whether core file names carry the PID -- confirm against users. */
int core_uses_pid ;
2007-05-16 22:11:16 -07:00
/* Core file name pattern; the buffer is CORENAME_MAX_SIZE bytes long. */
char core_pattern [ CORENAME_MAX_SIZE ] = " core " ;
2005-06-23 00:09:43 -07:00
/* NOTE(review): presumably the dump policy for set-uid processes; initialised to 0 -- confirm. */
int suid_dumpable = 0 ;
2005-04-16 15:20:36 -07:00
/* The maximal length of core_pattern is also specified in sysctl.c */
2007-10-16 23:26:03 -07:00
/* List of registered binary format handlers, linked through linux_binfmt.lh. */
static LIST_HEAD ( formats ) ;
2005-04-16 15:20:36 -07:00
/* Protects the formats list: taken for write to add/remove, for read to walk. */
static DEFINE_RWLOCK ( binfmt_lock ) ;
int register_binfmt ( struct linux_binfmt * fmt )
{
if ( ! fmt )
return - EINVAL ;
write_lock ( & binfmt_lock ) ;
2007-10-16 23:26:03 -07:00
list_add ( & fmt - > lh , & formats ) ;
2005-04-16 15:20:36 -07:00
write_unlock ( & binfmt_lock ) ;
return 0 ;
}
EXPORT_SYMBOL ( register_binfmt ) ;
2007-10-16 23:26:04 -07:00
void unregister_binfmt ( struct linux_binfmt * fmt )
2005-04-16 15:20:36 -07:00
{
write_lock ( & binfmt_lock ) ;
2007-10-16 23:26:03 -07:00
list_del ( & fmt - > lh ) ;
2005-04-16 15:20:36 -07:00
write_unlock ( & binfmt_lock ) ;
}
EXPORT_SYMBOL ( unregister_binfmt ) ;
static inline void put_binfmt ( struct linux_binfmt * fmt )
{
module_put ( fmt - > module ) ;
}
/*
 * Note that a shared library must be both readable and executable due to
 * security reasons.
 *
 * Also note that we take the load address from the file itself.
 *
 * Looks up @library, checks that it is a regular file on a mount without
 * MNT_NOEXEC and that read+exec permission is granted, then offers the
 * opened file to each registered format's load_shlib() until one returns
 * something other than -ENOEXEC.
 */
asmlinkage long sys_uselib(const char __user *library)
{
	struct nameidata nd;
	struct file *file;
	char *name;
	int error;

	name = getname(library);
	if (IS_ERR(name))
		return PTR_ERR(name);

	error = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd,
				 FMODE_READ | FMODE_EXEC);
	putname(name);
	if (error)
		return error;

	if (!S_ISREG(nd.path.dentry->d_inode->i_mode)) {
		error = -EINVAL;
		goto out_release;
	}

	if (nd.path.mnt->mnt_flags & MNT_NOEXEC) {
		error = -EACCES;
		goto out_release;
	}

	error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
	if (error)
		goto out_release;

	file = nameidata_to_filp(&nd, O_RDONLY | O_LARGEFILE);
	if (IS_ERR(file))
		return PTR_ERR(file);

	error = -ENOEXEC;
	if (file->f_op) {
		struct linux_binfmt *fmt;

		read_lock(&binfmt_lock);
		list_for_each_entry(fmt, &formats, lh) {
			if (!fmt->load_shlib)
				continue;
			if (!try_module_get(fmt->module))
				continue;
			/*
			 * The handler is called without binfmt_lock held;
			 * the module reference taken above is held across
			 * the call instead.
			 */
			read_unlock(&binfmt_lock);
			error = fmt->load_shlib(file);
			read_lock(&binfmt_lock);
			put_binfmt(fmt);
			if (error != -ENOEXEC)
				break;
		}
		read_unlock(&binfmt_lock);
	}
	fput(file);
	return error;

out_release:
	/* Failed before the file was opened: drop the intent and the path. */
	release_open_intent(&nd);
	path_put(&nd.path);
	return error;
}
2007-07-19 01:48:16 -07:00
# ifdef CONFIG_MMU
static struct page * get_arg_page ( struct linux_binprm * bprm , unsigned long pos ,
int write )
{
struct page * page ;
int ret ;
# ifdef CONFIG_STACK_GROWSUP
if ( write ) {
ret = expand_stack_downwards ( bprm - > vma , pos ) ;
if ( ret < 0 )
return NULL ;
}
# endif
ret = get_user_pages ( current , bprm - > mm , pos ,
1 , write , 1 , & page , NULL ) ;
if ( ret < = 0 )
return NULL ;
if ( write ) {
unsigned long size = bprm - > vma - > vm_end - bprm - > vma - > vm_start ;
2008-03-03 10:12:14 -08:00
struct rlimit * rlim ;
/*
* We ' ve historically supported up to 32 pages ( ARG_MAX )
* of argument strings even with small stacks
*/
if ( size < = ARG_MAX )
return page ;
2007-07-19 01:48:16 -07:00
/*
* Limit to 1 / 4 - th the stack size for the argv + env strings .
* This ensures that :
* - the remaining binfmt code will not run out of stack space ,
* - the program will have a reasonable amount of stack left
* to work from .
*/
2008-03-03 10:12:14 -08:00
rlim = current - > signal - > rlim ;
2007-07-19 01:48:16 -07:00
if ( size > rlim [ RLIMIT_STACK ] . rlim_cur / 4 ) {
put_page ( page ) ;
return NULL ;
}
}
return page ;
}
/* MMU case: drop the page reference obtained by get_arg_page(). */
static void put_arg_page(struct page *page)
{
	put_page(page);
}
/* MMU case: intentionally a no-op. */
static void free_arg_page(struct linux_binprm *bprm, int i)
{
}
/* MMU case: intentionally a no-op. */
static void free_arg_pages(struct linux_binprm *bprm)
{
}
static void flush_arg_page ( struct linux_binprm * bprm , unsigned long pos ,
struct page * page )
{
flush_cache_page ( bprm - > vma , pos , page_to_pfn ( page ) ) ;
}
static int __bprm_mm_init ( struct linux_binprm * bprm )
{
int err = - ENOMEM ;
struct vm_area_struct * vma = NULL ;
struct mm_struct * mm = bprm - > mm ;
bprm - > vma = vma = kmem_cache_zalloc ( vm_area_cachep , GFP_KERNEL ) ;
if ( ! vma )
goto err ;
down_write ( & mm - > mmap_sem ) ;
vma - > vm_mm = mm ;
/*
* Place the stack at the largest stack address the architecture
* supports . Later , we ' ll move this to an appropriate place . We don ' t
* use STACK_TOP because that can depend on attributes which aren ' t
* configured yet .
*/
vma - > vm_end = STACK_TOP_MAX ;
vma - > vm_start = vma - > vm_end - PAGE_SIZE ;
vma - > vm_flags = VM_STACK_FLAGS ;
2007-10-18 23:39:15 -07:00
vma - > vm_page_prot = vm_get_page_prot ( vma - > vm_flags ) ;
2007-07-19 01:48:16 -07:00
err = insert_vm_struct ( mm , vma ) ;
if ( err ) {
up_write ( & mm - > mmap_sem ) ;
goto err ;
}
mm - > stack_vm = mm - > total_vm = 1 ;
up_write ( & mm - > mmap_sem ) ;
bprm - > p = vma - > vm_end - sizeof ( void * ) ;
return 0 ;
err :
if ( vma ) {
bprm - > vma = NULL ;
kmem_cache_free ( vm_area_cachep , vma ) ;
}
return err ;
}
/* MMU case: a single argument string may be at most MAX_ARG_STRLEN long. */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return !(len > MAX_ARG_STRLEN);
}
# else
static struct page * get_arg_page ( struct linux_binprm * bprm , unsigned long pos ,
int write )
{
struct page * page ;
page = bprm - > page [ pos / PAGE_SIZE ] ;
if ( ! page & & write ) {
page = alloc_page ( GFP_HIGHUSER | __GFP_ZERO ) ;
if ( ! page )
return NULL ;
bprm - > page [ pos / PAGE_SIZE ] = page ;
}
return page ;
}
/* !MMU case: intentionally a no-op. */
static void put_arg_page(struct page *page)
{
}
static void free_arg_page ( struct linux_binprm * bprm , int i )
{
if ( bprm - > page [ i ] ) {
__free_page ( bprm - > page [ i ] ) ;
bprm - > page [ i ] = NULL ;
}
}
/* !MMU case: free every one of the MAX_ARG_PAGES argument page slots. */
static void free_arg_pages(struct linux_binprm *bprm)
{
	int slot = 0;

	while (slot < MAX_ARG_PAGES) {
		free_arg_page(bprm, slot);
		slot++;
	}
}
/* !MMU case: intentionally a no-op. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
		struct page *page)
{
}
static int __bprm_mm_init ( struct linux_binprm * bprm )
{
bprm - > p = PAGE_SIZE * MAX_ARG_PAGES - sizeof ( void * ) ;
return 0 ;
}
static bool valid_arg_len ( struct linux_binprm * bprm , long len )
{
return len < = bprm - > p ;
}
# endif /* CONFIG_MMU */
/*
* Create a new mm_struct and populate it with a temporary stack
* vm_area_struct . We don ' t have enough context at this point to set the stack
* flags , permissions , and offset , so we use temporary values . We ' ll update
* them later in setup_arg_pages ( ) .
*/
int bprm_mm_init ( struct linux_binprm * bprm )
{
int err ;
struct mm_struct * mm = NULL ;
bprm - > mm = mm = mm_alloc ( ) ;
err = - ENOMEM ;
if ( ! mm )
goto err ;
err = init_new_context ( current , mm ) ;
if ( err )
goto err ;
err = __bprm_mm_init ( bprm ) ;
if ( err )
goto err ;
return 0 ;
err :
if ( mm ) {
bprm - > mm = NULL ;
mmdrop ( mm ) ;
}
return err ;
}
2005-04-16 15:20:36 -07:00
/*
 * count() counts the number of strings in array ARGV.
 *
 * Walks the user-space pointer array until a NULL entry is read. Returns
 * the number of entries, -EFAULT if an entry cannot be read, or -E2BIG if
 * more than @max entries are present. A NULL @argv counts as empty.
 */
static int count(char __user * __user *argv, int max)
{
	int nr = 0;

	if (argv == NULL)
		return 0;

	for (;;) {
		char __user *entry;

		if (get_user(entry, argv))
			return -EFAULT;
		if (entry == NULL)
			return nr;

		argv++;
		if (nr >= max)
			return -E2BIG;
		nr++;

		cond_resched();
	}
}
/*
2007-07-19 01:48:16 -07:00
* ' copy_strings ( ) ' copies argument / environment strings from the old
* processes ' s memory to the new process ' s stack . The call to get_user_pages ( )
* ensures the destination page is created and not swapped out .
2005-04-16 15:20:36 -07:00
*/
2005-05-05 16:16:09 -07:00
static int copy_strings ( int argc , char __user * __user * argv ,
struct linux_binprm * bprm )
2005-04-16 15:20:36 -07:00
{
struct page * kmapped_page = NULL ;
char * kaddr = NULL ;
2007-07-19 01:48:16 -07:00
unsigned long kpos = 0 ;
2005-04-16 15:20:36 -07:00
int ret ;
while ( argc - - > 0 ) {
char __user * str ;
int len ;
unsigned long pos ;
if ( get_user ( str , argv + argc ) | |
2007-07-19 01:48:16 -07:00
! ( len = strnlen_user ( str , MAX_ARG_STRLEN ) ) ) {
2005-04-16 15:20:36 -07:00
ret = - EFAULT ;
goto out ;
}
2007-07-19 01:48:16 -07:00
if ( ! valid_arg_len ( bprm , len ) ) {
2005-04-16 15:20:36 -07:00
ret = - E2BIG ;
goto out ;
}
2007-07-19 01:48:16 -07:00
/* We're going to work our way backwords. */
2005-04-16 15:20:36 -07:00
pos = bprm - > p ;
2007-07-19 01:48:16 -07:00
str + = len ;
bprm - > p - = len ;
2005-04-16 15:20:36 -07:00
while ( len > 0 ) {
int offset , bytes_to_copy ;
offset = pos % PAGE_SIZE ;
2007-07-19 01:48:16 -07:00
if ( offset = = 0 )
offset = PAGE_SIZE ;
bytes_to_copy = offset ;
if ( bytes_to_copy > len )
bytes_to_copy = len ;
offset - = bytes_to_copy ;
pos - = bytes_to_copy ;
str - = bytes_to_copy ;
len - = bytes_to_copy ;
if ( ! kmapped_page | | kpos ! = ( pos & PAGE_MASK ) ) {
struct page * page ;
page = get_arg_page ( bprm , pos , 1 ) ;
2005-04-16 15:20:36 -07:00
if ( ! page ) {
2007-07-19 01:48:16 -07:00
ret = - E2BIG ;
2005-04-16 15:20:36 -07:00
goto out ;
}
2007-07-19 01:48:16 -07:00
if ( kmapped_page ) {
flush_kernel_dcache_page ( kmapped_page ) ;
2005-04-16 15:20:36 -07:00
kunmap ( kmapped_page ) ;
2007-07-19 01:48:16 -07:00
put_arg_page ( kmapped_page ) ;
}
2005-04-16 15:20:36 -07:00
kmapped_page = page ;
kaddr = kmap ( kmapped_page ) ;
2007-07-19 01:48:16 -07:00
kpos = pos & PAGE_MASK ;
flush_arg_page ( bprm , kpos , kmapped_page ) ;
2005-04-16 15:20:36 -07:00
}
2007-07-19 01:48:16 -07:00
if ( copy_from_user ( kaddr + offset , str , bytes_to_copy ) ) {
2005-04-16 15:20:36 -07:00
ret = - EFAULT ;
goto out ;
}
}
}
ret = 0 ;
out :
2007-07-19 01:48:16 -07:00
if ( kmapped_page ) {
flush_kernel_dcache_page ( kmapped_page ) ;
2005-04-16 15:20:36 -07:00
kunmap ( kmapped_page ) ;
2007-07-19 01:48:16 -07:00
put_arg_page ( kmapped_page ) ;
}
2005-04-16 15:20:36 -07:00
return ret ;
}
/*
 * Like copy_strings, but get argv and its values from kernel memory.
 *
 * The address limit is switched to KERNEL_DS for the duration of the copy
 * and restored afterwards. Returns whatever copy_strings() returns.
 */
int copy_strings_kernel(int argc, char **argv, struct linux_binprm *bprm)
{
	mm_segment_t saved_fs = get_fs();
	int ret;

	set_fs(KERNEL_DS);
	ret = copy_strings(argc, (char __user * __user *)argv, bprm);
	set_fs(saved_fs);

	return ret;
}

EXPORT_SYMBOL(copy_strings_kernel);
# ifdef CONFIG_MMU
2007-07-19 01:48:16 -07:00
2005-04-16 15:20:36 -07:00
/*
2007-07-19 01:48:16 -07:00
* During bprm_mm_init ( ) , we create a temporary stack at STACK_TOP_MAX . Once
* the binfmt code determines where the new stack should reside , we shift it to
* its final location . The process proceeds as follows :
2005-04-16 15:20:36 -07:00
*
2007-07-19 01:48:16 -07:00
* 1 ) Use shift to calculate the new vma endpoints .
* 2 ) Extend vma to cover both the old and new ranges . This ensures the
* arguments passed to subsequent functions are consistent .
* 3 ) Move vma ' s page tables to the new range .
* 4 ) Free up any cleared pgd range .
* 5 ) Shrink the vma to cover only the new range .
2005-04-16 15:20:36 -07:00
*/
2007-07-19 01:48:16 -07:00
static int shift_arg_pages ( struct vm_area_struct * vma , unsigned long shift )
2005-04-16 15:20:36 -07:00
{
struct mm_struct * mm = vma - > vm_mm ;
2007-07-19 01:48:16 -07:00
unsigned long old_start = vma - > vm_start ;
unsigned long old_end = vma - > vm_end ;
unsigned long length = old_end - old_start ;
unsigned long new_start = old_start - shift ;
unsigned long new_end = old_end - shift ;
struct mmu_gather * tlb ;
2005-04-16 15:20:36 -07:00
2007-07-19 01:48:16 -07:00
BUG_ON ( new_start > new_end ) ;
2005-04-16 15:20:36 -07:00
2007-07-19 01:48:16 -07:00
/*
* ensure there are no vmas between where we want to go
* and where we are
*/
if ( vma ! = find_vma ( mm , new_start ) )
return - EFAULT ;
/*
* cover the whole range : [ new_start , old_end )
*/
vma_adjust ( vma , new_start , old_end , vma - > vm_pgoff , NULL ) ;
/*
* move the page tables downwards , on failure we rely on
* process cleanup to remove whatever mess we made .
*/
if ( length ! = move_page_tables ( vma , old_start ,
vma , new_start , length ) )
return - ENOMEM ;
lru_add_drain ( ) ;
tlb = tlb_gather_mmu ( mm , 0 ) ;
if ( new_end > old_start ) {
/*
* when the old and new regions overlap clear from new_end .
*/
2008-07-23 21:27:10 -07:00
free_pgd_range ( tlb , new_end , old_end , new_end ,
2007-07-19 01:48:16 -07:00
vma - > vm_next ? vma - > vm_next - > vm_start : 0 ) ;
} else {
/*
* otherwise , clean from old_start ; this is done to not touch
* the address space in [ new_end , old_start ) some architectures
* have constraints on va - space that make this illegal ( IA64 ) -
* for the others its just a little faster .
*/
2008-07-23 21:27:10 -07:00
free_pgd_range ( tlb , old_start , old_end , new_end ,
2007-07-19 01:48:16 -07:00
vma - > vm_next ? vma - > vm_next - > vm_start : 0 ) ;
2005-04-16 15:20:36 -07:00
}
2007-07-19 01:48:16 -07:00
tlb_finish_mmu ( tlb , new_end , old_end ) ;
/*
* shrink the vma to just the new range .
*/
vma_adjust ( vma , new_start , new_end , vma - > vm_pgoff , NULL ) ;
return 0 ;
2005-04-16 15:20:36 -07:00
}
# define EXTRA_STACK_VM_PAGES 20 /* random */
2007-07-19 01:48:16 -07:00
/*
* Finalizes the stack vm_area_struct . The flags and permissions are updated ,
* the stack is optionally relocated , and some extra space is added .
*/
2005-04-16 15:20:36 -07:00
int setup_arg_pages ( struct linux_binprm * bprm ,
unsigned long stack_top ,
int executable_stack )
{
2007-07-19 01:48:16 -07:00
unsigned long ret ;
unsigned long stack_shift ;
2005-04-16 15:20:36 -07:00
struct mm_struct * mm = current - > mm ;
2007-07-19 01:48:16 -07:00
struct vm_area_struct * vma = bprm - > vma ;
struct vm_area_struct * prev = NULL ;
unsigned long vm_flags ;
unsigned long stack_base ;
2005-04-16 15:20:36 -07:00
# ifdef CONFIG_STACK_GROWSUP
/* Limit stack size to 1GB */
stack_base = current - > signal - > rlim [ RLIMIT_STACK ] . rlim_max ;
if ( stack_base > ( 1 < < 30 ) )
stack_base = 1 < < 30 ;
2007-07-19 01:48:16 -07:00
/* Make sure we didn't let the argument array grow too large. */
if ( vma - > vm_end - vma - > vm_start > stack_base )
return - ENOMEM ;
2005-04-16 15:20:36 -07:00
2007-07-19 01:48:16 -07:00
stack_base = PAGE_ALIGN ( stack_top - stack_base ) ;
2005-04-16 15:20:36 -07:00
2007-07-19 01:48:16 -07:00
stack_shift = vma - > vm_start - stack_base ;
mm - > arg_start = bprm - > p - stack_shift ;
bprm - > p = vma - > vm_end - stack_shift ;
2005-04-16 15:20:36 -07:00
# else
2007-07-19 01:48:16 -07:00
stack_top = arch_align_stack ( stack_top ) ;
stack_top = PAGE_ALIGN ( stack_top ) ;
stack_shift = vma - > vm_end - stack_top ;
bprm - > p - = stack_shift ;
2005-04-16 15:20:36 -07:00
mm - > arg_start = bprm - > p ;
# endif
if ( bprm - > loader )
2007-07-19 01:48:16 -07:00
bprm - > loader - = stack_shift ;
bprm - > exec - = stack_shift ;
2005-04-16 15:20:36 -07:00
down_write ( & mm - > mmap_sem ) ;
2008-07-10 21:19:20 +01:00
vm_flags = VM_STACK_FLAGS ;
2007-07-19 01:48:16 -07:00
/*
* Adjust stack execute permissions ; explicitly enable for
* EXSTACK_ENABLE_X , disable for EXSTACK_DISABLE_X and leave alone
* ( arch default ) otherwise .
*/
if ( unlikely ( executable_stack = = EXSTACK_ENABLE_X ) )
vm_flags | = VM_EXEC ;
else if ( executable_stack = = EXSTACK_DISABLE_X )
vm_flags & = ~ VM_EXEC ;
vm_flags | = mm - > def_flags ;
ret = mprotect_fixup ( vma , & prev , vma - > vm_start , vma - > vm_end ,
vm_flags ) ;
if ( ret )
goto out_unlock ;
BUG_ON ( prev ! = vma ) ;
/* Move stack pages down in memory. */
if ( stack_shift ) {
ret = shift_arg_pages ( vma , stack_shift ) ;
if ( ret ) {
2005-04-16 15:20:36 -07:00
up_write ( & mm - > mmap_sem ) ;
return ret ;
}
}
2007-07-19 01:48:16 -07:00
# ifdef CONFIG_STACK_GROWSUP
stack_base = vma - > vm_end + EXTRA_STACK_VM_PAGES * PAGE_SIZE ;
# else
stack_base = vma - > vm_start - EXTRA_STACK_VM_PAGES * PAGE_SIZE ;
# endif
ret = expand_stack ( vma , stack_base ) ;
if ( ret )
ret = - EFAULT ;
out_unlock :
2005-04-16 15:20:36 -07:00
up_write ( & mm - > mmap_sem ) ;
return 0 ;
}
EXPORT_SYMBOL ( setup_arg_pages ) ;
# endif /* CONFIG_MMU */
struct file * open_exec ( const char * name )
{
struct nameidata nd ;
struct file * file ;
2008-05-19 07:53:34 +02:00
int err ;
2005-04-16 15:20:36 -07:00
2008-05-19 07:53:34 +02:00
err = path_lookup_open ( AT_FDCWD , name , LOOKUP_FOLLOW , & nd ,
FMODE_READ | FMODE_EXEC ) ;
if ( err )
goto out ;
err = - EACCES ;
if ( ! S_ISREG ( nd . path . dentry - > d_inode - > i_mode ) )
goto out_path_put ;
2008-07-22 00:02:33 -04:00
if ( nd . path . mnt - > mnt_flags & MNT_NOEXEC )
goto out_path_put ;
2008-05-19 07:53:34 +02:00
err = vfs_permission ( & nd , MAY_EXEC | MAY_OPEN ) ;
if ( err )
goto out_path_put ;
file = nameidata_to_filp ( & nd , O_RDONLY | O_LARGEFILE ) ;
if ( IS_ERR ( file ) )
return file ;
err = deny_write_access ( file ) ;
if ( err ) {
fput ( file ) ;
goto out ;
2005-04-16 15:20:36 -07:00
}
2008-05-19 07:53:34 +02:00
return file ;
out_path_put :
release_open_intent ( & nd ) ;
path_put ( & nd . path ) ;
out :
return ERR_PTR ( err ) ;
}
2005-04-16 15:20:36 -07:00
EXPORT_SYMBOL ( open_exec ) ;
/*
 * Read up to @count bytes from @file, starting at @offset, into the kernel
 * buffer @addr. Returns the result of vfs_read().
 */
int kernel_read(struct file *file, unsigned long offset,
	char *addr, unsigned long count)
{
	mm_segment_t saved_fs = get_fs();
	loff_t pos = offset;
	int nread;

	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	nread = vfs_read(file, (void __user *)addr, count, &pos);
	set_fs(saved_fs);

	return nread;
}

EXPORT_SYMBOL(kernel_read);
static int exec_mmap ( struct mm_struct * mm )
{
struct task_struct * tsk ;
struct mm_struct * old_mm , * active_mm ;
/* Notify parent that we're no longer interested in the old VM */
tsk = current ;
old_mm = current - > mm ;
mm_release ( tsk , old_mm ) ;
if ( old_mm ) {
/*
* Make sure that if there is a core dump in progress
* for the old mm , we get out and die instead of going
* through with the exec . We must hold mmap_sem around
2008-07-25 01:47:41 -07:00
* checking core_state and changing tsk - > mm .
2005-04-16 15:20:36 -07:00
*/
down_read ( & old_mm - > mmap_sem ) ;
2008-07-25 01:47:41 -07:00
if ( unlikely ( old_mm - > core_state ) ) {
2005-04-16 15:20:36 -07:00
up_read ( & old_mm - > mmap_sem ) ;
return - EINTR ;
}
}
task_lock ( tsk ) ;
active_mm = tsk - > active_mm ;
tsk - > mm = mm ;
tsk - > active_mm = mm ;
activate_mm ( active_mm , mm ) ;
task_unlock ( tsk ) ;
arch_pick_mmap_layout ( mm ) ;
if ( old_mm ) {
up_read ( & old_mm - > mmap_sem ) ;
2006-04-01 01:13:38 +02:00
BUG_ON ( active_mm ! = old_mm ) ;
mm owner: fix race between swapoff and exit
There's a race between mm->owner assignment and swapoff, more easily
seen when task slab poisoning is turned on. The condition occurs when
try_to_unuse() runs in parallel with an exiting task. A similar race
can occur with callers of get_task_mm(), such as /proc/<pid>/<mmstats>
or ptrace or page migration.
CPU0 CPU1
try_to_unuse
looks at mm = task0->mm
increments mm->mm_users
task 0 exits
mm->owner needs to be updated, but no
new owner is found (mm_users > 1, but
no other task has task->mm = task0->mm)
mm_update_next_owner() leaves
mmput(mm) decrements mm->mm_users
task0 freed
dereferencing mm->owner fails
The fix is to notify the subsystem via mm_owner_changed callback(),
if no new owner is found, by specifying the new task as NULL.
Jiri Slaby:
mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but
must be set after that, so as not to pass NULL as old owner causing oops.
Daisuke Nishimura:
mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task()
and its callers need to take account of this situation to avoid oops.
Hugh Dickins:
Lockdep warning and hang below exec_mmap() when testing these patches.
exit_mm() up_reads mmap_sem before calling mm_update_next_owner(),
so exec_mmap() now needs to do the same. And with that repositioning,
there's now no point in mm_need_new_owner() allowing for NULL mm.
Reported-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-09-28 23:09:31 +01:00
mm_update_next_owner ( old_mm ) ;
2005-04-16 15:20:36 -07:00
mmput ( old_mm ) ;
return 0 ;
}
mmdrop ( active_mm ) ;
return 0 ;
}
/*
* This function makes sure the current process has its own signal table ,
* so that flush_signal_handlers can later reset the handlers without
* disturbing other processes . ( Other processes might share the signal
* table via the CLONE_SIGHAND option to clone ( ) . )
*/
2006-01-14 13:20:43 -08:00
static int de_thread ( struct task_struct * tsk )
2005-04-16 15:20:36 -07:00
{
struct signal_struct * sig = tsk - > signal ;
2007-10-16 23:27:22 -07:00
struct sighand_struct * oldsighand = tsk - > sighand ;
2005-04-16 15:20:36 -07:00
spinlock_t * lock = & oldsighand - > siglock ;
2005-11-07 21:12:43 +03:00
struct task_struct * leader = NULL ;
2005-04-16 15:20:36 -07:00
int count ;
2006-09-27 01:51:13 -07:00
if ( thread_group_empty ( tsk ) )
2005-04-16 15:20:36 -07:00
goto no_thread_group ;
/*
* Kill all other threads in the thread group .
*/
spin_lock_irq ( lock ) ;
2008-02-04 22:27:24 -08:00
if ( signal_group_exit ( sig ) ) {
2005-04-16 15:20:36 -07:00
/*
* Another group action in progress , just
* return so that the signal is processed .
*/
spin_unlock_irq ( lock ) ;
return - EAGAIN ;
}
2008-02-04 22:27:24 -08:00
sig - > group_exit_task = tsk ;
2006-09-27 01:51:13 -07:00
zap_other_threads ( tsk ) ;
2005-04-16 15:20:36 -07:00
2008-02-08 04:19:19 -08:00
/* Account for the thread group leader hanging around: */
count = thread_group_leader ( tsk ) ? 1 : 2 ;
2007-10-16 23:27:23 -07:00
sig - > notify_count = count ;
2005-04-16 15:20:36 -07:00
while ( atomic_read ( & sig - > count ) > count ) {
__set_current_state ( TASK_UNINTERRUPTIBLE ) ;
spin_unlock_irq ( lock ) ;
schedule ( ) ;
spin_lock_irq ( lock ) ;
}
spin_unlock_irq ( lock ) ;
/*
* At this point all other threads have exited , all we have to
* do is to wait for the thread group leader to become inactive ,
* and to assume its PID :
*/
2006-09-27 01:51:13 -07:00
if ( ! thread_group_leader ( tsk ) ) {
leader = tsk - > group_leader ;
2007-10-16 23:27:23 -07:00
2008-04-30 00:53:12 -07:00
sig - > notify_count = - 1 ; /* for exit_notify() */
2007-10-16 23:27:23 -07:00
for ( ; ; ) {
write_lock_irq ( & tasklist_lock ) ;
if ( likely ( leader - > exit_state ) )
break ;
__set_current_state ( TASK_UNINTERRUPTIBLE ) ;
write_unlock_irq ( & tasklist_lock ) ;
schedule ( ) ;
}
2005-04-16 15:20:36 -07:00
2006-04-10 22:54:16 -07:00
/*
* The only record we have of the real - time age of a
* process , regardless of execs it ' s done , is start_time .
* All the past CPU time is accumulated in signal_struct
* from sister threads now dead . But in this non - leader
* exec , nothing survives from the original leader thread ,
* whose birth marks the true age of this process now .
* When we take on its identity by switching to its PID , we
* also take its birthdate ( always earlier than our own ) .
*/
2006-09-27 01:51:13 -07:00
tsk - > start_time = leader - > start_time ;
2006-04-10 22:54:16 -07:00
2007-10-18 23:40:18 -07:00
BUG_ON ( ! same_thread_group ( leader , tsk ) ) ;
BUG_ON ( has_group_leader_pid ( tsk ) ) ;
2005-04-16 15:20:36 -07:00
/*
* An exec ( ) starts a new thread group with the
* TGID of the previous thread group . Rehash the
* two threads with a switched PID , and release
* the former thread group leader :
*/
2006-03-28 16:11:03 -08:00
/* Become a process group leader with the old leader's pid.
2006-09-27 01:51:06 -07:00
* The old leader becomes a thread of the this thread group .
* Note : The old leader also uses this pid until release_task
2006-03-28 16:11:03 -08:00
* is called . Odd but simple and correct .
*/
2006-09-27 01:51:13 -07:00
detach_pid ( tsk , PIDTYPE_PID ) ;
tsk - > pid = leader - > pid ;
2007-10-18 23:39:51 -07:00
attach_pid ( tsk , PIDTYPE_PID , task_pid ( leader ) ) ;
2006-09-27 01:51:13 -07:00
transfer_pid ( leader , tsk , PIDTYPE_PGID ) ;
transfer_pid ( leader , tsk , PIDTYPE_SID ) ;
list_replace_rcu ( & leader - > tasks , & tsk - > tasks ) ;
2005-04-16 15:20:36 -07:00
2006-09-27 01:51:13 -07:00
tsk - > group_leader = tsk ;
leader - > group_leader = tsk ;
2006-04-10 17:16:49 -06:00
2006-09-27 01:51:13 -07:00
tsk - > exit_signal = SIGCHLD ;
2005-11-23 13:37:43 -08:00
BUG_ON ( leader - > exit_state ! = EXIT_ZOMBIE ) ;
leader - > exit_state = EXIT_DEAD ;
2005-04-16 15:20:36 -07:00
write_unlock_irq ( & tasklist_lock ) ;
2008-02-04 22:27:24 -08:00
}
2005-04-16 15:20:36 -07:00
2007-10-16 23:27:23 -07:00
sig - > group_exit_task = NULL ;
sig - > notify_count = 0 ;
2005-04-16 15:20:36 -07:00
no_thread_group :
exit_itimers ( sig ) ;
2008-05-26 20:55:42 +04:00
flush_itimer_signals ( ) ;
2005-11-07 21:12:43 +03:00
if ( leader )
release_task ( leader ) ;
2007-10-16 23:27:22 -07:00
if ( atomic_read ( & oldsighand - > count ) ! = 1 ) {
struct sighand_struct * newsighand ;
2005-04-16 15:20:36 -07:00
/*
2007-10-16 23:27:22 -07:00
* This - > sighand is shared with the CLONE_SIGHAND
* but not CLONE_THREAD task , switch to the new one .
2005-04-16 15:20:36 -07:00
*/
2007-10-16 23:27:22 -07:00
newsighand = kmem_cache_alloc ( sighand_cachep , GFP_KERNEL ) ;
if ( ! newsighand )
return - ENOMEM ;
2005-04-16 15:20:36 -07:00
atomic_set ( & newsighand - > count , 1 ) ;
memcpy ( newsighand - > action , oldsighand - > action ,
sizeof ( newsighand - > action ) ) ;
write_lock_irq ( & tasklist_lock ) ;
spin_lock ( & oldsighand - > siglock ) ;
2006-09-27 01:51:13 -07:00
rcu_assign_pointer ( tsk - > sighand , newsighand ) ;
2005-04-16 15:20:36 -07:00
spin_unlock ( & oldsighand - > siglock ) ;
write_unlock_irq ( & tasklist_lock ) ;
signal/timer/event: signalfd core
This patch series implements the new signalfd() system call.
I took part of the original Linus code (and you know how badly it can be
broken :), and I added even more breakage ;) Signals are fetched from the same
signal queue used by the process, so signalfd will compete with standard
kernel delivery in dequeue_signal(). If you want to reliably fetch signals on
the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This
seems to be working fine on my Dual Opteron machine. I made a quick test
program for it:
http://www.xmailserver.org/signafd-test.c
The signalfd() system call implements signal delivery into a file descriptor
receiver. The signalfd file descriptor if created with the following API:
int signalfd(int ufd, const sigset_t *mask, size_t masksize);
The "ufd" parameter allows to change an existing signalfd sigmask, w/out going
to close/create cycle (Linus idea). Use "ufd" == -1 if you want a brand new
signalfd file.
The "mask" allows to specify the signal mask of signals that we are interested
in. The "masksize" parameter is the size of "mask".
The signalfd fd supports the poll(2) and read(2) system calls. The poll(2)
will return POLLIN when signals are available to be dequeued. As a direct
consequence of supporting the Linux poll subsystem, the signalfd fd can use
used together with epoll(2) too.
The read(2) system call will return a "struct signalfd_siginfo" structure in
the userspace supplied buffer. The return value is the number of bytes copied
in the supplied buffer, or -1 in case of error. The read(2) call can also
return 0, in case the sighand structure to which the signalfd was attached,
has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will
return -EAGAIN in case no signal is available.
If the size of the buffer passed to read(2) is lower than sizeof(struct
signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also
return -ERESTARTSYS in case a signal hits the process. The format of the
struct signalfd_siginfo is, and the valid fields depends of the (->code &
__SI_MASK) value, in the same way a struct siginfo would:
struct signalfd_siginfo {
__u32 signo; /* si_signo */
__s32 err; /* si_errno */
__s32 code; /* si_code */
__u32 pid; /* si_pid */
__u32 uid; /* si_uid */
__s32 fd; /* si_fd */
__u32 tid; /* si_fd */
__u32 band; /* si_band */
__u32 overrun; /* si_overrun */
__u32 trapno; /* si_trapno */
__s32 status; /* si_status */
__s32 svint; /* si_int */
__u64 svptr; /* si_ptr */
__u64 utime; /* si_utime */
__u64 stime; /* si_stime */
__u64 addr; /* si_addr */
};
[akpm@linux-foundation.org: fix signalfd_copyinfo() on i386]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-10 22:23:13 -07:00
__cleanup_sighand ( oldsighand ) ;
2005-04-16 15:20:36 -07:00
}
2006-09-27 01:51:13 -07:00
BUG_ON ( ! thread_group_leader ( tsk ) ) ;
2005-04-16 15:20:36 -07:00
return 0 ;
}
2007-10-16 23:27:22 -07:00
2005-04-16 15:20:36 -07:00
/*
* These functions flushes out all traces of the currently running executable
* so that a new one can be started
*/
2006-01-14 13:20:43 -08:00
static void flush_old_files ( struct files_struct * files )
2005-04-16 15:20:36 -07:00
{
long j = - 1 ;
2005-09-09 13:04:10 -07:00
struct fdtable * fdt ;
2005-04-16 15:20:36 -07:00
spin_lock ( & files - > file_lock ) ;
for ( ; ; ) {
unsigned long set , i ;
j + + ;
i = j * __NFDBITS ;
2005-09-09 13:04:10 -07:00
fdt = files_fdtable ( files ) ;
2006-12-10 02:21:12 -08:00
if ( i > = fdt - > max_fds )
2005-04-16 15:20:36 -07:00
break ;
2005-09-09 13:04:10 -07:00
set = fdt - > close_on_exec - > fds_bits [ j ] ;
2005-04-16 15:20:36 -07:00
if ( ! set )
continue ;
2005-09-09 13:04:10 -07:00
fdt - > close_on_exec - > fds_bits [ j ] = 0 ;
2005-04-16 15:20:36 -07:00
spin_unlock ( & files - > file_lock ) ;
for ( ; set ; i + + , set > > = 1 ) {
if ( set & 1 ) {
sys_close ( i ) ;
}
}
spin_lock ( & files - > file_lock ) ;
}
spin_unlock ( & files - > file_lock ) ;
}
2008-02-04 22:27:21 -08:00
char * get_task_comm ( char * buf , struct task_struct * tsk )
2005-04-16 15:20:36 -07:00
{
/* buf must be at least sizeof(tsk->comm) in size */
task_lock ( tsk ) ;
strncpy ( buf , tsk - > comm , sizeof ( tsk - > comm ) ) ;
task_unlock ( tsk ) ;
2008-02-04 22:27:21 -08:00
return buf ;
2005-04-16 15:20:36 -07:00
}
void set_task_comm ( struct task_struct * tsk , char * buf )
{
task_lock ( tsk ) ;
strlcpy ( tsk - > comm , buf , sizeof ( tsk - > comm ) ) ;
task_unlock ( tsk ) ;
}
int flush_old_exec ( struct linux_binprm * bprm )
{
char * name ;
int i , ch , retval ;
char tcomm [ sizeof ( current - > comm ) ] ;
/*
* Make sure we have a private signal table and that
* we are unassociated from the previous thread group .
*/
retval = de_thread ( current ) ;
if ( retval )
goto out ;
2008-04-29 01:01:36 -07:00
set_mm_exe_file ( bprm - > mm , bprm - > file ) ;
2005-04-16 15:20:36 -07:00
/*
* Release all of the old mmap stuff
*/
retval = exec_mmap ( bprm - > mm ) ;
if ( retval )
2008-04-22 05:11:59 -04:00
goto out ;
2005-04-16 15:20:36 -07:00
bprm - > mm = NULL ; /* We're using it now */
/* This is the point of no return */
current - > sas_ss_sp = current - > sas_ss_size = 0 ;
if ( current - > euid = = current - > uid & & current - > egid = = current - > gid )
2007-07-19 01:48:27 -07:00
set_dumpable ( current - > mm , 1 ) ;
2005-06-23 00:09:43 -07:00
else
2007-07-19 01:48:27 -07:00
set_dumpable ( current - > mm , suid_dumpable ) ;
2005-06-23 00:09:43 -07:00
2005-04-16 15:20:36 -07:00
name = bprm - > filename ;
2005-05-05 16:16:12 -07:00
/* Copies the binary name from after last slash */
2005-04-16 15:20:36 -07:00
for ( i = 0 ; ( ch = * ( name + + ) ) ! = ' \0 ' ; ) {
if ( ch = = ' / ' )
2005-05-05 16:16:12 -07:00
i = 0 ; /* overwrite what we wrote */
2005-04-16 15:20:36 -07:00
else
if ( i < ( sizeof ( tcomm ) - 1 ) )
tcomm [ i + + ] = ch ;
}
tcomm [ i ] = ' \0 ' ;
set_task_comm ( current , tcomm ) ;
current - > flags & = ~ PF_RANDOMIZE ;
flush_thread ( ) ;
2006-02-28 16:59:19 -08:00
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread ( ) on
* some architectures like powerpc
*/
current - > mm - > task_size = TASK_SIZE ;
2007-08-17 21:47:58 +02:00
if ( bprm - > e_uid ! = current - > euid | | bprm - > e_gid ! = current - > egid ) {
suid_keys ( current ) ;
set_dumpable ( current - > mm , suid_dumpable ) ;
current - > pdeath_signal = 0 ;
} else if ( file_permission ( bprm - > file , MAY_READ ) | |
( bprm - > interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ) ) {
2005-04-16 15:20:36 -07:00
suid_keys ( current ) ;
2007-07-19 01:48:27 -07:00
set_dumpable ( current - > mm , suid_dumpable ) ;
2005-04-16 15:20:36 -07:00
}
/* An exec changes our domain. We are no longer part of the thread
group */
current - > self_exec_id + + ;
flush_signal_handlers ( current , 0 ) ;
flush_old_files ( current - > files ) ;
return 0 ;
out :
return retval ;
}
EXPORT_SYMBOL ( flush_old_exec ) ;
/*
* Fill the binprm structure from the inode .
* Check permissions , then read the first 128 ( BINPRM_BUF_SIZE ) bytes
*/
int prepare_binprm ( struct linux_binprm * bprm )
{
int mode ;
2006-12-08 02:36:35 -08:00
struct inode * inode = bprm - > file - > f_path . dentry - > d_inode ;
2005-04-16 15:20:36 -07:00
int retval ;
mode = inode - > i_mode ;
if ( bprm - > file - > f_op = = NULL )
return - EACCES ;
bprm - > e_uid = current - > euid ;
bprm - > e_gid = current - > egid ;
2006-12-08 02:36:35 -08:00
if ( ! ( bprm - > file - > f_path . mnt - > mnt_flags & MNT_NOSUID ) ) {
2005-04-16 15:20:36 -07:00
/* Set-uid? */
if ( mode & S_ISUID ) {
current - > personality & = ~ PER_CLEAR_ON_SETID ;
bprm - > e_uid = inode - > i_uid ;
}
/* Set-gid? */
/*
* If setgid is set but no group execute bit then this
* is a candidate for mandatory locking , not a setgid
* executable .
*/
if ( ( mode & ( S_ISGID | S_IXGRP ) ) = = ( S_ISGID | S_IXGRP ) ) {
current - > personality & = ~ PER_CLEAR_ON_SETID ;
bprm - > e_gid = inode - > i_gid ;
}
}
/* fill in binprm security blob */
retval = security_bprm_set ( bprm ) ;
if ( retval )
return retval ;
memset ( bprm - > buf , 0 , BINPRM_BUF_SIZE ) ;
return kernel_read ( bprm - > file , 0 , bprm - > buf , BINPRM_BUF_SIZE ) ;
}
EXPORT_SYMBOL ( prepare_binprm ) ;
2006-01-14 13:20:43 -08:00
static int unsafe_exec ( struct task_struct * p )
2005-04-16 15:20:36 -07:00
{
2008-07-25 19:45:44 -07:00
int unsafe = tracehook_unsafe_exec ( p ) ;
2005-04-16 15:20:36 -07:00
if ( atomic_read ( & p - > fs - > count ) > 1 | |
atomic_read ( & p - > files - > count ) > 1 | |
atomic_read ( & p - > sighand - > count ) > 1 )
unsafe | = LSM_UNSAFE_SHARE ;
return unsafe ;
}
void compute_creds ( struct linux_binprm * bprm )
{
int unsafe ;
2007-08-17 21:47:58 +02:00
if ( bprm - > e_uid ! = current - > uid ) {
2005-04-16 15:20:36 -07:00
suid_keys ( current ) ;
2007-08-17 21:47:58 +02:00
current - > pdeath_signal = 0 ;
}
2005-04-16 15:20:36 -07:00
exec_keys ( current ) ;
task_lock ( current ) ;
unsafe = unsafe_exec ( current ) ;
security_bprm_apply_creds ( bprm , unsafe ) ;
task_unlock ( current ) ;
security_bprm_post_apply_creds ( bprm ) ;
}
EXPORT_SYMBOL ( compute_creds ) ;
2007-05-08 00:25:16 -07:00
/*
* Arguments are ' \0 ' separated strings found at the location bprm - > p
* points to ; chop off the first by relocating brpm - > p to right after
* the first ' \0 ' encountered .
*/
2007-07-19 01:48:16 -07:00
int remove_arg_zero ( struct linux_binprm * bprm )
2005-04-16 15:20:36 -07:00
{
2007-07-19 01:48:16 -07:00
int ret = 0 ;
unsigned long offset ;
char * kaddr ;
struct page * page ;
2007-05-08 00:25:16 -07:00
2007-07-19 01:48:16 -07:00
if ( ! bprm - > argc )
return 0 ;
2005-04-16 15:20:36 -07:00
2007-07-19 01:48:16 -07:00
do {
offset = bprm - > p & ~ PAGE_MASK ;
page = get_arg_page ( bprm , bprm - > p , 0 ) ;
if ( ! page ) {
ret = - EFAULT ;
goto out ;
}
kaddr = kmap_atomic ( page , KM_USER0 ) ;
2007-05-08 00:25:16 -07:00
2007-07-19 01:48:16 -07:00
for ( ; offset < PAGE_SIZE & & kaddr [ offset ] ;
offset + + , bprm - > p + + )
;
2007-05-08 00:25:16 -07:00
2007-07-19 01:48:16 -07:00
kunmap_atomic ( kaddr , KM_USER0 ) ;
put_arg_page ( page ) ;
2007-05-08 00:25:16 -07:00
2007-07-19 01:48:16 -07:00
if ( offset = = PAGE_SIZE )
free_arg_page ( bprm , ( bprm - > p > > PAGE_SHIFT ) - 1 ) ;
} while ( offset = = PAGE_SIZE ) ;
2007-05-08 00:25:16 -07:00
2007-07-19 01:48:16 -07:00
bprm - > p + + ;
bprm - > argc - - ;
ret = 0 ;
2007-05-08 00:25:16 -07:00
2007-07-19 01:48:16 -07:00
out :
return ret ;
2005-04-16 15:20:36 -07:00
}
EXPORT_SYMBOL ( remove_arg_zero ) ;
/*
* cycle the list of binary formats handler , until one recognizes the image
*/
int search_binary_handler ( struct linux_binprm * bprm , struct pt_regs * regs )
{
tracehook: exec double-reporting fix
The patch 6341c39 "tracehook: exec" introduced a small regression in
2.6.27 regarding binfmt_misc exec event reporting. Since the reporting
is now done in the common search_binary_handler() function, an exec
of a misc binary will result in two (or possibly multiple) exec events
being reported, instead of just a single one, because the misc handler
contains a recursive call to search_binary_handler.
To add to the confusion, if PTRACE_O_TRACEEXEC is not active, the multiple
SIGTRAP signals will in fact cause only a single ptrace intercept, as the
signals are not queued. However, if PTRACE_O_TRACEEXEC is on, the debugger
will actually see multiple ptrace intercepts (PTRACE_EVENT_EXEC).
The test program included below demonstrates the problem.
This change fixes the bug by calling tracehook_report_exec() only in the
outermost search_binary_handler() call (bprm->recursion_depth == 0).
The additional change to restore bprm->recursion_depth after each binfmt
load_binary call is actually superfluous for this bug, since we test the
value saved on entry to search_binary_handler(). But it keeps the use of
of the depth count to its most obvious expected meaning. Depending on what
binfmt handlers do in certain cases, there could have been false-positive
tests for recursion limits before this change.
/* Test program using PTRACE_O_TRACEEXEC.
This forks and exec's the first argument with the rest of the arguments,
while ptrace'ing. It expects to see one PTRACE_EVENT_EXEC stop and
then a successful exit, with no other signals or events in between.
Test for kernel doing two PTRACE_EVENT_EXEC stops for a binfmt_misc exec:
$ gcc -g traceexec.c -o traceexec
$ sudo sh -c 'echo :test:M::foobar::/bin/cat: > /proc/sys/fs/binfmt_misc/register'
$ echo 'foobar test' > ./foobar
$ chmod +x ./foobar
$ ./traceexec ./foobar; echo $?
==> good <==
foobar test
0
$
==> bad <==
foobar test
unexpected status 0x4057f != 0
3
$
*/
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
static void
wait_for (pid_t child, int expect)
{
int status;
pid_t p = wait (&status);
if (p != child)
{
perror ("wait");
exit (2);
}
if (status != expect)
{
fprintf (stderr, "unexpected status %#x != %#x\n", status, expect);
exit (3);
}
}
int
main (int argc, char **argv)
{
pid_t child = fork ();
if (child < 0)
{
perror ("fork");
return 127;
}
else if (child == 0)
{
ptrace (PTRACE_TRACEME);
raise (SIGUSR1);
execv (argv[1], &argv[1]);
perror ("execve");
_exit (127);
}
wait_for (child, W_STOPCODE (SIGUSR1));
if (ptrace (PTRACE_SETOPTIONS, child,
0L, (void *) (long) PTRACE_O_TRACEEXEC) != 0)
{
perror ("PTRACE_SETOPTIONS");
return 4;
}
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 5;
}
wait_for (child, W_STOPCODE (SIGTRAP | (PTRACE_EVENT_EXEC << 8)));
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 6;
}
wait_for (child, W_EXITCODE (0, 0));
return 0;
}
Reported-by: Arnd Bergmann <arnd@arndb.de>
CC: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
2008-12-09 19:36:38 -08:00
unsigned int depth = bprm - > recursion_depth ;
2005-04-16 15:20:36 -07:00
int try , retval ;
struct linux_binfmt * fmt ;
2008-06-16 12:11:54 +01:00
# ifdef __alpha__
2005-04-16 15:20:36 -07:00
/* handle /sbin/loader.. */
{
struct exec * eh = ( struct exec * ) bprm - > buf ;
if ( ! bprm - > loader & & eh - > fh . f_magic = = 0x183 & &
( eh - > fh . f_flags & 0x3000 ) = = 0x3000 )
{
struct file * file ;
unsigned long loader ;
allow_write_access ( bprm - > file ) ;
fput ( bprm - > file ) ;
bprm - > file = NULL ;
2007-07-19 01:48:16 -07:00
loader = bprm - > vma - > vm_end - sizeof ( void * ) ;
2005-04-16 15:20:36 -07:00
file = open_exec ( " /sbin/loader " ) ;
retval = PTR_ERR ( file ) ;
if ( IS_ERR ( file ) )
return retval ;
/* Remember if the application is TASO. */
2008-10-15 22:02:37 -07:00
bprm - > taso = eh - > ah . entry < 0x100000000UL ;
2005-04-16 15:20:36 -07:00
bprm - > file = file ;
bprm - > loader = loader ;
retval = prepare_binprm ( bprm ) ;
if ( retval < 0 )
return retval ;
/* should call search_binary_handler recursively here,
but it does not matter */
}
}
# endif
retval = security_bprm_check ( bprm ) ;
if ( retval )
return retval ;
/* kernel module loader fixup */
/* so we don't try to load run modprobe in kernel space. */
set_fs ( USER_DS ) ;
2006-04-26 14:04:08 -04:00
retval = audit_bprm ( bprm ) ;
if ( retval )
return retval ;
2005-04-16 15:20:36 -07:00
retval = - ENOENT ;
for ( try = 0 ; try < 2 ; try + + ) {
read_lock ( & binfmt_lock ) ;
2007-10-16 23:26:03 -07:00
list_for_each_entry ( fmt , & formats , lh ) {
2005-04-16 15:20:36 -07:00
int ( * fn ) ( struct linux_binprm * , struct pt_regs * ) = fmt - > load_binary ;
if ( ! fn )
continue ;
if ( ! try_module_get ( fmt - > module ) )
continue ;
read_unlock ( & binfmt_lock ) ;
retval = fn ( bprm , regs ) ;
tracehook: exec double-reporting fix
The patch 6341c39 "tracehook: exec" introduced a small regression in
2.6.27 regarding binfmt_misc exec event reporting. Since the reporting
is now done in the common search_binary_handler() function, an exec
of a misc binary will result in two (or possibly multiple) exec events
being reported, instead of just a single one, because the misc handler
contains a recursive call to search_binary_handler.
To add to the confusion, if PTRACE_O_TRACEEXEC is not active, the multiple
SIGTRAP signals will in fact cause only a single ptrace intercept, as the
signals are not queued. However, if PTRACE_O_TRACEEXEC is on, the debugger
will actually see multiple ptrace intercepts (PTRACE_EVENT_EXEC).
The test program included below demonstrates the problem.
This change fixes the bug by calling tracehook_report_exec() only in the
outermost search_binary_handler() call (bprm->recursion_depth == 0).
The additional change to restore bprm->recursion_depth after each binfmt
load_binary call is actually superfluous for this bug, since we test the
value saved on entry to search_binary_handler(). But it keeps the use of
of the depth count to its most obvious expected meaning. Depending on what
binfmt handlers do in certain cases, there could have been false-positive
tests for recursion limits before this change.
/* Test program using PTRACE_O_TRACEEXEC.
This forks and exec's the first argument with the rest of the arguments,
while ptrace'ing. It expects to see one PTRACE_EVENT_EXEC stop and
then a successful exit, with no other signals or events in between.
Test for kernel doing two PTRACE_EVENT_EXEC stops for a binfmt_misc exec:
$ gcc -g traceexec.c -o traceexec
$ sudo sh -c 'echo :test:M::foobar::/bin/cat: > /proc/sys/fs/binfmt_misc/register'
$ echo 'foobar test' > ./foobar
$ chmod +x ./foobar
$ ./traceexec ./foobar; echo $?
==> good <==
foobar test
0
$
==> bad <==
foobar test
unexpected status 0x4057f != 0
3
$
*/
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
static void
wait_for (pid_t child, int expect)
{
int status;
pid_t p = wait (&status);
if (p != child)
{
perror ("wait");
exit (2);
}
if (status != expect)
{
fprintf (stderr, "unexpected status %#x != %#x\n", status, expect);
exit (3);
}
}
int
main (int argc, char **argv)
{
pid_t child = fork ();
if (child < 0)
{
perror ("fork");
return 127;
}
else if (child == 0)
{
ptrace (PTRACE_TRACEME);
raise (SIGUSR1);
execv (argv[1], &argv[1]);
perror ("execve");
_exit (127);
}
wait_for (child, W_STOPCODE (SIGUSR1));
if (ptrace (PTRACE_SETOPTIONS, child,
0L, (void *) (long) PTRACE_O_TRACEEXEC) != 0)
{
perror ("PTRACE_SETOPTIONS");
return 4;
}
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 5;
}
wait_for (child, W_STOPCODE (SIGTRAP | (PTRACE_EVENT_EXEC << 8)));
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 6;
}
wait_for (child, W_EXITCODE (0, 0));
return 0;
}
Reported-by: Arnd Bergmann <arnd@arndb.de>
CC: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
2008-12-09 19:36:38 -08:00
/*
* Restore the depth counter to its starting value
* in this call , so we don ' t have to rely on every
* load_binary function to restore it on return .
*/
bprm - > recursion_depth = depth ;
2005-04-16 15:20:36 -07:00
if ( retval > = 0 ) {
tracehook: exec double-reporting fix
The patch 6341c39 "tracehook: exec" introduced a small regression in
2.6.27 regarding binfmt_misc exec event reporting. Since the reporting
is now done in the common search_binary_handler() function, an exec
of a misc binary will result in two (or possibly multiple) exec events
being reported, instead of just a single one, because the misc handler
contains a recursive call to search_binary_handler.
To add to the confusion, if PTRACE_O_TRACEEXEC is not active, the multiple
SIGTRAP signals will in fact cause only a single ptrace intercept, as the
signals are not queued. However, if PTRACE_O_TRACEEXEC is on, the debugger
will actually see multiple ptrace intercepts (PTRACE_EVENT_EXEC).
The test program included below demonstrates the problem.
This change fixes the bug by calling tracehook_report_exec() only in the
outermost search_binary_handler() call (bprm->recursion_depth == 0).
The additional change to restore bprm->recursion_depth after each binfmt
load_binary call is actually superfluous for this bug, since we test the
value saved on entry to search_binary_handler(). But it keeps the use of
of the depth count to its most obvious expected meaning. Depending on what
binfmt handlers do in certain cases, there could have been false-positive
tests for recursion limits before this change.
/* Test program using PTRACE_O_TRACEEXEC.
This forks and exec's the first argument with the rest of the arguments,
while ptrace'ing. It expects to see one PTRACE_EVENT_EXEC stop and
then a successful exit, with no other signals or events in between.
Test for kernel doing two PTRACE_EVENT_EXEC stops for a binfmt_misc exec:
$ gcc -g traceexec.c -o traceexec
$ sudo sh -c 'echo :test:M::foobar::/bin/cat: > /proc/sys/fs/binfmt_misc/register'
$ echo 'foobar test' > ./foobar
$ chmod +x ./foobar
$ ./traceexec ./foobar; echo $?
==> good <==
foobar test
0
$
==> bad <==
foobar test
unexpected status 0x4057f != 0
3
$
*/
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
static void
wait_for (pid_t child, int expect)
{
int status;
pid_t p = wait (&status);
if (p != child)
{
perror ("wait");
exit (2);
}
if (status != expect)
{
fprintf (stderr, "unexpected status %#x != %#x\n", status, expect);
exit (3);
}
}
int
main (int argc, char **argv)
{
pid_t child = fork ();
if (child < 0)
{
perror ("fork");
return 127;
}
else if (child == 0)
{
ptrace (PTRACE_TRACEME);
raise (SIGUSR1);
execv (argv[1], &argv[1]);
perror ("execve");
_exit (127);
}
wait_for (child, W_STOPCODE (SIGUSR1));
if (ptrace (PTRACE_SETOPTIONS, child,
0L, (void *) (long) PTRACE_O_TRACEEXEC) != 0)
{
perror ("PTRACE_SETOPTIONS");
return 4;
}
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 5;
}
wait_for (child, W_STOPCODE (SIGTRAP | (PTRACE_EVENT_EXEC << 8)));
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 6;
}
wait_for (child, W_EXITCODE (0, 0));
return 0;
}
Reported-by: Arnd Bergmann <arnd@arndb.de>
CC: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
2008-12-09 19:36:38 -08:00
if ( depth = = 0 )
tracehook_report_exec ( fmt , bprm , regs ) ;
2005-04-16 15:20:36 -07:00
put_binfmt ( fmt ) ;
allow_write_access ( bprm - > file ) ;
if ( bprm - > file )
fput ( bprm - > file ) ;
bprm - > file = NULL ;
current - > did_exec = 1 ;
2005-11-07 00:59:16 -08:00
proc_exec_connector ( current ) ;
2005-04-16 15:20:36 -07:00
return retval ;
}
read_lock ( & binfmt_lock ) ;
put_binfmt ( fmt ) ;
if ( retval ! = - ENOEXEC | | bprm - > mm = = NULL )
break ;
if ( ! bprm - > file ) {
read_unlock ( & binfmt_lock ) ;
return retval ;
}
}
read_unlock ( & binfmt_lock ) ;
if ( retval ! = - ENOEXEC | | bprm - > mm = = NULL ) {
break ;
2008-07-09 10:28:40 +02:00
# ifdef CONFIG_MODULES
} else {
2005-04-16 15:20:36 -07:00
# define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
if ( printable ( bprm - > buf [ 0 ] ) & &
printable ( bprm - > buf [ 1 ] ) & &
printable ( bprm - > buf [ 2 ] ) & &
printable ( bprm - > buf [ 3 ] ) )
break ; /* -ENOEXEC */
request_module ( " binfmt-%04x " , * ( unsigned short * ) ( & bprm - > buf [ 2 ] ) ) ;
# endif
}
}
return retval ;
}
EXPORT_SYMBOL ( search_binary_handler ) ;
2008-05-10 16:38:25 -04:00
/*
 * Dispose of @bprm: release its argument pages via free_arg_pages(),
 * then kfree() the structure itself.  @bprm must not be used afterwards.
 */
void free_bprm(struct linux_binprm *bprm)
{
	free_arg_pages(bprm);

	kfree(bprm);
}
2005-04-16 15:20:36 -07:00
/*
 * sys_execve() executes a new program.
 *
 * Sets up a linux_binprm for @filename, copies the filename, environment
 * and argument strings into it, and hands it to search_binary_handler().
 *
 * Returns the non-negative handler result on success, otherwise a negative
 * error after undoing whatever had been set up so far.
 */
int do_execve(char *filename,
	char __user *__user *argv,
	char __user *__user *envp,
	struct pt_regs *regs)
{
	struct files_struct *displaced;
	struct linux_binprm *bprm;
	struct file *file;
	int retval;

	retval = unshare_files(&displaced);
	if (retval)
		return retval;

	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
	if (!bprm) {
		retval = -ENOMEM;
		goto out_files;
	}

	file = open_exec(filename);
	if (IS_ERR(file)) {
		retval = PTR_ERR(file);
		goto out_kfree;
	}

	sched_exec();

	bprm->file = file;
	bprm->filename = filename;
	bprm->interp = filename;

	retval = bprm_mm_init(bprm);
	if (retval)
		goto out_file;

	/* a negative count is an error code */
	bprm->argc = count(argv, MAX_ARG_STRINGS);
	retval = bprm->argc;
	if (retval < 0)
		goto out_mm;

	bprm->envc = count(envp, MAX_ARG_STRINGS);
	retval = bprm->envc;
	if (retval < 0)
		goto out_mm;

	retval = security_bprm_alloc(bprm);
	if (retval)
		goto out;

	retval = prepare_binprm(bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings_kernel(1, &bprm->filename, bprm);
	if (retval < 0)
		goto out;

	bprm->exec = bprm->p;

	retval = copy_strings(bprm->envc, envp, bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings(bprm->argc, argv, bprm);
	if (retval < 0)
		goto out;

	current->flags &= ~PF_KTHREAD;

	retval = search_binary_handler(bprm, regs);
	if (retval < 0)
		goto out;

	/* execve success */
	security_bprm_free(bprm);
	acct_update_integrals(current);
	free_bprm(bprm);
	if (displaced)
		put_files_struct(displaced);
	return retval;

	/* error unwinding, in reverse order of setup */
out:
	if (bprm->security)
		security_bprm_free(bprm);

out_mm:
	if (bprm->mm)
		mmput(bprm->mm);

out_file:
	if (bprm->file) {
		allow_write_access(bprm->file);
		fput(bprm->file);
	}

out_kfree:
	free_bprm(bprm);

out_files:
	if (displaced)
		reset_files_struct(displaced);

	return retval;
}
int set_binfmt ( struct linux_binfmt * new )
{
struct linux_binfmt * old = current - > binfmt ;
if ( new ) {
if ( ! try_module_get ( new - > module ) )
return - 1 ;
}
current - > binfmt = new ;
if ( old )
module_put ( old - > module ) ;
return 0 ;
}
EXPORT_SYMBOL ( set_binfmt ) ;
/* format_corename will inspect the pattern parameter, and output a
* name into corename , which must have space for at least
* CORENAME_MAX_SIZE bytes plus one byte for the zero terminator .
*/
2008-10-18 20:28:22 -07:00
static int format_corename ( char * corename , long signr )
2005-04-16 15:20:36 -07:00
{
2008-07-25 01:47:47 -07:00
const char * pat_ptr = core_pattern ;
int ispipe = ( * pat_ptr = = ' | ' ) ;
2005-04-16 15:20:36 -07:00
char * out_ptr = corename ;
char * const out_end = corename + CORENAME_MAX_SIZE ;
int rc ;
int pid_in_pattern = 0 ;
/* Repeat as long as we have more pattern to process and more output
space */
while ( * pat_ptr ) {
if ( * pat_ptr ! = ' % ' ) {
if ( out_ptr = = out_end )
goto out ;
* out_ptr + + = * pat_ptr + + ;
} else {
switch ( * + + pat_ptr ) {
case 0 :
goto out ;
/* Double percent, output one percent */
case ' % ' :
if ( out_ptr = = out_end )
goto out ;
* out_ptr + + = ' % ' ;
break ;
/* pid */
case ' p ' :
pid_in_pattern = 1 ;
rc = snprintf ( out_ptr , out_end - out_ptr ,
2007-10-18 23:40:14 -07:00
" %d " , task_tgid_vnr ( current ) ) ;
2005-04-16 15:20:36 -07:00
if ( rc > out_end - out_ptr )
goto out ;
out_ptr + = rc ;
break ;
/* uid */
case ' u ' :
rc = snprintf ( out_ptr , out_end - out_ptr ,
" %d " , current - > uid ) ;
if ( rc > out_end - out_ptr )
goto out ;
out_ptr + = rc ;
break ;
/* gid */
case ' g ' :
rc = snprintf ( out_ptr , out_end - out_ptr ,
" %d " , current - > gid ) ;
if ( rc > out_end - out_ptr )
goto out ;
out_ptr + = rc ;
break ;
/* signal that caused the coredump */
case ' s ' :
rc = snprintf ( out_ptr , out_end - out_ptr ,
" %ld " , signr ) ;
if ( rc > out_end - out_ptr )
goto out ;
out_ptr + = rc ;
break ;
/* UNIX time of coredump */
case ' t ' : {
struct timeval tv ;
do_gettimeofday ( & tv ) ;
rc = snprintf ( out_ptr , out_end - out_ptr ,
" %lu " , tv . tv_sec ) ;
if ( rc > out_end - out_ptr )
goto out ;
out_ptr + = rc ;
break ;
}
/* hostname */
case ' h ' :
down_read ( & uts_sem ) ;
rc = snprintf ( out_ptr , out_end - out_ptr ,
2006-10-02 02:18:11 -07:00
" %s " , utsname ( ) - > nodename ) ;
2005-04-16 15:20:36 -07:00
up_read ( & uts_sem ) ;
if ( rc > out_end - out_ptr )
goto out ;
out_ptr + = rc ;
break ;
/* executable */
case ' e ' :
rc = snprintf ( out_ptr , out_end - out_ptr ,
" %s " , current - > comm ) ;
if ( rc > out_end - out_ptr )
goto out ;
out_ptr + = rc ;
break ;
2007-10-16 23:26:35 -07:00
/* core limit size */
case ' c ' :
rc = snprintf ( out_ptr , out_end - out_ptr ,
" %lu " , current - > signal - > rlim [ RLIMIT_CORE ] . rlim_cur ) ;
if ( rc > out_end - out_ptr )
goto out ;
out_ptr + = rc ;
break ;
2005-04-16 15:20:36 -07:00
default :
break ;
}
+ + pat_ptr ;
}
}
/* Backward compatibility with core_uses_pid:
*
* If core_pattern does not include a % p ( as is the default )
* and core_uses_pid is set , then . % pid will be appended to
2007-04-16 22:53:13 -07:00
* the filename . Do not do this for piped commands . */
2008-10-18 20:28:22 -07:00
if ( ! ispipe & & ! pid_in_pattern & & core_uses_pid ) {
2005-04-16 15:20:36 -07:00
rc = snprintf ( out_ptr , out_end - out_ptr ,
2007-10-18 23:40:14 -07:00
" .%d " , task_tgid_vnr ( current ) ) ;
2005-04-16 15:20:36 -07:00
if ( rc > out_end - out_ptr )
goto out ;
out_ptr + = rc ;
}
2007-04-16 22:53:13 -07:00
out :
2005-04-16 15:20:36 -07:00
* out_ptr = 0 ;
2007-04-16 22:53:13 -07:00
return ispipe ;
2005-04-16 15:20:36 -07:00
}
2008-07-25 01:47:42 -07:00
static int zap_process ( struct task_struct * start )
2006-06-26 00:26:05 -07:00
{
struct task_struct * t ;
2008-07-25 01:47:42 -07:00
int nr = 0 ;
2006-06-26 00:26:06 -07:00
2006-06-26 00:26:07 -07:00
start - > signal - > flags = SIGNAL_GROUP_EXIT ;
start - > signal - > group_stop_count = 0 ;
2006-06-26 00:26:05 -07:00
t = start ;
do {
if ( t ! = current & & t - > mm ) {
2006-06-26 00:26:06 -07:00
sigaddset ( & t - > pending . signal , SIGKILL ) ;
signal_wake_up ( t , 1 ) ;
2008-07-25 01:47:42 -07:00
nr + + ;
2006-06-26 00:26:05 -07:00
}
2008-07-25 01:47:31 -07:00
} while_each_thread ( start , t ) ;
2008-07-25 01:47:42 -07:00
return nr ;
2006-06-26 00:26:05 -07:00
}
2006-06-26 00:26:08 -07:00
static inline int zap_threads ( struct task_struct * tsk , struct mm_struct * mm ,
2008-07-25 01:47:42 -07:00
struct core_state * core_state , int exit_code )
2005-04-16 15:20:36 -07:00
{
struct task_struct * g , * p ;
2006-06-26 00:26:09 -07:00
unsigned long flags ;
2008-07-25 01:47:42 -07:00
int nr = - EAGAIN ;
2006-06-26 00:26:08 -07:00
spin_lock_irq ( & tsk - > sighand - > siglock ) ;
2008-02-04 22:27:24 -08:00
if ( ! signal_group_exit ( tsk - > signal ) ) {
2008-07-25 01:47:42 -07:00
mm - > core_state = core_state ;
2006-06-26 00:26:08 -07:00
tsk - > signal - > group_exit_code = exit_code ;
2008-07-25 01:47:42 -07:00
nr = zap_process ( tsk ) ;
2005-04-16 15:20:36 -07:00
}
2006-06-26 00:26:08 -07:00
spin_unlock_irq ( & tsk - > sighand - > siglock ) ;
2008-07-25 01:47:42 -07:00
if ( unlikely ( nr < 0 ) )
return nr ;
2005-04-16 15:20:36 -07:00
2008-07-25 01:47:42 -07:00
if ( atomic_read ( & mm - > mm_users ) = = nr + 1 )
2006-06-26 00:26:09 -07:00
goto done ;
2008-07-25 01:47:31 -07:00
/*
* We should find and kill all tasks which use this mm , and we should
2008-07-25 01:47:41 -07:00
* count them correctly into - > nr_threads . We don ' t take tasklist
2008-07-25 01:47:31 -07:00
* lock , but this is safe wrt :
*
* fork :
* None of sub - threads can fork after zap_process ( leader ) . All
* processes which were created before this point should be
* visible to zap_threads ( ) because copy_process ( ) adds the new
* process to the tail of init_task . tasks list , and lock / unlock
* of - > siglock provides a memory barrier .
*
* do_exit :
* The caller holds mm - > mmap_sem . This means that the task which
* uses this mm can ' t pass exit_mm ( ) , so it can ' t exit or clear
* its - > mm .
*
* de_thread :
* It does list_replace_rcu ( & leader - > tasks , & current - > tasks ) ,
* we must see either old or new leader , this does not matter .
* However , it can change p - > sighand , so lock_task_sighand ( p )
* must be used . Since p - > mm ! = NULL and we hold - > mmap_sem
* it can ' t fail .
*
* Note also that " g " can be the old leader with - > mm = = NULL
* and already unhashed and thus removed from - > thread_group .
* This is OK , __unhash_process ( ) - > list_del_rcu ( ) does not
* clear the - > next pointer , we will find the new leader via
* next_thread ( ) .
*/
2006-06-26 00:26:08 -07:00
rcu_read_lock ( ) ;
2006-06-26 00:26:05 -07:00
for_each_process ( g ) {
2006-06-26 00:26:09 -07:00
if ( g = = tsk - > group_leader )
continue ;
2008-07-25 01:47:39 -07:00
if ( g - > flags & PF_KTHREAD )
continue ;
2006-06-26 00:26:05 -07:00
p = g ;
do {
if ( p - > mm ) {
2008-07-25 01:47:39 -07:00
if ( unlikely ( p - > mm = = mm ) ) {
2006-06-26 00:26:09 -07:00
lock_task_sighand ( p , & flags ) ;
2008-07-25 01:47:42 -07:00
nr + = zap_process ( p ) ;
2006-06-26 00:26:09 -07:00
unlock_task_sighand ( p , & flags ) ;
}
2006-06-26 00:26:05 -07:00
break ;
}
2008-07-25 01:47:31 -07:00
} while_each_thread ( g , p ) ;
2006-06-26 00:26:05 -07:00
}
2006-06-26 00:26:08 -07:00
rcu_read_unlock ( ) ;
2006-06-26 00:26:09 -07:00
done :
2008-07-25 01:47:42 -07:00
atomic_set ( & core_state - > nr_threads , nr ) ;
2008-07-25 01:47:42 -07:00
return nr ;
2005-04-16 15:20:36 -07:00
}
2008-07-25 01:47:43 -07:00
static int coredump_wait ( int exit_code , struct core_state * core_state )
2005-04-16 15:20:36 -07:00
{
2006-06-26 00:26:08 -07:00
struct task_struct * tsk = current ;
struct mm_struct * mm = tsk - > mm ;
struct completion * vfork_done ;
2005-10-30 15:02:47 -08:00
int core_waiters ;
2005-04-16 15:20:36 -07:00
2008-07-25 01:47:43 -07:00
init_completion ( & core_state - > startup ) ;
2008-07-25 01:47:44 -07:00
core_state - > dumper . task = tsk ;
core_state - > dumper . next = NULL ;
2008-07-25 01:47:43 -07:00
core_waiters = zap_threads ( tsk , mm , core_state , exit_code ) ;
2005-10-30 15:02:47 -08:00
up_write ( & mm - > mmap_sem ) ;
2006-06-26 00:26:08 -07:00
if ( unlikely ( core_waiters < 0 ) )
goto fail ;
/*
* Make sure nobody is waiting for us to release the VM ,
* otherwise we can deadlock when we wait on each other
*/
vfork_done = tsk - > vfork_done ;
if ( vfork_done ) {
tsk - > vfork_done = NULL ;
complete ( vfork_done ) ;
}
2005-10-30 15:02:47 -08:00
if ( core_waiters )
2008-07-25 01:47:43 -07:00
wait_for_completion ( & core_state - > startup ) ;
2006-06-26 00:26:08 -07:00
fail :
return core_waiters ;
2005-04-16 15:20:36 -07:00
}
2008-07-25 01:47:46 -07:00
static void coredump_finish ( struct mm_struct * mm )
{
struct core_thread * curr , * next ;
struct task_struct * task ;
next = mm - > core_state - > dumper . next ;
while ( ( curr = next ) ! = NULL ) {
next = curr - > next ;
task = curr - > task ;
/*
* see exit_mm ( ) , curr - > task must not see
* - > task = = NULL before we read - > next .
*/
smp_mb ( ) ;
curr - > task = NULL ;
wake_up_process ( task ) ;
}
mm - > core_state = NULL ;
}
2007-07-19 01:48:27 -07:00
/*
* set_dumpable converts traditional three - value dumpable to two flags and
* stores them into mm - > flags . It modifies lower two bits of mm - > flags , but
* these bits are not changed atomically . So get_dumpable can observe the
* intermediate state. To avoid unexpected behavior, set_dumpable orders its
* bit updates so that get_dumpable always returns either the old dumpable
* value or the new one.
*
* dumpable | mm - > flags ( binary )
* old new | initial interim final
* - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - -
* 0 1 | 00 01 01
* 0 2 | 00 10 ( * ) 11
* 1 0 | 01 00 00
* 1 2 | 01 11 11
* 2 0 | 11 10 ( * ) 00
* 2 1 | 11 11 01
*
* ( * ) get_dumpable regards interim value of 10 as 11.
*/
void set_dumpable ( struct mm_struct * mm , int value )
{
switch ( value ) {
case 0 :
clear_bit ( MMF_DUMPABLE , & mm - > flags ) ;
smp_wmb ( ) ;
clear_bit ( MMF_DUMP_SECURELY , & mm - > flags ) ;
break ;
case 1 :
set_bit ( MMF_DUMPABLE , & mm - > flags ) ;
smp_wmb ( ) ;
clear_bit ( MMF_DUMP_SECURELY , & mm - > flags ) ;
break ;
case 2 :
set_bit ( MMF_DUMP_SECURELY , & mm - > flags ) ;
smp_wmb ( ) ;
set_bit ( MMF_DUMPABLE , & mm - > flags ) ;
break ;
}
}
int get_dumpable ( struct mm_struct * mm )
{
int ret ;
ret = mm - > flags & 0x3 ;
return ( ret > = 2 ) ? 2 : ret ;
}
2005-04-16 15:20:36 -07:00
int do_coredump ( long signr , int exit_code , struct pt_regs * regs )
{
2008-07-25 01:47:43 -07:00
struct core_state core_state ;
2005-04-16 15:20:36 -07:00
char corename [ CORENAME_MAX_SIZE + 1 ] ;
struct mm_struct * mm = current - > mm ;
struct linux_binfmt * binfmt ;
struct inode * inode ;
struct file * file ;
int retval = 0 ;
2005-06-23 00:09:43 -07:00
int fsuid = current - > fsuid ;
int flag = 0 ;
2006-09-30 23:29:28 -07:00
int ispipe = 0 ;
2007-10-16 23:26:34 -07:00
unsigned long core_limit = current - > signal - > rlim [ RLIMIT_CORE ] . rlim_cur ;
2007-10-16 23:26:35 -07:00
char * * helper_argv = NULL ;
int helper_argc = 0 ;
char * delimit ;
2005-04-16 15:20:36 -07:00
2007-04-19 10:28:21 -04:00
audit_core_dumps ( signr ) ;
2005-04-16 15:20:36 -07:00
binfmt = current - > binfmt ;
if ( ! binfmt | | ! binfmt - > core_dump )
goto fail ;
down_write ( & mm - > mmap_sem ) ;
2007-11-11 19:13:43 -08:00
/*
* If another thread got here first , or we are not dumpable , bail out .
*/
2008-07-25 01:47:41 -07:00
if ( mm - > core_state | | ! get_dumpable ( mm ) ) {
2005-04-16 15:20:36 -07:00
up_write ( & mm - > mmap_sem ) ;
goto fail ;
}
2005-06-23 00:09:43 -07:00
/*
* We cannot trust fsuid as being the " true " uid of the
* process nor do we know its entire history . We only know it
* was tainted so we dump it as root in mode 2.
*/
2007-07-19 01:48:27 -07:00
if ( get_dumpable ( mm ) = = 2 ) { /* Setuid core dump mode */
2005-06-23 00:09:43 -07:00
flag = O_EXCL ; /* Stop rewrite attacks */
current - > fsuid = 0 ; /* Dump root private */
}
2005-10-30 15:02:54 -08:00
2008-07-25 01:47:43 -07:00
retval = coredump_wait ( exit_code , & core_state ) ;
2006-06-26 00:26:08 -07:00
if ( retval < 0 )
2005-10-30 15:02:54 -08:00
goto fail ;
2005-04-16 15:20:36 -07:00
/*
* Clear any false indication of pending signals that might
* be seen by the filesystem code called to write the core file .
*/
clear_thread_flag ( TIF_SIGPENDING ) ;
/*
* lock_kernel ( ) because format_corename ( ) is controlled by sysctl , which
* uses lock_kernel ( )
*/
lock_kernel ( ) ;
2008-10-18 20:28:22 -07:00
ispipe = format_corename ( corename , signr ) ;
2005-04-16 15:20:36 -07:00
unlock_kernel ( ) ;
2007-10-16 23:26:34 -07:00
/*
* Don ' t bother to check the RLIMIT_CORE value if core_pattern points
* to a pipe . Since we ' re not writing directly to the filesystem
* RLIMIT_CORE doesn ' t really apply , as no actual core file will be
* created unless the pipe reader choses to write out the core file
* at which point file size limits and permissions will be imposed
* as it does with any other process
*/
2007-10-16 23:26:35 -07:00
if ( ( ! ispipe ) & & ( core_limit < binfmt - > min_coredump ) )
2007-10-16 23:26:34 -07:00
goto fail_unlock ;
2007-04-16 22:53:13 -07:00
if ( ispipe ) {
2007-10-16 23:26:35 -07:00
helper_argv = argv_split ( GFP_KERNEL , corename + 1 , & helper_argc ) ;
/* Terminate the string before the first option */
delimit = strchr ( corename , ' ' ) ;
if ( delimit )
* delimit = ' \0 ' ;
2007-10-16 23:26:36 -07:00
delimit = strrchr ( helper_argv [ 0 ] , ' / ' ) ;
if ( delimit )
delimit + + ;
else
delimit = helper_argv [ 0 ] ;
if ( ! strcmp ( delimit , current - > comm ) ) {
printk ( KERN_NOTICE " Recursive core dump detected, "
" aborting \n " ) ;
goto fail_unlock ;
}
core_limit = RLIM_INFINITY ;
2006-09-30 23:29:28 -07:00
/* SIGPIPE can happen, but it's just never processed */
2007-10-16 23:26:36 -07:00
if ( call_usermodehelper_pipe ( corename + 1 , helper_argv , NULL ,
& file ) ) {
2006-09-30 23:29:28 -07:00
printk ( KERN_INFO " Core dump to %s pipe failed \n " ,
corename ) ;
goto fail_unlock ;
}
} else
file = filp_open ( corename ,
2006-12-06 20:40:39 -08:00
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag ,
0600 ) ;
2005-04-16 15:20:36 -07:00
if ( IS_ERR ( file ) )
goto fail_unlock ;
2006-12-08 02:36:35 -08:00
inode = file - > f_path . dentry - > d_inode ;
2005-04-16 15:20:36 -07:00
if ( inode - > i_nlink > 1 )
goto close_fail ; /* multiple links - don't dump */
2006-12-08 02:36:35 -08:00
if ( ! ispipe & & d_unhashed ( file - > f_path . dentry ) )
2005-04-16 15:20:36 -07:00
goto close_fail ;
2006-09-30 23:29:28 -07:00
/* AK: actually i see no reason to not allow this for named pipes etc.,
but keep the previous behaviour for now . */
if ( ! ispipe & & ! S_ISREG ( inode - > i_mode ) )
2005-04-16 15:20:36 -07:00
goto close_fail ;
2007-11-28 13:59:18 +01:00
/*
* Dont allow local users get cute and trick others to coredump
* into their pre - created files :
*/
if ( inode - > i_uid ! = current - > fsuid )
goto close_fail ;
2005-04-16 15:20:36 -07:00
if ( ! file - > f_op )
goto close_fail ;
if ( ! file - > f_op - > write )
goto close_fail ;
2006-12-08 02:36:35 -08:00
if ( ! ispipe & & do_truncate ( file - > f_path . dentry , 0 , 0 , file ) ! = 0 )
2005-04-16 15:20:36 -07:00
goto close_fail ;
2007-10-16 23:26:34 -07:00
retval = binfmt - > core_dump ( signr , regs , file , core_limit ) ;
2005-04-16 15:20:36 -07:00
if ( retval )
current - > signal - > group_exit_code | = 0x80 ;
close_fail :
filp_close ( file , NULL ) ;
fail_unlock :
2007-10-16 23:26:35 -07:00
if ( helper_argv )
argv_free ( helper_argv ) ;
2005-06-23 00:09:43 -07:00
current - > fsuid = fsuid ;
2008-07-25 01:47:46 -07:00
coredump_finish ( mm ) ;
2005-04-16 15:20:36 -07:00
fail :
return retval ;
}