2005-04-16 15:20:36 -07:00
/*
 * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
 *
 * Based on the original implementation which is:
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Parts of the original code have been moved to arch/x86/vdso/vma.c
 *
 * This file implements vsyscall emulation.  vsyscalls are a legacy ABI:
 * userspace can request certain kernel services by calling fixed
 * addresses.  This concept is problematic:
 *
 * - It interferes with ASLR.
 * - It's awkward to write code that lives in kernel addresses but is
 *   callable by userspace at fixed addresses.
 * - The whole concept is impossible for 32-bit compat userspace.
 * - UML cannot easily virtualize a vsyscall.
 *
 * As of mid-2014, I believe that there is no new userspace code that
 * will use a vsyscall if the vDSO is present.  I hope that there will
 * soon be no new userspace code that will ever use a vsyscall.
 *
 * The code in this file emulates vsyscalls when notified of a page
 * fault to a vsyscall address.
 */
# include <linux/kernel.h>
# include <linux/timer.h>
2011-06-05 13:50:24 -04:00
# include <linux/syscalls.h>
# include <linux/ratelimit.h>
2005-04-16 15:20:36 -07:00
# include <asm/vsyscall.h>
2007-02-16 01:28:21 -08:00
# include <asm/unistd.h>
2005-04-16 15:20:36 -07:00
# include <asm/fixmap.h>
2011-06-05 13:50:24 -04:00
# include <asm/traps.h>
2005-04-16 15:20:36 -07:00
2011-08-03 09:31:54 -04:00
# define CREATE_TRACE_POINTS
# include "vsyscall_trace.h"
2011-11-07 16:33:41 -08:00
/*
 * Selected vsyscall behaviour.  Defaults to EMULATE and is overridden by
 * vsyscall_setup() when a recognised keyword is supplied.
 *
 *   EMULATE - map_vsyscall() maps the page with PAGE_KERNEL_VVAR
 *   NATIVE  - map_vsyscall() maps the page with PAGE_KERNEL_VSYSCALL
 *   NONE    - the page is not mapped, emulate_vsyscall() refuses to handle
 *             the fault and the gate VMA lookups report nothing
 */
static enum { EMULATE , NATIVE , NONE } vsyscall_mode = EMULATE ;
2011-08-10 11:15:32 -04:00
/*
 * Parse the value of the "vsyscall" early parameter.
 *
 * @str: parameter value, may be NULL
 *
 * Recognises exactly "emulate", "native" and "none" and stores the
 * matching mode in vsyscall_mode.  The keywords are compared verbatim, so
 * they must not carry any surrounding blanks.
 *
 * Returns 0 when a keyword matched, -EINVAL when @str is NULL or is not
 * one of the keywords (vsyscall_mode is left unchanged in that case).
 */
static int __init vsyscall_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp("emulate", str))
		vsyscall_mode = EMULATE;
	else if (!strcmp("native", str))
		vsyscall_mode = NATIVE;
	else if (!strcmp("none", str))
		vsyscall_mode = NONE;
	else
		return -EINVAL;

	return 0;
}
early_param("vsyscall", vsyscall_setup);
2011-06-05 13:50:24 -04:00
static void warn_bad_vsyscall ( const char * level , struct pt_regs * regs ,
const char * message )
2005-04-16 15:20:36 -07:00
{
2012-05-21 19:50:07 -07:00
if ( ! show_unhandled_signals )
2011-06-05 13:50:24 -04:00
return ;
2005-04-16 15:20:36 -07:00
2014-07-25 16:30:27 -07:00
printk_ratelimited ( " %s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx \n " ,
level , current - > comm , task_pid_nr ( current ) ,
message , regs - > ip , regs - > cs ,
regs - > sp , regs - > ax , regs - > si , regs - > di ) ;
2011-07-13 09:24:09 -04:00
}
static int addr_to_vsyscall_nr ( unsigned long addr )
{
int nr ;
2014-05-05 12:19:36 -07:00
if ( ( addr & ~ 0xC00UL ) ! = VSYSCALL_ADDR )
2011-07-13 09:24:09 -04:00
return - EINVAL ;
nr = ( addr & 0xC00UL ) > > 10 ;
if ( nr > = 3 )
return - EINVAL ;
return nr ;
2005-04-16 15:20:36 -07:00
}
2011-11-07 16:33:40 -08:00
static bool write_ok_or_segv ( unsigned long ptr , size_t size )
{
/*
* XXX : if access_ok , get_user , and put_user handled
* sig_on_uaccess_error , this could go away .
*/
if ( ! access_ok ( VERIFY_WRITE , ( void __user * ) ptr , size ) ) {
siginfo_t info ;
struct thread_struct * thread = & current - > thread ;
thread - > error_code = 6 ; /* user fault, no page, write */
thread - > cr2 = ptr ;
2012-03-12 14:55:55 +05:30
thread - > trap_nr = X86_TRAP_PF ;
2011-11-07 16:33:40 -08:00
memset ( & info , 0 , sizeof ( info ) ) ;
info . si_signo = SIGSEGV ;
info . si_errno = 0 ;
info . si_code = SEGV_MAPERR ;
info . si_addr = ( void __user * ) ptr ;
force_sig_info ( SIGSEGV , & info , current ) ;
return false ;
} else {
return true ;
}
}
2011-08-10 11:15:32 -04:00
bool emulate_vsyscall ( struct pt_regs * regs , unsigned long address )
2005-04-16 15:20:36 -07:00
{
2011-06-05 13:50:24 -04:00
struct task_struct * tsk ;
unsigned long caller ;
2012-10-01 11:40:45 -07:00
int vsyscall_nr , syscall_nr , tmp ;
2011-11-07 16:33:40 -08:00
int prev_sig_on_uaccess_error ;
2011-06-05 13:50:24 -04:00
long ret ;
2011-08-10 11:15:32 -04:00
/*
* No point in checking CS - - the only way to get here is a user mode
* trap to a high address , which means that we ' re in 64 - bit user code .
*/
2011-06-05 13:50:24 -04:00
2011-08-10 11:15:32 -04:00
WARN_ON_ONCE ( address ! = regs - > ip ) ;
2011-07-13 09:24:09 -04:00
2011-08-10 11:15:32 -04:00
if ( vsyscall_mode = = NONE ) {
warn_bad_vsyscall ( KERN_INFO , regs ,
" vsyscall attempted with vsyscall=none " ) ;
return false ;
2011-07-13 09:24:09 -04:00
}
2011-08-10 11:15:32 -04:00
vsyscall_nr = addr_to_vsyscall_nr ( address ) ;
2011-08-03 09:31:54 -04:00
trace_emulate_vsyscall ( vsyscall_nr ) ;
2011-07-13 09:24:09 -04:00
if ( vsyscall_nr < 0 ) {
warn_bad_vsyscall ( KERN_WARNING , regs ,
2011-08-10 11:15:32 -04:00
" misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround " ) ;
2011-06-05 13:50:24 -04:00
goto sigsegv ;
}
2007-05-21 14:31:52 +02:00
2011-06-05 13:50:24 -04:00
if ( get_user ( caller , ( unsigned long __user * ) regs - > sp ) ! = 0 ) {
2011-08-10 11:15:32 -04:00
warn_bad_vsyscall ( KERN_WARNING , regs ,
" vsyscall with bad stack (exploit attempt?) " ) ;
2011-06-05 13:50:24 -04:00
goto sigsegv ;
}
2010-07-13 17:56:18 -07:00
2011-06-05 13:50:24 -04:00
tsk = current ;
2011-11-07 16:33:40 -08:00
/*
2012-10-01 11:40:45 -07:00
* Check for access_ok violations and find the syscall nr .
*
2012-04-01 20:48:04 +02:00
* NULL is a valid user pointer ( in the access_ok sense ) on 32 - bit and
2011-11-07 16:33:40 -08:00
* 64 - bit , so we don ' t need to special - case it here . For all the
2012-04-01 20:48:04 +02:00
* vsyscalls , NULL means " don't write anything " not " write it at
2011-11-07 16:33:40 -08:00
* address 0 " .
*/
2011-06-05 13:50:24 -04:00
switch ( vsyscall_nr ) {
case 0 :
2011-11-07 16:33:40 -08:00
if ( ! write_ok_or_segv ( regs - > di , sizeof ( struct timeval ) ) | |
2012-10-01 11:40:45 -07:00
! write_ok_or_segv ( regs - > si , sizeof ( struct timezone ) ) ) {
ret = - EFAULT ;
goto check_fault ;
}
2011-11-07 16:33:40 -08:00
2012-10-01 11:40:45 -07:00
syscall_nr = __NR_gettimeofday ;
break ;
case 1 :
if ( ! write_ok_or_segv ( regs - > di , sizeof ( time_t ) ) ) {
ret = - EFAULT ;
goto check_fault ;
}
syscall_nr = __NR_time ;
break ;
case 2 :
if ( ! write_ok_or_segv ( regs - > di , sizeof ( unsigned ) ) | |
! write_ok_or_segv ( regs - > si , sizeof ( unsigned ) ) ) {
ret = - EFAULT ;
goto check_fault ;
}
syscall_nr = __NR_getcpu ;
break ;
}
/*
* Handle seccomp . regs - > ip must be the original value .
* See seccomp_send_sigsys and Documentation / prctl / seccomp_filter . txt .
*
* We could optimize the seccomp disabled case , but performance
* here doesn ' t matter .
*/
regs - > orig_ax = syscall_nr ;
regs - > ax = - ENOSYS ;
seccomp,x86,arm,mips,s390: Remove nr parameter from secure_computing
The secure_computing function took a syscall number parameter, but
it only paid any attention to that parameter if seccomp mode 1 was
enabled. Rather than coming up with a kludge to get the parameter
to work in mode 2, just remove the parameter.
To avoid churn in arches that don't have seccomp filters (and may
not even support syscall_get_nr right now), this leaves the
parameter in secure_computing_strict, which is now a real function.
For ARM, this is a bit ugly due to the fact that ARM conditionally
supports seccomp filters. Fixing that would probably only be a
couple of lines of code, but it should be coordinated with the audit
maintainers.
This will be a slight slowdown on some arches. The right fix is to
pass in all of seccomp_data instead of trying to make just the
syscall nr part be fast.
This is a prerequisite for making two-phase seccomp work cleanly.
Cc: Russell King <linux@arm.linux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: linux-s390@vger.kernel.org
Cc: x86@kernel.org
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Kees Cook <keescook@chromium.org>
2014-07-21 18:49:14 -07:00
tmp = secure_computing ( ) ;
2012-10-01 11:40:45 -07:00
if ( ( ! tmp & & regs - > orig_ax ! = syscall_nr ) | | regs - > ip ! = address ) {
warn_bad_vsyscall ( KERN_DEBUG , regs ,
" seccomp tried to change syscall nr or ip " ) ;
do_exit ( SIGSYS ) ;
}
2014-11-04 15:36:50 -08:00
regs - > orig_ax = - 1 ;
2012-10-01 11:40:45 -07:00
if ( tmp )
goto do_ret ; /* skip requested */
/*
* With a real vsyscall , page faults cause SIGSEGV . We want to
* preserve that behavior to make writing exploits harder .
*/
prev_sig_on_uaccess_error = current_thread_info ( ) - > sig_on_uaccess_error ;
current_thread_info ( ) - > sig_on_uaccess_error = 1 ;
ret = - EFAULT ;
switch ( vsyscall_nr ) {
case 0 :
2011-06-05 13:50:24 -04:00
ret = sys_gettimeofday (
( struct timeval __user * ) regs - > di ,
( struct timezone __user * ) regs - > si ) ;
break ;
case 1 :
ret = sys_time ( ( time_t __user * ) regs - > di ) ;
break ;
case 2 :
ret = sys_getcpu ( ( unsigned __user * ) regs - > di ,
( unsigned __user * ) regs - > si ,
2012-04-01 20:48:04 +02:00
NULL ) ;
2011-06-05 13:50:24 -04:00
break ;
}
2010-07-13 17:56:18 -07:00
2011-11-07 16:33:40 -08:00
current_thread_info ( ) - > sig_on_uaccess_error = prev_sig_on_uaccess_error ;
2012-10-01 11:40:45 -07:00
check_fault :
2011-06-05 13:50:24 -04:00
if ( ret = = - EFAULT ) {
2011-11-07 16:33:40 -08:00
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
2011-06-05 13:50:24 -04:00
warn_bad_vsyscall ( KERN_INFO , regs ,
" vsyscall fault (exploit attempt?) " ) ;
2011-11-07 16:33:40 -08:00
/*
* If we failed to generate a signal for any reason ,
* generate one here . ( This should be impossible . )
*/
if ( WARN_ON_ONCE ( ! sigismember ( & tsk - > pending . signal , SIGBUS ) & &
! sigismember ( & tsk - > pending . signal , SIGSEGV ) ) )
goto sigsegv ;
return true ; /* Don't emulate the ret. */
2011-06-05 13:50:24 -04:00
}
2010-07-13 17:56:18 -07:00
2011-06-05 13:50:24 -04:00
regs - > ax = ret ;
2005-04-16 15:20:36 -07:00
2012-07-13 12:06:35 -05:00
do_ret :
2011-06-05 13:50:24 -04:00
/* Emulate a ret instruction. */
regs - > ip = caller ;
regs - > sp + = 8 ;
2011-08-10 11:15:32 -04:00
return true ;
2011-06-05 13:50:24 -04:00
sigsegv :
force_sig ( SIGSEGV , current ) ;
2011-08-10 11:15:32 -04:00
return true ;
2005-04-16 15:20:36 -07:00
}
2014-09-23 10:50:51 -07:00
/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64-bit vsyscall page now.  32-bit has a real VMA now and does
 * not need special handling anymore:
 */
/*
 * Name callback for the gate VMA (installed as gate_vma_ops.name).
 * @vma is not inspected; the name is the same fixed string, without
 * surrounding blanks, for every caller.
 */
static const char *gate_vma_name(struct vm_area_struct *vma)
{
	return "[vsyscall]";
}
2015-09-09 15:39:26 -07:00
static const struct vm_operations_struct gate_vma_ops = {
2014-09-23 10:50:51 -07:00
. name = gate_vma_name ,
} ;
/*
 * Statically allocated VMA describing the single vsyscall page:
 * [VSYSCALL_ADDR, VSYSCALL_ADDR + PAGE_SIZE), readable and executable.
 */
static struct vm_area_struct gate_vma = {
	.vm_ops		= &gate_vma_ops,
	.vm_flags	= VM_READ | VM_EXEC,
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_start	= VSYSCALL_ADDR,
	.vm_end		= VSYSCALL_ADDR + PAGE_SIZE,
};
struct vm_area_struct * get_gate_vma ( struct mm_struct * mm )
{
2015-06-22 07:55:16 -04:00
# ifdef CONFIG_COMPAT
2014-09-23 10:50:51 -07:00
if ( ! mm | | mm - > context . ia32_compat )
return NULL ;
# endif
2014-10-29 14:33:45 -07:00
if ( vsyscall_mode = = NONE )
return NULL ;
2014-09-23 10:50:51 -07:00
return & gate_vma ;
}
int in_gate_area ( struct mm_struct * mm , unsigned long addr )
{
struct vm_area_struct * vma = get_gate_vma ( mm ) ;
if ( ! vma )
return 0 ;
return ( addr > = vma - > vm_start ) & & ( addr < vma - > vm_end ) ;
}
/*
* Use this when you have no reliable mm , typically from interrupt
* context . It is less reliable than using a task ' s mm and may give
* false positives .
*/
int in_gate_area_no_mm ( unsigned long addr )
{
2014-10-29 14:33:45 -07:00
return vsyscall_mode ! = NONE & & ( addr & PAGE_MASK ) = = VSYSCALL_ADDR ;
2014-09-23 10:50:51 -07:00
}
2008-01-30 13:32:39 +01:00
/*
 * Install the vsyscall page in the fixmap according to vsyscall_mode:
 * PAGE_KERNEL_VSYSCALL for NATIVE, PAGE_KERNEL_VVAR for any other mode
 * except NONE, and no mapping at all for NONE.
 */
void __init map_vsyscall(void)
{
	extern char __vsyscall_page;
	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);

	/* The fixmap slot must resolve to the fixed, well-known address. */
	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
		     (unsigned long)VSYSCALL_ADDR);

	if (vsyscall_mode == NATIVE)
		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
			     PAGE_KERNEL_VSYSCALL);
	else if (vsyscall_mode != NONE)
		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
			     PAGE_KERNEL_VVAR);
}