// SPDX-License-Identifier: GPL-2.0-only
/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/entry-common.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/export.h>
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif

#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
#include <asm/irq_stack.h>

#ifdef CONFIG_X86_64

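/*
 * Dispatch a native 64-bit system call. The syscall number is clamped
 * with array_index_nospec() before indexing sys_call_table so that a
 * mispredicted bounds check cannot be used to read out of bounds under
 * speculation (Spectre v1). Returns false if the number is out of range
 * so the caller can fall back to the x32 table or ni_syscall().
 */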
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = sys_call_table[unr](regs);
		return true;
	}
	return false;
}

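/*
 * Dispatch an x32 ABI system call. x32 syscall numbers carry
 * __X32_SYSCALL_BIT, so subtracting the bit yields an index into
 * x32_sys_call_table and also pushes anything below the bit out of
 * range. The IS_ENABLED() check lets the compiler drop this entirely
 * when CONFIG_X86_X32_ABI is disabled.
 */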
static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Adjust the starting offset of the table, and convert numbers
	 * < __X32_SYSCALL_BIT to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;

	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
		xnr = array_index_nospec(xnr, X32_NR_syscalls);
		regs->ax = x32_sys_call_table[xnr](regs);
		return true;
	}
	return false;
}

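/*
 * C entry point for 64-bit system calls, reached from the 64-bit SYSCALL
 * entry asm. Tries the native table first, then the x32 table, and falls
 * back to ni_syscall() for anything else except -1, which ptrace/seccomp
 * use to mark a skipped syscall.
 */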
__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
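/*
 * Common prologue for the 32-bit syscall paths: mark the task as being
 * in a compat syscall (TS_COMPAT) so that in_compat_syscall() and the
 * syscall_get_*() helpers report the right ABI, and return the syscall
 * number from orig_ax truncated to int.
 */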
static __always_inline int syscall_32_enter(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_IA32_EMULATION))
		current_thread_info()->status |= TS_COMPAT;

	return (int)regs->orig_ax;
}

/*
 * Invoke a 32-bit syscall.  Called with IRQs on in CONTEXT_KERNEL.
 */
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < IA32_NR_syscalls)) {
		unr = array_index_nospec(unr, IA32_NR_syscalls);
		regs->ax = ia32_sys_call_table[unr](regs);
	} else if (nr != -1) {
		regs->ax = __ia32_sys_ni_syscall(regs);
	}
}

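/*
 * int $0x80 is the legacy 32-bit syscall vector. On a 64-bit kernel it
 * is reached via the compat entry asm and is also usable by 64-bit
 * tasks, which then get 32-bit syscall semantics.
 */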
/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);

	add_random_kstack_offset();
	/*
	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
	 * orig_ax, the int return value truncates it.  This matches
	 * the semantics of syscall_get_nr().
	 */
	nr = syscall_enter_from_user_mode(regs, nr);
	instrumentation_begin();

	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}

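/*
 * Common body of the SYSENTER/SYSCALL32 fast path. The 32-bit fast-path
 * calling convention passes the sixth syscall argument in EBP, but the
 * vDSO needs EBP for its own bookkeeping across the instruction, so it
 * stashes the value on the user stack and it is fetched back here before
 * any of the entry work runs.
 */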
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);
	int res;

	add_random_kstack_offset();
	/*
	 * This cannot use syscall_enter_from_user_mode() as it has to
	 * fetch EBP before invoking any of the syscall entry work
	 * functions.
	 */
	syscall_enter_from_user_mode_prepare(regs);

	instrumentation_begin();
	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;

		local_irq_disable();
		instrumentation_end();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	nr = syscall_enter_from_user_mode_work(regs, nr);

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	return true;
}

/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
	/*
	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
	 * convention.  Adjust regs so it looks like we entered using int80.
	 */
	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
					vdso_image_32.sym_int80_landing_pad;

	/*
	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
	 * Fix it up.
	 */
	regs->ip = landing_pad;

	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
	if (!__do_fast_syscall_32(regs))
		return 0;

#ifdef CONFIG_X86_64
	/*
	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
	 * bother with SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 */
	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
#else
	/*
	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
	 *
	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
	 * because the ECX fixup above will ensure that this is essentially
	 * never the case.
	 *
	 * We don't allow syscalls at all from VM86 mode, but we still
	 * need to check VM, because we might be returning from sys_vm86.
	 */
	return static_cpu_has(X86_FEATURE_SEP) &&
		regs->cs == __USER_CS && regs->ss == __USER_DS &&
		regs->ip == landing_pad &&
		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
}

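/* SYSENTER entry, reached from the native 32-bit or compat SYSENTER asm. */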
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
{
	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
	regs->sp = regs->bp;

	/* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
	regs->flags |= X86_EFLAGS_IF;
	return do_fast_syscall_32(regs);
}
#endif

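/*
 * "Not implemented" stub used above as the fallback for invalid syscall
 * numbers; it simply fails with -ENOSYS.
 */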
SYSCALL_DEFINE0(ni_syscall)
{
	return -ENOSYS;
}

#ifdef CONFIG_XEN_PV
#ifndef CONFIG_PREEMPTION
/*
 * Some hypercalls issued by the toolstack can take many 10s of
 * seconds. Allow tasks running hypercalls via the privcmd driver to
 * be voluntarily preempted even if full kernel preemption is
 * disabled.
 *
 * Such preemptible hypercalls are bracketed by
 * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
 * calls.
 */
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);

/*
 * In case of scheduling the flag must be cleared and restored after
 * returning from schedule as the task might move to a different CPU.
 */
static __always_inline bool get_and_clear_inhcall(void)
{
	bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);

	__this_cpu_write(xen_in_preemptible_hcall, false);
	return inhcall;
}

static __always_inline void restore_inhcall(bool inhcall)
{
	__this_cpu_write(xen_in_preemptible_hcall, inhcall);
}
#else
static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif

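/*
 * Body of the Xen PV event channel upcall: set up irq_regs, account the
 * callback, and run the event channel handler.
 */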
static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	struct pt_regs *old_regs = set_irq_regs(regs);

	inc_irq_stat(irq_hv_callback_count);

	xen_hvm_evtchn_do_upcall();

	set_irq_regs(old_regs);
}

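/*
 * Entry point for the Xen PV event channel callback. Runs the handler on
 * the IRQ stack if necessary and, when the callback interrupted a
 * preemptible hypercall, allows rescheduling before returning.
 */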
__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
	irqentry_state_t state = irqentry_enter(regs);
	bool inhcall;

	instrumentation_begin();
	run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);

	inhcall = get_and_clear_inhcall();
	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
		irqentry_exit_cond_resched();
		instrumentation_end();
		restore_inhcall(inhcall);
	} else {
		instrumentation_end();
		irqentry_exit(regs, state);
	}
}
#endif /* CONFIG_XEN_PV */