// SPDX-License-Identifier: GPL-2.0
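/*
 * Syscall dispatch for arm64.
 *
 * To minimize the risk of userspace-controlled values being used under
 * speculation, syscalls are invoked via pt_regs based wrappers which pass
 * only the minimum set of required register values to the syscall
 * implementations: __arm64_[compat_]sys_<name>() takes a struct pt_regs
 * pointer and extracts only the relevant registers, handing them to the
 * sign-extension/narrowing wrapper __se_[compat_]sys_<name>(), which in
 * turn calls the "real" implementation __do_[compat_]sys_<name>(). The
 * compiler can fold these together, minimizing the overhead.
 */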

#include <linux/compiler.h>
#include <linux/context_tracking.h>
#include <linux/errno.h>
#include <linux/nospec.h>
#include <linux/ptrace.h>
#include <linux/randomize_kstack.h>
#include <linux/syscalls.h>

#include <asm/debug-monitors.h>
#include <asm/exception.h>
#include <asm/fpsimd.h>
#include <asm/syscall.h>
#include <asm/thread_info.h>
#include <asm/unistd.h>

long compat_arm_syscall(struct pt_regs *regs, int scno);
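
/*
 * sys_ni_syscall() is deliberately not defined via SYSCALL_DEFINE0(), so
 * that it cannot become a target of error injection, and common code
 * provides no prototype for it when ARCH_HAS_SYSCALL_WRAPPER is selected,
 * so declare it here as required. While it has no wrapper, passing it a
 * redundant pt_regs pointer is benign per the AAPCS.
 */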
long sys_ni_syscall(void);
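
/*
 * Handle an out-of-range or unimplemented syscall number: for compat
 * tasks, give compat_arm_syscall() a chance to handle it (e.g. the
 * 32-bit ARM-private syscalls) before falling back to sys_ni_syscall().
 */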
static long do_ni_syscall(struct pt_regs *regs, int scno)
{
#ifdef CONFIG_COMPAT
	long ret;
	if (is_compat_task()) {
		ret = compat_arm_syscall(regs, scno);
		if (ret != -ENOSYS)
			return ret;
	}
#endif

	return sys_ni_syscall();
}
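
/*
 * Every syscall wrapper takes a single pt_regs pointer, so invoking a
 * syscall reduces to one indirect call.
 */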
static long __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn)
{
	return syscall_fn(regs);
}
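
/*
 * Look up and invoke the handler for a syscall number. The table index is
 * clamped under speculation with array_index_nospec(), so a mispredicted
 * bounds check cannot leak an out-of-bounds table entry, and a random
 * offset is applied to the kernel stack usage of each syscall.
 */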
static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
			   unsigned int sc_nr,
			   const syscall_fn_t syscall_table[])
{
	long ret;

	add_random_kstack_offset();

	if (scno < sc_nr) {
		syscall_fn_t syscall_fn;
		syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
		ret = __invoke_syscall(regs, syscall_fn);
	} else {
		ret = do_ni_syscall(regs, scno);
	}
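
	/*
	 * The return value is set via syscall_set_return_value() rather than
	 * by writing regs->regs[0] directly, so that for compat tasks the
	 * upper 32 bits of x0 are never written: audit, tracing and ptrace
	 * read the value back sign-extended via syscall_get_return_value().
	 */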
	syscall_set_return_value(current, regs, 0, ret);

	/*
	 * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(),
	 * but not enough for arm64 stack utilization comfort. To keep
	 * reasonable stack head room, reduce the maximum offset to 9 bits.
	 *
	 * The actual entropy will be further reduced by the compiler when
	 * applying stack alignment constraints: the AAPCS mandates a
	 * 16-byte (i.e. 4-bit) aligned SP at function boundaries.
	 *
	 * The resulting 5 bits of entropy is seen in SP[8:4].
	 */
	choose_random_kstack_offset(get_random_u16() & 0x1FF);
}

static inline bool has_syscall_work(unsigned long flags)
{
	return unlikely(flags & _TIF_SYSCALL_WORK);
}
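
/*
 * Common EL0 SVC handling: record the original x0 and syscall number,
 * handle any pending asynchronous MTE tag check fault, let a tracer
 * observe (and possibly rewrite or skip) the syscall, and dispatch it.
 */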
static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
			   const syscall_fn_t syscall_table[])
{
	unsigned long flags = read_thread_flags();

	regs->orig_x0 = regs->regs[0];
	regs->syscallno = scno;

	/*
	 * BTI note:
	 * The architecture does not guarantee that SPSR.BTYPE is zero
	 * on taking an SVC, so we could return to userspace with a
	 * non-zero BTYPE after the syscall.
	 *
	 * This shouldn't matter except when userspace is explicitly
	 * doing something stupid, such as setting PROT_BTI on a page
	 * that lacks conforming BTI/PACIxSP instructions, falling
	 * through from one executable page to another with differing
	 * PROT_BTI, or messing with BTYPE via ptrace: in such cases,
	 * userspace should not be surprised if a SIGILL occurs on
	 * syscall return.
	 *
	 * So, don't touch regs->pstate & PSR_BTYPE_MASK here.
	 * (Similarly for HVC and SMC elsewhere.)
	 */

	if (flags & _TIF_MTE_ASYNC_FAULT) {
		/*
		 * Process the asynchronous tag check fault before the actual
		 * syscall. do_notify_resume() will send a signal to userspace
		 * before the syscall is restarted.
		 */
		syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0);
		return;
	}

	if (has_syscall_work(flags)) {
		/*
		 * The de-facto standard way to skip a system call using ptrace
		 * is to set the system call to -1 (NO_SYSCALL) and set x0 to a
		 * suitable error code for consumption by userspace. However,
		 * this cannot be distinguished from a user-issued syscall(-1)
		 * and so we must set x0 to -ENOSYS here in case the tracer doesn't
		 * issue the skip and we fall into trace_exit with x0 preserved.
		 *
		 * This is slightly odd because it also means that if a tracer
		 * sets the system call number to -1 but does not initialise x0,
		 * then x0 will be preserved for all system calls apart from a
		 * user-issued syscall(-1). However, requesting a skip and not
		 * setting the return value is unlikely to do anything sensible
		 * anyway.
		 */
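		/*
		 * For illustration, a tracer stopped at syscall-entry could
		 * request such a skip with a userspace sequence along these
		 * lines (a hypothetical sketch, error handling omitted),
		 * rewriting the syscall number via the NT_ARM_SYSTEM_CALL
		 * regset and then placing a suitable error code in x0 via
		 * NT_PRSTATUS:
		 *
		 *	int nr = -1;
		 *	struct iovec iov = { .iov_base = &nr, .iov_len = sizeof(nr) };
		 *	ptrace(PTRACE_SETREGSET, pid, NT_ARM_SYSTEM_CALL, &iov);
		 */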
		if (scno == NO_SYSCALL)
			syscall_set_return_value(current, regs, -ENOSYS, 0);
		scno = syscall_trace_enter(regs);
		if (scno == NO_SYSCALL)
			goto trace_exit;
	}

	invoke_syscall(regs, scno, sc_nr, syscall_table);

	/*
	 * The tracing status may have changed under our feet, so we have to
	 * check again. However, if we were tracing entry, then we always trace
	 * exit regardless, as the old entry assembly did.
	 */
	if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
		flags = read_thread_flags();
		if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP))
			return;
	}

trace_exit:
	syscall_trace_exit(regs);
}
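
/*
 * Native AArch64 tasks pass the syscall number in x8.
 */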
void do_el0_svc(struct pt_regs *regs)
{
	el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table);
}

#ifdef CONFIG_COMPAT
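/*
 * Compat (AArch32) tasks pass the syscall number in r7.
 */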
void do_el0_svc_compat(struct pt_regs *regs)
{
	el0_svc_common(regs, regs->regs[7], __NR_compat_syscalls,
		       compat_sys_call_table);
}
#endif