2019-05-27 09:55:01 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-04-17 02:20:36 +04:00
/*
* PowerPC version
* Copyright ( C ) 1995 - 1996 Gary Thomas ( gdt @ linuxppc . org )
*
* Derived from " arch/i386/kernel/signal.c "
* Copyright ( C ) 1991 , 1992 Linus Torvalds
* 1997 - 11 - 28 Modified for POSIX .1 b signals by Richard Henderson
*/
# include <linux/sched.h>
# include <linux/mm.h>
# include <linux/smp.h>
# include <linux/kernel.h>
# include <linux/signal.h>
# include <linux/errno.h>
# include <linux/wait.h>
# include <linux/unistd.h>
# include <linux/stddef.h>
# include <linux/elf.h>
# include <linux/ptrace.h>
2011-06-04 09:36:54 +04:00
# include <linux/ratelimit.h>
2018-05-02 16:20:47 +03:00
# include <linux/syscalls.h>
2020-07-07 21:32:25 +03:00
# include <linux/pagemap.h>
2005-04-17 02:20:36 +04:00
# include <asm/sigcontext.h>
# include <asm/ucontext.h>
2016-12-24 22:46:01 +03:00
# include <linux/uaccess.h>
2005-04-17 02:20:36 +04:00
# include <asm/unistd.h>
# include <asm/cacheflush.h>
2006-03-23 02:00:08 +03:00
# include <asm/syscalls.h>
2005-04-17 02:20:36 +04:00
# include <asm/vdso.h>
2012-03-28 21:30:02 +04:00
# include <asm/switch_to.h>
2013-02-13 20:21:41 +04:00
# include <asm/tm.h>
2016-09-06 08:32:43 +03:00
# include <asm/asm-prototypes.h>
2005-04-17 02:20:36 +04:00
2007-06-04 09:15:49 +04:00
# include "signal.h"
2005-04-17 02:20:36 +04:00
2005-05-06 06:10:04 +04:00
/*
 * Number of bytes of general-purpose register state copied between a
 * struct pt_regs and the gp_regs area of a sigcontext: the smaller of the
 * ELF register set and struct pt_regs, so neither side is overrun.
 */
# define GP_REGS_SIZE min(sizeof(elf_gregset_t), sizeof(struct pt_regs))
2005-04-17 02:20:36 +04:00
/*
 * Size in bytes of the ELF floating-point register set.
 * NOTE(review): no user of this macro is visible in the part of the file
 * reviewed here - confirm it is still referenced.
 */
# define FP_REGS_SIZE sizeof(elf_fpregset_t)
powerpc/64/signal: Balance return predictor stack in signal trampoline
Returning from an interrupt or syscall to a signal handler currently
begins execution directly at the handler's entry point, with LR set to
the address of the sigreturn trampoline. When the signal handler
function returns, it runs the trampoline. It looks like this:
# interrupt at user address xyz
# kernel stuff... signal is raised
rfid
# void handler(int sig)
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
mflr 0
std 0,16(1)
stdu 1,-96(1)
# handler stuff
ld 0,16(1)
mtlr 0
blr
# __kernel_sigtramp_rt64
addi r1,r1,__SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
# kernel executes rt_sigreturn
rfid
# back to user address xyz
Note the blr with no matching bl. This can corrupt the return
predictor.
Solve this by instead resuming execution at the signal trampoline
which then calls the signal handler. qtrace-tools link_stack checker
confirms the entire user/kernel/vdso cycle is balanced after this
patch, whereas it's not upstream.
Alan confirms the dwarf unwind info still looks good. gdb still
recognises the signal frame and can step into parent frames if it
break inside a signal handler.
Performance is pretty noisy, not a very significant change on a POWER9
here, but branch misses are consistently a lot lower on a
microbenchmark:
Performance counter stats for './signal':
13,085.72 msec task-clock # 1.000 CPUs utilized
45,024,760,101 cycles # 3.441 GHz
65,102,895,542 instructions # 1.45 insn per cycle
11,271,673,787 branches # 861.372 M/sec
59,468,979 branch-misses # 0.53% of all branches
12,989.09 msec task-clock # 1.000 CPUs utilized
44,692,719,559 cycles # 3.441 GHz
65,109,984,964 instructions # 1.46 insn per cycle
11,282,136,057 branches # 868.585 M/sec
39,786,942 branch-misses # 0.35% of all branches
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200511101952.1463138-1-npiggin@gmail.com
2020-05-11 13:19:52 +03:00
/*
 * TRAMP_SIZE is the number of unsigned int slots in rt_sigframe.tramp[].
 * NOTE(review): TRAMP_TRACEBACK is presumably the number of those slots set
 * aside for traceback data rather than trampoline instructions - confirm
 * against the code that fills in tramp[].
 */
# define TRAMP_TRACEBACK 4
# define TRAMP_SIZE 7
2005-04-17 02:20:36 +04:00
/*
* When we have signals to deliver , we set up on the user stack ,
* going down from the original stack pointer :
* 1 ) a rt_sigframe struct which contains the ucontext
* 2 ) a gap of __SIGNAL_FRAMESIZE bytes which acts as a dummy caller
* frame for the signal handler .
*/
struct rt_sigframe {
/* sys_rt_sigreturn requires the ucontext be the first field */
struct ucontext uc ;
2013-02-13 20:21:41 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
struct ucontext uc_transact ;
# endif
2005-04-17 02:20:36 +04:00
unsigned long _unused [ 2 ] ;
unsigned int tramp [ TRAMP_SIZE ] ;
2006-02-01 13:28:09 +03:00
struct siginfo __user * pinfo ;
void __user * puc ;
2005-04-17 02:20:36 +04:00
struct siginfo info ;
2014-02-26 10:07:38 +04:00
/* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */
char abigap [ USER_REDZONE_SIZE ] ;
2005-04-17 02:20:36 +04:00
} __attribute__ ( ( aligned ( 16 ) ) ) ;
2007-10-12 04:20:07 +04:00
/*
 * KERN_INFO format strings for "bad frame" reports.  Each expects, in order:
 * a string, an int, a string and three unsigned longs (the last two are
 * labelled "nip" and "lr" in the message).  fmt32 prints the values as 8 hex
 * digits, fmt64 as 16.
 * NOTE(review): presumably selected according to whether the task is 32-bit
 * or 64-bit - the call sites are not visible here.
 */
static const char fmt32 [ ] = KERN_INFO \
" %s[%d]: bad frame in %s: %08lx nip %08lx lr %08lx \n " ;
static const char fmt64 [ ] = KERN_INFO \
" %s[%d]: bad frame in %s: %016lx nip %016lx lr %016lx \n " ;
2015-07-20 05:58:43 +03:00
/*
* This computes a quad word aligned pointer inside the vmx_reserve array
* element . For historical reasons sigcontext might not be quad word aligned ,
* but the location we write the VMX regs to must be . See the comment in
* sigcontext for more detail .
*/
# ifdef CONFIG_ALTIVEC
static elf_vrreg_t __user *sigcontext_vmx_regs(struct sigcontext __user *sc)
{
	unsigned long addr = (unsigned long)sc->vmx_reserve;

	/* Round the start of vmx_reserve up to the next 16-byte boundary. */
	addr = (addr + 15) & ~0xful;

	return (elf_vrreg_t __user *)addr;
}
# endif
2005-04-17 02:20:36 +04:00
/*
* Set up the sigcontext for the signal frame .
*/
2016-09-23 09:18:12 +03:00
static long setup_sigcontext ( struct sigcontext __user * sc ,
struct task_struct * tsk , int signr , sigset_t * set ,
unsigned long handler , int ctx_has_vsx_region )
2005-04-17 02:20:36 +04:00
{
/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
* process never used altivec yet ( MSR_VEC is zero in pt_regs of
* the context ) . This is very important because we must ensure we
* don ' t lose the VRSAVE content that may have been set prior to
* the process doing its first vector operation
2012-09-20 05:48:00 +04:00
* Userland shall check AT_HWCAP to know whether it can rely on the
2005-04-17 02:20:36 +04:00
* v_regs pointer or not
*/
# ifdef CONFIG_ALTIVEC
2015-07-20 05:58:43 +03:00
elf_vrreg_t __user * v_regs = sigcontext_vmx_regs ( sc ) ;
2016-05-29 15:03:51 +03:00
unsigned long vrsave ;
2005-04-17 02:20:36 +04:00
# endif
2016-09-23 09:18:12 +03:00
struct pt_regs * regs = tsk - > thread . regs ;
2008-06-02 10:22:59 +04:00
unsigned long msr = regs - > msr ;
2005-04-17 02:20:36 +04:00
long err = 0 ;
2017-08-20 20:58:24 +03:00
/* Force usr to alway see softe as 1 (interrupts enabled) */
unsigned long softe = 0x1 ;
2005-04-17 02:20:36 +04:00
2016-09-23 09:18:12 +03:00
BUG_ON ( tsk ! = current ) ;
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_ALTIVEC
err | = __put_user ( v_regs , & sc - > v_regs ) ;
/* save altivec registers */
2016-09-23 09:18:12 +03:00
if ( tsk - > thread . used_vr ) {
flush_altivec_to_thread ( tsk ) ;
2005-04-17 02:20:36 +04:00
/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
2016-09-23 09:18:12 +03:00
err | = __copy_to_user ( v_regs , & tsk - > thread . vr_state ,
2013-09-10 14:20:42 +04:00
33 * sizeof ( vector128 ) ) ;
2005-04-17 02:20:36 +04:00
/* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg)
* contains valid data .
*/
2008-06-02 10:22:59 +04:00
msr | = MSR_VEC ;
2005-04-17 02:20:36 +04:00
}
/* We always copy to/from vrsave, it's 0 if we don't have or don't
* use altivec .
*/
2016-05-29 15:03:51 +03:00
vrsave = 0 ;
if ( cpu_has_feature ( CPU_FTR_ALTIVEC ) ) {
vrsave = mfspr ( SPRN_VRSAVE ) ;
2016-09-23 09:18:12 +03:00
tsk - > thread . vrsave = vrsave ;
2016-05-29 15:03:51 +03:00
}
err | = __put_user ( vrsave , ( u32 __user * ) & v_regs [ 33 ] ) ;
2005-04-17 02:20:36 +04:00
# else /* CONFIG_ALTIVEC */
err | = __put_user ( 0 , & sc - > v_regs ) ;
# endif /* CONFIG_ALTIVEC */
2016-09-23 09:18:12 +03:00
flush_fp_to_thread ( tsk ) ;
2008-07-02 08:06:37 +04:00
/* copy fpr regs and fpscr */
2016-09-23 09:18:12 +03:00
err | = copy_fpr_to_user ( & sc - > fp_regs , tsk ) ;
2013-11-25 04:12:20 +04:00
/*
* Clear the MSR VSX bit to indicate there is no valid state attached
* to this context , except in the specific case below where we set it .
*/
msr & = ~ MSR_VSX ;
2008-06-25 08:07:18 +04:00
# ifdef CONFIG_VSX
2008-06-25 08:07:18 +04:00
/*
* Copy VSX low doubleword to local buffer for formatting ,
* then out to userspace . Update v_regs to point after the
* VMX data .
*/
2016-09-23 09:18:12 +03:00
if ( tsk - > thread . used_vsr & & ctx_has_vsx_region ) {
flush_vsx_to_thread ( tsk ) ;
2008-06-25 08:07:18 +04:00
v_regs + = ELF_NVRREG ;
2016-09-23 09:18:12 +03:00
err | = copy_vsx_to_user ( v_regs , tsk ) ;
2008-06-25 08:07:18 +04:00
/* set MSR_VSX in the MSR value in the frame to
* indicate that sc - > vs_reg ) contains valid data .
*/
msr | = MSR_VSX ;
}
2008-06-25 08:07:18 +04:00
# endif /* CONFIG_VSX */
2005-04-17 02:20:36 +04:00
err | = __put_user ( & sc - > gp_regs , & sc - > regs ) ;
2006-03-08 05:24:22 +03:00
WARN_ON ( ! FULL_REGS ( regs ) ) ;
2005-04-17 02:20:36 +04:00
err | = __copy_to_user ( & sc - > gp_regs , regs , GP_REGS_SIZE ) ;
2008-06-02 10:22:59 +04:00
err | = __put_user ( msr , & sc - > gp_regs [ PT_MSR ] ) ;
2017-08-20 20:58:24 +03:00
err | = __put_user ( softe , & sc - > gp_regs [ PT_SOFTE ] ) ;
2005-04-17 02:20:36 +04:00
err | = __put_user ( signr , & sc - > signal ) ;
err | = __put_user ( handler , & sc - > handler ) ;
if ( set ! = NULL )
err | = __put_user ( set - > sig [ 0 ] , & sc - > oldmask ) ;
return err ;
}
2013-02-13 20:21:41 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
 * As setup_sigcontext(), but Transactional Memory is in use, so deliver
 * sigcontexts containing checkpointed and transactional register states.
 *
 * The treclaim is done before entering here and gathers both sets of
 * registers.  The 'normal' sigcontext @sc receives the rolled-back
 * (checkpointed) values, so a simple signal handler sees a correct
 * checkpointed register state.  A TM-aware handler can examine the
 * transactional registers in the second sigcontext, @tm_sc, to determine the
 * real origin of the signal.
 *
 * @tsk must be the current task and @msr must have a transaction active
 * (both are BUG otherwise).  @msr is supplied by the caller rather than read
 * from the thread: the thread's MSR[TS] is cleared right after the reclaim in
 * get_tm_stackpointer(), so the caller saves the earlier MSR value and passes
 * it in to be exposed to the signal handler.
 *
 * Returns the OR of the results of every user-space write; it is zero only
 * if all of them reported success.
 */
static long setup_tm_sigcontexts(struct sigcontext __user *sc,
				 struct sigcontext __user *tm_sc,
				 struct task_struct *tsk,
				 int signr, sigset_t *set, unsigned long handler,
				 unsigned long msr)
{
	/*
	 * With CONFIG_ALTIVEC the v_regs pointers are _always_ set up, even if
	 * the process has not used altivec yet (MSR_VEC is zero in the
	 * pt_regs of the context).  This matters because VRSAVE may have been
	 * written before the first vector operation and must not be lost.
	 * Userland shall check AT_HWCAP to know whether it can rely on the
	 * v_regs pointer or not.
	 */
#ifdef CONFIG_ALTIVEC
	elf_vrreg_t __user *ck_vmx = sigcontext_vmx_regs(sc);
	elf_vrreg_t __user *tm_vmx = sigcontext_vmx_regs(tm_sc);
#endif
	struct pt_regs *regs = tsk->thread.regs;
	long rc = 0;

	BUG_ON(tsk != current);
	BUG_ON(!MSR_TM_ACTIVE(msr));
	WARN_ON(tm_suspend_disabled);

	/*
	 * Restore the checkpointed FP, VEC and VSX bits from ckpt_regs: it
	 * holds the correct FP, VEC, VSX state after the transaction was
	 * treclaimed and giveup_all() was called on reclaiming.
	 */
	msr |= tsk->thread.ckpt_regs.msr & (MSR_FP | MSR_VEC | MSR_VSX);

#ifdef CONFIG_ALTIVEC
	rc |= __put_user(ck_vmx, &sc->v_regs);
	rc |= __put_user(tm_vmx, &tm_sc->v_regs);

	/* Save the altivec registers if the task ever used them. */
	if (tsk->thread.used_vr) {
		/* Copy 33 vec registers (vr0..31 and vscr) to the stack. */
		rc |= __copy_to_user(ck_vmx, &tsk->thread.ckvr_state,
				     33 * sizeof(vector128));

		/*
		 * If VEC was enabled there are valid transactional VRs too;
		 * otherwise they are a copy of the checkpointed VRs.
		 */
		if (msr & MSR_VEC) {
			rc |= __copy_to_user(tm_vmx, &tsk->thread.vr_state,
					     33 * sizeof(vector128));
		} else {
			rc |= __copy_to_user(tm_vmx, &tsk->thread.ckvr_state,
					     33 * sizeof(vector128));
		}

		/*
		 * MSR_VEC in the MSR stored in the frame tells the reader
		 * that sc->v_regs holds valid data.
		 */
		msr |= MSR_VEC;
	}

	/*
	 * vrsave is always written, whether or not altivec is in use.
	 */
	if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
		tsk->thread.ckvrsave = mfspr(SPRN_VRSAVE);
	}
	rc |= __put_user(tsk->thread.ckvrsave, (u32 __user *)&ck_vmx[33]);
	if (msr & MSR_VEC) {
		rc |= __put_user(tsk->thread.vrsave,
				 (u32 __user *)&tm_vmx[33]);
	} else {
		rc |= __put_user(tsk->thread.ckvrsave,
				 (u32 __user *)&tm_vmx[33]);
	}
#else /* CONFIG_ALTIVEC */
	rc |= __put_user(0, &sc->v_regs);
	rc |= __put_user(0, &tm_sc->v_regs);
#endif /* CONFIG_ALTIVEC */

	/*
	 * Copy the FP registers and fpscr: checkpointed values into sc, and
	 * into tm_sc either the transactional values (FP was enabled) or the
	 * checkpointed ones again.
	 */
	rc |= copy_ckfpr_to_user(&sc->fp_regs, tsk);
	if (msr & MSR_FP) {
		rc |= copy_fpr_to_user(&tm_sc->fp_regs, tsk);
	} else {
		rc |= copy_ckfpr_to_user(&tm_sc->fp_regs, tsk);
	}

#ifdef CONFIG_VSX
	/*
	 * The VSX low doublewords go into the areas that follow the VMX data,
	 * ELF_NVRREG elements past the start of each vector area.
	 */
	if (tsk->thread.used_vsr) {
		rc |= copy_ckvsx_to_user(ck_vmx + ELF_NVRREG, tsk);
		if (msr & MSR_VSX) {
			rc |= copy_vsx_to_user(tm_vmx + ELF_NVRREG, tsk);
		} else {
			rc |= copy_ckvsx_to_user(tm_vmx + ELF_NVRREG, tsk);
		}

		/* MSR_VSX in the frame marks the VSX areas as valid. */
		msr |= MSR_VSX;
	}
#endif /* CONFIG_VSX */

	rc |= __put_user(&sc->gp_regs, &sc->regs);
	rc |= __put_user(&tm_sc->gp_regs, &tm_sc->regs);
	WARN_ON(!FULL_REGS(regs));

	/* Live (transactional) GPRs go to tm_sc, checkpointed GPRs to sc. */
	rc |= __copy_to_user(&tm_sc->gp_regs, regs, GP_REGS_SIZE);
	rc |= __copy_to_user(&sc->gp_regs,
			     &tsk->thread.ckpt_regs, GP_REGS_SIZE);

	/* Both contexts carry the same adjusted MSR value. */
	rc |= __put_user(msr, &tm_sc->gp_regs[PT_MSR]);
	rc |= __put_user(msr, &sc->gp_regs[PT_MSR]);

	rc |= __put_user(signr, &sc->signal);
	rc |= __put_user(handler, &sc->handler);
	if (set) {
		rc |= __put_user(set->sig[0], &sc->oldmask);
	}

	return rc;
}
# endif
2005-04-17 02:20:36 +04:00
/*
* Restore the sigcontext from the signal frame .
*/
2016-09-23 09:18:12 +03:00
static long restore_sigcontext ( struct task_struct * tsk , sigset_t * set , int sig ,
2005-04-17 02:20:36 +04:00
struct sigcontext __user * sc )
{
# ifdef CONFIG_ALTIVEC
elf_vrreg_t __user * v_regs ;
# endif
unsigned long err = 0 ;
unsigned long save_r13 = 0 ;
unsigned long msr ;
2016-09-23 09:18:12 +03:00
struct pt_regs * regs = tsk - > thread . regs ;
2008-07-02 08:06:37 +04:00
# ifdef CONFIG_VSX
int i ;
# endif
2005-04-17 02:20:36 +04:00
2016-09-23 09:18:12 +03:00
BUG_ON ( tsk ! = current ) ;
2005-04-17 02:20:36 +04:00
/* If this is not a signal return, we preserve the TLS in r13 */
if ( ! sig )
save_r13 = regs - > gpr [ 13 ] ;
2008-06-27 10:18:27 +04:00
/* copy the GPRs */
err | = __copy_from_user ( regs - > gpr , sc - > gp_regs , sizeof ( regs - > gpr ) ) ;
err | = __get_user ( regs - > nip , & sc - > gp_regs [ PT_NIP ] ) ;
2006-06-07 10:14:40 +04:00
/* get MSR separately, transfer the LE bit if doing signal return */
err | = __get_user ( msr , & sc - > gp_regs [ PT_MSR ] ) ;
if ( sig )
regs - > msr = ( regs - > msr & ~ MSR_LE ) | ( msr & MSR_LE ) ;
2008-06-27 10:18:27 +04:00
err | = __get_user ( regs - > orig_gpr3 , & sc - > gp_regs [ PT_ORIG_R3 ] ) ;
err | = __get_user ( regs - > ctr , & sc - > gp_regs [ PT_CTR ] ) ;
err | = __get_user ( regs - > link , & sc - > gp_regs [ PT_LNK ] ) ;
err | = __get_user ( regs - > xer , & sc - > gp_regs [ PT_XER ] ) ;
err | = __get_user ( regs - > ccr , & sc - > gp_regs [ PT_CCR ] ) ;
2020-05-07 15:13:32 +03:00
/* Don't allow userspace to set SOFTE */
set_trap_norestart ( regs ) ;
2008-06-27 10:18:27 +04:00
err | = __get_user ( regs - > dar , & sc - > gp_regs [ PT_DAR ] ) ;
err | = __get_user ( regs - > dsisr , & sc - > gp_regs [ PT_DSISR ] ) ;
err | = __get_user ( regs - > result , & sc - > gp_regs [ PT_RESULT ] ) ;
2005-04-17 02:20:36 +04:00
if ( ! sig )
regs - > gpr [ 13 ] = save_r13 ;
if ( set ! = NULL )
err | = __get_user ( set - > sig [ 0 ] , & sc - > oldmask ) ;
2007-06-26 08:49:11 +04:00
/*
* Force reload of FP / VEC .
2016-09-23 09:18:12 +03:00
* This has to be done before copying stuff into tsk - > thread . fpr / vr
2007-06-26 08:49:11 +04:00
* for the reasons explained in the previous comment .
*/
2008-06-25 08:07:18 +04:00
regs - > msr & = ~ ( MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX ) ;
2007-06-26 08:49:11 +04:00
2005-04-17 02:20:36 +04:00
# ifdef CONFIG_ALTIVEC
err | = __get_user ( v_regs , & sc - > v_regs ) ;
if ( err )
return err ;
Remove 'type' argument from access_ok() function
Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument
of the user address range verification function since we got rid of the
old racy i386-only code to walk page tables by hand.
It existed because the original 80386 would not honor the write protect
bit when in kernel mode, so you had to do COW by hand before doing any
user access. But we haven't supported that in a long time, and these
days the 'type' argument is a purely historical artifact.
A discussion about extending 'user_access_begin()' to do the range
checking resulted this patch, because there is no way we're going to
move the old VERIFY_xyz interface to that model. And it's best done at
the end of the merge window when I've done most of my merges, so let's
just get this done once and for all.
This patch was mostly done with a sed-script, with manual fix-ups for
the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form.
There were a couple of notable cases:
- csky still had the old "verify_area()" name as an alias.
- the iter_iov code had magical hardcoded knowledge of the actual
values of VERIFY_{READ,WRITE} (not that they mattered, since nothing
really used it)
- microblaze used the type argument for a debug printout
but other than those oddities this should be a total no-op patch.
I tried to fix up all architectures, did fairly extensive grepping for
access_ok() uses, and the changes are trivial, but I may have missed
something. Any missed conversion should be trivially fixable, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 05:57:57 +03:00
if ( v_regs & & ! access_ok ( v_regs , 34 * sizeof ( vector128 ) ) )
2006-06-09 07:02:59 +04:00
return - EFAULT ;
2005-04-17 02:20:36 +04:00
/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
2016-07-26 11:06:01 +03:00
if ( v_regs ! = NULL & & ( msr & MSR_VEC ) ! = 0 ) {
2016-09-23 09:18:12 +03:00
err | = __copy_from_user ( & tsk - > thread . vr_state , v_regs ,
2005-04-17 02:20:36 +04:00
33 * sizeof ( vector128 ) ) ;
2016-09-23 09:18:12 +03:00
tsk - > thread . used_vr = true ;
} else if ( tsk - > thread . used_vr ) {
memset ( & tsk - > thread . vr_state , 0 , 33 * sizeof ( vector128 ) ) ;
2016-07-26 11:06:01 +03:00
}
2005-04-17 02:20:36 +04:00
/* Always get VRSAVE back */
2013-08-06 20:01:24 +04:00
if ( v_regs ! = NULL )
2016-09-23 09:18:12 +03:00
err | = __get_user ( tsk - > thread . vrsave , ( u32 __user * ) & v_regs [ 33 ] ) ;
2005-04-17 02:20:36 +04:00
else
2016-09-23 09:18:12 +03:00
tsk - > thread . vrsave = 0 ;
2013-08-05 08:13:16 +04:00
if ( cpu_has_feature ( CPU_FTR_ALTIVEC ) )
2016-09-23 09:18:12 +03:00
mtspr ( SPRN_VRSAVE , tsk - > thread . vrsave ) ;
2005-04-17 02:20:36 +04:00
# endif /* CONFIG_ALTIVEC */
2008-06-25 08:07:18 +04:00
/* restore floating point */
2016-09-23 09:18:12 +03:00
err | = copy_fpr_from_user ( tsk , & sc - > fp_regs ) ;
2008-07-02 08:06:37 +04:00
# ifdef CONFIG_VSX
2008-06-25 08:07:18 +04:00
/*
* Get additional VSX data . Update v_regs to point after the
* VMX data . Copy VSX low doubleword from userspace to local
* buffer for formatting , then into the taskstruct .
*/
v_regs + = ELF_NVRREG ;
2016-07-26 11:06:01 +03:00
if ( ( msr & MSR_VSX ) ! = 0 ) {
2016-09-23 09:18:12 +03:00
err | = copy_vsx_from_user ( tsk , v_regs ) ;
tsk - > thread . used_vsr = true ;
} else {
2008-07-02 08:06:37 +04:00
for ( i = 0 ; i < 32 ; i + + )
2016-09-23 09:18:12 +03:00
tsk - > thread . fp_state . fpr [ i ] [ TS_VSRLOWOFFSET ] = 0 ;
}
2008-06-25 08:07:18 +04:00
# endif
2005-04-17 02:20:36 +04:00
return err ;
}
2013-02-13 20:21:41 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
* Restore the two sigcontexts from the frame of a transactional processes .
*/
2016-09-23 09:18:12 +03:00
static long restore_tm_sigcontexts ( struct task_struct * tsk ,
2013-02-13 20:21:41 +04:00
struct sigcontext __user * sc ,
struct sigcontext __user * tm_sc )
{
# ifdef CONFIG_ALTIVEC
elf_vrreg_t __user * v_regs , * tm_v_regs ;
# endif
unsigned long err = 0 ;
unsigned long msr ;
2016-09-23 09:18:12 +03:00
struct pt_regs * regs = tsk - > thread . regs ;
2013-02-13 20:21:41 +04:00
# ifdef CONFIG_VSX
int i ;
# endif
2016-09-23 09:18:12 +03:00
BUG_ON ( tsk ! = current ) ;
2017-10-12 13:17:19 +03:00
if ( tm_suspend_disabled )
return - EINVAL ;
2013-02-13 20:21:41 +04:00
/* copy the GPRs */
err | = __copy_from_user ( regs - > gpr , tm_sc - > gp_regs , sizeof ( regs - > gpr ) ) ;
2016-09-23 09:18:12 +03:00
err | = __copy_from_user ( & tsk - > thread . ckpt_regs , sc - > gp_regs ,
2013-02-13 20:21:41 +04:00
sizeof ( regs - > gpr ) ) ;
/*
* TFHAR is restored from the checkpointed ' wound - back ' ucontext ' s NIP .
* TEXASR was set by the signal delivery reclaim , as was TFIAR .
* Users doing anything abhorrent like thread - switching w / signals for
* TM - Suspended code will have to back TEXASR / TFIAR up themselves .
* For the case of getting a signal and simply returning from it ,
* we don ' t need to re - copy them here .
*/
err | = __get_user ( regs - > nip , & tm_sc - > gp_regs [ PT_NIP ] ) ;
2016-09-23 09:18:12 +03:00
err | = __get_user ( tsk - > thread . tm_tfhar , & sc - > gp_regs [ PT_NIP ] ) ;
2013-02-13 20:21:41 +04:00
/* get MSR separately, transfer the LE bit if doing signal return */
err | = __get_user ( msr , & sc - > gp_regs [ PT_MSR ] ) ;
2015-11-19 07:44:44 +03:00
/* Don't allow reserved mode. */
if ( MSR_TM_RESV ( msr ) )
return - EINVAL ;
2013-06-09 15:23:19 +04:00
/* pull in MSR LE from user context */
2013-02-13 20:21:41 +04:00
regs - > msr = ( regs - > msr & ~ MSR_LE ) | ( msr & MSR_LE ) ;
/* The following non-GPR non-FPR non-VR state is also checkpointed: */
err | = __get_user ( regs - > ctr , & tm_sc - > gp_regs [ PT_CTR ] ) ;
err | = __get_user ( regs - > link , & tm_sc - > gp_regs [ PT_LNK ] ) ;
err | = __get_user ( regs - > xer , & tm_sc - > gp_regs [ PT_XER ] ) ;
err | = __get_user ( regs - > ccr , & tm_sc - > gp_regs [ PT_CCR ] ) ;
2016-09-23 09:18:12 +03:00
err | = __get_user ( tsk - > thread . ckpt_regs . ctr ,
2013-02-13 20:21:41 +04:00
& sc - > gp_regs [ PT_CTR ] ) ;
2016-09-23 09:18:12 +03:00
err | = __get_user ( tsk - > thread . ckpt_regs . link ,
2013-02-13 20:21:41 +04:00
& sc - > gp_regs [ PT_LNK ] ) ;
2016-09-23 09:18:12 +03:00
err | = __get_user ( tsk - > thread . ckpt_regs . xer ,
2013-02-13 20:21:41 +04:00
& sc - > gp_regs [ PT_XER ] ) ;
2016-09-23 09:18:12 +03:00
err | = __get_user ( tsk - > thread . ckpt_regs . ccr ,
2013-02-13 20:21:41 +04:00
& sc - > gp_regs [ PT_CCR ] ) ;
2020-05-07 15:13:32 +03:00
/* Don't allow userspace to set SOFTE */
set_trap_norestart ( regs ) ;
2013-02-13 20:21:41 +04:00
/* These regs are not checkpointed; they can go in 'regs'. */
err | = __get_user ( regs - > dar , & sc - > gp_regs [ PT_DAR ] ) ;
err | = __get_user ( regs - > dsisr , & sc - > gp_regs [ PT_DSISR ] ) ;
err | = __get_user ( regs - > result , & sc - > gp_regs [ PT_RESULT ] ) ;
/*
* Force reload of FP / VEC .
2016-09-23 09:18:12 +03:00
* This has to be done before copying stuff into tsk - > thread . fpr / vr
2013-02-13 20:21:41 +04:00
* for the reasons explained in the previous comment .
*/
regs - > msr & = ~ ( MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX ) ;
# ifdef CONFIG_ALTIVEC
err | = __get_user ( v_regs , & sc - > v_regs ) ;
err | = __get_user ( tm_v_regs , & tm_sc - > v_regs ) ;
if ( err )
return err ;
Remove 'type' argument from access_ok() function
Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument
of the user address range verification function since we got rid of the
old racy i386-only code to walk page tables by hand.
It existed because the original 80386 would not honor the write protect
bit when in kernel mode, so you had to do COW by hand before doing any
user access. But we haven't supported that in a long time, and these
days the 'type' argument is a purely historical artifact.
A discussion about extending 'user_access_begin()' to do the range
checking resulted this patch, because there is no way we're going to
move the old VERIFY_xyz interface to that model. And it's best done at
the end of the merge window when I've done most of my merges, so let's
just get this done once and for all.
This patch was mostly done with a sed-script, with manual fix-ups for
the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form.
There were a couple of notable cases:
- csky still had the old "verify_area()" name as an alias.
- the iter_iov code had magical hardcoded knowledge of the actual
values of VERIFY_{READ,WRITE} (not that they mattered, since nothing
really used it)
- microblaze used the type argument for a debug printout
but other than those oddities this should be a total no-op patch.
I tried to fix up all architectures, did fairly extensive grepping for
access_ok() uses, and the changes are trivial, but I may have missed
something. Any missed conversion should be trivially fixable, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 05:57:57 +03:00
if ( v_regs & & ! access_ok ( v_regs , 34 * sizeof ( vector128 ) ) )
2013-02-13 20:21:41 +04:00
return - EFAULT ;
Remove 'type' argument from access_ok() function
Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument
of the user address range verification function since we got rid of the
old racy i386-only code to walk page tables by hand.
It existed because the original 80386 would not honor the write protect
bit when in kernel mode, so you had to do COW by hand before doing any
user access. But we haven't supported that in a long time, and these
days the 'type' argument is a purely historical artifact.
A discussion about extending 'user_access_begin()' to do the range
checking resulted this patch, because there is no way we're going to
move the old VERIFY_xyz interface to that model. And it's best done at
the end of the merge window when I've done most of my merges, so let's
just get this done once and for all.
This patch was mostly done with a sed-script, with manual fix-ups for
the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form.
There were a couple of notable cases:
- csky still had the old "verify_area()" name as an alias.
- the iter_iov code had magical hardcoded knowledge of the actual
values of VERIFY_{READ,WRITE} (not that they mattered, since nothing
really used it)
- microblaze used the type argument for a debug printout
but other than those oddities this should be a total no-op patch.
I tried to fix up all architectures, did fairly extensive grepping for
access_ok() uses, and the changes are trivial, but I may have missed
something. Any missed conversion should be trivially fixable, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 05:57:57 +03:00
if ( tm_v_regs & & ! access_ok ( tm_v_regs , 34 * sizeof ( vector128 ) ) )
2013-02-13 20:21:41 +04:00
return - EFAULT ;
/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
2013-08-06 20:01:24 +04:00
if ( v_regs ! = NULL & & tm_v_regs ! = NULL & & ( msr & MSR_VEC ) ! = 0 ) {
2016-09-23 09:18:25 +03:00
err | = __copy_from_user ( & tsk - > thread . ckvr_state , v_regs ,
2013-02-13 20:21:41 +04:00
33 * sizeof ( vector128 ) ) ;
2016-09-23 09:18:24 +03:00
err | = __copy_from_user ( & tsk - > thread . vr_state , tm_v_regs ,
2013-02-13 20:21:41 +04:00
33 * sizeof ( vector128 ) ) ;
2016-07-26 11:06:01 +03:00
current - > thread . used_vr = true ;
2013-02-13 20:21:41 +04:00
}
2016-09-23 09:18:12 +03:00
else if ( tsk - > thread . used_vr ) {
memset ( & tsk - > thread . vr_state , 0 , 33 * sizeof ( vector128 ) ) ;
2016-09-23 09:18:25 +03:00
memset ( & tsk - > thread . ckvr_state , 0 , 33 * sizeof ( vector128 ) ) ;
2013-02-13 20:21:41 +04:00
}
/* Always get VRSAVE back */
2013-08-06 20:01:24 +04:00
if ( v_regs ! = NULL & & tm_v_regs ! = NULL ) {
2016-09-23 09:18:25 +03:00
err | = __get_user ( tsk - > thread . ckvrsave ,
2016-09-23 09:18:24 +03:00
( u32 __user * ) & v_regs [ 33 ] ) ;
err | = __get_user ( tsk - > thread . vrsave ,
2013-02-13 20:21:41 +04:00
( u32 __user * ) & tm_v_regs [ 33 ] ) ;
}
else {
2016-09-23 09:18:12 +03:00
tsk - > thread . vrsave = 0 ;
2016-09-23 09:18:25 +03:00
tsk - > thread . ckvrsave = 0 ;
2013-02-13 20:21:41 +04:00
}
2013-08-05 08:13:16 +04:00
if ( cpu_has_feature ( CPU_FTR_ALTIVEC ) )
2016-09-23 09:18:12 +03:00
mtspr ( SPRN_VRSAVE , tsk - > thread . vrsave ) ;
2013-02-13 20:21:41 +04:00
# endif /* CONFIG_ALTIVEC */
/* restore floating point */
2016-09-23 09:18:24 +03:00
err | = copy_fpr_from_user ( tsk , & tm_sc - > fp_regs ) ;
2016-09-23 09:18:25 +03:00
err | = copy_ckfpr_from_user ( tsk , & sc - > fp_regs ) ;
2013-02-13 20:21:41 +04:00
# ifdef CONFIG_VSX
/*
* Get additional VSX data . Update v_regs to point after the
* VMX data . Copy VSX low doubleword from userspace to local
* buffer for formatting , then into the taskstruct .
*/
if ( v_regs & & ( ( msr & MSR_VSX ) ! = 0 ) ) {
v_regs + = ELF_NVRREG ;
tm_v_regs + = ELF_NVRREG ;
2016-09-23 09:18:24 +03:00
err | = copy_vsx_from_user ( tsk , tm_v_regs ) ;
2016-09-23 09:18:25 +03:00
err | = copy_ckvsx_from_user ( tsk , v_regs ) ;
2016-09-23 09:18:12 +03:00
tsk - > thread . used_vsr = true ;
2013-02-13 20:21:41 +04:00
} else {
for ( i = 0 ; i < 32 ; i + + ) {
2016-09-23 09:18:12 +03:00
tsk - > thread . fp_state . fpr [ i ] [ TS_VSRLOWOFFSET ] = 0 ;
2016-09-23 09:18:25 +03:00
tsk - > thread . ckfp_state . fpr [ i ] [ TS_VSRLOWOFFSET ] = 0 ;
2013-02-13 20:21:41 +04:00
}
}
# endif
tm_enable ( ) ;
2014-04-04 13:19:48 +04:00
/* Make sure the transaction is marked as failed */
2016-09-23 09:18:12 +03:00
tsk - > thread . tm_texasr | = TEXASR_FS ;
powerpc/tm: Set MSR[TS] just prior to recheckpoint
On a signal handler return, the user could set a context with MSR[TS] bits
set, and these bits would be copied to task regs->msr.
At restore_tm_sigcontexts(), after current task regs->msr[TS] bits are set,
several __get_user() are called and then a recheckpoint is executed.
This is a problem since a page fault (in kernel space) could happen when
calling __get_user(). If it happens, the process MSR[TS] bits were
already set, but recheckpoint was not executed, and SPRs are still invalid.
The page fault can cause the current process to be de-scheduled, with
MSR[TS] active and without tm_recheckpoint() being called. More
importantly, without TEXASR[FS] bit set also.
Since TEXASR might not have the FS bit set, and when the process is
scheduled back, it will try to reclaim, which will be aborted because of
the CPU is not in the suspended state, and, then, recheckpoint. This
recheckpoint will restore thread->texasr into TEXASR SPR, which might be
zero, hitting a BUG_ON().
kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434!
cpu 0xb: Vector: 700 (Program Check) at [c00000041f1576d0]
pc: c000000000054550: restore_gprs+0xb0/0x180
lr: 0000000000000000
sp: c00000041f157950
msr: 8000000100021033
current = 0xc00000041f143000
paca = 0xc00000000fb86300 softe: 0 irq_happened: 0x01
pid = 1021, comm = kworker/11:1
kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434!
Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
enter ? for help
[c00000041f157b30] c00000000001bc3c tm_recheckpoint.part.11+0x6c/0xa0
[c00000041f157b70] c00000000001d184 __switch_to+0x1e4/0x4c0
[c00000041f157bd0] c00000000082eeb8 __schedule+0x2f8/0x990
[c00000041f157cb0] c00000000082f598 schedule+0x48/0xc0
[c00000041f157ce0] c0000000000f0d28 worker_thread+0x148/0x610
[c00000041f157d80] c0000000000f96b0 kthread+0x120/0x140
[c00000041f157e30] c00000000000c0e0 ret_from_kernel_thread+0x5c/0x7c
This patch simply delays the MSR[TS] set, so, if there is any page fault in
the __get_user() section, it does not have regs->msr[TS] set, since the TM
structures are still invalid, thus avoiding doing TM operations for
in-kernel exceptions and possible process reschedule.
With this patch, the MSR[TS] will only be set just before recheckpointing
and setting TEXASR[FS] = 1, thus avoiding an interrupt with TM registers in
invalid state.
Other than that, if CONFIG_PREEMPT is set, there might be a preemption just
after setting MSR[TS] and before tm_recheckpoint(), thus, this block must
be atomic from a preemption perspective, thus, calling
preempt_disable/enable() on this code.
It is not possible to move tm_recheckpoint to happen earlier, because it is
required to get the checkpointed registers from userspace, with
__get_user(), thus, the only way to avoid this undesired behavior is
delaying the MSR[TS] set.
The 32-bits signal handler seems to be safe this current issue, but, it
might be exposed to the preemption issue, thus, disabling preemption in
this chunk of code.
Changes from v2:
* Run the critical section with preempt_disable.
Fixes: 87b4e5393af7 ("powerpc/tm: Fix return of active 64bit signals")
Cc: stable@vger.kernel.org (v3.9+)
Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-21 22:21:09 +03:00
/*
* Disabling preemption , since it is unsafe to be preempted
* with MSR [ TS ] set without recheckpointing .
*/
preempt_disable ( ) ;
/* pull in MSR TS bits from user context */
powerpc/tm: Avoid machine crash on rt_sigreturn()
There is a kernel crash that happens if rt_sigreturn() is called inside
a transactional block.
This crash happens if the kernel hits an in-kernel page fault when
accessing userspace memory, usually through copy_ckvsx_to_user(). A
major page fault calls might_sleep() function, which can cause a task
reschedule. A task reschedule (switch_to()) reclaim and recheckpoint
the TM states, but, in the signal return path, the checkpointed memory
was already reclaimed, thus the exception stack has MSR that points to
MSR[TS]=0.
When the code returns from might_sleep() and a task reschedule
happened, then this task is returned with the memory recheckpointed,
and CPU MSR[TS] = suspended.
This means that there is a side effect at might_sleep() if it is
called with CPU MSR[TS] = 0 and the task has regs->msr[TS] != 0.
This side effect can cause a TM bad thing, since at the exception
entrance, the stack saves MSR[TS]=0, and this is what will be used at
RFID, but, the processor has MSR[TS] = Suspended, and this transition
will be invalid and a TM Bad thing will be raised, causing the
following crash:
Unexpected TM Bad Thing exception at c00000000000e9ec (msr 0x8000000302a03031) tm_scratch=800000010280b033
cpu 0xc: Vector: 700 (Program Check) at [c00000003ff1fd70]
pc: c00000000000e9ec: fast_exception_return+0x100/0x1bc
lr: c000000000032948: handle_rt_signal64+0xb8/0xaf0
sp: c0000004263ebc40
msr: 8000000302a03031
current = 0xc000000415050300
paca = 0xc00000003ffc4080 irqmask: 0x03 irq_happened: 0x01
pid = 25006, comm = sigfuz
Linux version 5.0.0-rc1-00001-g3bd6e94bec12 (breno@debian) (gcc version 8.2.0 (Debian 8.2.0-3)) #899 SMP Mon Jan 7 11:30:07 EST 2019
WARNING: exception is not recoverable, can't continue
enter ? for help
[c0000004263ebc40] c000000000032948 handle_rt_signal64+0xb8/0xaf0 (unreliable)
[c0000004263ebd30] c000000000022780 do_notify_resume+0x2f0/0x430
[c0000004263ebe20] c00000000000e844 ret_from_except_lite+0x70/0x74
--- Exception: c00 (System Call) at 00007fffbaac400c
SP (7fffeca90f40) is in userspace
The solution for this problem is running the sigreturn code with
regs->msr[TS] disabled, thus, avoiding hitting the side effect above.
This does not seem to be a problem since regs->msr will be replaced by
the ucontext value, so, it is being flushed already. In this case, it
is flushed earlier.
Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2019-01-16 19:47:44 +03:00
regs - > msr | = msr & MSR_TS_MASK ;
powerpc/tm: Set MSR[TS] just prior to recheckpoint
On a signal handler return, the user could set a context with MSR[TS] bits
set, and these bits would be copied to task regs->msr.
At restore_tm_sigcontexts(), after current task regs->msr[TS] bits are set,
several __get_user() are called and then a recheckpoint is executed.
This is a problem since a page fault (in kernel space) could happen when
calling __get_user(). If it happens, the process MSR[TS] bits were
already set, but recheckpoint was not executed, and SPRs are still invalid.
The page fault can cause the current process to be de-scheduled, with
MSR[TS] active and without tm_recheckpoint() being called. More
importantly, without TEXASR[FS] bit set also.
Since TEXASR might not have the FS bit set, and when the process is
scheduled back, it will try to reclaim, which will be aborted because of
the CPU is not in the suspended state, and, then, recheckpoint. This
recheckpoint will restore thread->texasr into TEXASR SPR, which might be
zero, hitting a BUG_ON().
kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434!
cpu 0xb: Vector: 700 (Program Check) at [c00000041f1576d0]
pc: c000000000054550: restore_gprs+0xb0/0x180
lr: 0000000000000000
sp: c00000041f157950
msr: 8000000100021033
current = 0xc00000041f143000
paca = 0xc00000000fb86300 softe: 0 irq_happened: 0x01
pid = 1021, comm = kworker/11:1
kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434!
Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
enter ? for help
[c00000041f157b30] c00000000001bc3c tm_recheckpoint.part.11+0x6c/0xa0
[c00000041f157b70] c00000000001d184 __switch_to+0x1e4/0x4c0
[c00000041f157bd0] c00000000082eeb8 __schedule+0x2f8/0x990
[c00000041f157cb0] c00000000082f598 schedule+0x48/0xc0
[c00000041f157ce0] c0000000000f0d28 worker_thread+0x148/0x610
[c00000041f157d80] c0000000000f96b0 kthread+0x120/0x140
[c00000041f157e30] c00000000000c0e0 ret_from_kernel_thread+0x5c/0x7c
This patch simply delays the MSR[TS] set, so, if there is any page fault in
the __get_user() section, it does not have regs->msr[TS] set, since the TM
structures are still invalid, thus avoiding doing TM operations for
in-kernel exceptions and possible process reschedule.
With this patch, the MSR[TS] will only be set just before recheckpointing
and setting TEXASR[FS] = 1, thus avoiding an interrupt with TM registers in
invalid state.
Other than that, if CONFIG_PREEMPT is set, there might be a preemption just
after setting MSR[TS] and before tm_recheckpoint(), thus, this block must
be atomic from a preemption perspective, thus, calling
preempt_disable/enable() on this code.
It is not possible to move tm_recheckpoint to happen earlier, because it is
required to get the checkpointed registers from userspace, with
__get_user(), thus, the only way to avoid this undesired behavior is
delaying the MSR[TS] set.
The 32-bits signal handler seems to be safe this current issue, but, it
might be exposed to the preemption issue, thus, disabling preemption in
this chunk of code.
Changes from v2:
* Run the critical section with preempt_disable.
Fixes: 87b4e5393af7 ("powerpc/tm: Fix return of active 64bit signals")
Cc: stable@vger.kernel.org (v3.9+)
Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-21 22:21:09 +03:00
/*
* Ensure that TM is enabled in regs - > msr before we leave the signal
* handler . It could be the case that ( a ) user disabled the TM bit
* through the manipulation of the MSR bits in uc_mcontext or ( b ) the
* TM bit was disabled because a sufficient number of context switches
* happened whilst in the signal handler and load_tm overflowed ,
* disabling the TM bit . In either case we can end up with an illegal
* TM state leading to a TM Bad Thing when we return to userspace .
*
* CAUTION :
* After regs - > MSR [ TS ] being updated , make sure that get_user ( ) ,
* put_user ( ) or similar functions are * not * called . These
* functions can generate page faults which will cause the process
* to be de - scheduled with MSR [ TS ] set but without calling
* tm_recheckpoint ( ) . This can cause a bug .
*/
regs - > msr | = MSR_TM ;
2013-02-13 20:21:41 +04:00
/* This loads the checkpointed FP/VEC state, if used */
2017-11-02 06:09:05 +03:00
tm_recheckpoint ( & tsk - > thread ) ;
2013-02-13 20:21:41 +04:00
2016-09-23 09:18:24 +03:00
msr_check_and_set ( msr & ( MSR_FP | MSR_VEC ) ) ;
2013-02-13 20:21:41 +04:00
if ( msr & MSR_FP ) {
2016-09-23 09:18:24 +03:00
load_fp_state ( & tsk - > thread . fp_state ) ;
2016-09-23 09:18:12 +03:00
regs - > msr | = ( MSR_FP | tsk - > thread . fpexc_mode ) ;
2013-02-13 20:21:41 +04:00
}
if ( msr & MSR_VEC ) {
2016-09-23 09:18:24 +03:00
load_vr_state ( & tsk - > thread . vr_state ) ;
2013-02-13 20:21:41 +04:00
regs - > msr | = MSR_VEC ;
}
powerpc/tm: Set MSR[TS] just prior to recheckpoint
On a signal handler return, the user could set a context with MSR[TS] bits
set, and these bits would be copied to task regs->msr.
At restore_tm_sigcontexts(), after current task regs->msr[TS] bits are set,
several __get_user() are called and then a recheckpoint is executed.
This is a problem since a page fault (in kernel space) could happen when
calling __get_user(). If it happens, the process MSR[TS] bits were
already set, but recheckpoint was not executed, and SPRs are still invalid.
The page fault can cause the current process to be de-scheduled, with
MSR[TS] active and without tm_recheckpoint() being called. More
importantly, without TEXASR[FS] bit set also.
Since TEXASR might not have the FS bit set, and when the process is
scheduled back, it will try to reclaim, which will be aborted because of
the CPU is not in the suspended state, and, then, recheckpoint. This
recheckpoint will restore thread->texasr into TEXASR SPR, which might be
zero, hitting a BUG_ON().
kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434!
cpu 0xb: Vector: 700 (Program Check) at [c00000041f1576d0]
pc: c000000000054550: restore_gprs+0xb0/0x180
lr: 0000000000000000
sp: c00000041f157950
msr: 8000000100021033
current = 0xc00000041f143000
paca = 0xc00000000fb86300 softe: 0 irq_happened: 0x01
pid = 1021, comm = kworker/11:1
kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434!
Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
enter ? for help
[c00000041f157b30] c00000000001bc3c tm_recheckpoint.part.11+0x6c/0xa0
[c00000041f157b70] c00000000001d184 __switch_to+0x1e4/0x4c0
[c00000041f157bd0] c00000000082eeb8 __schedule+0x2f8/0x990
[c00000041f157cb0] c00000000082f598 schedule+0x48/0xc0
[c00000041f157ce0] c0000000000f0d28 worker_thread+0x148/0x610
[c00000041f157d80] c0000000000f96b0 kthread+0x120/0x140
[c00000041f157e30] c00000000000c0e0 ret_from_kernel_thread+0x5c/0x7c
This patch simply delays the MSR[TS] set, so, if there is any page fault in
the __get_user() section, it does not have regs->msr[TS] set, since the TM
structures are still invalid, thus avoiding doing TM operations for
in-kernel exceptions and possible process reschedule.
With this patch, the MSR[TS] will only be set just before recheckpointing
and setting TEXASR[FS] = 1, thus avoiding an interrupt with TM registers in
invalid state.
Other than that, if CONFIG_PREEMPT is set, there might be a preemption just
after setting MSR[TS] and before tm_recheckpoint(), thus, this block must
be atomic from a preemption perspective, thus, calling
preempt_disable/enable() on this code.
It is not possible to move tm_recheckpoint to happen earlier, because it is
required to get the checkpointed registers from userspace, with
__get_user(), thus, the only way to avoid this undesired behavior is
delaying the MSR[TS] set.
The 32-bits signal handler seems to be safe this current issue, but, it
might be exposed to the preemption issue, thus, disabling preemption in
this chunk of code.
Changes from v2:
* Run the critical section with preempt_disable.
Fixes: 87b4e5393af7 ("powerpc/tm: Fix return of active 64bit signals")
Cc: stable@vger.kernel.org (v3.9+)
Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-21 22:21:09 +03:00
preempt_enable ( ) ;
2013-02-13 20:21:41 +04:00
return err ;
}
# endif
2005-04-17 02:20:36 +04:00
/*
* Setup the trampoline code on the stack
*/
static long setup_trampoline ( unsigned int syscall , unsigned int __user * tramp )
{
int i ;
long err = 0 ;
powerpc/64/signal: Balance return predictor stack in signal trampoline
Returning from an interrupt or syscall to a signal handler currently
begins execution directly at the handler's entry point, with LR set to
the address of the sigreturn trampoline. When the signal handler
function returns, it runs the trampoline. It looks like this:
# interrupt at user address xyz
# kernel stuff... signal is raised
rfid
# void handler(int sig)
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
mflr 0
std 0,16(1)
stdu 1,-96(1)
# handler stuff
ld 0,16(1)
mtlr 0
blr
# __kernel_sigtramp_rt64
addi r1,r1,__SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
# kernel executes rt_sigreturn
rfid
# back to user address xyz
Note the blr with no matching bl. This can corrupt the return
predictor.
Solve this by instead resuming execution at the signal trampoline
which then calls the signal handler. qtrace-tools link_stack checker
confirms the entire user/kernel/vdso cycle is balanced after this
patch, whereas it's not upstream.
Alan confirms the dwarf unwind info still looks good. gdb still
recognises the signal frame and can step into parent frames if it
break inside a signal handler.
Performance is pretty noisy, not a very significant change on a POWER9
here, but branch misses are consistently a lot lower on a
microbenchmark:
Performance counter stats for './signal':
13,085.72 msec task-clock # 1.000 CPUs utilized
45,024,760,101 cycles # 3.441 GHz
65,102,895,542 instructions # 1.45 insn per cycle
11,271,673,787 branches # 861.372 M/sec
59,468,979 branch-misses # 0.53% of all branches
12,989.09 msec task-clock # 1.000 CPUs utilized
44,692,719,559 cycles # 3.441 GHz
65,109,984,964 instructions # 1.46 insn per cycle
11,282,136,057 branches # 868.585 M/sec
39,786,942 branch-misses # 0.35% of all branches
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200511101952.1463138-1-npiggin@gmail.com
2020-05-11 13:19:52 +03:00
/* bctrl # call the handler */
err | = __put_user ( PPC_INST_BCTRL , & tramp [ 0 ] ) ;
2005-04-17 02:20:36 +04:00
/* addi r1, r1, __SIGNAL_FRAMESIZE # Pop the dummy stackframe */
2018-11-09 20:33:28 +03:00
err | = __put_user ( PPC_INST_ADDI | __PPC_RT ( R1 ) | __PPC_RA ( R1 ) |
powerpc/64/signal: Balance return predictor stack in signal trampoline
Returning from an interrupt or syscall to a signal handler currently
begins execution directly at the handler's entry point, with LR set to
the address of the sigreturn trampoline. When the signal handler
function returns, it runs the trampoline. It looks like this:
# interrupt at user address xyz
# kernel stuff... signal is raised
rfid
# void handler(int sig)
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
mflr 0
std 0,16(1)
stdu 1,-96(1)
# handler stuff
ld 0,16(1)
mtlr 0
blr
# __kernel_sigtramp_rt64
addi r1,r1,__SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
# kernel executes rt_sigreturn
rfid
# back to user address xyz
Note the blr with no matching bl. This can corrupt the return
predictor.
Solve this by instead resuming execution at the signal trampoline
which then calls the signal handler. qtrace-tools link_stack checker
confirms the entire user/kernel/vdso cycle is balanced after this
patch, whereas it's not upstream.
Alan confirms the dwarf unwind info still looks good. gdb still
recognises the signal frame and can step into parent frames if it
break inside a signal handler.
Performance is pretty noisy, not a very significant change on a POWER9
here, but branch misses are consistently a lot lower on a
microbenchmark:
Performance counter stats for './signal':
13,085.72 msec task-clock # 1.000 CPUs utilized
45,024,760,101 cycles # 3.441 GHz
65,102,895,542 instructions # 1.45 insn per cycle
11,271,673,787 branches # 861.372 M/sec
59,468,979 branch-misses # 0.53% of all branches
12,989.09 msec task-clock # 1.000 CPUs utilized
44,692,719,559 cycles # 3.441 GHz
65,109,984,964 instructions # 1.46 insn per cycle
11,282,136,057 branches # 868.585 M/sec
39,786,942 branch-misses # 0.35% of all branches
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200511101952.1463138-1-npiggin@gmail.com
2020-05-11 13:19:52 +03:00
( __SIGNAL_FRAMESIZE & 0xffff ) , & tramp [ 1 ] ) ;
2005-04-17 02:20:36 +04:00
/* li r0, __NR_[rt_]sigreturn| */
powerpc/64/signal: Balance return predictor stack in signal trampoline
Returning from an interrupt or syscall to a signal handler currently
begins execution directly at the handler's entry point, with LR set to
the address of the sigreturn trampoline. When the signal handler
function returns, it runs the trampoline. It looks like this:
# interrupt at user address xyz
# kernel stuff... signal is raised
rfid
# void handler(int sig)
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
mflr 0
std 0,16(1)
stdu 1,-96(1)
# handler stuff
ld 0,16(1)
mtlr 0
blr
# __kernel_sigtramp_rt64
addi r1,r1,__SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
# kernel executes rt_sigreturn
rfid
# back to user address xyz
Note the blr with no matching bl. This can corrupt the return
predictor.
Solve this by instead resuming execution at the signal trampoline
which then calls the signal handler. qtrace-tools link_stack checker
confirms the entire user/kernel/vdso cycle is balanced after this
patch, whereas it's not upstream.
Alan confirms the dwarf unwind info still looks good. gdb still
recognises the signal frame and can step into parent frames if it
break inside a signal handler.
Performance is pretty noisy, not a very significant change on a POWER9
here, but branch misses are consistently a lot lower on a
microbenchmark:
Performance counter stats for './signal':
13,085.72 msec task-clock # 1.000 CPUs utilized
45,024,760,101 cycles # 3.441 GHz
65,102,895,542 instructions # 1.45 insn per cycle
11,271,673,787 branches # 861.372 M/sec
59,468,979 branch-misses # 0.53% of all branches
12,989.09 msec task-clock # 1.000 CPUs utilized
44,692,719,559 cycles # 3.441 GHz
65,109,984,964 instructions # 1.46 insn per cycle
11,282,136,057 branches # 868.585 M/sec
39,786,942 branch-misses # 0.35% of all branches
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200511101952.1463138-1-npiggin@gmail.com
2020-05-11 13:19:52 +03:00
err | = __put_user ( PPC_INST_ADDI | ( syscall & 0xffff ) , & tramp [ 2 ] ) ;
2005-04-17 02:20:36 +04:00
/* sc */
powerpc/64/signal: Balance return predictor stack in signal trampoline
Returning from an interrupt or syscall to a signal handler currently
begins execution directly at the handler's entry point, with LR set to
the address of the sigreturn trampoline. When the signal handler
function returns, it runs the trampoline. It looks like this:
# interrupt at user address xyz
# kernel stuff... signal is raised
rfid
# void handler(int sig)
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
mflr 0
std 0,16(1)
stdu 1,-96(1)
# handler stuff
ld 0,16(1)
mtlr 0
blr
# __kernel_sigtramp_rt64
addi r1,r1,__SIGNAL_FRAMESIZE
li r0,__NR_rt_sigreturn
sc
# kernel executes rt_sigreturn
rfid
# back to user address xyz
Note the blr with no matching bl. This can corrupt the return
predictor.
Solve this by instead resuming execution at the signal trampoline
which then calls the signal handler. qtrace-tools link_stack checker
confirms the entire user/kernel/vdso cycle is balanced after this
patch, whereas it's not upstream.
Alan confirms the dwarf unwind info still looks good. gdb still
recognises the signal frame and can step into parent frames if it
breaks inside a signal handler.
Performance is pretty noisy, not a very significant change on a POWER9
here, but branch misses are consistently a lot lower on a
microbenchmark:
Performance counter stats for './signal':
13,085.72 msec task-clock # 1.000 CPUs utilized
45,024,760,101 cycles # 3.441 GHz
65,102,895,542 instructions # 1.45 insn per cycle
11,271,673,787 branches # 861.372 M/sec
59,468,979 branch-misses # 0.53% of all branches
12,989.09 msec task-clock # 1.000 CPUs utilized
44,692,719,559 cycles # 3.441 GHz
65,109,984,964 instructions # 1.46 insn per cycle
11,282,136,057 branches # 868.585 M/sec
39,786,942 branch-misses # 0.35% of all branches
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200511101952.1463138-1-npiggin@gmail.com
2020-05-11 13:19:52 +03:00
err | = __put_user ( PPC_INST_SC , & tramp [ 3 ] ) ;
2005-04-17 02:20:36 +04:00
/* Minimal traceback info */
for ( i = TRAMP_TRACEBACK ; i < TRAMP_SIZE ; i + + )
err | = __put_user ( 0 , & tramp [ i ] ) ;
if ( ! err )
flush_icache_range ( ( unsigned long ) & tramp [ 0 ] ,
( unsigned long ) & tramp [ TRAMP_SIZE ] ) ;
return err ;
}
2008-07-08 12:43:41 +04:00
/*
* Size of a struct ucontext that lacks the VSX area added at its end :
* sizeof ( struct ucontext ) minus 32 longs .
*
* Userspace code may pass a ucontext which does not include the VSX
* area , so a user - supplied context size has to be checked against
* this smaller size as well as against the full structure size .
*
* NOTE ( review ) : 32 * sizeof ( long ) is presumably the size of the
* trailing VSX area - confirm against the struct ucontext definition .
*/
# define UCONTEXTSIZEWITHOUTVSX \
( sizeof ( struct ucontext ) - 32 * sizeof ( long ) )
2005-04-17 02:20:36 +04:00
/*
* Handle { get , set , swap } _context operations
*
* If old_ctx is non - NULL , old_ctx - > uc_mcontext is filled in by
* setup_sigcontext ( ) for the current task and the blocked signal mask
* is copied to old_ctx - > uc_sigmask . If new_ctx is non - NULL , its
* signal mask is installed with set_current_blocked ( ) and its machine
* context is loaded with restore_sigcontext ( ) .
*
* ctx_size is the size of the ucontext structures supplied by
* userspace . It must be at least UCONTEXTSIZEWITHOUTVSX , and at least
* sizeof ( struct ucontext ) when the new context has MSR_VSX set .
*
* Returns 0 on success , - EINVAL for an unacceptable ctx_size and
* - EFAULT when a user pointer cannot be accessed . Once new_ctx has
* passed validation , a failure to copy it in ends the task through
* do_exit ( SIGSEGV ) instead of returning an error .
*/
2018-05-02 16:20:47 +03:00
SYSCALL_DEFINE3 ( swapcontext , struct ucontext __user * , old_ctx ,
struct ucontext __user * , new_ctx , long , ctx_size )
2005-04-17 02:20:36 +04:00
{
sigset_t set ;
2008-07-08 12:43:41 +04:00
unsigned long new_msr = 0 ;
2008-10-23 04:42:36 +04:00
int ctx_has_vsx_region = 0 ;
2005-04-17 02:20:36 +04:00
2008-07-08 12:43:41 +04:00
/* Fetch the MSR of the new context early : the VSX size check below needs it . */
if ( new_ctx & &
2008-10-23 04:42:36 +04:00
get_user ( new_msr , & new_ctx - > uc_mcontext . gp_regs [ PT_MSR ] ) )
2008-07-08 12:43:41 +04:00
return - EFAULT ;
/*
* Check that the context is not smaller than the original
* size ( with VMX but without VSX )
2005-04-17 02:20:36 +04:00
*/
2008-07-08 12:43:41 +04:00
if ( ctx_size < UCONTEXTSIZEWITHOUTVSX )
2005-04-17 02:20:36 +04:00
return - EINVAL ;
2008-07-08 12:43:41 +04:00
/*
* Reject a new context whose MSR sets the VSX bit while the
* supplied size leaves no room for the VSX state .
*/
if ( ( ctx_size < sizeof ( struct ucontext ) ) & &
( new_msr & MSR_VSX ) )
return - EINVAL ;
2008-10-23 04:42:36 +04:00
/* Does the context have enough room to store VSX data? */
if ( ctx_size > = sizeof ( struct ucontext ) )
ctx_has_vsx_region = 1 ;
2005-04-17 02:20:36 +04:00
/* Save the current context into old_ctx when one was supplied . */
if ( old_ctx ! = NULL ) {
Remove 'type' argument from access_ok() function
Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument
of the user address range verification function since we got rid of the
old racy i386-only code to walk page tables by hand.
It existed because the original 80386 would not honor the write protect
bit when in kernel mode, so you had to do COW by hand before doing any
user access. But we haven't supported that in a long time, and these
days the 'type' argument is a purely historical artifact.
A discussion about extending 'user_access_begin()' to do the range
checking resulted this patch, because there is no way we're going to
move the old VERIFY_xyz interface to that model. And it's best done at
the end of the merge window when I've done most of my merges, so let's
just get this done once and for all.
This patch was mostly done with a sed-script, with manual fix-ups for
the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form.
There were a couple of notable cases:
- csky still had the old "verify_area()" name as an alias.
- the iter_iov code had magical hardcoded knowledge of the actual
values of VERIFY_{READ,WRITE} (not that they mattered, since nothing
really used it)
- microblaze used the type argument for a debug printout
but other than those oddities this should be a total no-op patch.
I tried to fix up all architectures, did fairly extensive grepping for
access_ok() uses, and the changes are trivial, but I may have missed
something. Any missed conversion should be trivially fixable, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 05:57:57 +03:00
if ( ! access_ok ( old_ctx , ctx_size )
2016-09-23 09:18:12 +03:00
| | setup_sigcontext ( & old_ctx - > uc_mcontext , current , 0 , NULL , 0 ,
2008-10-23 04:42:36 +04:00
ctx_has_vsx_region )
2005-04-17 02:20:36 +04:00
| | __copy_to_user ( & old_ctx - > uc_sigmask ,
& current - > blocked , sizeof ( sigset_t ) ) )
return - EFAULT ;
}
/* Without a new context there is nothing to switch to . */
if ( new_ctx = = NULL )
return 0 ;
2020-07-07 21:32:25 +03:00
if ( ! access_ok ( new_ctx , ctx_size ) | |
fault_in_pages_readable ( ( u8 __user * ) new_ctx , ctx_size ) )
2005-04-17 02:20:36 +04:00
return - EFAULT ;
/*
* If we get a fault copying the context into the kernel ' s
* image of the user ' s registers , we can ' t just return - EFAULT
* because the user ' s registers will be corrupted . For instance
* the NIP value may have been updated but not some of the
* other registers . Given that we have done the access_ok
* and successfully read the first and last bytes of the region
* above , this should only happen in an out - of - memory situation
* or if another thread unmaps the region containing the context .
* We kill the task with a SIGSEGV in this situation .
*/
if ( __copy_from_user ( & set , & new_ctx - > uc_sigmask , sizeof ( set ) ) )
do_exit ( SIGSEGV ) ;
2012-04-27 22:09:19 +04:00
set_current_blocked ( & set ) ;
2016-09-23 09:18:12 +03:00
if ( restore_sigcontext ( current , NULL , 0 , & new_ctx - > uc_mcontext ) )
2005-04-17 02:20:36 +04:00
do_exit ( SIGSEGV ) ;
/* This returns like rt_sigreturn */
[PATCH] syscall entry/exit revamp
This cleanup patch speeds up the null syscall path on ppc64 by about 3%,
and brings the ppc32 and ppc64 code slightly closer together.
The ppc64 code was checking current_thread_info()->flags twice in the
syscall exit path; once for TIF_SYSCALL_T_OR_A before disabling
interrupts, and then again for TIF_SIGPENDING|TIF_NEED_RESCHED etc after
disabling interrupts. Now we do the same as ppc32 -- check the flags
only once in the fast path, and re-enable interrupts if necessary in the
ptrace case.
The patch abolishes the 'syscall_noerror' member of struct thread_info
and replaces it with a TIF_NOERROR bit in the flags, which is handled in
the slow path. This shortens the syscall entry code, which no longer
needs to clear syscall_noerror.
The patch adds a TIF_SAVE_NVGPRS flag which causes the syscall exit slow
path to save the non-volatile GPRs into a signal frame. This removes the
need for the assembly wrappers around sys_sigsuspend(),
sys_rt_sigsuspend(), et al which existed solely to save those registers
in advance. It also means I don't have to add new wrappers for ppoll()
and pselect(), which is what I was supposed to be doing when I got
distracted into this...
Finally, it unifies the ppc64 and ppc32 methods of handling syscall exit
directly into a signal handler (as required by sigsuspend et al) by
introducing a TIF_RESTOREALL flag which causes _all_ the registers to be
reloaded from the pt_regs by taking the ret_from_exception path, instead
of the normal syscall exit path which stomps on the callee-saved GPRs.
It appears to pass an LTP test run on ppc64, and passes basic testing on
ppc32 too. Brief tests of ptrace functionality with strace and gdb also
appear OK. I wouldn't send it to Linus for 2.6.15 just yet though :)
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
2005-11-15 21:52:18 +03:00
set_thread_flag ( TIF_RESTOREALL ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
/*
* Do a signal return ; undo the signal stack .
*
* The ucontext is read from the user address held in GPR1 of the
* saved registers . Its signal mask is installed first , then the
* machine context is restored : through restore_tm_sigcontexts ( ) when
* CONFIG_PPC_TRANSACTIONAL_MEM is set and the MSR saved in the context
* satisfies MSR_TM_ACTIVE ( ) , otherwise through restore_sigcontext ( ) .
* The alternate signal stack settings are restored last .
*
* Returns 0 in all cases . On a bad frame the task is sent SIGSEGV
* with force_sig ( ) , preceded by a rate - limited log message when
* show_unhandled_signals is set .
*/
2018-05-02 16:20:47 +03:00
SYSCALL_DEFINE0 ( rt_sigreturn )
2005-04-17 02:20:36 +04:00
{
2018-05-02 16:20:47 +03:00
struct pt_regs * regs = current_pt_regs ( ) ;
2005-04-17 02:20:36 +04:00
struct ucontext __user * uc = ( struct ucontext __user * ) regs - > gpr [ 1 ] ;
sigset_t set ;
2013-02-13 20:21:41 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
unsigned long msr ;
# endif
2005-04-17 02:20:36 +04:00
/* Always make any pending restarted system calls return -EINTR */
2015-02-13 02:01:14 +03:00
current - > restart_block . fn = do_no_restart_syscall ;
2005-04-17 02:20:36 +04:00
Remove 'type' argument from access_ok() function
Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument
of the user address range verification function since we got rid of the
old racy i386-only code to walk page tables by hand.
It existed because the original 80386 would not honor the write protect
bit when in kernel mode, so you had to do COW by hand before doing any
user access. But we haven't supported that in a long time, and these
days the 'type' argument is a purely historical artifact.
A discussion about extending 'user_access_begin()' to do the range
checking resulted this patch, because there is no way we're going to
move the old VERIFY_xyz interface to that model. And it's best done at
the end of the merge window when I've done most of my merges, so let's
just get this done once and for all.
This patch was mostly done with a sed-script, with manual fix-ups for
the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form.
There were a couple of notable cases:
- csky still had the old "verify_area()" name as an alias.
- the iter_iov code had magical hardcoded knowledge of the actual
values of VERIFY_{READ,WRITE} (not that they mattered, since nothing
really used it)
- microblaze used the type argument for a debug printout
but other than those oddities this should be a total no-op patch.
I tried to fix up all architectures, did fairly extensive grepping for
access_ok() uses, and the changes are trivial, but I may have missed
something. Any missed conversion should be trivially fixable, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 05:57:57 +03:00
if ( ! access_ok ( uc , sizeof ( * uc ) ) )
2005-04-17 02:20:36 +04:00
goto badframe ;
if ( __copy_from_user ( & set , & uc - > uc_sigmask , sizeof ( set ) ) )
goto badframe ;
2012-04-27 22:09:19 +04:00
set_current_blocked ( & set ) ;
powerpc: signals: Discard transaction state from signal frames
Userspace can begin and suspend a transaction within the signal
handler which means they might enter sys_rt_sigreturn() with the
processor in suspended state.
sys_rt_sigreturn() wants to restore process context (which may have
been in a transaction before signal delivery). To do this it must
restore TM SPRS. To achieve this, any transaction initiated within the
signal frame must be discarded in order to be able to restore TM SPRs
as TM SPRs can only be manipulated non-transactionally..
>From the PowerPC ISA:
TM Bad Thing Exception [Category: Transactional Memory]
An attempt is made to execute a mtspr targeting a TM register in
other than Non-transactional state.
Not doing so results in a TM Bad Thing:
[12045.221359] Kernel BUG at c000000000050a40 [verbose debug info unavailable]
[12045.221470] Unexpected TM Bad Thing exception at c000000000050a40 (msr 0x201033)
[12045.221540] Oops: Unrecoverable exception, sig: 6 [#1]
[12045.221586] SMP NR_CPUS=2048 NUMA PowerNV
[12045.221634] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE
nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4
xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ebtable_filter
ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables kvm_hv kvm
uio_pdrv_genirq ipmi_powernv uio powernv_rng ipmi_msghandler autofs4 ses enclosure
scsi_transport_sas bnx2x ipr mdio libcrc32c
[12045.222167] CPU: 68 PID: 6178 Comm: sigreturnpanic Not tainted 4.7.0 #34
[12045.222224] task: c0000000fce38600 ti: c0000000fceb4000 task.ti: c0000000fceb4000
[12045.222293] NIP: c000000000050a40 LR: c0000000000163bc CTR: 0000000000000000
[12045.222361] REGS: c0000000fceb7ac0 TRAP: 0700 Not tainted (4.7.0)
[12045.222418] MSR: 9000000300201033 <SF,HV,ME,IR,DR,RI,LE,TM[SE]> CR: 28444280 XER: 20000000
[12045.222625] CFAR: c0000000000163b8 SOFTE: 0 PACATMSCRATCH: 900000014280f033
GPR00: 01100000b8000001 c0000000fceb7d40 c00000000139c100 c0000000fce390d0
GPR04: 900000034280f033 0000000000000000 0000000000000000 0000000000000000
GPR08: 0000000000000000 b000000000001033 0000000000000001 0000000000000000
GPR12: 0000000000000000 c000000002926400 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR24: 0000000000000000 00003ffff98cadd0 00003ffff98cb470 0000000000000000
GPR28: 900000034280f033 c0000000fceb7ea0 0000000000000001 c0000000fce390d0
[12045.223535] NIP [c000000000050a40] tm_restore_sprs+0xc/0x1c
[12045.223584] LR [c0000000000163bc] tm_recheckpoint+0x5c/0xa0
[12045.223630] Call Trace:
[12045.223655] [c0000000fceb7d80] [c000000000026e74] sys_rt_sigreturn+0x494/0x6c0
[12045.223738] [c0000000fceb7e30] [c0000000000092e0] system_call+0x38/0x108
[12045.223806] Instruction dump:
[12045.223841] 7c800164 4e800020 7c0022a6 f80304a8 7c0222a6 f80304b0 7c0122a6 f80304b8
[12045.223955] 4e800020 e80304a8 7c0023a6 e80304b0 <7c0223a6> e80304b8 7c0123a6 4e800020
[12045.224074] ---[ end trace cb8002ee240bae76 ]---
It isn't clear exactly if there is really a use case for userspace
returning with a suspended transaction, however, doing so doesn't (on
its own) constitute a bad frame. As such, this patch simply discards
the transactional state of the context calling the sigreturn and
continues.
Reported-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Tested-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Reviewed-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Acked-by: Simon Guo <wei.guo.simon@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2016-08-23 03:46:17 +03:00
2013-02-13 20:21:41 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
powerpc: signals: Discard transaction state from signal frames
Userspace can begin and suspend a transaction within the signal
handler which means they might enter sys_rt_sigreturn() with the
processor in suspended state.
sys_rt_sigreturn() wants to restore process context (which may have
been in a transaction before signal delivery). To do this it must
restore TM SPRS. To achieve this, any transaction initiated within the
signal frame must be discarded in order to be able to restore TM SPRs
as TM SPRs can only be manipulated non-transactionally..
>From the PowerPC ISA:
TM Bad Thing Exception [Category: Transactional Memory]
An attempt is made to execute a mtspr targeting a TM register in
other than Non-transactional state.
Not doing so results in a TM Bad Thing:
[12045.221359] Kernel BUG at c000000000050a40 [verbose debug info unavailable]
[12045.221470] Unexpected TM Bad Thing exception at c000000000050a40 (msr 0x201033)
[12045.221540] Oops: Unrecoverable exception, sig: 6 [#1]
[12045.221586] SMP NR_CPUS=2048 NUMA PowerNV
[12045.221634] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE
nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4
xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ebtable_filter
ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables kvm_hv kvm
uio_pdrv_genirq ipmi_powernv uio powernv_rng ipmi_msghandler autofs4 ses enclosure
scsi_transport_sas bnx2x ipr mdio libcrc32c
[12045.222167] CPU: 68 PID: 6178 Comm: sigreturnpanic Not tainted 4.7.0 #34
[12045.222224] task: c0000000fce38600 ti: c0000000fceb4000 task.ti: c0000000fceb4000
[12045.222293] NIP: c000000000050a40 LR: c0000000000163bc CTR: 0000000000000000
[12045.222361] REGS: c0000000fceb7ac0 TRAP: 0700 Not tainted (4.7.0)
[12045.222418] MSR: 9000000300201033 <SF,HV,ME,IR,DR,RI,LE,TM[SE]> CR: 28444280 XER: 20000000
[12045.222625] CFAR: c0000000000163b8 SOFTE: 0 PACATMSCRATCH: 900000014280f033
GPR00: 01100000b8000001 c0000000fceb7d40 c00000000139c100 c0000000fce390d0
GPR04: 900000034280f033 0000000000000000 0000000000000000 0000000000000000
GPR08: 0000000000000000 b000000000001033 0000000000000001 0000000000000000
GPR12: 0000000000000000 c000000002926400 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR24: 0000000000000000 00003ffff98cadd0 00003ffff98cb470 0000000000000000
GPR28: 900000034280f033 c0000000fceb7ea0 0000000000000001 c0000000fce390d0
[12045.223535] NIP [c000000000050a40] tm_restore_sprs+0xc/0x1c
[12045.223584] LR [c0000000000163bc] tm_recheckpoint+0x5c/0xa0
[12045.223630] Call Trace:
[12045.223655] [c0000000fceb7d80] [c000000000026e74] sys_rt_sigreturn+0x494/0x6c0
[12045.223738] [c0000000fceb7e30] [c0000000000092e0] system_call+0x38/0x108
[12045.223806] Instruction dump:
[12045.223841] 7c800164 4e800020 7c0022a6 f80304a8 7c0222a6 f80304b0 7c0122a6 f80304b8
[12045.223955] 4e800020 e80304a8 7c0023a6 e80304b0 <7c0223a6> e80304b8 7c0123a6 4e800020
[12045.224074] ---[ end trace cb8002ee240bae76 ]---
It isn't clear exactly if there is really a use case for userspace
returning with a suspended transaction, however, doing so doesn't (on
its own) constitute a bad frame. As such, this patch simply discards
the transactional state of the context calling the sigreturn and
continues.
Reported-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Tested-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Reviewed-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Acked-by: Simon Guo <wei.guo.simon@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2016-08-23 03:46:17 +03:00
/*
* If there is a transactional state then throw it away .
* The purpose of a sigreturn is to destroy all traces of the
* signal frame , this includes any transactional state created
* within it . We only check for suspended as we can never be
* active in the kernel ; if we are active , there is nothing better to
* do than go ahead and Bad Thing later .
* The cause is not important as there will never be a
* recheckpoint so it ' s not user visible .
*/
if ( MSR_TM_SUSPENDED ( mfmsr ( ) ) )
tm_reclaim_current ( 0 ) ;
powerpc/tm: Avoid machine crash on rt_sigreturn()
There is a kernel crash that happens if rt_sigreturn() is called inside
a transactional block.
This crash happens if the kernel hits an in-kernel page fault when
accessing userspace memory, usually through copy_ckvsx_to_user(). A
major page fault calls might_sleep() function, which can cause a task
reschedule. A task reschedule (switch_to()) reclaim and recheckpoint
the TM states, but, in the signal return path, the checkpointed memory
was already reclaimed, thus the exception stack has MSR that points to
MSR[TS]=0.
When the code returns from might_sleep() and a task reschedule
happened, then this task is returned with the memory recheckpointed,
and CPU MSR[TS] = suspended.
This means that there is a side effect at might_sleep() if it is
called with CPU MSR[TS] = 0 and the task has regs->msr[TS] != 0.
This side effect can cause a TM bad thing, since at the exception
entrance, the stack saves MSR[TS]=0, and this is what will be used at
RFID, but, the processor has MSR[TS] = Suspended, and this transition
will be invalid and a TM Bad thing will be raised, causing the
following crash:
Unexpected TM Bad Thing exception at c00000000000e9ec (msr 0x8000000302a03031) tm_scratch=800000010280b033
cpu 0xc: Vector: 700 (Program Check) at [c00000003ff1fd70]
pc: c00000000000e9ec: fast_exception_return+0x100/0x1bc
lr: c000000000032948: handle_rt_signal64+0xb8/0xaf0
sp: c0000004263ebc40
msr: 8000000302a03031
current = 0xc000000415050300
paca = 0xc00000003ffc4080 irqmask: 0x03 irq_happened: 0x01
pid = 25006, comm = sigfuz
Linux version 5.0.0-rc1-00001-g3bd6e94bec12 (breno@debian) (gcc version 8.2.0 (Debian 8.2.0-3)) #899 SMP Mon Jan 7 11:30:07 EST 2019
WARNING: exception is not recoverable, can't continue
enter ? for help
[c0000004263ebc40] c000000000032948 handle_rt_signal64+0xb8/0xaf0 (unreliable)
[c0000004263ebd30] c000000000022780 do_notify_resume+0x2f0/0x430
[c0000004263ebe20] c00000000000e844 ret_from_except_lite+0x70/0x74
--- Exception: c00 (System Call) at 00007fffbaac400c
SP (7fffeca90f40) is in userspace
The solution for this problem is running the sigreturn code with
regs->msr[TS] disabled, thus, avoiding hitting the side effect above.
This does not seem to be a problem since regs->msr will be replaced by
the ucontext value, so, it is being flushed already. In this case, it
is flushed earlier.
Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2019-01-16 19:47:44 +03:00
/*
* Disable MSR [ TS ] bit also , so , if there is an exception in the
* code below ( as a page fault in copy_ckvsx_to_user ( ) ) , it does
* not recheckpoint this task if there was a context switch inside
* the exception .
*
* A major page fault can indirectly call schedule ( ) . A reschedule
* process in the middle of an exception can have a side effect
* ( Changing the CPU MSR [ TS ] state ) , since schedule ( ) is called
* with the CPU MSR [ TS ] disable and returns with MSR [ TS ] = Suspended
* ( switch_to ( ) calls tm_recheckpoint ( ) for the ' new ' process ) . In
* this case , the process continues to be the same in the CPU , but
* the CPU state just changed .
*
* This can cause a TM Bad Thing , since the MSR in the stack will
* have the MSR [ TS ] = 0 , and this is what will be used to RFID .
*
* Clearing MSR [ TS ] state here will avoid a recheckpoint if there
* is any process reschedule in kernel space . The MSR [ TS ] state
* does not need to be saved also , since it will be replaced with
* the MSR [ TS ] that came from user context later , at
* restore_tm_sigcontexts .
*/
regs - > msr & = ~ MSR_TS_MASK ;
2013-02-13 20:21:41 +04:00
if ( __get_user ( msr , & uc - > uc_mcontext . gp_regs [ PT_MSR ] ) )
goto badframe ;
2013-06-09 15:23:19 +04:00
if ( MSR_TM_ACTIVE ( msr ) ) {
2013-02-13 20:21:41 +04:00
/* We recheckpoint on return. */
struct ucontext __user * uc_transact ;
powerpc/tm: Fix oops on sigreturn on systems without TM
On systems like P9 powernv where we have no TM (or P8 booted with
ppc_tm=off), userspace can construct a signal context which still has
the MSR TS bits set. The kernel tries to restore this context which
results in the following crash:
Unexpected TM Bad Thing exception at c0000000000022fc (msr 0x8000000102a03031) tm_scratch=800000020280f033
Oops: Unrecoverable exception, sig: 6 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in:
CPU: 0 PID: 1636 Comm: sigfuz Not tainted 5.2.0-11043-g0a8ad0ffa4 #69
NIP: c0000000000022fc LR: 00007fffb2d67e48 CTR: 0000000000000000
REGS: c00000003fffbd70 TRAP: 0700 Not tainted (5.2.0-11045-g7142b497d8)
MSR: 8000000102a03031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[E]> CR: 42004242 XER: 00000000
CFAR: c0000000000022e0 IRQMASK: 0
GPR00: 0000000000000072 00007fffb2b6e560 00007fffb2d87f00 0000000000000669
GPR04: 00007fffb2b6e728 0000000000000000 0000000000000000 00007fffb2b6f2a8
GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR12: 0000000000000000 00007fffb2b76900 0000000000000000 0000000000000000
GPR16: 00007fffb2370000 00007fffb2d84390 00007fffea3a15ac 000001000a250420
GPR20: 00007fffb2b6f260 0000000010001770 0000000000000000 0000000000000000
GPR24: 00007fffb2d843a0 00007fffea3a14a0 0000000000010000 0000000000800000
GPR28: 00007fffea3a14d8 00000000003d0f00 0000000000000000 00007fffb2b6e728
NIP [c0000000000022fc] rfi_flush_fallback+0x7c/0x80
LR [00007fffb2d67e48] 0x7fffb2d67e48
Call Trace:
Instruction dump:
e96a0220 e96a02a8 e96a0330 e96a03b8 394a0400 4200ffdc 7d2903a6 e92d0c00
e94d0c08 e96d0c10 e82d0c18 7db242a6 <4c000024> 7db243a6 7db142a6 f82d0c18
The problem is the signal code assumes TM is enabled when
CONFIG_PPC_TRANSACTIONAL_MEM is enabled. This may not be the case as
with P9 powernv or if `ppc_tm=off` is used on P8.
This means any local user can crash the system.
Fix the problem by returning a bad stack frame to the user if they try
to set the MSR TS bits with sigreturn() on systems where TM is not
supported.
Found with sigfuz kernel selftest on P9.
This fixes CVE-2019-13648.
Fixes: 2b0a576d15e0 ("powerpc: Add new transactional memory state to the signal context")
Cc: stable@vger.kernel.org # v3.9
Reported-by: Praveen Pandey <Praveen.Pandey@in.ibm.com>
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20190719050502.405-1-mikey@neuling.org
2019-07-19 08:05:02 +03:00
/* Trying to start TM on non TM system */
if ( ! cpu_has_feature ( CPU_FTR_TM ) )
goto badframe ;
2013-02-13 20:21:41 +04:00
if ( __get_user ( uc_transact , & uc - > uc_link ) )
goto badframe ;
2016-09-23 09:18:12 +03:00
if ( restore_tm_sigcontexts ( current , & uc - > uc_mcontext ,
2013-02-13 20:21:41 +04:00
& uc_transact - > uc_mcontext ) )
goto badframe ;
2019-01-09 16:16:45 +03:00
} else
2013-02-13 20:21:41 +04:00
# endif
2019-01-09 16:16:45 +03:00
{
powerpc/tm: Unset MSR[TS] if not recheckpointing
There is a TM Bad Thing bug that can be caused when you return from a
signal context in a suspended transaction but with ucontext MSR[TS] unset.
This forces regs->msr[TS] to be set at syscall entrance (since the CPU
state is transactional). It also calls treclaim() to flush the transaction
state, which is done based on the live (mfmsr) MSR state.
Since user context MSR[TS] is not set, then restore_tm_sigcontexts() is not
called, thus, not executing recheckpoint, keeping the CPU state as not
transactional. When calling rfid, SRR1 will have MSR[TS] set, but the CPU
state is non transactional, causing the TM Bad Thing with the following
stack:
[ 33.862316] Bad kernel stack pointer 3fffd9dce3e0 at c00000000000c47c
cpu 0x8: Vector: 700 (Program Check) at [c00000003ff7fd40]
pc: c00000000000c47c: fast_exception_return+0xac/0xb4
lr: 00003fff865f442c
sp: 3fffd9dce3e0
msr: 8000000102a03031
current = 0xc00000041f68b700
paca = 0xc00000000fb84800 softe: 0 irq_happened: 0x01
pid = 1721, comm = tm-signal-sigre
Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
WARNING: exception is not recoverable, can't continue
The same problem happens on 32-bits signal handler, and the fix is very
similar, if tm_recheckpoint() is not executed, then regs->msr[TS] should be
zeroed.
This patch also fixes a sparse warning related to lack of indentation when
CONFIG_PPC_TRANSACTIONAL_MEM is set.
Fixes: 2b0a576d15e0e ("powerpc: Add new transactional memory state to the signal context")
CC: Stable <stable@vger.kernel.org> # 3.10+
Signed-off-by: Breno Leitao <leitao@debian.org>
Tested-by: Michal Suchánek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-26 23:12:00 +03:00
/*
2019-01-09 16:16:45 +03:00
* Fall through , for non - TM restore
*
powerpc/tm: Unset MSR[TS] if not recheckpointing
There is a TM Bad Thing bug that can be caused when you return from a
signal context in a suspended transaction but with ucontext MSR[TS] unset.
This forces regs->msr[TS] to be set at syscall entrance (since the CPU
state is transactional). It also calls treclaim() to flush the transaction
state, which is done based on the live (mfmsr) MSR state.
Since user context MSR[TS] is not set, then restore_tm_sigcontexts() is not
called, thus, not executing recheckpoint, keeping the CPU state as not
transactional. When calling rfid, SRR1 will have MSR[TS] set, but the CPU
state is non transactional, causing the TM Bad Thing with the following
stack:
[ 33.862316] Bad kernel stack pointer 3fffd9dce3e0 at c00000000000c47c
cpu 0x8: Vector: 700 (Program Check) at [c00000003ff7fd40]
pc: c00000000000c47c: fast_exception_return+0xac/0xb4
lr: 00003fff865f442c
sp: 3fffd9dce3e0
msr: 8000000102a03031
current = 0xc00000041f68b700
paca = 0xc00000000fb84800 softe: 0 irq_happened: 0x01
pid = 1721, comm = tm-signal-sigre
Linux version 4.9.0-3-powerpc64le (debian-kernel@lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
WARNING: exception is not recoverable, can't continue
The same problem happens on 32-bits signal handler, and the fix is very
similar, if tm_recheckpoint() is not executed, then regs->msr[TS] should be
zeroed.
This patch also fixes a sparse warning related to lack of indentation when
CONFIG_PPC_TRANSACTIONAL_MEM is set.
Fixes: 2b0a576d15e0e ("powerpc: Add new transactional memory state to the signal context")
CC: Stable <stable@vger.kernel.org> # 3.10+
Signed-off-by: Breno Leitao <leitao@debian.org>
Tested-by: Michal Suchánek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-11-26 23:12:00 +03:00
* Unset MSR [ TS ] on the thread regs since MSR from user
* context does not have MSR active , and recheckpoint was
* not called since restore_tm_sigcontexts ( ) was not called
* also .
*
* If not unsetting it , the code can RFID to userspace with
* MSR [ TS ] set , but without CPU in the proper state ,
* causing a TM bad thing .
*/
current - > thread . regs - > msr & = ~ MSR_TS_MASK ;
if ( restore_sigcontext ( current , NULL , 1 , & uc - > uc_mcontext ) )
goto badframe ;
}
2005-04-17 02:20:36 +04:00
2012-12-23 12:26:46 +04:00
if ( restore_altstack ( & uc - > uc_stack ) )
goto badframe ;
2005-04-17 02:20:36 +04:00
[PATCH] syscall entry/exit revamp
This cleanup patch speeds up the null syscall path on ppc64 by about 3%,
and brings the ppc32 and ppc64 code slightly closer together.
The ppc64 code was checking current_thread_info()->flags twice in the
syscall exit path; once for TIF_SYSCALL_T_OR_A before disabling
interrupts, and then again for TIF_SIGPENDING|TIF_NEED_RESCHED etc after
disabling interrupts. Now we do the same as ppc32 -- check the flags
only once in the fast path, and re-enable interrupts if necessary in the
ptrace case.
The patch abolishes the 'syscall_noerror' member of struct thread_info
and replaces it with a TIF_NOERROR bit in the flags, which is handled in
the slow path. This shortens the syscall entry code, which no longer
needs to clear syscall_noerror.
The patch adds a TIF_SAVE_NVGPRS flag which causes the syscall exit slow
path to save the non-volatile GPRs into a signal frame. This removes the
need for the assembly wrappers around sys_sigsuspend(),
sys_rt_sigsuspend(), et al which existed solely to save those registers
in advance. It also means I don't have to add new wrappers for ppoll()
and pselect(), which is what I was supposed to be doing when I got
distracted into this...
Finally, it unifies the ppc64 and ppc32 methods of handling syscall exit
directly into a signal handler (as required by sigsuspend et al) by
introducing a TIF_RESTOREALL flag which causes _all_ the registers to be
reloaded from the pt_regs by taking the ret_from_exception path, instead
of the normal syscall exit path which stomps on the callee-saved GPRs.
It appears to pass an LTP test run on ppc64, and passes basic testing on
ppc32 too. Brief tests of ptrace functionality with strace and gdb also
appear OK. I wouldn't send it to Linus for 2.6.15 just yet though :)
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
2005-11-15 21:52:18 +03:00
set_thread_flag ( TIF_RESTOREALL ) ;
return 0 ;
2005-04-17 02:20:36 +04:00
badframe :
2011-06-04 09:36:54 +04:00
if ( show_unhandled_signals )
printk_ratelimited ( regs - > msr & MSR_64BIT ? fmt64 : fmt32 ,
current - > comm , current - > pid , " rt_sigreturn " ,
( long ) uc , regs - > nip , regs - > link ) ;
2007-10-12 04:20:07 +04:00
2019-05-23 18:17:27 +03:00
force_sig ( SIGSEGV ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2016-09-23 09:18:12 +03:00
/**
 * handle_rt_signal64() - build an RT signal frame and point @tsk at the handler
 * @ksig: signal being delivered; supplies the siginfo, the signal number and
 *        the sigaction (handler address and sa_flags)
 * @set:  signal mask that is copied into the frame's uc.uc_sigmask
 * @tsk:  task receiving the signal; must be current (enforced by BUG_ON)
 *
 * Writes a struct rt_sigframe (siginfo, ucontext and, when no vDSO trampoline
 * is usable, a sigreturn trampoline) to the user stack, allocates a dummy
 * caller frame below it, and then rewrites @tsk's pt_regs so that returning
 * to userspace starts at the sigreturn trampoline with the handler address
 * in CTR and the handler arguments in r3..r6.
 *
 * Note that some registers in pt_regs (e.g. NIP) may already have been
 * updated when a later user access fails and the function bails out.
 *
 * Return: 0 on success, 1 if the frame could not be set up.
 */
int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
		struct task_struct *tsk)
{
	struct rt_sigframe __user *frame;
	unsigned long newsp = 0;
	long err = 0;
	struct pt_regs *regs = tsk->thread.regs;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	/* Save the thread's msr before get_tm_stackpointer() changes it */
	unsigned long msr = regs->msr;
#endif

	BUG_ON(tsk != current);

	frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0);
	if (unlikely(frame == NULL))
		goto badframe;

	/*
	 * Store the siginfo/ucontext pointers in the frame itself and copy
	 * the siginfo out to userspace.
	 */
	err |= __put_user(&frame->info, &frame->pinfo);
	err |= __put_user(&frame->uc, &frame->puc);
	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
	if (err)
		goto badframe;

	/* Create the ucontext. */
	err |= __put_user(0, &frame->uc.uc_flags);
	err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]);
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	if (MSR_TM_ACTIVE(msr)) {
		/*
		 * The ucontext_t passed to userland points to the second
		 * ucontext_t (for transactional state) with its uc_link ptr.
		 */
		err |= __put_user(&frame->uc_transact, &frame->uc.uc_link);
		err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext,
					    &frame->uc_transact.uc_mcontext,
					    tsk, ksig->sig, NULL,
					    (unsigned long)ksig->ka.sa.sa_handler,
					    msr);
	} else
#endif
	{
		err |= __put_user(0, &frame->uc.uc_link);
		err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
					NULL, (unsigned long)ksig->ka.sa.sa_handler,
					1);
	}
	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
	if (err)
		goto badframe;

	/* Make sure signal handler doesn't get spurious FP exceptions */
	tsk->thread.fp_state.fpscr = 0;

	/*
	 * Set up to return from userspace.  Execution resumes at the
	 * sigreturn trampoline: the vDSO one when available, otherwise one
	 * written into the frame on the user stack.
	 */
	if (vdso64_rt_sigtramp && tsk->mm->context.vdso_base) {
		regs->nip = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
	} else {
		err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
		if (err)
			goto badframe;
		regs->nip = (unsigned long)&frame->tramp[0];
	}

	/* Allocate a dummy caller frame for the signal handler. */
	newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
	err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);

	/* Set up "regs" so we "return" to the signal handler. */
	if (is_elf2_task()) {
		/* sa_handler is the entry address; r12 mirrors CTR. */
		regs->ctr = (unsigned long)ksig->ka.sa.sa_handler;
		regs->gpr[12] = regs->ctr;
	} else {
		/*
		 * Handler is *really* a pointer to the function descriptor for
		 * the signal routine.  The first entry in the function
		 * descriptor is the entry address of signal and the second
		 * entry is the TOC value we need to use.
		 */
		func_descr_t __user *funct_desc_ptr =
			(func_descr_t __user *)ksig->ka.sa.sa_handler;

		err |= get_user(regs->ctr, &funct_desc_ptr->entry);
		err |= get_user(regs->gpr[2], &funct_desc_ptr->toc);
	}

	/* enter the signal handler in native-endian mode */
	regs->msr &= ~MSR_LE;
	regs->msr |= (MSR_KERNEL & MSR_LE);
	regs->gpr[1] = newsp;
	regs->gpr[3] = ksig->sig;
	regs->result = 0;
	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
		/*
		 * frame->pinfo and frame->puc were filled in above with
		 * exactly these addresses, so use them directly instead of
		 * reading them back from user memory: it saves two user
		 * accesses and cannot observe a concurrent modification of
		 * the user stack.
		 */
		regs->gpr[4] = (unsigned long)&frame->info;
		regs->gpr[5] = (unsigned long)&frame->uc;
		regs->gpr[6] = (unsigned long)frame;
	} else {
		regs->gpr[4] = (unsigned long)&frame->uc.uc_mcontext;
	}
	if (err)
		goto badframe;

	return 0;

badframe:
	if (show_unhandled_signals)
		printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
				   tsk->comm, tsk->pid, "setup_rt_frame",
				   (long)frame, regs->nip, regs->link);

	return 1;
}