2019-05-27 09:55:01 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2005-09-26 10:04:21 +04:00
/*
* Derived from " arch/i386/kernel/process.c "
* Copyright ( C ) 1995 Linus Torvalds
*
* Updated and modified by Cort Dougan ( cort @ cs . nmt . edu ) and
* Paul Mackerras ( paulus @ cs . anu . edu . au )
*
* PowerPC version
* Copyright ( C ) 1995 - 1996 Gary Thomas ( gdt @ linuxppc . org )
*/
# include <linux/errno.h>
# include <linux/sched.h>
2017-02-08 20:51:35 +03:00
# include <linux/sched/debug.h>
2017-02-08 20:51:36 +03:00
# include <linux/sched/task.h>
2017-02-08 20:51:37 +03:00
# include <linux/sched/task_stack.h>
2005-09-26 10:04:21 +04:00
# include <linux/kernel.h>
# include <linux/mm.h>
# include <linux/smp.h>
# include <linux/stddef.h>
# include <linux/unistd.h>
# include <linux/ptrace.h>
# include <linux/slab.h>
# include <linux/user.h>
# include <linux/elf.h>
# include <linux/prctl.h>
# include <linux/init_task.h>
2011-07-23 02:24:23 +04:00
# include <linux/export.h>
2005-09-26 10:04:21 +04:00
# include <linux/kallsyms.h>
# include <linux/mqueue.h>
# include <linux/hardirq.h>
2005-10-10 16:29:05 +04:00
# include <linux/utsname.h>
2009-02-10 08:10:27 +03:00
# include <linux/ftrace.h>
2008-12-31 17:11:38 +03:00
# include <linux/kernel_stat.h>
2009-02-22 04:50:03 +03:00
# include <linux/personality.h>
2010-06-15 10:05:19 +04:00
# include <linux/hw_breakpoint.h>
2014-10-13 13:27:15 +04:00
# include <linux/uaccess.h>
2018-01-19 04:50:31 +03:00
# include <linux/pkeys.h>
2018-10-06 19:51:14 +03:00
# include <linux/seq_buf.h>
2005-09-26 10:04:21 +04:00
2021-01-30 16:08:38 +03:00
# include <asm/interrupt.h>
2005-09-26 10:04:21 +04:00
# include <asm/io.h>
# include <asm/processor.h>
# include <asm/mmu.h>
2005-11-07 05:12:03 +03:00
# include <asm/machdep.h>
powerpc: Implement accurate task and CPU time accounting
This implements accurate task and cpu time accounting for 64-bit
powerpc kernels. Instead of accounting a whole jiffy of time to a
task on a timer interrupt because that task happened to be running at
the time, we now account time in units of timebase ticks according to
the actual time spent by the task in user mode and kernel mode. We
also count the time spent processing hardware and software interrupts
accurately. This is conditional on CONFIG_VIRT_CPU_ACCOUNTING. If
that is not set, we do tick-based approximate accounting as before.
To get this accurate information, we read either the PURR (processor
utilization of resources register) on POWER5 machines, or the timebase
on other machines on
* each entry to the kernel from usermode
* each exit to usermode
* transitions between process context, hard irq context and soft irq
context in kernel mode
* context switches.
On POWER5 systems with shared-processor logical partitioning we also
read both the PURR and the timebase at each timer interrupt and
context switch in order to determine how much time has been taken by
the hypervisor to run other partitions ("steal" time). Unfortunately,
since we need values of the PURR on both threads at the same time to
accurately calculate the steal time, and since we can only calculate
steal time on a per-core basis, the apportioning of the steal time
between idle time (time which we ceded to the hypervisor in the idle
loop) and actual stolen time is somewhat approximate at the moment.
This is all based quite heavily on what s390 does, and it uses the
generic interfaces that were added by the s390 developers,
i.e. account_system_time(), account_user_time(), etc.
This patch doesn't add any new interfaces between the kernel and
userspace, and doesn't change the units in which time is reported to
userspace by things such as /proc/stat, /proc/<pid>/stat, getrusage(),
times(), etc. Internally the various task and cpu times are stored in
timebase units, but they are converted to USER_HZ units (1/100th of a
second) when reported to userspace. Some precision is therefore lost
but there should not be any accumulating error, since the internal
accumulation is at full precision.
Signed-off-by: Paul Mackerras <paulus@samba.org>
2006-02-24 02:06:59 +03:00
# include <asm/time.h>
2012-03-28 21:30:02 +04:00
# include <asm/runlatch.h>
2006-03-23 02:00:08 +03:00
# include <asm/syscalls.h>
2012-03-28 21:30:02 +04:00
# include <asm/switch_to.h>
2013-02-13 20:21:37 +04:00
# include <asm/tm.h>
2012-03-28 21:30:02 +04:00
# include <asm/debug.h>
2005-10-10 16:29:05 +04:00
# ifdef CONFIG_PPC64
# include <asm/firmware.h>
2017-12-20 06:55:42 +03:00
# include <asm/hw_irq.h>
2005-10-10 16:29:05 +04:00
# endif
2014-02-04 09:08:51 +04:00
# include <asm/code-patching.h>
2016-01-06 03:45:51 +03:00
# include <asm/exec.h>
2016-03-24 14:04:04 +03:00
# include <asm/livepatch.h>
2016-07-23 12:12:40 +03:00
# include <asm/cpu_has_feature.h>
2016-09-06 08:32:43 +03:00
# include <asm/asm-prototypes.h>
2018-10-09 08:46:25 +03:00
# include <asm/stacktrace.h>
2019-04-01 09:03:12 +03:00
# include <asm/hw_breakpoint.h>
2016-03-24 14:04:04 +03:00
2008-07-23 20:10:41 +04:00
# include <linux/kprobes.h>
# include <linux/kdebug.h>
2005-09-26 10:04:21 +04:00
2013-02-13 20:21:32 +04:00
/*
 * Transactional Memory debug.
 *
 * TM_DEBUG() takes printk-style arguments. When TM_DEBUG_SW is defined it
 * expands to a printk() at KERN_INFO level; otherwise it expands to an
 * empty do/while statement and its arguments are discarded.
 */
# ifdef TM_DEBUG_SW
# define TM_DEBUG(x...) printk(KERN_INFO x)
# else
# define TM_DEBUG(x...) do { } while(0)
# endif
2005-09-26 10:04:21 +04:00
extern unsigned long _get_SP ( void ) ;
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2017-10-12 13:17:18 +03:00
/*
* Are we running in " Suspend disabled " mode ? If so we have to block any
* sigreturn that would get us into suspended state , and we also warn in some
* other paths that we should never reach with suspend disabled .
*/
bool tm_suspend_disabled __ro_after_init = false ;
2015-10-29 03:43:58 +03:00
static void check_if_tm_restore_required ( struct task_struct * tsk )
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
{
/*
* If we are saving the current thread ' s registers , and the
* thread is in a transactional state , set the TIF_RESTORE_TM
* bit so that we know to restore the registers before
* returning to userspace .
*/
if ( tsk = = current & & tsk - > thread . regs & &
MSR_TM_ACTIVE ( tsk - > thread . regs - > msr ) & &
! test_thread_flag ( TIF_RESTORE_TM ) ) {
2021-06-17 18:51:03 +03:00
regs_set_return_msr ( & tsk - > thread . ckpt_regs ,
tsk - > thread . regs - > msr ) ;
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
set_thread_flag ( TIF_RESTORE_TM ) ;
}
}
2016-09-23 09:18:08 +03:00
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
# else
2015-10-29 03:43:58 +03:00
/*
 * Empty variant of check_if_tm_restore_required(): takes the task pointer
 * and does nothing.
 * NOTE(review): this appears to be the variant used when
 * CONFIG_PPC_TRANSACTIONAL_MEM is not set, presumably so callers need no
 * #ifdef of their own - confirm against the surrounding preprocessor
 * conditionals.
 */
static inline void check_if_tm_restore_required ( struct task_struct * tsk ) { }
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
# endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
2015-10-29 03:44:06 +03:00
/*
 * Flag set to true by enable_strict_msr_control(); as a file-scope object
 * without an initialiser it starts out false. Exported via EXPORT_SYMBOL.
 * NOTE(review): the code that consumes this flag is not visible in this
 * part of the file - check its users for the exact effect.
 */
bool strict_msr_control ;
EXPORT_SYMBOL ( strict_msr_control ) ;
/*
 * Early-parameter handler for "ppc_strict_facility_enable".
 *
 * Sets strict_msr_control and logs that strict facility control is being
 * enabled. The parameter value in @str is not inspected. Always returns 0.
 */
static int __init enable_strict_msr_control ( char * str )
{
strict_msr_control = true ;
pr_info ( " Enabling strict facility control \n " ) ;
return 0 ;
}
/* Register the handler above with the early command-line parser. */
early_param ( " ppc_strict_facility_enable " , enable_strict_msr_control ) ;
powerpc/64: Don't trace code that runs with the soft irq mask unreconciled
"Reconciling", in terms of interrupt handling, is to bring the soft irq
mask state into sync with the hardware, after an interrupt causes
MSR[EE] to be cleared (while the soft mask may be enabled, and hard
irqs not marked disabled).
General kernel code should not be called while unreconciled, because
local_irq_disable, etc. manipulations can cause surprising irq traces,
and it's fragile because the soft irq code does not really expect to
be called in this situation.
When exiting from an interrupt, MSR[EE] is cleared to prevent races,
but soft irq state is enabled for the returned-to context, so this is
now an unreconciled state. restore_math is called in this state, and
that can be ftraced, and the ftrace subsystem disables local irqs.
Mark restore_math and its callees as notrace. Restore a sanity check
in the soft irq code that had to be disabled for this case, by commit
4da1f79227ad4 ("powerpc/64: Disable irq restore warning for now").
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2019-05-02 08:21:07 +03:00
/*
 * msr_check_and_set - turn on the requested bits in the current MSR.
 * @bits: MSR bits to set.
 *
 * Reads the MSR with mfmsr() and ORs in @bits. If the CPU has CPU_FTR_VSX
 * and @bits includes MSR_FP, MSR_VSX is set as well. The MSR is written
 * back, through mtmsr_isync_irqsafe(), only when the value would change.
 *
 * Returns the computed MSR value when no write was needed, otherwise
 * whatever mtmsr_isync_irqsafe() returned.
 * NOTE(review): presumably that is the value actually written - confirm
 * against the helper's definition.
 *
 * notrace because it's called by restore_math.
 */
unsigned long notrace msr_check_and_set ( unsigned long bits )
2015-10-29 03:44:01 +03:00
{
2015-10-29 03:44:04 +03:00
unsigned long oldmsr = mfmsr ( ) ;
unsigned long newmsr ;
2015-10-29 03:44:01 +03:00
2015-10-29 03:44:04 +03:00
newmsr = oldmsr | bits ;
2015-10-29 03:44:01 +03:00
2015-10-29 03:44:04 +03:00
/* Requesting FP on a VSX-capable CPU also turns on VSX. */
if ( cpu_has_feature ( CPU_FTR_VSX ) & & ( bits & MSR_FP ) )
2015-10-29 03:44:01 +03:00
newmsr | = MSR_VSX ;
2015-10-29 03:44:04 +03:00
2015-10-29 03:44:01 +03:00
/* Skip the MSR write when every requested bit is already set. */
if ( oldmsr ! = newmsr )
2022-10-04 08:11:57 +03:00
newmsr = mtmsr_isync_irqsafe ( newmsr ) ;
2016-09-23 09:18:10 +03:00
return newmsr ;
2015-10-29 03:44:04 +03:00
}
2018-05-23 10:01:44 +03:00
EXPORT_SYMBOL_GPL ( msr_check_and_set ) ;
2015-10-29 03:44:01 +03:00
powerpc/64: Don't trace code that runs with the soft irq mask unreconciled
"Reconciling", in terms of interrupt handling, is to bring the soft irq
mask state into sync with the hardware, after an interrupt causes
MSR[EE] to be cleared (while the soft mask may be enabled, and hard
irqs not marked disabled).
General kernel code should not be called while unreconciled, because
local_irq_disable, etc. manipulations can cause surprising irq traces,
and it's fragile because the soft irq code does not really expect to
be called in this situation.
When exiting from an interrupt, MSR[EE] is cleared to prevent races,
but soft irq state is enabled for the returned-to context, so this is
now an unreconciled state. restore_math is called in this state, and
that can be ftraced, and the ftrace subsystem disables local irqs.
Mark restore_math and its callees as notrace. Restore a sanity check
in the soft irq code that had to be disabled for this case, by commit
4da1f79227ad4 ("powerpc/64: Disable irq restore warning for now").
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2019-05-02 08:21:07 +03:00
/*
 * __msr_check_and_clear - turn off the requested bits in the current MSR.
 * @bits: MSR bits to clear.
 *
 * Reads the MSR with mfmsr() and masks out @bits. If the CPU has
 * CPU_FTR_VSX and @bits includes MSR_FP, MSR_VSX is cleared as well. The
 * MSR is written back, through mtmsr_isync_irqsafe(), only when the value
 * would change. Nothing is returned.
 *
 * notrace because it's called by restore_math.
 */
void notrace __msr_check_and_clear ( unsigned long bits )
2015-10-29 03:44:04 +03:00
{
unsigned long oldmsr = mfmsr ( ) ;
unsigned long newmsr ;
newmsr = oldmsr & ~ bits ;
/* Dropping FP on a VSX-capable CPU also drops VSX. */
if ( cpu_has_feature ( CPU_FTR_VSX ) & & ( bits & MSR_FP ) )
newmsr & = ~ MSR_VSX ;
/* Skip the MSR write when none of the bits were set. */
if ( oldmsr ! = newmsr )
2022-10-04 08:11:57 +03:00
mtmsr_isync_irqsafe ( newmsr ) ;
2015-10-29 03:44:04 +03:00
}
2015-10-29 03:44:06 +03:00
EXPORT_SYMBOL ( __msr_check_and_clear ) ;
2015-10-29 03:44:04 +03:00
# ifdef CONFIG_PPC_FPU
2018-02-25 20:22:23 +03:00
static void __giveup_fpu ( struct task_struct * tsk )
2016-02-29 09:53:49 +03:00
{
powerpc: Avoid load hit store in __giveup_fpu() and __giveup_altivec()
In both __giveup_fpu() and __giveup_altivec() we make two modifications
to tsk->thread.regs->msr. gcc decides to do a read/modify/write of
each change, so we end up with a load hit store:
ld r9,264(r10)
rldicl r9,r9,50,1
rotldi r9,r9,14
std r9,264(r10)
...
ld r9,264(r10)
rldicl r9,r9,40,1
rotldi r9,r9,24
std r9,264(r10)
Fix this by using a temporary.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-05-29 15:03:50 +03:00
unsigned long msr ;
2016-02-29 09:53:49 +03:00
save_fpu ( tsk ) ;
powerpc: Avoid load hit store in __giveup_fpu() and __giveup_altivec()
In both __giveup_fpu() and __giveup_altivec() we make two modifications
to tsk->thread.regs->msr. gcc decides to do a read/modify/write of
each change, so we end up with a load hit store:
ld r9,264(r10)
rldicl r9,r9,50,1
rotldi r9,r9,14
std r9,264(r10)
...
ld r9,264(r10)
rldicl r9,r9,40,1
rotldi r9,r9,24
std r9,264(r10)
Fix this by using a temporary.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-05-29 15:03:50 +03:00
msr = tsk - > thread . regs - > msr ;
2019-02-08 17:33:19 +03:00
msr & = ~ ( MSR_FP | MSR_FE0 | MSR_FE1 ) ;
2016-02-29 09:53:49 +03:00
if ( cpu_has_feature ( CPU_FTR_VSX ) )
powerpc: Avoid load hit store in __giveup_fpu() and __giveup_altivec()
In both __giveup_fpu() and __giveup_altivec() we make two modifications
to tsk->thread.regs->msr. gcc decides to do a read/modify/write of
each change, so we end up with a load hit store:
ld r9,264(r10)
rldicl r9,r9,50,1
rotldi r9,r9,14
std r9,264(r10)
...
ld r9,264(r10)
rldicl r9,r9,40,1
rotldi r9,r9,24
std r9,264(r10)
Fix this by using a temporary.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-05-29 15:03:50 +03:00
msr & = ~ MSR_VSX ;
2021-06-17 18:51:03 +03:00
regs_set_return_msr ( tsk - > thread . regs , msr ) ;
2016-02-29 09:53:49 +03:00
}
2015-10-29 03:44:04 +03:00
/*
 * Give up @tsk's FP state: run the TM restore check, enable MSR_FP on
 * this CPU for the duration of __giveup_fpu(), then disable it again.
 */
void giveup_fpu(struct task_struct *tsk)
{
	check_if_tm_restore_required(tsk);

	/* FP must be enabled in the MSR while __giveup_fpu() runs. */
	msr_check_and_set(MSR_FP);
	__giveup_fpu(tsk);
	msr_check_and_clear(MSR_FP);
}
EXPORT_SYMBOL(giveup_fpu);
2005-09-26 10:04:21 +04:00
/*
* Make sure the floating - point register state in the
* the thread_struct is up to date for task tsk .
*/
void flush_fp_to_thread ( struct task_struct * tsk )
{
if ( tsk - > thread . regs ) {
/*
* We need to disable preemption here because if we didn ' t ,
* another process could get scheduled after the regs - > msr
* test but before we have finished saving the FP registers
* to the thread_struct . That process could take over the
* FPU , and then when we get scheduled again we would store
* bogus values for the remaining FP registers .
*/
preempt_disable ( ) ;
if ( tsk - > thread . regs - > msr & MSR_FP ) {
/*
* This should only ever be called for current or
* for a stopped child process . Since we save away
2015-10-29 03:43:57 +03:00
* the FP register state on context switch ,
2005-09-26 10:04:21 +04:00
* there is something wrong if a stopped child appears
* to still have its FP state in the CPU registers .
*/
BUG_ON ( tsk ! = current ) ;
2015-10-29 03:43:58 +03:00
giveup_fpu ( tsk ) ;
2005-09-26 10:04:21 +04:00
}
preempt_enable ( ) ;
}
}
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
EXPORT_SYMBOL_GPL ( flush_fp_to_thread ) ;
2005-09-26 10:04:21 +04:00
void enable_kernel_fp ( void )
{
2016-09-23 09:18:11 +03:00
unsigned long cpumsr ;
2005-09-26 10:04:21 +04:00
WARN_ON ( preemptible ( ) ) ;
2016-09-23 09:18:11 +03:00
cpumsr = msr_check_and_set ( MSR_FP ) ;
2015-10-29 03:43:59 +03:00
2015-12-10 12:04:05 +03:00
if ( current - > thread . regs & & ( current - > thread . regs - > msr & MSR_FP ) ) {
check_if_tm_restore_required ( current ) ;
2016-09-23 09:18:11 +03:00
/*
* If a thread has already been reclaimed then the
* checkpointed registers are on the CPU but have definitely
* been saved by the reclaim code . Don ' t need to and * cannot *
* giveup as this would save to the ' live ' structure not the
* checkpointed structure .
*/
2018-08-16 20:21:07 +03:00
if ( ! MSR_TM_ACTIVE ( cpumsr ) & &
MSR_TM_ACTIVE ( current - > thread . regs - > msr ) )
2016-09-23 09:18:11 +03:00
return ;
2015-10-29 03:44:04 +03:00
__giveup_fpu ( current ) ;
2015-12-10 12:04:05 +03:00
}
2005-09-26 10:04:21 +04:00
}
EXPORT_SYMBOL ( enable_kernel_fp ) ;
2020-08-17 08:47:58 +03:00
# else
/* Built without CONFIG_PPC_FPU: there is no FP state to give up. */
static inline void __giveup_fpu(struct task_struct *tsk)
{
}
2015-10-29 03:44:11 +03:00
# endif /* CONFIG_PPC_FPU */
2005-09-26 10:04:21 +04:00
# ifdef CONFIG_ALTIVEC
2016-02-29 09:53:50 +03:00
static void __giveup_altivec ( struct task_struct * tsk )
{
powerpc: Avoid load hit store in __giveup_fpu() and __giveup_altivec()
In both __giveup_fpu() and __giveup_altivec() we make two modifications
to tsk->thread.regs->msr. gcc decides to do a read/modify/write of
each change, so we end up with a load hit store:
ld r9,264(r10)
rldicl r9,r9,50,1
rotldi r9,r9,14
std r9,264(r10)
...
ld r9,264(r10)
rldicl r9,r9,40,1
rotldi r9,r9,24
std r9,264(r10)
Fix this by using a temporary.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-05-29 15:03:50 +03:00
unsigned long msr ;
2016-02-29 09:53:50 +03:00
save_altivec ( tsk ) ;
powerpc: Avoid load hit store in __giveup_fpu() and __giveup_altivec()
In both __giveup_fpu() and __giveup_altivec() we make two modifications
to tsk->thread.regs->msr. gcc decides to do a read/modify/write of
each change, so we end up with a load hit store:
ld r9,264(r10)
rldicl r9,r9,50,1
rotldi r9,r9,14
std r9,264(r10)
...
ld r9,264(r10)
rldicl r9,r9,40,1
rotldi r9,r9,24
std r9,264(r10)
Fix this by using a temporary.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-05-29 15:03:50 +03:00
msr = tsk - > thread . regs - > msr ;
msr & = ~ MSR_VEC ;
2016-02-29 09:53:50 +03:00
if ( cpu_has_feature ( CPU_FTR_VSX ) )
powerpc: Avoid load hit store in __giveup_fpu() and __giveup_altivec()
In both __giveup_fpu() and __giveup_altivec() we make two modifications
to tsk->thread.regs->msr. gcc decides to do a read/modify/write of
each change, so we end up with a load hit store:
ld r9,264(r10)
rldicl r9,r9,50,1
rotldi r9,r9,14
std r9,264(r10)
...
ld r9,264(r10)
rldicl r9,r9,40,1
rotldi r9,r9,24
std r9,264(r10)
Fix this by using a temporary.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-05-29 15:03:50 +03:00
msr & = ~ MSR_VSX ;
2021-06-17 18:51:03 +03:00
regs_set_return_msr ( tsk - > thread . regs , msr ) ;
2016-02-29 09:53:50 +03:00
}
2015-10-29 03:44:01 +03:00
void giveup_altivec ( struct task_struct * tsk )
{
check_if_tm_restore_required ( tsk ) ;
2015-10-29 03:44:04 +03:00
msr_check_and_set ( MSR_VEC ) ;
2015-10-29 03:44:01 +03:00
__giveup_altivec ( tsk ) ;
2015-10-29 03:44:04 +03:00
msr_check_and_clear ( MSR_VEC ) ;
2015-10-29 03:44:01 +03:00
}
EXPORT_SYMBOL ( giveup_altivec ) ;
2005-09-26 10:04:21 +04:00
void enable_kernel_altivec ( void )
{
2016-09-23 09:18:11 +03:00
unsigned long cpumsr ;
2005-09-26 10:04:21 +04:00
WARN_ON ( preemptible ( ) ) ;
2016-09-23 09:18:11 +03:00
cpumsr = msr_check_and_set ( MSR_VEC ) ;
2015-10-29 03:43:59 +03:00
2015-12-10 12:04:05 +03:00
if ( current - > thread . regs & & ( current - > thread . regs - > msr & MSR_VEC ) ) {
check_if_tm_restore_required ( current ) ;
2016-09-23 09:18:11 +03:00
/*
* If a thread has already been reclaimed then the
* checkpointed registers are on the CPU but have definitely
* been saved by the reclaim code . Don ' t need to and * cannot *
* giveup as this would save to the ' live ' structure not the
* checkpointed structure .
*/
2018-08-16 20:21:07 +03:00
if ( ! MSR_TM_ACTIVE ( cpumsr ) & &
MSR_TM_ACTIVE ( current - > thread . regs - > msr ) )
2016-09-23 09:18:11 +03:00
return ;
2015-10-29 03:44:04 +03:00
__giveup_altivec ( current ) ;
2015-12-10 12:04:05 +03:00
}
2005-09-26 10:04:21 +04:00
}
EXPORT_SYMBOL ( enable_kernel_altivec ) ;
/*
* Make sure the VMX / Altivec register state in the
* the thread_struct is up to date for task tsk .
*/
void flush_altivec_to_thread ( struct task_struct * tsk )
{
if ( tsk - > thread . regs ) {
preempt_disable ( ) ;
if ( tsk - > thread . regs - > msr & MSR_VEC ) {
BUG_ON ( tsk ! = current ) ;
2015-10-29 03:43:58 +03:00
giveup_altivec ( tsk ) ;
2005-09-26 10:04:21 +04:00
}
preempt_enable ( ) ;
}
}
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
EXPORT_SYMBOL_GPL ( flush_altivec_to_thread ) ;
2005-09-26 10:04:21 +04:00
# endif /* CONFIG_ALTIVEC */
2008-06-25 08:07:18 +04:00
# ifdef CONFIG_VSX
2016-02-29 09:53:51 +03:00
static void __giveup_vsx ( struct task_struct * tsk )
2015-10-29 03:44:02 +03:00
{
2017-08-16 09:01:17 +03:00
unsigned long msr = tsk - > thread . regs - > msr ;
/*
2022-04-30 21:56:54 +03:00
* We should never be setting MSR_VSX without also setting
2017-08-16 09:01:17 +03:00
* MSR_FP and MSR_VEC
*/
WARN_ON ( ( msr & MSR_VSX ) & & ! ( ( msr & MSR_FP ) & & ( msr & MSR_VEC ) ) ) ;
/* __giveup_fpu will clear MSR_VSX */
if ( msr & MSR_FP )
2015-10-29 03:44:02 +03:00
__giveup_fpu ( tsk ) ;
2017-08-16 09:01:17 +03:00
if ( msr & MSR_VEC )
2015-10-29 03:44:02 +03:00
__giveup_altivec ( tsk ) ;
2016-02-29 09:53:51 +03:00
}
/*
 * Give up @tsk's FP/Altivec/VSX state: run the TM restore check, enable
 * MSR_FP, MSR_VEC and MSR_VSX on this CPU for the duration of
 * __giveup_vsx(), then disable them again.
 */
static void giveup_vsx(struct task_struct *tsk)
{
	check_if_tm_restore_required(tsk);

	msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
	__giveup_vsx(tsk);
	msr_check_and_clear(MSR_FP | MSR_VEC | MSR_VSX);
}
2016-02-29 09:53:51 +03:00
2008-06-25 08:07:18 +04:00
void enable_kernel_vsx ( void )
{
2016-09-23 09:18:11 +03:00
unsigned long cpumsr ;
2008-06-25 08:07:18 +04:00
WARN_ON ( preemptible ( ) ) ;
2016-09-23 09:18:11 +03:00
cpumsr = msr_check_and_set ( MSR_FP | MSR_VEC | MSR_VSX ) ;
2015-10-29 03:43:59 +03:00
2017-08-16 09:01:14 +03:00
if ( current - > thread . regs & &
( current - > thread . regs - > msr & ( MSR_VSX | MSR_VEC | MSR_FP ) ) ) {
2015-12-10 12:04:05 +03:00
check_if_tm_restore_required ( current ) ;
2016-09-23 09:18:11 +03:00
/*
* If a thread has already been reclaimed then the
* checkpointed registers are on the CPU but have definitely
* been saved by the reclaim code . Don ' t need to and * cannot *
* giveup as this would save to the ' live ' structure not the
* checkpointed structure .
*/
2018-08-16 20:21:07 +03:00
if ( ! MSR_TM_ACTIVE ( cpumsr ) & &
MSR_TM_ACTIVE ( current - > thread . regs - > msr ) )
2016-09-23 09:18:11 +03:00
return ;
2015-10-29 03:44:04 +03:00
__giveup_vsx ( current ) ;
2015-10-29 03:43:59 +03:00
}
2008-06-25 08:07:18 +04:00
}
EXPORT_SYMBOL ( enable_kernel_vsx ) ;
void flush_vsx_to_thread ( struct task_struct * tsk )
{
if ( tsk - > thread . regs ) {
preempt_disable ( ) ;
2017-08-16 09:01:14 +03:00
if ( tsk - > thread . regs - > msr & ( MSR_VSX | MSR_VEC | MSR_FP ) ) {
2008-06-25 08:07:18 +04:00
BUG_ON ( tsk ! = current ) ;
giveup_vsx ( tsk ) ;
}
preempt_enable ( ) ;
}
}
KVM: PPC: Add support for Book3S processors in hypervisor mode
This adds support for KVM running on 64-bit Book 3S processors,
specifically POWER7, in hypervisor mode. Using hypervisor mode means
that the guest can use the processor's supervisor mode. That means
that the guest can execute privileged instructions and access privileged
registers itself without trapping to the host. This gives excellent
performance, but does mean that KVM cannot emulate a processor
architecture other than the one that the hardware implements.
This code assumes that the guest is running paravirtualized using the
PAPR (Power Architecture Platform Requirements) interface, which is the
interface that IBM's PowerVM hypervisor uses. That means that existing
Linux distributions that run on IBM pSeries machines will also run
under KVM without modification. In order to communicate the PAPR
hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code
to include/linux/kvm.h.
Currently the choice between book3s_hv support and book3s_pr support
(i.e. the existing code, which runs the guest in user mode) has to be
made at kernel configuration time, so a given kernel binary can only
do one or the other.
This new book3s_hv code doesn't support MMIO emulation at present.
Since we are running paravirtualized guests, this isn't a serious
restriction.
With the guest running in supervisor mode, most exceptions go straight
to the guest. We will never get data or instruction storage or segment
interrupts, alignment interrupts, decrementer interrupts, program
interrupts, single-step interrupts, etc., coming to the hypervisor from
the guest. Therefore this introduces a new KVMTEST_NONHV macro for the
exception entry path so that we don't have to do the KVM test on entry
to those exception handlers.
We do however get hypervisor decrementer, hypervisor data storage,
hypervisor instruction storage, and hypervisor emulation assist
interrupts, so we have to handle those.
In hypervisor mode, real-mode accesses can access all of RAM, not just
a limited amount. Therefore we put all the guest state in the vcpu.arch
and use the shadow_vcpu in the PACA only for temporary scratch space.
We allocate the vcpu with kzalloc rather than vzalloc, and we don't use
anything in the kvmppc_vcpu_book3s struct, so we don't allocate it.
We don't have a shared page with the guest, but we still need a
kvm_vcpu_arch_shared struct to store the values of various registers,
so we include one in the vcpu_arch struct.
The POWER7 processor has a restriction that all threads in a core have
to be in the same partition. MMU-on kernel code counts as a partition
(partition 0), so we have to do a partition switch on every entry to and
exit from the guest. At present we require the host and guest to run
in single-thread mode because of this hardware restriction.
This code allocates a hashed page table for the guest and initializes
it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We
require that the guest memory is allocated using 16MB huge pages, in
order to simplify the low-level memory management. This also means that
we can get away without tracking paging activity in the host for now,
since huge pages can't be paged or swapped.
This also adds a few new exports needed by the book3s_hv code.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
2011-06-29 04:21:34 +04:00
EXPORT_SYMBOL_GPL ( flush_vsx_to_thread ) ;
2008-06-25 08:07:18 +04:00
# endif /* CONFIG_VSX */
2005-09-26 10:04:21 +04:00
# ifdef CONFIG_SPE
2015-10-29 03:44:01 +03:00
void giveup_spe ( struct task_struct * tsk )
{
check_if_tm_restore_required ( tsk ) ;
2015-10-29 03:44:04 +03:00
msr_check_and_set ( MSR_SPE ) ;
2015-10-29 03:44:01 +03:00
__giveup_spe ( tsk ) ;
2015-10-29 03:44:04 +03:00
msr_check_and_clear ( MSR_SPE ) ;
2015-10-29 03:44:01 +03:00
}
EXPORT_SYMBOL ( giveup_spe ) ;
2005-09-26 10:04:21 +04:00
void enable_kernel_spe ( void )
{
WARN_ON ( preemptible ( ) ) ;
2015-10-29 03:44:04 +03:00
msr_check_and_set ( MSR_SPE ) ;
2015-10-29 03:43:59 +03:00
2015-12-10 12:04:05 +03:00
if ( current - > thread . regs & & ( current - > thread . regs - > msr & MSR_SPE ) ) {
check_if_tm_restore_required ( current ) ;
2015-10-29 03:44:04 +03:00
__giveup_spe ( current ) ;
2015-12-10 12:04:05 +03:00
}
2005-09-26 10:04:21 +04:00
}
EXPORT_SYMBOL ( enable_kernel_spe ) ;
void flush_spe_to_thread ( struct task_struct * tsk )
{
if ( tsk - > thread . regs ) {
preempt_disable ( ) ;
if ( tsk - > thread . regs - > msr & MSR_SPE ) {
BUG_ON ( tsk ! = current ) ;
2011-06-15 03:34:25 +04:00
tsk - > thread . spefscr = mfspr ( SPRN_SPEFSCR ) ;
2007-08-29 06:15:53 +04:00
giveup_spe ( tsk ) ;
2005-09-26 10:04:21 +04:00
}
preempt_enable ( ) ;
}
}
# endif /* CONFIG_SPE */
2015-10-29 03:44:08 +03:00
/*
 * MSR bits for every register facility (FP, VEC, VSX, SPE) that this
 * kernel/CPU combination provides. Filled in by init_msr_all_available().
 */
static unsigned long msr_all_available;

/*
 * Early initcall: work out which facility bits are available. MSR_FP
 * depends on the kernel configuration; the others on CPU feature bits.
 * Always returns 0.
 */
static int __init init_msr_all_available(void)
{
	unsigned long avail = 0;

	if (IS_ENABLED(CONFIG_PPC_FPU))
		avail |= MSR_FP;
	if (cpu_has_feature(CPU_FTR_ALTIVEC))
		avail |= MSR_VEC;
	if (cpu_has_feature(CPU_FTR_VSX))
		avail |= MSR_VSX;
	if (cpu_has_feature(CPU_FTR_SPE))
		avail |= MSR_SPE;

	msr_all_available |= avail;

	return 0;
}
early_initcall(init_msr_all_available);
void giveup_all ( struct task_struct * tsk )
{
unsigned long usermsr ;
if ( ! tsk - > thread . regs )
return ;
2019-09-04 07:55:27 +03:00
check_if_tm_restore_required ( tsk ) ;
2015-10-29 03:44:08 +03:00
usermsr = tsk - > thread . regs - > msr ;
if ( ( usermsr & msr_all_available ) = = 0 )
return ;
msr_check_and_set ( msr_all_available ) ;
2017-08-16 09:01:18 +03:00
WARN_ON ( ( usermsr & MSR_VSX ) & & ! ( ( usermsr & MSR_FP ) & & ( usermsr & MSR_VEC ) ) ) ;
2015-10-29 03:44:08 +03:00
if ( usermsr & MSR_FP )
__giveup_fpu ( tsk ) ;
if ( usermsr & MSR_VEC )
__giveup_altivec ( tsk ) ;
if ( usermsr & MSR_SPE )
__giveup_spe ( tsk ) ;
msr_check_and_clear ( msr_all_available ) ;
}
EXPORT_SYMBOL ( giveup_all ) ;
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplified: rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
# ifdef CONFIG_PPC_BOOK3S_64
# ifdef CONFIG_PPC_FPU
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets,
an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
/*
 * should_restore_fp() - decide whether the FP state of the current task is
 * to be reloaded now.
 *
 * If current->thread.load_fp is non-zero, it is incremented and true is
 * returned. If it is zero, nothing is modified and false is returned.
 *
 * The interleaved commit text describes load_fp as an 8-bit counter that can
 * wrap to zero, after which the register set is no longer restored here
 * until it is next demand faulted. That re-arming is not visible in this
 * function.
 */
static bool should_restore_fp ( void )
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplfied, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
{
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
if ( current - > thread . load_fp ) {
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplfied, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
current - > thread . load_fp + + ;
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
return true ;
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplfied, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
}
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
return false ;
}
/*
 * do_restore_fp() - reload the FP state of the current task.
 *
 * Hands current->thread.fp_state to load_fp_state(). This helper makes no
 * decision of its own.
 * NOTE(review): presumably only called after should_restore_fp() returned
 * true - confirm against the caller, which is outside this view.
 */
static void do_restore_fp ( void )
{
load_fp_state ( & current - > thread . fp_state ) ;
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplfied, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
}
# else
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets,
an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
/* Stub for builds without CONFIG_PPC_FPU: FP state is never restored. */
static bool should_restore_fp(void)
{
	return false;
}
/* Stub for builds without CONFIG_PPC_FPU: intentionally does nothing. */
static void do_restore_fp(void)
{
}
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplified, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
# endif /* CONFIG_PPC_FPU */
# ifdef CONFIG_ALTIVEC
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets,
an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
/*
 * should_restore_altivec() - decide whether the vector (AltiVec) state of
 * the current task is to be reloaded now.
 *
 * Returns true, after incrementing current->thread.load_vec, only when
 * cpu_has_feature(CPU_FTR_ALTIVEC) holds and load_vec is non-zero.
 * Otherwise returns false and leaves load_vec untouched.
 *
 * The interleaved commit text describes load_vec as an 8-bit counter that
 * can wrap to zero, after which the register set is no longer restored here
 * until it is next demand faulted. That re-arming is not visible in this
 * function.
 */
static bool should_restore_altivec ( void )
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplfied, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
{
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
if ( cpu_has_feature ( CPU_FTR_ALTIVEC ) & & ( current - > thread . load_vec ) ) {
current - > thread . load_vec + + ;
return true ;
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplfied, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
}
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
return false ;
}
/*
 * do_restore_altivec() - reload the vector register state of the current
 * task.
 *
 * Hands current->thread.vr_state to load_vr_state() and then sets
 * current->thread.used_vr to 1.
 * NOTE(review): presumably only called after should_restore_altivec()
 * returned true - confirm against the caller, which is outside this view.
 */
static void do_restore_altivec ( void )
{
load_vr_state ( & current - > thread . vr_state ) ;
current - > thread . used_vr = 1 ;
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplfied, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
}
# else
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets,
an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
/* Stub for builds without CONFIG_ALTIVEC: vector state is never restored. */
static bool should_restore_altivec(void)
{
	return false;
}
/* Stub used when CONFIG_ALTIVEC is not set: intentionally does nothing. */
static void do_restore_altivec(void)
{
}
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplified, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
# endif /* CONFIG_ALTIVEC */
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math loads one of the FP/VSX or VEC register sets,
an 8-bit counter (load_fp or load_vec) is incremented. When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
/*
 * Tell restore_math() whether the VSX facility is a candidate for restore.
 *
 * Returns true exactly when cpu_has_feature(CPU_FTR_VSX) is true, and
 * false otherwise. No per-task state is consulted here.
 */
static bool should_restore_vsx ( void )
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplfied, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
{
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
if ( cpu_has_feature ( CPU_FTR_VSX ) )
return true ;
return false ;
}
2020-08-17 08:47:55 +03:00
# ifdef CONFIG_VSX
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math loads one of the FP/VSX or VEC register sets,
an 8-bit counter (load_fp or load_vec) is incremented. When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
/*
 * CONFIG_VSX variant of the VSX restore hook.
 *
 * The only action is to set current->thread.used_vsr to 1; no register
 * contents are loaded by this function itself.
 * NOTE(review): presumably used_vsr marks the task as a VSX user so later
 * save/restore paths handle its VSX state - confirm against the users of
 * thread.used_vsr.
 */
static void do_restore_vsx ( void )
{
current - > thread . used_vsr = 1 ;
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplfied, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
}
# else
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math loads one of the FP/VSX or VEC register sets,
an 8-bit counter (load_fp or load_vec) is incremented. When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
/* Stub used when CONFIG_VSX is not set: intentionally does nothing. */
static void do_restore_vsx(void)
{
}
powerpc/64s: Implement interrupt exit logic in C
Implement the bulk of interrupt return logic in C. The asm return code
must handle a few cases: restoring full GPRs, and emulating stack
store.
The stack store emulation is significantly simplified, rather than
creating a new return frame and switching to that before performing
the store, it uses the PACA to keep a scratch register around to
perform the store.
The asm return code is moved into 64e for now. The new logic has made
allowance for 64e, but I don't have a full environment that works well
to test it, and even booting in emulated qemu is not great for stress
testing. 64e shouldn't be too far off working with this, given a bit
more testing and auditing of the logic.
This is slightly faster on a POWER9 (page fault speed increases about
1.1%), probably due to reduced mtmsrd.
mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE
handling (including the fast_interrupt_return path), to remove
trace_hardirqs_on(), and fixes the interrupt-return part of the
MSR_VSX restore bug caught by tm-unavailable selftest.
mpe: Incorporate fix from Nick:
The return-to-kernel path has to replay any soft-pending interrupts if
it is returning to a context that had interrupts soft-enabled. It has
to do this carefully and avoid plain enabling interrupts if this is an
irq context, which can cause multiple nesting of interrupts on the
stack, and other unexpected issues.
The code which avoided this case got the soft-mask state wrong, and
marked interrupts as enabled before going around again to retry. This
seems to be mostly harmless except when PREEMPT=y, this calls
preempt_schedule_irq with irqs apparently enabled and runs into a BUG
in kernel/sched/core.c
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com
2020-02-25 20:35:37 +03:00
# endif /* CONFIG_VSX */
powerpc/64: Don't trace code that runs with the soft irq mask unreconciled
"Reconciling" in terms of interrupt handling, is to bring the soft irq
mask state into sync with the hardware, after an interrupt causes
MSR[EE] to be cleared (while the soft mask may be enabled, and hard
irqs not marked disabled).
General kernel code should not be called while unreconciled, because
local_irq_disable, etc. manipulations can cause surprising irq traces,
and it's fragile because the soft irq code does not really expect to
be called in this situation.
When exiting from an interrupt, MSR[EE] is cleared to prevent races,
but soft irq state is enabled for the returned-to context, so this is
now an unreconciled state. restore_math is called in this state, and
that can be ftraced, and the ftrace subsystem disables local irqs.
Mark restore_math and its callees as notrace. Restore a sanity check
in the soft irq code that had to be disabled for this case, by commit
4da1f79227ad4 ("powerpc/64: Disable irq restore warning for now").
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2019-05-02 08:21:07 +03:00
/*
* The exception exit path calls restore_math ( ) with interrupts hard disabled
* but the soft irq state not " reconciled " . ftrace code that calls
* local_irq_save / restore causes warnings .
*
* Rather than complicate the exit path , just do not trace restore_math .
* The alternative would be to have the ftrace entry code check for this
* un - reconciled condition , where MSR [ EE ] = 0 and PACA_IRQ_HARD_DIS is
* not set , and temporarily fix it up for the duration of the ftrace call .
*/
void notrace restore_math ( struct pt_regs * regs )
2016-02-29 09:53:47 +03:00
{
unsigned long msr ;
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
unsigned long new_msr = 0 ;
2016-02-29 09:53:47 +03:00
msr = regs - > msr ;
/*
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
* new_msr tracks the facilities that are to be restored . Only reload
* if the bit is not set in the user MSR ( if it is set , the registers
* are live for the user thread ) .
2016-02-29 09:53:47 +03:00
*/
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
if ( ( ! ( msr & MSR_FP ) ) & & should_restore_fp ( ) )
powerpc/64s: Fix crash in load_fp_state() due to fpexc_mode
The recent commit 01eb01877f33 ("powerpc/64s: Fix restore_math
unnecessarily changing MSR") changed some of the handling of floating
point/vector restore.
In particular it caused current->thread.fpexc_mode to be copied into
the current MSR (via msr_check_and_set()), rather than just into
regs->msr (which is moved into MSR on return to userspace).
This can lead to a crash in the kernel if we take a floating point
exception when restoring FPSCR:
Oops: Exception in kernel mode, sig: 8 [#1]
LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in:
CPU: 3 PID: 101213 Comm: ld64.so.2 Not tainted 5.9.0-rc1-00098-g18445bf405cb-dirty #9
NIP: c00000000000fbb4 LR: c00000000001a7ac CTR: c000000000183570
REGS: c0000016b7cfb3b0 TRAP: 0700 Not tainted (5.9.0-rc1-00098-g18445bf405cb-dirty)
MSR: 900000000290b933 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 44002444 XER: 00000000
CFAR: c00000000001a7a8 IRQMASK: 1
GPR00: c00000000001ae40 c0000016b7cfb640 c0000000011b7f00 c000001542a0f740
GPR04: c000001542a0f720 c000001542a0eb00 0000000000000900 c000001542a0eb00
GPR08: 000000000000000a 0000000000002000 9000000000009033 0000000000000000
GPR12: 0000000000004000 c0000017ffffd900 0000000000000001 c000000000df5a58
GPR16: c000000000e19c18 c0000000010e1123 0000000000000001 c000000000e1a638
GPR20: 0000000000000000 c0000000044b1d00 0000000000000000 c000001542a0f2a0
GPR24: 00000016c7fe0000 c000001542a0f720 c000000001c93da0 c000000000fe5f28
GPR28: c000001542a0f720 0000000000800000 c0000016b7cfbe90 0000000002802900
NIP load_fp_state+0x4/0x214
LR restore_math+0x17c/0x1f0
Call Trace:
0xc0000016b7cfb680 (unreliable)
__switch_to+0x330/0x460
__schedule+0x318/0x920
schedule+0x74/0x140
schedule_timeout+0x318/0x3f0
wait_for_completion+0xc8/0x210
call_usermodehelper_exec+0x234/0x280
do_coredump+0xedc/0x13c0
get_signal+0x1d4/0xbe0
do_notify_resume+0x1a0/0x490
interrupt_exit_user_prepare+0x1c4/0x230
interrupt_return+0x14/0x1c0
Instruction dump:
ebe10168 e88101a0 7c8ff120 382101e0 e8010010 7c0803a6 4e800020 790605c4
782905c4 7c0008a8 7c0008a8 c8030200 <fffe058e> 48000088 c8030000 c8230010
Fix it by only loading the fpexc_mode value into regs->msr.
Also add a comment to explain that although VSX is subject to the
value of fpexc_mode, we don't have to handle that separately because
we only allow VSX to be enabled if FP is also enabled.
Fixes: 01eb01877f33 ("powerpc/64s: Fix restore_math unnecessarily changing MSR")
Reported-by: Milton Miller <miltonm@us.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Link: https://lore.kernel.org/r/20200825093424.3967813-1-mpe@ellerman.id.au
2020-08-25 12:34:24 +03:00
new_msr | = MSR_FP ;
2016-02-29 09:53:47 +03:00
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
if ( ( ! ( msr & MSR_VEC ) ) & & should_restore_altivec ( ) )
new_msr | = MSR_VEC ;
2016-02-29 09:53:47 +03:00
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
if ( ( ! ( msr & MSR_VSX ) ) & & should_restore_vsx ( ) ) {
if ( ( ( msr | new_msr ) & ( MSR_FP | MSR_VEC ) ) = = ( MSR_FP | MSR_VEC ) )
new_msr | = MSR_VSX ;
2016-02-29 09:53:47 +03:00
}
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets
is loaded, an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
if ( new_msr ) {
powerpc/64s: Fix crash in load_fp_state() due to fpexc_mode
The recent commit 01eb01877f33 ("powerpc/64s: Fix restore_math
unnecessarily changing MSR") changed some of the handling of floating
point/vector restore.
In particular it caused current->thread.fpexc_mode to be copied into
the current MSR (via msr_check_and_set()), rather than just into
regs->msr (which is moved into MSR on return to userspace).
This can lead to a crash in the kernel if we take a floating point
exception when restoring FPSCR:
Oops: Exception in kernel mode, sig: 8 [#1]
LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in:
CPU: 3 PID: 101213 Comm: ld64.so.2 Not tainted 5.9.0-rc1-00098-g18445bf405cb-dirty #9
NIP: c00000000000fbb4 LR: c00000000001a7ac CTR: c000000000183570
REGS: c0000016b7cfb3b0 TRAP: 0700 Not tainted (5.9.0-rc1-00098-g18445bf405cb-dirty)
MSR: 900000000290b933 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 44002444 XER: 00000000
CFAR: c00000000001a7a8 IRQMASK: 1
GPR00: c00000000001ae40 c0000016b7cfb640 c0000000011b7f00 c000001542a0f740
GPR04: c000001542a0f720 c000001542a0eb00 0000000000000900 c000001542a0eb00
GPR08: 000000000000000a 0000000000002000 9000000000009033 0000000000000000
GPR12: 0000000000004000 c0000017ffffd900 0000000000000001 c000000000df5a58
GPR16: c000000000e19c18 c0000000010e1123 0000000000000001 c000000000e1a638
GPR20: 0000000000000000 c0000000044b1d00 0000000000000000 c000001542a0f2a0
GPR24: 00000016c7fe0000 c000001542a0f720 c000000001c93da0 c000000000fe5f28
GPR28: c000001542a0f720 0000000000800000 c0000016b7cfbe90 0000000002802900
NIP load_fp_state+0x4/0x214
LR restore_math+0x17c/0x1f0
Call Trace:
0xc0000016b7cfb680 (unreliable)
__switch_to+0x330/0x460
__schedule+0x318/0x920
schedule+0x74/0x140
schedule_timeout+0x318/0x3f0
wait_for_completion+0xc8/0x210
call_usermodehelper_exec+0x234/0x280
do_coredump+0xedc/0x13c0
get_signal+0x1d4/0xbe0
do_notify_resume+0x1a0/0x490
interrupt_exit_user_prepare+0x1c4/0x230
interrupt_return+0x14/0x1c0
Instruction dump:
ebe10168 e88101a0 7c8ff120 382101e0 e8010010 7c0803a6 4e800020 790605c4
782905c4 7c0008a8 7c0008a8 c8030200 <fffe058e> 48000088 c8030000 c8230010
Fix it by only loading the fpexc_mode value into regs->msr.
Also add a comment to explain that although VSX is subject to the
value of fpexc_mode, we don't have to handle that separately because
we only allow VSX to be enabled if FP is also enabled.
Fixes: 01eb01877f33 ("powerpc/64s: Fix restore_math unnecessarily changing MSR")
Reported-by: Milton Miller <miltonm@us.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Link: https://lore.kernel.org/r/20200825093424.3967813-1-mpe@ellerman.id.au
2020-08-25 12:34:24 +03:00
unsigned long fpexc_mode = 0 ;
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets,
an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
msr_check_and_set ( new_msr ) ;
powerpc/64s: Fix crash in load_fp_state() due to fpexc_mode
The recent commit 01eb01877f33 ("powerpc/64s: Fix restore_math
unnecessarily changing MSR") changed some of the handling of floating
point/vector restore.
In particular it caused current->thread.fpexc_mode to be copied into
the current MSR (via msr_check_and_set()), rather than just into
regs->msr (which is moved into MSR on return to userspace).
This can lead to a crash in the kernel if we take a floating point
exception when restoring FPSCR:
Oops: Exception in kernel mode, sig: 8 [#1]
LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in:
CPU: 3 PID: 101213 Comm: ld64.so.2 Not tainted 5.9.0-rc1-00098-g18445bf405cb-dirty #9
NIP: c00000000000fbb4 LR: c00000000001a7ac CTR: c000000000183570
REGS: c0000016b7cfb3b0 TRAP: 0700 Not tainted (5.9.0-rc1-00098-g18445bf405cb-dirty)
MSR: 900000000290b933 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 44002444 XER: 00000000
CFAR: c00000000001a7a8 IRQMASK: 1
GPR00: c00000000001ae40 c0000016b7cfb640 c0000000011b7f00 c000001542a0f740
GPR04: c000001542a0f720 c000001542a0eb00 0000000000000900 c000001542a0eb00
GPR08: 000000000000000a 0000000000002000 9000000000009033 0000000000000000
GPR12: 0000000000004000 c0000017ffffd900 0000000000000001 c000000000df5a58
GPR16: c000000000e19c18 c0000000010e1123 0000000000000001 c000000000e1a638
GPR20: 0000000000000000 c0000000044b1d00 0000000000000000 c000001542a0f2a0
GPR24: 00000016c7fe0000 c000001542a0f720 c000000001c93da0 c000000000fe5f28
GPR28: c000001542a0f720 0000000000800000 c0000016b7cfbe90 0000000002802900
NIP load_fp_state+0x4/0x214
LR restore_math+0x17c/0x1f0
Call Trace:
0xc0000016b7cfb680 (unreliable)
__switch_to+0x330/0x460
__schedule+0x318/0x920
schedule+0x74/0x140
schedule_timeout+0x318/0x3f0
wait_for_completion+0xc8/0x210
call_usermodehelper_exec+0x234/0x280
do_coredump+0xedc/0x13c0
get_signal+0x1d4/0xbe0
do_notify_resume+0x1a0/0x490
interrupt_exit_user_prepare+0x1c4/0x230
interrupt_return+0x14/0x1c0
Instruction dump:
ebe10168 e88101a0 7c8ff120 382101e0 e8010010 7c0803a6 4e800020 790605c4
782905c4 7c0008a8 7c0008a8 c8030200 <fffe058e> 48000088 c8030000 c8230010
Fix it by only loading the fpexc_mode value into regs->msr.
Also add a comment to explain that although VSX is subject to the
value of fpexc_mode, we don't have to handle that separately because
we only allow VSX to be enabled if FP is also enabled.
Fixes: 01eb01877f33 ("powerpc/64s: Fix restore_math unnecessarily changing MSR")
Reported-by: Milton Miller <miltonm@us.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Link: https://lore.kernel.org/r/20200825093424.3967813-1-mpe@ellerman.id.au
2020-08-25 12:34:24 +03:00
if ( new_msr & MSR_FP ) {
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets,
an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
do_restore_fp ( ) ;
powerpc/64s: Fix crash in load_fp_state() due to fpexc_mode
The recent commit 01eb01877f33 ("powerpc/64s: Fix restore_math
unnecessarily changing MSR") changed some of the handling of floating
point/vector restore.
In particular it caused current->thread.fpexc_mode to be copied into
the current MSR (via msr_check_and_set()), rather than just into
regs->msr (which is moved into MSR on return to userspace).
This can lead to a crash in the kernel if we take a floating point
exception when restoring FPSCR:
Oops: Exception in kernel mode, sig: 8 [#1]
LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in:
CPU: 3 PID: 101213 Comm: ld64.so.2 Not tainted 5.9.0-rc1-00098-g18445bf405cb-dirty #9
NIP: c00000000000fbb4 LR: c00000000001a7ac CTR: c000000000183570
REGS: c0000016b7cfb3b0 TRAP: 0700 Not tainted (5.9.0-rc1-00098-g18445bf405cb-dirty)
MSR: 900000000290b933 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 44002444 XER: 00000000
CFAR: c00000000001a7a8 IRQMASK: 1
GPR00: c00000000001ae40 c0000016b7cfb640 c0000000011b7f00 c000001542a0f740
GPR04: c000001542a0f720 c000001542a0eb00 0000000000000900 c000001542a0eb00
GPR08: 000000000000000a 0000000000002000 9000000000009033 0000000000000000
GPR12: 0000000000004000 c0000017ffffd900 0000000000000001 c000000000df5a58
GPR16: c000000000e19c18 c0000000010e1123 0000000000000001 c000000000e1a638
GPR20: 0000000000000000 c0000000044b1d00 0000000000000000 c000001542a0f2a0
GPR24: 00000016c7fe0000 c000001542a0f720 c000000001c93da0 c000000000fe5f28
GPR28: c000001542a0f720 0000000000800000 c0000016b7cfbe90 0000000002802900
NIP load_fp_state+0x4/0x214
LR restore_math+0x17c/0x1f0
Call Trace:
0xc0000016b7cfb680 (unreliable)
__switch_to+0x330/0x460
__schedule+0x318/0x920
schedule+0x74/0x140
schedule_timeout+0x318/0x3f0
wait_for_completion+0xc8/0x210
call_usermodehelper_exec+0x234/0x280
do_coredump+0xedc/0x13c0
get_signal+0x1d4/0xbe0
do_notify_resume+0x1a0/0x490
interrupt_exit_user_prepare+0x1c4/0x230
interrupt_return+0x14/0x1c0
Instruction dump:
ebe10168 e88101a0 7c8ff120 382101e0 e8010010 7c0803a6 4e800020 790605c4
782905c4 7c0008a8 7c0008a8 c8030200 <fffe058e> 48000088 c8030000 c8230010
Fix it by only loading the fpexc_mode value into regs->msr.
Also add a comment to explain that although VSX is subject to the
value of fpexc_mode, we don't have to handle that separately because
we only allow VSX to be enabled if FP is also enabled.
Fixes: 01eb01877f33 ("powerpc/64s: Fix restore_math unnecessarily changing MSR")
Reported-by: Milton Miller <miltonm@us.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Link: https://lore.kernel.org/r/20200825093424.3967813-1-mpe@ellerman.id.au
2020-08-25 12:34:24 +03:00
// This also covers VSX, because VSX implies FP
fpexc_mode = current - > thread . fpexc_mode ;
}
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets,
an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
if ( new_msr & MSR_VEC )
do_restore_altivec ( ) ;
2016-02-29 09:53:47 +03:00
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets,
an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
if ( new_msr & MSR_VSX )
do_restore_vsx ( ) ;
msr_check_and_clear ( new_msr ) ;
2021-06-17 18:51:03 +03:00
regs_set_return_msr ( regs , regs - > msr | new_msr | fpexc_mode ) ;
powerpc/64s: Fix restore_math unnecessarily changing MSR
Before returning to user, if there are missing FP/VEC/VSX bits from the
user MSR then those registers had been saved and must be restored again
before use. restore_math will decide whether to restore immediately, or
skip the restore and let fp/vec/vsx unavailable faults demand load the
registers.
Each time restore_math restores one of the FP/VSX or VEC register sets,
an 8-bit counter is incremented (load_fp and load_vec). When
these wrap to zero, restore_math no longer restores that register set
until after they are next demand faulted.
It's quite usual for those counters to have different values, so if one
wraps to zero and restore_math no longer restores its registers or user
MSR bit but the other is not zero yet does not need to be restored
(because the kernel is not frequently using the FPU), then restore_math
will be called and it will also not return in the early exit check.
This causes msr_check_and_set to test and set the MSR at every kernel
exit despite having no work to do.
This can cause workloads (e.g., a NULL syscall microbenchmark) to run
fast for a time while both counters are non-zero, then slow down when
one of the counters reaches zero, then speed up again after the second
counter reaches zero. The cost is significant, about 10% slowdown on a
NULL syscall benchmark, and the jittery behaviour is very undesirable.
Fix this by having restore_math test all conditions first, and only
update MSR if we will be loading registers.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200623234139.2262227-2-npiggin@gmail.com
2020-06-24 02:41:38 +03:00
}
2016-02-29 09:53:47 +03:00
}
2020-08-17 08:46:45 +03:00
# endif /* CONFIG_PPC_BOOK3S_64 */
2016-02-29 09:53:47 +03:00
2018-02-25 20:22:23 +03:00
/*
 * save_all() - save the user FP / Altivec / SPE state selected by tsk's MSR.
 *
 * Returns without doing anything when tsk has no thread.regs, or when the
 * MSR saved in thread.regs has none of the msr_all_available bits set.
 * Otherwise each facility whose bit is set in that MSR is handled in turn:
 * save_fpu() for MSR_FP, save_altivec() for MSR_VEC and __giveup_spe() for
 * MSR_SPE.
 */
static void save_all ( struct task_struct * tsk )
2016-02-29 09:53:48 +03:00
{
unsigned long usermsr ;
if ( ! tsk - > thread . regs )
return ;
usermsr = tsk - > thread . regs - > msr ;
if ( ( usermsr & msr_all_available ) = = 0 )
return ;
/* Paired with the msr_check_and_clear() at the end of the function. */
msr_check_and_set ( msr_all_available ) ;
2017-08-16 09:01:18 +03:00
/*
 * MSR_VSX is only expected together with both MSR_FP and MSR_VEC; warn if
 * it shows up without them. There is no separate VSX save call here, only
 * the FP and VEC saves below.
 */
WARN_ON ( ( usermsr & MSR_VSX ) & & ! ( ( usermsr & MSR_FP ) & & ( usermsr & MSR_VEC ) ) ) ;
if ( usermsr & MSR_FP )
save_fpu ( tsk ) ;
if ( usermsr & MSR_VEC )
save_altivec ( tsk ) ;
2016-02-29 09:53:48 +03:00
if ( usermsr & MSR_SPE )
__giveup_spe ( tsk ) ;
msr_check_and_clear ( msr_all_available ) ;
}
2015-10-29 03:44:09 +03:00
/*
 * flush_all_to_thread() - save the current task's register state via
 * save_all().
 *
 * Does nothing when tsk has no thread.regs. Otherwise tsk must be the
 * current task (enforced with BUG_ON). The work is done with preemption
 * disabled.
 */
void flush_all_to_thread ( struct task_struct * tsk )
{
if ( tsk - > thread . regs ) {
preempt_disable ( ) ;
BUG_ON ( tsk ! = current ) ;
# ifdef CONFIG_SPE
/* Capture SPEFSCR into the thread struct before save_all() runs. */
if ( tsk - > thread . regs - > msr & MSR_SPE )
tsk - > thread . spefscr = mfspr ( SPRN_SPEFSCR ) ;
# endif
2018-10-24 16:57:22 +03:00
save_all ( tsk ) ;
2015-10-29 03:44:09 +03:00
preempt_enable ( ) ;
}
}
EXPORT_SYMBOL ( flush_all_to_thread ) ;
2010-02-08 14:51:18 +03:00
# ifdef CONFIG_PPC_ADV_DEBUG_REGS
/*
 * do_send_trap() - report a hardware breakpoint/watchpoint hit to userspace.
 *
 * @regs:       register state passed on to the die notifier chain
 * @address:    address reported to userspace with the signal
 * @error_code: error value passed on to the die notifier chain
 * @breakpt:    breakpoint or watchpoint id reported with the signal
 *
 * Records TRAP_HWBKPT as the thread's trap number, lets the die notifier
 * chain see the event, and delivers the signal unless a notifier returned
 * NOTIFY_STOP.
 */
void do_send_trap ( struct pt_regs * regs , unsigned long address ,
2018-01-17 01:12:38 +03:00
unsigned long error_code , int breakpt )
2010-02-08 14:51:18 +03:00
{
2018-01-17 01:12:38 +03:00
current - > thread . trap_nr = TRAP_HWBKPT ;
2010-02-08 14:51:18 +03:00
if ( notify_die ( DIE_DABR_MATCH , " dabr_match " , regs , error_code ,
11 , SIGSEGV ) = = NOTIFY_STOP )
return ;
/* Deliver the signal to userspace */
2018-01-22 23:37:25 +03:00
force_sig_ptrace_errno_trap ( breakpt , /* breakpoint or watchpoint id */
( void __user * ) address ) ;
2010-02-08 14:51:18 +03:00
}
# else /* !CONFIG_PPC_ADV_DEBUG_REGS */
2020-09-02 07:29:42 +03:00
/*
 * do_break_handler() - disable the watchpoint(s) of the current thread that
 * triggered the exception described by regs.
 *
 * A watchpoint is disabled by installing an all-zero breakpoint in its slot
 * with __set_breakpoint(), resetting the matching current->thread.hw_brk[]
 * entry, and flagging that entry HW_BRK_FLAG_DISABLED.
 */
static void do_break_handler ( struct pt_regs * regs )
{
struct arch_hw_breakpoint null_brk = { 0 } ;
struct arch_hw_breakpoint * info ;
2021-11-29 20:49:38 +03:00
ppc_inst_t instr = ppc_inst ( 0 ) ;
2020-09-02 07:29:42 +03:00
int type = 0 ;
int size = 0 ;
unsigned long ea ;
int i ;
/*
 * If the underlying hw supports only one watchpoint, we know it
 * caused the exception. 8xx also falls into this category.
 */
if ( nr_wp_slots ( ) = = 1 ) {
__set_breakpoint ( 0 , & null_brk ) ;
current - > thread . hw_brk [ 0 ] = null_brk ;
current - > thread . hw_brk [ 0 ] . flags | = HW_BRK_FLAG_DISABLED ;
return ;
}
2022-04-30 21:56:54 +03:00
/* Otherwise find out which DAWR caused exception and disable it. */
2020-09-02 07:29:42 +03:00
wp_get_instr_detail ( regs , & instr , & type , & size , & ea ) ;
for ( i = 0 ; i < nr_wp_slots ( ) ; i + + ) {
info = & current - > thread . hw_brk [ i ] ;
/* Slots with a zero address are skipped. */
if ( ! info - > address )
continue ;
/* Every slot that passes the constraint check is disabled, not just the first. */
if ( wp_check_constraints ( regs , instr , ea , type , size , info ) ) {
__set_breakpoint ( i , & null_brk ) ;
current - > thread . hw_brk [ i ] = null_brk ;
current - > thread . hw_brk [ i ] . flags | = HW_BRK_FLAG_DISABLED ;
}
}
}
2021-01-30 16:08:38 +03:00
/*
 * do_break - interrupt handler for a data breakpoint match.
 *
 * Records TRAP_HWBKPT as the thread's trap number, then gives the die
 * notifier chain and the debugger hook a chance to consume the event. If
 * neither does, SIGTRAP/TRAP_HWBKPT is sent to the current task with
 * regs->dar as the fault address.
 */
DEFINE_INTERRUPT_HANDLER ( do_break )
2008-07-23 20:10:41 +04:00
{
2012-08-24 01:27:09 +04:00
current - > thread . trap_nr = TRAP_HWBKPT ;
2021-01-30 16:08:18 +03:00
if ( notify_die ( DIE_DABR_MATCH , " dabr_match " , regs , regs - > dsisr ,
2008-07-23 20:10:41 +04:00
11 , SIGSEGV ) = = NOTIFY_STOP )
return ;
2012-12-20 18:06:44 +04:00
if ( debugger_break_match ( regs ) )
2008-07-23 20:10:41 +04:00
return ;
2020-09-02 07:29:42 +03:00
/*
 * We reach here only when the watchpoint exception is generated by a ptrace
 * event (or hw is buggy!). Now if CONFIG_HAVE_HW_BREAKPOINT is set, the
 * watchpoint is already handled by hw_breakpoint_handler() so we don't
 * have to do anything. But when CONFIG_HAVE_HW_BREAKPOINT is not set,
 * we need to manually handle the watchpoint here.
 */
if ( ! IS_ENABLED ( CONFIG_HAVE_HW_BREAKPOINT ) )
do_break_handler ( regs ) ;
2008-07-23 20:10:41 +04:00
/* Deliver the signal to userspace */
2021-01-30 16:08:18 +03:00
force_sig_fault ( SIGTRAP , TRAP_HWBKPT , ( void __user * ) regs - > dar ) ;
2008-07-23 20:10:41 +04:00
}
2010-02-08 14:51:18 +03:00
# endif /* CONFIG_PPC_ADV_DEBUG_REGS */
2008-07-23 20:10:41 +04:00
2020-05-14 14:17:31 +03:00
/*
 * Per-CPU copy of the breakpoint last installed in each watchpoint slot by
 * __set_breakpoint(); sized for HBP_NUM_MAX slots.
 */
static DEFINE_PER_CPU ( struct arch_hw_breakpoint , current_brk [ HBP_NUM_MAX ] ) ;
2008-03-28 11:11:48 +03:00
2010-02-08 14:51:18 +03:00
# ifdef CONFIG_PPC_ADV_DEBUG_REGS
/*
 * Set the debug registers back to their default "safe" values.
 *
 * Only the software copy in thread->debug is updated here; this function
 * does not write any SPR. The iac, dac and (when configured) dvc fields and
 * dbcr0 are zeroed. dbcr1 and dbcr2 get user-only bits on CONFIG_BOOKE;
 * otherwise dbcr1 is zeroed.
 */
static void set_debug_reg_defaults ( struct thread_struct * thread )
{
2013-07-04 10:15:46 +04:00
thread - > debug . iac1 = thread - > debug . iac2 = 0 ;
2010-02-08 14:51:18 +03:00
# if CONFIG_PPC_ADV_DEBUG_IACS > 2
2013-07-04 10:15:46 +04:00
thread - > debug . iac3 = thread - > debug . iac4 = 0 ;
2010-02-08 14:51:18 +03:00
# endif
2013-07-04 10:15:46 +04:00
thread - > debug . dac1 = thread - > debug . dac2 = 0 ;
2010-02-08 14:51:18 +03:00
# if CONFIG_PPC_ADV_DEBUG_DVCS > 0
2013-07-04 10:15:46 +04:00
thread - > debug . dvc1 = thread - > debug . dvc2 = 0 ;
2010-02-08 14:51:18 +03:00
# endif
2013-07-04 10:15:46 +04:00
thread - > debug . dbcr0 = 0 ;
2010-02-08 14:51:18 +03:00
# ifdef CONFIG_BOOKE
/*
 * Force User/Supervisor bits to b11 (user-only MSR[PR]=1)
 */
2013-07-04 10:15:46 +04:00
thread - > debug . dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US |
2010-02-08 14:51:18 +03:00
DBCR1_IAC3US | DBCR1_IAC4US ;
/*
 * Force Data Address Compare User/Supervisor bits to be User-only
 * (0b11 MSR[PR]=1) and set all other bits in DBCR2 register to be 0.
 */
2013-07-04 10:15:46 +04:00
thread - > debug . dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US ;
2010-02-08 14:51:18 +03:00
# else
2013-07-04 10:15:46 +04:00
thread - > debug . dbcr1 = 0 ;
2010-02-08 14:51:18 +03:00
# endif
}
2013-11-23 01:52:29 +04:00
/*
 * prime_debug_regs() - load the hardware debug SPRs from *debug.
 *
 * Clears MSR_DE first, then writes the IAC, DAC and (when configured) DVC
 * registers, followed by DBCR0, DBCR1 and, on CONFIG_BOOKE, DBCR2.
 */
static void prime_debug_regs ( struct debug_reg * debug )
2010-02-08 14:51:18 +03:00
{
2013-05-13 18:14:53 +04:00
/*
 * We could have inherited MSR_DE from userspace, since
 * it doesn't get cleared on exception entry. Make sure
 * MSR_DE is clear before we enable any debug events.
 */
mtmsr ( mfmsr ( ) & ~ MSR_DE ) ;
2013-11-23 01:52:29 +04:00
mtspr ( SPRN_IAC1 , debug - > iac1 ) ;
mtspr ( SPRN_IAC2 , debug - > iac2 ) ;
2010-02-08 14:51:18 +03:00
# if CONFIG_PPC_ADV_DEBUG_IACS > 2
2013-11-23 01:52:29 +04:00
mtspr ( SPRN_IAC3 , debug - > iac3 ) ;
mtspr ( SPRN_IAC4 , debug - > iac4 ) ;
2010-02-08 14:51:18 +03:00
# endif
2013-11-23 01:52:29 +04:00
mtspr ( SPRN_DAC1 , debug - > dac1 ) ;
mtspr ( SPRN_DAC2 , debug - > dac2 ) ;
2010-02-08 14:51:18 +03:00
# if CONFIG_PPC_ADV_DEBUG_DVCS > 0
2013-11-23 01:52:29 +04:00
mtspr ( SPRN_DVC1 , debug - > dvc1 ) ;
mtspr ( SPRN_DVC2 , debug - > dvc2 ) ;
2010-02-08 14:51:18 +03:00
# endif
/* The control registers are written after the address/value compare registers. */
2013-11-23 01:52:29 +04:00
mtspr ( SPRN_DBCR0 , debug - > dbcr0 ) ;
mtspr ( SPRN_DBCR1 , debug - > dbcr1 ) ;
2010-02-08 14:51:18 +03:00
# ifdef CONFIG_BOOKE
2013-11-23 01:52:29 +04:00
mtspr ( SPRN_DBCR2 , debug - > dbcr2 ) ;
2010-02-08 14:51:18 +03:00
# endif
}
/*
 * Load the debug registers from the values in new_debug if either the
 * current thread or new_debug has DBCR0_IDM set in dbcr0. When neither
 * has it set, the hardware registers are left untouched.
 */
2013-11-23 01:52:29 +04:00
void switch_booke_debug_regs ( struct debug_reg * new_debug )
2010-02-08 14:51:18 +03:00
{
2013-07-04 10:15:46 +04:00
if ( ( current - > thread . debug . dbcr0 & DBCR0_IDM )
2013-11-23 01:52:29 +04:00
| | ( new_debug - > dbcr0 & DBCR0_IDM ) )
prime_debug_regs ( new_debug ) ;
2010-02-08 14:51:18 +03:00
}
2013-07-04 10:57:44 +04:00
EXPORT_SYMBOL_GPL ( switch_booke_debug_regs ) ;
2010-02-08 14:51:18 +03:00
# else /* !CONFIG_PPC_ADV_DEBUG_REGS */
2011-02-10 07:44:35 +03:00
# ifndef CONFIG_HAVE_HW_BREAKPOINT
2020-05-14 14:17:34 +03:00
/*
 * set_breakpoint() - install brk in watchpoint slot i with preemption
 * disabled around the call to __set_breakpoint(), which updates this CPU's
 * current_brk[] entry as well as the hardware.
 */
static void set_breakpoint ( int i , struct arch_hw_breakpoint * brk )
2018-07-05 19:25:05 +03:00
{
preempt_disable ( ) ;
2020-05-14 14:17:34 +03:00
__set_breakpoint ( i , brk ) ;
2018-07-05 19:25:05 +03:00
preempt_enable ( ) ;
}
2010-02-08 14:51:18 +03:00
/*
 * set_debug_reg_defaults() - reset every watchpoint slot of thread to an
 * all-zero breakpoint.
 *
 * The thread's hw_brk[] entries are always cleared. The cleared entry is
 * also installed via set_breakpoint(), but only when
 * ppc_breakpoint_available() reports usable breakpoint hardware.
 */
static void set_debug_reg_defaults ( struct thread_struct * thread )
{
2020-05-14 14:17:34 +03:00
int i ;
struct arch_hw_breakpoint null_brk = { 0 } ;
for ( i = 0 ; i < nr_wp_slots ( ) ; i + + ) {
thread - > hw_brk [ i ] = null_brk ;
if ( ppc_breakpoint_available ( ) )
set_breakpoint ( i , & thread - > hw_brk [ i ] ) ;
}
}
static inline bool hw_brk_match ( struct arch_hw_breakpoint * a ,
struct arch_hw_breakpoint * b )
{
if ( a - > address ! = b - > address )
return false ;
if ( a - > type ! = b - > type )
return false ;
if ( a - > len ! = b - > len )
return false ;
/* no need to check hw_len. it's calculated from address and len */
return true ;
}
/*
 * switch_hw_breakpoint() - install the watchpoints of task new on this CPU.
 *
 * Each slot is reprogrammed only when new's hw_brk[i] differs, according
 * to hw_brk_match(), from what this CPU's current_brk[i] says is installed.
 */
static void switch_hw_breakpoint ( struct task_struct * new )
{
int i ;
for ( i = 0 ; i < nr_wp_slots ( ) ; i + + ) {
/* Matching settings are marked as the likely case; nothing to do then. */
if ( likely ( hw_brk_match ( this_cpu_ptr ( & current_brk [ i ] ) ,
& new - > thread . hw_brk [ i ] ) ) )
continue ;
__set_breakpoint ( i , & new - > thread . hw_brk [ i ] ) ;
}
2010-02-08 14:51:18 +03:00
}
2011-02-10 07:44:35 +03:00
# endif /* !CONFIG_HAVE_HW_BREAKPOINT */
2010-02-08 14:51:18 +03:00
# endif /* CONFIG_PPC_ADV_DEBUG_REGS */
2012-12-20 18:06:44 +04:00
/*
 * set_dabr() - program a DABR-style data breakpoint described by brk.
 *
 * The dabr value is the breakpoint address OR-ed with the HW_BRK_TYPE_DABR
 * bits of brk->type; dabrx is bits 3..5 of brk->type.
 *
 * Returns the result of ppc_md.set_dabr() when that hook is set. Otherwise
 * returns 0 after writing the registers for CONFIG_PPC_ADV_DEBUG_REGS or
 * CONFIG_PPC_BOOK3S, or -EINVAL when neither is enabled.
 */
static inline int set_dabr ( struct arch_hw_breakpoint * brk )
{
unsigned long dabr , dabrx ;
dabr = brk - > address | ( brk - > type & HW_BRK_TYPE_DABR ) ;
dabrx = ( ( brk - > type > > 3 ) & 0x7 ) ;
/* A machine-specific hook, when present, takes over completely. */
if ( ppc_md . set_dabr )
return ppc_md . set_dabr ( dabr , dabrx ) ;
2020-12-04 13:12:51 +03:00
if ( IS_ENABLED ( CONFIG_PPC_ADV_DEBUG_REGS ) ) {
/* Only dabr is written here (to DAC1); dabrx is not used on this path. */
mtspr ( SPRN_DAC1 , dabr ) ;
if ( IS_ENABLED ( CONFIG_PPC_47x ) )
isync ( ) ;
return 0 ;
} else if ( IS_ENABLED ( CONFIG_PPC_BOOK3S ) ) {
mtspr ( SPRN_DABR , dabr ) ;
/* DABRX is written only on CPUs that advertise CPU_FTR_DABRX. */
if ( cpu_has_feature ( CPU_FTR_DABRX ) )
mtspr ( SPRN_DABRX , dabrx ) ;
return 0 ;
} else {
return - EINVAL ;
}
2012-12-20 18:06:44 +04:00
}
2019-11-26 20:43:29 +03:00
/*
 * set_breakpoint_8xx() - program the 8xx CMPE/CMPF comparators and the
 * LCTRL1/LCTRL2 control registers for the range described by brk.
 *
 * The watched range is widened to HW_BREAKPOINT_SIZE alignment: the start
 * is rounded down and the end (address + len) is rounded up.
 *
 * Always returns 0. When brk->type has no HW_BRK_TYPE_RDWR bits, LCTRL2 is
 * left at 0 and no other register is written.
 */
static inline int set_breakpoint_8xx ( struct arch_hw_breakpoint * brk )
{
unsigned long lctrl1 = LCTRL1_CTE_GT | LCTRL1_CTF_LT | LCTRL1_CRWE_RW |
LCTRL1_CRWF_RW ;
unsigned long lctrl2 = LCTRL2_LW0EN | LCTRL2_LW0LADC | LCTRL2_SLW0EN ;
2020-05-14 14:17:37 +03:00
unsigned long start_addr = ALIGN_DOWN ( brk - > address , HW_BREAKPOINT_SIZE ) ;
unsigned long end_addr = ALIGN ( brk - > address + brk - > len , HW_BREAKPOINT_SIZE ) ;
2019-11-26 20:43:29 +03:00
/*
 * Choose which comparator(s) take part in the match.
 * NOTE(review): presumably only F is used when start_addr is 0 (because
 * start_addr - 1 would wrap) and only E when end_addr is 0 - confirm
 * against the 8xx manual.
 */
if ( start_addr = = 0 )
lctrl2 | = LCTRL2_LW0LA_F ;
2020-05-14 14:17:37 +03:00
else if ( end_addr = = 0 )
2019-11-26 20:43:29 +03:00
lctrl2 | = LCTRL2_LW0LA_E ;
else
lctrl2 | = LCTRL2_LW0LA_EandF ;
/* Zero LCTRL2 before touching the comparators; it is rewritten last. */
mtspr ( SPRN_LCTRL2 , 0 ) ;
if ( ( brk - > type & HW_BRK_TYPE_RDWR ) = = 0 )
return 0 ;
if ( ( brk - > type & HW_BRK_TYPE_RDWR ) = = HW_BRK_TYPE_READ )
lctrl1 | = LCTRL1_CRWE_RO | LCTRL1_CRWF_RO ;
if ( ( brk - > type & HW_BRK_TYPE_RDWR ) = = HW_BRK_TYPE_WRITE )
lctrl1 | = LCTRL1_CRWE_WO | LCTRL1_CRWF_WO ;
/* E is given start_addr - 1 and F is given end_addr. */
mtspr ( SPRN_CMPE , start_addr - 1 ) ;
2020-05-14 14:17:37 +03:00
mtspr ( SPRN_CMPF , end_addr ) ;
2019-11-26 20:43:29 +03:00
mtspr ( SPRN_LCTRL1 , lctrl1 ) ;
mtspr ( SPRN_LCTRL2 , lctrl2 ) ;
return 0 ;
}
2022-11-09 07:51:04 +03:00
/*
 * set_hw_breakpoint() - write brk into watchpoint slot nr of the hardware,
 * picking the mechanism the CPU provides.
 *
 * Order of preference: DAWR when dawr_enabled(), the 8xx comparators when
 * CONFIG_PPC_8xx is enabled, DABR on CPUs without CPU_FTR_ARCH_207S. Any
 * other combination triggers a one-time warning. Only the DAWR path uses
 * nr. The return values of set_breakpoint_8xx() and set_dabr() are not
 * checked.
 */
static void set_hw_breakpoint ( int nr , struct arch_hw_breakpoint * brk )
2012-12-20 18:06:44 +04:00
{
2019-04-01 09:03:12 +03:00
if ( dawr_enabled ( ) )
2018-04-01 08:50:36 +03:00
// Power8 or later
2020-05-14 14:17:31 +03:00
set_dawr ( nr , brk ) ;
2019-11-26 20:43:29 +03:00
else if ( IS_ENABLED ( CONFIG_PPC_8xx ) )
set_breakpoint_8xx ( brk ) ;
2018-04-01 08:50:36 +03:00
else if ( ! cpu_has_feature ( CPU_FTR_ARCH_207S ) )
// Power7 or earlier
2014-04-29 23:25:16 +04:00
set_dabr ( brk ) ;
2018-04-01 08:50:36 +03:00
else
// Shouldn't happen due to higher level checks
WARN_ON_ONCE ( 1 ) ;
2012-12-20 18:06:44 +04:00
}
2005-09-26 10:04:21 +04:00
2022-11-09 07:51:04 +03:00
/*
 * __set_breakpoint() - install brk in watchpoint slot nr.
 *
 * The settings are first copied into this CPU's current_brk[nr] entry and
 * then written to the hardware with set_hw_breakpoint().
 */
void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
{
	struct arch_hw_breakpoint *cached = this_cpu_ptr(&current_brk[nr]);

	memcpy(cached, brk, sizeof(*cached));
	set_hw_breakpoint(nr, brk);
}
2018-03-27 07:37:17 +03:00
/*
 * Check if we have DAWR or DABR hardware.
 *
 * Returns true when dawr_enabled() is true, false when it is not but the
 * CPU has CPU_FTR_ARCH_207S, and true in every other case (DABR).
 */
bool ppc_breakpoint_available ( void )
{
2019-04-01 09:03:12 +03:00
if ( dawr_enabled ( ) )
return true ; /* POWER8 DAWR or POWER9 forced DAWR */
2018-03-27 07:37:17 +03:00
if ( cpu_has_feature ( CPU_FTR_ARCH_207S ) )
return false ; /* POWER9 with DAWR disabled */
/* DABR: Everything but POWER8 and POWER9 */
return true ;
}
EXPORT_SYMBOL_GPL ( ppc_breakpoint_available ) ;
2022-11-09 07:51:04 +03:00
/* Disable the breakpoint in hardware without touching current_brk[] */
void suspend_breakpoints ( void )
{
struct arch_hw_breakpoint brk = { 0 } ;
int i ;
if ( ! ppc_breakpoint_available ( ) )
return ;
for ( i = 0 ; i < nr_wp_slots ( ) ; i + + )
set_hw_breakpoint ( i , & brk ) ;
}
/*
 * restore_breakpoints() - undo suspend_breakpoints().
 *
 * Each hardware watchpoint slot is rewritten from this CPU's current_brk[]
 * entry. Does nothing when ppc_breakpoint_available() reports no
 * breakpoint hardware.
 */
void restore_breakpoints(void)
{
	int slot;

	if (ppc_breakpoint_available()) {
		for (slot = 0; slot < nr_wp_slots(); slot++) {
			struct arch_hw_breakpoint *saved;

			saved = this_cpu_ptr(&current_brk[slot]);
			set_hw_breakpoint(slot, saved);
		}
	}
}
2013-02-13 20:21:37 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2016-09-14 11:02:16 +03:00
static inline bool tm_enabled ( struct task_struct * tsk )
{
return tsk & & tsk - > thread . regs & & ( tsk - > thread . regs - > msr & MSR_TM ) ;
}
2018-02-01 04:07:46 +03:00
static void tm_reclaim_thread ( struct thread_struct * thr , uint8_t cause )
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
{
2015-11-19 07:44:45 +03:00
/*
* Use the current MSR TM suspended bit to track if we have
* checkpointed state outstanding .
* On signal delivery , we ' d normally reclaim the checkpointed
* state to obtain stack pointer ( see : get_tm_stackpointer ( ) ) .
* This will then directly return to userspace without going
* through __switch_to ( ) . However , if the stack frame is bad ,
* we need to exit this thread which calls __switch_to ( ) which
* will again attempt to reclaim the already saved tm state .
* Hence we need to check that we ' ve not already reclaimed
* this state .
* We do this using the current MSR , rather than tracking it in
* some specific thread_struct bit , as it has the additional
2016-06-01 09:34:37 +03:00
* benefit of checking for a potential TM bad thing exception .
2015-11-19 07:44:45 +03:00
*/
if ( ! MSR_TM_SUSPENDED ( mfmsr ( ) ) )
return ;
2017-11-02 06:09:04 +03:00
giveup_all ( container_of ( thr , struct task_struct , thread ) ) ;
2017-11-02 06:09:05 +03:00
tm_reclaim ( thr , cause ) ;
2017-05-08 10:16:26 +03:00
/*
* If we are in a transaction and FP is off then we can ' t have
* used FP inside that transaction . Hence the checkpointed
* state is the same as the live state . We need to copy the
* live state to the checkpointed state so that when the
* transaction is restored , the checkpointed state is correct
* and the aborted transaction sees the correct state . We use
* ckpt_regs . msr here as that ' s what tm_reclaim will use to
* determine if it ' s going to write the checkpointed state or
* not . So either this will write the checkpointed registers ,
* or reclaim will . Similarly for VMX .
*/
if ( ( thr - > ckpt_regs . msr & MSR_FP ) = = 0 )
memcpy ( & thr - > ckfp_state , & thr - > fp_state ,
sizeof ( struct thread_fp_state ) ) ;
if ( ( thr - > ckpt_regs . msr & MSR_VEC ) = = 0 )
memcpy ( & thr - > ckvr_state , & thr - > vr_state ,
sizeof ( struct thread_vr_state ) ) ;
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
}
/*
 * tm_reclaim_current - reclaim transactional state for the running task.
 * @cause: value forwarded unchanged to tm_reclaim_thread(); another caller in
 *         this file passes TM_CAUSE_RESCHED, so this is presumably one of the
 *         TM_CAUSE_* codes (confirm against the header that defines them).
 *
 * Calls tm_enable() first and then tm_reclaim_thread() on current->thread.
 * Nothing is returned.
 */
void tm_reclaim_current ( uint8_t cause )
{
tm_enable ( ) ;
2018-02-01 04:07:46 +03:00
tm_reclaim_thread ( & current - > thread , cause ) ;
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
}
2013-02-13 20:21:37 +04:00
static inline void tm_reclaim_task ( struct task_struct * tsk )
{
/* We have to work out if we're switching from/to a task that's in the
* middle of a transaction .
*
* In switching we need to maintain a 2 nd register state as
* oldtask - > thread . ckpt_regs . We tm_reclaim ( oldproc ) ; this saves the
2016-09-23 09:18:25 +03:00
* checkpointed ( tbegin ) state in ckpt_regs , ckfp_state and
* ckvr_state
2013-02-13 20:21:37 +04:00
*
* We also context switch ( save ) TFHAR / TEXASR / TFIAR in here .
*/
struct thread_struct * thr = & tsk - > thread ;
if ( ! thr - > regs )
return ;
if ( ! MSR_TM_ACTIVE ( thr - > regs - > msr ) )
goto out_and_saveregs ;
2017-10-12 13:17:19 +03:00
WARN_ON ( tm_suspend_disabled ) ;
2013-02-13 20:21:37 +04:00
TM_DEBUG ( " --- tm_reclaim on pid %d (NIP=%lx, "
" ccr=%lx, msr=%lx, trap=%lx) \n " ,
tsk - > pid , thr - > regs - > nip ,
thr - > regs - > ccr , thr - > regs - > msr ,
thr - > regs - > trap ) ;
2018-02-01 04:07:46 +03:00
tm_reclaim_thread ( thr , TM_CAUSE_RESCHED ) ;
2013-02-13 20:21:37 +04:00
TM_DEBUG ( " --- tm_reclaim on pid %d complete \n " ,
tsk - > pid ) ;
out_and_saveregs :
/* Always save the regs here, even if a transaction's not active.
* This context - switches a thread ' s TM info SPRs . We do it here to
* be consistent with the restore path ( in recheckpoint ) which
* cannot happen later in _switch ( ) .
*/
tm_save_sprs ( thr ) ;
}
2017-11-02 06:09:05 +03:00
/* Low-level recheckpoint routine; defined outside this part of the file. */
extern void __tm_recheckpoint ( struct thread_struct * thread ) ;
2014-04-04 13:19:48 +04:00
2017-11-02 06:09:05 +03:00
/*
 * tm_recheckpoint - restore the TM SPRs and recheckpoint a thread.
 * @thread: thread_struct whose state is recheckpointed.
 *
 * Returns immediately when MSR_TM is not set in thread->regs->msr.
 * Otherwise the TM SPRs are restored with tm_restore_sprs() and
 * __tm_recheckpoint() is called, with the whole sequence bracketed by
 * local_irq_save()/hard_irq_disable() and local_irq_restore().
 */
void tm_recheckpoint ( struct thread_struct * thread )
2014-04-04 13:19:48 +04:00
{
/* Saved interrupt state, handed back to local_irq_restore() below. */
unsigned long flags ;
2016-09-14 11:02:16 +03:00
if ( ! ( thread - > regs - > msr & MSR_TM ) )
return ;
2014-04-04 13:19:48 +04:00
/* We really can't be interrupted here as the TEXASR registers can't
 * change and, later in the trecheckpoint code, we have a userspace R1.
 * So let's hard disable over this region.
 */
local_irq_save ( flags ) ;
hard_irq_disable ( ) ;
/* The TM SPRs are restored here, so that TEXASR.FS can be set
 * before the trecheckpoint and no explosion occurs.
 */
tm_restore_sprs ( thread ) ;
2017-11-02 06:09:05 +03:00
__tm_recheckpoint ( thread ) ;
2014-04-04 13:19:48 +04:00
local_irq_restore ( flags ) ;
}
2013-02-13 20:21:40 +04:00
static inline void tm_recheckpoint_new_task ( struct task_struct * new )
2013-02-13 20:21:37 +04:00
{
if ( ! cpu_has_feature ( CPU_FTR_TM ) )
return ;
/* Recheckpoint the registers of the thread we're about to switch to.
*
* If the task was using FP , we non - lazily reload both the original and
* the speculative FP register states . This is because the kernel
* doesn ' t see if / when a TM rollback occurs , so if we take an FP
2016-09-23 09:18:24 +03:00
* unavailable later , we are unable to determine which set of FP regs
2013-02-13 20:21:37 +04:00
* need to be restored .
*/
2016-09-14 11:02:16 +03:00
if ( ! tm_enabled ( new ) )
2013-02-13 20:21:37 +04:00
return ;
2014-04-04 13:19:48 +04:00
if ( ! MSR_TM_ACTIVE ( new - > thread . regs - > msr ) ) {
tm_restore_sprs ( & new - > thread ) ;
2013-02-13 20:21:37 +04:00
return ;
2014-04-04 13:19:48 +04:00
}
2013-02-13 20:21:37 +04:00
/* Recheckpoint to restore original checkpointed register state. */
2017-11-02 06:09:05 +03:00
TM_DEBUG ( " *** tm_recheckpoint of pid %d (new->msr 0x%lx) \n " ,
new - > pid , new - > thread . regs - > msr ) ;
2013-02-13 20:21:37 +04:00
2017-11-02 06:09:05 +03:00
tm_recheckpoint ( & new - > thread ) ;
2013-02-13 20:21:37 +04:00
2016-09-23 09:18:24 +03:00
/*
* The checkpointed state has been restored but the live state has
* not , ensure all the math functionality is turned off to trigger
* restore_math ( ) to reload .
*/
new - > thread . regs - > msr & = ~ ( MSR_FP | MSR_VEC | MSR_VSX ) ;
2013-02-13 20:21:37 +04:00
TM_DEBUG ( " *** tm_recheckpoint of pid %d complete "
" (kernel msr 0x%lx) \n " ,
new - > pid , mfmsr ( ) ) ;
}
2016-09-23 09:18:24 +03:00
static inline void __switch_to_tm ( struct task_struct * prev ,
struct task_struct * new )
2013-02-13 20:21:37 +04:00
{
if ( cpu_has_feature ( CPU_FTR_TM ) ) {
2016-09-14 11:02:16 +03:00
if ( tm_enabled ( prev ) | | tm_enabled ( new ) )
tm_enable ( ) ;
if ( tm_enabled ( prev ) ) {
prev - > thread . load_tm + + ;
tm_reclaim_task ( prev ) ;
if ( ! MSR_TM_ACTIVE ( prev - > thread . regs - > msr ) & & prev - > thread . load_tm = = 0 )
prev - > thread . regs - > msr & = ~ MSR_TM ;
}
2016-09-23 09:18:24 +03:00
tm_recheckpoint_new_task ( new ) ;
2013-02-13 20:21:37 +04:00
}
}
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
/*
 * restore_tm_state - reload FP/VMX/VSX state before returning to userspace.
 * @regs: user register frame that is about to be returned to.
 *
 * This is called if we are on the way out to userspace and the
 * TIF_RESTORE_TM flag is set.  It checks whether FP and/or vector state
 * needs to be reloaded and does so if necessary.
 *
 * If userspace is inside a transaction (whether active or suspended) and
 * FP/VMX/VSX instructions have ever been enabled inside that transaction,
 * then we have to keep them enabled and keep the FP/VMX/VSX state loaded
 * for as long as the transaction continues.  The reason is that if we did
 * not, and subsequently got an FP/VMX/VSX unavailable interrupt inside a
 * transaction, we would not know whether it is the same transaction, and
 * thus would not know which of the checkpointed state and the
 * transactional state to use.
 *
 * Steps, in order: clear TIF_RESTORE_TM; return if no transaction is active
 * in regs->msr; collect the FP/VEC/VSX bits that are set in
 * current->thread.ckpt_regs.msr but clear in regs->msr; set load_fp (and
 * load_vec when Altivec is configured and present) for those bits; call
 * restore_math(); finally OR the collected bits into the return MSR.
 */
void restore_tm_state ( struct pt_regs * regs )
{
/* Facility bits enabled in the checkpointed MSR but not in regs->msr. */
unsigned long msr_diff ;
2016-09-23 09:18:24 +03:00
/*
* This is the only moment we should clear TIF_RESTORE_TM as
* it is here that ckpt_regs . msr and pt_regs . msr become the same
* again , anything else could lead to an incorrect ckpt_msr being
* saved and therefore incorrect signal contexts .
*/
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
clear_thread_flag ( TIF_RESTORE_TM ) ;
if ( ! MSR_TM_ACTIVE ( regs - > msr ) )
return ;
2015-07-06 13:54:10 +03:00
msr_diff = current - > thread . ckpt_regs . msr & ~ regs - > msr ;
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
msr_diff & = MSR_FP | MSR_VEC | MSR_VSX ;
2016-02-29 09:53:47 +03:00
2016-09-23 09:18:08 +03:00
/* Ensure that restore_math() will restore */
if ( msr_diff & MSR_FP )
current - > thread . load_fp = 1 ;
2016-10-05 08:57:26 +03:00
# ifdef CONFIG_ALTIVEC
2016-09-23 09:18:08 +03:00
if ( cpu_has_feature ( CPU_FTR_ALTIVEC ) & & msr_diff & MSR_VEC )
current - > thread . load_vec = 1 ;
# endif
2016-02-29 09:53:47 +03:00
restore_math ( regs ) ;
2021-06-17 18:51:03 +03:00
regs_set_return_msr ( regs , regs - > msr | msr_diff ) ;
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 08:56:29 +04:00
}
2021-02-27 04:12:54 +03:00
# else /* !CONFIG_PPC_TRANSACTIONAL_MEM */
2013-02-13 20:21:37 +04:00
/*
 * Stubs for the !CONFIG_PPC_TRANSACTIONAL_MEM branch of the enclosing
 * conditional: the two hooks expand to nothing and tm_reclaim_current()
 * has an empty body.
 */
# define tm_recheckpoint_new_task(new)
2016-09-23 09:18:24 +03:00
# define __switch_to_tm(prev, new)
2021-02-27 04:12:54 +03:00
void tm_reclaim_current ( uint8_t cause ) { }
2013-02-13 20:21:37 +04:00
# endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
2012-12-20 18:06:44 +04:00
2015-10-29 03:43:55 +03:00
/*
 * save_sprs() - copy a set of special-purpose registers into @t.
 * @t: thread struct that receives the values read with mfspr().
 *
 * Each register group is read only when its config option is built in
 * and cpu_has_feature() reports the matching feature bit.
 */
static inline void save_sprs(struct thread_struct *t)
{
#ifdef CONFIG_ALTIVEC
	if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
		t->vrsave = mfspr(SPRN_VRSAVE);
	}
#endif

#ifdef CONFIG_SPE
	if (cpu_has_feature(CPU_FTR_SPE)) {
		t->spefscr = mfspr(SPRN_SPEFSCR);
	}
#endif

#ifdef CONFIG_PPC_BOOK3S_64
	if (cpu_has_feature(CPU_FTR_DSCR)) {
		t->dscr = mfspr(SPRN_DSCR);
	}

	/* The remaining registers are only read on ARCH_207S CPUs. */
	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
		return;

	t->bescr = mfspr(SPRN_BESCR);
	t->ebbhr = mfspr(SPRN_EBBHR);
	t->ebbrr = mfspr(SPRN_EBBRR);
	t->fscr = mfspr(SPRN_FSCR);

	/*
	 * Note that the TAR is not available for use in the kernel.
	 * (To provide this, the TAR should be backed up/restored on
	 * exception entry/exit instead, and be in pt_regs.  FIXME,
	 * this should be in pt_regs anyway (for debug).)
	 */
	t->tar = mfspr(SPRN_TAR);
#endif
}
2021-11-23 12:51:57 +03:00
# ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
void kvmppc_save_user_regs ( void )
{
unsigned long usermsr ;
if ( ! current - > thread . regs )
return ;
usermsr = current - > thread . regs - > msr ;
if ( usermsr & MSR_FP )
save_fpu ( current ) ;
if ( usermsr & MSR_VEC )
save_altivec ( current ) ;
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if ( usermsr & MSR_TM ) {
current - > thread . tm_tfhar = mfspr ( SPRN_TFHAR ) ;
current - > thread . tm_tfiar = mfspr ( SPRN_TFIAR ) ;
current - > thread . tm_texasr = mfspr ( SPRN_TEXASR ) ;
current - > thread . regs - > msr & = ~ MSR_TM ;
}
# endif
}
EXPORT_SYMBOL_GPL ( kvmppc_save_user_regs ) ;
2021-11-23 12:52:16 +03:00
/* Exported wrapper that runs save_sprs() on the current task's thread struct. */
void kvmppc_save_current_sprs ( void )
{
save_sprs ( & current - > thread ) ;
}
EXPORT_SYMBOL_GPL ( kvmppc_save_current_sprs ) ;
2021-11-23 12:51:57 +03:00
# endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
2015-10-29 03:43:55 +03:00
/*
 * restore_sprs() - write the incoming thread's SPR values with mtspr().
 * @old_thread: thread struct whose values are compared against
 * @new_thread: thread struct whose values are to be loaded
 *
 * A register is written only when the value recorded in @old_thread
 * differs from the one wanted for @new_thread, and only when the
 * matching config option and CPU feature are present.
 */
static inline void restore_sprs(struct thread_struct *old_thread,
				struct thread_struct *new_thread)
{
#ifdef CONFIG_ALTIVEC
	if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
	    old_thread->vrsave != new_thread->vrsave) {
		mtspr(SPRN_VRSAVE, new_thread->vrsave);
	}
#endif

#ifdef CONFIG_SPE
	if (cpu_has_feature(CPU_FTR_SPE) &&
	    old_thread->spefscr != new_thread->spefscr) {
		mtspr(SPRN_SPEFSCR, new_thread->spefscr);
	}
#endif

#ifdef CONFIG_PPC_BOOK3S_64
	if (cpu_has_feature(CPU_FTR_DSCR)) {
		/*
		 * The thread's own DSCR is used when dscr_inherit is set,
		 * otherwise the paca's dscr_default.
		 */
		u64 dscr = new_thread->dscr_inherit ? new_thread->dscr
						    : get_paca()->dscr_default;

		if (old_thread->dscr != dscr)
			mtspr(SPRN_DSCR, dscr);
	}

	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
		if (old_thread->bescr != new_thread->bescr)
			mtspr(SPRN_BESCR, new_thread->bescr);

		if (old_thread->ebbhr != new_thread->ebbhr)
			mtspr(SPRN_EBBHR, new_thread->ebbhr);

		if (old_thread->ebbrr != new_thread->ebbrr)
			mtspr(SPRN_EBBRR, new_thread->ebbrr);

		if (old_thread->fscr != new_thread->fscr)
			mtspr(SPRN_FSCR, new_thread->fscr);

		if (old_thread->tar != new_thread->tar)
			mtspr(SPRN_TAR, new_thread->tar);
	}

	if (cpu_has_feature(CPU_FTR_P9_TIDR) &&
	    old_thread->tidr != new_thread->tidr) {
		mtspr(SPRN_TIDR, new_thread->tidr);
	}
#endif
}
2005-09-26 10:04:21 +04:00
struct task_struct * __switch_to ( struct task_struct * prev ,
struct task_struct * new )
{
struct thread_struct * new_thread , * old_thread ;
struct task_struct * last ;
2021-12-01 17:41:52 +03:00
# ifdef CONFIG_PPC_64S_HASH_MMU
2011-05-25 04:11:48 +04:00
struct ppc64_tlb_batch * batch ;
# endif
2005-09-26 10:04:21 +04:00
2015-10-29 03:43:55 +03:00
new_thread = & new - > thread ;
old_thread = & current - > thread ;
2013-10-02 11:15:14 +04:00
WARN_ON ( ! irqs_disabled ( ) ) ;
2021-12-01 17:41:52 +03:00
# ifdef CONFIG_PPC_64S_HASH_MMU
powerpc: Replace __get_cpu_var uses
This still has not been merged and now powerpc is the only arch that does
not have this change. Sorry about missing linuxppc-dev before.
V2->V2
- Fix up to work against 3.18-rc1
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
At the end of the patch set all uses of __get_cpu_var have been removed so
the macro is removed too.
The patch set includes passes over all arches as well. Once these operations
are used throughout then specialized macros can be defined in non -x86
arches as well in order to optimize per cpu access by f.e. using a global
register that may be set to the per cpu base.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
[mpe: Fix build errors caused by set/or_softirq_pending(), and rework
assignment in __set_breakpoint() to use memcpy().]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2014-10-22 00:23:25 +04:00
batch = this_cpu_ptr ( & ppc64_tlb_batch ) ;
2011-05-25 04:11:48 +04:00
if ( batch - > active ) {
current_thread_info ( ) - > local_flags | = _TLF_LAZY_MMU ;
if ( batch - > index )
__flush_tlb_pending ( batch ) ;
batch - > active = 0 ;
}
2021-06-22 08:30:36 +03:00
/*
* On POWER9 the copy - paste buffer can only paste into
* foreign real addresses , so unprivileged processes can not
* see the data or use it in any way unless they have
* foreign real mappings . If the new process has the foreign
* real address mappings , we must issue a cp_abort to clear
* any state and prevent snooping , corruption or a covert
* channel . ISA v3 .1 supports paste into local memory .
*/
if ( new - > mm & & ( cpu_has_feature ( CPU_FTR_ARCH_31 ) | |
atomic_read ( & new - > mm - > context . vas_windows ) ) )
asm volatile ( PPC_CP_ABORT ) ;
2017-10-19 07:08:43 +03:00
# endif /* CONFIG_PPC_BOOK3S_64 */
2005-10-10 16:29:05 +04:00
2015-10-29 03:44:10 +03:00
# ifdef CONFIG_PPC_ADV_DEBUG_REGS
switch_booke_debug_regs ( & new - > thread . debug ) ;
# else
/*
* For PPC_BOOK3S_64 , we use the hw - breakpoint interfaces that would
* schedule DABR
*/
# ifndef CONFIG_HAVE_HW_BREAKPOINT
2020-05-14 14:17:34 +03:00
switch_hw_breakpoint ( new ) ;
2015-10-29 03:44:10 +03:00
# endif /* CONFIG_HAVE_HW_BREAKPOINT */
# endif
/*
* We need to save SPRs before treclaim / trecheckpoint as these will
* change a number of them .
*/
save_sprs ( & prev - > thread ) ;
/* Save FPU, Altivec, VSX and SPE state */
giveup_all ( prev ) ;
2016-09-23 09:18:24 +03:00
__switch_to_tm ( prev , new ) ;
2017-06-08 18:36:06 +03:00
if ( ! radix_enabled ( ) ) {
/*
* We can ' t take a PMU exception inside _switch ( ) since there
* is a window where the kernel stack SLB and the kernel stack
* are out of sync . Hard disable here .
*/
hard_irq_disable ( ) ;
}
2013-02-13 20:21:40 +04:00
2015-12-10 12:44:39 +03:00
/*
2021-06-17 18:51:03 +03:00
* Call restore_sprs ( ) and set_return_regs_changed ( ) before calling
* _switch ( ) . If we move it after _switch ( ) then we miss out on calling
* it for new tasks . The reason for this is we manually create a stack
* frame for new tasks that directly returns through ret_from_fork ( ) or
2015-12-10 12:44:39 +03:00
* ret_from_kernel_thread ( ) . See copy_thread ( ) for details .
*/
2015-10-29 03:44:10 +03:00
restore_sprs ( old_thread , new_thread ) ;
2021-06-17 18:51:03 +03:00
set_return_regs_changed ( ) ; /* _switch changes stack (and regs) */
2021-10-19 10:29:26 +03:00
if ( ! IS_ENABLED ( CONFIG_PPC_BOOK3S_64 ) )
kuap_assert_locked ( ) ;
2015-12-10 12:44:39 +03:00
last = _switch ( old_thread , new_thread ) ;
2021-06-22 08:30:36 +03:00
/*
* Nothing after _switch will be run for newly created tasks ,
* because they switch directly to ret_from_fork / ret_from_kernel_thread
* etc . Code added here should have a comment explaining why that is
* okay .
*/
2017-10-19 07:08:43 +03:00
# ifdef CONFIG_PPC_BOOK3S_64
2021-12-01 17:41:52 +03:00
# ifdef CONFIG_PPC_64S_HASH_MMU
2021-06-22 08:30:36 +03:00
/*
* This applies to a process that was context switched while inside
* arch_enter_lazy_mmu_mode ( ) , to re - activate the batch that was
* deactivated above , before _switch ( ) . This will never be the case
* for new tasks .
*/
2011-05-25 04:11:48 +04:00
if ( current_thread_info ( ) - > local_flags & _TLF_LAZY_MMU ) {
current_thread_info ( ) - > local_flags & = ~ _TLF_LAZY_MMU ;
powerpc: Replace __get_cpu_var uses
This still has not been merged and now powerpc is the only arch that does
not have this change. Sorry about missing linuxppc-dev before.
V2->V2
- Fix up to work against 3.18-rc1
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
At the end of the patch set all uses of __get_cpu_var have been removed so
the macro is removed too.
The patch set includes passes over all arches as well. Once these operations
are used throughout then specialized macros can be defined in non -x86
arches as well in order to optimize per cpu access by f.e. using a global
register that may be set to the per cpu base.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
[mpe: Fix build errors caused by set/or_softirq_pending(), and rework
assignment in __set_breakpoint() to use memcpy().]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2014-10-22 00:23:25 +04:00
batch = this_cpu_ptr ( & ppc64_tlb_batch ) ;
2011-05-25 04:11:48 +04:00
batch - > active = 1 ;
}
2021-12-01 17:41:52 +03:00
# endif
2016-02-29 09:53:47 +03:00
2021-06-22 08:30:36 +03:00
/*
* Math facilities are masked out of the child MSR in copy_thread .
* A new task does not need to restore_math because it will
* demand fault them .
*/
if ( current - > thread . regs )
2019-01-17 15:25:12 +03:00
restore_math ( current - > thread . regs ) ;
2017-10-19 07:08:43 +03:00
# endif /* CONFIG_PPC_BOOK3S_64 */
2011-05-25 04:11:48 +04:00
2005-09-26 10:04:21 +04:00
return last ;
}
2018-10-06 19:51:16 +03:00
# define NR_INSN_TO_PRINT 16
2005-10-10 16:29:05 +04:00
static void show_instructions ( struct pt_regs * regs )
{
int i ;
2020-05-24 12:38:19 +03:00
unsigned long nip = regs - > nip ;
2018-10-06 19:51:16 +03:00
unsigned long pc = regs - > nip - ( NR_INSN_TO_PRINT * 3 / 4 * sizeof ( int ) ) ;
2005-10-10 16:29:05 +04:00
2022-10-06 06:20:19 +03:00
printk ( " Code: " ) ;
2005-10-10 16:29:05 +04:00
2020-05-24 12:38:19 +03:00
/*
* If we were executing with the MMU off for instructions , adjust pc
* rather than printing XXXXXXXX .
*/
if ( ! IS_ENABLED ( CONFIG_BOOKE ) & & ! ( regs - > msr & MSR_IR ) ) {
pc = ( unsigned long ) phys_to_virt ( pc ) ;
nip = ( unsigned long ) phys_to_virt ( regs - > nip ) ;
}
2018-10-06 19:51:16 +03:00
for ( i = 0 ; i < NR_INSN_TO_PRINT ; i + + ) {
2005-10-10 16:29:05 +04:00
int instr ;
2023-02-01 13:04:23 +03:00
if ( get_kernel_nofault ( instr , ( const void * ) pc ) ) {
2016-11-04 09:20:40 +03:00
pr_cont ( " XXXXXXXX " ) ;
2005-10-10 16:29:05 +04:00
} else {
2020-05-24 12:38:19 +03:00
if ( nip = = pc )
2016-11-04 09:20:40 +03:00
pr_cont ( " <%08x> " , instr ) ;
2005-10-10 16:29:05 +04:00
else
2016-11-04 09:20:40 +03:00
pr_cont ( " %08x " , instr ) ;
2005-10-10 16:29:05 +04:00
}
pc + = sizeof ( int ) ;
}
2016-11-04 09:20:40 +03:00
pr_cont ( " \n " ) ;
2005-10-10 16:29:05 +04:00
}
2018-08-02 00:33:19 +03:00
void show_user_instructions ( struct pt_regs * regs )
{
unsigned long pc ;
2018-10-06 19:51:16 +03:00
int n = NR_INSN_TO_PRINT ;
2018-10-06 19:51:14 +03:00
struct seq_buf s ;
char buf [ 96 ] ; /* enough for 8 times 9 + 2 chars */
2018-08-02 00:33:19 +03:00
2018-10-06 19:51:16 +03:00
pc = regs - > nip - ( NR_INSN_TO_PRINT * 3 / 4 * sizeof ( int ) ) ;
2018-08-02 00:33:19 +03:00
2018-10-06 19:51:14 +03:00
seq_buf_init ( & s , buf , sizeof ( buf ) ) ;
2018-08-02 00:33:19 +03:00
2018-10-06 19:51:14 +03:00
while ( n ) {
int i ;
2018-08-02 00:33:19 +03:00
2018-10-06 19:51:14 +03:00
seq_buf_clear ( & s ) ;
2018-08-02 00:33:19 +03:00
2018-10-06 19:51:14 +03:00
for ( i = 0 ; i < 8 & & n ; i + + , n - - , pc + = sizeof ( int ) ) {
int instr ;
2020-06-17 10:37:54 +03:00
if ( copy_from_user_nofault ( & instr , ( void __user * ) pc ,
sizeof ( instr ) ) ) {
2018-10-06 19:51:14 +03:00
seq_buf_printf ( & s , " XXXXXXXX " ) ;
continue ;
}
seq_buf_printf ( & s , regs - > nip = = pc ? " <%08x> " : " %08x " , instr ) ;
2018-08-02 00:33:19 +03:00
}
2018-10-06 19:51:14 +03:00
if ( ! seq_buf_has_overflowed ( & s ) )
pr_info ( " %s[%d]: code: %s \n " , current - > comm ,
current - > pid , s . buffer ) ;
2018-08-02 00:33:19 +03:00
}
}
powerpc: Print MSR TM bits in oops messages
Print MSR TM bits in oops messages. This appends them to the end
like this:
MSR: 8000000502823031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[TE]>
You get the TM[] only if at least one TM MSR bit is set. Inside the
TM[], E means Enabled (bit 32), S means Suspended (bit 33), and T
means Transactional (bit 34)
If no bits are set, you get no TM[] output.
Include rework of printbits() to handle this case.
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2015-11-20 07:15:32 +03:00
/* One named flag of a register: a bit mask and the label printed for it. */
struct regbit {
	unsigned long bit;	/* mask tested against the register value */
	const char *name;	/* label printed when the mask matches */
};
static struct regbit msr_bits [ ] = {
2011-11-24 23:35:57 +04:00
# if defined(CONFIG_PPC64) && !defined(CONFIG_BOOKE)
{ MSR_SF , " SF " } ,
{ MSR_HV , " HV " } ,
# endif
{ MSR_VEC , " VEC " } ,
{ MSR_VSX , " VSX " } ,
# ifdef CONFIG_BOOKE
{ MSR_CE , " CE " } ,
# endif
2005-10-10 16:29:05 +04:00
{ MSR_EE , " EE " } ,
{ MSR_PR , " PR " } ,
{ MSR_FP , " FP " } ,
{ MSR_ME , " ME " } ,
2011-11-24 23:35:57 +04:00
# ifdef CONFIG_BOOKE
2008-11-19 07:39:53 +03:00
{ MSR_DE , " DE " } ,
2011-11-24 23:35:57 +04:00
# else
{ MSR_SE , " SE " } ,
{ MSR_BE , " BE " } ,
# endif
2005-10-10 16:29:05 +04:00
{ MSR_IR , " IR " } ,
{ MSR_DR , " DR " } ,
2011-11-24 23:35:57 +04:00
{ MSR_PMM , " PMM " } ,
# ifndef CONFIG_BOOKE
{ MSR_RI , " RI " } ,
{ MSR_LE , " LE " } ,
# endif
2005-10-10 16:29:05 +04:00
{ 0 , NULL }
} ;
powerpc: Print MSR TM bits in oops messages
Print MSR TM bits in oops messages. This appends them to the end
like this:
MSR: 8000000502823031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[TE]>
You get the TM[] only if at least one TM MSR bit is set. Inside the
TM[], E means Enabled (bit 32), S means Suspended (bit 33), and T
means Transactional (bit 34)
If no bits are set, you get no TM[] output.
Include rework of printbits() to handle this case.
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2015-11-20 07:15:32 +03:00
static void print_bits ( unsigned long val , struct regbit * bits , const char * sep )
2005-10-10 16:29:05 +04:00
{
powerpc: Print MSR TM bits in oops messages
Print MSR TM bits in oops messages. This appends them to the end
like this:
MSR: 8000000502823031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[TE]>
You get the TM[] only if at least one TM MSR bit is set. Inside the
TM[], E means Enabled (bit 32), S means Suspended (bit 33), and T
means Transactional (bit 34)
If no bits are set, you get no TM[] output.
Include rework of printbits() to handle this case.
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2015-11-20 07:15:32 +03:00
const char * s = " " ;
2005-10-10 16:29:05 +04:00
for ( ; bits - > bit ; + + bits )
if ( val & bits - > bit ) {
2016-11-02 14:20:47 +03:00
pr_cont ( " %s%s " , s , bits - > name ) ;
powerpc: Print MSR TM bits in oops messages
Print MSR TM bits in oops messages. This appends them to the end
like this:
MSR: 8000000502823031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[TE]>
You get the TM[] only if at least one TM MSR bit is set. Inside the
TM[], E means Enabled (bit 32), S means Suspended (bit 33), and T
means Transactional (bit 34)
If no bits are set, you get no TM[] output.
Include rework of printbits() to handle this case.
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2015-11-20 07:15:32 +03:00
s = sep ;
2005-10-10 16:29:05 +04:00
}
powerpc: Print MSR TM bits in oops messages
Print MSR TM bits in oops messages. This appends them to the end
like this:
MSR: 8000000502823031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[TE]>
You get the TM[] only if at least one TM MSR bit is set. Inside the
TM[], E means Enabled (bit 32), S means Suspended (bit 33), and T
means Transactional (bit 34)
If no bits are set, you get no TM[] output.
Include rework of printbits() to handle this case.
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2015-11-20 07:15:32 +03:00
}
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* Transactional-memory MSR flags, printed as single letters inside TM[]. */
static struct regbit msr_tm_bits[] = {
	{MSR_TS_T,	"T"},
	{MSR_TS_S,	"S"},
	{MSR_TM,	"E"},
	{0,		NULL}
};

/*
 * print_tm_bits() - append ",TM[...]" describing the TM bits of @val.
 * @val: MSR value to decode
 */
static void print_tm_bits(unsigned long val)
{
/*
 * This only prints something if at least one of the TM bits is set.
 * Inside the TM[], the output means:
 *   E: Enabled		(bit 32)
 *   S: Suspended	(bit 33)
 *   T: Transactional	(bit 34)
 */
	if (!(val & (MSR_TM | MSR_TS_S | MSR_TS_T)))
		return;

	pr_cont(",TM[");
	print_bits(val, msr_tm_bits, "");
	pr_cont("]");
}
#else
/* Transactional memory is not built in: nothing to print. */
static void print_tm_bits(unsigned long val) {}
#endif
static void print_msr_bits ( unsigned long val )
{
2016-11-02 14:20:47 +03:00
pr_cont ( " < " ) ;
powerpc: Print MSR TM bits in oops messages
Print MSR TM bits in oops messages. This appends them to the end
like this:
MSR: 8000000502823031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[TE]>
You get the TM[] only if at least one TM MSR bit is set. Inside the
TM[], E means Enabled (bit 32), S means Suspended (bit 33), and T
means Transactional (bit 34)
If no bits are set, you get no TM[] output.
Include rework of printbits() to handle this case.
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2015-11-20 07:15:32 +03:00
print_bits ( val , msr_bits , " , " ) ;
print_tm_bits ( val ) ;
2016-11-02 14:20:47 +03:00
pr_cont ( " > " ) ;
2005-10-10 16:29:05 +04:00
}
/*
 * REG is the printk format for one register value: 16 hex digits when
 * CONFIG_PPC64 is set, 8 otherwise.  REGS_PER_LINE presumably sets how
 * many registers go on one output line; its use is not visible here.
 */
# ifdef CONFIG_PPC64
2007-03-21 04:38:19 +03:00
# define REG "%016lx"
2005-10-10 16:29:05 +04:00
# define REGS_PER_LINE 4
# else
2007-03-21 04:38:19 +03:00
# define REG "%08lx"
2005-10-10 16:29:05 +04:00
# define REGS_PER_LINE 8
# endif
powerpc: show registers when unwinding interrupt frames
It's often useful to know the register state for interrupts in
the stack frame. In the below example (with this patch applied),
the important information is the state of the page fault.
A blatant case like this probably rather should have the page
fault regs passed down to the warning, but quite often there are
less obvious cases where an interrupt shows up that might give
some more clues.
The downside is longer and more complex bug output.
Bug: Write fault blocked by AMR!
WARNING: CPU: 0 PID: 72 at arch/powerpc/include/asm/book3s/64/kup-radix.h:164 __do_page_fault+0x880/0xa90
Modules linked in:
CPU: 0 PID: 72 Comm: systemd-gpt-aut Not tainted
NIP: c00000000006e2f0 LR: c00000000006e2ec CTR: 0000000000000000
REGS: c00000000a4f3420 TRAP: 0700
MSR: 8000000000021033 <SF,ME,IR,DR,RI,LE> CR: 28002840 XER: 20040000
CFAR: c000000000128be0 IRQMASK: 3
GPR00: c00000000006e2ec c00000000a4f36c0 c0000000014f0700 0000000000000020
GPR04: 0000000000000001 c000000001290f50 0000000000000001 c000000001290f80
GPR08: c000000001612b08 0000000000000000 0000000000000000 00000000ffffe0f7
GPR12: 0000000048002840 c0000000016e0000 c00c000000021c80 c000000000fd6f60
GPR16: 0000000000000000 c00000000a104698 0000000000000003 c0000000087f0000
GPR20: 0000000000000100 c0000000070330b8 0000000000000000 0000000000000004
GPR24: 0000000002000000 0000000000000300 0000000002000000 c00000000a5b0c00
GPR28: 0000000000000000 000000000a000000 00007fffb2a90038 c00000000a4f3820
NIP [c00000000006e2f0] __do_page_fault+0x880/0xa90
LR [c00000000006e2ec] __do_page_fault+0x87c/0xa90
Call Trace:
[c00000000a4f36c0] [c00000000006e2ec] __do_page_fault+0x87c/0xa90 (unreliable)
[c00000000a4f3780] [c000000000e1c034] do_page_fault+0x34/0x90
[c00000000a4f37b0] [c000000000008908] data_access_common_virt+0x158/0x1b0
--- interrupt: 300 at __copy_tofrom_user_base+0x9c/0x5a4
NIP: c00000000009b028 LR: c000000000802978 CTR: 0000000000000800
REGS: c00000000a4f3820 TRAP: 0300
MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 24004840 XER: 00000000
CFAR: c00000000009aff4 DAR: 00007fffb2a90038 DSISR: 0a000000 IRQMASK: 0
GPR00: 0000000000000000 c00000000a4f3ac0 c0000000014f0700 00007fffb2a90028
GPR04: c000000008720010 0000000000010000 0000000000000000 0000000000000000
GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000001
GPR12: 0000000000004000 c0000000016e0000 c00c000000021c80 c000000000fd6f60
GPR16: 0000000000000000 c00000000a104698 0000000000000003 c0000000087f0000
GPR20: 0000000000000100 c0000000070330b8 0000000000000000 0000000000000004
GPR24: c00000000a4f3c80 c000000008720000 0000000000010000 0000000000000000
GPR28: 0000000000010000 0000000008720000 0000000000010000 c000000001515b98
NIP [c00000000009b028] __copy_tofrom_user_base+0x9c/0x5a4
LR [c000000000802978] copyout+0x68/0xc0
--- interrupt: 300
[c00000000a4f3af0] [c0000000008074b8] copy_page_to_iter+0x188/0x540
[c00000000a4f3b50] [c00000000035c678] generic_file_buffered_read+0x358/0xd80
[c00000000a4f3c40] [c0000000004c1e90] blkdev_read_iter+0x50/0x80
[c00000000a4f3c60] [c00000000045733c] new_sync_read+0x12c/0x1c0
[c00000000a4f3d00] [c00000000045a1f0] vfs_read+0x1d0/0x240
[c00000000a4f3d50] [c00000000045a7f4] ksys_read+0x84/0x140
[c00000000a4f3da0] [c000000000033a60] system_call_exception+0x100/0x280
[c00000000a4f3e10] [c00000000000c508] system_call_common+0xf8/0x2f8
Instruction dump:
eae10078 3be0000b 4bfff890 60420000 792917e1 4182ff18 3c82ffab 3884a5e0
3c62ffab 3863a6e8 480ba891 60000000 <0fe00000> 3be0000b 4bfff860 e93c0938
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20201107023305.2384874-1-npiggin@gmail.com
2020-11-07 05:33:05 +03:00
/*
 * Dump the register state held in @regs to the kernel log.
 *
 * Output order: NIP/LR/CTR, the regs pointer with its trap value, the
 * taint string and the kernel release, MSR (raw value, then
 * print_msr_bits()), CR/XER, optional CFAR and fault registers, IRQMASK
 * (regs->softe, CONFIG_PPC64 only), the 32 GPRs and, when CONFIG_KALLSYMS
 * is enabled, NIP and LR again with %pS symbol resolution.
 */
static void __show_regs ( struct pt_regs * regs )
2005-09-26 10:04:21 +04:00
{
int i , trap ;
powerpc/oops: Line up NIP & MSR with other rows
This is purely cosmetic, but does look nicer IMHO:
Before:
task: c000000001453400 task.stack: c000000001c6c000
NIP: c000000000a0fbfc LR: c000000000a0fbf4 CTR: c000000000ba6220
REGS: c0000001fffef820 TRAP: 0300 Not tainted (4.13.0-rc6-gcc-6.3.1-00234-g423af27f7d81)
MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 88088242 XER: 00000000
CFAR: c0000000000b3488 DAR: 0000000000000000 DSISR: 42000000 SOFTE: 0
After:
task: c000000001453400 task.stack: c000000001c6c000
NIP: c000000000a0fbfc LR: c000000000a0fbf4 CTR: c000000000ba6220
REGS: c0000001fffef820 TRAP: 0300 Not tainted (4.13.0-rc6-gcc-6.3.1-00234-g423af27f7d81-dirty)
MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 88088242 XER: 00000000
CFAR: c0000000000b34a4 DAR: 0000000000000000 DSISR: 42000000 SOFTE: 0
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-08-23 16:56:24 +03:00
/* Row 1: NIP, LR and CTR. */
printk ( " NIP: " REG " LR: " REG " CTR: " REG " \n " ,
2005-10-10 16:29:05 +04:00
regs - > nip , regs - > link , regs - > ctr ) ;
2017-12-18 08:33:36 +03:00
/* Row 2: regs pointer, trap value, taint string and kernel release. */
printk ( " REGS: %px TRAP: %04lx %s (%s) \n " ,
2006-10-02 13:18:13 +04:00
regs , regs - > trap , print_tainted ( ) , init_utsname ( ) - > release ) ;
powerpc/oops: Line up NIP & MSR with other rows
This is purely cosmetic, but does look nicer IMHO:
Before:
task: c000000001453400 task.stack: c000000001c6c000
NIP: c000000000a0fbfc LR: c000000000a0fbf4 CTR: c000000000ba6220
REGS: c0000001fffef820 TRAP: 0300 Not tainted (4.13.0-rc6-gcc-6.3.1-00234-g423af27f7d81)
MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 88088242 XER: 00000000
CFAR: c0000000000b3488 DAR: 0000000000000000 DSISR: 42000000 SOFTE: 0
After:
task: c000000001453400 task.stack: c000000001c6c000
NIP: c000000000a0fbfc LR: c000000000a0fbf4 CTR: c000000000ba6220
REGS: c0000001fffef820 TRAP: 0300 Not tainted (4.13.0-rc6-gcc-6.3.1-00234-g423af27f7d81-dirty)
MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 88088242 XER: 00000000
CFAR: c0000000000b34a4 DAR: 0000000000000000 DSISR: 42000000 SOFTE: 0
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-08-23 16:56:24 +03:00
/* Row 3: raw MSR first; print_msr_bits() and CR/XER continue the same row. */
printk ( " MSR: " REG " " , regs - > msr ) ;
powerpc: Print MSR TM bits in oops messages
Print MSR TM bits in oops messages. This appends them to the end
like this:
MSR: 8000000502823031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[TE]>
You get the TM[] only if at least one TM MSR bit is set. Inside the
TM[], E means Enabled (bit 32), S means Suspended (bit 33), and T
means Transactional (bit 34)
If no bits are set, you get no TM[] output.
Include rework of printbits() to handle this case.
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2015-11-20 07:15:32 +03:00
print_msr_bits ( regs - > msr ) ;
2017-08-23 16:56:23 +03:00
pr_cont ( " CR: %08lx XER: %08lx \n " , regs - > ccr , regs - > xer ) ;
2005-09-26 10:04:21 +04:00
/* The next row is assembled from optional pieces with pr_cont(). */
trap = TRAP ( regs ) ;
2020-05-07 15:13:31 +03:00
/* CFAR is read from the orig_gpr3 field; skipped for syscalls and for CPUs without CPU_FTR_CFAR. */
if ( ! trap_is_syscall ( regs ) & & cpu_has_feature ( CPU_FTR_CFAR ) )
2016-11-03 12:45:26 +03:00
pr_cont ( " CFAR: " REG " " , regs - > orig_gpr3 ) ;
2021-04-14 14:00:33 +03:00
/* Fault registers are printed only for these three interrupt types: DEAR/ESR on 4xx/BookE, DAR/DSISR otherwise. */
if ( trap = = INTERRUPT_MACHINE_CHECK | |
trap = = INTERRUPT_DATA_STORAGE | |
trap = = INTERRUPT_ALIGNMENT ) {
2020-08-17 08:46:43 +03:00
if ( IS_ENABLED ( CONFIG_4xx ) | | IS_ENABLED ( CONFIG_BOOKE ) )
2021-08-07 04:02:38 +03:00
pr_cont ( " DEAR: " REG " ESR: " REG " " , regs - > dear , regs - > esr ) ;
2020-08-17 08:46:43 +03:00
else
pr_cont ( " DAR: " REG " DSISR: %08lx " , regs - > dar , regs - > dsisr ) ;
}
powerpc: Remove a few lines of oops output
We waste quite a few lines in our oops output:
...
MSR: 8000000000009032 <SF,EE,ME,IR,DR,RI> CR: 28044024 XER: 00000000
SOFTE: 0
CFAR: 0000000000009088
DAR: 000000000000001c, DSISR: 40000000
GPR00: c0000000000c74f0 c00000037cc1b010 c000000000d2bb30 0000000000000000
...
We can do a better job here and remove 3 lines:
MSR: 8000000000009032 <SF,EE,ME,IR,DR,RI> CR: 28044024 XER: 00000000
CFAR: 0000000000009088 DAR: 0000000000000010, DSISR: 40000000 SOFTE: 1
GPR00: c0000000000e3d10 c00000037cc2fda0 c000000000d2c3a8 0000000000000001
Also move PACATMSCRATCH up, it doesn't really belong in the stack
trace section.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2013-11-15 08:48:38 +04:00
# ifdef CONFIG_PPC64
2018-05-10 04:04:24 +03:00
pr_cont ( " IRQMASK: %lx " , regs - > softe ) ;
powerpc: Remove a few lines of oops output
We waste quite a few lines in our oops output:
...
MSR: 8000000000009032 <SF,EE,ME,IR,DR,RI> CR: 28044024 XER: 00000000
SOFTE: 0
CFAR: 0000000000009088
DAR: 000000000000001c, DSISR: 40000000
GPR00: c0000000000c74f0 c00000037cc1b010 c000000000d2bb30 0000000000000000
...
We can do a better job here and remove 3 lines:
MSR: 8000000000009032 <SF,EE,ME,IR,DR,RI> CR: 28044024 XER: 00000000
CFAR: 0000000000009088 DAR: 0000000000000010, DSISR: 40000000 SOFTE: 1
GPR00: c0000000000e3d10 c00000037cc2fda0 c000000000d2c3a8 0000000000000001
Also move PACATMSCRATCH up, it doesn't really belong in the stack
trace section.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2013-11-15 08:48:38 +04:00
# endif
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2013-11-18 06:19:17 +04:00
/* When MSR_TM_ACTIVE() holds for the saved MSR, also print the tm_scratch value from get_paca(). */
if ( MSR_TM_ACTIVE ( regs - > msr ) )
2016-11-03 12:45:26 +03:00
pr_cont ( " \n PACATMSCRATCH: %016llx " , get_paca ( ) - > tm_scratch ) ;
2007-07-26 09:46:15 +04:00
# endif
2005-09-26 10:04:21 +04:00
/* All 32 GPRs; a new labelled line starts every REGS_PER_LINE registers. */
for ( i = 0 ; i < 32 ; i + + ) {
2005-10-10 16:29:05 +04:00
if ( ( i % REGS_PER_LINE ) = = 0 )
2016-11-03 12:45:26 +03:00
pr_cont ( " \n GPR%02d: " , i ) ;
pr_cont ( REG " " , regs - > gpr [ i ] ) ;
2005-09-26 10:04:21 +04:00
}
2016-11-03 12:45:26 +03:00
pr_cont ( " \n " ) ;
2005-09-26 10:04:21 +04:00
/*
* Lookup NIP late so we have the best chance of getting the
* above info out without failing
*/
2020-08-17 08:46:44 +03:00
if ( IS_ENABLED ( CONFIG_KALLSYMS ) ) {
printk ( " NIP [ " REG " ] %pS \n " , regs - > nip , ( void * ) regs - > nip ) ;
printk ( " LR [ " REG " ] %pS \n " , regs - > link , ( void * ) regs - > link ) ;
}
powerpc: show registers when unwinding interrupt frames
It's often useful to know the register state for interrupts in
the stack frame. In the below example (with this patch applied),
the important information is the state of the page fault.
A blatant case like this probably rather should have the page
fault regs passed down to the warning, but quite often there are
less obvious cases where an interrupt shows up that might give
some more clues.
The downside is longer and more complex bug output.
Bug: Write fault blocked by AMR!
WARNING: CPU: 0 PID: 72 at arch/powerpc/include/asm/book3s/64/kup-radix.h:164 __do_page_fault+0x880/0xa90
Modules linked in:
CPU: 0 PID: 72 Comm: systemd-gpt-aut Not tainted
NIP: c00000000006e2f0 LR: c00000000006e2ec CTR: 0000000000000000
REGS: c00000000a4f3420 TRAP: 0700
MSR: 8000000000021033 <SF,ME,IR,DR,RI,LE> CR: 28002840 XER: 20040000
CFAR: c000000000128be0 IRQMASK: 3
GPR00: c00000000006e2ec c00000000a4f36c0 c0000000014f0700 0000000000000020
GPR04: 0000000000000001 c000000001290f50 0000000000000001 c000000001290f80
GPR08: c000000001612b08 0000000000000000 0000000000000000 00000000ffffe0f7
GPR12: 0000000048002840 c0000000016e0000 c00c000000021c80 c000000000fd6f60
GPR16: 0000000000000000 c00000000a104698 0000000000000003 c0000000087f0000
GPR20: 0000000000000100 c0000000070330b8 0000000000000000 0000000000000004
GPR24: 0000000002000000 0000000000000300 0000000002000000 c00000000a5b0c00
GPR28: 0000000000000000 000000000a000000 00007fffb2a90038 c00000000a4f3820
NIP [c00000000006e2f0] __do_page_fault+0x880/0xa90
LR [c00000000006e2ec] __do_page_fault+0x87c/0xa90
Call Trace:
[c00000000a4f36c0] [c00000000006e2ec] __do_page_fault+0x87c/0xa90 (unreliable)
[c00000000a4f3780] [c000000000e1c034] do_page_fault+0x34/0x90
[c00000000a4f37b0] [c000000000008908] data_access_common_virt+0x158/0x1b0
--- interrupt: 300 at __copy_tofrom_user_base+0x9c/0x5a4
NIP: c00000000009b028 LR: c000000000802978 CTR: 0000000000000800
REGS: c00000000a4f3820 TRAP: 0300
MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 24004840 XER: 00000000
CFAR: c00000000009aff4 DAR: 00007fffb2a90038 DSISR: 0a000000 IRQMASK: 0
GPR00: 0000000000000000 c00000000a4f3ac0 c0000000014f0700 00007fffb2a90028
GPR04: c000000008720010 0000000000010000 0000000000000000 0000000000000000
GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000001
GPR12: 0000000000004000 c0000000016e0000 c00c000000021c80 c000000000fd6f60
GPR16: 0000000000000000 c00000000a104698 0000000000000003 c0000000087f0000
GPR20: 0000000000000100 c0000000070330b8 0000000000000000 0000000000000004
GPR24: c00000000a4f3c80 c000000008720000 0000000000010000 0000000000000000
GPR28: 0000000000010000 0000000008720000 0000000000010000 c000000001515b98
NIP [c00000000009b028] __copy_tofrom_user_base+0x9c/0x5a4
LR [c000000000802978] copyout+0x68/0xc0
--- interrupt: 300
[c00000000a4f3af0] [c0000000008074b8] copy_page_to_iter+0x188/0x540
[c00000000a4f3b50] [c00000000035c678] generic_file_buffered_read+0x358/0xd80
[c00000000a4f3c40] [c0000000004c1e90] blkdev_read_iter+0x50/0x80
[c00000000a4f3c60] [c00000000045733c] new_sync_read+0x12c/0x1c0
[c00000000a4f3d00] [c00000000045a1f0] vfs_read+0x1d0/0x240
[c00000000a4f3d50] [c00000000045a7f4] ksys_read+0x84/0x140
[c00000000a4f3da0] [c000000000033a60] system_call_exception+0x100/0x280
[c00000000a4f3e10] [c00000000000c508] system_call_common+0xf8/0x2f8
Instruction dump:
eae10078 3be0000b 4bfff890 60420000 792917e1 4182ff18 3c82ffab 3884a5e0
3c62ffab 3863a6e8 480ba891 60000000 <0fe00000> 3be0000b 4bfff860 e93c0938
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20201107023305.2384874-1-npiggin@gmail.com
2020-11-07 05:33:05 +03:00
}
void show_regs ( struct pt_regs * regs )
{
show_regs_print_info ( KERN_DEFAULT ) ;
__show_regs ( regs ) ;
2020-06-09 07:32:29 +03:00
show_stack ( current , ( unsigned long * ) regs - > gpr [ 1 ] , KERN_DEFAULT ) ;
2005-10-10 16:29:05 +04:00
if ( ! user_mode ( regs ) )
show_instructions ( regs ) ;
2005-09-26 10:04:21 +04:00
}
void flush_thread ( void )
{
2011-02-10 07:44:35 +03:00
# ifdef CONFIG_HAVE_HW_BREAKPOINT
2010-06-15 10:05:19 +04:00
flush_ptrace_hw_breakpoint ( current ) ;
2011-02-10 07:44:35 +03:00
# else /* CONFIG_HAVE_HW_BREAKPOINT */
2010-02-08 14:51:18 +03:00
set_debug_reg_defaults ( & current - > thread ) ;
2011-02-10 07:44:35 +03:00
# endif /* CONFIG_HAVE_HW_BREAKPOINT */
2005-09-26 10:04:21 +04:00
}
2018-09-14 18:30:55 +03:00
void arch_setup_new_exec ( void )
{
2020-11-27 07:44:11 +03:00
# ifdef CONFIG_PPC_BOOK3S_64
if ( ! radix_enabled ( ) )
hash__setup_new_exec ( ) ;
2018-09-14 18:30:55 +03:00
# endif
2020-11-27 07:44:11 +03:00
/*
* If we exec out of a kernel thread then thread . regs will not be
* set . Do it now .
*/
if ( ! current - > thread . regs ) {
struct pt_regs * regs = task_stack_page ( current ) + THREAD_SIZE ;
current - > thread . regs = regs - 1 ;
}
2020-11-27 07:44:14 +03:00
# ifdef CONFIG_PPC_MEM_KEYS
current - > thread . regs - > amr = default_amr ;
current - > thread . regs - > iamr = default_iamr ;
# endif
2020-11-27 07:44:11 +03:00
}
2018-09-14 18:30:55 +03:00
2017-11-08 05:23:53 +03:00
# ifdef CONFIG_PPC64
2022-11-01 04:54:52 +03:00
/*
2018-05-11 09:12:59 +03:00
* Assign a TIDR ( thread ID ) for task @ t and set it in the thread
* structure . For now , we only support setting TIDR for ' current ' task .
2017-11-08 05:23:53 +03:00
*
2018-05-11 09:12:59 +03:00
* Since the TID value is a truncated form of it PID , it is possible
* ( but unlikely ) for 2 threads to have the same TID . In the unlikely event
* that 2 threads share the same TID and are waiting , one of the following
* cases will happen :
2017-11-08 05:23:53 +03:00
*
2018-05-11 09:12:59 +03:00
* 1. The correct thread is running , the wrong thread is not
* In this situation , the correct thread is woken and proceeds to pass it ' s
* condition check .
2017-11-08 05:23:53 +03:00
*
2018-05-11 09:12:59 +03:00
* 2. Neither threads are running
* In this situation , neither thread will be woken . When scheduled , the waiting
* threads will execute either a wait , which will return immediately , followed
* by a condition check , which will pass for the correct thread and fail
* for the wrong thread , or they will execute the condition check immediately .
2017-11-08 05:23:53 +03:00
*
2018-05-11 09:12:59 +03:00
* 3. The wrong thread is running , the correct thread is not
* The wrong thread will be woken , but will fail it ' s condition check and
* re - execute wait . The correct thread , when scheduled , will execute either
* it ' s condition check ( which will pass ) , or wait , which returns immediately
* when called the first time after the thread is scheduled , followed by it ' s
* condition check ( which will pass ) .
2017-11-08 05:23:53 +03:00
*
2018-05-11 09:12:59 +03:00
* 4. Both threads are running
* Both threads will be woken . The wrong thread will fail it ' s condition check
* and execute another wait , while the correct thread will pass it ' s condition
* check .
*
* @ t : the task to set the thread ID for
2017-11-08 05:23:53 +03:00
*/
int set_thread_tidr ( struct task_struct * t )
{
2018-05-11 09:12:58 +03:00
if ( ! cpu_has_feature ( CPU_FTR_P9_TIDR ) )
2017-11-08 05:23:53 +03:00
return - EINVAL ;
if ( t ! = current )
return - EINVAL ;
2017-11-24 11:33:38 +03:00
if ( t - > thread . tidr )
return 0 ;
2018-05-11 09:12:59 +03:00
t - > thread . tidr = ( u16 ) task_pid_nr ( t ) ;
2017-11-08 05:23:53 +03:00
mtspr ( SPRN_TIDR , t - > thread . tidr ) ;
return 0 ;
}
2018-01-11 11:55:25 +03:00
EXPORT_SYMBOL_GPL ( set_thread_tidr ) ;
2017-11-08 05:23:53 +03:00
# endif /* CONFIG_PPC64 */
2005-09-26 10:04:21 +04:00
/*
2012-05-17 02:03:51 +04:00
* this gets called so that we can store coprocessor state into memory and
* copy the current task into the new thread .
2005-09-26 10:04:21 +04:00
*/
2012-05-17 02:03:51 +04:00
int arch_dup_task_struct ( struct task_struct * dst , struct task_struct * src )
2005-09-26 10:04:21 +04:00
{
2015-10-29 03:44:09 +03:00
flush_all_to_thread ( src ) ;
2014-03-03 07:21:40 +04:00
/*
* Flush TM state out so we can copy it . __switch_to_tm ( ) does this
* flush but it removes the checkpointed state from the current CPU and
* transitions the CPU out of TM mode . Hence we need to call
* tm_recheckpoint_new_task ( ) ( on the same task ) to restore the
* checkpointed state back and the TM mode .
2016-09-14 11:02:16 +03:00
*
* Can ' t pass dst because it isn ' t ready . Doesn ' t matter , passing
* dst is only important for __switch_to ( )
2014-03-03 07:21:40 +04:00
*/
2016-09-23 09:18:24 +03:00
__switch_to_tm ( src , src ) ;
2013-06-28 12:15:16 +04:00
2012-05-17 02:03:51 +04:00
* dst = * src ;
2013-06-28 12:15:16 +04:00
clear_task_ebb ( dst ) ;
2012-05-17 02:03:51 +04:00
return 0 ;
2005-09-26 10:04:21 +04:00
}
2014-07-10 06:29:21 +04:00
/*
 * Compute and store p->thread.ksp_vsid for the kernel stack pointer @sp.
 *
 * Compiled to an empty function without CONFIG_PPC_64S_HASH_MMU, and
 * returns without storing anything when radix is enabled.
 */
static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
{
#ifdef CONFIG_PPC_64S_HASH_MMU
	unsigned long vsid;

	if (radix_enabled())
		return;

	/* Segment size, and with it the shift, depends on 1T segment support. */
	if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) {
		vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T);
		vsid <<= SLB_VSID_SHIFT_1T;
	} else {
		vsid = get_kernel_vsid(sp, MMU_SEGSIZE_256M);
		vsid <<= SLB_VSID_SHIFT;
	}

	/* Add the kernel flags and the sllp bits of the linear page size. */
	p->thread.ksp_vsid = vsid | SLB_VSID_KERNEL |
			     mmu_psize_defs[mmu_linear_psize].sllp;
#endif
}
2005-09-26 10:04:21 +04:00
/*
* Copy a thread . .
*/
2011-03-02 18:18:48 +03:00
2015-03-13 21:14:46 +03:00
/*
* Copy architecture - specific thread state
*/
2022-04-09 02:07:50 +03:00
int copy_thread ( struct task_struct * p , const struct kernel_clone_args * args )
2005-09-26 10:04:21 +04:00
{
2022-04-09 02:07:50 +03:00
unsigned long clone_flags = args - > flags ;
unsigned long usp = args - > stack ;
unsigned long tls = args - > tls ;
2005-09-26 10:04:21 +04:00
struct pt_regs * childregs , * kregs ;
extern void ret_from_fork ( void ) ;
2020-06-11 11:12:03 +03:00
extern void ret_from_fork_scv ( void ) ;
2012-09-13 02:32:42 +04:00
extern void ret_from_kernel_thread ( void ) ;
void ( * f ) ( void ) ;
2006-01-12 12:06:02 +03:00
unsigned long sp = ( unsigned long ) task_stack_page ( p ) + THREAD_SIZE ;
2016-03-24 14:04:04 +03:00
struct thread_info * ti = task_thread_info ( p ) ;
2020-05-14 14:17:35 +03:00
# ifdef CONFIG_HAVE_HW_BREAKPOINT
int i ;
# endif
2016-03-24 14:04:04 +03:00
2019-01-31 13:08:58 +03:00
klp_init_thread_info ( p ) ;
2005-09-26 10:04:21 +04:00
2022-11-27 15:49:29 +03:00
/* Create initial stack frame. */
2022-11-27 15:49:35 +03:00
sp - = STACK_USER_INT_FRAME_SIZE ;
2022-11-27 15:49:37 +03:00
* ( unsigned long * ) ( sp + STACK_INT_FRAME_MARKER ) = STACK_FRAME_REGS_MARKER ;
2022-11-27 15:49:29 +03:00
2005-09-26 10:04:21 +04:00
/* Copy registers */
2022-11-27 15:49:35 +03:00
childregs = ( struct pt_regs * ) ( sp + STACK_INT_FRAME_REGS ) ;
2022-04-12 18:18:48 +03:00
if ( unlikely ( args - > fn ) ) {
2015-03-13 21:14:46 +03:00
/* kernel thread */
2022-11-27 15:49:37 +03:00
( ( unsigned long * ) sp ) [ 0 ] = 0 ;
2012-09-13 02:32:42 +04:00
memset ( childregs , 0 , sizeof ( struct pt_regs ) ) ;
2022-11-27 15:49:35 +03:00
childregs - > gpr [ 1 ] = sp + STACK_USER_INT_FRAME_SIZE ;
2014-02-04 09:08:51 +04:00
/* function */
2022-04-12 18:18:48 +03:00
if ( args - > fn )
childregs - > gpr [ 14 ] = ppc_function_entry ( ( void * ) args - > fn ) ;
2012-09-13 02:32:42 +04:00
# ifdef CONFIG_PPC64
2006-01-12 12:06:01 +03:00
clear_tsk_thread_flag ( p , TIF_32BIT ) ;
2017-12-20 06:55:42 +03:00
childregs - > softe = IRQS_ENABLED ;
2005-10-10 16:29:05 +04:00
# endif
2022-04-12 18:18:48 +03:00
childregs - > gpr [ 15 ] = ( unsigned long ) args - > fn_arg ;
2005-09-26 10:04:21 +04:00
p - > thread . regs = NULL ; /* no user register state */
2012-10-11 16:41:43 +04:00
ti - > flags | = _TIF_RESTOREALL ;
2012-09-13 02:32:42 +04:00
f = ret_from_kernel_thread ;
2005-09-26 10:04:21 +04:00
} else {
2015-03-13 21:14:46 +03:00
/* user thread */
2012-10-23 06:51:14 +04:00
struct pt_regs * regs = current_pt_regs ( ) ;
2012-09-13 02:32:42 +04:00
* childregs = * regs ;
2012-10-22 06:28:43 +04:00
if ( usp )
childregs - > gpr [ 1 ] = usp ;
2022-11-27 15:49:37 +03:00
( ( unsigned long * ) sp ) [ 0 ] = childregs - > gpr [ 1 ] ;
2005-09-26 10:04:21 +04:00
p - > thread . regs = childregs ;
2020-06-11 11:12:03 +03:00
/* 64s sets this in ret_from_fork */
if ( ! IS_ENABLED ( CONFIG_PPC_BOOK3S_64 ) )
childregs - > gpr [ 3 ] = 0 ; /* Result from fork() */
2005-10-10 16:29:05 +04:00
if ( clone_flags & CLONE_SETTLS ) {
2010-07-30 02:04:39 +04:00
if ( ! is_32bit_task ( ) )
2019-08-27 06:30:06 +03:00
childregs - > gpr [ 13 ] = tls ;
2005-10-10 16:29:05 +04:00
else
2019-08-27 06:30:06 +03:00
childregs - > gpr [ 2 ] = tls ;
2005-10-10 16:29:05 +04:00
}
2012-09-13 02:32:42 +04:00
2020-06-11 11:12:03 +03:00
if ( trap_is_scv ( regs ) )
f = ret_from_fork_scv ;
else
f = ret_from_fork ;
2005-09-26 10:04:21 +04:00
}
2016-02-29 09:53:46 +03:00
childregs - > msr & = ~ ( MSR_FP | MSR_VEC | MSR_VSX ) ;
2005-09-26 10:04:21 +04:00
/*
* The way this works is that at some point in the future
* some task will call _switch to switch to the new task .
* That will pop off the stack frame created below and start
* the new task running at ret_from_fork . The new task will
* do some house keeping and then return from the fork or clone
* system call , using the stack frame created above .
*/
2022-11-27 15:49:38 +03:00
( ( unsigned long * ) sp ) [ STACK_FRAME_LR_SAVE ] = ( unsigned long ) f ;
2022-11-27 15:49:36 +03:00
sp - = STACK_SWITCH_FRAME_SIZE ;
2022-11-27 15:49:38 +03:00
( ( unsigned long * ) sp ) [ 0 ] = sp + STACK_SWITCH_FRAME_SIZE ;
2022-11-27 15:49:36 +03:00
kregs = ( struct pt_regs * ) ( sp + STACK_SWITCH_FRAME_REGS ) ;
2005-09-26 10:04:21 +04:00
p - > thread . ksp = sp ;
2022-11-27 15:49:36 +03:00
2013-04-21 10:47:59 +04:00
# ifdef CONFIG_HAVE_HW_BREAKPOINT
2020-05-14 14:17:35 +03:00
for ( i = 0 ; i < nr_wp_slots ( ) ; i + + )
p - > thread . ptrace_bps [ i ] = NULL ;
2013-04-21 10:47:59 +04:00
# endif
2020-08-18 20:19:17 +03:00
# ifdef CONFIG_PPC_FPU_REGS
2013-09-10 14:21:10 +04:00
p - > thread . fp_save_area = NULL ;
2020-08-18 20:19:17 +03:00
# endif
2013-09-10 14:21:10 +04:00
# ifdef CONFIG_ALTIVEC
p - > thread . vr_save_area = NULL ;
# endif
powerpc/32s: Rework Kernel Userspace Access Protection
On book3s/32, KUAP is provided by toggling Ks bit in segment registers.
One segment register addresses 256M of virtual memory.
At the time being, KUAP implements a complex logic to apply the
unlock/lock on the exact number of segments covering the user range
to access, with saving the boundaries of the range of segments in
a member of thread struct.
But most if not all user accesses are within a single segment.
Rework KUAP with a different approach:
- Open only one segment, the one corresponding to the starting
address of the range to be accessed.
- If a second segment is involved, it will generate a page fault. The
segment will then be open by the page fault handler.
The kuap member of thread struct will now contain:
- The start address of the current on going user access, that will be
used to know which segment to lock at the end of the user access.
- ~0 when no user access is open
- ~1 when additionnal segments are opened by a page fault.
Then, at lock time
- When only one segment is open, close it.
- When several segments are open, close all user segments.
Almost 100% of the time, only one segment will be involved.
In interrupts, inline the function that unlock/lock all segments,
because not inlining them implies a lot of register save/restore.
With the patch, writing value 128 in userspace in perf_copy_attr() is
done with 16 instructions:
3890: 93 82 04 dc stw r28,1244(r2)
3894: 7d 20 e5 26 mfsrin r9,r28
3898: 55 29 00 80 rlwinm r9,r9,0,2,0
389c: 7d 20 e1 e4 mtsrin r9,r28
38a0: 4c 00 01 2c isync
38a4: 39 20 00 80 li r9,128
38a8: 91 3c 00 00 stw r9,0(r28)
38ac: 81 42 04 dc lwz r10,1244(r2)
38b0: 39 00 ff ff li r8,-1
38b4: 91 02 04 dc stw r8,1244(r2)
38b8: 2c 0a ff fe cmpwi r10,-2
38bc: 41 82 00 88 beq 3944 <perf_copy_attr+0x36c>
38c0: 7d 20 55 26 mfsrin r9,r10
38c4: 65 29 40 00 oris r9,r9,16384
38c8: 7d 20 51 e4 mtsrin r9,r10
38cc: 4c 00 01 2c isync
...
3944: 48 00 00 01 bl 3944 <perf_copy_attr+0x36c>
3944: R_PPC_REL24 kuap_lock_all_ool
Before the patch it was 118 instructions. In reality only 42 are
executed in most cases, but GCC is not able to see that a properly
aligned user access cannot involve more than one segment.
5060: 39 1d 00 04 addi r8,r29,4
5064: 3d 20 b0 00 lis r9,-20480
5068: 7c 08 48 40 cmplw r8,r9
506c: 40 81 00 08 ble 5074 <perf_copy_attr+0x2cc>
5070: 3d 00 b0 00 lis r8,-20480
5074: 39 28 ff ff addi r9,r8,-1
5078: 57 aa 00 06 rlwinm r10,r29,0,0,3
507c: 55 29 27 3e rlwinm r9,r9,4,28,31
5080: 39 29 00 01 addi r9,r9,1
5084: 7d 29 53 78 or r9,r9,r10
5088: 91 22 04 dc stw r9,1244(r2)
508c: 7d 20 ed 26 mfsrin r9,r29
5090: 55 29 00 80 rlwinm r9,r9,0,2,0
5094: 7c 08 50 40 cmplw r8,r10
5098: 40 81 00 c0 ble 5158 <perf_copy_attr+0x3b0>
509c: 7d 46 50 f8 not r6,r10
50a0: 7c c6 42 14 add r6,r6,r8
50a4: 54 c6 27 be rlwinm r6,r6,4,30,31
50a8: 7d 20 51 e4 mtsrin r9,r10
50ac: 3c ea 10 00 addis r7,r10,4096
50b0: 39 29 01 11 addi r9,r9,273
50b4: 7f 88 38 40 cmplw cr7,r8,r7
50b8: 55 29 02 06 rlwinm r9,r9,0,8,3
50bc: 40 9d 00 9c ble cr7,5158 <perf_copy_attr+0x3b0>
50c0: 2f 86 00 00 cmpwi cr7,r6,0
50c4: 41 9e 00 4c beq cr7,5110 <perf_copy_attr+0x368>
50c8: 2f 86 00 01 cmpwi cr7,r6,1
50cc: 41 9e 00 2c beq cr7,50f8 <perf_copy_attr+0x350>
50d0: 2f 86 00 02 cmpwi cr7,r6,2
50d4: 41 9e 00 14 beq cr7,50e8 <perf_copy_attr+0x340>
50d8: 7d 20 39 e4 mtsrin r9,r7
50dc: 39 29 01 11 addi r9,r9,273
50e0: 3c e7 10 00 addis r7,r7,4096
50e4: 55 29 02 06 rlwinm r9,r9,0,8,3
50e8: 7d 20 39 e4 mtsrin r9,r7
50ec: 39 29 01 11 addi r9,r9,273
50f0: 3c e7 10 00 addis r7,r7,4096
50f4: 55 29 02 06 rlwinm r9,r9,0,8,3
50f8: 7d 20 39 e4 mtsrin r9,r7
50fc: 3c e7 10 00 addis r7,r7,4096
5100: 39 29 01 11 addi r9,r9,273
5104: 7f 88 38 40 cmplw cr7,r8,r7
5108: 55 29 02 06 rlwinm r9,r9,0,8,3
510c: 40 9d 00 4c ble cr7,5158 <perf_copy_attr+0x3b0>
5110: 7d 20 39 e4 mtsrin r9,r7
5114: 39 29 01 11 addi r9,r9,273
5118: 3c c7 10 00 addis r6,r7,4096
511c: 55 29 02 06 rlwinm r9,r9,0,8,3
5120: 7d 20 31 e4 mtsrin r9,r6
5124: 39 29 01 11 addi r9,r9,273
5128: 3c c6 10 00 addis r6,r6,4096
512c: 55 29 02 06 rlwinm r9,r9,0,8,3
5130: 7d 20 31 e4 mtsrin r9,r6
5134: 39 29 01 11 addi r9,r9,273
5138: 3c c7 30 00 addis r6,r7,12288
513c: 55 29 02 06 rlwinm r9,r9,0,8,3
5140: 7d 20 31 e4 mtsrin r9,r6
5144: 3c e7 40 00 addis r7,r7,16384
5148: 39 29 01 11 addi r9,r9,273
514c: 7f 88 38 40 cmplw cr7,r8,r7
5150: 55 29 02 06 rlwinm r9,r9,0,8,3
5154: 41 9d ff bc bgt cr7,5110 <perf_copy_attr+0x368>
5158: 4c 00 01 2c isync
515c: 39 20 00 80 li r9,128
5160: 91 3d 00 00 stw r9,0(r29)
5164: 38 e0 00 00 li r7,0
5168: 90 e2 04 dc stw r7,1244(r2)
516c: 7d 20 ed 26 mfsrin r9,r29
5170: 65 29 40 00 oris r9,r9,16384
5174: 40 81 00 c0 ble 5234 <perf_copy_attr+0x48c>
5178: 7d 47 50 f8 not r7,r10
517c: 7c e7 42 14 add r7,r7,r8
5180: 54 e7 27 be rlwinm r7,r7,4,30,31
5184: 7d 20 51 e4 mtsrin r9,r10
5188: 3d 4a 10 00 addis r10,r10,4096
518c: 39 29 01 11 addi r9,r9,273
5190: 7c 08 50 40 cmplw r8,r10
5194: 55 29 02 06 rlwinm r9,r9,0,8,3
5198: 40 81 00 9c ble 5234 <perf_copy_attr+0x48c>
519c: 2c 07 00 00 cmpwi r7,0
51a0: 41 82 00 4c beq 51ec <perf_copy_attr+0x444>
51a4: 2c 07 00 01 cmpwi r7,1
51a8: 41 82 00 2c beq 51d4 <perf_copy_attr+0x42c>
51ac: 2c 07 00 02 cmpwi r7,2
51b0: 41 82 00 14 beq 51c4 <perf_copy_attr+0x41c>
51b4: 7d 20 51 e4 mtsrin r9,r10
51b8: 39 29 01 11 addi r9,r9,273
51bc: 3d 4a 10 00 addis r10,r10,4096
51c0: 55 29 02 06 rlwinm r9,r9,0,8,3
51c4: 7d 20 51 e4 mtsrin r9,r10
51c8: 39 29 01 11 addi r9,r9,273
51cc: 3d 4a 10 00 addis r10,r10,4096
51d0: 55 29 02 06 rlwinm r9,r9,0,8,3
51d4: 7d 20 51 e4 mtsrin r9,r10
51d8: 3d 4a 10 00 addis r10,r10,4096
51dc: 39 29 01 11 addi r9,r9,273
51e0: 7c 08 50 40 cmplw r8,r10
51e4: 55 29 02 06 rlwinm r9,r9,0,8,3
51e8: 40 81 00 4c ble 5234 <perf_copy_attr+0x48c>
51ec: 7d 20 51 e4 mtsrin r9,r10
51f0: 39 29 01 11 addi r9,r9,273
51f4: 3c ea 10 00 addis r7,r10,4096
51f8: 55 29 02 06 rlwinm r9,r9,0,8,3
51fc: 7d 20 39 e4 mtsrin r9,r7
5200: 39 29 01 11 addi r9,r9,273
5204: 3c e7 10 00 addis r7,r7,4096
5208: 55 29 02 06 rlwinm r9,r9,0,8,3
520c: 7d 20 39 e4 mtsrin r9,r7
5210: 39 29 01 11 addi r9,r9,273
5214: 3c ea 30 00 addis r7,r10,12288
5218: 55 29 02 06 rlwinm r9,r9,0,8,3
521c: 7d 20 39 e4 mtsrin r9,r7
5220: 3d 4a 40 00 addis r10,r10,16384
5224: 39 29 01 11 addi r9,r9,273
5228: 7c 08 50 40 cmplw r8,r10
522c: 55 29 02 06 rlwinm r9,r9,0,8,3
5230: 41 81 ff bc bgt 51ec <perf_copy_attr+0x444>
5234: 4c 00 01 2c isync
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
[mpe: Export the ool handlers to fix build errors]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/d9121f96a7c4302946839a0771f5d1daeeb6968c.1622708530.git.christophe.leroy@csgroup.eu
2021-06-03 11:41:44 +03:00
# if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP)
p - > thread . kuap = KUAP_NONE ;
# endif
2021-10-19 10:29:28 +03:00
# if defined(CONFIG_BOOKE_OR_40x) && defined(CONFIG_PPC_KUAP)
p - > thread . pid = MMU_NO_CONTEXT ;
# endif
2013-09-10 14:21:10 +04:00
2014-07-10 06:29:21 +04:00
setup_ksp_vsid ( p , sp ) ;
2011-03-02 18:18:48 +03:00
# ifdef CONFIG_PPC64
if ( cpu_has_feature ( CPU_FTR_DSCR ) ) {
2012-09-03 20:49:47 +04:00
p - > thread . dscr_inherit = current - > thread . dscr_inherit ;
2015-12-09 12:11:47 +03:00
p - > thread . dscr = mfspr ( SPRN_DSCR ) ;
2011-03-02 18:18:48 +03:00
}
2012-12-07 01:49:56 +04:00
if ( cpu_has_feature ( CPU_FTR_HAS_PPR ) )
2018-10-12 16:15:16 +03:00
childregs - > ppr = DEFAULT_PPR ;
2017-11-08 05:23:53 +03:00
p - > thread . tidr = 0 ;
2020-11-27 07:44:13 +03:00
# endif
/*
* Run with the current AMR value of the kernel
*/
# ifdef CONFIG_PPC_PKEY
if ( mmu_has_feature ( MMU_FTR_BOOK3S_KUAP ) )
kregs - > amr = AMR_KUAP_BLOCKED ;
if ( mmu_has_feature ( MMU_FTR_BOOK3S_KUEP ) )
kregs - > iamr = AMR_KUEP_BLOCKED ;
2011-03-02 18:18:48 +03:00
# endif
2014-02-04 09:08:51 +04:00
kregs - > nip = ppc_function_entry ( f ) ;
2005-09-26 10:04:21 +04:00
return 0 ;
}
powerpc/64s/hash: Add a SLB preload cache
When switching processes, currently all user SLBEs are cleared, and a
few (exec_base, pc, and stack) are preloaded. In trivial testing with
small apps, this tends to miss the heap and low 256MB segments, and it
will also miss commonly accessed segments on large memory workloads.
Add a simple round-robin preload cache that just inserts the last SLB
miss into the head of the cache and preloads those at context switch
time. Every 256 context switches, the oldest entry is removed from the
cache to shrink the cache and require fewer slbmte if they are unused.
Much more could go into this, including into the SLB entry reclaim
side to track some LRU information etc, which would require a study of
large memory workloads. But this is a simple thing we can do now that
is an obvious win for common workloads.
With the full series, process switching speed on the context_switch
benchmark on POWER9/hash (with kernel speculation security measures
disabled) increases from 140K/s to 178K/s (27%).
POWER8 does not change much (within 1%), it's unclear why it does not
see a big gain like POWER9.
Booting to busybox init with 256MB segments has SLB misses go down
from 945 to 69, and with 1T segments 900 to 21. These could almost all
be eliminated by preloading a bit more carefully with ELF binary
loading.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-09-14 18:30:56 +03:00
/*
 * Forward declaration for the call in start_thread(); the definition is
 * not visible in this part of the file.
 */
void preload_new_slb_context ( unsigned long start , unsigned long sp ) ;
2005-09-26 10:04:21 +04:00
/*
* Set up a thread for executing a new program
*/
2005-10-10 16:29:05 +04:00
void start_thread ( struct pt_regs * regs , unsigned long start , unsigned long sp )
2005-09-26 10:04:21 +04:00
{
2005-10-21 10:01:33 +04:00
# ifdef CONFIG_PPC64
unsigned long load_addr = regs - > gpr [ 2 ] ; /* saved by ELF_PLAT_INIT */
powerpc/64s/hash: Add a SLB preload cache
When switching processes, currently all user SLBEs are cleared, and a
few (exec_base, pc, and stack) are preloaded. In trivial testing with
small apps, this tends to miss the heap and low 256MB segments, and it
will also miss commonly accessed segments on large memory workloads.
Add a simple round-robin preload cache that just inserts the last SLB
miss into the head of the cache and preloads those at context switch
time. Every 256 context switches, the oldest entry is removed from the
cache to shrink the cache and require fewer slbmte if they are unused.
Much more could go into this, including into the SLB entry reclaim
side to track some LRU information etc, which would require a study of
large memory workloads. But this is a simple thing we can do now that
is an obvious win for common workloads.
With the full series, process switching speed on the context_switch
benchmark on POWER9/hash (with kernel speculation security measures
disabled) increases from 140K/s to 178K/s (27%).
POWER8 does not change much (within 1%), it's unclear why it does not
see a big gain like POWER9.
Booting to busybox init with 256MB segments has SLB misses go down
from 945 to 69, and with 1T segments 900 to 21. These could almost all
be eliminated by preloading a bit more carefully with ELF binary
loading.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-09-14 18:30:56 +03:00
2020-08-17 08:46:42 +03:00
if ( IS_ENABLED ( CONFIG_PPC_BOOK3S_64 ) & & ! radix_enabled ( ) )
2019-04-09 07:03:28 +03:00
preload_new_slb_context ( start , sp ) ;
2005-10-21 10:01:33 +04:00
# endif
powerpc/tm: Always reclaim in start_thread() for exec() class syscalls
Userspace can quite legitimately perform an exec() syscall with a
suspended transaction. exec() does not return to the old process, rather
it loads a new one and starts that, the expectation therefore is that the
new process starts not in a transaction. Currently exec() is not treated
any differently to any other syscall which creates problems.
Firstly it could allow a new process to start with a suspended
transaction for a binary that no longer exists. This means that the
checkpointed state won't be valid and if the suspended transaction were
ever to be resumed and subsequently aborted (a possibility which is
exceedingly likely as exec()ing will likely doom the transaction) the
new process will jump to invalid state.
Secondly the incorrect attempt to keep the transactional state while
still zeroing state for the new process creates at least two TM Bad
Things. The first triggers on the rfid to return to userspace as
start_thread() has given the new process a 'clean' MSR but the suspend
will still be set in the hardware MSR. The second TM Bad Thing triggers
in __switch_to() as the processor is still transactionally suspended but
__switch_to() wants to zero the TM sprs for the new process.
This is an example of the outcome of calling exec() with a suspended
transaction. Note the first 700 is likely the first TM bad thing
described earlier only the kernel can't report it as we've loaded
userspace registers. c000000000009980 is the rfid in
fast_exception_return()
Bad kernel stack pointer 3fffcfa1a370 at c000000000009980
Oops: Bad kernel stack pointer, sig: 6 [#1]
CPU: 0 PID: 2006 Comm: tm-execed Not tainted
NIP: c000000000009980 LR: 0000000000000000 CTR: 0000000000000000
REGS: c00000003ffefd40 TRAP: 0700 Not tainted
MSR: 8000000300201031 <SF,ME,IR,DR,LE,TM[SE]> CR: 00000000 XER: 00000000
CFAR: c0000000000098b4 SOFTE: 0
PACATMSCRATCH: b00000010000d033
GPR00: 0000000000000000 00003fffcfa1a370 0000000000000000 0000000000000000
GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR12: 00003fff966611c0 0000000000000000 0000000000000000 0000000000000000
NIP [c000000000009980] fast_exception_return+0xb0/0xb8
LR [0000000000000000] (null)
Call Trace:
Instruction dump:
f84d0278 e9a100d8 7c7b03a6 e84101a0 7c4ff120 e8410170 7c5a03a6 e8010070
e8410080 e8610088 e8810090 e8210078 <4c000024> 48000000 e8610178 88ed023b
Kernel BUG at c000000000043e80 [verbose debug info unavailable]
Unexpected TM Bad Thing exception at c000000000043e80 (msr 0x201033)
Oops: Unrecoverable exception, sig: 6 [#2]
CPU: 0 PID: 2006 Comm: tm-execed Tainted: G D
task: c0000000fbea6d80 ti: c00000003ffec000 task.ti: c0000000fb7ec000
NIP: c000000000043e80 LR: c000000000015a24 CTR: 0000000000000000
REGS: c00000003ffef7e0 TRAP: 0700 Tainted: G D
MSR: 8000000300201033 <SF,ME,IR,DR,RI,LE,TM[SE]> CR: 28002828 XER: 00000000
CFAR: c000000000015a20 SOFTE: 0
PACATMSCRATCH: b00000010000d033
GPR00: 0000000000000000 c00000003ffefa60 c000000000db5500 c0000000fbead000
GPR04: 8000000300001033 2222222222222222 2222222222222222 00000000ff160000
GPR08: 0000000000000000 800000010000d033 c0000000fb7e3ea0 c00000000fe00004
GPR12: 0000000000002200 c00000000fe00000 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 c0000000fbea7410 00000000ff160000
GPR24: c0000000ffe1f600 c0000000fbea8700 c0000000fbea8700 c0000000fbead000
GPR28: c000000000e20198 c0000000fbea6d80 c0000000fbeab680 c0000000fbea6d80
NIP [c000000000043e80] tm_restore_sprs+0xc/0x1c
LR [c000000000015a24] __switch_to+0x1f4/0x420
Call Trace:
Instruction dump:
7c800164 4e800020 7c0022a6 f80304a8 7c0222a6 f80304b0 7c0122a6 f80304b8
4e800020 e80304a8 7c0023a6 e80304b0 <7c0223a6> e80304b8 7c0123a6 4e800020
This fixes CVE-2016-5828.
Fixes: bc2a9408fa65 ("powerpc: Hook in new transactional memory code")
Cc: stable@vger.kernel.org # v3.9+
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2016-06-17 07:58:34 +03:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
* Clear any transactional state , we ' re exec ( ) ing . The cause is
* not important as there will never be a recheckpoint so it ' s not
* user visible .
*/
if ( MSR_TM_SUSPENDED ( mfmsr ( ) ) )
tm_reclaim_current ( 0 ) ;
# endif
powerpc: Enable execve syscall exit tracepoint
On execve[at], we are zero'ing out most of the thread register state
including gpr[0], which contains the syscall number. Due to this, we
fail to trigger the syscall exit tracepoint properly. Fix this by
retaining gpr[0] in the thread register state.
Before this patch:
# tail /sys/kernel/debug/tracing/trace
cat-123 [000] ..... 61.449351: sys_execve(filename:
7fffa6b23448, argv: 7fffa6b233e0, envp: 7fffa6b233f8)
cat-124 [000] ..... 62.428481: sys_execve(filename:
7fffa6b23448, argv: 7fffa6b233e0, envp: 7fffa6b233f8)
echo-125 [000] ..... 65.813702: sys_execve(filename:
7fffa6b23378, argv: 7fffa6b233a0, envp: 7fffa6b233b0)
echo-125 [000] ..... 65.822214: sys_execveat(fd: 0,
filename: 1009ac48, argv: 7ffff65d0c98, envp: 7ffff65d0ca8, flags: 0)
After this patch:
# tail /sys/kernel/debug/tracing/trace
cat-127 [000] ..... 100.416262: sys_execve(filename:
7fffa41b3448, argv: 7fffa41b33e0, envp: 7fffa41b33f8)
cat-127 [000] ..... 100.418203: sys_execve -> 0x0
echo-128 [000] ..... 103.873968: sys_execve(filename:
7fffa41b3378, argv: 7fffa41b33a0, envp: 7fffa41b33b0)
echo-128 [000] ..... 103.875102: sys_execve -> 0x0
echo-128 [000] ..... 103.882097: sys_execveat(fd: 0,
filename: 1009ac48, argv: 7fffd10d2148, envp: 7fffd10d2158, flags: 0)
echo-128 [000] ..... 103.883225: sys_execveat -> 0x0
Cc: stable@vger.kernel.org
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Tested-by: Sumit Dubey2 <Sumit.Dubey2@ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220609103328.41306-1-naveen.n.rao@linux.vnet.ibm.com
2022-06-09 13:33:28 +03:00
memset ( & regs - > gpr [ 1 ] , 0 , sizeof ( regs - > gpr ) - sizeof ( regs - > gpr [ 0 ] ) ) ;
2005-09-26 10:04:21 +04:00
regs - > ctr = 0 ;
regs - > link = 0 ;
regs - > xer = 0 ;
regs - > ccr = 0 ;
regs - > gpr [ 1 ] = sp ;
2005-10-10 16:29:05 +04:00
# ifdef CONFIG_PPC32
regs - > mq = 0 ;
regs - > nip = start ;
2005-09-26 10:04:21 +04:00
regs - > msr = MSR_USER ;
2005-10-10 16:29:05 +04:00
# else
2010-07-30 02:04:39 +04:00
if ( ! is_32bit_task ( ) ) {
2013-11-20 15:15:02 +04:00
unsigned long entry ;
2005-10-10 16:29:05 +04:00
2013-11-20 15:15:02 +04:00
if ( is_elf2_task ( ) ) {
/* Look ma, no function descriptors! */
entry = start ;
2005-10-10 16:29:05 +04:00
2013-11-20 15:15:02 +04:00
/*
* Ulrich says :
* The latest iteration of the ABI requires that when
* calling a function ( at its global entry point ) ,
* the caller must ensure r12 holds the entry point
* address ( so that the function can quickly
* establish addressability ) .
*/
regs - > gpr [ 12 ] = start ;
/* Make sure that's restored on entry to userspace. */
set_thread_flag ( TIF_RESTOREALL ) ;
} else {
unsigned long toc ;
/* start is a relocated pointer to the function
* descriptor for the elf _start routine . The first
* entry in the function descriptor is the entry
* address of _start and the second entry is the TOC
* value we need to use .
*/
__get_user ( entry , ( unsigned long __user * ) start ) ;
__get_user ( toc , ( unsigned long __user * ) start + 1 ) ;
/* Check whether the e_entry function descriptor entries
* need to be relocated before we can use them .
*/
if ( load_addr ! = 0 ) {
entry + = load_addr ;
toc + = load_addr ;
}
regs - > gpr [ 2 ] = toc ;
2005-10-10 16:29:05 +04:00
}
2021-06-17 18:51:03 +03:00
regs_set_return_ip ( regs , entry ) ;
regs_set_return_msr ( regs , MSR_USER64 ) ;
2005-10-13 07:40:54 +04:00
} else {
regs - > gpr [ 2 ] = 0 ;
2021-06-17 18:51:03 +03:00
regs_set_return_ip ( regs , start ) ;
regs_set_return_msr ( regs , MSR_USER32 ) ;
2005-10-10 16:29:05 +04:00
}
2021-06-17 18:51:03 +03:00
2005-10-10 16:29:05 +04:00
# endif
2008-06-25 08:07:18 +04:00
# ifdef CONFIG_VSX
current - > thread . used_vsr = 0 ;
# endif
powerpc/64s/hash: Add a SLB preload cache
When switching processes, currently all user SLBEs are cleared, and a
few (exec_base, pc, and stack) are preloaded. In trivial testing with
small apps, this tends to miss the heap and low 256MB segments, and it
will also miss commonly accessed segments on large memory workloads.
Add a simple round-robin preload cache that just inserts the last SLB
miss into the head of the cache and preloads those at context switch
time. Every 256 context switches, the oldest entry is removed from the
cache to shrink the cache and require fewer slbmte if they are unused.
Much more could go into this, including into the SLB entry reclaim
side to track some LRU information etc, which would require a study of
large memory workloads. But this is a simple thing we can do now that
is an obvious win for common workloads.
With the full series, process switching speed on the context_switch
benchmark on POWER9/hash (with kernel speculation security measures
disabled) increases from 140K/s to 178K/s (27%).
POWER8 does not change much (within 1%), it's unclear why it does not
see a big gain like POWER9.
Booting to busybox init with 256MB segments has SLB misses go down
from 945 to 69, and with 1T segments 900 to 21. These could almost all
be eliminated by preloading a bit more carefully with ELF binary
loading.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-09-14 18:30:56 +03:00
current - > thread . load_slb = 0 ;
2017-06-03 00:43:30 +03:00
current - > thread . load_fp = 0 ;
2020-08-18 20:19:17 +03:00
# ifdef CONFIG_PPC_FPU_REGS
2013-09-10 14:20:42 +04:00
memset ( & current - > thread . fp_state , 0 , sizeof ( current - > thread . fp_state ) ) ;
2013-09-10 14:21:10 +04:00
current - > thread . fp_save_area = NULL ;
2020-08-18 20:19:17 +03:00
# endif
2005-09-26 10:04:21 +04:00
# ifdef CONFIG_ALTIVEC
2013-09-10 14:20:42 +04:00
memset ( & current - > thread . vr_state , 0 , sizeof ( current - > thread . vr_state ) ) ;
current - > thread . vr_state . vscr . u [ 3 ] = 0x00010000 ; /* Java mode disabled */
2013-09-10 14:21:10 +04:00
current - > thread . vr_save_area = NULL ;
2005-09-26 10:04:21 +04:00
current - > thread . vrsave = 0 ;
current - > thread . used_vr = 0 ;
2017-06-03 00:43:30 +03:00
current - > thread . load_vec = 0 ;
2005-09-26 10:04:21 +04:00
# endif /* CONFIG_ALTIVEC */
# ifdef CONFIG_SPE
memset ( current - > thread . evr , 0 , sizeof ( current - > thread . evr ) ) ;
current - > thread . acc = 0 ;
current - > thread . spefscr = 0 ;
current - > thread . used_spe = 0 ;
# endif /* CONFIG_SPE */
2013-02-13 20:21:40 +04:00
# ifdef CONFIG_PPC_TRANSACTIONAL_MEM
current - > thread . tm_tfhar = 0 ;
current - > thread . tm_texasr = 0 ;
current - > thread . tm_tfiar = 0 ;
2017-06-05 17:40:59 +03:00
current - > thread . load_tm = 0 ;
2013-02-13 20:21:40 +04:00
# endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
2005-09-26 10:04:21 +04:00
}
2014-08-20 02:00:02 +04:00
EXPORT_SYMBOL ( start_thread ) ;
2005-09-26 10:04:21 +04:00
/*
 * Bitwise OR of the five PR_FP_EXC_* exception bits named below.
 * set_fpexc_mode() combines it with PR_FP_EXC_SW_ENABLE to mask the value
 * it stores in thread.fpexc_mode on the software-enable path.
 */
# define PR_FP_ALL_EXCEPT (PR_FP_EXC_DIV | PR_FP_EXC_OVF | PR_FP_EXC_UND \
| PR_FP_EXC_RES | PR_FP_EXC_INV )
int set_fpexc_mode ( struct task_struct * tsk , unsigned int val )
{
struct pt_regs * regs = tsk - > thread . regs ;
/* This is a bit hairy. If we are an SPE enabled processor
* ( have embedded fp ) we store the IEEE exception enable flags in
* fpexc_mode . fpexc_mode is also used for setting FP exception
* mode ( asyn , precise , disabled ) for ' Classic ' FP . */
if ( val & PR_FP_EXC_SW_ENABLE ) {
2007-09-13 10:44:20 +04:00
if ( cpu_has_feature ( CPU_FTR_SPE ) ) {
powerpc: fix exception clearing in e500 SPE float emulation
The e500 SPE floating-point emulation code clears existing exceptions
(__FPU_FPSCR &= ~FP_EX_MASK;) before ORing in the exceptions from the
emulated operation. However, these exception bits are the "sticky",
cumulative exception bits, and should only be cleared by the user
program setting SPEFSCR, not implicitly by any floating-point
instruction (whether executed purely by the hardware or emulated).
The spurious clearing of these bits shows up as missing exceptions in
glibc testing.
Fixing this, however, is not as simple as just not clearing the bits,
because while the bits may be from previous floating-point operations
(in which case they should not be cleared), the processor can also set
the sticky bits itself before the interrupt for an exception occurs,
and this can happen in cases when IEEE 754 semantics are that the
sticky bit should not be set. Specifically, the "invalid" sticky bit
is set in various cases with non-finite operands, where IEEE 754
semantics do not involve raising such an exception, and the
"underflow" sticky bit is set in cases of exact underflow, whereas
IEEE 754 semantics are that this flag is set only for inexact
underflow. Thus, for correct emulation the kernel needs to know the
setting of these two sticky bits before the instruction being
emulated.
When a floating-point operation raises an exception, the kernel can
note the state of the sticky bits immediately afterwards. Some
<fenv.h> functions that affect the state of these bits, such as
fesetenv and feholdexcept, need to use prctl with PR_GET_FPEXC and
PR_SET_FPEXC anyway, and so it is natural to record the state of those
bits during that call into the kernel and so avoid any need for a
separate call into the kernel to inform it of a change to those bits.
Thus, the interface I chose to use (in this patch and the glibc port)
is that one of those prctl calls must be made after any userspace
change to those sticky bits, other than through a floating-point
operation that traps into the kernel anyway. feclearexcept and
fesetexceptflag duly make those calls, which would not be required
were it not for this issue.
The previous EGLIBC port, and the uClibc code copied from it, is
fundamentally broken as regards any use of prctl for floating-point
exceptions because it didn't use the PR_FP_EXC_SW_ENABLE bit in its
prctl calls (and did various worse things, such as passing a pointer
when prctl expected an integer). If you avoid anything where prctl is
used, the clearing of sticky bits still means it will never give
anything approximating correct exception semantics with existing
kernels. I don't believe the patch makes things any worse for
existing code that doesn't try to inform the kernel of changes to
sticky bits - such code may get incorrect exceptions in some cases,
but it would have done so anyway in other cases.
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
2013-12-11 03:07:45 +04:00
/*
* When the sticky exception bits are set
* directly by userspace , it must call prctl
* with PR_GET_FPEXC ( with PR_FP_EXC_SW_ENABLE
* in the existing prctl settings ) or
* PR_SET_FPEXC ( with PR_FP_EXC_SW_ENABLE in
* the bits being set ) . < fenv . h > functions
* saving and restoring the whole
* floating - point environment need to do so
* anyway to restore the prctl settings from
* the saved environment .
*/
2020-08-17 08:47:57 +03:00
# ifdef CONFIG_SPE
powerpc: fix exception clearing in e500 SPE float emulation
The e500 SPE floating-point emulation code clears existing exceptions
(__FPU_FPSCR &= ~FP_EX_MASK;) before ORing in the exceptions from the
emulated operation. However, these exception bits are the "sticky",
cumulative exception bits, and should only be cleared by the user
program setting SPEFSCR, not implicitly by any floating-point
instruction (whether executed purely by the hardware or emulated).
The spurious clearing of these bits shows up as missing exceptions in
glibc testing.
Fixing this, however, is not as simple as just not clearing the bits,
because while the bits may be from previous floating-point operations
(in which case they should not be cleared), the processor can also set
the sticky bits itself before the interrupt for an exception occurs,
and this can happen in cases when IEEE 754 semantics are that the
sticky bit should not be set. Specifically, the "invalid" sticky bit
is set in various cases with non-finite operands, where IEEE 754
semantics do not involve raising such an exception, and the
"underflow" sticky bit is set in cases of exact underflow, whereas
IEEE 754 semantics are that this flag is set only for inexact
underflow. Thus, for correct emulation the kernel needs to know the
setting of these two sticky bits before the instruction being
emulated.
When a floating-point operation raises an exception, the kernel can
note the state of the sticky bits immediately afterwards. Some
<fenv.h> functions that affect the state of these bits, such as
fesetenv and feholdexcept, need to use prctl with PR_GET_FPEXC and
PR_SET_FPEXC anyway, and so it is natural to record the state of those
bits during that call into the kernel and so avoid any need for a
separate call into the kernel to inform it of a change to those bits.
Thus, the interface I chose to use (in this patch and the glibc port)
is that one of those prctl calls must be made after any userspace
change to those sticky bits, other than through a floating-point
operation that traps into the kernel anyway. feclearexcept and
fesetexceptflag duly make those calls, which would not be required
were it not for this issue.
The previous EGLIBC port, and the uClibc code copied from it, is
fundamentally broken as regards any use of prctl for floating-point
exceptions because it didn't use the PR_FP_EXC_SW_ENABLE bit in its
prctl calls (and did various worse things, such as passing a pointer
when prctl expected an integer). If you avoid anything where prctl is
used, the clearing of sticky bits still means it will never give
anything approximating correct exception semantics with existing
kernels. I don't believe the patch makes things any worse for
existing code that doesn't try to inform the kernel of changes to
sticky bits - such code may get incorrect exceptions in some cases,
but it would have done so anyway in other cases.
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
2013-12-11 03:07:45 +04:00
tsk - > thread . spefscr_last = mfspr ( SPRN_SPEFSCR ) ;
2007-09-13 10:44:20 +04:00
tsk - > thread . fpexc_mode = val &
( PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT ) ;
2020-08-17 08:47:57 +03:00
# endif
2007-09-13 10:44:20 +04:00
return 0 ;
} else {
return - EINVAL ;
}
2005-09-26 10:04:21 +04:00
}
2005-10-10 16:29:05 +04:00
/* on a CONFIG_SPE this does not hurt us. The bits that
* __pack_fe01 use do not overlap with bits used for
* PR_FP_EXC_SW_ENABLE . Additionally , the MSR [ FE0 , FE1 ] bits
* on CONFIG_SPE implementations are reserved so writing to
* them does not change anything */
if ( val > PR_FP_EXC_PRECISE )
return - EINVAL ;
tsk - > thread . fpexc_mode = __pack_fe01 ( val ) ;
2021-06-17 18:51:03 +03:00
if ( regs ! = NULL & & ( regs - > msr & MSR_FP ) ! = 0 ) {
regs_set_return_msr ( regs , ( regs - > msr & ~ ( MSR_FE0 | MSR_FE1 ) )
| tsk - > thread . fpexc_mode ) ;
}
2005-09-26 10:04:21 +04:00
return 0 ;
}
int get_fpexc_mode ( struct task_struct * tsk , unsigned long adr )
{
2020-09-17 05:20:16 +03:00
unsigned int val = 0 ;
2005-09-26 10:04:21 +04:00
2020-08-17 08:47:57 +03:00
if ( tsk - > thread . fpexc_mode & PR_FP_EXC_SW_ENABLE ) {
powerpc: fix exception clearing in e500 SPE float emulation
The e500 SPE floating-point emulation code clears existing exceptions
(__FPU_FPSCR &= ~FP_EX_MASK;) before ORing in the exceptions from the
emulated operation. However, these exception bits are the "sticky",
cumulative exception bits, and should only be cleared by the user
program setting SPEFSCR, not implicitly by any floating-point
instruction (whether executed purely by the hardware or emulated).
The spurious clearing of these bits shows up as missing exceptions in
glibc testing.
Fixing this, however, is not as simple as just not clearing the bits,
because while the bits may be from previous floating-point operations
(in which case they should not be cleared), the processor can also set
the sticky bits itself before the interrupt for an exception occurs,
and this can happen in cases when IEEE 754 semantics are that the
sticky bit should not be set. Specifically, the "invalid" sticky bit
is set in various cases with non-finite operands, where IEEE 754
semantics do not involve raising such an exception, and the
"underflow" sticky bit is set in cases of exact underflow, whereas
IEEE 754 semantics are that this flag is set only for inexact
underflow. Thus, for correct emulation the kernel needs to know the
setting of these two sticky bits before the instruction being
emulated.
When a floating-point operation raises an exception, the kernel can
note the state of the sticky bits immediately afterwards. Some
<fenv.h> functions that affect the state of these bits, such as
fesetenv and feholdexcept, need to use prctl with PR_GET_FPEXC and
PR_SET_FPEXC anyway, and so it is natural to record the state of those
bits during that call into the kernel and so avoid any need for a
separate call into the kernel to inform it of a change to those bits.
Thus, the interface I chose to use (in this patch and the glibc port)
is that one of those prctl calls must be made after any userspace
change to those sticky bits, other than through a floating-point
operation that traps into the kernel anyway. feclearexcept and
fesetexceptflag duly make those calls, which would not be required
were it not for this issue.
The previous EGLIBC port, and the uClibc code copied from it, is
fundamentally broken as regards any use of prctl for floating-point
exceptions because it didn't use the PR_FP_EXC_SW_ENABLE bit in its
prctl calls (and did various worse things, such as passing a pointer
when prctl expected an integer). If you avoid anything where prctl is
used, the clearing of sticky bits still means it will never give
anything approximating correct exception semantics with existing
kernels. I don't believe the patch makes things any worse for
existing code that doesn't try to inform the kernel of changes to
sticky bits - such code may get incorrect exceptions in some cases,
but it would have done so anyway in other cases.
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
2013-12-11 03:07:45 +04:00
if ( cpu_has_feature ( CPU_FTR_SPE ) ) {
/*
* When the sticky exception bits are set
* directly by userspace , it must call prctl
* with PR_GET_FPEXC ( with PR_FP_EXC_SW_ENABLE
* in the existing prctl settings ) or
* PR_SET_FPEXC ( with PR_FP_EXC_SW_ENABLE in
* the bits being set ) . < fenv . h > functions
* saving and restoring the whole
* floating - point environment need to do so
* anyway to restore the prctl settings from
* the saved environment .
*/
2020-08-17 08:47:57 +03:00
# ifdef CONFIG_SPE
powerpc: fix exception clearing in e500 SPE float emulation
The e500 SPE floating-point emulation code clears existing exceptions
(__FPU_FPSCR &= ~FP_EX_MASK;) before ORing in the exceptions from the
emulated operation. However, these exception bits are the "sticky",
cumulative exception bits, and should only be cleared by the user
program setting SPEFSCR, not implicitly by any floating-point
instruction (whether executed purely by the hardware or emulated).
The spurious clearing of these bits shows up as missing exceptions in
glibc testing.
Fixing this, however, is not as simple as just not clearing the bits,
because while the bits may be from previous floating-point operations
(in which case they should not be cleared), the processor can also set
the sticky bits itself before the interrupt for an exception occurs,
and this can happen in cases when IEEE 754 semantics are that the
sticky bit should not be set. Specifically, the "invalid" sticky bit
is set in various cases with non-finite operands, where IEEE 754
semantics do not involve raising such an exception, and the
"underflow" sticky bit is set in cases of exact underflow, whereas
IEEE 754 semantics are that this flag is set only for inexact
underflow. Thus, for correct emulation the kernel needs to know the
setting of these two sticky bits before the instruction being
emulated.
When a floating-point operation raises an exception, the kernel can
note the state of the sticky bits immediately afterwards. Some
<fenv.h> functions that affect the state of these bits, such as
fesetenv and feholdexcept, need to use prctl with PR_GET_FPEXC and
PR_SET_FPEXC anyway, and so it is natural to record the state of those
bits during that call into the kernel and so avoid any need for a
separate call into the kernel to inform it of a change to those bits.
Thus, the interface I chose to use (in this patch and the glibc port)
is that one of those prctl calls must be made after any userspace
change to those sticky bits, other than through a floating-point
operation that traps into the kernel anyway. feclearexcept and
fesetexceptflag duly make those calls, which would not be required
were it not for this issue.
The previous EGLIBC port, and the uClibc code copied from it, is
fundamentally broken as regards any use of prctl for floating-point
exceptions because it didn't use the PR_FP_EXC_SW_ENABLE bit in its
prctl calls (and did various worse things, such as passing a pointer
when prctl expected an integer). If you avoid anything where prctl is
used, the clearing of sticky bits still means it will never give
anything approximating correct exception semantics with existing
kernels. I don't believe the patch makes things any worse for
existing code that doesn't try to inform the kernel of changes to
sticky bits - such code may get incorrect exceptions in some cases,
but it would have done so anyway in other cases.
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
2013-12-11 03:07:45 +04:00
tsk - > thread . spefscr_last = mfspr ( SPRN_SPEFSCR ) ;
2007-09-13 10:44:20 +04:00
val = tsk - > thread . fpexc_mode ;
2020-08-17 08:47:57 +03:00
# endif
powerpc: fix exception clearing in e500 SPE float emulation
The e500 SPE floating-point emulation code clears existing exceptions
(__FPU_FPSCR &= ~FP_EX_MASK;) before ORing in the exceptions from the
emulated operation. However, these exception bits are the "sticky",
cumulative exception bits, and should only be cleared by the user
program setting SPEFSCR, not implicitly by any floating-point
instruction (whether executed purely by the hardware or emulated).
The spurious clearing of these bits shows up as missing exceptions in
glibc testing.
Fixing this, however, is not as simple as just not clearing the bits,
because while the bits may be from previous floating-point operations
(in which case they should not be cleared), the processor can also set
the sticky bits itself before the interrupt for an exception occurs,
and this can happen in cases when IEEE 754 semantics are that the
sticky bit should not be set. Specifically, the "invalid" sticky bit
is set in various cases with non-finite operands, where IEEE 754
semantics do not involve raising such an exception, and the
"underflow" sticky bit is set in cases of exact underflow, whereas
IEEE 754 semantics are that this flag is set only for inexact
underflow. Thus, for correct emulation the kernel needs to know the
setting of these two sticky bits before the instruction being
emulated.
When a floating-point operation raises an exception, the kernel can
note the state of the sticky bits immediately afterwards. Some
<fenv.h> functions that affect the state of these bits, such as
fesetenv and feholdexcept, need to use prctl with PR_GET_FPEXC and
PR_SET_FPEXC anyway, and so it is natural to record the state of those
bits during that call into the kernel and so avoid any need for a
separate call into the kernel to inform it of a change to those bits.
Thus, the interface I chose to use (in this patch and the glibc port)
is that one of those prctl calls must be made after any userspace
change to those sticky bits, other than through a floating-point
operation that traps into the kernel anyway. feclearexcept and
fesetexceptflag duly make those calls, which would not be required
were it not for this issue.
The previous EGLIBC port, and the uClibc code copied from it, is
fundamentally broken as regards any use of prctl for floating-point
exceptions because it didn't use the PR_FP_EXC_SW_ENABLE bit in its
prctl calls (and did various worse things, such as passing a pointer
when prctl expected an integer). If you avoid anything where prctl is
used, the clearing of sticky bits still means it will never give
anything approximating correct exception semantics with existing
kernels. I don't believe the patch makes things any worse for
existing code that doesn't try to inform the kernel of changes to
sticky bits - such code may get incorrect exceptions in some cases,
but it would have done so anyway in other cases.
Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
2013-12-11 03:07:45 +04:00
} else
2007-09-13 10:44:20 +04:00
return - EINVAL ;
2020-08-17 08:47:57 +03:00
} else {
2005-09-26 10:04:21 +04:00
val = __unpack_fe01 ( tsk - > thread . fpexc_mode ) ;
2020-08-17 08:47:57 +03:00
}
2005-09-26 10:04:21 +04:00
return put_user ( val , ( unsigned int __user * ) adr ) ;
}
2006-06-07 10:14:40 +04:00
/*
 * Set the endian mode of @tsk by setting or clearing MSR_LE in the MSR held
 * in its saved register frame (tsk->thread.regs).
 *
 * @tsk: task whose saved MSR is updated.
 * @val: one of PR_ENDIAN_BIG, PR_ENDIAN_LITTLE or PR_ENDIAN_PPC_LITTLE.
 *
 * Returns 0 on success. Returns -EINVAL when the requested little-endian
 * flavour is not advertised by the matching CPU feature bit, when the task
 * has no saved register frame, or when @val is none of the three values
 * above.
 */
int set_endian ( struct task_struct * tsk , unsigned int val )
{
struct pt_regs * regs = tsk - > thread . regs ;
/* Reject a little-endian request the CPU feature bits do not cover. */
if ( ( val = = PR_ENDIAN_LITTLE & & ! cpu_has_feature ( CPU_FTR_REAL_LE ) ) | |
( val = = PR_ENDIAN_PPC_LITTLE & & ! cpu_has_feature ( CPU_FTR_PPC_LE ) ) )
return - EINVAL ;
/* Without a saved register frame there is no MSR to modify. */
if ( regs = = NULL )
return - EINVAL ;
if ( val = = PR_ENDIAN_BIG )
2021-06-17 18:51:03 +03:00
regs_set_return_msr ( regs , regs - > msr & ~ MSR_LE ) ;
2006-06-07 10:14:40 +04:00
/* Both little-endian flavours set the same MSR_LE bit. */
else if ( val = = PR_ENDIAN_LITTLE | | val = = PR_ENDIAN_PPC_LITTLE )
2021-06-17 18:51:03 +03:00
regs_set_return_msr ( regs , regs - > msr | MSR_LE ) ;
2006-06-07 10:14:40 +04:00
else
return - EINVAL ;
return 0 ;
}
int get_endian ( struct task_struct * tsk , unsigned long adr )
{
struct pt_regs * regs = tsk - > thread . regs ;
unsigned int val ;
if ( ! cpu_has_feature ( CPU_FTR_PPC_LE ) & &
! cpu_has_feature ( CPU_FTR_REAL_LE ) )
return - EINVAL ;
if ( regs = = NULL )
return - EINVAL ;
if ( regs - > msr & MSR_LE ) {
if ( cpu_has_feature ( CPU_FTR_REAL_LE ) )
val = PR_ENDIAN_LITTLE ;
else
val = PR_ENDIAN_PPC_LITTLE ;
} else
val = PR_ENDIAN_BIG ;
return put_user ( val , ( unsigned int __user * ) adr ) ;
}
2006-06-07 10:15:39 +04:00
int set_unalign_ctl ( struct task_struct * tsk , unsigned int val )
{
tsk - > thread . align_ctl = val ;
return 0 ;
}
int get_unalign_ctl ( struct task_struct * tsk , unsigned long adr )
{
return put_user ( tsk - > thread . align_ctl , ( unsigned int __user * ) adr ) ;
}
2007-02-19 03:42:42 +03:00
/*
 * Check whether @sp lies on the hard-IRQ or soft-IRQ stack of the CPU that
 * task_cpu(@p) reports.
 *
 * @sp:     stack pointer value to test.
 * @p:      task used only to pick the CPU index.
 * @nbytes: number of bytes that must fit between @sp and the end of the
 *          THREAD_SIZE region.
 *
 * Returns 1 when base <= @sp <= base + THREAD_SIZE - @nbytes for either
 * stack, otherwise 0. Also returns 0 when either per-CPU stack pointer is
 * NULL.
 */
static inline int valid_irq_stack ( unsigned long sp , struct task_struct * p ,
unsigned long nbytes )
{
unsigned long stack_page ;
unsigned long cpu = task_cpu ( p ) ;
2022-12-16 14:59:30 +03:00
/* Both IRQ stacks must be set for this CPU before either is tested. */
if ( ! hardirq_ctx [ cpu ] | | ! softirq_ctx [ cpu ] )
return 0 ;
2019-01-31 13:09:00 +03:00
/* Hard-IRQ stack. */
stack_page = ( unsigned long ) hardirq_ctx [ cpu ] ;
if ( sp > = stack_page & & sp < = stack_page + THREAD_SIZE - nbytes )
return 1 ;
/* Soft-IRQ stack. */
stack_page = ( unsigned long ) softirq_ctx [ cpu ] ;
if ( sp > = stack_page & & sp < = stack_page + THREAD_SIZE - nbytes )
return 1 ;
2007-02-19 03:42:42 +03:00
return 0 ;
}
2020-03-25 13:41:44 +03:00
/*
 * Check whether @sp lies on one of the emergency stacks recorded in the paca
 * of the CPU that task_cpu(@p) reports, leaving at least @nbytes below the
 * end of the stack.
 *
 * Each *_emergency_sp value is treated as the end of a THREAD_SIZE region,
 * so the region base is obtained by subtracting THREAD_SIZE.  The NMI and
 * machine-check stacks are only considered on CONFIG_PPC_BOOK3S_64.
 *
 * Returns 1 on a match and 0 otherwise.  Returns 0 unconditionally when
 * CONFIG_PPC64 is not set, when paca_ptrs is NULL, or when any of the
 * relevant emergency stack pointers is NULL.
 */
static inline int valid_emergency_stack(unsigned long sp, struct task_struct *p,
					unsigned long nbytes)
{
#ifdef CONFIG_PPC64
	unsigned long cpu = task_cpu(p);
	unsigned long base;

	if (!paca_ptrs || !paca_ptrs[cpu]->emergency_sp)
		return 0;

# ifdef CONFIG_PPC_BOOK3S_64
	if (!paca_ptrs[cpu]->nmi_emergency_sp ||
	    !paca_ptrs[cpu]->mc_emergency_sp)
		return 0;
# endif

	base = (unsigned long)paca_ptrs[cpu]->emergency_sp - THREAD_SIZE;
	if (sp >= base && sp <= base + THREAD_SIZE - nbytes)
		return 1;

# ifdef CONFIG_PPC_BOOK3S_64
	base = (unsigned long)paca_ptrs[cpu]->nmi_emergency_sp - THREAD_SIZE;
	if (sp >= base && sp <= base + THREAD_SIZE - nbytes)
		return 1;

	base = (unsigned long)paca_ptrs[cpu]->mc_emergency_sp - THREAD_SIZE;
	if (sp >= base && sp <= base + THREAD_SIZE - nbytes)
		return 1;
# endif
#endif

	return 0;
}
2022-11-27 15:49:39 +03:00
/*
* validate the stack frame of a particular minimum size , used for when we are
* looking at a certain object in the stack beyond the minimum .
*/
int validate_sp_size ( unsigned long sp , struct task_struct * p ,
unsigned long nbytes )
2005-09-26 10:04:21 +04:00
{
2006-01-12 12:06:02 +03:00
unsigned long stack_page = ( unsigned long ) task_stack_page ( p ) ;
2005-09-26 10:04:21 +04:00
2019-01-31 13:09:00 +03:00
if ( sp < THREAD_SIZE )
return 0 ;
if ( sp > = stack_page & & sp < = stack_page + THREAD_SIZE - nbytes )
2005-09-26 10:04:21 +04:00
return 1 ;
2020-03-25 13:41:44 +03:00
if ( valid_irq_stack ( sp , p , nbytes ) )
return 1 ;
return valid_emergency_stack ( sp , p , nbytes ) ;
2005-09-26 10:04:21 +04:00
}
2022-11-27 15:49:39 +03:00
int validate_sp ( unsigned long sp , struct task_struct * p )
{
2022-11-27 15:49:40 +03:00
return validate_sp_size ( sp , p , STACK_FRAME_MIN_SIZE ) ;
2022-11-27 15:49:39 +03:00
}
2006-03-27 04:46:18 +04:00
2021-09-30 01:02:14 +03:00
static unsigned long ___get_wchan ( struct task_struct * p )
2005-09-26 10:04:21 +04:00
{
unsigned long ip , sp ;
int count = 0 ;
sp = p - > thread . ksp ;
2022-11-27 15:49:39 +03:00
if ( ! validate_sp ( sp , p ) )
2005-09-26 10:04:21 +04:00
return 0 ;
do {
2022-01-21 04:44:18 +03:00
sp = READ_ONCE_NOCHECK ( * ( unsigned long * ) sp ) ;
2022-11-27 15:49:39 +03:00
if ( ! validate_sp ( sp , p ) | | task_is_running ( p ) )
2005-09-26 10:04:21 +04:00
return 0 ;
if ( count > 0 ) {
2022-01-21 04:44:18 +03:00
ip = READ_ONCE_NOCHECK ( ( ( unsigned long * ) sp ) [ STACK_FRAME_LR_SAVE ] ) ;
2005-09-26 10:04:21 +04:00
if ( ! in_sched_functions ( ip ) )
return ip ;
}
} while ( count + + < 16 ) ;
return 0 ;
}
2005-10-10 16:29:05 +04:00
2021-09-30 01:02:14 +03:00
/*
 * Return the result of ___get_wchan() for task @p, holding a reference on
 * the task's stack (try_get_task_stack()/put_task_stack()) around the walk.
 * Returns 0 if the stack reference cannot be taken.
 */
unsigned long __get_wchan(struct task_struct *p)
{
	unsigned long wchan = 0;

	if (try_get_task_stack(p)) {
		wchan = ___get_wchan(p);
		put_task_stack(p);
	}

	return wchan;
}
2008-11-20 06:24:07 +03:00
/*
 * Bound on the frame-walking loop in show_stack ( ) ; initialised from the
 * CONFIG_PRINT_STACK_DEPTH build-time setting .
 */
static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH ;
2005-10-10 16:29:05 +04:00
2021-06-14 15:09:07 +03:00
/*
 * Walk a kernel stack frame by frame and print , for each frame , the stack
 * pointer and the saved return address ( also symbolised with %pS ) .
 *
 * tsk :    task whose stack is dumped ; NULL selects current .
 * stack :  frame to start from ; when 0 / NULL the walk starts at
 *          current_stack_frame ( ) if tsk is current , else at tsk->thread.ksp .
 * loglvl : string placed through %s at the head of each line started with
 *          printk ( ) here .
 *
 * A reference on the task stack is taken with try_get_task_stack ( ) and
 * dropped with put_task_stack ( ) ; if it cannot be taken nothing is printed .
 * The walk ends at the first stack pointer rejected by validate_sp ( ) , or
 * when the loop counter passes kstack_depth_to_print .
 *
 * When a frame carries STACK_FRAME_REGS_MARKER , the pt_regs stored in it are
 * printed via __show_regs ( ) between two " --- interrupt " lines .
 * NOTE(review): __show_regs ( ) is not passed loglvl - confirm its output
 * level is acceptable for callers that pass a non-default loglvl .
 */
void __no_sanitize_address show_stack ( struct task_struct * tsk ,
unsigned long * stack ,
const char * loglvl )
2005-10-10 16:29:05 +04:00
{
unsigned long sp , ip , lr , newsp ;
int count = 0 ;
int firstframe = 1 ;
2019-09-05 21:20:30 +03:00
unsigned long ret_addr ;
int ftrace_idx = 0 ;
2005-10-10 16:29:05 +04:00
if ( tsk = = NULL )
tsk = current ;
2019-01-31 13:08:52 +03:00
if ( ! try_get_task_stack ( tsk ) )
return ;
sp = ( unsigned long ) stack ;
2005-10-10 16:29:05 +04:00
if ( sp = = 0 ) {
if ( tsk = = current )
2020-02-20 14:51:37 +03:00
sp = current_stack_frame ( ) ;
2005-10-10 16:29:05 +04:00
else
sp = tsk - > thread . ksp ;
}
/* lr holds the link register taken from the last interrupt frame seen ; none yet . */
lr = 0 ;
2020-06-09 07:31:14 +03:00
printk ( " %sCall Trace: \n " , loglvl ) ;
2005-10-10 16:29:05 +04:00
do {
2022-11-27 15:49:39 +03:00
if ( ! validate_sp ( sp , tsk ) )
2019-01-31 13:08:52 +03:00
break ;
2005-10-10 16:29:05 +04:00
stack = ( unsigned long * ) sp ;
/* Word 0 of the frame is the back chain ; the saved LR sits at STACK_FRAME_LR_SAVE . */
newsp = stack [ 0 ] ;
2008-04-17 08:34:59 +04:00
ip = stack [ STACK_FRAME_LR_SAVE ] ;
2005-10-10 16:29:05 +04:00
/*
 * The entry is suppressed only for a " first " frame whose saved LR
 * equals lr , i.e. the value already taken from the preceding
 * interrupt frame ' s regs->link .
 */
if ( ! firstframe | | ip ! = lr ) {
2020-06-09 07:31:14 +03:00
printk ( " %s[ " REG " ] [ " REG " ] %pS " ,
loglvl , sp , ip , ( void * ) ip ) ;
2019-09-05 21:20:30 +03:00
/*
 * Print the address returned by ftrace_graph_ret_addr ( ) as well
 * when it differs from ip .
 * NOTE(review): current is passed here rather than tsk - confirm
 * this is intended when another task ' s stack is being dumped .
 */
ret_addr = ftrace_graph_ret_addr ( current ,
& ftrace_idx , ip , stack ) ;
if ( ret_addr ! = ip )
pr_cont ( " (%pS) " , ( void * ) ret_addr ) ;
2005-10-10 16:29:05 +04:00
if ( firstframe )
2016-11-02 14:20:46 +03:00
pr_cont ( " (unreliable) " ) ;
pr_cont ( " \n " ) ;
2005-10-10 16:29:05 +04:00
}
firstframe = 0 ;
/*
* See if this is an exception frame .
2022-11-27 15:49:32 +03:00
* We look for the " regs " marker in the current frame .
2022-11-27 15:49:36 +03:00
*
* STACK_SWITCH_FRAME_SIZE being the smallest frame that
* could hold a pt_regs , if that does not fit then it can ' t
* have regs .
2005-10-10 16:29:05 +04:00
*/
2022-11-27 15:49:39 +03:00
if ( validate_sp_size ( sp , tsk , STACK_SWITCH_FRAME_SIZE )
2022-11-27 15:49:34 +03:00
& & stack [ STACK_INT_FRAME_MARKER_LONGS ] = = STACK_FRAME_REGS_MARKER ) {
2005-10-10 16:29:05 +04:00
struct pt_regs * regs = ( struct pt_regs * )
2022-11-27 15:49:32 +03:00
( sp + STACK_INT_FRAME_REGS ) ;
powerpc: show registers when unwinding interrupt frames
It's often useful to know the register state for interrupts in
the stack frame. In the below example (with this patch applied),
the important information is the state of the page fault.
A blatant case like this probably rather should have the page
fault regs passed down to the warning, but quite often there are
less obvious cases where an interrupt shows up that might give
some more clues.
The downside is longer and more complex bug output.
Bug: Write fault blocked by AMR!
WARNING: CPU: 0 PID: 72 at arch/powerpc/include/asm/book3s/64/kup-radix.h:164 __do_page_fault+0x880/0xa90
Modules linked in:
CPU: 0 PID: 72 Comm: systemd-gpt-aut Not tainted
NIP: c00000000006e2f0 LR: c00000000006e2ec CTR: 0000000000000000
REGS: c00000000a4f3420 TRAP: 0700
MSR: 8000000000021033 <SF,ME,IR,DR,RI,LE> CR: 28002840 XER: 20040000
CFAR: c000000000128be0 IRQMASK: 3
GPR00: c00000000006e2ec c00000000a4f36c0 c0000000014f0700 0000000000000020
GPR04: 0000000000000001 c000000001290f50 0000000000000001 c000000001290f80
GPR08: c000000001612b08 0000000000000000 0000000000000000 00000000ffffe0f7
GPR12: 0000000048002840 c0000000016e0000 c00c000000021c80 c000000000fd6f60
GPR16: 0000000000000000 c00000000a104698 0000000000000003 c0000000087f0000
GPR20: 0000000000000100 c0000000070330b8 0000000000000000 0000000000000004
GPR24: 0000000002000000 0000000000000300 0000000002000000 c00000000a5b0c00
GPR28: 0000000000000000 000000000a000000 00007fffb2a90038 c00000000a4f3820
NIP [c00000000006e2f0] __do_page_fault+0x880/0xa90
LR [c00000000006e2ec] __do_page_fault+0x87c/0xa90
Call Trace:
[c00000000a4f36c0] [c00000000006e2ec] __do_page_fault+0x87c/0xa90 (unreliable)
[c00000000a4f3780] [c000000000e1c034] do_page_fault+0x34/0x90
[c00000000a4f37b0] [c000000000008908] data_access_common_virt+0x158/0x1b0
--- interrupt: 300 at __copy_tofrom_user_base+0x9c/0x5a4
NIP: c00000000009b028 LR: c000000000802978 CTR: 0000000000000800
REGS: c00000000a4f3820 TRAP: 0300
MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 24004840 XER: 00000000
CFAR: c00000000009aff4 DAR: 00007fffb2a90038 DSISR: 0a000000 IRQMASK: 0
GPR00: 0000000000000000 c00000000a4f3ac0 c0000000014f0700 00007fffb2a90028
GPR04: c000000008720010 0000000000010000 0000000000000000 0000000000000000
GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000001
GPR12: 0000000000004000 c0000000016e0000 c00c000000021c80 c000000000fd6f60
GPR16: 0000000000000000 c00000000a104698 0000000000000003 c0000000087f0000
GPR20: 0000000000000100 c0000000070330b8 0000000000000000 0000000000000004
GPR24: c00000000a4f3c80 c000000008720000 0000000000010000 0000000000000000
GPR28: 0000000000010000 0000000008720000 0000000000010000 c000000001515b98
NIP [c00000000009b028] __copy_tofrom_user_base+0x9c/0x5a4
LR [c000000000802978] copyout+0x68/0xc0
--- interrupt: 300
[c00000000a4f3af0] [c0000000008074b8] copy_page_to_iter+0x188/0x540
[c00000000a4f3b50] [c00000000035c678] generic_file_buffered_read+0x358/0xd80
[c00000000a4f3c40] [c0000000004c1e90] blkdev_read_iter+0x50/0x80
[c00000000a4f3c60] [c00000000045733c] new_sync_read+0x12c/0x1c0
[c00000000a4f3d00] [c00000000045a1f0] vfs_read+0x1d0/0x240
[c00000000a4f3d50] [c00000000045a7f4] ksys_read+0x84/0x140
[c00000000a4f3da0] [c000000000033a60] system_call_exception+0x100/0x280
[c00000000a4f3e10] [c00000000000c508] system_call_common+0xf8/0x2f8
Instruction dump:
eae10078 3be0000b 4bfff890 60420000 792917e1 4182ff18 3c82ffab 3884a5e0
3c62ffab 3863a6e8 480ba891 60000000 <0fe00000> 3be0000b 4bfff860 e93c0938
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20201107023305.2384874-1-npiggin@gmail.com
2020-11-07 05:33:05 +03:00
2005-10-10 16:29:05 +04:00
/* Remember the interrupted context ' s LR for the duplicate check on the next frame . */
lr = regs - > link ;
powerpc: show registers when unwinding interrupt frames
It's often useful to know the register state for interrupts in
the stack frame. In the below example (with this patch applied),
the important information is the state of the page fault.
A blatant case like this probably rather should have the page
fault regs passed down to the warning, but quite often there are
less obvious cases where an interrupt shows up that might give
some more clues.
The downside is longer and more complex bug output.
Bug: Write fault blocked by AMR!
WARNING: CPU: 0 PID: 72 at arch/powerpc/include/asm/book3s/64/kup-radix.h:164 __do_page_fault+0x880/0xa90
Modules linked in:
CPU: 0 PID: 72 Comm: systemd-gpt-aut Not tainted
NIP: c00000000006e2f0 LR: c00000000006e2ec CTR: 0000000000000000
REGS: c00000000a4f3420 TRAP: 0700
MSR: 8000000000021033 <SF,ME,IR,DR,RI,LE> CR: 28002840 XER: 20040000
CFAR: c000000000128be0 IRQMASK: 3
GPR00: c00000000006e2ec c00000000a4f36c0 c0000000014f0700 0000000000000020
GPR04: 0000000000000001 c000000001290f50 0000000000000001 c000000001290f80
GPR08: c000000001612b08 0000000000000000 0000000000000000 00000000ffffe0f7
GPR12: 0000000048002840 c0000000016e0000 c00c000000021c80 c000000000fd6f60
GPR16: 0000000000000000 c00000000a104698 0000000000000003 c0000000087f0000
GPR20: 0000000000000100 c0000000070330b8 0000000000000000 0000000000000004
GPR24: 0000000002000000 0000000000000300 0000000002000000 c00000000a5b0c00
GPR28: 0000000000000000 000000000a000000 00007fffb2a90038 c00000000a4f3820
NIP [c00000000006e2f0] __do_page_fault+0x880/0xa90
LR [c00000000006e2ec] __do_page_fault+0x87c/0xa90
Call Trace:
[c00000000a4f36c0] [c00000000006e2ec] __do_page_fault+0x87c/0xa90 (unreliable)
[c00000000a4f3780] [c000000000e1c034] do_page_fault+0x34/0x90
[c00000000a4f37b0] [c000000000008908] data_access_common_virt+0x158/0x1b0
--- interrupt: 300 at __copy_tofrom_user_base+0x9c/0x5a4
NIP: c00000000009b028 LR: c000000000802978 CTR: 0000000000000800
REGS: c00000000a4f3820 TRAP: 0300
MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 24004840 XER: 00000000
CFAR: c00000000009aff4 DAR: 00007fffb2a90038 DSISR: 0a000000 IRQMASK: 0
GPR00: 0000000000000000 c00000000a4f3ac0 c0000000014f0700 00007fffb2a90028
GPR04: c000000008720010 0000000000010000 0000000000000000 0000000000000000
GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000001
GPR12: 0000000000004000 c0000000016e0000 c00c000000021c80 c000000000fd6f60
GPR16: 0000000000000000 c00000000a104698 0000000000000003 c0000000087f0000
GPR20: 0000000000000100 c0000000070330b8 0000000000000000 0000000000000004
GPR24: c00000000a4f3c80 c000000008720000 0000000000010000 0000000000000000
GPR28: 0000000000010000 0000000008720000 0000000000010000 c000000001515b98
NIP [c00000000009b028] __copy_tofrom_user_base+0x9c/0x5a4
LR [c000000000802978] copyout+0x68/0xc0
--- interrupt: 300
[c00000000a4f3af0] [c0000000008074b8] copy_page_to_iter+0x188/0x540
[c00000000a4f3b50] [c00000000035c678] generic_file_buffered_read+0x358/0xd80
[c00000000a4f3c40] [c0000000004c1e90] blkdev_read_iter+0x50/0x80
[c00000000a4f3c60] [c00000000045733c] new_sync_read+0x12c/0x1c0
[c00000000a4f3d00] [c00000000045a1f0] vfs_read+0x1d0/0x240
[c00000000a4f3d50] [c00000000045a7f4] ksys_read+0x84/0x140
[c00000000a4f3da0] [c000000000033a60] system_call_exception+0x100/0x280
[c00000000a4f3e10] [c00000000000c508] system_call_common+0xf8/0x2f8
Instruction dump:
eae10078 3be0000b 4bfff890 60420000 792917e1 4182ff18 3c82ffab 3884a5e0
3c62ffab 3863a6e8 480ba891 60000000 <0fe00000> 3be0000b 4bfff860 e93c0938
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20201107023305.2384874-1-npiggin@gmail.com
2020-11-07 05:33:05 +03:00
printk ( " %s--- interrupt: %lx at %pS \n " ,
loglvl , regs - > trap , ( void * ) regs - > nip ) ;
__show_regs ( regs ) ;
printk ( " %s--- interrupt: %lx \n " ,
loglvl , regs - > trap ) ;
2005-10-10 16:29:05 +04:00
/* The frame following an interrupt frame is treated as a first frame again . */
firstframe = 1 ;
}
sp = newsp ;
} while ( count + + < kstack_depth_to_print ) ;
2019-01-31 13:08:52 +03:00
put_task_stack ( tsk ) ;
2005-10-10 16:29:05 +04:00
}
2006-02-13 06:48:35 +03:00
# ifdef CONFIG_PPC64
2012-03-01 05:45:27 +04:00
/* Called with hard IRQs off */
2013-06-13 15:04:56 +04:00
void notrace __ppc64_runlatch_on ( void )
2006-02-13 06:48:35 +03:00
{
2012-03-01 05:45:27 +04:00
struct thread_info * ti = current_thread_info ( ) ;
2006-02-13 06:48:35 +03:00
2017-08-11 19:39:07 +03:00
if ( cpu_has_feature ( CPU_FTR_ARCH_206 ) ) {
/*
* Least significant bit ( RUN ) is the only writable bit of
* the CTRL register , so we can avoid mfspr . 2.06 is not the
* earliest ISA where this is the case , but it ' s convenient .
*/
mtspr ( SPRN_CTRLT , CTRL_RUNLATCH ) ;
} else {
unsigned long ctrl ;
/*
* Some architectures ( e . g . , Cell ) have writable fields other
* than RUN , so do the read - modify - write .
*/
ctrl = mfspr ( SPRN_CTRLF ) ;
ctrl | = CTRL_RUNLATCH ;
mtspr ( SPRN_CTRLT , ctrl ) ;
}
2006-02-13 06:48:35 +03:00
2012-04-11 04:42:15 +04:00
ti - > local_flags | = _TLF_RUNLATCH ;
2006-02-13 06:48:35 +03:00
}
2012-03-01 05:45:27 +04:00
/* Called with hard IRQs off */
2013-06-13 15:04:56 +04:00
void notrace __ppc64_runlatch_off ( void )
2006-02-13 06:48:35 +03:00
{
2012-03-01 05:45:27 +04:00
struct thread_info * ti = current_thread_info ( ) ;
2006-02-13 06:48:35 +03:00
2012-04-11 04:42:15 +04:00
ti - > local_flags & = ~ _TLF_RUNLATCH ;
2006-02-13 06:48:35 +03:00
2017-08-11 19:39:07 +03:00
if ( cpu_has_feature ( CPU_FTR_ARCH_206 ) ) {
mtspr ( SPRN_CTRLT , 0 ) ;
} else {
unsigned long ctrl ;
ctrl = mfspr ( SPRN_CTRLF ) ;
ctrl & = ~ CTRL_RUNLATCH ;
mtspr ( SPRN_CTRLT , ctrl ) ;
}
2006-02-13 06:48:35 +03:00
}
2012-03-01 05:45:27 +04:00
# endif /* CONFIG_PPC64 */
2008-04-18 10:56:17 +04:00
2009-02-22 04:50:03 +03:00
unsigned long arch_align_stack ( unsigned long sp )
{
if ( ! ( current - > personality & ADDR_NO_RANDOMIZE ) & & randomize_va_space )
2022-10-10 05:44:02 +03:00
sp - = get_random_u32_below ( PAGE_SIZE ) ;
2009-02-22 04:50:03 +03:00
return sp & ~ 0xf ;
}