arm64: fpsimd: run kernel mode NEON with softirqs disabled

Kernel mode NEON can be used in task or softirq context, but only in
a non-nesting manner, i.e., softirq context is only permitted if the
interrupt was not taken at a point where the kernel was using the NEON
in task context.

This means all users of kernel mode NEON have to be aware of this
limitation, and either need to provide scalar fallbacks that may be much
slower (up to 20x for AES instructions) and potentially less safe, or
use an asynchronous interface that defers processing to a later time
when the NEON is guaranteed to be available.

Given that grabbing and releasing the NEON is cheap, we can relax this
restriction, by increasing the granularity of kernel mode NEON code, and
always disabling softirq processing while the NEON is being used in task
context.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20210302090118.30666-4-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Author: Ard Biesheuvel, 2021-03-02 10:01:12 +01:00 (committed by Catalin Marinas)
commit 13150149aa
parent 4c4dcd3541
8 changed files with 31 additions and 15 deletions
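
For context, kernel mode NEON users bracket their SIMD code with
kernel_neon_begin()/kernel_neon_end(), and callers that may run with the
NEON unavailable check may_use_simd() first. A minimal sketch of the
dual-path pattern this change helps retire (the xor_blocks_* helpers are
hypothetical; kernel_neon_begin()/kernel_neon_end() and may_use_simd()
are the real arm64 APIs):

#include <linux/types.h>
#include <asm/neon.h>		/* kernel_neon_begin()/kernel_neon_end() */
#include <asm/simd.h>		/* may_use_simd() */

/* hypothetical NEON and scalar implementations of the same transform */
void xor_blocks_neon(u8 *dst, const u8 *src, int blocks);
void xor_blocks_scalar(u8 *dst, const u8 *src, int blocks);

static void xor_blocks(u8 *dst, const u8 *src, int blocks)
{
	if (!may_use_simd()) {
		/* scalar fallback: much slower for AES-class workloads */
		xor_blocks_scalar(dst, src, blocks);
		return;
	}

	kernel_neon_begin();	/* after this patch: also masks softirqs */
	xor_blocks_neon(dst, src, blocks);
	kernel_neon_end();
}

With softirqs masked across task-context NEON regions, a softirq can no
longer fire while a task has the NEON in use, so softirq-context callers
always find the NEON available.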

arch/arm64/crypto/aes-modes.S

@@ -700,7 +700,7 @@ AES_FUNC_START(aes_mac_update)
 	cbz		w5, .Lmacout
 	encrypt_block	v0, w2, x1, x7, w8
 	st1		{v0.16b}, [x4]		/* return dg */
-	cond_yield	.Lmacout, x7
+	cond_yield	.Lmacout, x7, x8
 	b		.Lmacloop4x
 .Lmac1x:
 	add		w3, w3, #4

arch/arm64/crypto/sha1-ce-core.S

@@ -121,7 +121,7 @@ CPU_LE(	rev32	v11.16b, v11.16b	)
 	add		dgav.4s, dgav.4s, dg0v.4s
 	cbz		w2, 2f
-	cond_yield	3f, x5
+	cond_yield	3f, x5, x6
 	b		0b
 	/*

arch/arm64/crypto/sha2-ce-core.S

@@ -129,7 +129,7 @@ CPU_LE(	rev32	v19.16b, v19.16b	)
 	/* handled all input blocks? */
 	cbz		w2, 2f
-	cond_yield	3f, x5
+	cond_yield	3f, x5, x6
 	b		0b
 	/*

arch/arm64/crypto/sha3-ce-core.S

@@ -184,11 +184,11 @@ SYM_FUNC_START(sha3_ce_transform)
 	eor	 v0.16b,  v0.16b, v31.16b
 	cbnz	w8, 3b
-	cond_yield	3f, x8
+	cond_yield	4f, x8, x9
 	cbnz	w2, 0b
 	/* save state */
-3:	st1	{ v0.1d- v3.1d}, [x0], #32
+4:	st1	{ v0.1d- v3.1d}, [x0], #32
 	st1	{ v4.1d- v7.1d}, [x0], #32
 	st1	{ v8.1d-v11.1d}, [x0], #32
 	st1	{v12.1d-v15.1d}, [x0], #32

arch/arm64/crypto/sha512-ce-core.S

@@ -195,7 +195,7 @@ CPU_LE(	rev64	v19.16b, v19.16b	)
 	add		v10.2d, v10.2d, v2.2d
 	add		v11.2d, v11.2d, v3.2d
-	cond_yield	3f, x4
+	cond_yield	3f, x4, x5
 	/* handled all input blocks? */
 	cbnz		w2, 0b

arch/arm64/include/asm/assembler.h

@@ -15,6 +15,7 @@
 #include <asm-generic/export.h>
 #include <asm/asm-offsets.h>
+#include <asm/alternative.h>
 #include <asm/cpufeature.h>
 #include <asm/cputype.h>
 #include <asm/debug-monitors.h>
@@ -701,19 +702,32 @@ USER(\label, ic	ivau, \tmp2)	// invalidate I line PoU
 	.endm

 /*
- * Check whether preempt-disabled code should yield as soon as it
- * is able. This is the case if re-enabling preemption a single
- * time results in a preempt count of zero, and the TIF_NEED_RESCHED
- * flag is set. (Note that the latter is stored negated in the
- * top word of the thread_info::preempt_count field)
+ * Check whether preempt/bh-disabled asm code should yield as soon as
+ * it is able. This is the case if we are currently running in task
+ * context, and either a softirq is pending, or the TIF_NEED_RESCHED
+ * flag is set and re-enabling preemption a single time would result in
+ * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
+ * stored negated in the top word of the thread_info::preempt_count
+ * field)
  */
-	.macro		cond_yield, lbl:req, tmp:req
-#ifdef CONFIG_PREEMPTION
+	.macro		cond_yield, lbl:req, tmp:req, tmp2:req
 	get_current_task \tmp
 	ldr		\tmp, [\tmp, #TSK_TI_PREEMPT]
+	/*
+	 * If we are serving a softirq, there is no point in yielding: the
+	 * softirq will not be preempted no matter what we do, so we should
+	 * run to completion as quickly as we can.
+	 */
+	tbnz		\tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@
+#ifdef CONFIG_PREEMPTION
 	sub		\tmp, \tmp, #PREEMPT_DISABLE_OFFSET
 	cbz		\tmp, \lbl
 #endif
+	adr_l		\tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
+	this_cpu_offset \tmp2
+	ldr		w\tmp, [\tmp, \tmp2]
+	cbnz		w\tmp, \lbl	// yield on pending softirq in task context
+.Lnoyield_\@:
 	.endm

 /*
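
The decision the macro encodes can be rendered in C roughly as follows;
this is an illustrative sketch only, with ti_preempt standing for the
64-bit thread_info::preempt_count word loaded above (preempt count in
the low word, negated TIF_NEED_RESCHED flag in the high word):

#include <linux/bits.h>
#include <linux/preempt.h>	/* SOFTIRQ_SHIFT, PREEMPT_DISABLE_OFFSET */

static bool should_yield(u64 ti_preempt, u32 softirq_pending)
{
	/*
	 * Bit SOFTIRQ_SHIFT is set while serving a softirq; yielding
	 * would not preempt the softirq, so run to completion instead.
	 */
	if (ti_preempt & BIT(SOFTIRQ_SHIFT))
		return false;

	/*
	 * Because TIF_NEED_RESCHED is stored negated in the top word,
	 * a single 64-bit compare checks both "preempt count would hit
	 * zero" and "reschedule requested" at once.
	 */
	if (IS_ENABLED(CONFIG_PREEMPTION) &&
	    ti_preempt == PREEMPT_DISABLE_OFFSET)
		return true;

	/* running in task context with a softirq pending: yield */
	return softirq_pending != 0;
}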

arch/arm64/kernel/asm-offsets.c

@@ -95,6 +95,8 @@ int main(void)
   DEFINE(DMA_FROM_DEVICE,	DMA_FROM_DEVICE);
   BLANK();
   DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
+  DEFINE(SOFTIRQ_SHIFT,	SOFTIRQ_SHIFT);
+  DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
   BLANK();
   DEFINE(CPU_BOOT_STACK,	offsetof(struct secondary_data, stack));
   DEFINE(CPU_BOOT_TASK,	offsetof(struct secondary_data, task));
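
asm-offsets.c exists to export C-level constants to assembly: each
DEFINE() emits a #define into the generated asm-offsets header, which is
what lets the cond_yield macro above reference SOFTIRQ_SHIFT and the
offset of __softirq_pending by name. The generated output looks roughly
like this (an excerpt; the values shown are illustrative, not taken from
this commit):

/* include/generated/asm-offsets.h (excerpt, illustrative values) */
#define SOFTIRQ_SHIFT 8 /* SOFTIRQ_SHIFT */
#define IRQ_CPUSTAT_SOFTIRQ_PENDING 0 /* offsetof(irq_cpustat_t, __softirq_pending) */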

arch/arm64/kernel/fpsimd.c

@@ -180,7 +180,7 @@ static void __get_cpu_fpsimd_context(void)
  */
 static void get_cpu_fpsimd_context(void)
 {
-	preempt_disable();
+	local_bh_disable();
 	__get_cpu_fpsimd_context();
 }

@@ -201,7 +201,7 @@ static void __put_cpu_fpsimd_context(void)
 static void put_cpu_fpsimd_context(void)
 {
 	__put_cpu_fpsimd_context();
-	preempt_enable();
+	local_bh_enable();
 }

 static bool have_cpu_fpsimd_context(void)
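
Swapping preempt_disable() for local_bh_disable() strengthens the
exclusion: on non-PREEMPT_RT kernels, disabling bottom halves also
disables preemption, so a single call keeps both a preempting task and a
local softirq away from the CPU's FPSIMD state. A sketch of the
resulting critical-section shape (illustrative; the real helpers above
additionally track a per-CPU busy flag via __get_cpu_fpsimd_context()):

/* sketch: the critical-section shape after this change */
static void with_cpu_fpsimd_state(void (*fn)(void *), void *arg)
{
	local_bh_disable();	/* excludes softirqs and, with them, preemption */
	fn(arg);		/* may safely read/clobber FPSIMD/NEON registers */
	local_bh_enable();
}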