13150149aa
Kernel mode NEON can be used in task or softirq context, but only in a non-nesting manner, i.e., softirq context is only permitted if the interrupt was not taken at a point where the kernel was using the NEON in task context. This means all users of kernel mode NEON have to be aware of this limitation, and either need to provide scalar fallbacks that may be much slower (up to 20x for AES instructions) and potentially less safe, or use an asynchronous interface that defers processing to a later time when the NEON is guaranteed to be available. Given that grabbing and releasing the NEON is cheap, we can relax this restriction, by increasing the granularity of kernel mode NEON code, and always disabling softirq processing while the NEON is being used in task context. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Acked-by: Will Deacon <will@kernel.org> Link: https://lore.kernel.org/r/20210302090118.30666-4-ardb@kernel.org Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
151 lines
3.1 KiB
ArmAsm
151 lines
3.1 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
* sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
|
|
*
|
|
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
|
|
*/
|
|
|
|
#include <linux/linkage.h>
|
|
#include <asm/assembler.h>
|
|
|
|
.text
|
|
.arch armv8-a+crypto
|
|
|
|
k0 .req v0
|
|
k1 .req v1
|
|
k2 .req v2
|
|
k3 .req v3
|
|
|
|
t0 .req v4
|
|
t1 .req v5
|
|
|
|
dga .req q6
|
|
dgav .req v6
|
|
dgb .req s7
|
|
dgbv .req v7
|
|
|
|
dg0q .req q12
|
|
dg0s .req s12
|
|
dg0v .req v12
|
|
dg1s .req s13
|
|
dg1v .req v13
|
|
dg2s .req s14
|
|
|
|
.macro add_only, op, ev, rc, s0, dg1
|
|
.ifc \ev, ev
|
|
add t1.4s, v\s0\().4s, \rc\().4s
|
|
sha1h dg2s, dg0s
|
|
.ifnb \dg1
|
|
sha1\op dg0q, \dg1, t0.4s
|
|
.else
|
|
sha1\op dg0q, dg1s, t0.4s
|
|
.endif
|
|
.else
|
|
.ifnb \s0
|
|
add t0.4s, v\s0\().4s, \rc\().4s
|
|
.endif
|
|
sha1h dg1s, dg0s
|
|
sha1\op dg0q, dg2s, t1.4s
|
|
.endif
|
|
.endm
|
|
|
|
.macro add_update, op, ev, rc, s0, s1, s2, s3, dg1
|
|
sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s
|
|
add_only \op, \ev, \rc, \s1, \dg1
|
|
sha1su1 v\s0\().4s, v\s3\().4s
|
|
.endm
|
|
|
|
.macro loadrc, k, val, tmp
|
|
movz \tmp, :abs_g0_nc:\val
|
|
movk \tmp, :abs_g1:\val
|
|
dup \k, \tmp
|
|
.endm
|
|
|
|
/*
|
|
* int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
|
|
* int blocks)
|
|
*/
|
|
SYM_FUNC_START(sha1_ce_transform)
|
|
/* load round constants */
|
|
loadrc k0.4s, 0x5a827999, w6
|
|
loadrc k1.4s, 0x6ed9eba1, w6
|
|
loadrc k2.4s, 0x8f1bbcdc, w6
|
|
loadrc k3.4s, 0xca62c1d6, w6
|
|
|
|
/* load state */
|
|
ld1 {dgav.4s}, [x0]
|
|
ldr dgb, [x0, #16]
|
|
|
|
/* load sha1_ce_state::finalize */
|
|
ldr_l w4, sha1_ce_offsetof_finalize, x4
|
|
ldr w4, [x0, x4]
|
|
|
|
/* load input */
|
|
0: ld1 {v8.4s-v11.4s}, [x1], #64
|
|
sub w2, w2, #1
|
|
|
|
CPU_LE( rev32 v8.16b, v8.16b )
|
|
CPU_LE( rev32 v9.16b, v9.16b )
|
|
CPU_LE( rev32 v10.16b, v10.16b )
|
|
CPU_LE( rev32 v11.16b, v11.16b )
|
|
|
|
1: add t0.4s, v8.4s, k0.4s
|
|
mov dg0v.16b, dgav.16b
|
|
|
|
add_update c, ev, k0, 8, 9, 10, 11, dgb
|
|
add_update c, od, k0, 9, 10, 11, 8
|
|
add_update c, ev, k0, 10, 11, 8, 9
|
|
add_update c, od, k0, 11, 8, 9, 10
|
|
add_update c, ev, k1, 8, 9, 10, 11
|
|
|
|
add_update p, od, k1, 9, 10, 11, 8
|
|
add_update p, ev, k1, 10, 11, 8, 9
|
|
add_update p, od, k1, 11, 8, 9, 10
|
|
add_update p, ev, k1, 8, 9, 10, 11
|
|
add_update p, od, k2, 9, 10, 11, 8
|
|
|
|
add_update m, ev, k2, 10, 11, 8, 9
|
|
add_update m, od, k2, 11, 8, 9, 10
|
|
add_update m, ev, k2, 8, 9, 10, 11
|
|
add_update m, od, k2, 9, 10, 11, 8
|
|
add_update m, ev, k3, 10, 11, 8, 9
|
|
|
|
add_update p, od, k3, 11, 8, 9, 10
|
|
add_only p, ev, k3, 9
|
|
add_only p, od, k3, 10
|
|
add_only p, ev, k3, 11
|
|
add_only p, od
|
|
|
|
/* update state */
|
|
add dgbv.2s, dgbv.2s, dg1v.2s
|
|
add dgav.4s, dgav.4s, dg0v.4s
|
|
|
|
cbz w2, 2f
|
|
cond_yield 3f, x5, x6
|
|
b 0b
|
|
|
|
/*
|
|
* Final block: add padding and total bit count.
|
|
* Skip if the input size was not a round multiple of the block size,
|
|
* the padding is handled by the C code in that case.
|
|
*/
|
|
2: cbz x4, 3f
|
|
ldr_l w4, sha1_ce_offsetof_count, x4
|
|
ldr x4, [x0, x4]
|
|
movi v9.2d, #0
|
|
mov x8, #0x80000000
|
|
movi v10.2d, #0
|
|
ror x7, x4, #29 // ror(lsl(x4, 3), 32)
|
|
fmov d8, x8
|
|
mov x4, #0
|
|
mov v11.d[0], xzr
|
|
mov v11.d[1], x7
|
|
b 1b
|
|
|
|
/* store new state */
|
|
3: st1 {dgav.4s}, [x0]
|
|
str dgb, [x0, #16]
|
|
mov w0, w2
|
|
ret
|
|
SYM_FUNC_END(sha1_ce_transform)
|