b575b5a1e6
On 32-bit ARM, AES in GCM mode takes full advantage of the ARMv8 Crypto Extensions when available, resulting in a performance of 6-7 cycles per byte for typical IPsec frames on cores such as Cortex-A53, using the generic GCM template encapsulating the accelerated AES-CTR and GHASH implementations. At such high rates, any time spent copying data or doing other poorly optimized work in the generic layer hurts disproportionately, and we can get a significant performance improvement by combining the optimized AES-CTR and GHASH implementations into a single GCM driver. On Cortex-A53, this results in a performance improvement of around 75%, and AES-256-GCM-128 with RFC4106 encapsulation runs in 4 cycles per byte. Note that this code takes advantage of the fact that kernel mode NEON is now supported in softirq context as well, and therefore does not provide a non-NEON fallback path at all. (AEADs are only callable in process or softirq context) Acked-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
696 lines
14 KiB
ArmAsm
696 lines
14 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */
|
|
|
|
#include <linux/linkage.h>
|
|
#include <asm/assembler.h>
|
|
|
|
/*
 * Target selection and NEON register aliases used by the GHASH code.
 *
 * Each q register overlays a pair of d registers (qN = d(2N):d(2N+1)), so
 * e.g. XL (q2) is XL_L (d4) / XL_H (d5).
 *
 * Several names below deliberately refer to the same physical register.
 * They belong to different code paths (vmull.p8 vs vmull.p64) or to
 * different phases of the same path:
 *   - XH and IN1 are both q4
 *   - t0q-t3q (p8 multiply scratch) are also XL2/XM2/T2/T3 (q5-q8)
 *   - t4q and XH2 are both q9
 *   - s1l-s4h (pre-rotated key bytes for p8) are also HH/HH3/HH4/HH34
 *     (d20-d27 == q10-q13)
 *   - MASK and SHASH2_p8 are both d28
 *   - k16 and SHASH2_H are both d29
 *   - k48 and SHASH2_p64 are both d31
 * Keep this in mind before adding a new user of any of these names.
 */
	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	/* hash key, scratch and the three partial products of a multiply */
	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	/* temporaries of the vmull.p8 based 64x64 multiply (__pmull_p8) */
	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	XH2		.req	q9

	/* SHASH_L/SHASH_H rotated by 1..4 bytes (set up by the p8 entry point) */
	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	/* 16/32/48 bit masks for the p8 multiply */
	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	/* additional key values used by the 4-way aggregated p64 loop */
	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	/* extra input blocks / partial products of the 4-way loop */
	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
|
|
|
|
/*
 * __pmull_p64 - 64x64 -> 128 bit polynomial multiply, single instruction.
 *
 *   rd     - destination q register
 *   rn, rm - source d registers
 *   b1-b4  - accepted but unused, so that ghash_update can invoke
 *            __pmull_p64 and __pmull_p8 with an identical argument list
 */
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
|
|
|
|
/*
 * This implementation of 64x64 -> 128 bit polynomial multiplication
 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
 * "Fast Software Polynomial Multiplication on ARM Processors Using
 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
 *
 * It has been slightly tweaked for in-order performance, and to allow
 * 'rq' to overlap with 'ad' or 'bd'.
 *
 *   rq     - destination q register (written only after the last read of
 *            'ad' and 'bd', apart from the final accumulation)
 *   ad, bd - source d registers
 *   b1-b4  - 'bd' rotated by 1, 2, 3 and 4 bytes. When an argument is left
 *            at its default (t4l/t3l), the rotation is computed here with
 *            vext.8; otherwise the caller supplies a precomputed copy.
 *
 * Requires k16/k32/k48 to hold the 16/32/48 bit masks.
 * Clobbers t0q-t4q.
 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm
|
|
|
|
//
|
|
// PMULL (64x64->128) based reduction for CPUs that can do
|
|
// it in a single instruction.
|
|
//
|
|
.macro __pmull_reduce_p64
|
|
vmull.p64 T1, XL_L, MASK
|
|
|
|
veor XH_L, XH_L, XM_H
|
|
vext.8 T1, T1, T1, #8
|
|
veor XL_H, XL_H, XM_L
|
|
veor T1, T1, XL
|
|
|
|
vmull.p64 XL, T1_H, MASK
|
|
.endm
|
|
|
|
//
|
|
// Alternative reduction for CPUs that lack support for the
|
|
// 64x64->128 PMULL instruction
|
|
//
|
|
.macro __pmull_reduce_p8
|
|
veor XL_H, XL_H, XM_L
|
|
veor XH_L, XH_L, XM_H
|
|
|
|
vshl.i64 T1, XL, #57
|
|
vshl.i64 T2, XL, #62
|
|
veor T1, T1, T2
|
|
vshl.i64 T2, XL, #63
|
|
veor T1, T1, T2
|
|
veor XL_H, XL_H, T1_L
|
|
veor XH_L, XH_L, T1_H
|
|
|
|
vshr.u64 T1, XL, #1
|
|
veor XH, XH, XL
|
|
veor XL, XL, T1
|
|
vshr.u64 T1, T1, #6
|
|
vshr.u64 XL, XL, #1
|
|
.endm
|
|
|
|
/*
 * ghash_update - fold a number of 16-byte blocks into the GHASH digest,
 *                optionally en/decrypting them with AES-CTR on the fly.
 *
 * Macro arguments:
 *   pn        - 'p64' or 'p8': selects __pmull_<pn>, __pmull_reduce_<pn>
 *               and SHASH2_<pn>
 *   enc       - blank for plain GHASH; otherwise a macro name prefix
 *               ('enc' or 'dec') so that <enc>_4x / <enc>_1x are invoked
 *               on the input before it is hashed
 *   aggregate - non-zero: emit the 4 blocks per iteration loop (p64 only)
 *   head      - non-zero: emit code that first processes an optional extra
 *               block whose address is read from [sp]
 *
 * Register usage:
 *   r0   - number of blocks left; the loop ends when it reaches zero
 *   r1   - address of the digest; loaded into XL here, the store back is
 *          left to the user of the macro
 *   r2   - input pointer, post-incremented
 *   r3   - key pointer; only dereferenced here when 'enc' is non-blank
 *   [sp] - head block pointer or NULL (head != 0 only, so the user must
 *          not have moved sp)
 *   ip   - scratch
 *
 * SHASH (and for plain GHASH also SHASH2_<pn>, MASK and the HH* values,
 * or s1l-s4h and k16/k32/k48 for p8) must be set up by the user.
 *
 * Local labels:
 *   0: loop entry        1: 4-way loop          2: single block load
 *   3: single multiply   4: final reduction
 * Labels 3 is also branched to from outside the macro by the GCM
 * finalisation routines, with T1 holding the block, XL the digest and
 * the flags set to 'ne'.
 *
 * Note that the loop control relies on the condition flags surviving the
 * NEON instructions between the flag-setting instruction (teq/subs) and
 * the conditional branch that consumes them.
 */
	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0			// flags for the 'bne 0b' below
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!	// blocks 1 and 2
1:	vld1.8		{T2-T3}, [r2]!		// blocks 3 and 4

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	/*
	 * The AES helper uses q9-q15 (e0-e3, ctr, ek0/ek1), which overlay
	 * HH/HH3/HH4/HH34, SHASH2_H, SHASH2_p64 and MASK, so all of those
	 * have to be reconstructed from the key on every iteration.
	 */
	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	/*
	 * vrev64.8 only reverses the bytes within each 64-bit half, which
	 * is why the _L/_H halves of the data are paired crosswise with
	 * the halves of the key in the multiplications below.
	 */
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4		// flags consumed by 'beq 4f'

	/* fold the running digest into the first block */
	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3			// T1 = block 4
	vrev64.8	T3, T2			// T3 = block 3

	/* (digest ^ block 1) * HH4 */
	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	/* block 2 * HH3 */
	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	/* block 3 * HH */
	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	/* block 4 * SHASH */
	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f			// last group: reduce and exit

	/* start loading the next group before reducing this one */
	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	/* d28-d31 were clobbered by the AES round keys (ek0/ek1) */
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1		// flags consumed by 'bne 0b'

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm
|
|
|
|
/*
 * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
 *			       struct ghash_key const *k, const char *head)
 *
 * GHASH update using vmull.p64. pmull_ghash_update_p8 takes the same
 * arguments.
 *
 * r0   = blocks  number of 16-byte blocks at 'src'
 * r1   = dg      digest, updated in place
 * r2   = src     input blocks
 * r3   = k       key material: four consecutive 16-byte values, loaded
 *                into SHASH, HH, HH3 and HH4 (presumably H and its 2nd,
 *                3rd and 4th powers - confirm against the C glue code)
 * [sp] = head    optional extra block processed before 'src', or NULL
 *
 * Leaf routine: the stack pointer is not moved, which is what allows the
 * ghash_update macro to fetch 'head' from [sp].
 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	/* XOR of both halves of each key value, for the Karatsuba middle term */
	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	/* reduction constant: 0xe1 << 57 in the 64-bit lane */
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)
|
|
|
|
/*
 * GHASH update for CPUs without vmull.p64, built on the vmull.p8 based
 * multiply (__pmull_p8) and the shift based reduction.
 *
 * Same arguments as pmull_ghash_update_p64:
 *   r0 = blocks, r1 = dg, r2 = src, r3 = key, [sp] = head
 * Only the first 16 bytes of the key are used, and blocks are processed
 * one at a time (the 4-way loop is only emitted for p64).
 */
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	/*
	 * Precompute the byte rotations of both key halves once, so that
	 * __pmull_p8 does not have to redo them for every block.
	 */
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	/* masks used by __pmull_p8 */
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)
|
|
|
|
/*
 * Register aliases for the AES-CTR part of the GCM routines.
 *
 * These overlay registers that also carry GHASH aliases: e0 is q9
 * (t4q/XH2), e1-e3 and ctr are q10-q13 (HH, HH3, HH4, HH34), and
 * ek0/ek1 are q14/q15, i.e. d28-d31 (MASK, SHASH2_H, k32, SHASH2_p64).
 * GHASH values living in q9-q15 therefore do not survive a call to the
 * pmull_aes_encrypt* helpers.
 */
	/* key stream blocks */
	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25

	/* counter block: ctr0 = low half, ctr1 = high half */
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	/* two round keys in flight */
	ek0		.req	q14
	ek1		.req	q15
|
|
|
|
/*
 * round - apply one full AES encryption round (aese followed by aesmc)
 *         with round key 'rk' to every q register listed in 'regs'.
 */
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm
|
|
|
|
/*
 * aes_encrypt - encrypt the blocks held in 'regs' with AES.
 *
 *   rkp    - core register pointing at the round keys; the loads carry a
 *            :128 alignment hint, and the register is advanced as the
 *            keys are consumed
 *   rounds - core register holding the number of rounds: a value below
 *            12 selects AES-128, 12 selects AES-192, anything larger
 *            takes the AES-256 path
 *   regs   - list of q registers to encrypt in place
 *
 * Round keys are loaded alternately into ek0 and ek1 while the other one
 * is in use. The 'beq' relies on the flags set by 'cmp' surviving the
 * NEON instructions in between.
 *
 * Clobbers ek0, ek1, 'rkp' and the condition flags.
 */
	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	/* rounds common to all key sizes */
.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]	// last round key

	/* final round: no aesmc, then XOR with the last round key */
	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm
|
|
|
|
/*
 * pmull_aes_encrypt - produce one block of AES-CTR key stream in e0.
 *
 * File-local helper with a private register convention:
 *   r3 - key structure; the AES round keys are read from offset 64
 *   r5 - address of the 12 byte IV
 *   r6 - number of AES rounds
 *   r7 - block counter in CPU byte order; incremented by one on return
 * Clobbers r8, ip, ctr, ek0, ek1 and the condition flags.
 */
pmull_aes_encrypt:
	/*
	 * Two overlapping 8 byte loads (IV bytes 0-7 and 4-11) fetch the IV
	 * without reading past its 12th byte.
	 */
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7			// counter as big endian
	vext.8		ctr1, ctr1, ctr1, #4	// IV bytes 8-11 into the low word
	add		r7, r7, #1
	vmov.32		ctr1[1], r8		// counter into the last word
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)
|
|
|
|
/*
 * pmull_aes_encrypt_4x - produce four consecutive blocks of AES-CTR key
 *                        stream in e0-e3.
 *
 * Same private register convention as pmull_aes_encrypt:
 *   r3 - key structure; the AES round keys are read from offset 64
 *   r5 - address of the 12 byte IV
 *   r6 - number of AES rounds
 *   r7 - block counter in CPU byte order; incremented by four on return
 * Clobbers r8, ip, ctr, ek0, ek1 and the condition flags.
 */
pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// IV bytes 0-7
	vld1.8		{ctr1}, [ip]		// IV bytes 4-11
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4	// IV bytes 8-11 into the low word
	add		r7, r7, #1

	/*
	 * Only the last word of the counter block changes between blocks;
	 * r8 and ip alternate as the holder of the big endian counter.
	 */
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)
|
|
|
|
/*
 * pmull_aes_encrypt_final - produce the two key stream blocks needed to
 *                           finish a GCM operation.
 *
 *   e0 - key stream for the current counter value (r7), used for the
 *        final partial block
 *   e1 - encryption of the counter block with counter value 1, which is
 *        XORed into the GHASH result to form the tag
 *
 * Inputs: r3 = key structure (round keys at offset 64), r5 = address of
 * the 12 byte IV, r6 = rounds, r7 = counter in CPU byte order.
 * Clobbers r7, r8, ip, ctr, ek0, ek1 and the condition flags.
 */
pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// IV bytes 0-7
	vld1.8		{ctr1}, [ip]		// IV bytes 4-11
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4	// IV bytes 8-11 into the low word
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)
|
|
|
|
/*
 * enc_1x - encrypt one block. 'in0' is XORed with the key stream in
 *          place, so that it holds the ciphertext for the GHASH step that
 *          follows, and is stored to [r4] (post-incremented).
 *          Contains a 'bl', so lr must have been saved by the user.
 */
	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm
|
|
|
|
/*
 * dec_1x - decrypt one block. The plaintext is produced in e0 and stored
 *          to [r4] (post-incremented); 'in0' is left untouched so that
 *          the GHASH step that follows still sees the ciphertext.
 *          Contains a 'bl', so lr must have been saved by the user.
 */
	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm
|
|
|
|
/*
 * enc_4x - encrypt four blocks. in0-in3 are XORed with the key stream in
 *          place, so that they hold the ciphertext for the GHASH step
 *          that follows, and are stored to [r4] (post-incremented).
 *          Contains a 'bl', so lr must have been saved by the user.
 */
	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm
|
|
|
|
/*
 * dec_4x - decrypt four blocks. The plaintext is produced in e0-e3 and
 *          stored to [r4] (post-incremented); in0-in3 are left untouched
 *          so that the GHASH step that follows still sees the ciphertext.
 *          Contains a 'bl', so lr must have been saved by the user.
 */
	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm
|
|
|
|
/*
 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 *
 * Encrypt 'blocks' 16-byte blocks from 'src' to 'dst' with AES-CTR and
 * fold the resulting ciphertext into the GHASH digest 'dg'.
 *
 * r0-r3 carry blocks, dg, src and k. The four stack arguments are found
 * at sp + 24 after the six register push, and are loaded as
 *   r4 = dst, r5 = iv, r6 = rounds, r7 = counter
 * which is the convention expected by the enc_*x macros and the
 * pmull_aes_encrypt* helpers. r8 is saved because the helpers use it as
 * scratch, lr because the macros contain 'bl'.
 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	/*
	 * Only SHASH (q0) needs loading here: the remaining key values live
	 * in registers clobbered by the AES code, so ghash_update sets them
	 * up itself after each AES call when 'enc' is given.
	 */
	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)
|
|
|
|
/*
 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 *
 * Decrypt 'blocks' 16-byte blocks from 'src' to 'dst' with AES-CTR and
 * fold the ciphertext (i.e. the input) into the GHASH digest 'dg'.
 *
 * r0-r3 carry blocks, dg, src and k. The four stack arguments are found
 * at sp + 24 after the six register push, and are loaded as
 *   r4 = dst, r5 = iv, r6 = rounds, r7 = counter
 * which is the convention expected by the dec_*x macros and the
 * pmull_aes_encrypt* helpers. r8 is saved because the helpers use it as
 * scratch, lr because the macros contain 'bl'.
 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	/*
	 * Only SHASH (q0) needs loading here: the remaining key values live
	 * in registers clobbered by the AES code, so ghash_update sets them
	 * up itself after each AES call when 'enc' is given.
	 */
	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)
|
|
|
|
/*
 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
 *			    struct gcm_key const *k, char *head,
 *			    char *iv, int rounds, u32 counter)
 *
 * Finish a GCM encryption: encrypt and hash the final partial block of
 * 'bytes' bytes (if bytes != 0), hash one more 16-byte block read from
 * 'tag' (presumably the GCM lengths block prepared by the caller - confirm
 * against the C glue code) and overwrite 'tag' with the authentication
 * tag.
 *
 * r0 = bytes, r1 = dg, r2 = tag, r3 = k; after the push the stack
 * arguments are loaded as r4 = head, r5 = iv, r6 = rounds, r7 = counter.
 *
 * The partial block is accessed through the 16-byte window that ENDS at
 * head + bytes, so the 16 - bytes bytes preceding 'head' are read and
 * written back unmodified. NOTE(review): this assumes the caller
 * guarantees that window is accessible.
 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final	// e0 = key stream, e1 = tag mask

	/*
	 * The flags set here are consumed by 'beq' right below and again
	 * by 'bne 3f' further down: nothing in between modifies them.
	 */
	cmp		r0, #0
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0		// r8 = .Lpermute + bytes
	add		ip, ip, #32
	add		r4, r4, r0		// r4 = head + bytes - 16
	sub		ip, ip, r0		// ip = .Lpermute + 32 - bytes

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	/*
	 * Move the first 'bytes' bytes of key stream to the end of the
	 * vector and zero the rest, matching the layout of the window.
	 */
	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	/* T1 = ciphertext tail moved to the front, zero padded */
	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]		// digest; the load at the top of
						// ghash_update is skipped by 'bne 3f'
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1			// one block left to read from [r2]
	/*
	 * '3' is the single block multiply label inside the ghash_update
	 * expansion that follows. Entering there with the flags still 'ne'
	 * hashes T1 first and then loops once more for the block at [r2].
	 */
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	/* convert the digest to its memory byte order and apply the mask */
	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)
|
|
|
|
/*
 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
 *			   struct gcm_key const *k, char *head,
 *			   char *iv, int rounds, u32 counter,
 *			   const char *otag, int authsize)
 *
 * Finish a GCM decryption: hash and decrypt the final partial block of
 * 'bytes' bytes (if bytes != 0), hash one more 16-byte block read from
 * 'tag' (presumably the GCM lengths block prepared by the caller - confirm
 * against the C glue code), and compare the first 'authsize' bytes of the
 * computed tag with 'otag'.
 *
 * Returns zero in r0 when the tags match and a non-zero value otherwise.
 *
 * r0 = bytes, r1 = dg, r2 = tag, r3 = k; after the push the stack
 * arguments are loaded as r4 = head, r5 = iv, r6 = rounds, r7 = counter.
 * otag and authsize are fetched from sp + 40 later on.
 *
 * The partial block is accessed through the 16-byte window that ENDS at
 * head + bytes, so the 16 - bytes bytes preceding 'head' are read and
 * written back unmodified. NOTE(review): this assumes the caller
 * guarantees that window is accessible.
 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final	// e0 = key stream, e1 = tag mask

	/*
	 * The flags set here are consumed by 'beq' right below and again
	 * by 'bne 3f' further down: nothing in between modifies them.
	 */
	cmp		r0, #0
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0		// r8 = .Lpermute + bytes
	add		ip, ip, #32
	add		r4, r4, r0		// r4 = head + bytes - 16
	sub		ip, ip, r0		// ip = .Lpermute + 32 - bytes

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	/*
	 * Move the first 'bytes' bytes of key stream to the end of the
	 * vector and zero the rest, matching the layout of the window.
	 */
	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// window holding the ciphertext tail

	/*
	 * GHASH covers the ciphertext, so extract the hash input (tail
	 * moved to the front, zero padded) before decrypting.
	 */
	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3		// decrypt tail block
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]		// digest; the load at the top of
						// ghash_update is skipped by 'bne 3f'
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1			// one block left to read from [r2]
	/*
	 * '3' is the single block multiply label inside the ghash_update
	 * expansion that follows. Entering there with the flags still 'ne'
	 * hashes T1 first and then loops once more for the block at [r2].
	 */
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	/* convert the digest to its memory byte order and apply the mask */
	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	/*
	 * The vector at .Lpermute + authsize selects comparison bytes
	 * 0 .. authsize - 1 and yields zero for all other positions.
	 */
	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	/*
	 * Two pairwise minimum steps leave four bytes in the low word,
	 * each being the minimum over four of the sixteen input bytes.
	 */
	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)
|
|
|
|
/*
 * Permute table for vtbl.8: the identity permutation 0x00-0x0f with
 * sixteen 0xff bytes on either side. Loading 16 bytes at an offset of
 * 0..32 into the table gives an index vector that shifts a 16-byte value
 * towards one end; an index of 0xff is out of range for a single q
 * register table and therefore produces a zero byte.
 */
	.section	".rodata", "a", %progbits
	.align		5
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|