crypto: arm64/aes-neonbs-xts - use plain NEON for non-power-of-2 input sizes

Even though the kernel's implementations of AES-XTS were updated to
implement ciphertext stealing and can operate on inputs of any size
larger than or equal to the AES block size, this feature is rarely used
in practice.

In fact, in the kernel, AES-XTS is only used to operate on 4096 or 512
byte blocks, which means that not only is the ciphertext stealing
effectively dead code, but the logic in the bit-sliced NEON
implementation to deal with fewer than 8 blocks at a time is also never
used.

Since the bit-sliced NEON driver already depends on the plain NEON
version, which is slower but can operate on smaller data quantities more
straightforwardly, let's fall back to the plain NEON implementation of
XTS for any residual inputs that are not multiples of 128 bytes. This
allows us to remove a lot of complicated logic that rarely gets
exercised in practice.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Ard Biesheuvel 2022-01-27 12:35:45 +01:00 committed by Herbert Xu
parent fc074e1300
commit dfc6031ec9
2 changed files with 57 additions and 108 deletions

View File

@ -735,119 +735,67 @@ SYM_FUNC_END(aesbs_cbc_decrypt)
* int blocks, u8 iv[]) * int blocks, u8 iv[])
*/ */
SYM_FUNC_START_LOCAL(__xts_crypt8) SYM_FUNC_START_LOCAL(__xts_crypt8)
mov x6, #1 movi v18.2s, #0x1
lsl x6, x6, x23 movi v19.2s, #0x87
subs w23, w23, #8 uzp1 v18.4s, v18.4s, v19.4s
csel x23, x23, xzr, pl
csel x6, x6, xzr, mi ld1 {v0.16b-v3.16b}, [x1], #64
ld1 {v4.16b-v7.16b}, [x1], #64
next_tweak v26, v25, v18, v19
next_tweak v27, v26, v18, v19
next_tweak v28, v27, v18, v19
next_tweak v29, v28, v18, v19
next_tweak v30, v29, v18, v19
next_tweak v31, v30, v18, v19
next_tweak v16, v31, v18, v19
next_tweak v17, v16, v18, v19
ld1 {v0.16b}, [x20], #16
next_tweak v26, v25, v30, v31
eor v0.16b, v0.16b, v25.16b eor v0.16b, v0.16b, v25.16b
tbnz x6, #1, 0f
ld1 {v1.16b}, [x20], #16
next_tweak v27, v26, v30, v31
eor v1.16b, v1.16b, v26.16b eor v1.16b, v1.16b, v26.16b
tbnz x6, #2, 0f
ld1 {v2.16b}, [x20], #16
next_tweak v28, v27, v30, v31
eor v2.16b, v2.16b, v27.16b eor v2.16b, v2.16b, v27.16b
tbnz x6, #3, 0f
ld1 {v3.16b}, [x20], #16
next_tweak v29, v28, v30, v31
eor v3.16b, v3.16b, v28.16b eor v3.16b, v3.16b, v28.16b
tbnz x6, #4, 0f
ld1 {v4.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset]
eor v4.16b, v4.16b, v29.16b eor v4.16b, v4.16b, v29.16b
next_tweak v29, v29, v30, v31 eor v5.16b, v5.16b, v30.16b
tbnz x6, #5, 0f eor v6.16b, v6.16b, v31.16b
eor v7.16b, v7.16b, v16.16b
ld1 {v5.16b}, [x20], #16 stp q16, q17, [sp, #16]
str q29, [sp, #.Lframe_local_offset + 16]
eor v5.16b, v5.16b, v29.16b
next_tweak v29, v29, v30, v31
tbnz x6, #6, 0f
ld1 {v6.16b}, [x20], #16 mov bskey, x2
str q29, [sp, #.Lframe_local_offset + 32] mov rounds, x3
eor v6.16b, v6.16b, v29.16b
next_tweak v29, v29, v30, v31
tbnz x6, #7, 0f
ld1 {v7.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset + 48]
eor v7.16b, v7.16b, v29.16b
next_tweak v29, v29, v30, v31
0: mov bskey, x21
mov rounds, x22
br x16 br x16
SYM_FUNC_END(__xts_crypt8) SYM_FUNC_END(__xts_crypt8)
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
frame_push 6, 64 stp x29, x30, [sp, #-48]!
mov x29, sp
mov x19, x0 ld1 {v25.16b}, [x5]
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
movi v30.2s, #0x1 0: adr x16, \do8
movi v25.2s, #0x87
uzp1 v30.4s, v30.4s, v25.4s
ld1 {v25.16b}, [x24]
99: adr x16, \do8
bl __xts_crypt8 bl __xts_crypt8
ldp q16, q17, [sp, #.Lframe_local_offset] eor v16.16b, \o0\().16b, v25.16b
ldp q18, q19, [sp, #.Lframe_local_offset + 32] eor v17.16b, \o1\().16b, v26.16b
eor v18.16b, \o2\().16b, v27.16b
eor v19.16b, \o3\().16b, v28.16b
eor \o0\().16b, \o0\().16b, v25.16b ldp q24, q25, [sp, #16]
eor \o1\().16b, \o1\().16b, v26.16b
eor \o2\().16b, \o2\().16b, v27.16b
eor \o3\().16b, \o3\().16b, v28.16b
st1 {\o0\().16b}, [x19], #16 eor v20.16b, \o4\().16b, v29.16b
mov v25.16b, v26.16b eor v21.16b, \o5\().16b, v30.16b
tbnz x6, #1, 1f eor v22.16b, \o6\().16b, v31.16b
st1 {\o1\().16b}, [x19], #16 eor v23.16b, \o7\().16b, v24.16b
mov v25.16b, v27.16b
tbnz x6, #2, 1f
st1 {\o2\().16b}, [x19], #16
mov v25.16b, v28.16b
tbnz x6, #3, 1f
st1 {\o3\().16b}, [x19], #16
mov v25.16b, v29.16b
tbnz x6, #4, 1f
eor \o4\().16b, \o4\().16b, v16.16b st1 {v16.16b-v19.16b}, [x0], #64
eor \o5\().16b, \o5\().16b, v17.16b st1 {v20.16b-v23.16b}, [x0], #64
eor \o6\().16b, \o6\().16b, v18.16b
eor \o7\().16b, \o7\().16b, v19.16b
st1 {\o4\().16b}, [x19], #16 subs x4, x4, #8
tbnz x6, #5, 1f b.gt 0b
st1 {\o5\().16b}, [x19], #16
tbnz x6, #6, 1f
st1 {\o6\().16b}, [x19], #16
tbnz x6, #7, 1f
st1 {\o7\().16b}, [x19], #16
cbz x23, 1f st1 {v25.16b}, [x5]
st1 {v25.16b}, [x24] ldp x29, x30, [sp], #48
b 99b
1: st1 {v25.16b}, [x24]
frame_pop
ret ret
.endm .endm

View File

@ -302,23 +302,18 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
return err; return err;
while (walk.nbytes >= AES_BLOCK_SIZE) { while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7;
if (walk.nbytes < walk.total || walk.nbytes % AES_BLOCK_SIZE)
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
out = walk.dst.virt.addr; out = walk.dst.virt.addr;
in = walk.src.virt.addr; in = walk.src.virt.addr;
nbytes = walk.nbytes; nbytes = walk.nbytes;
kernel_neon_begin(); kernel_neon_begin();
if (likely(blocks > 6)) { /* plain NEON is faster otherwise */ if (blocks >= 8) {
if (first) if (first == 1)
neon_aes_ecb_encrypt(walk.iv, walk.iv, neon_aes_ecb_encrypt(walk.iv, walk.iv,
ctx->twkey, ctx->twkey,
ctx->key.rounds, 1); ctx->key.rounds, 1);
first = 0; first = 2;
fn(out, in, ctx->key.rk, ctx->key.rounds, blocks, fn(out, in, ctx->key.rk, ctx->key.rounds, blocks,
walk.iv); walk.iv);
@ -327,10 +322,17 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
in += blocks * AES_BLOCK_SIZE; in += blocks * AES_BLOCK_SIZE;
nbytes -= blocks * AES_BLOCK_SIZE; nbytes -= blocks * AES_BLOCK_SIZE;
} }
if (walk.nbytes == walk.total && nbytes > 0) {
if (walk.nbytes == walk.total && nbytes > 0) if (encrypt)
goto xts_tail; neon_aes_xts_encrypt(out, in, ctx->cts.key_enc,
ctx->key.rounds, nbytes,
ctx->twkey, walk.iv, first);
else
neon_aes_xts_decrypt(out, in, ctx->cts.key_dec,
ctx->key.rounds, nbytes,
ctx->twkey, walk.iv, first);
nbytes = first = 0;
}
kernel_neon_end(); kernel_neon_end();
err = skcipher_walk_done(&walk, nbytes); err = skcipher_walk_done(&walk, nbytes);
} }
@ -355,13 +357,12 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
nbytes = walk.nbytes; nbytes = walk.nbytes;
kernel_neon_begin(); kernel_neon_begin();
xts_tail:
if (encrypt) if (encrypt)
neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds, neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds,
nbytes, ctx->twkey, walk.iv, first ?: 2); nbytes, ctx->twkey, walk.iv, first);
else else
neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds, neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds,
nbytes, ctx->twkey, walk.iv, first ?: 2); nbytes, ctx->twkey, walk.iv, first);
kernel_neon_end(); kernel_neon_end();
return skcipher_walk_done(&walk, 0); return skcipher_walk_done(&walk, 0);