crypto: arm64/aes-neonbs-xts - use plain NEON for non-power-of-2 input sizes
Even though the kernel's implementations of AES-XTS were updated to implement ciphertext stealing and can operate on inputs of any size larger than or equal to the AES block size, this feature is rarely used in practice. In fact, in the kernel, AES-XTS is only used to operate on 4096 or 512 byte blocks, which means that not only is the ciphertext stealing effectively dead code, but the logic in the bit-sliced NEON implementation to deal with fewer than 8 blocks at a time is also never used. Since the bit-sliced NEON driver already depends on the plain NEON version, which is slower but can operate on smaller data quantities more straightforwardly, let's fall back to the plain NEON implementation of XTS for any residual inputs that are not multiples of 128 bytes. This allows us to remove a lot of complicated logic that rarely gets exercised in practice. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
fc074e1300
commit
dfc6031ec9
@ -735,119 +735,67 @@ SYM_FUNC_END(aesbs_cbc_decrypt)
|
|||||||
* int blocks, u8 iv[])
|
* int blocks, u8 iv[])
|
||||||
*/
|
*/
|
||||||
SYM_FUNC_START_LOCAL(__xts_crypt8)
|
SYM_FUNC_START_LOCAL(__xts_crypt8)
|
||||||
mov x6, #1
|
movi v18.2s, #0x1
|
||||||
lsl x6, x6, x23
|
movi v19.2s, #0x87
|
||||||
subs w23, w23, #8
|
uzp1 v18.4s, v18.4s, v19.4s
|
||||||
csel x23, x23, xzr, pl
|
|
||||||
csel x6, x6, xzr, mi
|
ld1 {v0.16b-v3.16b}, [x1], #64
|
||||||
|
ld1 {v4.16b-v7.16b}, [x1], #64
|
||||||
|
|
||||||
|
next_tweak v26, v25, v18, v19
|
||||||
|
next_tweak v27, v26, v18, v19
|
||||||
|
next_tweak v28, v27, v18, v19
|
||||||
|
next_tweak v29, v28, v18, v19
|
||||||
|
next_tweak v30, v29, v18, v19
|
||||||
|
next_tweak v31, v30, v18, v19
|
||||||
|
next_tweak v16, v31, v18, v19
|
||||||
|
next_tweak v17, v16, v18, v19
|
||||||
|
|
||||||
ld1 {v0.16b}, [x20], #16
|
|
||||||
next_tweak v26, v25, v30, v31
|
|
||||||
eor v0.16b, v0.16b, v25.16b
|
eor v0.16b, v0.16b, v25.16b
|
||||||
tbnz x6, #1, 0f
|
|
||||||
|
|
||||||
ld1 {v1.16b}, [x20], #16
|
|
||||||
next_tweak v27, v26, v30, v31
|
|
||||||
eor v1.16b, v1.16b, v26.16b
|
eor v1.16b, v1.16b, v26.16b
|
||||||
tbnz x6, #2, 0f
|
|
||||||
|
|
||||||
ld1 {v2.16b}, [x20], #16
|
|
||||||
next_tweak v28, v27, v30, v31
|
|
||||||
eor v2.16b, v2.16b, v27.16b
|
eor v2.16b, v2.16b, v27.16b
|
||||||
tbnz x6, #3, 0f
|
|
||||||
|
|
||||||
ld1 {v3.16b}, [x20], #16
|
|
||||||
next_tweak v29, v28, v30, v31
|
|
||||||
eor v3.16b, v3.16b, v28.16b
|
eor v3.16b, v3.16b, v28.16b
|
||||||
tbnz x6, #4, 0f
|
|
||||||
|
|
||||||
ld1 {v4.16b}, [x20], #16
|
|
||||||
str q29, [sp, #.Lframe_local_offset]
|
|
||||||
eor v4.16b, v4.16b, v29.16b
|
eor v4.16b, v4.16b, v29.16b
|
||||||
next_tweak v29, v29, v30, v31
|
eor v5.16b, v5.16b, v30.16b
|
||||||
tbnz x6, #5, 0f
|
eor v6.16b, v6.16b, v31.16b
|
||||||
|
eor v7.16b, v7.16b, v16.16b
|
||||||
|
|
||||||
ld1 {v5.16b}, [x20], #16
|
stp q16, q17, [sp, #16]
|
||||||
str q29, [sp, #.Lframe_local_offset + 16]
|
|
||||||
eor v5.16b, v5.16b, v29.16b
|
|
||||||
next_tweak v29, v29, v30, v31
|
|
||||||
tbnz x6, #6, 0f
|
|
||||||
|
|
||||||
ld1 {v6.16b}, [x20], #16
|
mov bskey, x2
|
||||||
str q29, [sp, #.Lframe_local_offset + 32]
|
mov rounds, x3
|
||||||
eor v6.16b, v6.16b, v29.16b
|
|
||||||
next_tweak v29, v29, v30, v31
|
|
||||||
tbnz x6, #7, 0f
|
|
||||||
|
|
||||||
ld1 {v7.16b}, [x20], #16
|
|
||||||
str q29, [sp, #.Lframe_local_offset + 48]
|
|
||||||
eor v7.16b, v7.16b, v29.16b
|
|
||||||
next_tweak v29, v29, v30, v31
|
|
||||||
|
|
||||||
0: mov bskey, x21
|
|
||||||
mov rounds, x22
|
|
||||||
br x16
|
br x16
|
||||||
SYM_FUNC_END(__xts_crypt8)
|
SYM_FUNC_END(__xts_crypt8)
|
||||||
|
|
||||||
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
|
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
|
||||||
frame_push 6, 64
|
stp x29, x30, [sp, #-48]!
|
||||||
|
mov x29, sp
|
||||||
|
|
||||||
mov x19, x0
|
ld1 {v25.16b}, [x5]
|
||||||
mov x20, x1
|
|
||||||
mov x21, x2
|
|
||||||
mov x22, x3
|
|
||||||
mov x23, x4
|
|
||||||
mov x24, x5
|
|
||||||
|
|
||||||
movi v30.2s, #0x1
|
0: adr x16, \do8
|
||||||
movi v25.2s, #0x87
|
|
||||||
uzp1 v30.4s, v30.4s, v25.4s
|
|
||||||
ld1 {v25.16b}, [x24]
|
|
||||||
|
|
||||||
99: adr x16, \do8
|
|
||||||
bl __xts_crypt8
|
bl __xts_crypt8
|
||||||
|
|
||||||
ldp q16, q17, [sp, #.Lframe_local_offset]
|
eor v16.16b, \o0\().16b, v25.16b
|
||||||
ldp q18, q19, [sp, #.Lframe_local_offset + 32]
|
eor v17.16b, \o1\().16b, v26.16b
|
||||||
|
eor v18.16b, \o2\().16b, v27.16b
|
||||||
|
eor v19.16b, \o3\().16b, v28.16b
|
||||||
|
|
||||||
eor \o0\().16b, \o0\().16b, v25.16b
|
ldp q24, q25, [sp, #16]
|
||||||
eor \o1\().16b, \o1\().16b, v26.16b
|
|
||||||
eor \o2\().16b, \o2\().16b, v27.16b
|
|
||||||
eor \o3\().16b, \o3\().16b, v28.16b
|
|
||||||
|
|
||||||
st1 {\o0\().16b}, [x19], #16
|
eor v20.16b, \o4\().16b, v29.16b
|
||||||
mov v25.16b, v26.16b
|
eor v21.16b, \o5\().16b, v30.16b
|
||||||
tbnz x6, #1, 1f
|
eor v22.16b, \o6\().16b, v31.16b
|
||||||
st1 {\o1\().16b}, [x19], #16
|
eor v23.16b, \o7\().16b, v24.16b
|
||||||
mov v25.16b, v27.16b
|
|
||||||
tbnz x6, #2, 1f
|
|
||||||
st1 {\o2\().16b}, [x19], #16
|
|
||||||
mov v25.16b, v28.16b
|
|
||||||
tbnz x6, #3, 1f
|
|
||||||
st1 {\o3\().16b}, [x19], #16
|
|
||||||
mov v25.16b, v29.16b
|
|
||||||
tbnz x6, #4, 1f
|
|
||||||
|
|
||||||
eor \o4\().16b, \o4\().16b, v16.16b
|
st1 {v16.16b-v19.16b}, [x0], #64
|
||||||
eor \o5\().16b, \o5\().16b, v17.16b
|
st1 {v20.16b-v23.16b}, [x0], #64
|
||||||
eor \o6\().16b, \o6\().16b, v18.16b
|
|
||||||
eor \o7\().16b, \o7\().16b, v19.16b
|
|
||||||
|
|
||||||
st1 {\o4\().16b}, [x19], #16
|
subs x4, x4, #8
|
||||||
tbnz x6, #5, 1f
|
b.gt 0b
|
||||||
st1 {\o5\().16b}, [x19], #16
|
|
||||||
tbnz x6, #6, 1f
|
|
||||||
st1 {\o6\().16b}, [x19], #16
|
|
||||||
tbnz x6, #7, 1f
|
|
||||||
st1 {\o7\().16b}, [x19], #16
|
|
||||||
|
|
||||||
cbz x23, 1f
|
st1 {v25.16b}, [x5]
|
||||||
st1 {v25.16b}, [x24]
|
ldp x29, x30, [sp], #48
|
||||||
|
|
||||||
b 99b
|
|
||||||
|
|
||||||
1: st1 {v25.16b}, [x24]
|
|
||||||
frame_pop
|
|
||||||
ret
|
ret
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
@ -302,23 +302,18 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
|
|||||||
return err;
|
return err;
|
||||||
|
|
||||||
while (walk.nbytes >= AES_BLOCK_SIZE) {
|
while (walk.nbytes >= AES_BLOCK_SIZE) {
|
||||||
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
|
int blocks = (walk.nbytes / AES_BLOCK_SIZE) & ~7;
|
||||||
|
|
||||||
if (walk.nbytes < walk.total || walk.nbytes % AES_BLOCK_SIZE)
|
|
||||||
blocks = round_down(blocks,
|
|
||||||
walk.stride / AES_BLOCK_SIZE);
|
|
||||||
|
|
||||||
out = walk.dst.virt.addr;
|
out = walk.dst.virt.addr;
|
||||||
in = walk.src.virt.addr;
|
in = walk.src.virt.addr;
|
||||||
nbytes = walk.nbytes;
|
nbytes = walk.nbytes;
|
||||||
|
|
||||||
kernel_neon_begin();
|
kernel_neon_begin();
|
||||||
if (likely(blocks > 6)) { /* plain NEON is faster otherwise */
|
if (blocks >= 8) {
|
||||||
if (first)
|
if (first == 1)
|
||||||
neon_aes_ecb_encrypt(walk.iv, walk.iv,
|
neon_aes_ecb_encrypt(walk.iv, walk.iv,
|
||||||
ctx->twkey,
|
ctx->twkey,
|
||||||
ctx->key.rounds, 1);
|
ctx->key.rounds, 1);
|
||||||
first = 0;
|
first = 2;
|
||||||
|
|
||||||
fn(out, in, ctx->key.rk, ctx->key.rounds, blocks,
|
fn(out, in, ctx->key.rk, ctx->key.rounds, blocks,
|
||||||
walk.iv);
|
walk.iv);
|
||||||
@ -327,10 +322,17 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
|
|||||||
in += blocks * AES_BLOCK_SIZE;
|
in += blocks * AES_BLOCK_SIZE;
|
||||||
nbytes -= blocks * AES_BLOCK_SIZE;
|
nbytes -= blocks * AES_BLOCK_SIZE;
|
||||||
}
|
}
|
||||||
|
if (walk.nbytes == walk.total && nbytes > 0) {
|
||||||
if (walk.nbytes == walk.total && nbytes > 0)
|
if (encrypt)
|
||||||
goto xts_tail;
|
neon_aes_xts_encrypt(out, in, ctx->cts.key_enc,
|
||||||
|
ctx->key.rounds, nbytes,
|
||||||
|
ctx->twkey, walk.iv, first);
|
||||||
|
else
|
||||||
|
neon_aes_xts_decrypt(out, in, ctx->cts.key_dec,
|
||||||
|
ctx->key.rounds, nbytes,
|
||||||
|
ctx->twkey, walk.iv, first);
|
||||||
|
nbytes = first = 0;
|
||||||
|
}
|
||||||
kernel_neon_end();
|
kernel_neon_end();
|
||||||
err = skcipher_walk_done(&walk, nbytes);
|
err = skcipher_walk_done(&walk, nbytes);
|
||||||
}
|
}
|
||||||
@ -355,13 +357,12 @@ static int __xts_crypt(struct skcipher_request *req, bool encrypt,
|
|||||||
nbytes = walk.nbytes;
|
nbytes = walk.nbytes;
|
||||||
|
|
||||||
kernel_neon_begin();
|
kernel_neon_begin();
|
||||||
xts_tail:
|
|
||||||
if (encrypt)
|
if (encrypt)
|
||||||
neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds,
|
neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds,
|
||||||
nbytes, ctx->twkey, walk.iv, first ?: 2);
|
nbytes, ctx->twkey, walk.iv, first);
|
||||||
else
|
else
|
||||||
neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds,
|
neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds,
|
||||||
nbytes, ctx->twkey, walk.iv, first ?: 2);
|
nbytes, ctx->twkey, walk.iv, first);
|
||||||
kernel_neon_end();
|
kernel_neon_end();
|
||||||
|
|
||||||
return skcipher_walk_done(&walk, 0);
|
return skcipher_walk_done(&walk, 0);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user