6b5360a5e0
This patch is a CE-optimized assembly implementation for cmac/xcbc/cbcmac.

Benchmark on T-Head Yitian-710 2.75 GHz. The data comes from mode 300 of tcrypt and compares the performance before and after this patch (the driver used before this patch is XXXmac(sm4-ce)). The columns are blocks of different lengths. The data is tabulated and the unit is Mb/s:

Before:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac(sm4-ce)   |  293.33  403.69  503.76  527.78  531.10  535.46  535.81
xcbc(sm4-ce)   |  292.83  402.50  504.02  529.08  529.87  536.55  538.24
cbcmac(sm4-ce) |  318.42  415.79  497.12  515.05  523.15  521.19  523.01

After:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac-sm4-ce    |  371.99  675.28  903.56  971.65  980.57  990.40  991.04
xcbc-sm4-ce    |  372.11  674.55  903.47  971.61  980.96  990.42  991.10
cbcmac-sm4-ce  |  371.63  675.33  903.23  972.07  981.42  990.93  991.45

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
1094 lines
20 KiB
ArmAsm
1094 lines
20 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
/*
|
|
* SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
|
|
* as specified in
|
|
* https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
|
|
*
|
|
* Copyright (C) 2022, Alibaba Group.
|
|
* Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
|
|
*/
|
|
|
|
#include <linux/linkage.h>
|
|
#include <asm/assembler.h>
|
|
#include "sm4-ce-asm.h"
|
|
|
|
.arch		armv8-a+crypto

/*
 * For every vector register that may be handed to the sm4e/sm4ekey
 * macros below, define an assembler symbol ".LvN.4s" whose value is the
 * register number N.  The macros paste ".L" in front of their operand
 * text (e.g. "v24.4s") to look that number up, so only the registers
 * listed here are accepted as operands.
 */
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/*
 * sm4e vd, vn
 *
 * Emit the raw instruction word 0xcec08400 with the register number of
 * vn placed at bit 5 and that of vd at bit 0.
 * NOTE(review): presumably hand-encoded with .inst so that the file
 * builds with assemblers that lack the SM4 mnemonics -- confirm.
 */
.macro sm4e, vd, vn
	.inst 0xcec08400 | .L\vd | (.L\vn << 5)
.endm

/*
 * sm4ekey vd, vn, vm
 *
 * Emit the raw instruction word 0xce60c800 with the register number of
 * vm placed at bit 16, vn at bit 5 and vd at bit 0.
 */
.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | .L\vd | (.L\vn << 5) | (.L\vm << 16)
.endm
|
|
|
|
/* Register macros */

/* Scratch vectors (used by the XTS tweak computation in this file) */
#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

/*
 * RIV and RMAC are two names for the same register (v20): the chaining
 * value of the IV based modes and the running digest of the MAC code.
 */
#define RIV	v20
#define RMAC	v20
/* Constant used by tweak_next(); built at the top of the XTS routines */
#define RMASK	v21
|
|
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/*
	 * Derive the encryption and decryption round-key arrays from a
	 * 128-bit key.
	 *
	 * input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc (8 x 16 bytes are written)
	 *   x2: rkey_dec (8 x 16 bytes are written)
	 *   x3: fk array (16 bytes are read)
	 *   x4: ck array (8 x 16 bytes are read)
	 *
	 * x1, x2 and x4 are post-incremented by 64; x5 is scratch.
	 */

	/* ck -> v24..v31 */
	ld1		{v24.16b-v27.16b}, [x4], #64
	ld1		{v28.16b-v31.16b}, [x4]

	/* v0 = key with the bytes of each 32-bit word reversed, xor fk */
	ld1		{v0.16b}, [x0]
	ld1		{v1.16b}, [x3]
	rev32		v0.16b, v0.16b
	eor		v0.16b, v0.16b, v1.16b

	/* each step consumes the previous result and one ck vector */
	sm4ekey		v0.4s, v0.4s, v24.4s
	sm4ekey		v1.4s, v0.4s, v25.4s
	sm4ekey		v2.4s, v1.4s, v26.4s
	sm4ekey		v3.4s, v2.4s, v27.4s
	sm4ekey		v4.4s, v3.4s, v28.4s
	sm4ekey		v5.4s, v4.4s, v29.4s
	sm4ekey		v6.4s, v5.4s, v30.4s
	sm4ekey		v7.4s, v6.4s, v31.4s

	/* encryption keys are stored in generation order */
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1]

	/* ck is no longer needed: reuse v24 for the word-reversal mask */
	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	/*
	 * Decryption keys are the encryption keys in reverse order:
	 * vectors are taken last-to-first and the mask reverses the four
	 * 32-bit words inside each vector.
	 */
	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_expand_key)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/*
	 * Process exactly one 16-byte block.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *
	 * Whether this encrypts or decrypts depends only on which round
	 * key array the caller passes in x0.
	 */
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x2]
	SM4_CRYPT_BLK(v0)
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_crypt_block)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/*
	 * Process nblocks independent 16-byte blocks (ECB style), eight
	 * at a time, then four, then one at a time.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 *
	 * x1 and x2 are advanced past the processed data.
	 */

	/*
	 * Nothing to do for an empty request.  Without this check a zero
	 * count would fall through to the 1x tail, wrap to 0xffffffff and
	 * run far past the end of both buffers.
	 */
	cbz		w3, .Lcrypt_end

	SM4_PREPARE(x0)

.Lcrypt_loop_blk:
	/* w3 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_tail8

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbnz		w3, .Lcrypt_loop_blk
	b		.Lcrypt_end

.Lcrypt_tail8:
	/* undo the speculative subtraction: w3 is now in 0..7 */
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_tail4

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64
	SM4_CRYPT_BLK4(v0, v1, v2, v3)
	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_end

.Lcrypt_tail4:
	/* invariant: w3 >= 1 on entry to this loop */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	SM4_CRYPT_BLK(v0)
	st1		{v0.16b}, [x1], #16

	cbnz		w3, .Lcrypt_tail4

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_ce_crypt)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/*
	 * CBC encryption.  Each block is xored with the previous
	 * ciphertext block (the IV for the first one) before being
	 * encrypted, so the blocks are processed strictly in sequence.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit); updated with the last
	 *       ciphertext block on return
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

	/*
	 * Empty request: leave via the common exit, which writes the
	 * unchanged IV back.  Without this check a zero count would enter
	 * the 1x loop, wrap to 0xffffffff and overrun both buffers.
	 */
	cbz		w4, .Lcbc_enc_end

.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* the chain is serial: block i needs ciphertext i-1 */
	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbnz		w4, .Lcbc_enc_loop_4x
	b		.Lcbc_enc_end

.Lcbc_enc_loop_1x:
	/* invariant: 1 <= w4 <= 3 on entry to this loop */
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/*
	 * CBC decryption.  The ciphertext blocks are kept in v0-v7 while
	 * byte-swapped copies in v8-v15 are run through the cipher, so
	 * that each result can be xored with the preceding ciphertext
	 * block (the IV for the first one).
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit); updated with the last
	 *       ciphertext block on return
	 *   w4: nblocks
	 *
	 * NOTE(review): the *_BE macro variants are given rev32'ed input
	 * here; presumably they skip the input byte swap that the plain
	 * variants perform -- confirm against sm4-ce-asm.h.
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

	/*
	 * Empty request: leave via the common exit, which writes the
	 * unchanged IV back.  Without this check a zero count would enter
	 * the 1x loop, wrap to 0xffffffff and overrun both buffers.
	 */
	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	/* plaintext[i] = D(c[i]) ^ c[i-1], with c[-1] = IV */
	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	/* last ciphertext block chains into the next iteration */
	mov		RIV.16b, v7.16b

	cbnz		w4, .Lcbc_dec_loop_8x
	b		.Lcbc_dec_end

.Lcbc_dec_4x:
	/* undo the speculative subtraction: w4 is now in 0..7 */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	/* invariant: 1 <= w4 <= 3 on entry to this loop */
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/*
	 * Final step of CBC with ciphertext stealing (encryption): one
	 * full block followed by a tail, nbytes in total.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 *
	 * The IV is read but not written back.
	 * NOTE(review): the permute-table offsets below only stay inside
	 * the 48-byte table for 16 <= nbytes <= 32 -- confirm callers
	 * guarantee this.
	 */
	SM4_PREPARE(x0)

	/* x5 = nbytes - 16 = length of the tail */
	sub		w5, w4, #16
	uxtw		x5, w5

	/* En-1 = E(IV ^ first block) */
	ld1		{RIV.16b}, [x3]
	ld1		{v0.16b}, [x2]
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table: v3 from table + tail, v4 from table + 32 - tail */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	sub		x7, x7, x5
	add		x6, x6, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads: v1 = the last 16 bytes of the input */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/*
	 * overlapping stores: the tail store goes first, then the full
	 * block at dst overwrites the part of it that overlaps
	 */
	add		x5, x1, x5
	st1		{v0.16b}, [x5]
	st1		{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/*
	 * Final step of CBC with ciphertext stealing (decryption): one
	 * full block followed by a tail, nbytes in total.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nbytes
	 *
	 * The IV is read but not written back.
	 * NOTE(review): the permute-table offsets below only stay inside
	 * the 48-byte table for 16 <= nbytes <= 32 -- confirm callers
	 * guarantee this.
	 */
	SM4_PREPARE(x0)

	/* x5 = nbytes - 16 = length of the tail */
	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* load permute table: v3 from table + tail, v4 from table + 32 - tail */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	sub		x7, x7, x5
	add		x6, x6, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads: v0 = first 16 bytes, v1 = last 16 bytes */
	ld1		{v0.16b}, [x2], x5
	ld1		{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/*
	 * overlapping stores: the tail store goes first, then the full
	 * block at dst overwrites the part of it that overlaps
	 */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
	/*
	 * CFB encryption.  Each ciphertext block is the plaintext block
	 * xored with the encryption of the previous ciphertext block (the
	 * IV for the first one), so blocks are processed in sequence.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit); updated with the last
	 *       ciphertext block on return
	 *   w4: nblocks
	 *
	 * NOTE(review): the *_BE macro variant is given rev32'ed input
	 * here; presumably it skips the input byte swap that the plain
	 * variant performs -- confirm against sm4-ce-asm.h.
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

	/*
	 * Empty request: leave via the common exit, which writes the
	 * unchanged IV back.  Without this check a zero count would enter
	 * the 1x loop, wrap to 0xffffffff and overrun both buffers.
	 */
	cbz		w4, .Lcfb_enc_end

.Lcfb_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcfb_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* c0 = p0 ^ E(IV) */
	rev32		v8.16b, RIV.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v0.16b, v0.16b, v8.16b

	/* c1 = p1 ^ E(c0) */
	rev32		v8.16b, v0.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v1.16b, v1.16b, v8.16b

	/* c2 = p2 ^ E(c1) */
	rev32		v8.16b, v1.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v2.16b, v2.16b, v8.16b

	/* c3 = p3 ^ E(c2) */
	rev32		v8.16b, v2.16b
	SM4_CRYPT_BLK_BE(v8)
	eor		v3.16b, v3.16b, v8.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbnz		w4, .Lcfb_enc_loop_4x
	b		.Lcfb_enc_end

.Lcfb_enc_loop_1x:
	/* invariant: 1 <= w4 <= 3 on entry to this loop */
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)
	eor		RIV.16b, RIV.16b, v0.16b

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcfb_enc_loop_1x

.Lcfb_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_enc)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
	/*
	 * CFB decryption.  The keystream for block i is the encryption of
	 * ciphertext block i-1 (the IV for the first one); since all of
	 * these inputs are known up front, several blocks are processed
	 * per cipher invocation.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit); updated with the last
	 *       ciphertext block on return
	 *   w4: nblocks
	 *
	 * NOTE(review): the *_BE macro variants are given rev32'ed input
	 * here; presumably they skip the input byte swap that the plain
	 * variants perform -- confirm against sm4-ce-asm.h.
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

	/*
	 * Empty request: leave via the common exit, which writes the
	 * unchanged IV back.  Without this check a zero count would enter
	 * the 1x loop, wrap to 0xffffffff and overrun both buffers.
	 */
	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	/* cipher inputs: IV, c0 .. c6 */
	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b
	rev32		v12.16b, v3.16b
	rev32		v13.16b, v4.16b
	rev32		v14.16b, v5.16b
	rev32		v15.16b, v6.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	/* keep c7 as the next IV before v7 is overwritten below */
	mov		RIV.16b, v7.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbnz		w4, .Lcfb_dec_loop_8x
	b		.Lcfb_dec_end

.Lcfb_dec_4x:
	/* undo the speculative subtraction: w4 is now in 0..7 */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* cipher inputs: IV, c0 .. c2 */
	rev32		v8.16b, RIV.16b
	rev32		v9.16b, v0.16b
	rev32		v10.16b, v1.16b
	rev32		v11.16b, v2.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	/* keep c3 as the next IV before v3 is overwritten below */
	mov		RIV.16b, v3.16b

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lcfb_dec_end

.Lcfb_dec_loop_1x:
	/* invariant: 1 <= w4 <= 3 on entry to this loop */
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	SM4_CRYPT_BLK(RIV)

	eor		RIV.16b, RIV.16b, v0.16b
	st1		{RIV.16b}, [x1], #16

	/* the ciphertext just consumed becomes the next IV */
	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcfb_dec_loop_1x

.Lcfb_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_dec)
|
|
|
|
/*
 * inc_le128(vctr): materialise the current counter in vctr and step the
 * counter by one.
 *
 * The counter lives in x7 (high 64 bits) and x8 (low 64 bits) as native
 * integers.  They are copied into the two lanes of vctr and rev64 then
 * reverses the bytes of each lane, giving the big-endian block layout.
 * The adds/adc pair increments the 128-bit value; it clobbers the
 * condition flags.
 */
#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/*
	 * CTR mode: encrypt successive counter values and xor the result
	 * with the input.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit); updated with the next unused
	 *       counter value on return
	 *   w4: nblocks
	 */
	SM4_PREPARE(x0)

	/* x7:x8 = counter converted to native integers */
	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

	/*
	 * Empty request: leave via the common exit, which writes the
	 * unchanged counter back.  Without this check a zero count would
	 * enter the 1x loop, wrap to 0xffffffff and overrun both buffers.
	 */
	cbz		w4, .Lctr_end

.Lctr_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbnz		w4, .Lctr_loop_8x
	b		.Lctr_end

.Lctr_4x:
	/* undo the speculative subtraction: w4 is now in 0..7 */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_loop_1x:
	/* invariant: 1 <= w4 <= 3 on entry to this loop */
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR, converted back to big endian */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)
|
|
|
|
|
|
/*
 * tweak_next(vt, vin, RTMP): vt = vin multiplied by x in GF(2^128),
 * i.e. the next XTS tweak.
 *
 * Each 64-bit lane of vin is doubled; the bit shifted out of a lane is
 * turned into an all-ones/all-zeros lane by sshr, masked with RMASK,
 * moved to the other lane by ext and xored in.  With RMASK as built by
 * the XTS routines (low lane 1, high lane 0x87) this carries the low
 * lane's top bit into the high lane and folds the high lane's top bit
 * back into the low lane as 0x87.
 *
 * vt may be the same register as vin: sshr reads vin before add writes
 * vt.  RTMP is clobbered and must differ from both.
 */
#define tweak_next(vt, vin, RTMP)				\
		sshr		RTMP.2d, vin.2d, #63;		\
		add		vt.2d, vin.2d, vin.2d;		\
		and		RTMP.16b, RTMP.16b, RMASK.16b;	\
		ext		RTMP.16b, RTMP.16b, RTMP.16b, #8; \
		eor		vt.16b, vt.16b, RTMP.16b;
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/*
	 * XTS encryption, including ciphertext stealing when nbytes is
	 * not a multiple of 16.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV, or NULL when the tweak at x3 has
	 *       already been encrypted
	 *
	 * The next tweak is written back to x3 only when no ciphertext
	 * stealing was performed.
	 * NOTE(review): nbytes below 16 makes the block count wrap --
	 * callers are presumably required to pass at least one full block.
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	/* RMASK = { 0x1, 0x0, 0x87, 0x0 } as 32-bit lanes, see tweak_next() */
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	/*
	 * x5 = tail length (nbytes % 16), w4 = number of full blocks to
	 * run through the loops below.  When there is a tail, the last
	 * full block is held back for the ciphertext stealing step.
	 */
	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_enc_4x

	/* v8 holds the current tweak; derive the following seven */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	/* tweak for the block after this batch */
	tweak_next(v8, v15, RTMP3)

	cbnz		w4, .Lxts_enc_loop_8x
	b		.Lxts_enc_cts

.Lxts_enc_4x:
	/* undo the speculative subtraction: w4 is now in 1..7 */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_enc_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	/* no tail: just hand the next tweak back */
	cbz		x5, .Lxts_enc_end

	/* cipher text stealing */

	/* v8 is used for the last full block, v9 for the stolen block */
	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* load permute table: v3 from table + tail, v4 from table + 32 - tail */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	sub		x7, x7, x5
	add		x6, x6, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads: v1 = the last 16 bytes of the input */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/*
	 * overlapping stores: the tail store goes first, then the full
	 * block at dst overwrites the part of it that overlaps
	 */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	/* the tweak is not written back on this path */
	ret

.Lxts_enc_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_xts_enc)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	/*
	 * XTS decryption, including ciphertext stealing when nbytes is
	 * not a multiple of 16.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: tweak (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: round key array for IV, or NULL when the tweak at x3 has
	 *       already been encrypted
	 *
	 * The next tweak is written back to x3 only when no ciphertext
	 * stealing was performed.
	 * NOTE(review): nbytes below 16 makes the block count wrap --
	 * callers are presumably required to pass at least one full block.
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	/* RMASK = { 0x1, 0x0, 0x87, 0x0 } as 32-bit lanes, see tweak_next() */
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	/*
	 * x5 = tail length (nbytes % 16), w4 = number of full blocks to
	 * run through the loops below.  When there is a tail, the last
	 * full block is held back for the ciphertext stealing step.
	 */
	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	/* w4 -= 8; a negative result means fewer than 8 blocks remain */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_dec_4x

	/* v8 holds the current tweak; derive the following seven */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	/* tweak for the block after this batch */
	tweak_next(v8, v15, RTMP3)

	cbnz		w4, .Lxts_dec_loop_8x
	b		.Lxts_dec_cts

.Lxts_dec_4x:
	/* undo the speculative subtraction: w4 is now in 1..7 */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_dec_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	/* no tail: just hand the next tweak back */
	cbz		x5, .Lxts_dec_end

	/* cipher text stealing */

	/*
	 * The two tweaks are applied in the opposite order to the
	 * encryption routine: v9 (the later tweak) for the last full
	 * ciphertext block, v8 for the reassembled block.
	 */
	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* load permute table: v3 from table + tail, v4 from table + 32 - tail */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	sub		x7, x7, x5
	add		x6, x6, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads: v1 = the last 16 bytes of the input */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/*
	 * overlapping stores: the tail store goes first, then the full
	 * block at dst overwrites the part of it that overlaps
	 */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	/* the tweak is not written back on this path */
	ret

.Lxts_dec_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_xts_dec)
|
|
|
|
.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	/*
	 * Absorb nblocks 16-byte blocks into a CBC-MAC style digest:
	 * digest = E(digest ^ block) for each block.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: digest (16 bytes, read and written back)
	 *   x2: src
	 *   w3: nblocks
	 *   w4: enc_before - when non-zero, the digest is encrypted once
	 *       before any block is absorbed
	 *   w5: enc_after  - when zero, the last block is only xored
	 *       into the digest and its encryption is left pending
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	/* nothing to absorb: just write the digest back */
	cbz		w3, .Lmac_ret

	/* hold the last block back when it must not be encrypted */
	cmp		w5, wzr
	sub		w6, w3, #1
	csel		w3, w6, w3, eq

	cbz		w3, .Lmac_end

.Lmac_loop_4x:
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	/* the chain is serial: each step needs the previous digest */
	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_4x
	b		.Lmac_end

.Lmac_loop_1x:
	/* invariant: 1 <= w3 <= 3 on entry to this loop */
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x

.Lmac_end:
	cbnz		w5, .Lmac_ret

	/* enc_after == 0: xor in the held-back block without encrypting */
	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)
|
|
|
|
|
|
.section ".rodata", "a"
|
|
.align 4
|
|
.Lbswap128_mask:
|
|
.byte 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
|
|
.byte 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03
|
|
|
|
.Lcts_permute_table:
|
|
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
|
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
|
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
|
|
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
|
|
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|
|
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
|