/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"
.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm
/* Register macros */

/* Used for both encryption and decryption */
#define RHASH	v21
#define RRCONST	v22
#define RZERO	v23
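
/*
 * Note: GHASH values are handled bit-reflected (rbit) throughout, so
 * the GF(2^128) multiply-reduce below can use plain carry-less
 * multiplication with the reduction constant 0x87 (.Lghash_rconst),
 * which encodes the low terms of x^128 + x^7 + x^2 + x + 1.
 */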
/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128-bits in r0, high in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1)			\
	ext		T0.16b, m1.16b, m1.16b, #8;		\
	pmull		r0.1q, m0.1d, m1.1d;			\
	pmull		T1.1q, m0.1d, T0.1d;			\
	pmull2		T0.1q, m0.2d, T0.2d;			\
	pmull2		r1.1q, m0.2d, m1.2d;			\
	eor		T0.16b, T0.16b, T1.16b;		\
	ext		T1.16b, RZERO.16b, T0.16b, #8;		\
	ext		T0.16b, T0.16b, RZERO.16b, #8;		\
	eor		r0.16b, r0.16b, T1.16b;		\
	eor		r1.16b, r1.16b, T0.16b;
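
/*
 * The 4x variant below performs four independent 128x128 carry-less
 * multiplications with their instruction streams interleaved, which
 * helps keep the PMULL pipeline busy between dependent steps.
 */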
#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1,		\
			r2, r3, m2, m3, T2, T3,		\
			r4, r5, m4, m5, T4, T5,		\
			r6, r7, m6, m7, T6, T7)		\
	ext		T0.16b, m1.16b, m1.16b, #8;		\
	ext		T2.16b, m3.16b, m3.16b, #8;		\
	ext		T4.16b, m5.16b, m5.16b, #8;		\
	ext		T6.16b, m7.16b, m7.16b, #8;		\
	pmull		r0.1q, m0.1d, m1.1d;			\
	pmull		r2.1q, m2.1d, m3.1d;			\
	pmull		r4.1q, m4.1d, m5.1d;			\
	pmull		r6.1q, m6.1d, m7.1d;			\
	pmull		T1.1q, m0.1d, T0.1d;			\
	pmull		T3.1q, m2.1d, T2.1d;			\
	pmull		T5.1q, m4.1d, T4.1d;			\
	pmull		T7.1q, m6.1d, T6.1d;			\
	pmull2		T0.1q, m0.2d, T0.2d;			\
	pmull2		T2.1q, m2.2d, T2.2d;			\
	pmull2		T4.1q, m4.2d, T4.2d;			\
	pmull2		T6.1q, m6.2d, T6.2d;			\
	pmull2		r1.1q, m0.2d, m1.2d;			\
	pmull2		r3.1q, m2.2d, m3.2d;			\
	pmull2		r5.1q, m4.2d, m5.2d;			\
	pmull2		r7.1q, m6.2d, m7.2d;			\
	eor		T0.16b, T0.16b, T1.16b;		\
	eor		T2.16b, T2.16b, T3.16b;		\
	eor		T4.16b, T4.16b, T5.16b;		\
	eor		T6.16b, T6.16b, T7.16b;		\
	ext		T1.16b, RZERO.16b, T0.16b, #8;		\
	ext		T3.16b, RZERO.16b, T2.16b, #8;		\
	ext		T5.16b, RZERO.16b, T4.16b, #8;		\
	ext		T7.16b, RZERO.16b, T6.16b, #8;		\
	ext		T0.16b, T0.16b, RZERO.16b, #8;		\
	ext		T2.16b, T2.16b, RZERO.16b, #8;		\
	ext		T4.16b, T4.16b, RZERO.16b, #8;		\
	ext		T6.16b, T6.16b, RZERO.16b, #8;		\
	eor		r0.16b, r0.16b, T1.16b;		\
	eor		r2.16b, r2.16b, T3.16b;		\
	eor		r4.16b, r4.16b, T5.16b;		\
	eor		r6.16b, r6.16b, T7.16b;		\
	eor		r1.16b, r1.16b, T0.16b;		\
	eor		r3.16b, r3.16b, T2.16b;		\
	eor		r5.16b, r5.16b, T4.16b;		\
	eor		r7.16b, r7.16b, T6.16b;
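
/*
 * REDUCTION folds the 256-bit product r0:r1 back into 128 bits modulo
 * the GHASH polynomial. Since the operands are bit-reflected, the fold
 * is done with two carry-less multiplications by the constant 0x87
 * held in rconst.
 */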
/*
 * input: r0:r1 (low 128-bits in r0, high in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1)			\
	pmull2		T0.1q, r1.2d, rconst.2d;		\
	ext		T1.16b, T0.16b, RZERO.16b, #8;		\
	ext		T0.16b, RZERO.16b, T0.16b, #8;		\
	eor		r1.16b, r1.16b, T1.16b;		\
	eor		r0.16b, r0.16b, T0.16b;		\
	pmull		T0.1q, r1.1d, rconst.1d;		\
	eor		a.16b, r0.16b, T0.16b;
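
/*
 * Interleaves a full SM4 encryption of block b0 (eight sm4e rounds on
 * the round keys in v24-v31) with one 128x128 carry-less
 * multiplication, so the crypto and PMULL units can work in parallel.
 */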
#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1)	\
	rev32		b0.16b, b0.16b;			\
	ext		T0.16b, m1.16b, m1.16b, #8;		\
	sm4e		b0.4s, v24.4s;				\
	pmull		r0.1q, m0.1d, m1.1d;			\
	sm4e		b0.4s, v25.4s;				\
	pmull		T1.1q, m0.1d, T0.1d;			\
	sm4e		b0.4s, v26.4s;				\
	pmull2		T0.1q, m0.2d, T0.2d;			\
	sm4e		b0.4s, v27.4s;				\
	pmull2		r1.1q, m0.2d, m1.2d;			\
	sm4e		b0.4s, v28.4s;				\
	eor		T0.16b, T0.16b, T1.16b;		\
	sm4e		b0.4s, v29.4s;				\
	ext		T1.16b, RZERO.16b, T0.16b, #8;		\
	sm4e		b0.4s, v30.4s;				\
	ext		T0.16b, T0.16b, RZERO.16b, #8;		\
	sm4e		b0.4s, v31.4s;				\
	eor		r0.16b, r0.16b, T1.16b;		\
	rev64		b0.4s, b0.4s;				\
	eor		r1.16b, r1.16b, T0.16b;		\
	ext		b0.16b, b0.16b, b0.16b, #8;		\
	rev32		b0.16b, b0.16b;
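
/*
 * Same interleaving for three SM4 blocks alongside three 128x128
 * multiplications; the three GHASH partial products are also summed
 * into r0:r1 at the end, ready for a single REDUCTION.
 */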
#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2,		\
				    r0, r1, m0, m1, T0, T1,	\
				    r2, r3, m2, m3, T2, T3,	\
				    r4, r5, m4, m5, T4, T5)	\
	rev32		b0.16b, b0.16b;			\
	rev32		b1.16b, b1.16b;			\
	rev32		b2.16b, b2.16b;			\
	ext		T0.16b, m1.16b, m1.16b, #8;		\
	ext		T2.16b, m3.16b, m3.16b, #8;		\
	ext		T4.16b, m5.16b, m5.16b, #8;		\
	sm4e		b0.4s, v24.4s;				\
	sm4e		b1.4s, v24.4s;				\
	sm4e		b2.4s, v24.4s;				\
	pmull		r0.1q, m0.1d, m1.1d;			\
	pmull		r2.1q, m2.1d, m3.1d;			\
	pmull		r4.1q, m4.1d, m5.1d;			\
	sm4e		b0.4s, v25.4s;				\
	sm4e		b1.4s, v25.4s;				\
	sm4e		b2.4s, v25.4s;				\
	pmull		T1.1q, m0.1d, T0.1d;			\
	pmull		T3.1q, m2.1d, T2.1d;			\
	pmull		T5.1q, m4.1d, T4.1d;			\
	sm4e		b0.4s, v26.4s;				\
	sm4e		b1.4s, v26.4s;				\
	sm4e		b2.4s, v26.4s;				\
	pmull2		T0.1q, m0.2d, T0.2d;			\
	pmull2		T2.1q, m2.2d, T2.2d;			\
	pmull2		T4.1q, m4.2d, T4.2d;			\
	sm4e		b0.4s, v27.4s;				\
	sm4e		b1.4s, v27.4s;				\
	sm4e		b2.4s, v27.4s;				\
	pmull2		r1.1q, m0.2d, m1.2d;			\
	pmull2		r3.1q, m2.2d, m3.2d;			\
	pmull2		r5.1q, m4.2d, m5.2d;			\
	sm4e		b0.4s, v28.4s;				\
	sm4e		b1.4s, v28.4s;				\
	sm4e		b2.4s, v28.4s;				\
	eor		T0.16b, T0.16b, T1.16b;		\
	eor		T2.16b, T2.16b, T3.16b;		\
	eor		T4.16b, T4.16b, T5.16b;		\
	sm4e		b0.4s, v29.4s;				\
	sm4e		b1.4s, v29.4s;				\
	sm4e		b2.4s, v29.4s;				\
	ext		T1.16b, RZERO.16b, T0.16b, #8;		\
	ext		T3.16b, RZERO.16b, T2.16b, #8;		\
	ext		T5.16b, RZERO.16b, T4.16b, #8;		\
	sm4e		b0.4s, v30.4s;				\
	sm4e		b1.4s, v30.4s;				\
	sm4e		b2.4s, v30.4s;				\
	ext		T0.16b, T0.16b, RZERO.16b, #8;		\
	ext		T2.16b, T2.16b, RZERO.16b, #8;		\
	ext		T4.16b, T4.16b, RZERO.16b, #8;		\
	sm4e		b0.4s, v31.4s;				\
	sm4e		b1.4s, v31.4s;				\
	sm4e		b2.4s, v31.4s;				\
	eor		r0.16b, r0.16b, T1.16b;		\
	eor		r2.16b, r2.16b, T3.16b;		\
	eor		r4.16b, r4.16b, T5.16b;		\
	rev64		b0.4s, b0.4s;				\
	rev64		b1.4s, b1.4s;				\
	rev64		b2.4s, b2.4s;				\
	eor		r1.16b, r1.16b, T0.16b;		\
	eor		r3.16b, r3.16b, T2.16b;		\
	eor		r5.16b, r5.16b, T4.16b;		\
	ext		b0.16b, b0.16b, b0.16b, #8;		\
	ext		b1.16b, b1.16b, b1.16b, #8;		\
	ext		b2.16b, b2.16b, b2.16b, #8;		\
	eor		r0.16b, r0.16b, r2.16b;		\
	eor		r1.16b, r1.16b, r3.16b;		\
	rev32		b0.16b, b0.16b;			\
	rev32		b1.16b, b1.16b;			\
	rev32		b2.16b, b2.16b;			\
	eor		r0.16b, r0.16b, r4.16b;		\
	eor		r1.16b, r1.16b, r5.16b;
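
/*
 * x8/x9 hold the two counter halves in CPU byte order (they are rev'd
 * at load time). inc32_le128 writes the current counter into vctr as a
 * big-endian block via rev64, then increments only the low 32 bits in
 * x9 for the next block, matching the inc32() update of the GCM spec.
 */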
#define inc32_le128(vctr)					\
	mov		vctr.d[1], x9;				\
	add		w6, w9, #1;				\
	mov		vctr.d[0], x8;				\
	bfi		x9, x6, #0, #32;			\
	rev64		vctr.16b, vctr.16b;
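
/*
 * Computes the final authentication tag: the big-endian bit lengths of
 * AAD and ciphertext (loaded from x7) are folded into the GHASH state,
 * which is then encrypted with the CTR0 keystream block, i.e.
 * authtag = GCTR(CTR0, GHASH).
 */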
#define GTAG_HASH_LENGTHS(vctr0, vlen)				\
	ld1		{vlen.16b}, [x7];			\
	/* construct CTR0 */					\
	/* the lower 32 bits of the initial IV are always be32(1) */ \
	mov		x6, #0x1;				\
	bfi		x9, x6, #0, #32;			\
	mov		vctr0.d[0], x8;			\
	mov		vctr0.d[1], x9;			\
	rbit		vlen.16b, vlen.16b;			\
	rev64		vctr0.16b, vctr0.16b;			\
	/* authtag = GCTR(CTR0, GHASH) */			\
	eor		RHASH.16b, RHASH.16b, vlen.16b;	\
	SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1,	\
				   RTMP0, RTMP1);		\
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3);	\
	rbit		RHASH.16b, RHASH.16b;			\
	eor		RHASH.16b, RHASH.16b, vctr0.16b;
/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define RR1	v0
#define RR3	v1
#define RR5	v2
#define RR7	v3

#define RR0	v4
#define RR2	v5
#define RR4	v6
#define RR6	v7

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RH1	v16
#define RH2	v17
#define RH3	v18
#define RH4	v19
.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ghash table
	 */
	SM4_PREPARE(x0)

	adr_l		x2, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x2]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32		v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit		RH1.16b, v0.16b

	/* H ^ 2 */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{RH1.16b-RH4.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)
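
/*
 * The 4x GHASH loops below use the aggregated (Horner) form: for four
 * blocks c0..c3,
 *   ((((X ^ c0)*H ^ c1)*H ^ c2)*H ^ c3)*H
 *     = (X ^ c0)*H^4 ^ c1*H^3 ^ c2*H^2 ^ c3*H^1,
 * so the four products are independent and only one reduction per
 * iteration is needed.
 */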
.align 3
SYM_FUNC_START(pmull_ghash_update)
	/* input:
	 *   x0: ghash table
	 *   x1: ghash result
	 *   x2: src
	 *   w3: nblocks
	 */
	ld1		{RH1.16b-RH4.16b}, [x0]

	ld1		{RHASH.16b}, [x1]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x4, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x4]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
	cmp		w3, #4
	blt		.Lghash_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w3, .Lghash_end
	b		.Lghash_loop_4x

.Lghash_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz		w3, .Lghash_loop_1x

.Lghash_end:
	rbit		RHASH.16b, RHASH.16b
	st1		{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(pmull_ghash_update)
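
/*
 * GCM encryption: the CTR keystream is generated four blocks at a time
 * with SM4_CRYPT_BLK4, and GHASH is then updated from the produced
 * ciphertext. With a non-NULL x7 the function also finalizes the tag
 * via GTAG_HASH_LENGTHS.
 */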
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH4.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_enc_hash_len
.Lgcm_enc_loop_4x:
	cmp		w4, #(4 * 16)
	blt		.Lgcm_enc_loop_1x

	sub		w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit		v0.16b, v0.16b
	rbit		v1.16b, v1.16b
	rbit		v2.16b, v2.16b
	rbit		v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor		RR0.16b, RR0.16b, RR2.16b
	eor		RR1.16b, RR1.16b, RR3.16b
	eor		RR0.16b, RR0.16b, RR4.16b
	eor		RR1.16b, RR1.16b, RR5.16b
	eor		RR0.16b, RR0.16b, RR6.16b
	eor		RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_4x
.Lgcm_enc_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_enc_tail

	sub		w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1		{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz		w4, .Lgcm_enc_hash_len
	b		.Lgcm_enc_loop_1x
.Lgcm_enc_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get top crypted byte */
	eor		w6, w6, w0	/* w6 = CTR ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */
	ins		v0.b[15], w6

	subs		w4, w4, #1
	bne		.Lgcm_enc_tail_loop

	/* padding last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
.Lgcm_enc_hash_len:
	cbz		x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)
#undef RR1
#undef RR3
#undef RR5
#undef RR7
#undef RR0
#undef RR2
#undef RR4
#undef RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef RH1
#undef RH2
#undef RH3
#undef RH4

/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define RR1	v6
#define RR3	v7
#define RR5	v8

#define RR0	v9
#define RR2	v10
#define RR4	v11

#define RTMP0	v12
#define RTMP1	v13
#define RTMP2	v14
#define RTMP3	v15
#define RTMP4	v16
#define RTMP5	v17

#define RH1	v18
#define RH2	v19
#define RH3	v20
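
/*
 * GCM decryption: GHASH must be fed the ciphertext before it is
 * decrypted, so the inputs are kept in v3-v5 and the SM4 rounds are
 * fused with the GHASH multiplications (SM4_CRYPT_PMUL_128x128_BLK3).
 * Only three blocks are processed per iteration, using H^1..H^3 from
 * RH1-RH3.
 */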
.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp		x8, x9, [x3]
	rev		x8, x8
	rev		x9, x9

	ld1		{RH1.16b-RH3.16b}, [x6]

	ld1		{RHASH.16b}, [x5]
	rbit		RHASH.16b, RHASH.16b

	adr_l		x6, .Lghash_rconst
	ld1r		{RRCONST.2d}, [x6]

	eor		RZERO.16b, RZERO.16b, RZERO.16b

	cbz		w4, .Lgcm_dec_hash_len
cbz w4 , . L g c m _ d e c _ h a s h _ l e n
.Lgcm_dec_loop_3x :
cmp w4 , #( 3 * 1 6 )
blt . L g c m _ d e c _ l o o p _ 1 x
sub w4 , w4 , #( 3 * 1 6 )
ld1 { v3 . 1 6 b - v5 . 1 6 b } , [ x2 ] , #( 3 * 1 6 )
/* construct CTRs */
inc3 2 _ l e 1 2 8 ( v0 ) / * + 0 * /
rbit v6 . 1 6 b , v3 . 1 6 b
inc3 2 _ l e 1 2 8 ( v1 ) / * + 1 * /
rbit v7 . 1 6 b , v4 . 1 6 b
inc3 2 _ l e 1 2 8 ( v2 ) / * + 2 * /
rbit v8 . 1 6 b , v5 . 1 6 b
eor R H A S H . 1 6 b , R H A S H . 1 6 b , v6 . 1 6 b
/* decrypt & ghash update */
SM4 _ C R Y P T _ P M U L _ 1 2 8 x12 8 _ B L K 3 ( v0 , v1 , v2 ,
RR0 , R R 1 , R H A S H , R H 3 , R T M P 0 , R T M P 1 ,
RR2 , R R 3 , v7 , R H 2 , R T M P 2 , R T M P 3 ,
RR4 , R R 5 , v8 , R H 1 , R T M P 4 , R T M P 5 )
eor v0 . 1 6 b , v0 . 1 6 b , v3 . 1 6 b
eor v1 . 1 6 b , v1 . 1 6 b , v4 . 1 6 b
eor v2 . 1 6 b , v2 . 1 6 b , v5 . 1 6 b
REDUCTION( R H A S H , R R 0 , R R 1 , R R C O N S T , R T M P 0 , R T M P 1 )
st1 { v0 . 1 6 b - v2 . 1 6 b } , [ x1 ] , #( 3 * 1 6 )
cbz w4 , . L g c m _ d e c _ h a s h _ l e n
b . L g c m _ d e c _ l o o p _ 3 x
.Lgcm_dec_loop_1x:
	cmp		w4, #16
	blt		.Lgcm_dec_tail

	sub		w4, w4, #16

	ld1		{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit		v6.16b, v3.16b

	eor		RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor		v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1		{v0.16b}, [x1], #16

	cbz		w4, .Lgcm_dec_hash_len
	b		.Lgcm_dec_loop_1x
.Lgcm_dec_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l		x0, .Lcts_permute_table
	add		x0, x0, #32
	sub		x0, x0, w4, uxtw
	ld1		{v3.16b}, [x0]

.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb		w0, [x2], #1	/* get 1 byte from input */
	umov		w6, v0.b[0]	/* get top crypted byte */
	eor		w6, w6, w0	/* w6 = CTR ^ input */
	strb		w6, [x1], #1	/* store out byte */

	/* shift right out one byte */
	ext		v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */
	ins		v0.b[15], w0

	subs		w4, w4, #1
	bne		.Lgcm_dec_tail_loop

	/* padding last block with zeros */
	tbl		v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit		v0.16b, v0.16b
	eor		RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)
.Lgcm_dec_hash_len:
	cbz		x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b		.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev		x8, x8
	rev		x9, x9
	stp		x8, x9, [x3]

	rbit		RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1		{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)
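
/*
 * .Lcts_permute_table: loading 16 bytes at offset (32 - n) yields a
 * tbl mask that moves the n tail bytes (accumulated at the high end of
 * v0) down to the start of the block and zero-pads the rest (tbl
 * writes 0 for 0xff indices), producing the padded final GHASH block.
 */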
	.section	".rodata", "a"
	.align 4
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

.Lghash_rconst:
	.quad		0x87