/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized.
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#define rRIP         (%rip)

#define RX0          %xmm0
#define RX1          %xmm1
#define MASK_4BIT    %xmm2
#define RTMP0        %xmm3
#define RTMP1        %xmm4
#define RTMP2        %xmm5
#define RTMP3        %xmm6
#define RTMP4        %xmm7
#define RA0          %xmm8
#define RA1          %xmm9
#define RA2          %xmm10
#define RA3          %xmm11
#define RB0          %xmm12
#define RB1          %xmm13
#define RB2          %xmm14
#define RB3          %xmm15

#define RNOT         %xmm0
#define RBSWAP       %xmm1

/* Transpose four 32-bit words between 128-bit vectors. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)	\
	vpunpckhdq x1, x0, t2;			\
	vpunpckldq x1, x0, x0;			\
						\
	vpunpckldq x3, x2, t1;			\
	vpunpckhdq x3, x2, x2;			\
						\
	vpunpckhqdq t1, x0, x1;			\
	vpunpcklqdq t1, x0, x0;			\
						\
	vpunpckhqdq x2, t2, x3;			\
	vpunpcklqdq x2, t2, x2;
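
/*
 * After transpose_4x4(), vector register n holds 32-bit word n of each of
 * the four input blocks, so one SM4 round can be applied to four blocks
 * (or eight, using both the RA and RB register sets) in parallel.
 */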
/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;
/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
 * the 'vaesenclast' instruction.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpandn mask4bit, x, tmp0;			\
	vpsrld $4, x, x;				\
	vpand x, mask4bit, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;
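
/*
 * The SM4 S-box is evaluated with AES-NI: the input is mapped into the AES
 * field by the pre-SubByte affine transform, AESENCLAST performs SubBytes
 * (its ShiftRows is undone by the .Linv_shift_row* shuffles and its XOR
 * with the MASK_4BIT round key is absorbed by transform_post()), and the
 * result is mapped back by the post-SubByte affine transform.  Each affine
 * transform is evaluated per byte from two 16-entry nibble tables, roughly
 * (C-like sketch, for illustration only; the helper name is hypothetical):
 *
 *	static u8 affine_lookup(u8 x, const u8 lo_t[16], const u8 hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 *
 * which is what the vpshufb pairs in transform_pre()/transform_post() do
 * for all 16 bytes of a vector at once.
 */
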
.section	.rodata.cst16, "aM", @progbits, 16
.align 16
/*
 * Following four affine transform look-up tables are from work by
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing SM4 S-Box from AES SubByte.
 */
/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012
/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06
/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09
/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f
/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
.text
.align 16
/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt4)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..4 blocks)
	 *	%rdx: src (1..4 blocks)
	 *	%rcx: num blocks (1..4)
	 */
	FRAME_BEGIN
	vmovdqu 0*16(%rdx), RA0;
	vmovdqa RA0, RA1;
	vmovdqa RA0, RA2;
	vmovdqa RA0, RA3;
	cmpq $2, %rcx;
	jb .Lblk4_load_input_done;
	vmovdqu 1*16(%rdx), RA1;
	je .Lblk4_load_input_done;
	vmovdqu 2*16(%rdx), RA2;
	cmpq $3, %rcx;
	je .Lblk4_load_input_done;
	vmovdqu 3*16(%rdx), RA3;
.Lblk4_load_input_done:
	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
	vmovdqa .Lpre_tf_hi_s rRIP, RB0;
	vmovdqa .Lpost_tf_lo_s rRIP, RB1;
	vmovdqa .Lpost_tf_hi_s rRIP, RB2;
	vmovdqa .Linv_shift_row rRIP, RB3;
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
#define ROUND(round, s0, s1, s2, s3)                                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	                                                            \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0);           \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0);            \
	                                                            \
	/* linear part */                                           \
	vpshufb RB3, RX0, RTMP0;                                    \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP2, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP3, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1;            \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0;
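
/*
 * One SM4 round as computed by ROUND() above, as a C-like sketch for
 * reference only (rol32() is a hypothetical 32-bit rotate-left helper):
 *
 *	x   = s1 ^ s2 ^ s3 ^ rk[round];
 *	x   = sm4_sbox(x);	applied byte-wise via AESENCLAST, see above
 *	s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 */
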
	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk4:
	ROUND(0, RA0, RA1, RA2, RA3);
	ROUND(1, RA1, RA2, RA3, RA0);
	ROUND(2, RA2, RA3, RA0, RA1);
	ROUND(3, RA3, RA0, RA1, RA2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk4;

#undef ROUND
	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vmovdqu RA0, 0*16(%rsi);
	cmpq $2, %rcx;
	jb .Lblk4_store_output_done;
	vmovdqu RA1, 1*16(%rsi);
	je .Lblk4_store_output_done;
	vmovdqu RA2, 2*16(%rsi);
	cmpq $3, %rcx;
	je .Lblk4_store_output_done;
	vmovdqu RA3, 3*16(%rsi);
.Lblk4_store_output_done:
	vzeroall;
	FRAME_END
RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)
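
/*
 * A minimal usage sketch from the C side (hypothetical caller shown only
 * for illustration; 'ctx' is assumed to hold the expanded SM4 round keys
 * and the real glue code may differ):
 *
 *	asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *					     const u8 *src, int nblocks);
 *
 *	kernel_fpu_begin();
 *	sm4_aesni_avx_crypt4(ctx->rkey_enc, dst, src, nblocks);
 *	kernel_fpu_end();
 *
 * Decryption uses the same routine with the reversed (decryption) round
 * key schedule.
 */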
.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						ciphertext blocks
	 */
	FRAME_BEGIN

	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;                          \
	vmovdqa .Lpre_tf_hi_s rRIP, RTMP1;                          \
	vmovdqa RX0, RX1;                                           \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	vmovdqa .Lpost_tf_lo_s rRIP, RTMP2;                         \
	vmovdqa .Lpost_tf_hi_s rRIP, RTMP3;                         \
	vpxor r1, RX1, RX1;                                         \
	vpxor r2, RX1, RX1;                                         \
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */                 \
	                                                            \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	vmovdqa .Linv_shift_row rRIP, RTMP4;                        \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	vaesenclast MASK_4BIT, RX1, RX1;                            \
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
	                                                            \
	/* linear part */                                           \
	vpshufb RTMP4, RX0, RTMP0;                                  \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP4, RX1, RTMP2;                                  \
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4;                  \
	vpxor RTMP2, r0, r0; /* r0 ^ x */                           \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	vpxor RTMP1, s0, s0;                                        \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */               \
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP2, RTMP3;                                    \
	vpsrld $30, RTMP2, RTMP2;                                   \
	vpxor RTMP2, r0, r0;                                        \
	vpxor RTMP3, r0, r0;
	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND
	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
RET;
SYM_FUNC_END(__sm4_crypt_blk8)
/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..8 blocks)
	 *	%rdx: src (1..8 blocks)
	 *	%rcx: num blocks (1..8)
	 */
	cmpq $5, %rcx;
	jb sm4_aesni_avx_crypt4;
	FRAME_BEGIN
	vmovdqu (0*16)(%rdx), RA0;
	vmovdqu (1*16)(%rdx), RA1;
	vmovdqu (2*16)(%rdx), RA2;
	vmovdqu (3*16)(%rdx), RA3;
	vmovdqu (4*16)(%rdx), RB0;
	vmovdqa RB0, RB1;
	vmovdqa RB0, RB2;
	vmovdqa RB0, RB3;
	je .Lblk8_load_input_done;
	vmovdqu (5*16)(%rdx), RB1;
	cmpq $7, %rcx;
	jb .Lblk8_load_input_done;
	vmovdqu (6*16)(%rdx), RB2;
	je .Lblk8_load_input_done;
	vmovdqu (7*16)(%rdx), RB3;

.Lblk8_load_input_done:
	call __sm4_crypt_blk8;

	cmpq $6, %rcx;
	vmovdqu RA0, (0*16)(%rsi);
	vmovdqu RA1, (1*16)(%rsi);
	vmovdqu RA2, (2*16)(%rsi);
	vmovdqu RA3, (3*16)(%rsi);
	vmovdqu RB0, (4*16)(%rsi);
	jb .Lblk8_store_output_done;
	vmovdqu RB1, (5*16)(%rsi);
	je .Lblk8_store_output_done;
	vmovdqu RB2, (6*16)(%rsi);
	cmpq $7, %rcx;
	je .Lblk8_store_output_done;
	vmovdqu RB3, (7*16)(%rsi);

.Lblk8_store_output_done:
	vzeroall;
	FRAME_END
RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)
/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
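
/*
 * CTR sketch (illustration only): the IV is treated as a 128-bit big-endian
 * counter; the eight counter values ctr+0 .. ctr+7 are encrypted and XORed
 * with the source, and ctr+8 is written back as the new IV:
 *
 *	for (i = 0; i < 8; i++)
 *		dst[i] = src[i] ^ sm4_crypt_block(rk, be128_add(*iv, i));
 *	*iv = be128_add(*iv, 8);
 *
 * where be128_add() is a hypothetical helper for 128-bit big-endian
 * addition.
 */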
.align 8
SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	/* load IV and byteswap */
	vmovdqu (%rcx), RA0;
	vmovdqa .Lbswap128_mask rRIP, RBSWAP;
	vpshufb RBSWAP, RA0, RTMP0; /* be => le */

	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */
#define inc_le128(x, minus_one, tmp)	\
	vpcmpeqq minus_one, x, tmp;	\
	vpsubq minus_one, x, x;		\
	vpslldq $8, tmp, tmp;		\
	vpsubq tmp, x, x;
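
/*
 * inc_le128() increments a 128-bit little-endian value held in an XMM
 * register, propagating the carry from the low into the high quadword,
 * roughly (C-like sketch, illustration only):
 *
 *	carry = (x.lo == ~0ULL);
 *	x.lo += 1;
 *	x.hi += carry;
 *
 * vpcmpeqq detects the all-ones low quadword, vpsubq adds 1 by subtracting
 * the minus-one constant, and the byte-shifted compare mask is subtracted
 * from the high quadword to add the carry.
 */
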
/* construct IVs */
	inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
	vpshufb RBSWAP, RTMP0, RA1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
	vpshufb RBSWAP, RTMP0, RA2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
	vpshufb RBSWAP, RTMP0, RA3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
	vpshufb RBSWAP, RTMP0, RB0;
	inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
	vpshufb RBSWAP, RTMP0, RB1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
	vpshufb RBSWAP, RTMP0, RB2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
	vpshufb RBSWAP, RTMP0, RB3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
	vpshufb RBSWAP, RTMP0, RTMP1;

	/* store new IV */
	vmovdqu RTMP1, (%rcx);
	call __sm4_crypt_blk8;

	vpxor (0*16)(%rdx), RA0, RA0;
	vpxor (1*16)(%rdx), RA1, RA1;
	vpxor (2*16)(%rdx), RA2, RA2;
	vpxor (3*16)(%rdx), RA3, RA3;
	vpxor (4*16)(%rdx), RB0, RB0;
	vpxor (5*16)(%rdx), RB1, RB1;
	vpxor (6*16)(%rdx), RB2, RB2;
	vpxor (7*16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0*16)(%rsi);
	vmovdqu RA1, (1*16)(%rsi);
	vmovdqu RA2, (2*16)(%rsi);
	vmovdqu RA3, (3*16)(%rsi);
	vmovdqu RB0, (4*16)(%rsi);
	vmovdqu RB1, (5*16)(%rsi);
	vmovdqu RB2, (6*16)(%rsi);
	vmovdqu RB3, (7*16)(%rsi);

	vzeroall;
	FRAME_END
RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
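
/*
 * CBC decryption sketch (illustration only; rk is expected to hold the
 * reversed, i.e. decryption, round keys): all eight blocks are run through
 * the block function in parallel, each result is XORed with the previous
 * ciphertext block (the IV for the first block), and the last ciphertext
 * block becomes the new IV:
 *
 *	for (i = 0; i < 8; i++)
 *		dst[i] = sm4_crypt_block(rk, src[i]) ^ (i ? src[i - 1] : *iv);
 *	*iv = src[7];
 */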
.align 8
SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN
	vmovdqu (0*16)(%rdx), RA0;
	vmovdqu (1*16)(%rdx), RA1;
	vmovdqu (2*16)(%rdx), RA2;
	vmovdqu (3*16)(%rdx), RA3;
	vmovdqu (4*16)(%rdx), RB0;
	vmovdqu (5*16)(%rdx), RB1;
	vmovdqu (6*16)(%rdx), RB2;
	vmovdqu (7*16)(%rdx), RB3;

	call __sm4_crypt_blk8;

	vmovdqu (7*16)(%rdx), RNOT;
	vpxor (%rcx), RA0, RA0;
	vpxor (0*16)(%rdx), RA1, RA1;
	vpxor (1*16)(%rdx), RA2, RA2;
	vpxor (2*16)(%rdx), RA3, RA3;
	vpxor (3*16)(%rdx), RB0, RB0;
	vpxor (4*16)(%rdx), RB1, RB1;
	vpxor (5*16)(%rdx), RB2, RB2;
	vpxor (6*16)(%rdx), RB3, RB3;
	vmovdqu RNOT, (%rcx); /* store new IV */

	vmovdqu RA0, (0*16)(%rsi);
	vmovdqu RA1, (1*16)(%rsi);
	vmovdqu RA2, (2*16)(%rsi);
	vmovdqu RA3, (3*16)(%rsi);
	vmovdqu RB0, (4*16)(%rsi);
	vmovdqu RB1, (5*16)(%rsi);
	vmovdqu RB2, (6*16)(%rsi);
	vmovdqu RB3, (7*16)(%rsi);

	vzeroall;
	FRAME_END
RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
/*
 * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
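
/*
 * CFB decryption sketch (illustration only): the keystream for block i is
 * the encryption of the previous ciphertext block (the IV for the first
 * block), so decryption only ever uses the block function in the forward
 * direction:
 *
 *	for (i = 0; i < 8; i++)
 *		dst[i] = src[i] ^ sm4_crypt_block(rk, i ? src[i - 1] : *iv);
 *	*iv = src[7];
 */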
.align 8
SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN
/* Load input */
	vmovdqu (%rcx), RA0;
	vmovdqu 0*16(%rdx), RA1;
	vmovdqu 1*16(%rdx), RA2;
	vmovdqu 2*16(%rdx), RA3;
	vmovdqu 3*16(%rdx), RB0;
	vmovdqu 4*16(%rdx), RB1;
	vmovdqu 5*16(%rdx), RB2;
	vmovdqu 6*16(%rdx), RB3;

	/* Update IV */
	vmovdqu 7*16(%rdx), RNOT;
	vmovdqu RNOT, (%rcx);
	call __sm4_crypt_blk8;

	vpxor (0*16)(%rdx), RA0, RA0;
	vpxor (1*16)(%rdx), RA1, RA1;
	vpxor (2*16)(%rdx), RA2, RA2;
	vpxor (3*16)(%rdx), RA3, RA3;
	vpxor (4*16)(%rdx), RB0, RB0;
	vpxor (5*16)(%rdx), RB1, RB1;
	vpxor (6*16)(%rdx), RB2, RB2;
	vpxor (7*16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0*16)(%rsi);
	vmovdqu RA1, (1*16)(%rsi);
	vmovdqu RA2, (2*16)(%rsi);
	vmovdqu RA3, (3*16)(%rsi);
	vmovdqu RB0, (4*16)(%rsi);
	vmovdqu RB1, (5*16)(%rsi);
	vmovdqu RB2, (6*16)(%rsi);
	vmovdqu RB3, (7*16)(%rsi);

	vzeroall;
	FRAME_END
RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)