2019-05-27 08:55:01 +02:00
/* SPDX-License-Identifier: GPL-2.0-or-later */
2013-04-13 13:47:00 +03:00
/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */
# include < l i n u x / l i n k a g e . h >
2016-01-21 16:49:19 -06:00
# include < a s m / f r a m e . h >
2013-04-13 13:47:00 +03:00
/* Size in bytes of the key table at the start of the context. */
#define CAMELLIA_TABLE_BYTE_LEN	272

/* struct camellia_ctx: byte offsets of its members */
#define key_table	0
#define key_length	CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX	%rdi
#define RIO	%r8
/**********************************************************************
  helper macros
 **********************************************************************/

/*
 * filter_8bit - byte-wise mapping assembled from two 4-bit table lookups.
 *
 * IN:
 *   x:        vector of input bytes
 *   lo_t:     vpshufb table indexed by the low nibble of each byte
 *   hi_t:     vpshufb table indexed by the high nibble of each byte
 *   mask4bit: mask used to separate the nibbles (callers in this file
 *             load it from .L0f0f0f0f)
 *   tmp0:     scratch register, overwritten
 * OUT:
 *   x:        lo_t[low nibble] ^ hi_t[high nibble] for every byte
 *
 * The low nibbles are removed with vpandn before the 32-bit wide
 * vpsrld, so no bits from a neighbouring byte are shifted into the
 * high-nibble index.
 */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand	x, mask4bit, tmp0; \
	vpandn	x, mask4bit, x; \
	vpsrld	$4, x, x; \
	\
	vpshufb	tmp0, lo_t, tmp0; \
	vpshufb	x, hi_t, x; \
	vpxor	tmp0, x, x;
/*
 * ymmN_x -> xmmN aliases.
 *
 * roundsm32 pastes "_x" onto its register arguments (e.g. t0##_x) to
 * name the 128-bit register that overlaps a given 256-bit one; these
 * defines resolve the pasted name.
 */
#define ymm0_x	xmm0
#define ymm1_x	xmm1
#define ymm2_x	xmm2
#define ymm3_x	xmm3
#define ymm4_x	xmm4
#define ymm5_x	xmm5
#define ymm6_x	xmm6
#define ymm7_x	xmm7
#define ymm8_x	xmm8
#define ymm9_x	xmm9
#define ymm10_x	xmm10
#define ymm11_x	xmm11
#define ymm12_x	xmm12
#define ymm13_x	xmm13
#define ymm14_x	xmm14
#define ymm15_x	xmm15
/**********************************************************************
  32-way camellia
 **********************************************************************/

/*
 * roundsm32 - one round applied to 32 byte-sliced blocks.
 *
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material (memory operand; 64 bits are read)
 * OUT:
 *   x0..x7: new byte-sliced CD state
 *
 * t0..t7 are scratch registers and are all overwritten.  The macro
 * pastes "_x" onto register arguments to reach their 128-bit halves,
 * so x0..x7 and t0..t7 must be names for which a <name>_x alias exists
 * (the ymmN_x defines).
 *
 * vaesenclast only works on 128 bits, so the upper half of each 256-bit
 * register is extracted, processed separately and inserted back.  It is
 * always given an all-zero round key (t4##_x is cleared first), leaving
 * just the SubBytes/ShiftRows part; the ShiftRows is cancelled by the
 * inverse-shift-rows shuffle done up front.
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	/* zero round key for vaesenclast */ \
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	/* t7 = 0: vpshufb with a zero index broadcasts byte 0 of each lane */ \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; /* t7 stops being the zero index here */ \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;
/*
 * Size optimization... with inlined roundsm32 the binary would be over 5
 * times larger and would only be marginally faster.
 */
/*
 * Out-of-line instance of roundsm32.
 *
 * IN:
 *   %ymm0..%ymm7: x0..x7 state
 *   %rcx:         base pointer for the mem_cd loads
 *   %r9:          pointer to the 64-bit round key
 * OUT:
 *   %ymm0..%ymm7: updated state
 * Clobbers: %ymm8..%ymm15 (used as t0..t7)
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
2013-04-13 13:47:00 +03:00
/*
 * Out-of-line instance of roundsm32 with the two register halves
 * exchanged relative to the _cd variant.
 *
 * IN:
 *   %ymm4..%ymm7, %ymm0..%ymm3: x0..x7 state (in that order)
 *   %rax:                       base pointer for the mem_cd loads
 *   %r9:                        pointer to the 64-bit round key
 * OUT:
 *   %ymm4..%ymm7, %ymm0..%ymm3: updated state
 * Clobbers: %ymm8..%ymm15 (used as t4..t7, t0..t3)
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
2013-04-13 13:47:00 +03:00
/*
 * two_roundsm32 - run two consecutive rounds via the out-of-line helpers.
 *
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 * IN:
 *  i:        index of the first 64-bit round key in key_table
 *  dir:      offset added to i to select the second round key
 *  store_ab: macro invoked at the end with (x0..x7, mem_ab)
 *
 * %r9 is overwritten with the round-key address before each call.
 * NOTE(review): the called helpers hard-code %ymm0..%ymm15 and use
 * %rcx / %rax as their memory base, so callers presumably pass exactly
 * those registers for x*/y* and mem_cd / mem_ab - confirm at call sites.
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq	(key_table + (i) * 8)(CTX), %r9; \
	call	roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu	x0, 4 * 32(mem_cd); \
	vmovdqu	x1, 5 * 32(mem_cd); \
	vmovdqu	x2, 6 * 32(mem_cd); \
	vmovdqu	x3, 7 * 32(mem_cd); \
	vmovdqu	x4, 0 * 32(mem_cd); \
	vmovdqu	x5, 1 * 32(mem_cd); \
	vmovdqu	x6, 2 * 32(mem_cd); \
	vmovdqu	x7, 3 * 32(mem_cd); \
	\
	leaq	(key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call	roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
/*
 * store_ab hooks for two_roundsm32.
 *
 * dummy_store:    expands to nothing.
 * store_ab_state: writes x4..x7 to slots 4..7 and x0..x3 to slots 0..3
 *                 of mem_ab (32 bytes per slot).
 */
#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu	x4, 4 * 32(mem_ab); \
	vmovdqu	x5, 5 * 32(mem_ab); \
	vmovdqu	x6, 6 * 32(mem_ab); \
	vmovdqu	x7, 7 * 32(mem_ab); \
	vmovdqu	x0, 0 * 32(mem_ab); \
	vmovdqu	x1, 1 * 32(mem_ab); \
	vmovdqu	x2, 2 * 32(mem_ab); \
	vmovdqu	x3, 3 * 32(mem_ab);
/*
 * enc_rounds32 - six rounds walking the key table upwards.
 *
 * Expands to three two_roundsm32 pairs using round keys
 * (i)+2/(i)+3, (i)+4/(i)+5 and (i)+6/(i)+7.  The first two pairs write
 * the AB state back to mem_ab; the last one uses dummy_store and leaves
 * it in registers only.
 */
#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
/*
 * dec_rounds32 - six rounds walking the key table downwards.
 *
 * Expands to three two_roundsm32 pairs using round keys
 * (i)+7/(i)+6, (i)+5/(i)+4 and (i)+3/(i)+2.  The first two pairs write
 * the AB state back to mem_ab; the last one uses dummy_store and leaves
 * it in registers only.
 */
#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
/*
 * rol32_1_32 - rotate byte-sliced 32-bit integers left by one bit.
 *
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 *  zero:  register holding all zero bits
 * OUT:
 *  v0..3: (IN <<< 1)
 *
 * t0..t2 are scratch.  For each vector the bytes whose top bit is set
 * are detected with a signed "0 > byte" compare, the vector is doubled
 * with vpaddb (a one-bit left shift per byte), and vpabsb turns the
 * 0xff compare result into 0x01.  That carry bit is then OR-ed into the
 * next vector: v0 -> v1, v1 -> v2, v2 -> v3 and v3 -> v0.
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb	v0, zero, t0; \
	vpaddb		v0, v0, v0; \
	vpabsb		t0, t0; \
	\
	vpcmpgtb	v1, zero, t1; \
	vpaddb		v1, v1, v1; \
	vpabsb		t1, t1; \
	\
	vpcmpgtb	v2, zero, t2; \
	vpaddb		v2, v2, v2; \
	vpabsb		t2, t2; \
	\
	vpor		t0, v1, v1; \
	\
	vpcmpgtb	v3, zero, t0; \
	vpaddb		v3, v3, v3; \
	vpabsb		t0, t0; \
	\
	vpor		t1, v2, v2; \
	vpor		t2, v3, v3; \
	vpor		t0, v0, v0;
/*
 * fls32 - four key-dependent AND/OR + rotate mixing steps on byte-sliced
 * state (see the pseudo-code comments in the body).
 *
 * IN:
 *   r: byte-sliced AB state in memory
 *   l: byte-sliced CD state in memory
 *   l0..l7: registers holding the state that is written back to l
 *   kll, klr, krl, krr: memory operands; 32 bits of key are read from each
 * OUT:
 *   l0..l7: updated, and stored to 0..7 * 32(l)
 *   0..7 * 32(r): updated in memory
 *
 * t0..t3 and tt0..tt3 are scratch.  tt0 is cleared at the start and is
 * used both as the all-zero vpshufb index (broadcasting byte 0 of each
 * 128-bit lane) and as the `zero' argument of rol32_1_32; it must stay
 * zero for the whole macro.
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);
/*
 * transpose_4x4 - transpose a 4x4 matrix of 32-bit elements held in
 * x0..x3, independently within each 128-bit lane.
 *
 * IN/OUT: x0..x3 (rows in, columns out)
 * Scratch: t1, t2 (overwritten)
 *
 * Two stages: interleave dwords of row pairs, then interleave the
 * resulting qwords.
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq	x1, x0, t2; \
	vpunpckldq	x1, x0, x0; \
	\
	vpunpckldq	x3, x2, t1; \
	vpunpckhdq	x3, x2, x2; \
	\
	vpunpckhqdq	t1, x0, x1; \
	vpunpcklqdq	t1, x0, x0; \
	\
	vpunpckhqdq	x2, t2, x3; \
	vpunpcklqdq	x2, t2, x2;
/*
 * byteslice_16x16b_fast - byte-slice sixteen vectors using dword
 * transposes around a byte shuffle (.Lshufb_16x16b).
 *
 * IN/OUT: a0..a3, b0..b3, c0..c3, d0..d3
 * Scratch: st0, st1 - two memory slots, each written and read back with
 *          a full-width vmovdqu.
 *
 * All sixteen registers hold live data, so two of them at a time are
 * spilled to st0/st1 to serve as transpose temporaries and reloaded
 * afterwards; a0 doubles as the shuffle-mask register in the middle
 * section.  The order of the spills and reloads is significant.
 */
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	/* transpose a* and b* using d2/d3 as temporaries */ \
	vmovdqu	d2, st0; \
	vmovdqu	d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu	st0, d2; \
	vmovdqu	st1, d3; \
	\
	/* transpose c* and d* using a0/a1 as temporaries */ \
	vmovdqu	a0, st0; \
	vmovdqu	a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	/* byte shuffle of every vector; a0 holds the mask until the end */ \
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vmovdqu	st1, a1; \
	vpshufb	a0, a2, a2; \
	vpshufb	a0, a3, a3; \
	vpshufb	a0, b0, b0; \
	vpshufb	a0, b1, b1; \
	vpshufb	a0, b2, b2; \
	vpshufb	a0, b3, b3; \
	vpshufb	a0, a1, a1; \
	vpshufb	a0, c0, c0; \
	vpshufb	a0, c1, c1; \
	vpshufb	a0, c2, c2; \
	vpshufb	a0, c3, c3; \
	vpshufb	a0, d0, d0; \
	vpshufb	a0, d1, d1; \
	vpshufb	a0, d2, d2; \
	vpshufb	a0, d3, d3; \
	vmovdqu	d3, st1; \
	vmovdqu	st0, d3; \
	vpshufb	a0, d3, a0; /* shuffle the saved a0; mask is consumed */ \
	vmovdqu	d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu	st0, d2; \
	vmovdqu	st1, d3; \
	\
	vmovdqu	b0, st0; \
	vmovdqu	b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu	st0, b0; \
	vmovdqu	st1, b1; \
	/* does not adjust output bytes inside vectors */
/*
 * inpack32_pre - load blocks to registers and apply pre-whitening.
 *
 * IN:
 *   rio: base register of the input; sixteen 32-byte chunks are read
 *   key: memory operand; 64 bits are broadcast and byte-shuffled with
 *        .Lpack_bswap to form the whitening value
 * OUT:
 *   y7..y0, x7..x0: chunks 0..15 of rio, each XOR-ed with the whitening
 *                   value (note the descending register order)
 *
 * x0 carries the whitening value until the final XOR, which is why it
 * is written last.
 */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq	key, x0; \
	vpshufb		.Lpack_bswap, x0, x0; \
	\
	vpxor	0 * 32(rio), x0, y7; \
	vpxor	1 * 32(rio), x0, y6; \
	vpxor	2 * 32(rio), x0, y5; \
	vpxor	3 * 32(rio), x0, y4; \
	vpxor	4 * 32(rio), x0, y3; \
	vpxor	5 * 32(rio), x0, y2; \
	vpxor	6 * 32(rio), x0, y1; \
	vpxor	7 * 32(rio), x0, y0; \
	vpxor	8 * 32(rio), x0, x7; \
	vpxor	9 * 32(rio), x0, x6; \
	vpxor	10 * 32(rio), x0, x5; \
	vpxor	11 * 32(rio), x0, x4; \
	vpxor	12 * 32(rio), x0, x3; \
	vpxor	13 * 32(rio), x0, x2; \
	vpxor	14 * 32(rio), x0, x1; \
	vpxor	15 * 32(rio), x0, x0;
/*
 * inpack32_post - byteslice pre-whitened blocks and store to temporary
 * memory.
 *
 * IN:
 *   x0..x7, y0..y7: pre-whitened blocks
 *   mem_ab, mem_cd: base registers of two buffers of eight 32-byte slots
 * OUT:
 *   x0..x7 -> 0..7 * 32(mem_ab), y0..y7 -> 0..7 * 32(mem_cd);
 *   the registers keep the byte-sliced values as well.
 *
 * The first slot of each buffer doubles as the scratch slot for
 * byteslice_16x16b_fast and is overwritten with the final data below.
 */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu	x0, 0 * 32(mem_ab); \
	vmovdqu	x1, 1 * 32(mem_ab); \
	vmovdqu	x2, 2 * 32(mem_ab); \
	vmovdqu	x3, 3 * 32(mem_ab); \
	vmovdqu	x4, 4 * 32(mem_ab); \
	vmovdqu	x5, 5 * 32(mem_ab); \
	vmovdqu	x6, 6 * 32(mem_ab); \
	vmovdqu	x7, 7 * 32(mem_ab); \
	vmovdqu	y0, 0 * 32(mem_cd); \
	vmovdqu	y1, 1 * 32(mem_cd); \
	vmovdqu	y2, 2 * 32(mem_cd); \
	vmovdqu	y3, 3 * 32(mem_cd); \
	vmovdqu	y4, 4 * 32(mem_cd); \
	vmovdqu	y5, 5 * 32(mem_cd); \
	vmovdqu	y6, 6 * 32(mem_cd); \
	vmovdqu	y7, 7 * 32(mem_cd);
/*
 * outunpack32 - de-byteslice, apply post-whitening and store blocks.
 *
 * IN:
 *   x0..x7, y0..y7: byte-sliced state
 *   key: memory operand; 64 bits are broadcast and byte-shuffled with
 *        .Lpack_bswap to form the whitening value
 *   stack_tmp0, stack_tmp1: memory scratch slots
 * OUT:
 *   x0..x7, y0..y7: whitened blocks, left in registers
 *
 * x0 is parked in stack_tmp0 while it holds the whitening value and is
 * folded back in by the last XOR.
 */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu	x0, stack_tmp0; \
	\
	vpbroadcastq	key, x0; \
	vpshufb		.Lpack_bswap, x0, x0; \
	\
	vpxor	x0, y7, y7; \
	vpxor	x0, y6, y6; \
	vpxor	x0, y5, y5; \
	vpxor	x0, y4, y4; \
	vpxor	x0, y3, y3; \
	vpxor	x0, y2, y2; \
	vpxor	x0, y1, y1; \
	vpxor	x0, y0, y0; \
	vpxor	x0, x7, x7; \
	vpxor	x0, x6, x6; \
	vpxor	x0, x5, x5; \
	vpxor	x0, x4, x4; \
	vpxor	x0, x3, x3; \
	vpxor	x0, x2, x2; \
	vpxor	x0, x1, x1; \
	vpxor	stack_tmp0, x0, x0;
/*
 * write_output - store sixteen vectors to consecutive 32-byte slots.
 *
 * x0..x7 go to slots 0..7 and y0..y7 to slots 8..15 of rio.
 */
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu	x0, 0 * 32(rio); \
	vmovdqu	x1, 1 * 32(rio); \
	vmovdqu	x2, 2 * 32(rio); \
	vmovdqu	x3, 3 * 32(rio); \
	vmovdqu	x4, 4 * 32(rio); \
	vmovdqu	x5, 5 * 32(rio); \
	vmovdqu	x6, 6 * 32(rio); \
	vmovdqu	x7, 7 * 32(rio); \
	vmovdqu	y0, 8 * 32(rio); \
	vmovdqu	y1, 9 * 32(rio); \
	vmovdqu	y2, 10 * 32(rio); \
	vmovdqu	y3, 11 * 32(rio); \
	vmovdqu	y4, 12 * 32(rio); \
	vmovdqu	y5, 13 * 32(rio); \
	vmovdqu	y6, 14 * 32(rio); \
	vmovdqu	y7, 15 * 32(rio);
crypto: x86 - make constants readonly, allow linker to merge them
A lot of asm-optimized routines in arch/x86/crypto/ keep their
constants in .data. This is wrong, they should be in .rodata.
Many of these constants are the same in different modules.
For example, 128-bit shuffle mask 0x000102030405060708090A0B0C0D0E0F
exists in at least half a dozen places.
There is a way to let linker merge them and use just one copy.
The rules are as follows: mergeable objects of different sizes
should not share sections. You can't put them all in one .rodata
section, they will lose "mergeability".
GCC puts its mergeable constants in ".rodata.cstSIZE" sections,
or ".rodata.cstSIZE.<object_name>" if -fdata-sections is used.
This patch does the same:
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
It is important that all data in such a section consists of
16-byte elements, not larger ones, and that there is no implicit
use of one element from another.
When this is not the case, use non-mergeable section:
.section .rodata[.VAR_NAME], "a", @progbits
This reduces .data by ~15 kbytes:
text data bss dec hex filename
11097415 2705840 2630712 16433967 fac32f vmlinux-prev.o
11112095 2690672 2630712 16433479 fac147 vmlinux.o
Merged objects are visible in System.map:
ffffffff81a28810 r POLY
ffffffff81a28810 r POLY
ffffffff81a28820 r TWOONE
ffffffff81a28820 r TWOONE
ffffffff81a28830 r PSHUFFLE_BYTE_FLIP_MASK <- merged regardless of
ffffffff81a28830 r SHUF_MASK <------------- the name difference
ffffffff81a28830 r SHUF_MASK
ffffffff81a28830 r SHUF_MASK
..
ffffffff81a28d00 r K512 <- merged three identical 640-byte tables
ffffffff81a28d00 r K512
ffffffff81a28d00 r K512
Use of object names in section name suffixes is not strictly necessary,
but might help if someday link stage will use garbage collection
to eliminate unused sections (ld --gc-sections).
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Josh Poimboeuf <jpoimboe@redhat.com>
CC: Xiaodong Liu <xiaodong.liu@intel.com>
CC: Megha Dey <megha.dey@intel.com>
CC: linux-crypto@vger.kernel.org
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-01-19 22:33:04 +01:00
/*
 * Byte-shuffle mask used by byteslice_16x16b_fast.  SHUFB_BYTES(idx)
 * lists bytes idx, idx + 4, idx + 8 and idx + 12, so each 16-byte row
 * gathers byte 0 of every dword, then byte 1, and so on.  The row is
 * given twice to fill the 32-byte mergeable element.
 */
.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
crypto: x86 - make constants readonly, allow linker to merge them
A lot of asm-optimized routines in arch/x86/crypto/ keep their
constants in .data. This is wrong, they should be in .rodata.
Many of these constants are the same in different modules.
For example, 128-bit shuffle mask 0x000102030405060708090A0B0C0D0E0F
exists in at least half a dozen places.
There is a way to let linker merge them and use just one copy.
The rules are as follows: mergeable objects of different sizes
should not share sections. You can't put them all in one .rodata
section, they will lose "mergeability".
GCC puts its mergeable constants in ".rodata.cstSIZE" sections,
or ".rodata.cstSIZE.<object_name>" if -fdata-sections is used.
This patch does the same:
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
It is important that all data in such a section consists of
16-byte elements, not larger ones, and that there is no implicit
use of one element from another.
When this is not the case, use non-mergeable section:
.section .rodata[.VAR_NAME], "a", @progbits
This reduces .data by ~15 kbytes:
text data bss dec hex filename
11097415 2705840 2630712 16433967 fac32f vmlinux-prev.o
11112095 2690672 2630712 16433479 fac147 vmlinux.o
Merged objects are visible in System.map:
ffffffff81a28810 r POLY
ffffffff81a28810 r POLY
ffffffff81a28820 r TWOONE
ffffffff81a28820 r TWOONE
ffffffff81a28830 r PSHUFFLE_BYTE_FLIP_MASK <- merged regardless of
ffffffff81a28830 r SHUF_MASK <------------- the name difference
ffffffff81a28830 r SHUF_MASK
ffffffff81a28830 r SHUF_MASK
..
ffffffff81a28d00 r K512 <- merged three identical 640-byte tables
ffffffff81a28d00 r K512
ffffffff81a28d00 r K512
Use of object names in section name suffixes is not strictly necessary,
but might help if someday link stage will use garbage collection
to eliminate unused sections (ld --gc-sections).
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Josh Poimboeuf <jpoimboe@redhat.com>
CC: Xiaodong Liu <xiaodong.liu@intel.com>
CC: Megha Dey <megha.dey@intel.com>
CC: linux-crypto@vger.kernel.org
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-01-19 22:33:04 +01:00
/*
 * vpshufb mask applied to the broadcast whitening key in inpack32_pre
 * and outunpack32.  The 0x80 index bytes make vpshufb write zero to
 * those positions.  The 16-byte pattern is given twice to fill the
 * 32-byte mergeable element.
 */
.section	.rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
crypto: x86 - make constants readonly, allow linker to merge them
A lot of asm-optimized routines in arch/x86/crypto/ keep their
constants in .data. This is wrong, they should be in .rodata.
Many of these constants are the same in different modules.
For example, 128-bit shuffle mask 0x000102030405060708090A0B0C0D0E0F
exists in at least half a dozen places.
There is a way to let linker merge them and use just one copy.
The rules are as follows: mergeable objects of different sizes
should not share sections. You can't put them all in one .rodata
section, they will lose "mergeability".
GCC puts its mergeable constants in ".rodata.cstSIZE" sections,
or ".rodata.cstSIZE.<object_name>" if -fdata-sections is used.
This patch does the same:
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
It is important that all data in such a section consists of
16-byte elements, not larger ones, and that there is no implicit
use of one element from another.
When this is not the case, use non-mergeable section:
.section .rodata[.VAR_NAME], "a", @progbits
This reduces .data by ~15 kbytes:
text data bss dec hex filename
11097415 2705840 2630712 16433967 fac32f vmlinux-prev.o
11112095 2690672 2630712 16433479 fac147 vmlinux.o
Merged objects are visible in System.map:
ffffffff81a28810 r POLY
ffffffff81a28810 r POLY
ffffffff81a28820 r TWOONE
ffffffff81a28820 r TWOONE
ffffffff81a28830 r PSHUFFLE_BYTE_FLIP_MASK <- merged regardless of
ffffffff81a28830 r SHUF_MASK <------------- the name difference
ffffffff81a28830 r SHUF_MASK
ffffffff81a28830 r SHUF_MASK
..
ffffffff81a28d00 r K512 <- merged three identical 640-byte tables
ffffffff81a28d00 r K512
ffffffff81a28d00 r K512
Use of object names in section name suffixes is not strictly necessary,
but might help if someday link stage will use garbage collection
to eliminate unused sections (ld --gc-sections).
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Josh Poimboeuf <jpoimboe@redhat.com>
CC: Xiaodong Liu <xiaodong.liu@intel.com>
CC: Megha Dey <megha.dey@intel.com>
CC: linux-crypto@vger.kernel.org
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-01-19 22:33:04 +01:00
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * Every table below is exactly 16 bytes (two rows of eight .byte values),
 * which keeps each label on its own mergeable 16-byte element.
 * NOTE(review): the *_lo_* / *_hi_* pairs presumably feed the lo_t / hi_t
 * operands of filter_8bit() (one vpshufb lookup per nibble) - confirm
 * against the round macros earlier in this file.
 */

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
crypto: x86 - make constants readonly, allow linker to merge them
A lot of asm-optimized routines in arch/x86/crypto/ keep their
constants in .data. This is wrong; they should be in .rodata.
Many of these constants are the same in different modules.
For example, 128-bit shuffle mask 0x000102030405060708090A0B0C0D0E0F
exists in at least half a dozen places.
There is a way to let linker merge them and use just one copy.
The rules are as follows: mergeable objects of different sizes
should not share sections. You can't put them all in one .rodata
section, they will lose "mergeability".
GCC puts its mergeable constants in ".rodata.cstSIZE" sections,
or ".rodata.cstSIZE.<object_name>" if -fdata-sections is used.
This patch does the same:
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
It is important that all data in such a section consists of
16-byte elements, not larger ones, and that there is no implicit
use of one element from another.
When this is not the case, use non-mergeable section:
.section .rodata[.VAR_NAME], "a", @progbits
This reduces .data by ~15 kbytes:
text data bss dec hex filename
11097415 2705840 2630712 16433967 fac32f vmlinux-prev.o
11112095 2690672 2630712 16433479 fac147 vmlinux.o
Merged objects are visible in System.map:
ffffffff81a28810 r POLY
ffffffff81a28810 r POLY
ffffffff81a28820 r TWOONE
ffffffff81a28820 r TWOONE
ffffffff81a28830 r PSHUFFLE_BYTE_FLIP_MASK <- merged regardless of
ffffffff81a28830 r SHUF_MASK <------------- the name difference
ffffffff81a28830 r SHUF_MASK
ffffffff81a28830 r SHUF_MASK
..
ffffffff81a28d00 r K512 <- merged three identical 640-byte tables
ffffffff81a28d00 r K512
ffffffff81a28d00 r K512
Use of object names in section name suffixes is not strictly necessary,
but might help if someday link stage will use garbage collection
to eliminate unused sections (ld --gc-sections).
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Josh Poimboeuf <jpoimboe@redhat.com>
CC: Xiaodong Liu <xiaodong.liu@intel.com>
CC: Megha Dey <megha.dey@intel.com>
CC: linux-crypto@vger.kernel.org
CC: x86@kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2017-01-19 22:33:04 +01:00
/*
 * 4-byte mergeable read-only constant holding 0x0f in every byte.
 * NOTE(review): presumably the source of the mask4bit operand of
 * filter_8bit() - confirm at the use site.
 */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f
.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
	/*
	 * Encryption core shared by the 32-way entry points in this file.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 *       (the register order callers pass to write_output)
	 * clobbers:
	 *	%rcx (upper half of the temporary storage),
	 *	%r8d (key_table index handed to outunpack32: 24 or 32)
	 */
	FRAME_BEGIN

	/* %rcx = %rax + 256: second half of the 512-byte scratch area */
	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	/*
	 * key_length == 16: done, the final key sits at key_table index 24.
	 * Any other key length runs one more fls32 + round group first.
	 */
	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	/* key operand: key_table[%r8], 8 bytes per entry */
	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Lenc_max32:
	/* longer key: final key index becomes 32, then rejoin .Lenc_done */
	movl $32, %r8d;

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk32)
2013-04-13 13:47:00 +03:00
.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
	/*
	 * Decryption core shared by the 32-way entry points in this file.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 *       (the register order callers pass to write_output)
	 * clobbers:
	 *	%rcx (upper half of the temporary storage)
	 */
	FRAME_BEGIN

	/* %rcx = %rax + 256: second half of the 512-byte scratch area */
	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	/* larger keys take one extra round group + fls32 before the common tail */
	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	/* key words are passed as +8, +12, +0, +4 here (enc uses +0..+12) */
	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	/* key operand: first key_table entry */
	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	FRAME_END
	RET;

.align 8
.Ldec_max32:
	/* extra round group for %r8d == 32, then fall into the common path */
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk32)
2013-04-13 13:47:00 +03:00
2019-10-11 13:51:04 +02:00
SYM_FUNC_START(camellia_ecb_enc_32way)
	/*
	 * ECB-encrypt 32 blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *
	 * dst doubles as the 512-byte scratch area for the core routine.
	 */
	FRAME_BEGIN

	vzeroupper;

	/* load src into %ymm0..%ymm15 together with the first key_table entry */
	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk32;

	/* registers come back in swapped order; store them in block order */
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_enc_32way)
2013-04-13 13:47:00 +03:00
2019-10-11 13:51:04 +02:00
SYM_FUNC_START(camellia_ecb_dec_32way)
	/*
	 * ECB-decrypt 32 blocks.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *
	 * dst doubles as the 512-byte scratch area for the core routine.
	 */
	FRAME_BEGIN

	vzeroupper;

	/* %r8d = 24 when key_length == 16, otherwise 32 */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	/* load src together with key_table[%r8] (8 bytes per entry) */
	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk32;

	/* registers come back in swapped order; store them in block order */
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_dec_32way)
2013-04-13 13:47:00 +03:00
2019-10-11 13:51:04 +02:00
SYM_FUNC_START(camellia_cbc_dec_32way)
	/*
	 * CBC-decrypt 32 blocks: decrypt, then XOR each block with the
	 * preceding ciphertext block read from src.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *
	 * The first output block is stored without any XOR here.
	 * NOTE(review): presumably the caller applies the IV to it - confirm
	 * in the C glue code.
	 */
	FRAME_BEGIN
	/* always reserve 512 bytes of stack; only used when dst == src */
	subq $(16 * 32), %rsp;

	vzeroupper;

	/* %r8d = 24 when key_length == 16, otherwise 32 */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	/* load src together with key_table[%r8] (8 bytes per entry) */
	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	cmpq %rsi, %rdx;
	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	/*
	 * dst still in-use (because dst == src), so use stack for temporary
	 * storage.
	 */
	movq %rsp, %rax;

.Lcbc_dec_continue:
	call __camellia_dec_blk32;

	/*
	 * %ymm7 holds the first two output blocks. Park it in the scratch
	 * area, build { low lane = 0, high lane = src block 0 } and XOR the
	 * parked value back in: the low lane stays untouched, the high lane
	 * is XORed with the first ciphertext block.
	 */
	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;

	/* remaining registers: XOR with src shifted back by one 16-byte block */
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;

	/* all reads of src are done; dst may now be overwritten */
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	/* release the scratch buffer reserved at entry */
	addq $(16 * 32), %rsp;
	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_32way)