/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include <linux/linkage.h>

.file "cast5-avx-x86_64-asm_64.S"
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4
/* structure of crypto context */
#define km	0
#define kr	(16*4)
#define rr	((16*4)+16)
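
/*
 * These offsets correspond to struct cast5_ctx: km is the array of 16 32-bit
 * masking subkeys, kr holds the 16 one-byte rotation subkeys, and rr is
 * non-zero when the reduced 12-round variant (key <= 80 bits) is in use.
 */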
/* s-boxes */
#define s1	cast_s1
#define s2	cast_s2
#define s3	cast_s3
#define s4	cast_s4

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
#define CTX %rdi

#define RL1 %xmm0
#define RR1 %xmm1
#define RL2 %xmm2
#define RR2 %xmm3
#define RL3 %xmm4
#define RR3 %xmm5
#define RL4 %xmm6
#define RR4 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12

#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rbp
#define RID1d %ebp
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d

#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	shrq $16,	src;                     \
	movl		s1(, RID1, 4), dst ## d; \
	op1		s2(, RID2, 4), dst ## d; \
	movzbl		src ## bh,     RID1d;    \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	op2		s3(, RID1, 4), dst ## d; \
	op3		s4(, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
	shrq $16,	reg;

#define F_head(a, x, gi1, gi2, op0) \
	op0	a,	RKM,	x;                 \
	vpslld	RKRF,	x,	RTMP;              \
	vpsrld	RKRR,	x,	x;                 \
	vpor	RTMP,	x,	x;                 \
	\
	vmovq		x,          gi1;           \
	vpextrq $1,	x,          gi2;

#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS2;                                      \
	orq		RFS1, RFS2;                                \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
	shlq $32,	RFS1;                                      \
	orq		RFS1, RFS3;                                \
	\
	vmovq		RFS2,	x;                                 \
	vpinsrq $1,	RFS3,	x,	x;
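
/*
 * F_head() applies the masking subkey (op0 is vpaddd/vpxor/vpsubd depending
 * on the round type) and the key-dependent 32-bit rotate, then moves the
 * result to general-purpose registers; lookup_32bit()/F_tail() do the four
 * s-box lookups per 32-bit word and recombine the results into a vector.
 */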

#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);              \
	F_head(b2, RX, RGI3, RGI4, op0);              \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
	\
	vpxor	a1, RX,   a1;                         \
	vpxor	a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
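
/* F1/F2/F3 are the three CAST5 round function types from RFC 2144, each
 * applied to two register pairs at a time. */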

#define subround(a1, b1, a2, b2, f) \
	F ## f ## _2(a1, b1, a2, b2);

#define round(l, r, n, f) \
	vbroadcastss	(km+(4*n))(CTX), RKM;        \
	vpand		R1ST,	RKR,	RKRF;        \
	vpsubq		RKRF,	R32,	RKRR;        \
	vpsrldq $1,	RKR,	RKR;                 \
	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
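
/*
 * round(): broadcast masking subkey km[n] into RKM, take the next rotation
 * amount from the low byte of RKR (RKRF = amount, RKRR = 32 - amount), shift
 * that byte out of RKR, and run the type-f subround on all four register
 * pairs.
 */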

#define enc_preload_rkr() \
	vbroadcastss	.L16_mask,                RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX),                  RKR, RKR;

#define dec_preload_rkr() \
	vbroadcastss	.L16_mask,                RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		kr(CTX),                  RKR, RKR; \
	vpshufb		.Lbswap128_mask,          RKR, RKR;
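
/*
 * inpack_blocks() byte-swaps each 32-bit word (CAST5 works on big-endian
 * words) and transposes two vectors so that one register holds the left
 * halves and the other the right halves of four consecutive 64-bit blocks;
 * outunpack_blocks() undoes the transform before the blocks are stored.
 */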

#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t1; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1;

#define inpack_blocks(x0, x1, t0, t1, rmask) \
	vpshufb rmask,	x0,	x0; \
	vpshufb rmask,	x1,	x1; \
	\
	transpose_2x4(x0, x1, t0, t1)

#define outunpack_blocks(x0, x1, t0, t1, rmask) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1;
.data
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lbswap_iv_mask:
	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
.L16_mask:
	.byte 16, 16, 16, 16
.L32_mask:
	.byte 32, 0, 0, 0
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0
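/* .Lfirst_mask above keeps only the low five bits (the current rotation
 * amount) of the first byte of RKR. */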
.text
.align 16
__cast5_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RL1: blocks 1 and 2
	 *	RR1: blocks 3 and 4
	 *	RL2: blocks 5 and 6
	 *	RR2: blocks 7 and 8
	 *	RL3: blocks 9 and 10
	 *	RR3: blocks 11 and 12
	 *	RL4: blocks 13 and 14
	 *	RR4: blocks 15 and 16
	 * output:
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 */
	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	enc_preload_rkr();
	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);

	round(RL, RR, 0, 1);
	round(RR, RL, 1, 2);
	round(RL, RR, 2, 3);
	round(RR, RL, 3, 1);
	round(RL, RR, 4, 2);
	round(RR, RL, 5, 3);
	round(RL, RR, 6, 1);
	round(RR, RL, 7, 2);
	round(RL, RR, 8, 3);
	round(RR, RL, 9, 1);
	round(RL, RR, 10, 2);
	round(RR, RL, 11, 3);
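
	/* Keys of 80 bits or less use the reduced 12-round variant, flagged
	 * by a non-zero ctx->rr: skip the last four rounds in that case. */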
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz .L__skip_enc;
	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);
.L__skip_enc:
	popq %rbx;
	popq %rbp;

	vmovdqa .Lbswap_mask, RKM;
	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

ret;
ENDPROC(__cast5_enc_blk16)

.align 16
__cast5_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 * output:
	 *	RL1: decrypted blocks 1 and 2
	 *	RR1: decrypted blocks 3 and 4
	 *	RL2: decrypted blocks 5 and 6
	 *	RR2: decrypted blocks 7 and 8
	 *	RL3: decrypted blocks 9 and 10
	 *	RR3: decrypted blocks 11 and 12
	 *	RL4: decrypted blocks 13 and 14
	 *	RR4: decrypted blocks 15 and 16
	 */
	pushq %rbp;
	pushq %rbx;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;
	dec_preload_rkr();
	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
	inpack_blocks(RL4, RR4, RTMP, RX, RKM);
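
	/* For the reduced 12-round variant, skip rounds 15..12 below; the
	 * matching four rotation bytes are dropped from RKR at .L__skip_dec. */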
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz .L__skip_dec;

	round(RL, RR, 15, 1);
	round(RR, RL, 14, 3);
	round(RL, RR, 13, 2);
	round(RR, RL, 12, 1);

.L__dec_tail:
	round(RL, RR, 11, 3);
	round(RR, RL, 10, 2);
	round(RL, RR, 9, 1);
	round(RR, RL, 8, 3);
	round(RL, RR, 7, 2);
	round(RR, RL, 6, 1);
	round(RL, RR, 5, 3);
	round(RR, RL, 4, 2);
	round(RL, RR, 3, 1);
	round(RR, RL, 2, 3);
	round(RL, RR, 1, 2);
	round(RR, RL, 0, 1);
2012-08-28 14:24:49 +03:00
vmovdqa . L b s w a p _ m a s k , R K M ;
2012-07-11 19:37:37 +02:00
popq % r b x ;
2012-08-28 14:24:49 +03:00
popq % r b p ;
2012-07-11 19:37:37 +02:00
2012-10-20 15:06:56 +03:00
	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

ret;

.L__skip_dec:
	vpsrldq $4, RKR, RKR;
	jmp .L__dec_tail;
ENDPROC(__cast5_dec_blk16)

ENTRY(cast5_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_enc_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

ret;
ENDPROC(cast5_ecb_enc_16way)

ENTRY(cast5_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_dec_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

ret;
ENDPROC(cast5_ecb_dec_16way)

ENTRY(cast5_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	vmovdqu (0*16)(%rdx), RL1;
	vmovdqu (1*16)(%rdx), RR1;
	vmovdqu (2*16)(%rdx), RL2;
	vmovdqu (3*16)(%rdx), RR2;
	vmovdqu (4*16)(%rdx), RL3;
	vmovdqu (5*16)(%rdx), RR3;
	vmovdqu (6*16)(%rdx), RL4;
	vmovdqu (7*16)(%rdx), RR4;

	call __cast5_dec_blk16;

/* xor with src */
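	/* The vpshufd below zeroes the low qword of RX, so the first 64-bit
	 * block is left untouched here (its xor with the IV is expected to
	 * happen outside this routine); every other block is xored with the
	 * preceding ciphertext block. */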
	vmovq (%r12), RX;
	vpshufd $0x4f, RX, RX;
	vpxor RX, RR1, RR1;
	vpxor 0*16+8(%r12), RL1, RL1;
	vpxor 1*16+8(%r12), RR2, RR2;
	vpxor 2*16+8(%r12), RL2, RL2;
	vpxor 3*16+8(%r12), RR3, RR3;
	vpxor 4*16+8(%r12), RL3, RL3;
	vpxor 5*16+8(%r12), RR4, RR4;
	vpxor 6*16+8(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r12;
ret;
ENDPROC(cast5_cbc_dec_16way)

ENTRY(cast5_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (big endian, 64 bit)
	 */
	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	vpcmpeqd RTMP, RTMP, RTMP;
	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
	vpcmpeqd RKR, RKR, RKR;
	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
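	/* Subtracting these negative constants below increments the 64-bit
	 * counters: +1 in the low lane for the first IV, then +2 in both
	 * lanes for each following step. */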
	vmovdqa .Lbswap_iv_mask, R1ST;
	vmovdqa .Lbswap128_mask, RKM;

/* load IV and byteswap */
	vmovq (%rcx), RX;
	vpshufb R1ST, RX, RX;

/* construct IVs */
	vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */

/* store last IV */
	vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
	vmovq RX, (%rcx);

	call __cast5_enc_blk16;

/* dst = src ^ iv */
	vpxor (0*16)(%r12), RR1, RR1;
	vpxor (1*16)(%r12), RL1, RL1;
	vpxor (2*16)(%r12), RR2, RR2;
	vpxor (3*16)(%r12), RL2, RL2;
	vpxor (4*16)(%r12), RR3, RR3;
	vpxor (5*16)(%r12), RL3, RL3;
	vpxor (6*16)(%r12), RR4, RR4;
	vpxor (7*16)(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r12;
ret;
ENDPROC(cast5_ctr_16way)