/*
 * Shared glue code for 128bit block ciphers, AVX assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu (0*16)(src), x0; \
	vmovdqu (1*16)(src), x1; \
	vmovdqu (2*16)(src), x2; \
	vmovdqu (3*16)(src), x3; \
	vmovdqu (4*16)(src), x4; \
	vmovdqu (5*16)(src), x5; \
	vmovdqu (6*16)(src), x6; \
	vmovdqu (7*16)(src), x7;

#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vmovdqu x0, (0*16)(dst); \
	vmovdqu x1, (1*16)(dst); \
	vmovdqu x2, (2*16)(dst); \
	vmovdqu x3, (3*16)(dst); \
	vmovdqu x4, (4*16)(dst); \
	vmovdqu x5, (5*16)(dst); \
	vmovdqu x6, (6*16)(dst); \
	vmovdqu x7, (7*16)(dst);

#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x1, x1; \
	vpxor (1*16)(src), x2, x2; \
	vpxor (2*16)(src), x3, x3; \
	vpxor (3*16)(src), x4, x4; \
	vpxor (4*16)(src), x5, x5; \
	vpxor (5*16)(src), x6, x6; \
	vpxor (6*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

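/*
 * Note on store_cbc_8way above: it is used on the CBC decryption path. Each
 * decrypted block xN (N >= 1) is xored with the ciphertext block that
 * precedes it in src; x0 is stored unmodified here, so its xor with the IV
 * (or with the ciphertext block preceding this batch) is assumed to be
 * handled by the calling glue code. Rough per-block C sketch:
 *
 *	dst[i] = decrypted[i] ^ src[i - 1];	// for i = 1..7
 */
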
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

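/*
 * Note on inc_le128 above: it increments a 128-bit little-endian value in x.
 * The minus_one operand is assumed to hold -1 in the low qword and 0 in the
 * high qword (as built at the top of load_ctr_8way below). Rough C sketch of
 * the same operation on the two 64-bit halves:
 *
 *	lo += 1;		// vpsubq: x - (-1) on the low qword only
 *	if (lo == 0)		// vpcmpeqq detected lo == ~0 before the add
 *		hi += 1;	// vpslldq/vpsubq apply the carry to the high qword
 */
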
#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
	vpcmpeqd t0, t0, t0; \
	vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
	vmovdqa bswap, t1; \
	\
	/* load IV and byteswap */ \
	vmovdqu (iv), x7; \
	vpshufb t1, x7, x0; \
	\
	/* construct IVs */ \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x1; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x2; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x3; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x4; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x5; \
	inc_le128(x7, t0, t2); \
	vpshufb t1, x7, x6; \
	inc_le128(x7, t0, t2); \
	vmovdqa x7, t2; \
	vpshufb t1, x7, x7; \
	inc_le128(t2, t0, t1); \
	vmovdqu t2, (iv);

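/*
 * Note on load_ctr_8way above: the counter at (iv) is assumed to be kept in
 * little-endian (le128) form by the calling glue code. Each of the eight
 * counter values is byteswapped to big-endian through the bswap mask before
 * being handed to the cipher as x0..x7, while the running counter itself is
 * advanced with inc_le128; the counter for the next batch is written back to
 * (iv). Per-block sketch, with bswap128() as a hypothetical helper:
 *
 *	x[i] = bswap128(ctr);
 *	ctr += 1;		// 128-bit little-endian increment
 */
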
#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(src), x0, x0; \
	vpxor (1*16)(src), x1, x1; \
	vpxor (2*16)(src), x2, x2; \
	vpxor (3*16)(src), x3, x3; \
	vpxor (4*16)(src), x4, x4; \
	vpxor (5*16)(src), x5, x5; \
	vpxor (6*16)(src), x6, x6; \
	vpxor (7*16)(src), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

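/*
 * Note on gf128mul_x_ble above: it multiplies the 128-bit XTS tweak in iv by
 * x in GF(2^128), i.e. shifts the tweak left by one bit and, if bit 127 was
 * set, xors the reduction constant 0x87 into the low byte. For this to work,
 * the mask operand is expected to hold 0x87 in dword 0 and 1 in dword 2 (the
 * xts_gf128mul_and_shl1_mask constant supplied by the cipher module). Rough
 * C sketch with lo/hi as the two 64-bit halves:
 *
 *	carry = hi >> 63;
 *	hi = (hi << 1) | (lo >> 63);
 *	lo = (lo << 1) ^ (carry ? 0x87 : 0);
 */
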
#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
		      t1, xts_gf128mul_and_shl1_mask) \
	vmovdqa xts_gf128mul_and_shl1_mask, t0; \
	\
	/* load IV */ \
	vmovdqu (iv), tiv; \
	vpxor (0*16)(src), tiv, x0; \
	vmovdqu tiv, (0*16)(dst); \
	\
	/* construct and store IVs, also xor with source */ \
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (1*16)(src), tiv, x1; \
	vmovdqu tiv, (1*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (2*16)(src), tiv, x2; \
	vmovdqu tiv, (2*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (3*16)(src), tiv, x3; \
	vmovdqu tiv, (3*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (4*16)(src), tiv, x4; \
	vmovdqu tiv, (4*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (5*16)(src), tiv, x5; \
	vmovdqu tiv, (5*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (6*16)(src), tiv, x6; \
	vmovdqu tiv, (6*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vpxor (7*16)(src), tiv, x7; \
	vmovdqu tiv, (7*16)(dst); \
	\
	gf128mul_x_ble(tiv, t0, t1); \
	vmovdqu tiv, (iv);

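/*
 * Note on load_xts_8way above: it reads the current XTS tweak from (iv),
 * derives the tweaks for the eight blocks by repeated gf128mul_x_ble, xors
 * each tweak into the matching source block (producing x0..x7 for the
 * cipher), and parks the tweaks in the destination buffer so that
 * store_xts_8way can apply the second tweak xor after the block cipher pass.
 * The tweak for the next batch is written back to (iv). Per-block sketch:
 *
 *	dst[i] = tweak;			// stashed for store_xts_8way
 *	x[i] = src[i] ^ tweak;
 *	tweak = gf128mul_x_ble(tweak);
 */
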
#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
	vpxor (0*16)(dst), x0, x0; \
	vpxor (1*16)(dst), x1, x1; \
	vpxor (2*16)(dst), x2, x2; \
	vpxor (3*16)(dst), x3, x3; \
	vpxor (4*16)(dst), x4, x4; \
	vpxor (5*16)(dst), x5, x5; \
	vpxor (6*16)(dst), x6, x6; \
	vpxor (7*16)(dst), x7, x7; \
	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
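
/*
 * Note on store_xts_8way above: the tweaks stashed at dst by load_xts_8way
 * are xored into the cipher output x0..x7, and the result then overwrites
 * dst via store_8way, completing the XTS xor-encrypt-xor sequence:
 *
 *	dst[i] = out[i] ^ tweak[i];	// tweak[i] was pre-stored at dst[i]
 */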