2009-01-18 08:28:34 +03:00
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *            Kahraman Akdemir
 *
2010-11-04 22:00:45 +03:00
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 *
2009-01-18 08:28:34 +03:00
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
# include < l i n u x / l i n k a g e . h >
2009-11-23 14:54:06 +03:00
# include < a s m / i n s t . h >
2009-01-18 08:28:34 +03:00
2010-11-29 03:35:39 +03:00
#ifdef __x86_64__
/*
 * 128-bit constants used by the 64-bit-only GCM code.
 *
 * These are read with movdqa and as SSE memory operands (for example
 * "movdqa SHUF_MASK(%rip), %xmm14" and "paddd ONE(%rip), ..."), both of
 * which fault on addresses that are not 16-byte aligned.  Every entry is
 * exactly 16 bytes, so aligning the first one aligns all of them.
 */
.data
.align 16
POLY:       .octa 0xC2000000000000000000000000000001
TWOONE:     .octa 0x00000001000000000000000000000001

/*
 * The order of these constants should not change.
 * More specifically, ALL_F should follow SHIFT_MASK,
 * and ZERO should follow ALL_F.
 */
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
MASK1:      .octa 0x0000000000000000ffffffffffffffff
MASK2:      .octa 0xffffffffffffffff0000000000000000
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
ZERO:       .octa 0x00000000000000000000000000000000
ONE:        .octa 0x00000000000000000000000000000001
/* Written with 31 hex digits, so the most significant nibble is 0. */
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
dec:        .octa 0x1
enc:        .octa 0x2

.text

#define	STACK_OFFSET	8*3

/*
 * Byte offsets, relative to %rsp, of the 16-byte slots that hold the
 * precomputed hash-key values.
 */
#define	HashKey		16*0	/* store HashKey <<1 mod poly here */
#define	HashKey_2	16*1	/* store HashKey^2 <<1 mod poly here */
#define	HashKey_3	16*2	/* store HashKey^3 <<1 mod poly here */
#define	HashKey_4	16*3	/* store HashKey^4 <<1 mod poly here */
/*
 * The _k slots store the XOR of the high 64 bits and the low 64 bits of
 * the corresponding HashKey^n <<1 mod poly (for Karatsuba purposes).
 */
#define	HashKey_k	16*4
#define	HashKey_2_k	16*5
#define	HashKey_3_k	16*6
#define	HashKey_4_k	16*7
#define	VARIABLE_OFFSET	16*8

/* Register names of the first six arguments; written as %argN in code. */
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
/* Arguments 7-10 are complete memory operands addressed through %r14. */
#define arg7 STACK_OFFSET+8(%r14)
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
#endif
2010-11-04 22:00:45 +03:00
2009-01-18 08:28:34 +03:00
/* XMM registers holding cipher state; STATE is an alias of STATE1. */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
/* XMM registers holding input blocks; IN is an alias of IN1. */
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
/* XMM registers for the round key and the IV. */
#define KEY	%xmm2
#define IV	%xmm3
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
2010-03-10 13:28:55 +03:00
/* Additional XMM register aliases (%xmm10-%xmm12). */
#define BSWAP_MASK	%xmm10
#define CTR		%xmm11
#define INC		%xmm12
2009-01-18 08:28:34 +03:00
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
/*
 * General-purpose register aliases, selected per architecture.
 * Several names are aliases of one another (UKEYP = OUTP, TKEYP = T1,
 * and on 32-bit OUTP = AREG), so those pairs cannot be live together.
 */
#ifdef __x86_64__
#define AREG		%rax
#define KEYP		%rdi
#define OUTP		%rsi
#define UKEYP		OUTP
#define INP		%rdx
#define LEN		%rcx
#define IVP		%r8
#define KLEN		%r9d
#define T1		%r10
#define TKEYP		T1
#define T2		%r11
#define TCTR_LOW	T2
#else
#define AREG		%eax
#define KEYP		%edi
#define OUTP		AREG
#define UKEYP		OUTP
#define INP		%edx
#define LEN		%esi
#define IVP		%ebp
#define KLEN		%ebx
#define T1		%ecx
#define TKEYP		T1
#endif
2009-01-18 08:28:34 +03:00
2010-11-04 22:00:45 +03:00
2010-11-29 03:35:39 +03:00
# ifdef _ _ x86 _ 6 4 _ _
2010-11-04 22:00:45 +03:00
/*
 * GHASH_MUL: implements Data*HashKey mod (128,127,126,121,0)
 *
 * Input:  GH and HK (128 bits each, bit-reflected)
 * Output: GH = GH * HK * x mod poly (i.e. >> 1)
 *
 * To compute GH = GH*HashKey mod poly, pass HK = HashKey<<1 mod poly;
 * GH*HK*x mod poly is then equivalent to GH*HashKey mod poly.
 *
 * HK is only read.  GH and TMP1-TMP5 are overwritten.
 * The 64-bit halves are named a1:a0 for GH and b1:b0 for HK below.
 */
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	/* Karatsuba multiply: three 64x64 carry-less products. */
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2	# $78 swaps the two 64-bit halves
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	PCLMULQDQ 0x11, \HK, \TMP1	# TMP1 = a1*b1
	PCLMULQDQ 0x00, \HK, \GH	# GH = a0*b0
	PCLMULQDQ 0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a0*b1)+(a1*b0)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the 256-bit product

	/* First phase of the reduction. */
	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# three copies of GH so that the
					# shifts below are independent
	pslld	  $31, \TMP2		# packed dword left shift << 31
	pslld	  $30, \TMP3		# packed dword left shift << 30
	pslld	  $25, \TMP4		# packed dword left shift << 25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	/* Second phase of the reduction. */
	movdqa	  \GH, \TMP2		# again three copies of GH for
					# independent shifts
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4
	psrld	  $1, \TMP2		# packed dword right shift >> 1
	psrld	  $2, \TMP3		# packed dword right shift >> 2
	psrld	  $7, \TMP4		# packed dword right shift >> 7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP5, \TMP2		# fold in the bits kept from phase one
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# the reduced result is in GH
.endm
/*
 * INITIAL_BLOCKS_DEC (decrypt variant)
 *
 * if a = number of total ciphertext bytes
 *    b = floor(a/16)
 *    num_initial_blocks = b mod 4
 * then this macro hashes the AAD, decrypts the first num_initial_blocks
 * blocks and applies GHASH to their ciphertext.  If %r13 is at least 64
 * it then also precomputes HashKey^2..^4 (and the Karatsuba "_k" values)
 * into the stack slots at HashKey*(%rsp) while decrypting four more
 * blocks.
 *
 * Operands, as used by the code below:
 *   num_initial_blocks, operation - only used to build unique label names
 *   i       - number of the xmm register that receives the AAD block;
 *             blocks are only processed for i = 5, 6 or 7
 *   i_seq   - digits of the xmm registers that hold the initial blocks
 *   TMP3    - read as the hash key (HashKey<<1 mod poly) and never
 *             loaded here, so the caller must have set it
 *   XMM0    - counter block, loaded from the address in arg5
 *   XMM1-4  - on exit hold the byte-swapped ciphertext of the four
 *             extra blocks, with XMMDst xored into XMM1
 *
 * Preconditions visible in the code:
 *   - aadLen (arg8) must be 4, 8, 12 or 16.  The AAD loop consumes four
 *     bytes per pass and only stops when the count reaches exactly zero.
 *   - %rsp must be 16-byte aligned (movdqa stores to HashKey*(%rsp)).
 *
 * Clobbered: %rax, %r10, %r11, %r12, %xmm14, %xmm5-%xmm8 depending on i,
 * and every TMP and XMM operand.  On exit %r11 is the data offset (bytes
 * processed).
 * %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified.
 */
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11		# keep aadLen for the padding loop
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1	# next 4 AAD bytes
	pslldq	   $12, \TMP1		# move them to the top dword
	psrldq	   $4, %xmm\i		# make room at the top
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i		# shift once per missing dword
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11	# initialise the data pointer offset as zero

	/* start AES for num_initial_blocks blocks */

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
.endr
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index	# Round 0 (key whitening)
.endr
.irpc index, \i_seq
	movaps	   0x10(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 1
.endr
.irpc index, \i_seq
	movaps	   0x20(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 2
.endr
.irpc index, \i_seq
	movaps	   0x30(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 3
.endr
.irpc index, \i_seq
	movaps	   0x40(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 4
.endr
.irpc index, \i_seq
	movaps	   0x50(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 5
.endr
.irpc index, \i_seq
	movaps	   0x60(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 6
.endr
.irpc index, \i_seq
	movaps	   0x70(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 7
.endr
.irpc index, \i_seq
	movaps	   0x80(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 8
.endr
.irpc index, \i_seq
	movaps	   0x90(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 9
.endr
.irpc index, \i_seq
	movaps	   0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index	# Round 10
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1	# TMP1 = ciphertext block
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	/* the plaintext for this initial block is now written back */
	add	   $16, %r11

	movdqa	   \TMP1, %xmm\index	# GHASH runs over the ciphertext
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index
	/* ciphertext is now prepared for the GHASH computation */
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	/* apply GHASH on num_initial_blocks blocks; the chain ends in %xmm8 */

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	/*
	 * NOTE(review): %r13 is set by the caller (not visible here) and
	 * is compared as a signed value - confirm it cannot exceed
	 * the signed 64-bit range.
	 */
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	/* fewer than 64: no need for precomputed values */
/*
 * Precomputations for HashKey, done in parallel with the AES rounds of
 * the next four blocks.  HashKey_i_k holds the XOR of the low and high
 * halves of HashKey_i.
 */
	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM1
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM2
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM3
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM4
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	pxor	   16*0(%arg1), \XMM1	# Round 0 for the four blocks
	pxor	   16*0(%arg1), \XMM2
	pxor	   16*0(%arg1), \XMM3
	pxor	   16*0(%arg1), \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^2<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_2(%rsp)
	/* HashKey_2 = HashKey^2<<1 (mod poly) */
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^3<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^4<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	movaps	   0xa0(%arg1), \TMP2
	AESENCLAST \TMP2, \XMM1		# Round 10
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	/*
	 * For each block: xor keystream with ciphertext, store the
	 * plaintext, then keep the ciphertext in XMMn for GHASH.
	 */
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)
	movdqa	   \TMP1, \XMM4
	add	   $64, %r11
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
	/* combine GHASHed value with the corresponding ciphertext */
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
/*
 * INITIAL_BLOCKS_ENC (encrypt variant)
 *
 * if a = number of total plaintext bytes
 *    b = floor(a/16)
 *    num_initial_blocks = b mod 4
 * then this macro hashes the AAD, encrypts the first num_initial_blocks
 * blocks and applies GHASH to the resulting ciphertext.  If %r13 is at
 * least 64 it then also precomputes HashKey^2..^4 (and the Karatsuba
 * "_k" values) into the stack slots at HashKey*(%rsp) while encrypting
 * four more blocks.
 *
 * Operands, as used by the code below:
 *   num_initial_blocks, operation - only used to build unique label names
 *   i       - number of the xmm register that receives the AAD block;
 *             blocks are only processed for i = 5, 6 or 7
 *   i_seq   - digits of the xmm registers that hold the initial blocks
 *   TMP3    - read as the hash key (HashKey<<1 mod poly) and never
 *             loaded here, so the caller must have set it
 *   XMM0    - counter block, loaded from the address in arg5
 *   XMM1-4  - on exit hold the byte-swapped ciphertext of the four
 *             extra blocks, with XMMDst xored into XMM1
 *
 * Preconditions visible in the code:
 *   - aadLen (arg8) must be 4, 8, 12 or 16.  The AAD loop consumes four
 *     bytes per pass and only stops when the count reaches exactly zero.
 *   - %rsp must be 16-byte aligned (movdqa stores to HashKey*(%rsp)).
 *
 * Clobbered: %rax, %r10, %r11, %r12, %xmm14, %xmm5-%xmm8 depending on i,
 * and every TMP and XMM operand.  On exit %r11 is the data offset (bytes
 * processed).
 * %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified.
 */
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	mov	   arg7, %r10		# %r10 = AAD
	mov	   arg8, %r12		# %r12 = aadLen
	mov	   %r12, %r11		# keep aadLen for the padding loop
	pxor	   %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation:
	movd	   (%r10), \TMP1	# next 4 AAD bytes
	pslldq	   $12, \TMP1		# move them to the top dword
	psrldq	   $4, %xmm\i		# make room at the top
	pxor	   \TMP1, %xmm\i
	add	   $4, %r10
	sub	   $4, %r12
	jne	   _get_AAD_loop\num_initial_blocks\operation
	cmp	   $16, %r11
	je	   _get_AAD_loop2_done\num_initial_blocks\operation
	mov	   $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
	psrldq	   $4, %xmm\i		# shift once per missing dword
	sub	   $4, %r12
	cmp	   %r11, %r12
	jne	   _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\i	# byte-reflect the AAD data

	xor	   %r11, %r11	# initialise the data pointer offset as zero

	/* start AES for num_initial_blocks blocks */

	mov	   %arg5, %rax		# %rax = *Y0
	movdqu	   (%rax), \XMM0	# XMM0 = Y0
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM0

.if (\i == 5) || (\i == 6) || (\i == 7)
.irpc index, \i_seq
	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, %xmm\index
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index	# perform a 16 byte swap
.endr
.irpc index, \i_seq
	pxor	   16*0(%arg1), %xmm\index	# Round 0 (key whitening)
.endr
.irpc index, \i_seq
	movaps	   0x10(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 1
.endr
.irpc index, \i_seq
	movaps	   0x20(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 2
.endr
.irpc index, \i_seq
	movaps	   0x30(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 3
.endr
.irpc index, \i_seq
	movaps	   0x40(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 4
.endr
.irpc index, \i_seq
	movaps	   0x50(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 5
.endr
.irpc index, \i_seq
	movaps	   0x60(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 6
.endr
.irpc index, \i_seq
	movaps	   0x70(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 7
.endr
.irpc index, \i_seq
	movaps	   0x80(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 8
.endr
.irpc index, \i_seq
	movaps	   0x90(%arg1), \TMP1
	AESENC	   \TMP1, %xmm\index	# Round 9
.endr
.irpc index, \i_seq
	movaps	   0xa0(%arg1), \TMP1
	AESENCLAST \TMP1, %xmm\index	# Round 10
.endr
.irpc index, \i_seq
	movdqu	   (%arg3 , %r11, 1), \TMP1	# TMP1 = plaintext block
	pxor	   \TMP1, %xmm\index
	movdqu	   %xmm\index, (%arg2 , %r11, 1)
	/* the ciphertext for this initial block is now written back */
	add	   $16, %r11

	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, %xmm\index
	/* ciphertext is now prepared for the GHASH computation */
.endr
.endif
	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	/* apply GHASH on num_initial_blocks blocks; the chain ends in %xmm8 */

.if \i == 5
	pxor	   %xmm5, %xmm6
	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 6
	pxor	   %xmm6, %xmm7
	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.elseif \i == 7
	pxor	   %xmm7, %xmm8
	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
	/*
	 * NOTE(review): %r13 is set by the caller (not visible here) and
	 * is compared as a signed value - confirm it cannot exceed
	 * the signed 64-bit range.
	 */
	cmp	   $64, %r13
	jl	_initial_blocks_done\num_initial_blocks\operation
	/* fewer than 64: no need for precomputed values */
/*
 * Precomputations for HashKey, done in parallel with the AES rounds of
 * the next four blocks.  HashKey_i_k holds the XOR of the low and high
 * halves of HashKey_i.
 */
	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM1
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM2
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM3
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap

	paddd	   ONE(%rip), \XMM0	# INCR Y0
	movdqa	   \XMM0, \XMM4
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

	pxor	   16*0(%arg1), \XMM1	# Round 0 for the four blocks
	pxor	   16*0(%arg1), \XMM2
	pxor	   16*0(%arg1), \XMM3
	pxor	   16*0(%arg1), \XMM4
	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1
	pxor	   \TMP3, \TMP1
	movdqa	   \TMP1, HashKey_k(%rsp)
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^2<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_2(%rsp)
	/* HashKey_2 = HashKey^2<<1 (mod poly) */
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_2_k(%rsp)
.irpc index, 1234 # do 4 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^3<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_3(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_3_k(%rsp)
.irpc index, 56789 # do next 5 rounds
	movaps	   0x10*\index(%arg1), \TMP1
	AESENC	   \TMP1, \XMM1
	AESENC	   \TMP1, \XMM2
	AESENC	   \TMP1, \XMM3
	AESENC	   \TMP1, \XMM4
.endr
	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
	/* TMP5 = HashKey^4<<1 (mod poly) */
	movdqa	   \TMP5, HashKey_4(%rsp)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqa	   \TMP1, HashKey_4_k(%rsp)
	movaps	   0xa0(%arg1), \TMP2
	AESENCLAST \TMP2, \XMM1		# Round 10
	AESENCLAST \TMP2, \XMM2
	AESENCLAST \TMP2, \XMM3
	AESENCLAST \TMP2, \XMM4
	/* xor the keystream with the four plaintext blocks ... */
	movdqu	   16*0(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM1
	movdqu	   16*1(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM2
	movdqu	   16*2(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM3
	movdqu	   16*3(%arg3 , %r11 , 1), \TMP1
	pxor	   \TMP1, \XMM4
	/* ... and write the ciphertext out */
	movdqu	   \XMM1, 16*0(%arg2 , %r11 , 1)
	movdqu	   \XMM2, 16*1(%arg2 , %r11 , 1)
	movdqu	   \XMM3, 16*2(%arg2 , %r11 , 1)
	movdqu	   \XMM4, 16*3(%arg2 , %r11 , 1)

	add	   $64, %r11
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM1	# perform a 16 byte swap
	pxor	   \XMMDst, \XMM1
	/* combine GHASHed value with the corresponding ciphertext */
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM2	# perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM3	# perform a 16 byte swap
	movdqa	   SHUF_MASK(%rip), %xmm14
	PSHUFB_XMM %xmm14, \XMM4	# perform a 16 byte swap

_initial_blocks_done\num_initial_blocks\operation:

.endm
/*
 * GHASH_4_ENCRYPT_4_PARALLEL_ENC
 *
 * Encrypts four counter blocks while hashing the four blocks that were
 * produced by the previous step; the AES rounds and the carry-less
 * multiplications are interleaved.
 *
 * On entry:
 *   XMM0        counter value; it is incremented by ONE four times here
 *   XMM1..XMM4  the four blocks to be hashed (already byte swapped)
 *   %arg1       round keys, read at offsets 0x00 .. 0xa0
 *   %arg3       input buffer,  read at  (%arg3,%r11) .. 48(%arg3,%r11)
 *   %arg2       output buffer, written at (%arg2,%r11) .. 48(%arg2,%r11)
 *   %r11        data offset; not modified by this macro
 *   HashKey, HashKey_2..4 and the matching *_k slots are read from (%rsp)
 *
 * On exit:
 *   XMM1..XMM4  the four new output blocks, byte swapped; the reduced hash
 *               of the previous four blocks has been XORed into XMM1
 *
 * Clobbers TMP1..TMP6, XMM5..XMM8 and %xmm15 (%xmm15 is hard coded as the
 * holder of SHUF_MASK, it is not one of the macro arguments).
 * The "operation" argument is not referenced in the body.
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	/* keep the blocks to be hashed; XMM1..XMM4 are reused for the counters */
	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15

	/* XMM5 * HashKey_4 (Karatsuba); counter setup is interleaved */
	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6		/* swap the two 64-bit halves */
	pxor	  \XMM5, \TMP6			/* TMP6 = a1+a0 in each half */
	paddd	  ONE(%rip), \XMM0		/* INCR CNT */
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		/* TMP4 = a1*b1 */
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		/* INCR CNT */
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		/* INCR CNT */
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		/* INCR CNT */
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		/* perform a 16 byte swap */
	PCLMULQDQ 0x00, \TMP5, \XMM5		/* XMM5 = a0*b0 */
	PSHUFB_XMM %xmm15, \XMM2		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM3		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM4		/* perform a 16 byte swap */

	/* XOR in the round key at offset 0 */
	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		/* TMP6 = (a1+a0)*(b1+b0) */

	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			/* Round 1 */
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			/* Round 2 */
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4

	/* XMM6 * HashKey_3 (Karatsuba) */
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		/* TMP1 = a1*b1 */
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 3 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		/* XMM6 = a0*b0 */
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 4 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 5 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4

	/* accumulate: high parts in TMP4, low parts in XMM5, middle in TMP6 */
	pxor	  \TMP1, \TMP4
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6

	/* XMM7 * HashKey_2 (Karatsuba) */
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		/* TMP1 = a1*b1 */
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 6 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		/* XMM7 = a0*b0 */
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 7 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 8 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4

	/* accumulate: high parts in TMP4, low parts in XMM5, middle in TMP6 */
	pxor	  \TMP1, \TMP4
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	/* XMM8 * HashKey (Karatsuba) */
	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		/* TMP1 = a1*b1 */
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 9 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		/* XMM8 = a0*b0 */
	movaps	  0xa0(%arg1), \TMP3
	AESENCLAST \TMP3, \XMM1			/* Round 10 */
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */

	/* XOR the encrypted counters with the input and store the result */
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			/* Ciphertext/Plaintext XOR EK */
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			/* Ciphertext/Plaintext XOR EK */
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			/* Ciphertext/Plaintext XOR EK */
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			/* Ciphertext/Plaintext XOR EK */
	movdqu	  \XMM1, (%arg2,%r11,1)		/* Write to the ciphertext buffer */
	movdqu	  \XMM2, 16(%arg2,%r11,1)	/* Write to the ciphertext buffer */
	movdqu	  \XMM3, 32(%arg2,%r11,1)	/* Write to the ciphertext buffer */
	movdqu	  \XMM4, 48(%arg2,%r11,1)	/* Write to the ciphertext buffer */

	/* byte swap the new blocks so they are ready to be hashed next time */
	PSHUFB_XMM %xmm15, \XMM1		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM2		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM3		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM4		/* perform a 16 byte swap */

	/* fold in the last product and finish the Karatsuba middle term */
	pxor	  \TMP4, \TMP1			/* TMP1 = sum of the high parts */
	pxor	  \XMM8, \XMM5			/* XMM5 = sum of the low parts */
	pxor	  \TMP6, \TMP2			/* TMP2 = sum of the middle products */
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2			/* TMP2 = middle term */
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			/* left shift TMP3 2 DWs */
	psrldq	  $8, \TMP2			/* right shift TMP2 2 DWs */
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			/* accumulate the results in TMP1:XMM5 */

	/* first phase of reduction */
	/* three copies of XMM5 so the shifts can be done independently */
	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
	pslld	  $31, \TMP2			/* each dword << 31 */
	pslld	  $30, \TMP3			/* each dword << 30 */
	pslld	  $25, \TMP4			/* each dword << 25 */
	pxor	  \TMP3, \TMP2			/* xor the shifted versions */
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			/* right shift TMP5 1 DW */
	pslldq	  $12, \TMP2			/* left shift TMP2 3 DWs */
	pxor	  \TMP2, \XMM5

	/* second phase of reduction */
	movdqa	  \XMM5, \TMP2			/* three copies of XMM5 again */
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
	psrld	  $1, \TMP2			/* each dword >> 1 */
	psrld	  $2, \TMP3			/* each dword >> 2 */
	psrld	  $7, \TMP4			/* each dword >> 7 */
	pxor	  \TMP3, \TMP2			/* xor the shifted versions */
	pxor	  \TMP4, \TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			/* XMM5 = reduced result */

	pxor	  \XMM5, \XMM1			/* carry the hash forward in XMM1 */
.endm
/*
 * GHASH_4_ENCRYPT_4_PARALLEL_DEC
 *
 * Decrypts four blocks while hashing the four ciphertext blocks handled by
 * the previous step; the AES rounds and the carry-less multiplications are
 * interleaved.
 *
 * On entry:
 *   XMM0        counter value; it is incremented by ONE four times here
 *   XMM1..XMM4  the four blocks to be hashed (already byte swapped)
 *   %arg1       round keys, read at offsets 0x00 .. 0xa0
 *   %arg3       input buffer,  read at  (%arg3,%r11) .. 48(%arg3,%r11)
 *   %arg2       output buffer, written at (%arg2,%r11) .. 48(%arg2,%r11)
 *   %r11        data offset; not modified by this macro
 *   HashKey, HashKey_2..4 and the matching *_k slots are read from (%rsp)
 *
 * On exit:
 *   XMM1..XMM4  the four input blocks just consumed (not the output), byte
 *               swapped; the reduced hash of the previous four blocks has
 *               been XORed into XMM1
 *
 * Clobbers TMP1..TMP6, XMM5..XMM8 and %xmm15 (%xmm15 is hard coded as the
 * holder of SHUF_MASK, it is not one of the macro arguments).
 * The "operation" argument is not referenced in the body.
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	/* keep the blocks to be hashed; XMM1..XMM4 are reused for the counters */
	movdqa	  \XMM1, \XMM5
	movdqa	  \XMM2, \XMM6
	movdqa	  \XMM3, \XMM7
	movdqa	  \XMM4, \XMM8

	movdqa	  SHUF_MASK(%rip), %xmm15

	/* XMM5 * HashKey_4 (Karatsuba); counter setup is interleaved */
	movdqa	  \XMM5, \TMP4
	pshufd	  $78, \XMM5, \TMP6		/* swap the two 64-bit halves */
	pxor	  \XMM5, \TMP6			/* TMP6 = a1+a0 in each half */
	paddd	  ONE(%rip), \XMM0		/* INCR CNT */
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP4		/* TMP4 = a1*b1 */
	movdqa	  \XMM0, \XMM1
	paddd	  ONE(%rip), \XMM0		/* INCR CNT */
	movdqa	  \XMM0, \XMM2
	paddd	  ONE(%rip), \XMM0		/* INCR CNT */
	movdqa	  \XMM0, \XMM3
	paddd	  ONE(%rip), \XMM0		/* INCR CNT */
	movdqa	  \XMM0, \XMM4
	PSHUFB_XMM %xmm15, \XMM1		/* perform a 16 byte swap */
	PCLMULQDQ 0x00, \TMP5, \XMM5		/* XMM5 = a0*b0 */
	PSHUFB_XMM %xmm15, \XMM2		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM3		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM4		/* perform a 16 byte swap */

	/* XOR in the round key at offset 0 */
	pxor	  (%arg1), \XMM1
	pxor	  (%arg1), \XMM2
	pxor	  (%arg1), \XMM3
	pxor	  (%arg1), \XMM4
	movdqa	  HashKey_4_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP6		/* TMP6 = (a1+a0)*(b1+b0) */

	movaps	  0x10(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			/* Round 1 */
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4
	movaps	  0x20(%arg1), \TMP1
	AESENC	  \TMP1, \XMM1			/* Round 2 */
	AESENC	  \TMP1, \XMM2
	AESENC	  \TMP1, \XMM3
	AESENC	  \TMP1, \XMM4

	/* XMM6 * HashKey_3 (Karatsuba) */
	movdqa	  \XMM6, \TMP1
	pshufd	  $78, \XMM6, \TMP2
	pxor	  \XMM6, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		/* TMP1 = a1*b1 */
	movaps	  0x30(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 3 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM6		/* XMM6 = a0*b0 */
	movaps	  0x40(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 4 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_3_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */
	movaps	  0x50(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 5 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4

	/* accumulate: high parts in TMP4, low parts in XMM5, middle in TMP6 */
	pxor	  \TMP1, \TMP4
	pxor	  \XMM6, \XMM5
	pxor	  \TMP2, \TMP6

	/* XMM7 * HashKey_2 (Karatsuba) */
	movdqa	  \XMM7, \TMP1
	pshufd	  $78, \XMM7, \TMP2
	pxor	  \XMM7, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		/* TMP1 = a1*b1 */
	movaps	  0x60(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 6 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM7		/* XMM7 = a0*b0 */
	movaps	  0x70(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 7 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	movdqa	  HashKey_2_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */
	movaps	  0x80(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 8 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4

	/* accumulate: high parts in TMP4, low parts in XMM5, middle in TMP6 */
	pxor	  \TMP1, \TMP4
	pxor	  \XMM7, \XMM5
	pxor	  \TMP2, \TMP6

	/* XMM8 * HashKey (Karatsuba) */
	movdqa	  \XMM8, \TMP1
	pshufd	  $78, \XMM8, \TMP2
	pxor	  \XMM8, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		/* TMP1 = a1*b1 */
	movaps	  0x90(%arg1), \TMP3
	AESENC	  \TMP3, \XMM1			/* Round 9 */
	AESENC	  \TMP3, \XMM2
	AESENC	  \TMP3, \XMM3
	AESENC	  \TMP3, \XMM4
	PCLMULQDQ 0x00, \TMP5, \XMM8		/* XMM8 = a0*b0 */
	movaps	  0xa0(%arg1), \TMP3
	AESENCLAST \TMP3, \XMM1			/* Round 10 */
	AESENCLAST \TMP3, \XMM2
	AESENCLAST \TMP3, \XMM3
	AESENCLAST \TMP3, \XMM4
	movdqa	  HashKey_k(%rsp), \TMP5
	PCLMULQDQ 0x00, \TMP5, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */

	/*
	 * For each block: XOR the encrypted counter with the input, store
	 * the result, then keep the input block itself in XMMn because that
	 * is what gets hashed on the next pass.
	 */
	movdqu	  (%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM1			/* Ciphertext/Plaintext XOR EK */
	movdqu	  \XMM1, (%arg2,%r11,1)		/* Write to plaintext buffer */
	movdqa	  \TMP3, \XMM1
	movdqu	  16(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM2			/* Ciphertext/Plaintext XOR EK */
	movdqu	  \XMM2, 16(%arg2,%r11,1)	/* Write to plaintext buffer */
	movdqa	  \TMP3, \XMM2
	movdqu	  32(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM3			/* Ciphertext/Plaintext XOR EK */
	movdqu	  \XMM3, 32(%arg2,%r11,1)	/* Write to plaintext buffer */
	movdqa	  \TMP3, \XMM3
	movdqu	  48(%arg3,%r11,1), \TMP3
	pxor	  \TMP3, \XMM4			/* Ciphertext/Plaintext XOR EK */
	movdqu	  \XMM4, 48(%arg2,%r11,1)	/* Write to plaintext buffer */
	movdqa	  \TMP3, \XMM4

	/* byte swap the kept blocks so they are ready to be hashed next time */
	PSHUFB_XMM %xmm15, \XMM1		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM2		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM3		/* perform a 16 byte swap */
	PSHUFB_XMM %xmm15, \XMM4		/* perform a 16 byte swap */

	/* fold in the last product and finish the Karatsuba middle term */
	pxor	  \TMP4, \TMP1			/* TMP1 = sum of the high parts */
	pxor	  \XMM8, \XMM5			/* XMM5 = sum of the low parts */
	pxor	  \TMP6, \TMP2			/* TMP2 = sum of the middle products */
	pxor	  \TMP1, \TMP2
	pxor	  \XMM5, \TMP2			/* TMP2 = middle term */
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3			/* left shift TMP3 2 DWs */
	psrldq	  $8, \TMP2			/* right shift TMP2 2 DWs */
	pxor	  \TMP3, \XMM5
	pxor	  \TMP2, \TMP1			/* accumulate the results in TMP1:XMM5 */

	/* first phase of reduction */
	/* three copies of XMM5 so the shifts can be done independently */
	movdqa	  \XMM5, \TMP2
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
	pslld	  $31, \TMP2			/* each dword << 31 */
	pslld	  $30, \TMP3			/* each dword << 30 */
	pslld	  $25, \TMP4			/* each dword << 25 */
	pxor	  \TMP3, \TMP2			/* xor the shifted versions */
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5			/* right shift TMP5 1 DW */
	pslldq	  $12, \TMP2			/* left shift TMP2 3 DWs */
	pxor	  \TMP2, \XMM5

	/* second phase of reduction */
	movdqa	  \XMM5, \TMP2			/* three copies of XMM5 again */
	movdqa	  \XMM5, \TMP3
	movdqa	  \XMM5, \TMP4
	psrld	  $1, \TMP2			/* each dword >> 1 */
	psrld	  $2, \TMP3			/* each dword >> 2 */
	psrld	  $7, \TMP4			/* each dword >> 7 */
	pxor	  \TMP3, \TMP2			/* xor the shifted versions */
	pxor	  \TMP4, \TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \XMM5
	pxor	  \TMP1, \XMM5			/* XMM5 = reduced result */

	pxor	  \XMM5, \XMM1			/* carry the hash forward in XMM1 */
.endm
/*
 * GHASH_LAST_4 - hash the final four blocks.
 *
 * XMM1..XMM4 are multiplied by HashKey_4, HashKey_3, HashKey_2 and HashKey
 * respectively (all read from the stack frame at (%rsp)), the four products
 * are summed and the sum is reduced.
 *
 * Result: XMMDst holds the reduced value.
 * Clobbers TMP1..TMP7 and XMM1..XMM4.
 */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	/* XMM1 * HashKey_4 (Karatsuba); this product seeds the accumulators */
	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2		/* swap the two 64-bit halves */
	pxor	  \XMM1, \TMP2			/* TMP2 = a1+a0 in each half */
	movdqa	  HashKey_4(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP6		/* TMP6 = a1*b1 */
	PCLMULQDQ 0x00, \TMP5, \XMM1		/* XMM1 = a0*b0 */
	movdqa	  HashKey_4_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1			/* high in TMP6, low in XMMDst, middle in XMM1 */

	/* XMM2 * HashKey_3 (Karatsuba) */
	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2
	movdqa	  HashKey_3(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		/* TMP1 = a1*b1 */
	PCLMULQDQ 0x00, \TMP5, \XMM2		/* XMM2 = a0*b0 */
	movdqa	  HashKey_3_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1			/* accumulated in TMP6, XMMDst, XMM1 */

	/* XMM3 * HashKey_2 (Karatsuba) */
	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2
	movdqa	  HashKey_2(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		/* TMP1 = a1*b1 */
	PCLMULQDQ 0x00, \TMP5, \XMM3		/* XMM3 = a0*b0 */
	movdqa	  HashKey_2_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1			/* accumulated in TMP6, XMMDst, XMM1 */

	/* XMM4 * HashKey (Karatsuba) */
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2
	movdqa	  HashKey(%rsp), \TMP5
	PCLMULQDQ 0x11, \TMP5, \TMP1		/* TMP1 = a1*b1 */
	PCLMULQDQ 0x00, \TMP5, \XMM4		/* XMM4 = a0*b0 */
	movdqa	  HashKey_k(%rsp), \TMP4
	PCLMULQDQ 0x00, \TMP4, \TMP2		/* TMP2 = (a1+a0)*(b1+b0) */
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2			/* TMP2 = sum of the middle products */

	/* middle term = middle products + high sum + low sum, as in Karatsuba */
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4			/* left shift TMP4 2 DWs */
	psrldq	  $8, \TMP2			/* right shift TMP2 2 DWs */
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
	/* TMP6:XMMDst now holds the sum of the four carry-less products */

	/* first phase of the reduction */
	/* three copies of XMMDst so the shifts can be done independently */
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	pslld	  $31, \TMP2			/* each dword << 31 */
	pslld	  $30, \TMP3			/* each dword << 30 */
	pslld	  $25, \TMP4			/* each dword << 25 */
	pxor	  \TMP3, \TMP2			/* xor the shifted versions */
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP7
	psrldq	  $4, \TMP7			/* right shift TMP7 1 DW */
	pslldq	  $12, \TMP2			/* left shift TMP2 3 DWs */
	pxor	  \TMP2, \XMMDst

	/* second phase of the reduction */
	/* three copies of XMMDst again, one per shift */
	movdqa	  \XMMDst, \TMP2
	movdqa	  \XMMDst, \TMP3
	movdqa	  \XMMDst, \TMP4
	psrld	  $1, \TMP2			/* each dword >> 1 */
	psrld	  $2, \TMP3			/* each dword >> 2 */
	psrld	  $7, \TMP4			/* each dword >> 7 */
	pxor	  \TMP3, \TMP2			/* xor the shifted versions */
	pxor	  \TMP4, \TMP2
	pxor	  \TMP7, \TMP2
	pxor	  \TMP2, \XMMDst
	pxor	  \TMP6, \XMMDst		/* reduced result is in XMMDst */
.endm
/*
 * ENCRYPT_SINGLE_BLOCK - run one block through the AES rounds.
 *
 * XMM0  block to encrypt; replaced by the result
 * TMP1  scratch register, receives each round key in turn
 *
 * The round keys are read from %arg1: offset 0 is XORed in first, offsets
 * 16 .. 144 feed nine AESENC rounds and offset 160 feeds AESENCLAST.
 */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor	(%arg1), \XMM0
.irpc	index, 123456789			/* nine AESENC rounds */
	movaps	16*\index(%arg1), \TMP1
	AESENC	\TMP1, \XMM0
.endr
	movaps	160(%arg1), \TMP1
	AESENCLAST \TMP1, \XMM0
.endm
/ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* void a e s n i _ g c m _ d e c ( v o i d * a e s _ c t x , / / A E S K e y s c h e d u l e . S t a r t s o n a 1 6 b y t e b o u n d a r y .
*                      u8 *out,           // Plaintext output. Decrypt in-place is allowed.
* const u 8 * i n , / / C i p h e r t e x t i n p u t
* u6 4 p l a i n t e x t _ l e n , / / L e n g t h o f d a t a i n b y t e s f o r d e c r y p t i o n .
* u8 * i v , / / P r e - c o u n t e r b l o c k j 0 : 4 b y t e s a l t ( f r o m S e c u r i t y A s s o c i a t i o n )
* / / concatenated w i t h 8 b y t e I n i t i a l i s a t i o n V e c t o r ( f r o m I P S e c E S P P a y l o a d )
* / / concatenated w i t h 0 x00 0 0 0 0 0 1 . 1 6 - b y t e a l i g n e d p o i n t e r .
* u8 * h a s h _ s u b k e y , / / H , t h e H a s h s u b k e y i n p u t . D a t a s t a r t s o n a 1 6 - b y t e b o u n d a r y .
* const u 8 * a a d , / / A d d i t i o n a l A u t h e n t i c a t i o n D a t a ( A A D )
* u6 4 a a d _ l e n , / / L e n g t h o f A A D i n b y t e s . W i t h R F C 4 1 0 6 t h i s i s g o i n g t o b e 8 o r 1 2 b y t e s
* u8 * a u t h _ t a g , / / A u t h e n t i c a t e d T a g o u t p u t . T h e d r i v e r w i l l c o m p a r e t h i s t o t h e
* / / given a u t h e n t i c a t i o n t a g a n d o n l y r e t u r n t h e p l a i n t e x t i f t h e y m a t c h .
* u6 4 a u t h _ t a g _ l e n ) ; // Authenticated Tag Length in bytes. Valid values are 16
* / / ( most l i k e l y ) , 1 2 o r 8 .
*
* Assumptions :
*
* keys :
* keys a r e p r e - e x p a n d e d a n d a l i g n e d t o 1 6 b y t e s . w e a r e u s i n g t h e f i r s t
* set o f 1 1 k e y s i n t h e d a t a s t r u c t u r e v o i d * a e s _ c t x
*
* iv :
* 0 1 2 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +
* | Salt ( F r o m t h e S A ) |
* + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +
* | Initialization V e c t o r |
* | ( This i s t h e s e q u e n c e n u m b e r f r o m I P S e c h e a d e r ) |
* + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +
* | 0 x1 |
* + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +
*
*
*
* AAD :
* AAD p a d d e d t o 1 2 8 b i t s w i t h 0
* for e x a m p l e , a s s u m e A A D i s a u 3 2 v e c t o r
*
* if A A D i s 8 b y t e s :
* AAD[ 3 ] = { A 0 , A 1 } ;
* padded A A D i n x m m r e g i s t e r = { A 1 A 0 0 0 }
*
* 0 1 2 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +
* | SPI ( A 1 ) |
* + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +
* | 3 2 - bit S e q u e n c e N u m b e r ( A 0 ) |
* + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +
* | 0 x0 |
* + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +
*
* AAD F o r m a t w i t h 3 2 - b i t S e q u e n c e N u m b e r
*
* if A A D i s 1 2 b y t e s :
* AAD[ 3 ] = { A 0 , A 1 , A 2 } ;
* padded A A D i n x m m r e g i s t e r = { A 2 A 1 A 0 0 }
*
*        0                   1                   2                   3
*        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                           SPI (A2)                            |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |            64-bit Extended Sequence Number {A1,A0}            |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* AAD F o r m a t w i t h 6 4 - b i t E x t e n d e d S e q u e n c e N u m b e r
*
* aadLen :
* from t h e d e f i n i t i o n o f t h e s p e c , a a d L e n c a n o n l y b e 8 o r 1 2 b y t e s .
* The c o d e s u p p o r t s 1 6 t o o b u t f o r o t h e r s i z e s , t h e c o d e w i l l f a i l .
*
* TLen :
* from t h e d e f i n i t i o n o f t h e s p e c , T L e n c a n o n l y b e 8 , 1 2 o r 1 6 b y t e s .
* For o t h e r s i z e s , t h e c o d e w i l l f a i l .
*
* poly = x ^ 1 2 8 + x ^ 1 2 7 + x ^ 1 2 6 + x ^ 1 2 1 + 1
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * /
ENTRY( a e s n i _ g c m _ d e c )
push % r12
push % r13
push % r14
mov % r s p , % r14
/ *
* states o f % x m m r e g i s t e r s % x m m 6 : % x m m 1 5 n o t s a v e d
* all % x m m r e g i s t e r s a r e c l o b b e r e d
* /
sub $ V A R I A B L E _ O F F S E T , % r s p
and $ ~ 6 3 , % r s p # a l i g n r s p t o 64 b y t e s
mov % a r g 6 , % r12
movdqu ( % r12 ) , % x m m 1 3 # % x m m 13 = H a s h K e y
2010-12-13 14:51:15 +03:00
movdqa S H U F _ M A S K ( % r i p ) , % x m m 2
PSHUFB_ X M M % x m m 2 , % x m m 1 3
2010-11-04 22:00:45 +03:00
# Precompute H a s h K e y < < 1 ( m o d p o l y ) f r o m t h e h a s h k e y ( r e q u i r e d f o r G H A S H )
movdqa % x m m 1 3 , % x m m 2
psllq $ 1 , % x m m 1 3
psrlq $ 6 3 , % x m m 2
movdqa % x m m 2 , % x m m 1
pslldq $ 8 , % x m m 2
psrldq $ 8 , % x m m 1
por % x m m 2 , % x m m 1 3
# Reduction
pshufd $ 0 x24 , % x m m 1 , % x m m 2
pcmpeqd T W O O N E ( % r i p ) , % x m m 2
pand P O L Y ( % r i p ) , % x m m 2
pxor % x m m 2 , % x m m 1 3 # % x m m 13 h o l d s t h e H a s h K e y < < 1 ( m o d p o l y )
# Decrypt f i r s t f e w b l o c k s
movdqa % x m m 1 3 , H a s h K e y ( % r s p ) # s t o r e H a s h K e y < < 1 ( m o d p o l y )
mov % a r g 4 , % r13 # s a v e t h e n u m b e r o f b y t e s o f p l a i n t e x t / c i p h e r t e x t
and $ - 1 6 , % r13 # % r 13 = % r13 - ( % r13 m o d 1 6 )
mov % r13 , % r12
and $ ( 3 < < 4 ) , % r12
jz _ i n i t i a l _ n u m _ b l o c k s _ i s _ 0 _ d e c r y p t
cmp $ ( 2 < < 4 ) , % r12
jb _ i n i t i a l _ n u m _ b l o c k s _ i s _ 1 _ d e c r y p t
je _ i n i t i a l _ n u m _ b l o c k s _ i s _ 2 _ d e c r y p t
_initial_num_blocks_is_3_decrypt :
2010-12-13 14:51:15 +03:00
INITIAL_ B L O C K S _ D E C 3 , % x m m 9 , % x m m 1 0 , % x m m 1 3 , % x m m 1 1 , % x m m 1 2 , % x m m 0 , \
2010-11-04 22:00:45 +03:00
% xmm1 , % x m m 2 , % x m m 3 , % x m m 4 , % x m m 8 , % x m m 5 , % x m m 6 , 5 , 6 7 8 , d e c
sub $ 4 8 , % r13
jmp _ i n i t i a l _ b l o c k s _ d e c r y p t e d
_initial_num_blocks_is_2_decrypt :
2010-12-13 14:51:15 +03:00
INITIAL_ B L O C K S _ D E C 2 , % x m m 9 , % x m m 1 0 , % x m m 1 3 , % x m m 1 1 , % x m m 1 2 , % x m m 0 , \
2010-11-04 22:00:45 +03:00
% xmm1 , % x m m 2 , % x m m 3 , % x m m 4 , % x m m 8 , % x m m 5 , % x m m 6 , 6 , 7 8 , d e c
sub $ 3 2 , % r13
jmp _ i n i t i a l _ b l o c k s _ d e c r y p t e d
_initial_num_blocks_is_1_decrypt :
2010-12-13 14:51:15 +03:00
INITIAL_ B L O C K S _ D E C 1 , % x m m 9 , % x m m 1 0 , % x m m 1 3 , % x m m 1 1 , % x m m 1 2 , % x m m 0 , \
2010-11-04 22:00:45 +03:00
% xmm1 , % x m m 2 , % x m m 3 , % x m m 4 , % x m m 8 , % x m m 5 , % x m m 6 , 7 , 8 , d e c
sub $ 1 6 , % r13
jmp _ i n i t i a l _ b l o c k s _ d e c r y p t e d
_initial_num_blocks_is_0_decrypt :
2010-12-13 14:51:15 +03:00
INITIAL_ B L O C K S _ D E C 0 , % x m m 9 , % x m m 1 0 , % x m m 1 3 , % x m m 1 1 , % x m m 1 2 , % x m m 0 , \
2010-11-04 22:00:45 +03:00
% xmm1 , % x m m 2 , % x m m 3 , % x m m 4 , % x m m 8 , % x m m 5 , % x m m 6 , 8 , 0 , d e c
_initial_blocks_decrypted :
cmp $ 0 , % r13
je _ z e r o _ c i p h e r _ l e f t _ d e c r y p t
sub $ 6 4 , % r13
je _ f o u r _ c i p h e r _ l e f t _ d e c r y p t
_decrypt_by_4 :
2010-12-13 14:51:15 +03:00
GHASH_ 4 _ E N C R Y P T _ 4 _ P A R A L L E L _ D E C % x m m 9 , % x m m 1 0 , % x m m 1 1 , % x m m 1 2 , % x m m 1 3 , \
2010-11-04 22:00:45 +03:00
% xmm1 4 , % x m m 0 , % x m m 1 , % x m m 2 , % x m m 3 , % x m m 4 , % x m m 5 , % x m m 6 , % x m m 7 , % x m m 8 , d e c
add $ 6 4 , % r11
sub $ 6 4 , % r13
jne _ d e c r y p t _ b y _ 4
_four_cipher_left_decrypt :
GHASH_ L A S T _ 4 % x m m 9 , % x m m 1 0 , % x m m 1 1 , % x m m 1 2 , % x m m 1 3 , % x m m 1 4 , \
% xmm1 5 , % x m m 1 , % x m m 2 , % x m m 3 , % x m m 4 , % x m m 8
_zero_cipher_left_decrypt :
mov % a r g 4 , % r13
and $ 1 5 , % r13 # % r 13 = a r g 4 ( m o d 1 6 )
je _ m u l t i p l e _ o f _ 1 6 _ b y t e s _ d e c r y p t
2011-03-17 22:24:16 +03:00
# Handle t h e l a s t < 1 6 b y t e b l o c k s e p a r a t e l y
2010-11-04 22:00:45 +03:00
paddd O N E ( % r i p ) , % x m m 0 # i n c r e m e n t C N T t o g e t Y n
2010-12-13 14:51:15 +03:00
movdqa S H U F _ M A S K ( % r i p ) , % x m m 1 0
PSHUFB_ X M M % x m m 1 0 , % x m m 0
2010-11-04 22:00:45 +03:00
ENCRYPT_ S I N G L E _ B L O C K % x m m 0 , % x m m 1 # E ( K , Y n )
sub $ 1 6 , % r11
add % r13 , % r11
2011-03-17 22:24:16 +03:00
movdqu ( % a r g 3 ,% r11 ,1 ) , % x m m 1 # r e c e i v e t h e l a s t < 16 b y t e b l o c k
2010-11-04 22:00:45 +03:00
lea S H I F T _ M A S K + 1 6 ( % r i p ) , % r12
sub % r13 , % r12
# adjust t h e s h u f f l e m a s k p o i n t e r t o b e a b l e t o s h i f t 1 6 - % r13 b y t e s
# ( % r1 3 i s t h e n u m b e r o f b y t e s i n p l a i n t e x t m o d 1 6 )
movdqu ( % r12 ) , % x m m 2 # g e t t h e a p p r o p r i a t e s h u f f l e m a s k
2010-12-13 14:51:15 +03:00
PSHUFB_ X M M % x m m 2 , % x m m 1 # r i g h t s h i f t 16 - % r13 b u t e s
2010-11-04 22:00:45 +03:00
movdqa % x m m 1 , % x m m 2
pxor % x m m 1 , % x m m 0 # C i p h e r t e x t X O R E ( K , Y n )
movdqu A L L _ F - S H I F T _ M A S K ( % r12 ) , % x m m 1
# get t h e a p p r o p r i a t e m a s k t o m a s k o u t t o p 1 6 - % r13 b y t e s o f % x m m 0
pand % x m m 1 , % x m m 0 # m a s k o u t t o p 16 - % r13 b y t e s o f % x m m 0
pand % x m m 1 , % x m m 2
2010-12-13 14:51:15 +03:00
movdqa S H U F _ M A S K ( % r i p ) , % x m m 1 0
PSHUFB_ X M M % x m m 1 0 ,% x m m 2
2010-11-04 22:00:45 +03:00
pxor % x m m 2 , % x m m 8
GHASH_ M U L % x m m 8 , % x m m 1 3 , % x m m 9 , % x m m 1 0 , % x m m 1 1 , % x m m 5 , % x m m 6
# GHASH c o m p u t a t i o n f o r t h e l a s t < 1 6 b y t e b l o c k
sub % r13 , % r11
add $ 1 6 , % r11
# output % r13 b y t e s
2010-12-13 14:51:15 +03:00
MOVQ_ R 6 4 _ X M M % x m m 0 , % r a x
2010-11-04 22:00:45 +03:00
cmp $ 8 , % r13
jle _ l e s s _ t h a n _ 8 _ b y t e s _ l e f t _ d e c r y p t
mov % r a x , ( % a r g 2 , % r11 , 1 )
add $ 8 , % r11
psrldq $ 8 , % x m m 0
2010-12-13 14:51:15 +03:00
MOVQ_ R 6 4 _ X M M % x m m 0 , % r a x
2010-11-04 22:00:45 +03:00
sub $ 8 , % r13
_less_than_8_bytes_left_decrypt :
mov % a l , ( % a r g 2 , % r11 , 1 )
add $ 1 , % r11
shr $ 8 , % r a x
sub $ 1 , % r13
jne _ l e s s _ t h a n _ 8 _ b y t e s _ l e f t _ d e c r y p t
_multiple_of_16_bytes_decrypt :
mov a r g 8 , % r12 # % r 13 = a a d L e n ( n u m b e r o f b y t e s )
shl $ 3 , % r12 # c o n v e r t i n t o n u m b e r o f b i t s
movd % r12 d , % x m m 1 5 # l e n ( A ) i n % x m m 15
shl $ 3 , % a r g 4 # l e n ( C ) i n b i t s ( * 128 )
2010-12-13 14:51:15 +03:00
MOVQ_ R 6 4 _ X M M % a r g 4 , % x m m 1
2010-11-04 22:00:45 +03:00
pslldq $ 8 , % x m m 1 5 # % x m m 15 = l e n ( A ) | | 0 x00 0 0 0 0 0 0 0 0 0 0 0 0 0 0
pxor % x m m 1 , % x m m 1 5 # % x m m 15 = l e n ( A ) | | l e n ( C )
pxor % x m m 1 5 , % x m m 8
GHASH_ M U L % x m m 8 , % x m m 1 3 , % x m m 9 , % x m m 1 0 , % x m m 1 1 , % x m m 5 , % x m m 6
# final G H A S H c o m p u t a t i o n
2010-12-13 14:51:15 +03:00
movdqa S H U F _ M A S K ( % r i p ) , % x m m 1 0
PSHUFB_ X M M % x m m 1 0 , % x m m 8
2010-11-04 22:00:45 +03:00
mov % a r g 5 , % r a x # % r a x = * Y 0
movdqu ( % r a x ) , % x m m 0 # % x m m 0 = Y 0
ENCRYPT_ S I N G L E _ B L O C K % x m m 0 , % x m m 1 # E ( K , Y 0 )
pxor % x m m 8 , % x m m 0
_return_T_decrypt :
mov a r g 9 , % r10 # % r 10 = a u t h T a g
mov a r g 1 0 , % r11 # % r 11 = a u t h _ t a g _ l e n
cmp $ 1 6 , % r11
je _ T _ 1 6 _ d e c r y p t
cmp $ 1 2 , % r11
je _ T _ 1 2 _ d e c r y p t
_T_8_decrypt :
2010-12-13 14:51:15 +03:00
MOVQ_ R 6 4 _ X M M % x m m 0 , % r a x
2010-11-04 22:00:45 +03:00
mov % r a x , ( % r10 )
jmp _ r e t u r n _ T _ d o n e _ d e c r y p t
_T_12_decrypt :
2010-12-13 14:51:15 +03:00
MOVQ_ R 6 4 _ X M M % x m m 0 , % r a x
2010-11-04 22:00:45 +03:00
mov % r a x , ( % r10 )
psrldq $ 8 , % x m m 0
movd % x m m 0 , % e a x
mov % e a x , 8 ( % r10 )
jmp _ r e t u r n _ T _ d o n e _ d e c r y p t
_T_16_decrypt :
movdqu % x m m 0 , ( % r10 )
_return_T_done_decrypt :
mov % r14 , % r s p
pop % r14
pop % r13
pop % r12
ret
/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,      // Additional Authentication Data (AAD)
*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*     0                   1                   2                   3
*     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*    |                      Salt (From the SA)                       |
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*    |                     Initialization Vector                     |
*    |        (This is the sequence number from IPSec header)        |
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*    |                              0x1                              |
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[2] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*     0                   1                   2                   3
*     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*    |                           SPI (A1)                            |
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*    |                  32-bit Sequence Number (A0)                  |
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*    |                              0x0                              |
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                     AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*     0                   1                   2                   3
*     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*    |                           SPI (A2)                            |
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*    |            64-bit Extended Sequence Number {A1,A0}            |
*    |                                                               |
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*    |                              0x0                              |
*    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                 AAD Format with 64-bit Extended Sequence Number
*
* aadLen:
*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
*       The code supports 16 too but for other sizes, the code will fail.
*
* TLen:
*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
*       For other sizes, the code will fail.
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
	/*
	 * Register roles as used in this body:
	 *   %r14   - caller's %rsp, restored before returning
	 *   %r13   - bytes still to process (whole blocks first, then len mod 16)
	 *   %r12   - scratch (block-count selector, mask pointer, aad_len)
	 *   %r11   - byte offset into in/out; presumably initialised inside
	 *            INITIAL_BLOCKS_ENC (not visible here) - TODO confirm
	 *   %xmm13 - HashKey<<1 (mod poly)
	 *   %xmm8  - running GHASH value
	 *   %xmm0  - counter block / final tag
	 */
	push	%r12
	push	%r13
	push	%r14
	mov	%rsp, %r14		# remember the incoming stack pointer
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp		# round the scratch area down to 64 bytes
	mov	%arg6, %r12		# %r12 = hash_subkey
	movdqu	(%r12), %xmm13
	movdqa	SHUF_MASK(%rip), %xmm2
	PSHUFB_XMM %xmm2, %xmm13

# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	%xmm13, %xmm2
	psllq	$1, %xmm13		# shift each 64-bit half left by one
	psrlq	$63, %xmm2		# isolate the bit shifted out of each half
	movdqa	%xmm2, %xmm1
	pslldq	$8, %xmm2		# carry of the low half -> bit 0 of the high half
	psrldq	$8, %xmm1		# carry of the high half, kept for the reduction
	por	%xmm2, %xmm13

	# reduce HashKey<<1

	pshufd	$0x24, %xmm1, %xmm2
	pcmpeqd	TWOONE(%rip), %xmm2
	pand	POLY(%rip), %xmm2
	pxor	%xmm2, %xmm13		# %xmm13 holds HashKey<<1 (mod poly)
	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
	mov	%arg4, %r13		# %r13 = plaintext_len
	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12

	# Encrypt first few blocks

	and	$(3<<4), %r12		# %r12 = (number of whole blocks mod 4) * 16
	jz	_initial_num_blocks_is_0_encrypt
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_encrypt
	je	_initial_num_blocks_is_2_encrypt
_initial_num_blocks_is_3_encrypt:
	INITIAL_BLOCKS_ENC	3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
	sub	$48, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_2_encrypt:
	INITIAL_BLOCKS_ENC	2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
	sub	$32, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_1_encrypt:
	INITIAL_BLOCKS_ENC	1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
	sub	$16, %r13
	jmp	_initial_blocks_encrypted
_initial_num_blocks_is_0_encrypt:
	INITIAL_BLOCKS_ENC	0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
_initial_blocks_encrypted:

	# Main loop - Encrypt remaining blocks

	cmp	$0, %r13
	je	_zero_cipher_left_encrypt
	sub	$64, %r13
	je	_four_cipher_left_encrypt
_encrypt_by_4_encrypt:
	GHASH_4_ENCRYPT_4_PARALLEL_ENC	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
	add	$64, %r11		# advance the data offset by four blocks
	sub	$64, %r13
	jne	_encrypt_by_4_encrypt
_four_cipher_left_encrypt:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_encrypt:
	mov	%arg4, %r13
	and	$15, %r13		# %r13 = arg4 (mod 16)
	je	_multiple_of_16_bytes_encrypt

	# Handle the last <16 Byte block separately

	paddd	ONE(%rip), %xmm0	# INCR CNT to get Yn
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	sub	$16, %r11
	add	%r13, %r11		# offset of the 16 bytes that end at the end of input
	/*
	 * NOTE(review): the load below fetches the 16 bytes that END at the
	 * end of the input. If plaintext_len < 16 (so %r11 was presumably 0
	 * before the adjustment above) it would start before `in` - confirm
	 * that callers guarantee readable memory there.
	 */
	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte block
	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (%r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	PSHUFB_XMM %xmm2, %xmm1		# shift right 16-r13 bytes
	pxor	%xmm1, %xmm0		# Plaintext XOR Encrypt(K, Yn)
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	pxor	%xmm0, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# GHASH computation for the last <16 byte block
	sub	%r13, %r11
	add	$16, %r11		# undo the offset adjustment made for the load

	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm0

	# shuffle xmm0 back to output as ciphertext

	# Output %r13 bytes
	MOVQ_R64_XMM %xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_encrypt
	mov	%rax, (%arg2, %r11, 1)	# more than 8 bytes left: store the low 8 at once
	add	$8, %r11
	psrldq	$8, %xmm0
	MOVQ_R64_XMM %xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_encrypt:
	mov	%al, (%arg2, %r11, 1)	# store the remaining bytes one at a time
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_encrypt
_multiple_of_16_bytes_encrypt:
	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	shl	$3, %arg4		# len(C) in bits (bytes * 8)
	MOVQ_R64_XMM %arg4, %xmm1
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	PSHUFB_XMM %xmm10, %xmm8	# perform a 16 byte swap

	mov	%arg5, %rax		# %rax  = *Y0
	movdqu	(%rax), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm15	# Encrypt(K, Y0)
	pxor	%xmm8, %xmm0		# tag = GHASH value XOR Encrypt(K, Y0)
_return_T_encrypt:
	mov	arg9, %r10		# %r10 = authTag
	mov	arg10, %r11		# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_encrypt
	cmp	$12, %r11
	je	_T_12_encrypt
	/* any length other than 16 or 12 falls through to the 8-byte store */
_T_8_encrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)
	jmp	_return_T_done_encrypt
_T_12_encrypt:
	MOVQ_R64_XMM %xmm0, %rax
	mov	%rax, (%r10)		# low 8 bytes of the tag
	psrldq	$8, %xmm0
	movd	%xmm0, %eax
	mov	%eax, 8(%r10)		# next 4 bytes of the tag
	jmp	_return_T_done_encrypt
_T_16_encrypt:
	movdqu	%xmm0, (%r10)
_return_T_done_encrypt:
	mov	%r14, %rsp		# drop the aligned scratch area
	pop	%r14
	pop	%r13
	pop	%r12
	ret
2010-12-13 14:51:15 +03:00
2010-11-29 03:35:39 +03:00
# endif
2010-11-04 22:00:45 +03:00
2009-01-18 08:28:34 +03:00
/*
 * _key_expansion_128 / _key_expansion_256a
 *
 * Derive one 16-byte round key from the previous one and append it to the
 * key schedule.
 *
 * In:    %xmm0 = previous round key
 *        %xmm1 = AESKEYGENASSIST result; its dword 3 is broadcast and used
 *        %xmm4 = dword 0 must be zero (aesni_set_key clears %xmm4 once;
 *                this routine leaves dword 0 unchanged)
 *        TKEYP = address where the new round key is stored
 * Out:   %xmm0 = new round key, also written to (TKEYP)
 *        TKEYP advanced by 0x10
 * Clobb: %xmm1, %xmm4
 */
_key_expansion_128:
_key_expansion_256a:
	pshufd	$0b11111111, %xmm1, %xmm1	# broadcast dword 3 of %xmm1
	shufps	$0b00010000, %xmm0, %xmm4	# %xmm4 = { 0, 0, k1, k0 }
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0			# each dword = XOR of itself and all lower dwords
	pxor	%xmm1, %xmm0			# mix in the broadcast key-assist dword
	movaps	%xmm0, (TKEYP)
	add	$0x10, TKEYP
	ret
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
.align 4
2009-01-18 08:28:34 +03:00
/*
 * _key_expansion_192a
 *
 * 192-bit key schedule step that emits two 16-byte round keys (32 bytes).
 *
 * In:    %xmm0 = key-state register updated like in _key_expansion_128
 *        %xmm2 = second key-state register (low two dwords are used)
 *        %xmm1 = AESKEYGENASSIST result; its dword 1 is broadcast and used
 *        %xmm4 = dword 0 must be zero (left unchanged here)
 *        TKEYP = address where the round keys are stored
 * Out:   (TKEYP)      = { old %xmm2 dwords 0-1, new %xmm0 dwords 0-1 }
 *        0x10(TKEYP)  = { new %xmm0 dwords 2-3, new %xmm2 dwords 0-1 }
 *        %xmm0, %xmm2 updated; TKEYP advanced by 0x20
 * Clobb: %xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
_key_expansion_192a:
	pshufd	$0b01010101, %xmm1, %xmm1	# broadcast dword 1 of %xmm1
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0			# each dword = XOR of itself and all lower dwords
	pxor	%xmm1, %xmm0

	movaps	%xmm2, %xmm5
	movaps	%xmm2, %xmm6			# keep the old %xmm2 for the first store
	pslldq	$4, %xmm5
	pshufd	$0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2

	movaps	%xmm0, %xmm1
	shufps	$0b01000100, %xmm0, %xmm6	# { old xmm2[0], old xmm2[1], xmm0[0], xmm0[1] }
	movaps	%xmm6, (TKEYP)
	shufps	$0b01001110, %xmm2, %xmm1	# { xmm0[2], xmm0[3], xmm2[0], xmm2[1] }
	movaps	%xmm1, 0x10(TKEYP)
	add	$0x20, TKEYP
	ret
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
.align 4
2009-01-18 08:28:34 +03:00
/*
 * _key_expansion_192b
 *
 * 192-bit key schedule step that emits one 16-byte round key.
 *
 * In:    %xmm0 = key-state register updated like in _key_expansion_128
 *        %xmm2 = second key-state register
 *        %xmm1 = AESKEYGENASSIST result; its dword 1 is broadcast and used
 *        %xmm4 = dword 0 must be zero (left unchanged here)
 *        TKEYP = address where the round key is stored
 * Out:   (TKEYP) = new %xmm0; %xmm2 updated; TKEYP advanced by 0x10
 * Clobb: %xmm1, %xmm3, %xmm4, %xmm5
 */
_key_expansion_192b:
	pshufd	$0b01010101, %xmm1, %xmm1	# broadcast dword 1 of %xmm1
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0			# each dword = XOR of itself and all lower dwords
	pxor	%xmm1, %xmm0

	movaps	%xmm2, %xmm5
	pslldq	$4, %xmm5
	pshufd	$0b11111111, %xmm0, %xmm3	# broadcast dword 3 of the new %xmm0
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2

	movaps	%xmm0, (TKEYP)
	add	$0x10, TKEYP
	ret
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
.align 4
2009-01-18 08:28:34 +03:00
/*
 * _key_expansion_256b
 *
 * Same transformation as _key_expansion_256a, but applied to %xmm2 and
 * using dword 2 of the AESKEYGENASSIST result.
 *
 * In:    %xmm2 = key-state register to update
 *        %xmm1 = AESKEYGENASSIST result; its dword 2 is broadcast and used
 *        %xmm4 = dword 0 must be zero (left unchanged here)
 *        TKEYP = address where the new round key is stored
 * Out:   %xmm2 = new round key, also written to (TKEYP)
 *        TKEYP advanced by 0x10
 * Clobb: %xmm1, %xmm4
 */
_key_expansion_256b:
	pshufd	$0b10101010, %xmm1, %xmm1	# broadcast dword 2 of %xmm1
	shufps	$0b00010000, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	shufps	$0b10001100, %xmm2, %xmm4
	pxor	%xmm4, %xmm2			# each dword = XOR of itself and all lower dwords
	pxor	%xmm1, %xmm2			# mix in the broadcast key-assist dword
	movaps	%xmm2, (TKEYP)
	add	$0x10, TKEYP
	ret
/ *
* int a e s n i _ s e t _ k e y ( s t r u c t c r y p t o _ a e s _ c t x * c t x , c o n s t u 8 * i n _ k e y ,
* unsigned i n t k e y _ l e n )
* /
ENTRY( a e s n i _ s e t _ k e y )
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
# ifndef _ _ x86 _ 6 4 _ _
pushl K E Y P
movl 8 ( % e s p ) , K E Y P # c t x
movl 1 2 ( % e s p ) , U K E Y P # i n _ k e y
movl 1 6 ( % e s p ) , % e d x # k e y _ l e n
# endif
movups ( U K E Y P ) , % x m m 0 # u s e r k e y ( f i r s t 16 b y t e s )
movaps % x m m 0 , ( K E Y P )
lea 0 x10 ( K E Y P ) , T K E Y P # k e y a d d r
movl % e d x , 4 8 0 ( K E Y P )
2009-01-18 08:28:34 +03:00
pxor % x m m 4 , % x m m 4 # x m m 4 i s a s s u m e d 0 i n _ k e y _ e x p a n s i o n _ x
cmp $ 2 4 , % d l
jb . L e n c _ k e y 1 2 8
je . L e n c _ k e y 1 9 2
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
movups 0 x10 ( U K E Y P ) , % x m m 2 # o t h e r u s e r k e y
movaps % x m m 2 , ( T K E Y P )
add $ 0 x10 , T K E Y P
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x1 % x m m 2 % x m m 1 # r o u n d 1
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x1 % x m m 0 % x m m 1
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 b
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x2 % x m m 2 % x m m 1 # r o u n d 2
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x2 % x m m 0 % x m m 1
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 b
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x4 % x m m 2 % x m m 1 # r o u n d 3
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x4 % x m m 0 % x m m 1
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 b
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x8 % x m m 2 % x m m 1 # r o u n d 4
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x8 % x m m 0 % x m m 1
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 b
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x10 % x m m 2 % x m m 1 # r o u n d 5
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x10 % x m m 0 % x m m 1
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 b
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x20 % x m m 2 % x m m 1 # r o u n d 6
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x20 % x m m 0 % x m m 1
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 b
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x40 % x m m 2 % x m m 1 # r o u n d 7
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 2 5 6 a
jmp . L d e c _ k e y
.Lenc_key192 :
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
movq 0 x10 ( U K E Y P ) , % x m m 2 # o t h e r u s e r k e y
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x1 % x m m 2 % x m m 1 # r o u n d 1
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 9 2 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x2 % x m m 2 % x m m 1 # r o u n d 2
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 9 2 b
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x4 % x m m 2 % x m m 1 # r o u n d 3
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 9 2 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x8 % x m m 2 % x m m 1 # r o u n d 4
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 9 2 b
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x10 % x m m 2 % x m m 1 # r o u n d 5
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 9 2 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x20 % x m m 2 % x m m 1 # r o u n d 6
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 9 2 b
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x40 % x m m 2 % x m m 1 # r o u n d 7
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 9 2 a
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x80 % x m m 2 % x m m 1 # r o u n d 8
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 9 2 b
jmp . L d e c _ k e y
.Lenc_key128 :
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x1 % x m m 0 % x m m 1 # r o u n d 1
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x2 % x m m 0 % x m m 1 # r o u n d 2
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x4 % x m m 0 % x m m 1 # r o u n d 3
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x8 % x m m 0 % x m m 1 # r o u n d 4
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x10 % x m m 0 % x m m 1 # r o u n d 5
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x20 % x m m 0 % x m m 1 # r o u n d 6
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x40 % x m m 0 % x m m 1 # r o u n d 7
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x80 % x m m 0 % x m m 1 # r o u n d 8
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x1 b % x m m 0 % x m m 1 # r o u n d 9
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
2009-11-23 14:54:06 +03:00
AESKEYGENASSIST 0 x36 % x m m 0 % x m m 1 # r o u n d 10
2009-01-18 08:28:34 +03:00
call _ k e y _ e x p a n s i o n _ 1 2 8
.Ldec_key :
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
sub $ 0 x10 , T K E Y P
movaps ( K E Y P ) , % x m m 0
movaps ( T K E Y P ) , % x m m 1
movaps % x m m 0 , 2 4 0 ( T K E Y P )
movaps % x m m 1 , 2 4 0 ( K E Y P )
add $ 0 x10 , K E Y P
lea 2 4 0 - 1 6 ( T K E Y P ) , U K E Y P
2009-01-18 08:28:34 +03:00
.align 4
.Ldec_key_loop :
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
movaps ( K E Y P ) , % x m m 0
2009-11-23 14:54:06 +03:00
AESIMC % x m m 0 % x m m 1
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
movaps % x m m 1 , ( U K E Y P )
add $ 0 x10 , K E Y P
sub $ 0 x10 , U K E Y P
cmp T K E Y P , K E Y P
2009-01-18 08:28:34 +03:00
jb . L d e c _ k e y _ l o o p
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
xor A R E G , A R E G
# ifndef _ _ x86 _ 6 4 _ _
popl K E Y P
# endif
2009-01-18 08:28:34 +03:00
ret
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypt the single 16-byte block at src and store the result at dst.
 * The key length is read from offset 480 of ctx and handed to
 * _aesni_enc1 in KLEN, together with the context pointer in KEYP.
 *
 * On 32-bit builds the three arguments are fetched from the stack and
 * KEYP/KLEN are saved and restored around the body.  On x86-64 no
 * argument loads are emitted; KEYP, OUTP and INP are presumably defined
 * as the argument registers (definitions are outside this section -
 * TODO confirm).
 */
ENTRY(aesni_enc)
#ifndef __x86_64__
	pushl	KEYP
	pushl	KLEN
	/* two saved registers plus the return address precede the arguments */
	movl	12(%esp), KEYP		# ctx
	movl	16(%esp), OUTP		# dst
	movl	20(%esp), INP		# src
#endif
	movups	(INP), STATE		# input block, unaligned load
	movl	480(KEYP), KLEN		# key length stored in the context
	call	_aesni_enc1
	movups	STATE, (OUTP)		# output block, unaligned store
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
#endif
	ret
/*
 * _aesni_enc1:		internal ABI
 *
 * Encrypt the one block held in STATE with the round keys found at KEYP.
 * Round keys are read with movaps at 16-byte steps, so the schedule has
 * to be 16-byte aligned.
 *
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length; below 24 takes the 10-round path,
 *			exactly 24 the 12-round path, otherwise 14 rounds
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
	.align 4
_aesni_enc1:
	movaps	(KEYP), KEY		# round key 0
	mov	KEYP, TKEYP
	pxor	KEY, STATE		# round 0
	add	$0x30, TKEYP		# bias for the shortest schedule
	cmp	$24, KLEN
	jb	.Lenc128
	lea	0x20(TKEYP), TKEYP	# lea leaves the cmp flags intact for je
	je	.Lenc192
	add	$0x20, TKEYP
	/* two additional rounds, reached only on the longest-key path */
	.irp	rk_off, -0x60, -0x50
	movaps	\rk_off(TKEYP), KEY
	AESENC	KEY STATE
	.endr
	.align 4
.Lenc192:
	/* two additional rounds shared by the 24-byte and longer keys */
	.irp	rk_off, -0x40, -0x30
	movaps	\rk_off(TKEYP), KEY
	AESENC	KEY STATE
	.endr
	.align 4
.Lenc128:
	/* nine full rounds common to every key length */
	.irp	rk_off, -0x20, -0x10, 0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps	\rk_off(TKEYP), KEY
	AESENC	KEY STATE
	.endr
	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# last round
	ret
/*
 * _aesni_enc4:		internal ABI
 *
 * Encrypt the four blocks held in STATE1..STATE4 with the round keys
 * found at KEYP.  Each round key is loaded once into KEY and applied to
 * all four states before the next one is fetched.
 *
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length; below 24 takes the 10-round path,
 *			exactly 24 the 12-round path, otherwise 14 rounds
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
	.align 4
_aesni_enc4:
	movaps	(KEYP), KEY		# round key 0
	mov	KEYP, TKEYP
	/* round 0 on all four states */
	.irp	blk, STATE1, STATE2, STATE3, STATE4
	pxor	KEY, \blk
	.endr
	add	$0x30, TKEYP		# bias for the shortest schedule
	cmp	$24, KLEN
	jb	.L4enc128
	lea	0x20(TKEYP), TKEYP	# lea leaves the cmp flags intact for je
	je	.L4enc192
	add	$0x20, TKEYP
	/* two additional rounds, reached only on the longest-key path */
	.irp	rk_off, -0x60, -0x50
	movaps	\rk_off(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	.endr
	/* no alignment padding is emitted in front of the inner labels here */
.L4enc192:
	/* two additional rounds shared by the 24-byte and longer keys */
	.irp	rk_off, -0x40, -0x30
	movaps	\rk_off(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	.endr
.L4enc128:
	/* nine full rounds common to every key length */
	.irp	rk_off, -0x20, -0x10, 0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps	\rk_off(TKEYP), KEY
	AESENC	KEY STATE1
	AESENC	KEY STATE2
	AESENC	KEY STATE3
	AESENC	KEY STATE4
	.endr
	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypt the single 16-byte block at src and store the result at dst.
 * The key length is read from offset 480 of ctx; KEYP is then advanced
 * by 240 bytes so that _aesni_dec1 walks the round keys stored at that
 * offset of the context instead of the ones at its start.
 *
 * On 32-bit builds the three arguments are fetched from the stack and
 * KEYP/KLEN are saved and restored around the body.  On x86-64 no
 * argument loads are emitted; KEYP, OUTP and INP are presumably defined
 * as the argument registers (definitions are outside this section -
 * TODO confirm).
 */
ENTRY(aesni_dec)
#ifndef __x86_64__
	pushl	KEYP
	pushl	KLEN
	/* two saved registers plus the return address precede the arguments */
	movl	12(%esp), KEYP		# ctx
	movl	16(%esp), OUTP		# dst
	movl	20(%esp), INP		# src
#endif
	movups	(INP), STATE		# input block, unaligned load
	mov	480(KEYP), KLEN		# key length, read before KEYP moves
	add	$240, KEYP		# switch to the decryption round keys
	call	_aesni_dec1
	movups	STATE, (OUTP)		# output block, unaligned store
#ifndef __x86_64__
	popl	KLEN
	popl	KEYP
#endif
	ret
/*
 * _aesni_dec1:		internal ABI
 *
 * Decrypt the one block held in STATE with the round keys found at KEYP.
 * Round keys are read with movaps at 16-byte steps, so the schedule has
 * to be 16-byte aligned.
 *
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length; below 24 takes the 10-round path,
 *			exactly 24 the 12-round path, otherwise 14 rounds
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
	.align 4
_aesni_dec1:
	movaps	(KEYP), KEY		# round key 0
	mov	KEYP, TKEYP
	pxor	KEY, STATE		# round 0
	add	$0x30, TKEYP		# bias for the shortest schedule
	cmp	$24, KLEN
	jb	.Ldec128
	lea	0x20(TKEYP), TKEYP	# lea leaves the cmp flags intact for je
	je	.Ldec192
	add	$0x20, TKEYP
	/* two additional rounds, reached only on the longest-key path */
	.irp	rk_off, -0x60, -0x50
	movaps	\rk_off(TKEYP), KEY
	AESDEC	KEY STATE
	.endr
	.align 4
.Ldec192:
	/* two additional rounds shared by the 24-byte and longer keys */
	.irp	rk_off, -0x40, -0x30
	movaps	\rk_off(TKEYP), KEY
	AESDEC	KEY STATE
	.endr
	.align 4
.Ldec128:
	/* nine full rounds common to every key length */
	.irp	rk_off, -0x20, -0x10, 0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps	\rk_off(TKEYP), KEY
	AESDEC	KEY STATE
	.endr
	movaps	0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# last round
	ret
/*
 * _aesni_dec4:		internal ABI
 *
 * Decrypt the four blocks held in STATE1..STATE4 with the round keys
 * found at KEYP.  Each round key is loaded once into KEY and applied to
 * all four states before the next one is fetched.
 *
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length; below 24 takes the 10-round path,
 *			exactly 24 the 12-round path, otherwise 14 rounds
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
	.align 4
_aesni_dec4:
	movaps	(KEYP), KEY		# round key 0
	mov	KEYP, TKEYP
	/* round 0 on all four states */
	.irp	blk, STATE1, STATE2, STATE3, STATE4
	pxor	KEY, \blk
	.endr
	add	$0x30, TKEYP		# bias for the shortest schedule
	cmp	$24, KLEN
	jb	.L4dec128
	lea	0x20(TKEYP), TKEYP	# lea leaves the cmp flags intact for je
	je	.L4dec192
	add	$0x20, TKEYP
	/* two additional rounds, reached only on the longest-key path */
	.irp	rk_off, -0x60, -0x50
	movaps	\rk_off(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	.endr
	.align 4
.L4dec192:
	/* two additional rounds shared by the 24-byte and longer keys */
	.irp	rk_off, -0x40, -0x30
	movaps	\rk_off(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	.endr
	.align 4
.L4dec128:
	/* nine full rounds common to every key length */
	.irp	rk_off, -0x20, -0x10, 0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps	\rk_off(TKEYP), KEY
	AESDEC	KEY STATE1
	AESDEC	KEY STATE2
	AESDEC	KEY STATE3
	AESDEC	KEY STATE4
	.endr
	movaps	0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
ENTRY( a e s n i _ e c b _ e n c )
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
# ifndef _ _ x86 _ 6 4 _ _
pushl L E N
pushl K E Y P
pushl K L E N
movl 1 6 ( % e s p ) , K E Y P
movl 2 0 ( % e s p ) , O U T P
movl 2 4 ( % e s p ) , I N P
movl 2 8 ( % e s p ) , L E N
# endif
2009-01-18 08:28:34 +03:00
test L E N , L E N # c h e c k l e n g t h
jz . L e c b _ e n c _ r e t
mov 4 8 0 ( K E Y P ) , K L E N
cmp $ 1 6 , L E N
jb . L e c b _ e n c _ r e t
cmp $ 6 4 , L E N
jb . L e c b _ e n c _ l o o p1
.align 4
.Lecb_enc_loop4 :
movups ( I N P ) , S T A T E 1
movups 0 x10 ( I N P ) , S T A T E 2
movups 0 x20 ( I N P ) , S T A T E 3
movups 0 x30 ( I N P ) , S T A T E 4
call _ a e s n i _ e n c4
movups S T A T E 1 , ( O U T P )
movups S T A T E 2 , 0 x10 ( O U T P )
movups S T A T E 3 , 0 x20 ( O U T P )
movups S T A T E 4 , 0 x30 ( O U T P )
sub $ 6 4 , L E N
add $ 6 4 , I N P
add $ 6 4 , O U T P
cmp $ 6 4 , L E N
jge . L e c b _ e n c _ l o o p4
cmp $ 1 6 , L E N
jb . L e c b _ e n c _ r e t
.align 4
.Lecb_enc_loop1 :
movups ( I N P ) , S T A T E 1
call _ a e s n i _ e n c1
movups S T A T E 1 , ( O U T P )
sub $ 1 6 , L E N
add $ 1 6 , I N P
add $ 1 6 , O U T P
cmp $ 1 6 , L E N
jge . L e c b _ e n c _ l o o p1
.Lecb_enc_ret :
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
# ifndef _ _ x86 _ 6 4 _ _
popl K L E N
popl K E Y P
popl L E N
# endif
2009-01-18 08:28:34 +03:00
ret
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypt whole 16-byte blocks from src into dst.
 *
 *	ctx:	AES context; the word at offset 480 is loaded into KLEN and
 *		KEYP is advanced by 240 bytes before decrypting (presumably
 *		to the decryption key schedule -- confirm against
 *		struct crypto_aes_ctx)
 *	dst:	output buffer, written with unaligned stores
 *	src:	input buffer, read with unaligned loads
 *	len:	byte count; a trailing partial block (len % 16) is not
 *		processed
 *
 * On 32-bit the arguments are fetched from the stack and the registers
 * backing LEN, KEYP and KLEN are saved and restored around the body.
 */
ENTRY(aesni_ecb_dec)
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 16(%esp), KEYP		/* ctx: 3 pushes + return address */
	movl 20(%esp), OUTP		/* dst */
	movl 24(%esp), INP		/* src */
	movl 28(%esp), LEN		/* len */
#endif
	test LEN, LEN			/* check length */
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP
	cmp $16, LEN
	jb .Lecb_dec_ret		/* less than one block: nothing to do */
	cmp $64, LEN
	jb .Lecb_dec_loop1		/* fewer than four blocks */
.align 4
.Lecb_dec_loop4:
	/* four blocks per iteration */
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	/*
	 * LEN is an unsigned byte count (size_t); use the unsigned condition
	 * so the test agrees with the jb checks above.
	 */
	jae .Lecb_dec_loop4
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	/* one block per iteration */
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1		/* unsigned: LEN is a size_t */
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	ret
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv);
 *
 * CBC-encrypt whole 16-byte blocks from src into dst.
 *
 *	ctx:	AES context; the word at offset 480 is loaded into KLEN
 *		(presumably the key length -- confirm against
 *		struct crypto_aes_ctx)
 *	dst:	output buffer, written with unaligned stores
 *	src:	input buffer, read with unaligned loads
 *	len:	byte count; a trailing partial block (len % 16) is not
 *		processed
 *	iv:	16-byte chaining value; read on entry and overwritten with
 *		the last ciphertext block produced. Left untouched when
 *		len < 16.
 *
 * On 32-bit the arguments are fetched from the stack and the registers
 * backing IVP, LEN, KEYP and KLEN are saved and restored around the body.
 */
ENTRY(aesni_cbc_enc)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP		/* ctx: 4 pushes + return address */
	movl 24(%esp), OUTP		/* dst */
	movl 28(%esp), INP		/* src */
	movl 32(%esp), LEN		/* len */
	movl 36(%esp), IVP		/* iv */
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret		/* less than one block: nothing to do */
	mov 480(KEYP), KLEN
	movups (IVP), STATE		/* load iv as initial state */
.align 4
.Lcbc_enc_loop:
	movups (INP), IN		/* load input */
	pxor IN, STATE			/* chain: plaintext ^ previous output */
	call _aesni_enc1
	movups STATE, (OUTP)		/* store output */
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	/*
	 * LEN is an unsigned byte count (size_t); use the unsigned condition
	 * so the test agrees with the jb check on entry.
	 */
	jae .Lcbc_enc_loop
	movups STATE, (IVP)		/* hand the chaining value back */
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv);
 *
 * CBC-decrypt whole 16-byte blocks from src into dst.
 *
 *	ctx:	AES context; the word at offset 480 is loaded into KLEN and
 *		KEYP is advanced by 240 bytes before decrypting (presumably
 *		to the decryption key schedule -- confirm against
 *		struct crypto_aes_ctx)
 *	dst:	output buffer, written with unaligned stores
 *	src:	input buffer, read with unaligned loads only; no alignment
 *		is required
 *	len:	byte count; a trailing partial block (len % 16) is not
 *		processed
 *	iv:	16-byte chaining value; read on entry and overwritten with
 *		the last ciphertext block consumed. Left untouched when
 *		len < 16.
 *
 * On 32-bit the arguments are fetched from the stack and the registers
 * backing IVP, LEN, KEYP and KLEN are saved and restored around the body.
 * The 32-bit path only uses IN1/IN2 as temporaries, so the first two
 * ciphertext blocks of each group of four are re-read from src after
 * decryption.
 */
ENTRY(aesni_cbc_dec)
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl 20(%esp), KEYP		/* ctx: 4 pushes + return address */
	movl 24(%esp), OUTP		/* dst */
	movl 28(%esp), INP		/* src */
	movl 32(%esp), LEN		/* len */
	movl 36(%esp), IVP		/* iv */
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret		/* less than one block: leave iv alone */
	mov 480(KEYP), KLEN
	add $240, KEYP
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1		/* fewer than four blocks */
.align 4
.Lcbc_dec_loop4:
	/* four blocks per iteration; keep the ciphertext for chaining */
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* only IN1/IN2 are used here: they now hold blocks 2 and 3 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4		/* IN1 == ciphertext block 2 */
	movaps IN2, IV			/* IN2 == ciphertext block 3 */
	/*
	 * Blocks 0 and 1 must be fetched again.  pxor with a memory operand
	 * requires a 16-byte aligned address, which src does not guarantee,
	 * so reload them with movups into IN1/IN2 (both dead at this point).
	 * This happens before the stores below, so dst == src still works.
	 */
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	/*
	 * LEN is an unsigned byte count (size_t); use the unsigned condition
	 * so the test agrees with the jb checks above.
	 */
	jae .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	/* one block per iteration */
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			/* this ciphertext chains into the next */
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1		/* unsigned: LEN is a size_t */
.Lcbc_dec_ret:
	movups IV, (IVP)		/* hand the chaining value back */
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	ret
2010-03-10 13:28:55 +03:00
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
# ifdef _ _ x86 _ 6 4 _ _
2010-03-10 13:28:55 +03:00
/*
 * Byte-reversal shuffle mask: 16 byte indices in descending order.
 * It is loaded with movaps in _aesni_inc_init, so it must stay 16-byte
 * aligned.
 */
.align 16
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8
	.byte 7, 6, 5, 4, 3, 2, 1, 0
/*
 * _aesni_inc_init:	internal ABI
 *	Prepare the registers that _aesni_inc works on.
 * input:
 *	IV:		counter block
 * output:
 *	CTR:		IV with its bytes reversed through .Lbswap_mask
 *			(little endian form of the counter)
 *	TCTR_LOW:	lower qword of CTR
 *	INC:		the constant 1, in little endian
 *	BSWAP_MASK:	endian swapping mask (.Lbswap_mask)
 */
.align 4
_aesni_inc_init:
	/* INC = 1; TCTR_LOW only serves as a scratch register here */
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC
	/* CTR = byte-swapped copy of IV */
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR
	/* mirror the low qword of CTR in a general purpose register */
	MOVQ_R64_XMM CTR TCTR_LOW
	ret
/*
 * _aesni_inc:		internal ABI
 *	Advance the big endian counter block in IV by one.
 * input (as set up by _aesni_inc_init):
 *	IV:		current counter block
 *	CTR:		IV in little endian
 *	TCTR_LOW:	lower qword of CTR
 *	INC:		the constant 1, in little endian
 *	BSWAP_MASK:	endian swapping mask
 * output:
 *	IV:		counter block + 1
 * changed:
 *	CTR:		output IV, in little endian
 *	TCTR_LOW:	lower qword of CTR
 *	INC is shifted temporarily on the carry path and restored before
 *	returning.
 */
.align 4
_aesni_inc:
	paddq INC, CTR			/* low qword += 1, no carry into high */
	add $1, TCTR_LOW		/* same add on the copy, to get CF */
	jnc .Linc_low
	/* low qword wrapped: add 1 to the high qword as well */
	pslldq $8, INC			/* move the 1 into the high qword */
	paddq INC, CTR
	psrldq $8, INC			/* and back again */
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	/* convert back to big endian */
	ret
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv);
 *
 * CTR-mode encrypt whole 16-byte blocks from src into dst: each block is
 * XORed with the encryption of the current counter block, and the counter
 * is advanced with _aesni_inc after each use.
 *
 *	ctx:	AES context; the word at offset 480 is loaded into KLEN
 *		(presumably the key length -- confirm against
 *		struct crypto_aes_ctx)
 *	dst:	output buffer, written with unaligned stores
 *	src:	input buffer, read with unaligned loads
 *	len:	byte count; a trailing partial block (len % 16) is not
 *		processed
 *	iv:	16-byte counter block; read on entry and overwritten with
 *		the next unused counter value. Left untouched when len < 16.
 *
 * This routine sits inside the __x86_64__-only section and takes its
 * arguments directly in the KEYP/OUTP/INP/LEN/IVP registers.
 */
ENTRY(aesni_ctr_enc)
	cmp $16, LEN
	jb .Lctr_enc_just_ret		/* less than one block: leave iv alone */
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1		/* fewer than four blocks */
.align 4
.Lctr_enc_loop4:
	/* four counter blocks per iteration, input loads interleaved */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	/*
	 * LEN is an unsigned byte count (size_t); use the unsigned condition
	 * so the test agrees with the jb checks above.
	 */
	jae .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	/* one block per iteration */
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1		/* unsigned: LEN is a size_t */
.Lctr_enc_ret:
	movups IV, (IVP)		/* hand the next counter value back */
.Lctr_enc_just_ret:
	ret
crypto: aesni-intel - Ported implementation to x86-32
The AES-NI instructions are also available in legacy mode so the 32-bit
architecture may profit from those, too.
To illustrate the performance gain here's a short summary of a dm-crypt
speed test on a Core i7 M620 running at 2.67GHz comparing both assembler
implementations:
x86: i568 aes-ni delta
ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4%
CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3%
LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5%
XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7%
Additionally, due to some minor optimizations, the 64-bit version also
got a minor performance gain as seen below:
x86-64: old impl. new impl. delta
ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5%
CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9%
LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6%
XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7%
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reviewed-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-11-27 11:34:46 +03:00
# endif