########################################################################
# Copyright (c) 2013, Intel Corporation
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
##
## Authors:
##      Erdinc Ozturk <erdinc.ozturk@intel.com>
##      Vinodh Gopal <vinodh.gopal@intel.com>
##      James Guilford <james.guilford@intel.com>
##      Tim Chen <tim.c.chen@linux.intel.com>
##
## References:
##      This code was derived and highly optimized from the code described in paper:
##              Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
##                      on Intel Architecture Processors. August, 2010
##      The details of the implementation are explained in:
##              Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
##                      on Intel Architecture Processors. October, 2012.
##
## Assumptions:
##
##
##
## iv:
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                        Salt  (From the SA)                    |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                     Initialization Vector                     |
##       |        (This is the sequence number from IPSec header)        |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x1                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##
##
## AAD:
##       AAD padded to 128 bits with 0
##       for example, assume AAD is a u32 vector
##
##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1};
##       padded AAD in xmm register = {A1 A0 0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                            SPI (A1)                           |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                  32-bit Sequence Number (A0)                  |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##       AAD Format with 32-bit Sequence Number
##
##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2};
##       padded AAD in xmm register = {A2 A1 A0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                            SPI (A2)                           |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |              64-bit Extended Sequence Number {A1,A0}          |
##       |                                                               |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##       AAD Format with 64-bit Extended Sequence Number
##
##
## aadLen:
##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
##       The code additionally supports aadLen of length 16 bytes.
##
## TLen:
##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
##
## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one tab and two tab indentations are used. one tab is
## for GHASH part, two tabs is for AES part.
##
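
# For illustration, a minimal Python sketch (not part of this file or the
# build) of the AAD padding described above; pad_aad_to_128 is a hypothetical
# helper and the SPI/sequence-number values are made-up examples:
/*
    # AAD arrives big-endian on the wire: SPI (A1), then sequence number (A0).
    def pad_aad_to_128(aad: bytes) -> bytes:
        assert len(aad) in (8, 12, 16)          # lengths this code accepts
        return aad + b"\x00" * (16 - len(aad))

    aad = bytes.fromhex("00112233" "44556677")  # {A1, A0}
    print(pad_aad_to_128(aad).hex())            # {A1 A0 0 0}: ...0000000000000000
*/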
#include <linux/linkage.h>
#include <asm/inst.h>
# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:            .octa     0xC2000000000000000000000000000001
.section .rodata.cst16.POLY2, "aM", @progbits, 16
.align 16
POLY2:           .octa     0xC20000000000000000000001C2000000
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE:          .octa     0x00000001000000000000000000000001
.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:             .octa     0x00000000000000000000000000000001
.section .rodata.cst16.ONEf, "aM", @progbits, 16
.align 16
ONEf:            .octa     0x01000000000000000000000000000000
# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
.section .rodata, "a", @progbits
.align 16
SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
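
# Why the ordering above matters: a Python sketch (illustrative only, not part
# of the build, assuming partial-block handling loads 16 bytes at some offset
# into this contiguous SHIFT_MASK/ALL_F/zero region to build byte masks):
/*
    SHIFT_MASK = bytes(range(16))          # in-memory bytes of the .octa above
    ALL_F      = b"\xff" * 16
    ZERO       = b"\x00" * 16
    region = SHIFT_MASK + ALL_F + ZERO     # must stay contiguous, in this order

    def window(offset: int) -> bytes:      # a 16-byte load at SHIFT_MASK+offset
        return region[offset:offset + 16]

    print(window(0).hex())                 # the plain shift mask
    print(window(13).hex())                # 0d0e0f then ff..: mask for 3 valid bytes
*/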
.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
.octa 0xffffffffffffffffffffffffffffffff
.octa 0xffffffffffffffffffffffffffffff0C
.octa 0xffffffffffffffffffffffffffff0D0C
.octa 0xffffffffffffffffffffffffff0E0D0C
.octa 0xffffffffffffffffffffffff0F0E0D0C
.octa 0xffffffffffffffffffffff0C0B0A0908
.octa 0xffffffffffffffffffff0D0C0B0A0908
.octa 0xffffffffffffffffff0E0D0C0B0A0908
.octa 0xffffffffffffffff0F0E0D0C0B0A0908
.octa 0xffffffffffffff0C0B0A090807060504
.octa 0xffffffffffff0D0C0B0A090807060504
.octa 0xffffffffff0E0D0C0B0A090807060504
.octa 0xffffffff0F0E0D0C0B0A090807060504
.octa 0xffffff0C0B0A09080706050403020100
.octa 0xffff0D0C0B0A09080706050403020100
.octa 0xff0E0D0C0B0A09080706050403020100
.octa 0x0F0E0D0C0B0A09080706050403020100
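
# How this table is used: vpshufb builds each output byte from the source byte
# its mask selects, or zero when the mask byte's top bit is set. A Python model
# (illustrative only; remember .octa stores its low byte first in memory):
/*
    def pshufb(src: bytes, mask: bytes) -> bytes:
        return bytes(0 if m & 0x80 else src[m & 0x0F] for m in mask)

    # row for 1 leftover AAD byte (the table is indexed by 16*aadLen; see the
    # salq $4 in the AAD tail handling below)
    row = bytes.fromhex("ffffffffffffffffffffffffffffff0c")[::-1]
    src = bytes(range(16))
    print(pshufb(src, row).hex())          # 0c0000..: byte 12 moved to lane 0
*/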
.text
## define the fields of the gcm aes context
#{
#        u8 expanded_keys[16*11] store expanded keys
#        u8 shifted_hkey_1[16]   store HashKey <<1 mod poly here
#        u8 shifted_hkey_2[16]   store HashKey^2 <<1 mod poly here
#        u8 shifted_hkey_3[16]   store HashKey^3 <<1 mod poly here
#        u8 shifted_hkey_4[16]   store HashKey^4 <<1 mod poly here
#        u8 shifted_hkey_5[16]   store HashKey^5 <<1 mod poly here
#        u8 shifted_hkey_6[16]   store HashKey^6 <<1 mod poly here
#        u8 shifted_hkey_7[16]   store HashKey^7 <<1 mod poly here
#        u8 shifted_hkey_8[16]   store HashKey^8 <<1 mod poly here
#        u8 shifted_hkey_1_k[16] store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_2_k[16] store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_3_k[16] store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_4_k[16] store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_5_k[16] store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_6_k[16] store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_7_k[16] store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
#        u8 shifted_hkey_8_k[16] store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#} gcm_ctx
HashKey        = 16*11   # store HashKey <<1 mod poly here
HashKey_2      = 16*12   # store HashKey^2 <<1 mod poly here
HashKey_3      = 16*13   # store HashKey^3 <<1 mod poly here
HashKey_4      = 16*14   # store HashKey^4 <<1 mod poly here
HashKey_5      = 16*15   # store HashKey^5 <<1 mod poly here
HashKey_6      = 16*16   # store HashKey^6 <<1 mod poly here
HashKey_7      = 16*17   # store HashKey^7 <<1 mod poly here
HashKey_8      = 16*18   # store HashKey^8 <<1 mod poly here
HashKey_k      = 16*19   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k    = 16*20   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k    = 16*21   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k    = 16*22   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k    = 16*23   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k    = 16*24   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k    = 16*25   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k    = 16*26   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
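
# Offset sanity check for the layout above, as a Python sketch (illustrative
# only): 11 AES-128 round keys, then HashKey^1..^8, then their Karatsuba XOR
# halves.
/*
    HashKey, HashKey_8, HashKey_k, HashKey_8_k = 16*11, 16*18, 16*19, 16*26
    assert HashKey_8 - HashKey == 7 * 16       # eight contiguous hash keys
    assert HashKey_8_k - HashKey_k == 7 * 16   # eight contiguous _k entries
    print("gcm_ctx size:", 16*27, "bytes")     # 432
*/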
#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)
i = 0
j = 0
out_order = 0
in_order = 1
DEC = 0
ENC = 1
.macro define_reg r n
reg_\r = %xmm\n
.endm
.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm
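# e.g. with i = 4, setreg re-evaluates %i under .altmacro so that reg_i
# expands to %xmm4; the .rep loops below bump i/j and re-run setreg to
# rotate through the XMM registers.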
# need to push 4 registers into stack to maintain
STACK_OFFSET = 8*4

TMP1 =   16*0    # Temporary storage for AAD
TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 =   16*2    # Temporary storage for AES State 3
TMP4 =   16*3    # Temporary storage for AES State 4
TMP5 =   16*4    # Temporary storage for AES State 5
TMP6 =   16*5    # Temporary storage for AES State 6
TMP7 =   16*6    # Temporary storage for AES State 7
TMP8 =   16*7    # Temporary storage for AES State 8

VARIABLE_OFFSET = 16*8
################################
# Utility Macros
################################

# Encryption of a single block
.macro ENCRYPT_SINGLE_BLOCK XMM0
        vpxor    (arg1), \XMM0, \XMM0
        i = 1
        setreg
.rep 9
        vaesenc  16*i(arg1), \XMM0, \XMM0
        i = (i+1)
        setreg
.endr
        vaesenclast 16*10(arg1), \XMM0, \XMM0
.endm
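
# The macro above is the standard AES-128 flow: whiten with round key 0, nine
# middle rounds, one final round without MixColumns. A Python shape sketch
# (illustrative only; aes_round/aes_final_round stand in for the hardware
# aesenc/aesenclast operations and are assumptions, not real library calls):
/*
    def encrypt_single_block(block, round_keys, aes_round, aes_final_round):
        state = block ^ round_keys[0]                 # vpxor (arg1), XMM0, XMM0
        for i in range(1, 10):                        # .rep 9 of vaesenc
            state = aes_round(state, round_keys[i])
        return aes_final_round(state, round_keys[10]) # vaesenclast
*/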
#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5

        vpshufd         $0b01001110, \GH, \T2
        vpshufd         $0b01001110, \HK, \T3
        vpxor           \GH, \T2, \T2           # T2 = (a1+a0)
        vpxor           \HK, \T3, \T3           # T3 = (b1+b0)

        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
        vpxor           \GH, \T2, \T2
        vpxor           \T1, \T2, \T2           # T2 = a0*b1+a1*b0

        vpslldq         $8, \T2, \T3            # shift-L T3 2 DWs
        vpsrldq         $8, \T2, \T2            # shift-R T2 2 DWs
        vpxor           \T3, \GH, \GH
        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK

        #first phase of the reduction
        vpslld  $31, \GH, \T2                   # packed right shifting << 31
        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25

        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete

        #second phase of the reduction
        vpsrld  $1, \GH, \T2                    # packed left shifting >> 1
        vpsrld  $2, \GH, \T3                    # packed left shifting >> 2
        vpsrld  $7, \GH, \T4                    # packed left shifting >> 7
        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2
        vpxor   \T5, \T2, \T2
        vpxor   \T2, \GH, \GH
        vpxor   \T1, \GH, \GH                   # the result is in GH

.endm
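
# A Python reference model (illustrative only, not part of the build) of the
# multiplication GHASH_MUL_AVX implements: carry-less multiply in GF(2^128)
# reduced by poly = x^128 + x^127 + x^126 + x^121 + 1, the bit-reflected view
# used throughout this file:
/*
    POLY = (1 << 128) | (1 << 127) | (1 << 126) | (1 << 121) | 1

    def gf128_mul(a: int, b: int) -> int:
        p = 0
        for i in range(128):                 # carry-less schoolbook multiply
            if (b >> i) & 1:
                p ^= a << i
        for i in range(254, 127, -1):        # reduce the 255-bit product
            if (p >> i) & 1:
                p ^= POLY << (i - 128)
        return p

    assert gf128_mul(1, 5) == 5              # 1 is the identity
    assert gf128_mul(3, 7) == gf128_mul(7, 3)
*/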
.macro  PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6

        # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
        vmovdqa  \HK, \T5

        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
        vmovdqa  \T5, HashKey_2(arg1)                    #  [HashKey_2] = HashKey^2<<1 mod poly
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_2_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
        vmovdqa  \T5, HashKey_3(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_3_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
        vmovdqa  \T5, HashKey_4(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_4_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
        vmovdqa  \T5, HashKey_5(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_5_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
        vmovdqa  \T5, HashKey_6(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_6_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
        vmovdqa  \T5, HashKey_7(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_7_k(arg1)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
        vmovdqa  \T5, HashKey_8(arg1)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqa  \T1, HashKey_8_k(arg1)

.endm
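
# Why PRECOMPUTE_AVX stores the XOR of each HashKey^i's halves: the GHASH
# macros use Karatsuba, rebuilding the middle term of a carry-less multiply
# from three half-width products. A Python check (illustrative only):
/*
    def clmul(a: int, b: int) -> int:        # carry-less multiply
        p = 0
        while b:
            if b & 1:
                p ^= a
            a <<= 1
            b >>= 1
        return p

    def karatsuba128(a: int, b: int) -> int:
        a1, a0 = a >> 64, a & (2**64 - 1)
        b1, b0 = b >> 64, b & (2**64 - 1)
        hi, lo = clmul(a1, b1), clmul(a0, b0)
        mid = clmul(a1 ^ a0, b1 ^ b0) ^ hi ^ lo  # fed by the precomputed XORs
        return (hi << 128) ^ (mid << 64) ^ lo

    a, b = 0x0123456789abcdef0011223344556677, 0xfedcba9876543210
    assert karatsuba128(a, b) == clmul(a, b)
*/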
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 4 (see the sketch below)
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as a pointer only, not modified
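
# Partition arithmetic from the comment above, as a Python sketch (illustrative
# only; the byte count is an arbitrary example):
/*
    a = 250                          # total plaintext bytes
    b = a // 16                      # full 16-byte blocks
    num_initial_blocks = b % 4
    print(b, num_initial_blocks)     # 15 full blocks, 3 handled here first
*/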
.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
	i = (8-\num_initial_blocks)
j = 0
setreg
	mov	arg6, %r10			# r10 = AAD
	mov	arg7, %r12			# r12 = aadLen

	mov	%r12, %r11

	vpxor	reg_j, reg_j, reg_j
	vpxor	reg_i, reg_i, reg_i
	cmp	$16, %r11
	jl	_get_AAD_rest8\@
_get_AAD_blocks\@:
	vmovdqu	(%r10), reg_i
	vpshufb	SHUF_MASK(%rip), reg_i, reg_i
	vpxor	reg_i, reg_j, reg_j
	GHASH_MUL_AVX	reg_j, \T2, \T1, \T3, \T4, \T5, \T6
	add	$16, %r10
	sub	$16, %r12
	sub	$16, %r11
	cmp	$16, %r11
	jge	_get_AAD_blocks\@
	vmovdqu	reg_j, reg_i
	cmp	$0, %r11
	je	_get_AAD_done\@

	vpxor	reg_i, reg_i, reg_i

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\@:
	cmp	$4, %r11
	jle	_get_AAD_rest4\@
	movq	(%r10), \T1
	add	$8, %r10
	sub	$8, %r11
	vpslldq	$8, \T1, \T1
	vpsrldq	$8, reg_i, reg_i
	vpxor	\T1, reg_i, reg_i
	jmp	_get_AAD_rest8\@
_get_AAD_rest4\@:
	cmp	$0, %r11
	jle	_get_AAD_rest0\@
	mov	(%r10), %eax
	movq	%rax, \T1
	add	$4, %r10
	sub	$4, %r11
	vpslldq	$12, \T1, \T1
	vpsrldq	$4, reg_i, reg_i
	vpxor	\T1, reg_i, reg_i
_get_AAD_rest0\@:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq	%r12, %r11
	salq	$4, %r11
	movdqu	aad_shift_arr(%r11), \T1
	vpshufb	\T1, reg_i, reg_i
_get_AAD_rest_final\@:
	vpshufb	SHUF_MASK(%rip), reg_i, reg_i
	vpxor	reg_j, reg_i, reg_i
	GHASH_MUL_AVX	reg_i, \T2, \T1, \T3, \T4, \T5, \T6

_get_AAD_done\@:
	# initialize the data pointer offset as zero
	xor	%r11d, %r11d
	# start AES for num_initial_blocks blocks
	mov	arg5, %rax			# rax = *Y0
	vmovdqu	(%rax), \CTR			# CTR = Y0
	vpshufb	SHUF_MASK(%rip), \CTR, \CTR

	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
		vpaddd	ONE(%rip), \CTR, \CTR		# INCR Y0
		vmovdqa	\CTR, reg_i
		vpshufb	SHUF_MASK(%rip), reg_i, reg_i	# perform a 16Byte swap
	i = (i+1)
	setreg
.endr

	vmovdqa	(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
		vpxor	\T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	j = 1
	setreg
.rep 9
	vmovdqa	16*j(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
		vaesenc	\T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	j = (j+1)
	setreg
.endr

	vmovdqa	16*10(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
		vaesenclast	\T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
		vmovdqu	(arg3, %r11), \T1
		vpxor	\T1, reg_i, reg_i
		vmovdqu	reg_i, (arg2, %r11)		# write back ciphertext for num_initial_blocks blocks
		add	$16, %r11
.if  \ENC_DEC == DEC
		vmovdqa	\T1, reg_i
.endif
		vpshufb	SHUF_MASK(%rip), reg_i, reg_i	# prepare ciphertext for GHASH computations
	i = (i+1)
	setreg
.endr

	i = (8-\num_initial_blocks)
	j = (9-\num_initial_blocks)
	setreg

.rep \num_initial_blocks
		vpxor	reg_i, reg_j, reg_j
		GHASH_MUL_AVX	reg_j, \T2, \T1, \T3, \T4, \T5, \T6	# apply GHASH on num_initial_blocks blocks
	i = (i+1)
	j = (j+1)
	setreg
.endr
	# XMM8 has the combined result here

	vmovdqa	\XMM8, TMP1(%rsp)
	vmovdqa	\XMM8, \T3

	cmp	$128, %r13
	jl	_initial_blocks_done\@		# no need for precomputed constants
###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
		vpaddd	ONE(%rip), \CTR, \CTR		# INCR Y0
		vmovdqa	\CTR, \XMM1
		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap

		vpaddd	ONE(%rip), \CTR, \CTR		# INCR Y0
		vmovdqa	\CTR, \XMM2
		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap

		vpaddd	ONE(%rip), \CTR, \CTR		# INCR Y0
		vmovdqa	\CTR, \XMM3
		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap

		vpaddd	ONE(%rip), \CTR, \CTR		# INCR Y0
		vmovdqa	\CTR, \XMM4
		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap

		vpaddd	ONE(%rip), \CTR, \CTR		# INCR Y0
		vmovdqa	\CTR, \XMM5
		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap

		vpaddd	ONE(%rip), \CTR, \CTR		# INCR Y0
		vmovdqa	\CTR, \XMM6
		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap

		vpaddd	ONE(%rip), \CTR, \CTR		# INCR Y0
		vmovdqa	\CTR, \XMM7
		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap

		vpaddd	ONE(%rip), \CTR, \CTR		# INCR Y0
		vmovdqa	\CTR, \XMM8
		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap

		vmovdqa	(arg1), \T_key
		vpxor	\T_key, \XMM1, \XMM1
		vpxor	\T_key, \XMM2, \XMM2
		vpxor	\T_key, \XMM3, \XMM3
		vpxor	\T_key, \XMM4, \XMM4
		vpxor	\T_key, \XMM5, \XMM5
		vpxor	\T_key, \XMM6, \XMM6
		vpxor	\T_key, \XMM7, \XMM7
		vpxor	\T_key, \XMM8, \XMM8

		i = 1
		setreg
.rep 9	# do 9 rounds
		vmovdqa	16*i(arg1), \T_key
		vaesenc	\T_key, \XMM1, \XMM1
		vaesenc	\T_key, \XMM2, \XMM2
		vaesenc	\T_key, \XMM3, \XMM3
		vaesenc	\T_key, \XMM4, \XMM4
		vaesenc	\T_key, \XMM5, \XMM5
		vaesenc	\T_key, \XMM6, \XMM6
		vaesenc	\T_key, \XMM7, \XMM7
		vaesenc	\T_key, \XMM8, \XMM8
		i = (i+1)
		setreg
.endr

		vmovdqa	16*i(arg1), \T_key
		vaesenclast	\T_key, \XMM1, \XMM1
		vaesenclast	\T_key, \XMM2, \XMM2
		vaesenclast	\T_key, \XMM3, \XMM3
		vaesenclast	\T_key, \XMM4, \XMM4
		vaesenclast	\T_key, \XMM5, \XMM5
		vaesenclast	\T_key, \XMM6, \XMM6
		vaesenclast	\T_key, \XMM7, \XMM7
		vaesenclast	\T_key, \XMM8, \XMM8

		vmovdqu	(arg3, %r11), \T1
		vpxor	\T1, \XMM1, \XMM1
		vmovdqu	\XMM1, (arg2, %r11)
		.if   \ENC_DEC == DEC
		vmovdqa	\T1, \XMM1
		.endif

		vmovdqu	16*1(arg3, %r11), \T1
		vpxor	\T1, \XMM2, \XMM2
		vmovdqu	\XMM2, 16*1(arg2, %r11)
		.if   \ENC_DEC == DEC
		vmovdqa	\T1, \XMM2
		.endif

		vmovdqu	16*2(arg3, %r11), \T1
		vpxor	\T1, \XMM3, \XMM3
		vmovdqu	\XMM3, 16*2(arg2, %r11)
		.if   \ENC_DEC == DEC
		vmovdqa	\T1, \XMM3
		.endif

		vmovdqu	16*3(arg3, %r11), \T1
		vpxor	\T1, \XMM4, \XMM4
		vmovdqu	\XMM4, 16*3(arg2, %r11)
		.if   \ENC_DEC == DEC
		vmovdqa	\T1, \XMM4
		.endif

		vmovdqu	16*4(arg3, %r11), \T1
		vpxor	\T1, \XMM5, \XMM5
		vmovdqu	\XMM5, 16*4(arg2, %r11)
		.if   \ENC_DEC == DEC
		vmovdqa	\T1, \XMM5
		.endif

		vmovdqu	16*5(arg3, %r11), \T1
		vpxor	\T1, \XMM6, \XMM6
		vmovdqu	\XMM6, 16*5(arg2, %r11)
		.if   \ENC_DEC == DEC
		vmovdqa	\T1, \XMM6
		.endif

		vmovdqu	16*6(arg3, %r11), \T1
		vpxor	\T1, \XMM7, \XMM7
		vmovdqu	\XMM7, 16*6(arg2, %r11)
		.if   \ENC_DEC == DEC
		vmovdqa	\T1, \XMM7
		.endif

		vmovdqu	16*7(arg3, %r11), \T1
		vpxor	\T1, \XMM8, \XMM8
		vmovdqu	\XMM8, 16*7(arg2, %r11)
		.if   \ENC_DEC == DEC
		vmovdqa	\T1, \XMM8
		.endif

		add	$128, %r11

		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
		vpxor	TMP1(%rsp), \XMM1, \XMM1	# combine GHASHed value with the corresponding ciphertext
		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:
.endm
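
# CTR bookkeeping used above, as a Python sketch (illustrative only, ignoring
# dword-lane wraparound, which vpaddd confines to the low lane): Y0 is
# big-endian in memory; after the SHUF_MASK byte swap its counter dword is
# native little-endian, so adding ONE increments it directly.
/*
    def swap16(b: bytes) -> bytes:            # what vpshufb SHUF_MASK does
        return b[::-1]

    y0  = bytes(12) + (1).to_bytes(4, "big")  # salt|IV|0x1 layout, counter = 1
    ctr = int.from_bytes(swap16(y0), "little")
    ctr += 1                                  # vpaddd ONE(%rip), \CTR, \CTR
    print(swap16(ctr.to_bytes(16, "little")).hex())   # ends ...00000002
*/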
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg2, arg3 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
                vpaddd  ONE(%rip), \CTR, \XMM1          # INCR CNT
                vpaddd  ONE(%rip), \XMM1, \XMM2
                vpaddd  ONE(%rip), \XMM2, \XMM3
                vpaddd  ONE(%rip), \XMM3, \XMM4
                vpaddd  ONE(%rip), \XMM4, \XMM5
                vpaddd  ONE(%rip), \XMM5, \XMM6
                vpaddd  ONE(%rip), \XMM6, \XMM7
                vpaddd  ONE(%rip), \XMM7, \XMM8
                vmovdqa \XMM8, \CTR

                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
.else
                vpaddd  ONEf(%rip), \CTR, \XMM1         # INCR CNT
                vpaddd  ONEf(%rip), \XMM1, \XMM2
                vpaddd  ONEf(%rip), \XMM2, \XMM3
                vpaddd  ONEf(%rip), \XMM3, \XMM4
                vpaddd  ONEf(%rip), \XMM4, \XMM5
                vpaddd  ONEf(%rip), \XMM5, \XMM6
                vpaddd  ONEf(%rip), \XMM6, \XMM7
                vpaddd  ONEf(%rip), \XMM7, \XMM8
                vmovdqa \XMM8, \CTR
.endif
        #######################################################################

        vmovdqu (arg1), \T1
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa HashKey_8(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0

        vpshufd $0b01001110, \T2, \T6
        vpxor   \T2, \T6, \T6

        vmovdqa HashKey_8_k(arg1), \T5
        vpclmulqdq      $0x00, \T5, \T6, \T6

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        vmovdqa TMP2(%rsp), \T1
        vmovdqa HashKey_7(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqa HashKey_7_k(arg1), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa TMP3(%rsp), \T1
        vmovdqa HashKey_6(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqa HashKey_6_k(arg1), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqa HashKey_5(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqa HashKey_5_k(arg1), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP5(%rsp), \T1
        vmovdqa HashKey_4(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqa HashKey_4_k(arg1), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8
        vmovdqa TMP6(%rsp), \T1
        vmovdqa HashKey_3(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqa HashKey_3_k(arg1), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP7(%rsp), \T1
        vmovdqa HashKey_2(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqa HashKey_2_k(arg1), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        #######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa TMP8(%rsp), \T1
        vmovdqa HashKey(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqa HashKey_k(arg1), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vpxor   \T4, \T6, \T6
        vpxor   \T7, \T6, \T6

        vmovdqu 16*10(arg1), \T5
i = 0
j = 1
setreg
.rep 8
vpxor 1 6 * i ( a r g 3 , % r11 ) , \ T 5 , \ T 2
.if \ ENC_ D E C = = E N C
vaesenclast \ T 2 , r e g _ j , r e g _ j
.else
vaesenclast \ T 2 , r e g _ j , \ T 3
vmovdqu 1 6 * i ( a r g 3 , % r11 ) , r e g _ j
vmovdqu \ T 3 , 1 6 * i ( a r g 2 , % r11 )
.endif
i = ( i + 1 )
j = ( j + 1 )
setreg
.endr
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
vpslldq $ 8 , \ T 6 , \ T 3 # s h i f t - L T 3 2 D W s
vpsrldq $ 8 , \ T 6 , \ T 6 # s h i f t - R T 2 2 D W s
vpxor \ T 3 , \ T 7 , \ T 7
vpxor \ T 4 , \ T 6 , \ T 6 # a c c u m u l a t e t h e r e s u l t s i n T 6 : T 7
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# first p h a s e o f t h e r e d u c t i o n
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
vpslld $ 3 1 , \ T 7 , \ T 2 # p a c k e d r i g h t s h i f t i n g < < 31
vpslld $ 3 0 , \ T 7 , \ T 3 # p a c k e d r i g h t s h i f t i n g s h i f t < < 30
vpslld $ 2 5 , \ T 7 , \ T 4 # p a c k e d r i g h t s h i f t i n g s h i f t < < 25
vpxor \ T 3 , \ T 2 , \ T 2 # x o r t h e s h i f t e d v e r s i o n s
vpxor \ T 4 , \ T 2 , \ T 2
vpsrldq $ 4 , \ T 2 , \ T 1 # s h i f t - R T 1 1 D W
vpslldq $ 1 2 , \ T 2 , \ T 2 # s h i f t - L T 2 3 D W s
vpxor \ T 2 , \ T 7 , \ T 7 # f i r s t p h a s e o f t h e r e d u c t i o n c o m p l e t e
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
.if \ ENC_ D E C = = E N C
vmovdqu \ X M M 1 , 1 6 * 0 ( a r g 2 ,% r11 ) # W r i t e t o t h e C i p h e r t e x t b u f f e r
vmovdqu \ X M M 2 , 1 6 * 1 ( a r g 2 ,% r11 ) # W r i t e t o t h e C i p h e r t e x t b u f f e r
vmovdqu \ X M M 3 , 1 6 * 2 ( a r g 2 ,% r11 ) # W r i t e t o t h e C i p h e r t e x t b u f f e r
vmovdqu \ X M M 4 , 1 6 * 3 ( a r g 2 ,% r11 ) # W r i t e t o t h e C i p h e r t e x t b u f f e r
vmovdqu \ X M M 5 , 1 6 * 4 ( a r g 2 ,% r11 ) # W r i t e t o t h e C i p h e r t e x t b u f f e r
vmovdqu \ X M M 6 , 1 6 * 5 ( a r g 2 ,% r11 ) # W r i t e t o t h e C i p h e r t e x t b u f f e r
vmovdqu \ X M M 7 , 1 6 * 6 ( a r g 2 ,% r11 ) # W r i t e t o t h e C i p h e r t e x t b u f f e r
vmovdqu \ X M M 8 , 1 6 * 7 ( a r g 2 ,% r11 ) # W r i t e t o t h e C i p h e r t e x t b u f f e r
.endif
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# second p h a s e o f t h e r e d u c t i o n
vpsrld $ 1 , \ T 7 , \ T 2 # p a c k e d l e f t s h i f t i n g > > 1
vpsrld $ 2 , \ T 7 , \ T 3 # p a c k e d l e f t s h i f t i n g > > 2
vpsrld $ 7 , \ T 7 , \ T 4 # p a c k e d l e f t s h i f t i n g > > 7
vpxor \ T 3 , \ T 2 , \ T 2 # x o r t h e s h i f t e d v e r s i o n s
vpxor \ T 4 , \ T 2 , \ T 2
vpxor \ T 1 , \ T 2 , \ T 2
vpxor \ T 2 , \ T 7 , \ T 7
vpxor \ T 7 , \ T 6 , \ T 6 # t h e r e s u l t i s i n T 6
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
vpshufb S H U F _ M A S K ( % r i p ) , \ X M M 1 , \ X M M 1 # p e r f o r m a 16 B y t e s w a p
vpshufb S H U F _ M A S K ( % r i p ) , \ X M M 2 , \ X M M 2 # p e r f o r m a 16 B y t e s w a p
vpshufb S H U F _ M A S K ( % r i p ) , \ X M M 3 , \ X M M 3 # p e r f o r m a 16 B y t e s w a p
vpshufb S H U F _ M A S K ( % r i p ) , \ X M M 4 , \ X M M 4 # p e r f o r m a 16 B y t e s w a p
vpshufb S H U F _ M A S K ( % r i p ) , \ X M M 5 , \ X M M 5 # p e r f o r m a 16 B y t e s w a p
vpshufb S H U F _ M A S K ( % r i p ) , \ X M M 6 , \ X M M 6 # p e r f o r m a 16 B y t e s w a p
vpshufb S H U F _ M A S K ( % r i p ) , \ X M M 7 , \ X M M 7 # p e r f o r m a 16 B y t e s w a p
vpshufb S H U F _ M A S K ( % r i p ) , \ X M M 8 , \ X M M 8 # p e r f o r m a 16 B y t e s w a p
vpxor \ T 6 , \ X M M 1 , \ X M M 1
.endm
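# Editorial note on the reduction above (a sketch, not part of the original
# flow): GHASH works in GF(2^128) with g(x) = x^128 + x^7 + x^2 + x + 1.
# Since operands are kept bit-reflected, reducing the 256-bit product
# <T6:T7> means folding the low 128 bits back into the high ones:
# x^128 mod g(x) = x^7 + x^2 + x + 1, which in the reflected representation
# becomes the shift-by-31/30/25 XOR ladder (first phase) followed by the
# shift-by-1/2/7 ladder (second phase) seen in the macro body.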
# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method
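        # Editorial note: Karatsuba builds each 128-bit carry-less product
        # from three vpclmulqdq instead of four. With A = a1*x^64 + a0 and
        # B = b1*x^64 + b0 (XOR is addition in GF(2)):
        #   A*B = a1b1*x^128 + ((a1^a0)*(b1^b0) ^ a1b1 ^ a0b0)*x^64 + a0b0
        # The precomputed HashKey_i_k values already hold (b1^b0) for each
        # hash-key power, so only (a1^a0) is formed here via vpshufd/vpxor.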
        vpshufd $0b01001110, \XMM1, \T2
        vpxor   \XMM1, \T2, \T2
        vmovdqa HashKey_8(arg1), \T5
        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vmovdqa HashKey_8_k(arg1), \T3
        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vpshufd $0b01001110, \XMM2, \T2
        vpxor   \XMM2, \T2, \T2
        vmovdqa HashKey_7(arg1), \T5
        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor   \T4, \T7, \T7

        vmovdqa HashKey_7_k(arg1), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM3, \T2
        vpxor   \XMM3, \T2, \T2
        vmovdqa HashKey_6(arg1), \T5
        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor   \T4, \T7, \T7

        vmovdqa HashKey_6_k(arg1), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM4, \T2
        vpxor   \XMM4, \T2, \T2
        vmovdqa HashKey_5(arg1), \T5
        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor   \T4, \T7, \T7

        vmovdqa HashKey_5_k(arg1), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM5, \T2
        vpxor   \XMM5, \T2, \T2
        vmovdqa HashKey_4(arg1), \T5
        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor   \T4, \T7, \T7

        vmovdqa HashKey_4_k(arg1), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM6, \T2
        vpxor   \XMM6, \T2, \T2
        vmovdqa HashKey_3(arg1), \T5
        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor   \T4, \T7, \T7

        vmovdqa HashKey_3_k(arg1), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM7, \T2
        vpxor   \XMM7, \T2, \T2
        vmovdqa HashKey_2(arg1), \T5
        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor   \T4, \T7, \T7

        vmovdqa HashKey_2_k(arg1), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM8, \T2
        vpxor   \XMM8, \T2, \T2
        vmovdqa HashKey(arg1), \T5
        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor   \T4, \T7, \T7

        vmovdqa HashKey_k(arg1), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        vpxor   \T6, \XMM1, \XMM1
        vpxor   \T7, \XMM1, \T2

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6           # <T6:T7> holds the result of
                                        # the accumulated carry-less multiplications
#######################################################################
        # first phase of the reduction
        vpslld  $31, \T7, \T2           # packed left shifting << 31
        vpslld  $30, \T7, \T3           # packed left shifting << 30
        vpslld  $25, \T7, \T4           # packed left shifting << 25

        vpxor   \T3, \T2, \T2           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1            # shift-R T1 1 DW
        vpslldq $12, \T2, \T2           # shift-L T2 3 DWs
        vpxor   \T2, \T7, \T7           # first phase of the reduction complete
#######################################################################
        # second phase of the reduction
        vpsrld  $1, \T7, \T2            # packed right shifting >> 1
        vpsrld  $2, \T7, \T3            # packed right shifting >> 2
        vpsrld  $7, \T7, \T4            # packed right shifting >> 7
        vpxor   \T3, \T2, \T2           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6           # the result is in T6
.endm
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
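# Editorial summary of the flow below: the AAD is hashed inside the
# INITIAL_BLOCKS macro, 0-7 initial blocks are encrypted/GHASHed so the
# remaining length is a multiple of 128 bytes, the 8-blocks-at-a-time main
# loop runs, the deferred GHASH of the final 8 blocks is finished, a <16-byte
# tail is handled separately, and finally len(A)||len(C) is folded into the
# hash and XORed with E(K, Y0) to produce the tag.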
.macro GCM_ENC_DEC_AVX ENC_DEC

        # the number of pushes must equal STACK_OFFSET
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                      # align rsp to 64 bytes

        vmovdqu HashKey(arg1), %xmm13           # xmm13 = HashKey

        mov     arg4, %r13                      # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                      # r13 = r13 - (r13 mod 16)

        mov     %r13, %r12
        shr     $4, %r12
        and     $7, %r12
        jz      _initial_num_blocks_is_0\@

        cmp     $7, %r12
        je      _initial_num_blocks_is_7\@
        cmp     $6, %r12
        je      _initial_num_blocks_is_6\@
        cmp     $5, %r12
        je      _initial_num_blocks_is_5\@
        cmp     $4, %r12
        je      _initial_num_blocks_is_4\@
        cmp     $3, %r12
        je      _initial_num_blocks_is_3\@
        cmp     $2, %r12
        je      _initial_num_blocks_is_2\@

        jmp     _initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
        INITIAL_BLOCKS_AVX      7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*7, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        INITIAL_BLOCKS_AVX      6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*6, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        INITIAL_BLOCKS_AVX      5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*5, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        INITIAL_BLOCKS_AVX      4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*4, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        INITIAL_BLOCKS_AVX      3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*3, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        INITIAL_BLOCKS_AVX      2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*2, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        INITIAL_BLOCKS_AVX      1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*1, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        INITIAL_BLOCKS_AVX      0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC

_initial_blocks_encrypted\@:
        cmp     $0, %r13
        je      _zero_cipher_left\@

        sub     $128, %r13
        je      _eight_cipher_left\@

        vmovd   %xmm9, %r15d
        and     $255, %r15d
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

_encrypt_by_8_new\@:
        cmp     $(255-8), %r15d
        jg      _encrypt_by_8\@

        add     $8, %r15b
        GHASH_8_ENCRYPT_8_PARALLEL_AVX  %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     _eight_cipher_left\@

_encrypt_by_8\@:
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $8, %r15b
        GHASH_8_ENCRYPT_8_PARALLEL_AVX  %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

_eight_cipher_left\@:
        GHASH_LAST_8_AVX        %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8

_zero_cipher_left\@:
        cmp     $16, arg4
        jl      _only_less_than_16\@

        mov     arg4, %r13
        and     $15, %r13                       # r13 = (arg4 mod 16)
        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        vpaddd  ONE(%rip), %xmm9, %xmm9         # INCR CNT to get Yn
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK    %xmm9           # E(K, Yn)

        sub     $16, %r11
        add     %r13, %r11
        vmovdqu (arg3, %r11), %xmm1             # receive the last <16 Byte block

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                      # adjust the shuffle mask pointer to be
                                                # able to shift 16-r13 bytes (r13 is the
                                                # number of bytes in plaintext mod 16)
        vmovdqu (%r12), %xmm2                   # get the appropriate shuffle mask
        vpshufb %xmm2, %xmm1, %xmm1             # shift right 16-r13 bytes
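        # Editorial note (sketch of the trick above): SHIFT_MASK appears to be
        # the identity byte-shuffle 0..15 immediately followed by the all-0xFF
        # ALL_F constant; loading 16 bytes from SHIFT_MASK+16-r13 therefore
        # yields a vpshufb control that shifts the register right by 16-r13
        # bytes and zeroes the rest (0xFF lanes produce 0), giving a
        # variable-distance byte shift without a variable-shift instruction.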
        jmp     _final_ghash_mul\@

_only_less_than_16\@:
        # check for 0 length
        mov     arg4, %r13
        and     $15, %r13                       # r13 = (arg4 mod 16)
        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        vpaddd  ONE(%rip), %xmm9, %xmm9         # INCR CNT to get Yn
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK    %xmm9           # E(K, Yn)

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                      # adjust the shuffle mask pointer to be
                                                # able to shift 16-r13 bytes (r13 is the
                                                # number of bytes in plaintext mod 16)

_get_last_16_byte_loop\@:
        movb    (arg3, %r11), %al
        movb    %al, TMP1(%rsp, %r11)
        add     $1, %r11
        cmp     %r13, %r11
        jne     _get_last_16_byte_loop\@

        vmovdqu TMP1(%rsp), %xmm1
        sub     $16, %r11

_final_ghash_mul\@:
        .if \ENC_DEC == DEC
        vmovdqa %xmm1, %xmm2
        vpxor   %xmm1, %xmm9, %xmm9             # Ciphertext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to
                                                # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9             # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14
        # GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX   %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        sub     %r13, %r11
        add     $16, %r11
        .else
        vpxor   %xmm1, %xmm9, %xmm9             # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to
                                                # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9             # mask out top 16-r13 bytes of xmm9

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14
        # GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX   %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        sub     %r13, %r11
        add     $16, %r11
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9   # shuffle xmm9 back to output as ciphertext
        .endif

        #############################
        # output r13 Bytes
        vmovq   %xmm9, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left\@

        mov     %rax, (arg2, %r11)
        add     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        vmovq   %xmm9, %rax
        sub     $8, %r13

_less_than_8_bytes_left\@:
        movb    %al, (arg2, %r11)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left\@
        #############################

_multiple_of_16_bytes\@:
        mov     arg7, %r12                      # r12 = aadLen (number of bytes)
        shl     $3, %r12                        # convert into number of bits
        vmovd   %r12d, %xmm15                   # len(A) in xmm15

        shl     $3, arg4                        # len(C) in bits (*8)
        vmovq   arg4, %xmm1
        vpslldq $8, %xmm15, %xmm15              # xmm15 = len(A) || 0x0000000000000000
        vpxor   %xmm1, %xmm15, %xmm15           # xmm15 = len(A) || len(C)

        vpxor   %xmm15, %xmm14, %xmm14
        GHASH_MUL_AVX   %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16 Byte swap

        mov     arg5, %rax                      # rax = *Y0
        vmovdqu (%rax), %xmm9                   # xmm9 = Y0

        ENCRYPT_SINGLE_BLOCK    %xmm9           # E(K, Y0)

        vpxor   %xmm14, %xmm9, %xmm9
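        # Editorial note: per the GCM spec this is T = GHASH(H, A, C) XOR
        # E(K, Y0) -- xmm14 holds S, the GHASH over AAD, ciphertext and
        # len(A)||len(C), and xmm9 holds E(K, Y0), so their XOR is the
        # (not yet truncated) authentication tag.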
_return_T\@:
        mov     arg8, %r10              # r10 = authTag
        mov     arg9, %r11              # r11 = auth_tag_len

        cmp     $16, %r11
        je      _T_16\@
        cmp     $8, %r11
        jl      _T_4\@
_T_8\@:
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        cmp     $0, %r11
        je      _return_T_done\@
_T_4\@:
        vmovd   %xmm9, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        vpsrldq $4, %xmm9, %xmm9
        cmp     $0, %r11
        je      _return_T_done\@
_T_123\@:
        vmovd   %xmm9, %eax
        cmp     $2, %r11
        jl      _T_1\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je      _return_T_done\@
        add     $2, %r10
        sar     $16, %eax
_T_1\@:
        mov     %al, (%r10)
        jmp     _return_T_done\@

_T_16\@:
        vmovdqu %xmm9, (%r10)

_return_T_done\@:
        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
.endm
#############################################################
#void   aesni_gcm_precomp_avx_gen2
#        (gcm_data     *my_ctx_data,
#         u8     *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#############################################################
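# Editorial note: this entry point only prepares the per-key GHASH material.
# The caller passes the hash subkey H (E(K, 0^128) in GCM terms); the code
# below doubles it to HashKey<<1 mod poly, the form the bit-reflected
# multiply expects, and PRECOMPUTE_AVX then stores HashKey^2..HashKey^8 plus
# the HashKey_i_k Karatsuba halves used by the 8-block GHASH path.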
ENTRY(aesni_gcm_precomp_avx_gen2)
        # the number of pushes must equal STACK_OFFSET
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp              # align rsp to 64 bytes

        vmovdqu (arg2), %xmm6           # xmm6 = HashKey

        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa %xmm6, %xmm2
        vpsllq  $1, %xmm6, %xmm6
        vpsrlq  $63, %xmm2, %xmm2
        vmovdqa %xmm2, %xmm1
        vpslldq $8, %xmm2, %xmm2
        vpsrldq $8, %xmm1, %xmm1
        vpor    %xmm2, %xmm6, %xmm6
        # reduction
        vpshufd $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand   POLY(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm6, %xmm6     # xmm6 holds the HashKey<<1 mod poly
#######################################################################
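        # Editorial note on the doubling above: shifting the 128-bit H left
        # by one can carry a bit out of the top qword; the vpsrlq/vpslldq
        # pair moves that carry across the qword boundary, and the
        # vpshufd/vpcmpeqd/vpand sequence turns the carried-out bit into an
        # all-ones or all-zeros selector so that POLY (the reflected
        # reduction polynomial constant) is XORed in only when the bit
        # shifted out of the register was 1.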
        vmovdqa %xmm6, HashKey(arg1)    # store HashKey<<1 mod poly

        PRECOMPUTE_AVX  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        ret
ENDPROC(aesni_gcm_precomp_avx_gen2)

###############################################################################
#void   aesni_gcm_enc_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                       Initialisation Vector (from IPSec ESP Payload)
#                       concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
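# Editorial usage sketch (hypothetical C caller, not part of this file):
# after expanding the AES key and computing the hash subkey,
#
#       aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
#       aesni_gcm_enc_avx_gen2(ctx, out, in, len, iv,
#                              aad, aad_len, tag, 16);
#
# with every pointer satisfying the alignment rules stated in the
# prototype comment above.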
ENTRY(aesni_gcm_enc_avx_gen2)
        GCM_ENC_DEC_AVX     ENC
        ret
ENDPROC(aesni_gcm_enc_avx_gen2)

###############################################################################
#void   aesni_gcm_dec_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                       (from Security Association) concatenated with 8 byte
#                       Initialisation Vector (from IPSec ESP Payload)
#                       concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#                               Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen2)
        GCM_ENC_DEC_AVX     DEC
        ret
ENDPROC(aesni_gcm_dec_avx_gen2)
#endif /* CONFIG_AS_AVX */

#ifdef CONFIG_AS_AVX2
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1)
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
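# Editorial note: unlike the AVX (gen2) multiply, this variant reduces the
# 256-bit product with two further carry-less multiplies against the POLY2
# constant (a folded form of the GHASH polynomial) instead of the
# shift-and-XOR ladder: the first phase folds the low 64 bits of the product
# back up, the second folds the next 64, leaving the 128-bit remainder. This
# trades several shifts/XORs for pclmulqdq, a good deal on AVX2-class cores
# where pclmulqdq is fast.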
.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5

        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
        vpclmulqdq      $0x00, \HK, \GH, \T2    # T2 = a0*b0
        vpclmulqdq      $0x01, \HK, \GH, \T3    # T3 = a1*b0
        vpclmulqdq      $0x10, \HK, \GH, \GH    # GH = a0*b1
        vpxor   \T3, \GH, \GH

        vpsrldq $8, \GH, \T3                    # shift-R GH 2 DWs
        vpslldq $8, \GH, \GH                    # shift-L GH 2 DWs

        vpxor   \T3, \T1, \T1
        vpxor   \T2, \GH, \GH

        #######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \GH, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
        #######################################################################
        # second phase of the reduction
        vpclmulqdq      $0x00, \GH, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \GH, \T3, \GH
        vpslldq $4, \GH, \GH                    # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \GH, \GH                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T1, \GH, \GH                   # the result is in GH

.endm
.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
        # unlike the AVX path, only the HashKey powers are stored here;
        # no XORed HashKey_i_k halves are precomputed
        vmovdqa \HK, \T5
        GHASH_MUL_AVX2  \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^2<<1 mod poly
        vmovdqa \T5, HashKey_2(arg1)                            # [HashKey_2] = HashKey^2<<1 mod poly

        GHASH_MUL_AVX2  \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^3<<1 mod poly
        vmovdqa \T5, HashKey_3(arg1)

        GHASH_MUL_AVX2  \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^4<<1 mod poly
        vmovdqa \T5, HashKey_4(arg1)

        GHASH_MUL_AVX2  \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^5<<1 mod poly
        vmovdqa \T5, HashKey_5(arg1)

        GHASH_MUL_AVX2  \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^6<<1 mod poly
        vmovdqa \T5, HashKey_6(arg1)

        GHASH_MUL_AVX2  \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^7<<1 mod poly
        vmovdqa \T5, HashKey_7(arg1)

        GHASH_MUL_AVX2  \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^8<<1 mod poly
        vmovdqa \T5, HashKey_8(arg1)
.endm
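# Editorial note: precomputing HashKey^1..HashKey^8 lets the main loop GHASH
# eight ciphertext blocks with eight independent multiplies,
#   X = C1*H^8 ^ C2*H^7 ^ ... ^ C8*H^1,
# instead of eight serial multiply-reduce steps, which is what makes the
# 8-way interleaving in GHASH_8_ENCRYPT_8_PARALLEL_AVX2 below pay off.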
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as a pointer only, not modified
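## Editorial note: besides counter-mode encrypting the first 0-7 blocks, the
## macro below also absorbs the whole AAD into the hash: full 16-byte AAD
## blocks are GHASHed directly, and the tail is assembled from 8B/4B loads
## and aligned with a shuffle mask before its final multiply.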
.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
        i = (8-\num_initial_blocks)
        j = 0
        setreg

        mov     arg6, %r10              # r10 = AAD
        mov     arg7, %r12              # r12 = aadLen

        mov     %r12, %r11

        vpxor   reg_j, reg_j, reg_j
        vpxor   reg_i, reg_i, reg_i

        cmp     $16, %r11
        jl      _get_AAD_rest8\@
_get_AAD_blocks\@:
        vmovdqu (%r10), reg_i
        vpshufb SHUF_MASK(%rip), reg_i, reg_i
        vpxor   reg_i, reg_j, reg_j
        GHASH_MUL_AVX2  reg_j, \T2, \T1, \T3, \T4, \T5, \T6
        add     $16, %r10
        sub     $16, %r12
        sub     $16, %r11
        cmp     $16, %r11
        jge     _get_AAD_blocks\@
        vmovdqu reg_j, reg_i
        cmp     $0, %r11
        je      _get_AAD_done\@

        vpxor   reg_i, reg_i, reg_i

        /* read the last <16B of AAD. since we have at least 4B of
        data right after the AAD (the ICV, and maybe some CT), we can
        read 4B/8B blocks safely, and then get rid of the extra stuff */
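        # Editorial note: the over-read is bounded -- at most 3 extra bytes
        # from either the 8B or the 4B path -- and every extra byte is
        # discarded by the shuffle in _get_AAD_rest0 below, so it affects
        # neither the hash value nor memory safety under the stated layout
        # assumption (ICV/CT directly following the AAD).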
_get_AAD_rest8\@:
        cmp     $4, %r11
        jle     _get_AAD_rest4\@
        movq    (%r10), \T1
        add     $8, %r10
        sub     $8, %r11
        vpslldq $8, \T1, \T1
        vpsrldq $8, reg_i, reg_i
        vpxor   \T1, reg_i, reg_i
        jmp     _get_AAD_rest8\@
_get_AAD_rest4\@:
        cmp     $0, %r11
        jle     _get_AAD_rest0\@
        mov     (%r10), %eax
        movq    %rax, \T1
        add     $4, %r10
        sub     $4, %r11
        vpslldq $12, \T1, \T1
        vpsrldq $4, reg_i, reg_i
        vpxor   \T1, reg_i, reg_i
_get_AAD_rest0\@:
        /* finalize: shift out the extra bytes we read, and align
        left. since pslldq can only shift by an immediate, we use
        vpshufb and an array of shuffle masks */
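        # Editorial note: aad_shift_arr is indexed by 16*(aadLen mod 16)
        # (r12 holds that residual length here); each 16-byte entry is the
        # vpshufb control that left-aligns the bytes gathered above and
        # zeroes the rest, standing in for a variable-count pslldq.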
        movq    %r12, %r11
        salq    $4, %r11
        movdqu  aad_shift_arr(%r11), \T1
        vpshufb \T1, reg_i, reg_i
_get_AAD_rest_final\@:
        vpshufb SHUF_MASK(%rip), reg_i, reg_i
        vpxor   reg_j, reg_i, reg_i
        GHASH_MUL_AVX2  reg_i, \T2, \T1, \T3, \T4, \T5, \T6

_get_AAD_done\@:
        # initialize the data pointer offset as zero
        xor     %r11d, %r11d

        # start AES for num_initial_blocks blocks
        mov     arg5, %rax              # rax = *Y0
        vmovdqu (%rax), \CTR            # CTR = Y0
        vpshufb SHUF_MASK(%rip), \CTR, \CTR
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, reg_i
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16 Byte swap
        i = (i+1)
        setreg
.endr

        vmovdqa (arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor   \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = 1
        setreg
.rep 9
        vmovdqa 16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = (j+1)
        setreg
.endr

        vmovdqa 16*10(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenclast     \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vmovdqu (arg3, %r11), \T1
        vpxor   \T1, reg_i, reg_i
        vmovdqu reg_i, (arg2, %r11)             # write back ciphertext for
                                                # num_initial_blocks blocks
        add     $16, %r11
.if \ENC_DEC == DEC
        vmovdqa \T1, reg_i
.endif
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # prepare ciphertext for GHASH computations
        i = (i+1)
        setreg
.endr

        i = (8-\num_initial_blocks)
        j = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor   reg_i, reg_j, reg_j
        GHASH_MUL_AVX2  reg_j, \T2, \T1, \T3, \T4, \T5, \T6     # apply GHASH on num_initial_blocks blocks
        i = (i+1)
        j = (j+1)
        setreg
.endr
        # XMM8 has the combined result here

        vmovdqa \XMM8, TMP1(%rsp)
        vmovdqa \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@          # no need for precomputed constants

###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16 Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM2
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16 Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM3
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16 Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM4
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16 Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM5
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16 Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM6
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16 Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM7
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16 Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM8
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16 Byte swap

        vmovdqa (arg1), \T_key
        vpxor   \T_key, \XMM1, \XMM1
        vpxor   \T_key, \XMM2, \XMM2
        vpxor   \T_key, \XMM3, \XMM3
        vpxor   \T_key, \XMM4, \XMM4
        vpxor   \T_key, \XMM5, \XMM5
        vpxor   \T_key, \XMM6, \XMM6
        vpxor   \T_key, \XMM7, \XMM7
        vpxor   \T_key, \XMM8, \XMM8

        i = 1
        setreg
.rep 9          # do 9 rounds
        vmovdqa 16*i(arg1), \T_key
        vaesenc \T_key, \XMM1, \XMM1
        vaesenc \T_key, \XMM2, \XMM2
        vaesenc \T_key, \XMM3, \XMM3
        vaesenc \T_key, \XMM4, \XMM4
        vaesenc \T_key, \XMM5, \XMM5
        vaesenc \T_key, \XMM6, \XMM6
        vaesenc \T_key, \XMM7, \XMM7
        vaesenc \T_key, \XMM8, \XMM8
        i = (i+1)
        setreg
.endr

        vmovdqa 16*i(arg1), \T_key
        vaesenclast     \T_key, \XMM1, \XMM1
        vaesenclast     \T_key, \XMM2, \XMM2
        vaesenclast     \T_key, \XMM3, \XMM3
        vaesenclast     \T_key, \XMM4, \XMM4
        vaesenclast     \T_key, \XMM5, \XMM5
        vaesenclast     \T_key, \XMM6, \XMM6
        vaesenclast     \T_key, \XMM7, \XMM7
        vaesenclast     \T_key, \XMM8, \XMM8

        vmovdqu (arg3, %r11), \T1
        vpxor   \T1, \XMM1, \XMM1
        vmovdqu \XMM1, (arg2, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM1
        .endif

        vmovdqu 16*1(arg3, %r11), \T1
        vpxor   \T1, \XMM2, \XMM2
        vmovdqu \XMM2, 16*1(arg2, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM2
        .endif

        vmovdqu 16*2(arg3, %r11), \T1
        vpxor   \T1, \XMM3, \XMM3
        vmovdqu \XMM3, 16*2(arg2, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM3
        .endif

        vmovdqu 16*3(arg3, %r11), \T1
        vpxor   \T1, \XMM4, \XMM4
        vmovdqu \XMM4, 16*3(arg2, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM4
        .endif

        vmovdqu 16*4(arg3, %r11), \T1
        vpxor   \T1, \XMM5, \XMM5
        vmovdqu \XMM5, 16*4(arg2, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM5
        .endif

        vmovdqu 16*5(arg3, %r11), \T1
        vpxor   \T1, \XMM6, \XMM6
        vmovdqu \XMM6, 16*5(arg2, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM6
        .endif

        vmovdqu 16*6(arg3, %r11), \T1
        vpxor   \T1, \XMM7, \XMM7
        vmovdqu \XMM7, 16*6(arg2, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM7
        .endif

        vmovdqu 16*7(arg3, %r11), \T1
        vpxor   \T1, \XMM8, \XMM8
        vmovdqu \XMM8, 16*7(arg2, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM8
        .endif

        add     $128, %r11

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16 Byte swap
        vpxor   TMP1(%rsp), \XMM1, \XMM1        # combine GHASHed value with
                                                # the corresponding ciphertext
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16 Byte swap

###############################################################################
_initial_blocks_done\@:
.endm
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg2, arg3 are used as pointers only, not modified
# r11 is the data offset value
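# Editorial note: the macro below hides latency by interleaving the two
# engines -- each AES round (vaesenc on all 8 counter blocks) is followed by
# a slice of the GHASH work (vpclmulqdq/vpxor on the 8 ciphertext blocks
# saved from the previous iteration), so neither the AES units nor the
# carry-less multiplier sit idle.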
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd  ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd  ONE(%rip), \XMM1, \XMM2
        vpaddd  ONE(%rip), \XMM2, \XMM3
        vpaddd  ONE(%rip), \XMM3, \XMM4
        vpaddd  ONE(%rip), \XMM4, \XMM5
        vpaddd  ONE(%rip), \XMM5, \XMM6
        vpaddd  ONE(%rip), \XMM6, \XMM7
        vpaddd  ONE(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16 Byte swap
.else
        vpaddd  ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd  ONEf(%rip), \XMM1, \XMM2
        vpaddd  ONEf(%rip), \XMM2, \XMM3
        vpaddd  ONEf(%rip), \XMM3, \XMM4
        vpaddd  ONEf(%rip), \XMM4, \XMM5
        vpaddd  ONEf(%rip), \XMM5, \XMM6
        vpaddd  ONEf(%rip), \XMM6, \XMM7
        vpaddd  ONEf(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR
.endif
#######################################################################

        vmovdqu (arg1), \T1
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

#######################################################################

        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

#######################################################################

        vmovdqa HashKey_8(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6    # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5    # T5 = a0*b1
        vpxor   \T5, \T6, \T6

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP2(%rsp), \T1
        vmovdqa HashKey_7(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

#######################################################################

        vmovdqa TMP3(%rsp), \T1
        vmovdqa HashKey_6(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqa HashKey_5(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP5(%rsp), \T1
        vmovdqa HashKey_4(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP6(%rsp), \T1
        vmovdqa HashKey_3(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP7(%rsp), \T1
        vmovdqa HashKey_2(arg1), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

#######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa TMP8(%rsp), \T1
        vmovdqa HashKey(arg1), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T1

        vmovdqu 16*10(arg1), \T5

        i = 0
        j = 1
        setreg
.rep 8
        vpxor   16*i(arg3, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg3, %r11), reg_j
        vmovdqu \T3, 16*i(arg2, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
#######################################################################

        vpslldq $8, \T6, \T3                    # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7

#######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
#######################################################################
        .if \ENC_DEC == ENC
        vmovdqu \XMM1, 16*0(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg2,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg2,%r11)          # Write to the Ciphertext buffer
        .endif

#######################################################################
        # second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
#######################################################################
        vpxor   \T4, \T1, \T1                   # the result is in T1

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16 Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16 Byte swap

        vpxor   \T1, \XMM1, \XMM1
.endm
# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method
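        ## Editorial note: this AVX2 variant still uses Karatsuba, but
        ## derives the (b1^b0) half of each HashKey power on the fly with
        ## vpshufd/vpxor rather than loading stored HashKey_i_k values,
        ## saving table space at the cost of two cheap ops per block.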
        vmovdqa HashKey_8(arg1), \T5

        vpshufd $0b01001110, \XMM1, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM1, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqa HashKey_7(arg1), \T5
        vpshufd $0b01001110, \XMM2, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM2, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_6(arg1), \T5
        vpshufd $0b01001110, \XMM3, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM3, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_5(arg1), \T5
        vpshufd $0b01001110, \XMM4, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM4, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_4(arg1), \T5
        vpshufd $0b01001110, \XMM5, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM5, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_3(arg1), \T5
        vpshufd $0b01001110, \XMM6, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM6, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_2(arg1), \T5
        vpshufd $0b01001110, \XMM7, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM7, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey(arg1), \T5
        vpshufd $0b01001110, \XMM8, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM8, \T2, \T2
        vpxor   \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor   \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        vpxor   \T6, \XMM1, \XMM1
        vpxor   \T7, \XMM1, \T2

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications
#######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2                    # shift-L T2 2 DWs
vpxor \ T 2 , \ T 7 , \ T 7 # f i r s t p h a s e o f t h e r e d u c t i o n c o m p l e t e
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# second p h a s e o f t h e r e d u c t i o n
vpclmulqdq $ 0 x00 , \ T 7 , \ T 3 , \ T 2
vpsrldq $ 4 , \ T 2 , \ T 2 # s h i f t - R T 2 1 D W ( S h i f t - R o n l y 1 - D W t o o b t a i n 2 - D W s s h i f t - R )
vpclmulqdq $ 0 x10 , \ T 7 , \ T 3 , \ T 4
vpslldq $ 4 , \ T 4 , \ T 4 # s h i f t - L T 4 1 D W ( S h i f t - L 1 - D W t o o b t a i n r e s u l t w i t h n o s h i f t s )
vpxor \ T 2 , \ T 4 , \ T 4 # s e c o n d p h a s e o f t h e r e d u c t i o n c o m p l e t e
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
vpxor \ T 4 , \ T 6 , \ T 6 # t h e r e s u l t i s i n T 6
.endm
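#######################################################################
# Karatsuba note for the macro above: each 128x128-bit carry-less
# multiply X*H uses three vpclmulqdq instead of four.  With 64-bit
# halves X = Xh:Xl and H = Hh:Hl:
#
#	X*H = Xh*Hh * x^128
#	      xor ((Xh xor Xl)*(Hh xor Hl) xor Xh*Hh xor Xl*Hl) * x^64
#	      xor Xl*Hl
#
# \T6 accumulates the high products (imm8 0x11), \T7 the low products
# (imm8 0x00), and \XMM1 the middle (Xh xor Xl)*(Hh xor Hl) products
# formed by the vpshufd/vpxor pairs; the vpxor/vpslldq/vpsrldq sequence
# at the end recombines them into <T6:T7> before the reduction.
#######################################################################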
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro	GCM_ENC_DEC_AVX2 ENC_DEC

	#the number of pushes must equal STACK_OFFSET
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	%rsp, %r14

	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp			# align rsp to 64 bytes
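	# Stack note: %r14 preserves the caller's %rsp so the epilogue can
	# restore it with a single mov; rounding %rsp down to a 64-byte
	# boundary means the VARIABLE_OFFSET scratch area (e.g. the TMP1
	# slot used for the final partial block) starts cache-line aligned.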
	vmovdqu	HashKey(arg1), %xmm13		# xmm13 = HashKey

	mov	arg4, %r13			# save the number of bytes of plaintext/ciphertext
	and	$-16, %r13			# r13 = r13 - (r13 mod 16)

	mov	%r13, %r12
	shr	$4, %r12
	and	$7, %r12
	jz	_initial_num_blocks_is_0\@

	cmp	$7, %r12
	je	_initial_num_blocks_is_7\@
	cmp	$6, %r12
	je	_initial_num_blocks_is_6\@
	cmp	$5, %r12
	je	_initial_num_blocks_is_5\@
	cmp	$4, %r12
	je	_initial_num_blocks_is_4\@
	cmp	$3, %r12
	je	_initial_num_blocks_is_3\@
	cmp	$2, %r12
	je	_initial_num_blocks_is_2\@

	jmp	_initial_num_blocks_is_1\@
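	# Dispatch note: %r12 = (number of full 16-byte blocks) mod 8, so
	# the INITIAL_BLOCKS_AVX2 call below peels off just enough blocks
	# that the remaining length is an exact multiple of 128 bytes for
	# the 8-way main loop.  Example: 100 bytes of input -> 6 full
	# blocks, 6 mod 8 = 6 initial blocks, 0 iterations of the 8-way
	# loop.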
_initial_num_blocks_is_7\@:
	INITIAL_BLOCKS_AVX2	7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*7, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
	INITIAL_BLOCKS_AVX2	6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*6, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
	INITIAL_BLOCKS_AVX2	5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*5, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
	INITIAL_BLOCKS_AVX2	4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*4, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
	INITIAL_BLOCKS_AVX2	3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*3, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
	INITIAL_BLOCKS_AVX2	2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*2, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
	INITIAL_BLOCKS_AVX2	1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
	sub	$16*1, %r13
	jmp	_initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
	INITIAL_BLOCKS_AVX2	0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC

_initial_blocks_encrypted\@:
	cmp	$0, %r13
	je	_zero_cipher_left\@

	sub	$128, %r13
	je	_eight_cipher_left\@
	vmovd	%xmm9, %r15d
	and	$255, %r15d
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
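	# Counter-increment note: %r15d caches the low byte of the counter
	# block.  As long as adding 8 cannot carry out of that byte (the
	# cmp $(255-8) test below), the counter stays in its byte-swapped
	# form and the cheaper out_order variant is used; when the byte
	# would wrap, one in_order iteration, bracketed by the extra
	# vpshufb pair, increments the counter in its native byte order so
	# the carry propagates correctly.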
_encrypt_by_8_new\@:
	cmp	$(255-8), %r15d
	jg	_encrypt_by_8\@

	add	$8, %r15b
	GHASH_8_ENCRYPT_8_PARALLEL_AVX2	%xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
	add	$128, %r11
	sub	$128, %r13
	jne	_encrypt_by_8_new\@

	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	jmp	_eight_cipher_left\@

_encrypt_by_8\@:
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	add	$8, %r15b
	GHASH_8_ENCRYPT_8_PARALLEL_AVX2	%xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	add	$128, %r11
	sub	$128, %r13
	jne	_encrypt_by_8_new\@

	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9

_eight_cipher_left\@:
	GHASH_LAST_8_AVX2	%xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8

_zero_cipher_left\@:
	cmp	$16, arg4
	jl	_only_less_than_16\@

	mov	arg4, %r13
	and	$15, %r13			# r13 = (arg4 mod 16)

	je	_multiple_of_16_bytes\@

	# handle the last <16 Byte block separately

	vpaddd	ONE(%rip), %xmm9, %xmm9		# INCR CNT to get Yn
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	ENCRYPT_SINGLE_BLOCK	%xmm9		# E(K, Yn)

	sub	$16, %r11
	add	%r13, %r11
	vmovdqu	(arg3, %r11), %xmm1		# receive the last <16 Byte block

	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12			# adjust the shuffle mask pointer to be
						# able to shift 16-r13 bytes (r13 is the
						# number of bytes in plaintext mod 16)
	vmovdqu	(%r12), %xmm2			# get the appropriate shuffle mask
	vpshufb	%xmm2, %xmm1, %xmm1		# shift right 16-r13 bytes
	jmp	_final_ghash_mul\@
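	# Partial-block note: the path above runs only when the message is
	# at least 16 bytes long, so it can load the final 16 input bytes
	# directly (the load window is slid back so it ends exactly at the
	# end of the input, overlapping the previous block) and then shift
	# the wanted r13 bytes into place.  The _only_less_than_16 path
	# below cannot do that without reading before the start of the
	# buffer, so it copies the tail byte by byte through the TMP1
	# stack slot instead.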
_only_less_than_16\@:
	# check for 0 length
	mov	arg4, %r13
	and	$15, %r13			# r13 = (arg4 mod 16)

	je	_multiple_of_16_bytes\@

	# handle the last <16 Byte block separately

	vpaddd	ONE(%rip), %xmm9, %xmm9		# INCR CNT to get Yn
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	ENCRYPT_SINGLE_BLOCK	%xmm9		# E(K, Yn)

	lea	SHIFT_MASK+16(%rip), %r12
	sub	%r13, %r12			# adjust the shuffle mask pointer to be
						# able to shift 16-r13 bytes (r13 is the
						# number of bytes in plaintext mod 16)

_get_last_16_byte_loop\@:
	movb	(arg3, %r11), %al
	movb	%al, TMP1(%rsp, %r11)
	add	$1, %r11
	cmp	%r13, %r11
	jne	_get_last_16_byte_loop\@

	vmovdqu	TMP1(%rsp), %xmm1

	sub	$16, %r11
_final_ghash_mul\@:
	.if  \ENC_DEC == DEC
	vmovdqa	%xmm1, %xmm2
	vpxor	%xmm1, %xmm9, %xmm9		# Plaintext XOR E(K, Yn)
	vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1	# get the appropriate mask to mask out top 16-r13 bytes of xmm9
	vpand	%xmm1, %xmm9, %xmm9		# mask out top 16-r13 bytes of xmm9
	vpand	%xmm1, %xmm2, %xmm2
	vpshufb	SHUF_MASK(%rip), %xmm2, %xmm2
	vpxor	%xmm2, %xmm14, %xmm14
	#GHASH computation for the last <16 Byte block
	GHASH_MUL_AVX2	%xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	sub	%r13, %r11
	add	$16, %r11
	.else
	vpxor	%xmm1, %xmm9, %xmm9		# Plaintext XOR E(K, Yn)
	vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1	# get the appropriate mask to mask out top 16-r13 bytes of xmm9
	vpand	%xmm1, %xmm9, %xmm9		# mask out top 16-r13 bytes of xmm9
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9
	vpxor	%xmm9, %xmm14, %xmm14
	#GHASH computation for the last <16 Byte block
	GHASH_MUL_AVX2	%xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	sub	%r13, %r11
	add	$16, %r11
	vpshufb	SHUF_MASK(%rip), %xmm9, %xmm9	# shuffle xmm9 back to output as ciphertext
	.endif
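	# ENC/DEC note: GHASH must always absorb the ciphertext.  On
	# decryption the input block (saved in %xmm2 before the XOR) is
	# the ciphertext, so that copy is masked and folded into the hash;
	# on encryption the freshly produced ciphertext in %xmm9 is folded
	# in instead.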
	#############################
	# output r13 Bytes
	vmovq	%xmm9, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left\@

	mov	%rax, (arg2, %r11)
	add	$8, %r11
	vpsrldq	$8, %xmm9, %xmm9
	vmovq	%xmm9, %rax
	sub	$8, %r13

_less_than_8_bytes_left\@:
	movb	%al, (arg2, %r11)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left\@
	#############################
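	# Output note: the 1..15 tail bytes are emitted through %rax (one
	# 8-byte store when possible, then single bytes shifted out of
	# %rax), so the routine never stores past the end of the output
	# buffer.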
_multiple_of_16_bytes\@:
	mov	arg7, %r12			# r12 = aadLen (number of bytes)
	shl	$3, %r12			# convert into number of bits
	vmovd	%r12d, %xmm15			# len(A) in xmm15

	shl	$3, arg4			# len(C) in bits (*8)
	vmovq	arg4, %xmm1
	vpslldq	$8, %xmm15, %xmm15		# xmm15 = len(A) || 0x0000000000000000
	vpxor	%xmm1, %xmm15, %xmm15		# xmm15 = len(A) || len(C)

	vpxor	%xmm15, %xmm14, %xmm14
	GHASH_MUL_AVX2	%xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6	# final GHASH computation
	vpshufb	SHUF_MASK(%rip), %xmm14, %xmm14	# perform a 16Byte swap

	mov	arg5, %rax			# rax = *Y0
	vmovdqu	(%rax), %xmm9			# xmm9 = Y0

	ENCRYPT_SINGLE_BLOCK	%xmm9		# E(K, Y0)

	vpxor	%xmm14, %xmm9, %xmm9
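	# Tag note: per the GCM specification, the hash is finished with
	# one extra block containing len(A) || len(C) in bits, and the
	# authentication tag is T = E(K, Y0) xor GHASH, where Y0 is the
	# initial counter block.  %xmm9 now holds the full 16-byte tag.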
_return_T\@:
	mov	arg8, %r10		# r10 = authTag
	mov	arg9, %r11		# r11 = auth_tag_len

	cmp	$16, %r11
	je	_T_16\@
	cmp	$8, %r11
	jl	_T_4\@
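	# The cascade below emits the tag in 8-, 4-, 2- and 1-byte chunks,
	# shifting %xmm9 right as it goes, so any auth_tag_len from 1 to
	# 16 bytes is written without touching memory past the tag buffer.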
_T_8\@:
	vmovq	%xmm9, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	vpsrldq	$8, %xmm9, %xmm9
	cmp	$0, %r11
	je	_return_T_done\@
_T_4\@:
	vmovd	%xmm9, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	vpsrldq	$4, %xmm9, %xmm9
	cmp	$0, %r11
	je	_return_T_done\@
_T_123\@:
	vmovd	%xmm9, %eax
	cmp	$2, %r11
	jl	_T_1\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done\@
	add	$2, %r10
	sar	$16, %eax
_T_1\@:
	mov	%al, (%r10)
	jmp	_return_T_done\@

_T_16\@:
	vmovdqu	%xmm9, (%r10)

_return_T_done\@:
	mov	%r14, %rsp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
.endm
#############################################################
#void   aesni_gcm_precomp_avx_gen4
#        (gcm_data     *my_ctx_data,
#         u8     *hash_subkey)# /* H, the Hash sub key input.
#				   Data starts on a 16-byte boundary. */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen4)
	#the number of pushes must equal STACK_OFFSET
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	%rsp, %r14

	sub	$VARIABLE_OFFSET, %rsp
	and	$~63, %rsp		# align rsp to 64 bytes

	vmovdqu	(arg2), %xmm6		# xmm6 = HashKey

	vpshufb	SHUF_MASK(%rip), %xmm6, %xmm6
	###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
	vmovdqa	%xmm6, %xmm2
	vpsllq	$1, %xmm6, %xmm6
	vpsrlq	$63, %xmm2, %xmm2
	vmovdqa	%xmm2, %xmm1
	vpslldq	$8, %xmm2, %xmm2
	vpsrldq	$8, %xmm1, %xmm1
	vpor	%xmm2, %xmm6, %xmm6

	#reduction
	vpshufd	$0b00100100, %xmm1, %xmm2
	vpcmpeqd	TWOONE(%rip), %xmm2, %xmm2
	vpand	POLY(%rip), %xmm2, %xmm2
	vpxor	%xmm2, %xmm6, %xmm6	# xmm6 holds the HashKey<<1 mod poly
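	# Precomputation note: the 128-bit shift left by one is built from
	# two 64-bit shifts (vpsllq/vpsrlq) plus an OR to carry the bit
	# across the qword boundary.  If a bit was carried out of bit 127
	# (detected by the vpshufd/vpcmpeqd compare against TWOONE), POLY
	# is XORed back in, i.e. the value is reduced mod the GCM
	# polynomial.  Premultiplying H by x this way is the standard
	# trick from the Intel GCM white papers for working on
	# bit-reflected operands without per-block reflection.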
	#######################################################################
	vmovdqa	%xmm6, HashKey(arg1)	# store HashKey<<1 mod poly

	PRECOMPUTE_AVX2	%xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
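	# PRECOMPUTE_AVX2 derives the higher powers of the hash key
	# (the HashKey_2 .. HashKey_8 slots consumed by the 8-way GHASH
	# code above), so this per-key setup cost is paid only once.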
	mov	%r14, %rsp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	ret
ENDPROC(aesni_gcm_precomp_avx_gen4)
###############################################################################
#void   aesni_gcm_enc_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len, /* Length of data in Bytes for encryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#			(from Security Association) concatenated with 8 byte
#			Initialisation Vector (from IPSec ESP Payload)
#			concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#				Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_enc_avx_gen4)
	GCM_ENC_DEC_AVX2 ENC
	ret
ENDPROC(aesni_gcm_enc_avx_gen4)
###############################################################################
#void   aesni_gcm_dec_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len, /* Length of data in Bytes for decryption. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#			(from Security Association) concatenated with 8 byte
#			Initialisation Vector (from IPSec ESP Payload)
#			concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
#				Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen4)
	GCM_ENC_DEC_AVX2 DEC
	ret
ENDPROC(aesni_gcm_dec_avx_gen4)

#endif /* CONFIG_AS_AVX2 */