########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
#	Erdinc Ozturk <erdinc.ozturk@intel.com>
#	Vinodh Gopal <vinodh.gopal@intel.com>
#	James Guilford <james.guilford@intel.com>
#	Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
#	Function API:
#	UINT16 crc_t10dif_pcl(
#		UINT16 init_crc,          // initial CRC value, 16 bits
#		const unsigned char *buf, // buffer pointer to calculate CRC on
#		UINT64 len                // buffer length in bytes (64-bit data)
#	);
#
#	Reference paper titled "Fast CRC Computation for Generic
#	Polynomials Using PCLMULQDQ Instruction"
#	URL: http://www.intel.com/content/dam/www/public/us/en/documents
#	/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
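#
#	For reference, a minimal bitwise C model of the same CRC, assuming
#	the standard T10-DIF parameters (polynomial 0x8BB7, MSB-first, no
#	reflection, zero initial value); handy for cross-checking this
#	routine:
#
#	uint16_t crc_t10dif_ref(uint16_t crc, const unsigned char *buf,
#				uint64_t len)
#	{
#		while (len--) {
#			crc ^= (uint16_t)(*buf++) << 8;
#			for (int i = 0; i < 8; i++)
#				crc = (crc & 0x8000) ? (crc << 1) ^ 0x8BB7
#						     : (crc << 1);
#		}
#		return crc;
#	}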
#
#include <linux/linkage.h>

.text

#define	arg1 %rdi
#define	arg2 %rsi
#define	arg3 %rdx

#define	arg1_low32 %edi
ENTRY(crc_t10dif_pcl)
.align 16
	# adjust the 16-bit initial_crc value, scale it to 32 bits
	shl	$16, arg1_low32

	# Allocate Stack Space
	mov	%rsp, %rcx
	sub	$16*2, %rsp
	# align stack to 16 byte boundary
	and	$~(0x10-1), %rsp
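	# (the and with ~0xf rounds %rsp down to the next 16-byte boundary;
	# the original %rsp, saved in %rcx above, is restored in _cleanup)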
	# check if smaller than 256
	cmp	$256, arg3

	# for sizes less than 256, we can't fold 128B at a time...
	jl	_less_than_128
	# load the initial crc value
	movd	arg1_low32, %xmm10	# initial crc

	# crc value does not need to be byte-reflected, but it needs
	# to be moved to the high part of the register.
	# because data will be byte-reflected and will align with
	# initial crc at correct place.
	pslldq	$12, %xmm10
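	# (for example, an init_crc of 0xABCD was scaled to 0xABCD0000 by
	# the shl above; movd placed it in the low dword, and pslldq $12
	# moves it into the top dword, where it lines up with the first
	# four byte-reflected message bytes)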
	movdqa	SHUF_MASK(%rip), %xmm11
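	# SHUF_MASK reverses the 16 bytes of a register (pshufb writes
	# dst[i] = src[mask[i]]), so the most significant message byte ends
	# up in the most significant lane, matching the bit order that the
	# carry-less multiplies assume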
	# receive the initial 128B data, xor the initial crc value
	movdqu	16*0(arg2), %xmm0
	movdqu	16*1(arg2), %xmm1
	movdqu	16*2(arg2), %xmm2
	movdqu	16*3(arg2), %xmm3
	movdqu	16*4(arg2), %xmm4
	movdqu	16*5(arg2), %xmm5
	movdqu	16*6(arg2), %xmm6
	movdqu	16*7(arg2), %xmm7

	pshufb	%xmm11, %xmm0
	# XOR the initial_crc value
	pxor	%xmm10, %xmm0
	pshufb	%xmm11, %xmm1
	pshufb	%xmm11, %xmm2
	pshufb	%xmm11, %xmm3
	pshufb	%xmm11, %xmm4
	pshufb	%xmm11, %xmm5
	pshufb	%xmm11, %xmm6
	pshufb	%xmm11, %xmm7

	movdqa	rk3(%rip), %xmm10	#xmm10 has rk3 and rk4
					#imm value of pclmulqdq instruction
					#will determine which constant to use
	#################################################################
	# we subtract 256 instead of 128 to save one instruction from the loop
	sub	$256, arg3

	# at this section of the code, there is 128*x+y (0 <= y < 128) bytes
	# of buffer. The _fold_64_B_loop will fold 128B at a time
	# until we have 128+y Bytes of buffer

	# fold 128B at a time. This section of the code folds 8 xmm
	# registers in parallel
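	# folding identity used throughout (a sketch): over GF(2), a 128-bit
	# chunk D = D_hi*x^64 + D_lo that lies N bits ahead of other data
	# contributes D*x^N to the message polynomial, and
	#	D*x^N mod P = (clmul(D_hi, x^(N+64) mod P) ^
	#		       clmul(D_lo, x^N mod P)) mod P
	# so two pclmulqdq ops fold D into data N bits further along; each
	# rk pair holds such precomputed residues, and the full reduction
	# mod P is deferred to the steps at _128_done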
_fold_64_B_loop:

	# update the buffer pointer
	add	$128, arg2		#	buf += 128
	movdqu	16*0(arg2), %xmm9
	movdqu	16*1(arg2), %xmm12
	pshufb	%xmm11, %xmm9
	pshufb	%xmm11, %xmm12
	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm13
	pclmulqdq	$0x0, %xmm10, %xmm0
	pclmulqdq	$0x11, %xmm10, %xmm8
	pclmulqdq	$0x0, %xmm10, %xmm1
	pclmulqdq	$0x11, %xmm10, %xmm13
	pxor	%xmm9, %xmm0
	xorps	%xmm8, %xmm0
	pxor	%xmm12, %xmm1
	xorps	%xmm13, %xmm1

	movdqu	16*2(arg2), %xmm9
	movdqu	16*3(arg2), %xmm12
	pshufb	%xmm11, %xmm9
	pshufb	%xmm11, %xmm12
	movdqa	%xmm2, %xmm8
	movdqa	%xmm3, %xmm13
	pclmulqdq	$0x0, %xmm10, %xmm2
	pclmulqdq	$0x11, %xmm10, %xmm8
	pclmulqdq	$0x0, %xmm10, %xmm3
	pclmulqdq	$0x11, %xmm10, %xmm13
	pxor	%xmm9, %xmm2
	xorps	%xmm8, %xmm2
	pxor	%xmm12, %xmm3
	xorps	%xmm13, %xmm3

	movdqu	16*4(arg2), %xmm9
	movdqu	16*5(arg2), %xmm12
	pshufb	%xmm11, %xmm9
	pshufb	%xmm11, %xmm12
	movdqa	%xmm4, %xmm8
	movdqa	%xmm5, %xmm13
	pclmulqdq	$0x0, %xmm10, %xmm4
	pclmulqdq	$0x11, %xmm10, %xmm8
	pclmulqdq	$0x0, %xmm10, %xmm5
	pclmulqdq	$0x11, %xmm10, %xmm13
	pxor	%xmm9, %xmm4
	xorps	%xmm8, %xmm4
	pxor	%xmm12, %xmm5
	xorps	%xmm13, %xmm5

	movdqu	16*6(arg2), %xmm9
	movdqu	16*7(arg2), %xmm12
	pshufb	%xmm11, %xmm9
	pshufb	%xmm11, %xmm12
	movdqa	%xmm6, %xmm8
	movdqa	%xmm7, %xmm13
	pclmulqdq	$0x0, %xmm10, %xmm6
	pclmulqdq	$0x11, %xmm10, %xmm8
	pclmulqdq	$0x0, %xmm10, %xmm7
	pclmulqdq	$0x11, %xmm10, %xmm13
	pxor	%xmm9, %xmm6
	xorps	%xmm8, %xmm6
	pxor	%xmm12, %xmm7
	xorps	%xmm13, %xmm7
	sub	$128, arg3

	# check if there is another 128B in the buffer to be able to fold
	jge	_fold_64_B_loop
	##################################################################

	add	$128, arg2
	# at this point, the buffer pointer is pointing at the last y Bytes
	# of the buffer, and the 128B of folded data is in 8 of the xmm
	# registers: xmm0 through xmm7

	# fold the 8 xmm registers to 1 xmm register with different constants
	movdqa	rk9(%rip), %xmm10
	movdqa	%xmm0, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm0
	pclmulqdq	$0x0, %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	xorps	%xmm0, %xmm7

	movdqa	rk11(%rip), %xmm10
	movdqa	%xmm1, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm1
	pclmulqdq	$0x0, %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	xorps	%xmm1, %xmm7

	movdqa	rk13(%rip), %xmm10
	movdqa	%xmm2, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm2
	pclmulqdq	$0x0, %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm2, %xmm7

	movdqa	rk15(%rip), %xmm10
	movdqa	%xmm3, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm3
	pclmulqdq	$0x0, %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	xorps	%xmm3, %xmm7

	movdqa	rk17(%rip), %xmm10
	movdqa	%xmm4, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm4
	pclmulqdq	$0x0, %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm4, %xmm7

	movdqa	rk19(%rip), %xmm10
	movdqa	%xmm5, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm5
	pclmulqdq	$0x0, %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	xorps	%xmm5, %xmm7

	movdqa	rk1(%rip), %xmm10	#xmm10 has rk1 and rk2
					#imm value of pclmulqdq instruction
					#will determine which constant to use
	movdqa	%xmm6, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm6
	pclmulqdq	$0x0, %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm6, %xmm7
	# instead of 128, we add 112 (128-16) to the loop counter to save 1
	# instruction from the loop. instead of a cmp instruction, we use
	# the negative flag with the jl instruction
	add	$128-16, arg3
	jl	_final_reduction_for_128
	# now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
	# and the rest is in memory. We can fold 16 bytes at a time if y>=16
	# continue folding 16B at a time
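	# each pass below computes, in effect,
	#	xmm7 = clmul(xmm7_hi, rk2) ^ clmul(xmm7_lo, rk1) ^ M[i]
	# where M[i] is the next byte-reflected 16-byte block of the buffer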
_16B_reduction_loop:
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm7
	pclmulqdq	$0x0, %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	movdqu	(arg2), %xmm0
	pshufb	%xmm11, %xmm0
	pxor	%xmm0, %xmm7
	add	$16, arg2
	sub	$16, arg3
	# instead of a cmp instruction, we utilize the flags with the
	# jge instruction equivalent of: cmp arg3, 16-16
	# check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop
	# now we have 16+z bytes left to reduce, where 0 <= z < 16.
	# first, we reduce the data in the xmm7 register
_final_reduction_for_128:
	# check if any more data to fold. If not, compute the CRC of
	# the final 128 bits
	add	$16, arg3
	je	_128_done

	# here we are getting data that is less than 16 bytes.
	# since we know that there was data before the pointer, we can
	# offset the input pointer before the actual point, to receive
	# exactly 16 bytes. after that the registers need to be adjusted.
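	# worked example (for illustration): with arg3 = 3, %rax below ends
	# up at pshufb_shf_table+13, so xmm0 = {0x8d,0x8e,0x8f,0x00,...,0x0c}.
	# pshufb with that mask shifts xmm2 left by 3 bytes (the mask bytes
	# with their high bit set zero those lanes); xoring the mask with
	# mask1 (0x8080...80) flips it into a shift right by 16-3 bytes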
_get_last_two_xmms:
	movdqa	%xmm7, %xmm2

	movdqu	-16(arg2, arg3), %xmm1
	pshufb	%xmm11, %xmm1

	# get rid of the extra data that was loaded before
	# load the shift constant
	lea	pshufb_shf_table+16(%rip), %rax
	sub	arg3, %rax
	movdqu	(%rax), %xmm0

	# shift xmm2 to the left by arg3 bytes
	pshufb	%xmm0, %xmm2

	# shift xmm7 to the right by 16-arg3 bytes
	pxor	mask1(%rip), %xmm0
	pshufb	%xmm0, %xmm7
	pblendvb	%xmm2, %xmm1	#xmm0 is implicit
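	# pblendvb merges the two byte-by-byte: lanes whose (implicit) xmm0
	# mask byte has its high bit set are taken from xmm2, the rest keep
	# the freshly loaded tail bytes in xmm1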
	# fold 16 Bytes
	movdqa	%xmm1, %xmm2
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, %xmm10, %xmm7
	pclmulqdq	$0x0, %xmm10, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm2, %xmm7

_128_done:
	# compute crc of a 128-bit value
	movdqa	rk5(%rip), %xmm10	# rk5 and rk6 in xmm10
	movdqa	%xmm7, %xmm0

	#64b fold
	pclmulqdq	$0x1, %xmm10, %xmm7
	pslldq	$8, %xmm0
	pxor	%xmm0, %xmm7

	#32b fold
	movdqa	%xmm7, %xmm0
	pand	mask2(%rip), %xmm0
	psrldq	$12, %xmm7
	pclmulqdq	$0x10, %xmm10, %xmm7
	pxor	%xmm0, %xmm7
	# barrett reduction
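	# (a sketch of the idea: Barrett reduction replaces a polynomial
	# division by Q with two carry-less multiplies, using the
	# precomputed constants rk7 = floor(2^64/Q) and rk8 = Q:
	#	T1 = clmul(floor(R / x^32), rk7)
	#	T2 = clmul(floor(T1 / x^32), rk8)
	#	crc = (R xor T2) mod x^32
	# with the pslldq/pextrd below compensating for all values being
	# scaled up by 32 bits)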
_barrett:
	movdqa	rk7(%rip), %xmm10	# rk7 and rk8 in xmm10
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x01, %xmm10, %xmm7
	pslldq	$4, %xmm7
	pclmulqdq	$0x11, %xmm10, %xmm7

	pslldq	$4, %xmm7
	pxor	%xmm0, %xmm7
	pextrd	$1, %xmm7, %eax

_cleanup:
	# scale the result back to 16 bits
	shr	$16, %eax
	mov	%rcx, %rsp
	ret
########################################################################

.align 16
_less_than_128:

	# check if there is enough buffer to be able to fold 16B at a time
	cmp	$32, arg3
	jl	_less_than_32
	movdqa	SHUF_MASK(%rip), %xmm11

	# now if there is, load the constants
	movdqa	rk1(%rip), %xmm10	# rk1 and rk2 in xmm10

	movd	arg1_low32, %xmm0	# get the initial crc value
	pslldq	$12, %xmm0		# align it to its correct place
	movdqu	(arg2), %xmm7		# load the plaintext
	pshufb	%xmm11, %xmm7		# byte-reflect the plaintext
	pxor	%xmm0, %xmm7

	# update the buffer pointer
	add	$16, arg2

	# update the counter. subtract 32 instead of 16 to save one
	# instruction from the loop
	sub	$32, arg3

	jmp	_16B_reduction_loop
.align 16
_less_than_32:
	# mov initial crc to the return value. this is necessary for
	# zero-length buffers.
	mov	arg1_low32, %eax
	test	arg3, arg3
	je	_cleanup

	movdqa	SHUF_MASK(%rip), %xmm11

	movd	arg1_low32, %xmm0	# get the initial crc value
	pslldq	$12, %xmm0		# align it to its correct place

	cmp	$16, arg3
	je	_exact_16_left
	jl	_less_than_16_left

	movdqu	(arg2), %xmm7		# load the plaintext
	pshufb	%xmm11, %xmm7		# byte-reflect the plaintext
	pxor	%xmm0, %xmm7		# xor the initial crc value
	add	$16, arg2
	sub	$16, arg3
	movdqa	rk1(%rip), %xmm10	# rk1 and rk2 in xmm10
	jmp	_get_last_two_xmms
.align 16
_less_than_16_left:
	# use stack space to load data less than 16 bytes, zero-out
	# the 16B in memory first.
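	# (equivalent C sketch of the copy ladder below, assuming a zeroed
	# 16-byte stack buffer tmp and p = tmp:
	#	if (len >= 8) { memcpy(p, buf, 8); p += 8; buf += 8; len -= 8; }
	#	if (len >= 4) { memcpy(p, buf, 4); p += 4; buf += 4; len -= 4; }
	#	if (len >= 2) { memcpy(p, buf, 2); p += 2; buf += 2; len -= 2; }
	#	if (len >= 1) { *p = *buf; }
	# lengths below 4 take the separate _only_less_than_4 path instead)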
	pxor	%xmm1, %xmm1
	mov	%rsp, %r11
	movdqa	%xmm1, (%r11)

	cmp	$4, arg3
	jl	_only_less_than_4

	# backup the counter value
	mov	arg3, %r9
	cmp	$8, arg3
	jl	_less_than_8_left

	# load 8 Bytes
	mov	(arg2), %rax
	mov	%rax, (%r11)
	add	$8, %r11
	sub	$8, arg3
	add	$8, arg2
_less_than_8_left:
	cmp	$4, arg3
	jl	_less_than_4_left

	# load 4 Bytes
	mov	(arg2), %eax
	mov	%eax, (%r11)
	add	$4, %r11
	sub	$4, arg3
	add	$4, arg2
_less_than_4_left:
	cmp	$2, arg3
	jl	_less_than_2_left

	# load 2 Bytes
	mov	(arg2), %ax
	mov	%ax, (%r11)
	add	$2, %r11
	sub	$2, arg3
	add	$2, arg2
_less_than_2_left:
	cmp	$1, arg3
	jl	_zero_left

	# load 1 Byte
	mov	(arg2), %al
	mov	%al, (%r11)
_zero_left:
	movdqa	(%rsp), %xmm7
	pshufb	%xmm11, %xmm7
	pxor	%xmm0, %xmm7	# xor the initial crc value

	# shl r9, 4
	lea	pshufb_shf_table+16(%rip), %rax
	sub	%r9, %rax
	movdqu	(%rax), %xmm0
	pxor	mask1(%rip), %xmm0

	pshufb	%xmm0, %xmm7
	jmp	_128_done
.align 16
_exact_16_left:
	movdqu	(arg2), %xmm7
	pshufb	%xmm11, %xmm7
	pxor	%xmm0, %xmm7	# xor the initial crc value

	jmp	_128_done

_only_less_than_4:
	cmp	$3, arg3
	jl	_only_less_than_3

	# load 3 Bytes
	mov	(arg2), %al
	mov	%al, (%r11)

	mov	1(arg2), %al
	mov	%al, 1(%r11)

	mov	2(arg2), %al
	mov	%al, 2(%r11)

	movdqa	(%rsp), %xmm7
	pshufb	%xmm11, %xmm7
	pxor	%xmm0, %xmm7	# xor the initial crc value

	psrldq	$5, %xmm7

	jmp	_barrett
_only_less_than_3:
	cmp	$2, arg3
	jl	_only_less_than_2

	# load 2 Bytes
	mov	(arg2), %al
	mov	%al, (%r11)

	mov	1(arg2), %al
	mov	%al, 1(%r11)

	movdqa	(%rsp), %xmm7
	pshufb	%xmm11, %xmm7
	pxor	%xmm0, %xmm7	# xor the initial crc value

	psrldq	$6, %xmm7

	jmp	_barrett
_only_less_than_2:

	# load 1 Byte
	mov	(arg2), %al
	mov	%al, (%r11)

	movdqa	(%rsp), %xmm7
	pshufb	%xmm11, %xmm7
	pxor	%xmm0, %xmm7	# xor the initial crc value

	psrldq	$7, %xmm7

	jmp	_barrett

ENDPROC(crc_t10dif_pcl)
.section	.rodata, "a", @progbits
.align 16
# precomputed constants
# these constants are precomputed from the poly:
#	0x8bb70000 (0x8bb7 scaled to 32 bits)
# Q = 0x18BB70000
# rk1 = 2^(32*3) mod Q << 32
# rk2 = 2^(32*5) mod Q << 32
# rk3 = 2^(32*15) mod Q << 32
# rk4 = 2^(32*17) mod Q << 32
# rk5 = 2^(32*3) mod Q << 32
# rk6 = 2^(32*2) mod Q << 32
# rk7 = floor(2^64/Q)
# rk8 = Q
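#
# a C sketch of how the rk1..rk6 style entries could be reproduced
# (assuming plain GF(2) polynomial arithmetic; the helper name is ours):
#
#	uint64_t xn_mod_q_scaled(unsigned n)	/* (x^n mod Q) << 32 */
#	{
#		uint64_t r = 1;			/* x^0 */
#		while (n--) {
#			r <<= 1;		/* multiply by x */
#			if (r & (1ULL << 32))	/* degree hit 32: reduce */
#				r ^= 0x18BB70000ULL;
#		}
#		return r << 32;
#	}
#
#	/* e.g. xn_mod_q_scaled(32*3) should reproduce rk1 */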
rk1:
.quad 0x2d56000000000000
rk2:
.quad 0x06df000000000000
rk3:
.quad 0x9d9d000000000000
rk4:
.quad 0x7cf5000000000000
rk5:
.quad 0x2d56000000000000
rk6:
.quad 0x1368000000000000
rk7:
.quad 0x00000001f65a57f8
rk8:
.quad 0x000000018bb70000
rk9:
.quad 0xceae000000000000
rk10:
.quad 0xbfd6000000000000
rk11:
.quad 0x1e16000000000000
rk12:
.quad 0x713c000000000000
rk13:
.quad 0xf7f9000000000000
rk14:
.quad 0x80a6000000000000
rk15:
.quad 0x044c000000000000
rk16:
.quad 0xe658000000000000
rk17:
.quad 0xad18000000000000
rk18:
.quad 0xa497000000000000
rk19:
.quad 0x6ee3000000000000
rk20:
.quad 0xe7b5000000000000
.section	.rodata.cst16.mask1, "aM", @progbits, 16
.align 16
mask1:
.octa 0x80808080808080808080808080808080
.section	.rodata.cst16.mask2, "aM", @progbits, 16
.align 16
mask2:
.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:
.octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
.align 32
pshufb_shf_table:
# use these values for shift constants for the pshufb instruction
# different alignments result in values as shown:
#	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
#	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
#	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
#	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
#	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
#	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
#	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
#	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
#	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
#	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
#	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
#	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
#	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
#	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
#	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
.octa 0x8f8e8d8c8b8a89888786858483828100
.octa 0x000e0d0c0b0a09080706050403020100