2013-03-26 13:59:05 -07:00
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Implement f a s t S H A - 2 5 6 w i t h A V X 1 i n s t r u c t i o n s . ( x86 _ 6 4 )
#
# Copyright ( C ) 2 0 1 3 I n t e l C o r p o r a t i o n .
#
# Authors :
# James G u i l f o r d < j a m e s . g u i l f o r d @intel.com>
# Kirk Y a p < k i r k . s . y a p @intel.com>
# Tim C h e n < t i m . c . c h e n @linux.intel.com>
#
# This s o f t w a r e i s a v a i l a b l e t o y o u u n d e r a c h o i c e o f o n e o f t w o
# licenses. Y o u m a y c h o o s e t o b e l i c e n s e d u n d e r t h e t e r m s o f t h e G N U
# General P u b l i c L i c e n s e ( G P L ) V e r s i o n 2 , a v a i l a b l e f r o m t h e f i l e
# COPYING i n t h e m a i n d i r e c t o r y o f t h i s s o u r c e t r e e , o r t h e
# OpenIB. o r g B S D l i c e n s e b e l o w :
#
# Redistribution a n d u s e i n s o u r c e a n d b i n a r y f o r m s , w i t h o r
# without m o d i f i c a t i o n , a r e p e r m i t t e d p r o v i d e d t h a t t h e f o l l o w i n g
# conditions a r e m e t :
#
# - Redistributions o f s o u r c e c o d e m u s t r e t a i n t h e a b o v e
# copyright n o t i c e , t h i s l i s t o f c o n d i t i o n s a n d t h e f o l l o w i n g
# disclaimer.
#
# - Redistributions i n b i n a r y f o r m m u s t r e p r o d u c e t h e a b o v e
# copyright n o t i c e , t h i s l i s t o f c o n d i t i o n s a n d t h e f o l l o w i n g
# disclaimer i n t h e d o c u m e n t a t i o n a n d / o r o t h e r m a t e r i a l s
# provided w i t h t h e d i s t r i b u t i o n .
#
# THE S O F T W A R E I S P R O V I D E D " A S I S " , W I T H O U T W A R R A N T Y O F A N Y K I N D ,
# EXPRESS O R I M P L I E D , I N C L U D I N G B U T N O T L I M I T E D T O T H E W A R R A N T I E S O F
# MERCHANTABILITY, F I T N E S S F O R A P A R T I C U L A R P U R P O S E A N D
# NONINFRINGEMENT. I N N O E V E N T S H A L L T H E A U T H O R S O R C O P Y R I G H T H O L D E R S
# BE L I A B L E F O R A N Y C L A I M , D A M A G E S O R O T H E R L I A B I L I T Y , W H E T H E R I N A N
# ACTION O F C O N T R A C T , T O R T O R O T H E R W I S E , A R I S I N G F R O M , O U T O F O R I N
# CONNECTION W I T H T H E S O F T W A R E O R T H E U S E O R O T H E R D E A L I N G S I N T H E
# SOFTWARE.
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# This c o d e i s d e s c r i b e d i n a n I n t e l W h i t e - P a p e r :
# " Fast S H A - 2 5 6 I m p l e m e n t a t i o n s o n I n t e l A r c h i t e c t u r e P r o c e s s o r s "
#
# To f i n d i t , s u r f t o h t t p : / / w w w . i n t e l . c o m / p / e n _ U S / e m b e d d e d
# and s e a r c h f o r t h a t t i t l e .
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# This c o d e s c h e d u l e s 1 b l o c k a t a t i m e , w i t h 4 l a n e s p e r b l o c k
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# ifdef C O N F I G _ A S _ A V X
# include < l i n u x / l i n k a g e . h >
# # assume b u f f e r s n o t a l i g n e d
# define V M O V D Q v m o v d q u
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Define M a c r o s
# addm mem, reg
# Accumulate a memory dword into a register and write the sum back:
#     reg += mem;  mem = reg
# Both operands hold the sum afterwards.  The add updates the flags.
.macro addm mem reg
	add	\mem, \reg		# reg = reg + mem
	mov	\reg, \mem		# mem = reg
.endm
# MY_ROR count, reg
# Rotate the 32-bit register reg right by count bits.
# The rotate is expressed as a double-precision left shift of the register
# into itself by (32 - count), which yields the same bit pattern.
.macro MY_ROR count reg
	shld	$(32-(\count)), \reg, \reg
.endm
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load 16 bytes from mem into xmm with VMOVDQ, then permute the bytes
# with vpshufb using byte_flip_mask as the control vector.
.macro COPY_XMM_AND_BSWAP dst src mask
	VMOVDQ	\src, \dst		# dst = 16 bytes at src
	vpshufb	\mask, \dst, \dst	# dst = bytes of dst selected by mask
.endm
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Message schedule registers.  FOUR_ROUNDS_AND_SCHED reads X0 as W[-16..-13]
# and X3 as W[-4..-1]; rotate_Xs renames them after each invocation.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

# Vector scratch registers
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XTMP5 = %xmm11
XFER = %xmm9			# K[t] + W[t] staging, spilled to _XFER(%rsp)

# Shuffle control vectors, loaded once at function entry
SHUF_00BA = %xmm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

# Function arguments
CTX = %rdi			# 1st arg
INP = %rsi			# 2nd arg
NUM_BLKS = %rdx			# 3rd arg

SRND = %rsi			# clobbers INP
TBL = %rbp

# Working variables a..h; ROTATE_ARGS renames them after every round.
# Note that e shares a register with NUM_BLKS.
a = %eax
b = %ebx
c = %ecx
d = %r8d
e = %edx
f = %r9d
g = %r10d
h = %r11d

# Scalar scratch registers
y0 = %r13d
y1 = %r14d
y2 = %r15d

# Sizes of the stack frame slots, in bytes
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

# Offsets of the stack frame slots from the aligned %rsp
_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
# rotate_Xs
# Rename the schedule symbols at assembly time; no instructions are emitted.
# Afterwards X0 names the old X1, X1 the old X2, X2 the old X3, and X3 the
# old X0.
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm
# ROTATE_ARGS
# Rename the working-variable symbols a..h at assembly time; no instructions
# are emitted.  Each symbol takes over the register of its predecessor
# (b <- a, c <- b, ..., h <- g) and a takes over the register that was h.
.macro ROTATE_ARGS
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
# FOUR_ROUNDS_AND_SCHED
# Perform four compression rounds using the four K+W dwords stored at
# _XFER(%rsp), interleaved with the vector computation of the next four
# message schedule dwords:
#     W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
# On entry X0..X3 hold W[-16..-1]; the new dwords are written over X0 and
# the X symbols are rotated so they become X3 for the next invocation.
# The a..h symbols are rotated after every round.
# Clobbers: y0, y1, y2, XTMP0..XTMP5, flags.
#
# Notation used below:
#     S1(e) = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#     S0(a) = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#     CH    = ((f ^ g) & e) ^ g
#     MAJ   = ((a | c) & b) | (a & c)
#     s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
#     s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
.macro FOUR_ROUNDS_AND_SCHED
	## ---- round 0; vector side: W[-7] + W[-16], start of s0 ----
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e ror (25-11)
	mov	a, y1			# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR	(22-13), y1		# y1 = a ror (22-13)
	xor	e, y0			# y0 = e ^ (e ror (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e ror (11-6)) ^ (e ror (25-6))
	xor	a, y1			# y1 = a ^ (a ror (22-13))
	xor	g, y2			# y2 = f ^ g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and	e, y2			# y2 = (f ^ g) & e
	MY_ROR	(13-2), y1		# y1 = (a ror (13-2)) ^ (a ror (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	MY_ROR	6, y0			# y0 = S1(e)
	xor	g, y2			# y2 = CH
	MY_ROR	2, y1			# y1 = S0(a)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpsrld	$7, XTMP1, XTMP2	# XTMP2 = W[-15] >> 7
	or	c, y0			# y0 = a | c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a & c
	vpslld	$(32-7), XTMP1, XTMP3	# XTMP3 = W[-15] << (32-7)
	and	b, y0			# y0 = (a | c) & b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS

	## ---- round 1; vector side: finish s0, start of low s1 ----
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e ror (25-11)
	xor	e, y0			# y0 = e ^ (e ror (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(22-13), y1		# y1 = a ror (22-13)
	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
	xor	a, y1			# y1 = a ^ (a ror (22-13))
	MY_ROR	(11-6), y0		# y0 = (e ror (11-6)) ^ (e ror (25-6))
	xor	g, y2			# y2 = f ^ g
	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1		# y1 = (a ror (13-2)) ^ (a ror (22-2))
	xor	e, y0			# y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and	e, y2			# y2 = (f ^ g) & e
	MY_ROR	6, y0			# y0 = S1(e)
	vpslld	$(32-18), XTMP1, XTMP1	# XTMP1 = W[-15] << (32-18)
	xor	a, y1			# y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	xor	g, y2			# y2 = CH
	vpxor	XTMP1, XTMP3, XTMP3	# XTMP3 = (W[-15] ror 7) ^ (W[-15] << 14)
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR	2, y1			# y1 = S0(a)
	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = (W[-15] ror 7) ^ (W[-15] ror 18)
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or	c, y0			# y0 = a | c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a & c
	## compute low s1
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a | c) & b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS

	## ---- round 2; vector side: low s1, giving W[0] and W[1] ----
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e ror (25-11)
	xor	e, y0			# y0 = e ^ (e ror (25-11))
	MY_ROR	(22-13), y1		# y1 = a ror (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a ror (22-13))
	MY_ROR	(11-6), y0		# y0 = (e ror (11-6)) ^ (e ror (25-6))
	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor	g, y2			# y2 = f ^ g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and	e, y2			# y2 = (f ^ g) & e
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	MY_ROR	(13-2), y1		# y1 = (a ror (13-2)) ^ (a ror (22-2))
	xor	a, y1			# y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	xor	g, y2			# y2 = CH
	MY_ROR	6, y0			# y0 = S1(e)
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19)
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0(a)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a | c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a & c
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a | c) & b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS

	## ---- round 3; vector side: high s1, giving W[2] and W[3] ----
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e ror (25-11)
	mov	a, y1			# y1 = a
	MY_ROR	(22-13), y1		# y1 = a ror (22-13)
	xor	e, y0			# y0 = e ^ (e ror (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e ror (11-6)) ^ (e ror (25-6))
	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a ror (22-13))
	xor	g, y2			# y2 = f ^ g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	and	e, y2			# y2 = (f ^ g) & e
	MY_ROR	(13-2), y1		# y1 = (a ror (13-2)) ^ (a ror (22-2))
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	MY_ROR	6, y0			# y0 = S1(e)
	xor	g, y2			# y2 = CH
	vpxor	XTMP3, XTMP2, XTMP2	# XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19)
	MY_ROR	2, y1			# y1 = S0(a)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or	c, y0			# y0 = a | c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a & c
	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a | c) & b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm
# DO_ROUND round
# Perform one compression round without message scheduling.
# The K+W dword for the round is read from (_XFER + round*4)(%rsp).
# The a..h symbols are rotated on exit.  Clobbers: y0, y1, y2, flags.
#
#     S1(e) = (e ror 6) ^ (e ror 11) ^ (e ror 25)
#     S0(a) = (a ror 2) ^ (a ror 13) ^ (a ror 22)
#     CH    = ((f ^ g) & e) ^ g
#     MAJ   = ((a | c) & b) | (a & c)
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e ror (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e ror (25-11))
	MY_ROR	(22-13), y1		# y1 = a ror (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a ror (22-13))
	MY_ROR	(11-6), y0		# y0 = (e ror (11-6)) ^ (e ror (25-6))
	xor	g, y2			# y2 = f ^ g
	xor	e, y0			# y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
	MY_ROR	(13-2), y1		# y1 = (a ror (13-2)) ^ (a ror (22-2))
	and	e, y2			# y2 = (f ^ g) & e
	xor	a, y1			# y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
	MY_ROR	6, y0			# y0 = S1(e)
	xor	g, y2			# y2 = CH
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0(a)
	add	(\round * 4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a | c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a & c
	and	b, y0			# y0 = (a | c) & b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
## void sha256_transform_avx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
.text
# sha256_transform_avx(digest, input_data, num_blks)
#
# In:    CTX      (%rdi) = pointer to the eight 32-bit digest words
#        INP      (%rsi) = pointer to the input data, read 64 bytes per block
#                          with unaligned loads
#        NUM_BLKS (%rdx) = number of 64-byte blocks
# Out:   the digest words at CTX are updated in place, once per block.
#        When num_blks is zero the digest is left untouched.
# Saved: rbx, rbp, r12, r13, r14, r15 are pushed on entry and popped on exit.
# Stack: STACK_SIZE bytes below a 16-byte aligned %rsp; the original %rsp is
#        kept in %r12 for the whole function.
#
# All branch targets are .L-prefixed local labels so that they stay out of
# the object symbol table.
ENTRY(sha256_transform_avx)
.align 32
	pushq	%rbx
	pushq	%rbp
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%r12

	mov	%rsp, %r12		# remember the unaligned stack pointer
	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer (vmovdqa to _XFER)

	shl	$6, NUM_BLKS		# convert to bytes
	jz	.Ldone_hash		# nothing to hash
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)	# spill it; rdx is reused as e below

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

.Lloop0:				# once per 64-byte block
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)		# spill INP; rsi is reused as SRND

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
.Lloop1:
	vpaddd	(TBL), X0, XFER		# XFER = K + W for the next 4 rounds
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL		# advance to the next 16 constants
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

	## last 16 rounds: no scheduling, two passes of 8 rounds each
	mov	$2, SRND
.Lloop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X2, X0			# second pass consumes X2 and X3
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	## add the working variables into the digest
	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	mov	_INP(%rsp), INP		# reload the input pointer
	add	$64, INP		# next block
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

.Ldone_hash:
	mov	%r12, %rsp		# drop the aligned frame

	popq	%r12
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%rbp
	popq	%rbx
	ret
ENDPROC(sha256_transform_avx)
# Constant tables.  Nothing in this file writes to them, so they are placed
# in read-only data rather than in the writable .data section.
.section .rodata, "a", @progbits
.align 64
# K256: the 64 round constants, read 16 bytes at a time through TBL
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

# The three masks below are loaded with vmovdqa, which requires 16-byte
# alignment; state that explicitly instead of relying on the size of K256.
.align 16
# vpshufb control that reverses the byte order within each dword
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
# endif