2013-03-26 13:59:10 -07:00
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# This code schedules 2 blocks at a time, with 4 lanes per block
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# ifdef C O N F I G _ A S _ A V X 2
# include < l i n u x / l i n k a g e . h >
## Input buffers carry no alignment guarantee, so every load from the
## caller data goes through the unaligned move form.
#define	VMOVDQ vmovdqu
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # Define M a c r o s
# addm [mem], reg
# Accumulate the memory operand into reg, then write the sum back, so that
# both the memory location and the register end up holding mem + reg.
# Arithmetic flags are modified by the add.
.macro addm p1 p2
	add	\p1, \p2		# reg = reg + [mem]
	mov	\p2, \p1		# [mem] = reg
.endm
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
## ---------------------------------------------------------------------
## Vector register roles
## ---------------------------------------------------------------------

# Message schedule words; rotate_Xs renames these after every group of
# four scheduled rounds.
X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

# Vector scratch registers
XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9			# holds K + W before it is spilled to the stack
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

## ---------------------------------------------------------------------
## General purpose register roles
##
## Several names alias the argument registers.  The arguments are spilled
## to the stack frame (or consumed) before the aliased name is written.
## ---------------------------------------------------------------------

NUM_BLKS = %rdx			# 3rd arg
INP	= %rsi			# 2nd arg
CTX	= %rdi			# 1st arg
c	= %ecx
d	= %r8d
e	= %edx			# clobbers NUM_BLKS
y3	= %esi			# clobbers INP

TBL	= %rbp			# base address of the K256 table
SRND	= CTX			# SRND is same register as CTX

a	= %eax
b	= %ebx
f	= %r9d
g	= %r10d
h	= %r11d
old_h	= %r11d			# updated by ROTATE_ARGS to name the previous h

T1	= %r12d
y0	= %r13d
y1	= %r14d
y2	= %r15d

## ---------------------------------------------------------------------
## Stack frame layout (offsets from the 32-byte aligned %rsp)
## ---------------------------------------------------------------------

_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE
# rotate_Xs
# Rename the schedule registers by one position: the name X0 moves to the
# register previously called X1, X1 to X2, X2 to X3, and X3 takes over the
# register previously called X0.  Only assembler symbols change; no
# instructions are emitted.
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm
# ROTATE_ARGS
# Rename the eight working variables for the next round: each name takes
# over the register of its predecessor (b gets the old a, c the old b, and
# so on) and a takes over the old h.  old_h keeps naming the register that
# held h before the rotation.  Only assembler symbols change; no
# instructions are emitted.
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
# FOUR_ROUNDS_AND_SCHED disp
# Run four SHA-256 rounds on the scalar state a..h while the vector unit
# computes the next four message schedule words into X0.  The K + W values
# for the four rounds are read from disp(%rsp, SRND) onward, one dword per
# round.  Scalar and vector instructions are interleaved on purpose; the
# statement order must be preserved.
# Scratch: y0, y1, y2, y3, T1, XTMP0-XTMP5.  Ends with rotate_Xs.
.macro FOUR_ROUNDS_AND_SCHED disp
## ------------------------------ round N + 0 ------------------------------

	mov	a, y3			# MAJA: y3 = a
	rorx	$25, e, y0		# S1A:  y0 = e ror 25
	rorx	$11, e, y1		# S1B:  y1 = e ror 11

	addl	\disp(%rsp, SRND), h	# h = k + w + h
	or	c, y3			# MAJA: y3 = a|c
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2			# CH:   y2 = f
	rorx	$13, a, T1		# S0B:  T1 = a ror 13

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11)
	xor	g, y2			# CH:   y2 = f^g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1		# S1:   y1 = e ror 6

	and	e, y2			# CH:   y2 = (f^g)&e
	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
	rorx	$22, a, y1		# S0A:  y1 = a ror 22
	add	h, d			# d = k + w + h + d

	and	b, y3			# MAJA: y3 = (a|c)&b
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# S0:   T1 = a ror 2

	xor	g, y2			# CH:   y2 = ((f^g)&e)^g
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
	mov	a, T1			# MAJB: T1 = a
	and	c, T1			# MAJB: T1 = a&c

	add	y0, y2			# y2 = S1 + CH
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3			# MAJ:  y3 = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h			# h = t1 + S0 + MAJ

	ROTATE_ARGS

## ------------------------------ round N + 1 ------------------------------

	mov	a, y3			# MAJA: y3 = a
	rorx	$25, e, y0		# S1A:  y0 = e ror 25
	rorx	$11, e, y1		# S1B:  y1 = e ror 11
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h
	or	c, y3			# MAJA: y3 = a|c

	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2			# CH:   y2 = f
	rorx	$13, a, T1		# S0B:  T1 = a ror 13
	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11)
	xor	g, y2			# CH:   y2 = f^g

	rorx	$6, e, y1		# S1:   y1 = e ror 6
	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
	rorx	$22, a, y1		# S0A:  y1 = a ror 22
	and	e, y2			# CH:   y2 = (f^g)&e
	add	h, d			# d = k + w + h + d

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3			# MAJA: y3 = (a|c)&b
	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13)

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1		# S0:   T1 = a ror 2
	xor	g, y2			# CH:   y2 = ((f^g)&e)^g

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
	mov	a, T1			# MAJB: T1 = a
	and	c, T1			# MAJB: T1 = a&c
	add	y0, y2			# y2 = S1 + CH

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3			# MAJ:  y3 = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h			# h = t1 + S0 + MAJ

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}

	ROTATE_ARGS

## ------------------------------ round N + 2 ------------------------------

	mov	a, y3			# MAJA: y3 = a
	rorx	$25, e, y0		# S1A:  y0 = e ror 25
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1		# S1B:  y1 = e ror 11
	or	c, y3			# MAJA: y3 = a|c
	mov	f, y2			# CH:   y2 = f
	xor	g, y2			# CH:   y2 = f^g

	rorx	$13, a, T1		# S0B:  T1 = a ror 13
	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11)
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2			# CH:   y2 = (f^g)&e

	rorx	$6, e, y1		# S1:   y1 = e ror 6
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d			# d = k + w + h + d
	and	b, y3			# MAJA: y3 = (a|c)&b

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
	rorx	$22, a, y1		# S0A:  y1 = a ror 22
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2			# CH:   y2 = ((f^g)&e)^g

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# S0:   T1 = a ror 2
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
	mov	a, T1			# MAJB: T1 = a
	and	c, T1			# MAJB: T1 = a&c
	add	y0, y2			# y2 = S1 + CH
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}

	or	T1, y3			# MAJ:  y3 = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0

	add	y3, h			# h = t1 + S0 + MAJ

	ROTATE_ARGS

## ------------------------------ round N + 3 ------------------------------

	mov	a, y3			# MAJA: y3 = a
	rorx	$25, e, y0		# S1A:  y0 = e ror 25
	rorx	$11, e, y1		# S1B:  y1 = e ror 11
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h
	or	c, y3			# MAJA: y3 = a|c

	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2			# CH:   y2 = f
	rorx	$13, a, T1		# S0B:  T1 = a ror 13
	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11)
	xor	g, y2			# CH:   y2 = f^g

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1		# S1:   y1 = e ror 6
	and	e, y2			# CH:   y2 = (f^g)&e
	add	h, d			# d = k + w + h + d
	and	b, y3			# MAJA: y3 = (a|c)&b

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
	xor	g, y2			# CH:   y2 = ((f^g)&e)^g

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1		# S0A:  y1 = a ror 22
	add	y0, y2			# y2 = S1 + CH

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13)
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	rorx	$2, a, T1		# S0:   T1 = a ror 2
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
	mov	a, T1			# MAJB: T1 = a
	and	c, T1			# MAJB: T1 = a&c
	or	T1, y3			# MAJ:  y3 = ((a|c)&b)|(a&c)

	add	y1, h			# h = k + w + h + S0
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h			# h = t1 + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm
# DO_4ROUNDS disp
# Run four SHA-256 rounds on the scalar state a..h with no message
# scheduling.  The K + W values are read from disp(%rsp, SRND) onward, one
# dword per round.
# In rounds N+0 to N+2 the last two additions into h are deferred: they
# are applied through old_h at the start of the following round, after
# ROTATE_ARGS has renamed the registers.  Round N+3 completes h itself.
# Scratch: y0, y1, y2, y3, T1.
.macro DO_4ROUNDS disp
## ------------------------------ round N + 0 ------------------------------

	mov	f, y2			# CH:   y2 = f
	rorx	$25, e, y0		# S1A:  y0 = e ror 25
	rorx	$11, e, y1		# S1B:  y1 = e ror 11
	xor	g, y2			# CH:   y2 = f^g

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11)
	rorx	$6, e, y1		# S1:   y1 = e ror 6
	and	e, y2			# CH:   y2 = (f^g)&e

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
	rorx	$13, a, T1		# S0B:  T1 = a ror 13
	xor	g, y2			# CH:   y2 = ((f^g)&e)^g
	rorx	$22, a, y1		# S0A:  y1 = a ror 22
	mov	a, y3			# MAJA: y3 = a

	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# S0:   T1 = a ror 2
	addl	\disp(%rsp, SRND), h	# h = k + w + h
	or	c, y3			# MAJA: y3 = a|c

	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
	mov	a, T1			# MAJB: T1 = a
	and	b, y3			# MAJA: y3 = (a|c)&b
	and	c, T1			# MAJB: T1 = a&c
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d = k + w + h + d
	or	T1, y3			# MAJ:  y3 = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	ROTATE_ARGS

## ------------------------------ round N + 1 ------------------------------

	add	y2, old_h		# deferred: old h = k + w + h + S0 + S1 + CH
	mov	f, y2			# CH:   y2 = f
	rorx	$25, e, y0		# S1A:  y0 = e ror 25
	rorx	$11, e, y1		# S1B:  y1 = e ror 11
	xor	g, y2			# CH:   y2 = f^g

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11)
	rorx	$6, e, y1		# S1:   y1 = e ror 6
	and	e, y2			# CH:   y2 = (f^g)&e
	add	y3, old_h		# deferred: old h = t1 + S0 + MAJ

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
	rorx	$13, a, T1		# S0B:  T1 = a ror 13
	xor	g, y2			# CH:   y2 = ((f^g)&e)^g
	rorx	$22, a, y1		# S0A:  y1 = a ror 22
	mov	a, y3			# MAJA: y3 = a

	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# S0:   T1 = a ror 2
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h
	or	c, y3			# MAJA: y3 = a|c

	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
	mov	a, T1			# MAJB: T1 = a
	and	b, y3			# MAJA: y3 = (a|c)&b
	and	c, T1			# MAJB: T1 = a&c
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d = k + w + h + d
	or	T1, y3			# MAJ:  y3 = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	ROTATE_ARGS

## ------------------------------ round N + 2 ------------------------------

	add	y2, old_h		# deferred: old h = k + w + h + S0 + S1 + CH
	mov	f, y2			# CH:   y2 = f
	rorx	$25, e, y0		# S1A:  y0 = e ror 25
	rorx	$11, e, y1		# S1B:  y1 = e ror 11
	xor	g, y2			# CH:   y2 = f^g

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11)
	rorx	$6, e, y1		# S1:   y1 = e ror 6
	and	e, y2			# CH:   y2 = (f^g)&e
	add	y3, old_h		# deferred: old h = t1 + S0 + MAJ

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
	rorx	$13, a, T1		# S0B:  T1 = a ror 13
	xor	g, y2			# CH:   y2 = ((f^g)&e)^g
	rorx	$22, a, y1		# S0A:  y1 = a ror 22
	mov	a, y3			# MAJA: y3 = a

	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# S0:   T1 = a ror 2
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h
	or	c, y3			# MAJA: y3 = a|c

	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
	mov	a, T1			# MAJB: T1 = a
	and	b, y3			# MAJA: y3 = (a|c)&b
	and	c, T1			# MAJB: T1 = a&c
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d = k + w + h + d
	or	T1, y3			# MAJ:  y3 = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	ROTATE_ARGS

## ------------------------------ round N + 3 ------------------------------

	add	y2, old_h		# deferred: old h = k + w + h + S0 + S1 + CH
	mov	f, y2			# CH:   y2 = f
	rorx	$25, e, y0		# S1A:  y0 = e ror 25
	rorx	$11, e, y1		# S1B:  y1 = e ror 11
	xor	g, y2			# CH:   y2 = f^g

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11)
	rorx	$6, e, y1		# S1:   y1 = e ror 6
	and	e, y2			# CH:   y2 = (f^g)&e
	add	y3, old_h		# deferred: old h = t1 + S0 + MAJ

	xor	y1, y0			# S1:   y0 = (e ror 25) ^ (e ror 11) ^ (e ror 6)
	rorx	$13, a, T1		# S0B:  T1 = a ror 13
	xor	g, y2			# CH:   y2 = ((f^g)&e)^g
	rorx	$22, a, y1		# S0A:  y1 = a ror 22
	mov	a, y3			# MAJA: y3 = a

	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13)
	rorx	$2, a, T1		# S0:   T1 = a ror 2
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h
	or	c, y3			# MAJA: y3 = a|c

	xor	T1, y1			# S0:   y1 = (a ror 22) ^ (a ror 13) ^ (a ror 2)
	mov	a, T1			# MAJB: T1 = a
	and	b, y3			# MAJA: y3 = (a|c)&b
	and	c, T1			# MAJB: T1 = a&c
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d = k + w + h + d
	or	T1, y3			# MAJ:  y3 = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0

	add	y3, h			# h = t1 + S0 + MAJ

	ROTATE_ARGS

.endm
########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest      (CTX, %rdi)
## arg 2 : pointer to input data  (INP, %rsi)
## arg 3 : Num blocks             (NUM_BLKS, %rdx)
##
## Updates digest[0..7] in place with num_blks consecutive 64-byte blocks
## read from input_data.  Returns immediately when num_blks is zero.
##
## Blocks are consumed two at a time: K + W for both blocks is computed
## once and spilled to the stack, the first block runs its rounds while
## the schedule is produced, and the second block then replays the
## spilled values.  A trailing single block is loaded through the XMM
## path and shares the same round code.
##
## Saves and restores rbx, rbp, r12-r15.  Builds a 32-byte aligned frame
## of STACK_SIZE bytes; the entry %rsp is kept at _RSP in that frame.
## NOTE(review): %rbp is used as a general purpose register (TBL), which
## may not suit builds that rely on frame pointers - confirm.
########################################################################
	.text
ENTRY(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax		# remember the entry stack pointer
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp		# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS		# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e		# e aliases NUM_BLKS, already saved above
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)		# SRND reuses the CTX register below

loop0:
	lea     K256(%rip), TBL

	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	## (low 128 bits: first block, high 128 bits: second block)
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)		# y3 aliases INP and is clobbered by the rounds

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	## X0 names a different register after each macro (rotate_Xs)
	vpaddd	1*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	2*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	3*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	0*32(TBL, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32
	vpaddd	1*32(TBL, SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	## fold the first block into the digest
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash		# that was the final (single) block

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	## fold the second block into the digest
	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0			# at least two blocks remain
	ja	done_hash		# nothing remains
					# equal: exactly one block remains

do_last_block:
	#### do last block
	lea	K256(%rip), TBL

	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp	# drop the aligned frame

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbp
	popq	%rbx
	## This routine writes ymm registers; clear their upper halves so
	## that SSE code run by the caller is not slowed down by dirty
	## upper state.
	vzeroupper
	ret
ENDPROC(sha256_transform_rorx)
## Constant tables.  They are only ever read, so they live in a read-only
## section rather than in writable data.
.section	.rodata
.align 64
## SHA-256 round constants.  Every group of four constants is stored
## twice in a row so that a single 32-byte load yields the same four
## values in both 128-bit lanes, one lane per block being processed.
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

## The three masks below are fetched with aligned 32-byte loads, so their
## alignment is stated explicitly instead of depending on the size of the
## table that precedes them.
.align 32
## byte-reverses each dword (big-endian input to native order)
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
# endif