/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

/*
 * Register allocation.
 *
 * r3/r4 carry the hash-state and input pointers on entry.  r5 is read once
 * by "mtctr r5" at function entry and is afterwards reused as rH0.
 * r14-r25 are non-volatile and are saved/restored by INITIALIZE/FINALIZE.
 * rW0-rW7, rT0 and rT1 are used with SPE (ev*) instructions and therefore
 * as full 64-bit registers; everything else is used as plain 32-bit GPRs.
 */
#define rHP	r3	/* pointer to hash values in memory		*/
#define rKP	r24	/* pointer to round constants			*/
#define rWP	r4	/* pointer to input data			*/

#define rH0	r5	/* 8 32 bit hash values in 8 registers		*/
#define rH1	r6
#define rH2	r7
#define rH3	r8
#define rH4	r9
#define rH5	r10
#define rH6	r11
#define rH7	r12

#define rW0	r14	/* 64 bit registers. 16 words in 8 registers	*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rT0	r22	/* 64 bit temporaries				*/
#define rT1	r23
#define rT2	r0	/* 32 bit temporaries				*/
#define rT3	r25

/*
 * Loop-control hook pasted into R_CALC_W as CMP_K##k##_LOOP.
 *
 *   k = N  expands to nothing (no compare).
 *   k = C  compares the GPR part of rT1 with zero (signed), setting CR0.
 *
 * At the point of expansion rT1 holds the round-constant pair that
 * R_CALC_W just loaded with evldw, before the message words are added.
 * The caller branches on CR0[gt], so the sign of a table entry doubles as
 * the "more rounds to go" flag and no separate round counter is needed.
 */
#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi		rT1,0;

/*
 * Function prologue: allocate a 128-byte stack frame and save every
 * non-volatile register this file uses.
 *
 * Frame layout (offsets from the new r1):
 *    8..87	r14-r23 as full 64-bit SPE registers (evstdw, 8 bytes each)
 *   88, 92	r24, r25 as ordinary 32-bit registers
 *
 * Must be paired with FINALIZE, which relies on the same layout.
 */
#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1); \
	evstdw		r18,40(r1); \
	evstdw		r19,48(r1); \
	evstdw		r20,56(r1); \
	evstdw		r21,64(r1); \
	evstdw		r22,72(r1); \
	evstdw		r23,80(r1); \
	stw		r24,88(r1);	/* save normal registers	*/ \
	stw		r25,92(r1);

/*
 * Function epilogue: restore the registers saved by INITIALIZE, scrub the
 * save area and release the 128-byte stack frame.
 *
 * After the reloads r0 is zeroed and stored over the first 32-bit word of
 * each of the ten 64-bit save slots (offsets 8, 16, ... 80).  Only that
 * word of every slot is cleared; the second word (offset + 4) and the
 * r24/r25 slots at 88/92 are left as they are, see the inline comment for
 * the rationale.  r0 is zero when the macro ends.
 */
#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1); \
	evldw		r16,24(r1); \
	evldw		r17,32(r1); \
	evldw		r18,40(r1); \
	evldw		r19,48(r1); \
	evldw		r20,56(r1); \
	evldw		r21,64(r1); \
	evldw		r22,72(r1); \
	evldw		r23,80(r1); \
	lwz		r24,88(r1);	/* restore normal registers	*/ \
	lwz		r25,92(r1); \
	xor		r0,r0,r0; \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* was already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1); \
	stw		r0,72(r1); \
	stw		r0,80(r1); \
	addi		r1,r1,128;	/* cleanup stack frame		*/

/*
 * LOAD_DATA(reg, off) - fetch one 32-bit message word into reg.
 * NEXT_BLOCK          - advance rWP to the next 64-byte input block.
 *
 * Big endian:  plain load from off(rWP); rWP stays fixed while a block is
 *              processed and NEXT_BLOCK adds 64 afterwards.
 * Otherwise:   byte-reversed load (lwbrx) from rWP itself - "off" is
 *              ignored - followed by rWP += 4.  A block is read with 16
 *              LOAD_DATA expansions (8 x R_LOAD_W), so rWP has already
 *              moved on by 64 bytes and NEXT_BLOCK is empty.
 */
#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif

/*
 * R_LOAD_W - two SHA-256 rounds that take their message words straight
 * from the input buffer (used for the first 16 rounds of a block).
 *
 *   a..h	registers holding the eight working variables for the first
 *		of the two rounds.  No values are rotated between rounds;
 *		the second round simply uses the same registers shifted by
 *		one position (h acts as a, a as b, ... g as h), and the
 *		caller rotates the argument list by two for the next use.
 *   w		64-bit register receiving both message words: the first
 *		word is loaded, copied to the upper half with evmergelo,
 *		then the second word is loaded into the lower half.
 *   off	byte offset of the first word; used for the round constant
 *		at off(rKP) / off+4(rKP) and passed on to LOAD_DATA.
 *
 * The "1:" / "2:" prefixes in the inline comments name the round an
 * instruction belongs to; the formulas use the logical SHA-256 variable
 * names, not the macro parameter names.  Instructions of both rounds are
 * interleaved on purpose - do not reorder.
 *
 * Clobbers rT0-rT3.  The last line has no trailing ';' - the macro is
 * meant to end the source line it is used on.
 */
#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off)		/* 1: W				*/ \
	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6		*/ \
	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11		*/ \
	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1'		*/ \
	and		rT3,e,f;	/* 1: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1"		*/ \
	andc		rT1,g,e;	/* 1: ch' = ~e and g		*/ \
	lwz		rT2,off(rKP);	/* 1: K				*/ \
	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch'		*/ \
	add		h,h,rT0;	/* 1: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 1: temp1' = ch + w		*/ \
	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2		*/ \
	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13		*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + K		*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0'		*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelo	w,w,w;		/*    shift W			*/ \
	or		rT2,a,b;	/* 1: maj = a or b		*/ \
	and		rT1,a,b;	/* 1: maj' = a and b		*/ \
	and		rT2,rT2,c;	/* 1: maj = maj and c		*/ \
	LOAD_DATA(w, off+4)		/* 2: W				*/ \
	or		rT2,rT1,rT2;	/* 1: maj = maj or maj'		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	add		h,h,rT3;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	lwz		rT2,off+4(rKP);	/* 2: K				*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 2: temp1' = ch + w		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	add		g,g,rT2;	/* 2: temp1 = temp1 + K		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

/*
 * R_CALC_W - two SHA-256 rounds whose message words are derived from the
 * schedule kept in rW0-rW7 (used for rounds 16 and up).
 *
 *   a..h	working-variable registers, same convention as R_LOAD_W:
 *		the second round uses them shifted by one position.
 *   w0		register pair holding w[-16]/w[-15]; overwritten with the
 *		two new schedule words.
 *   w1, w4, w5, w7
 *		neighbouring schedule registers: w[-15] is merged from
 *		w0/w1, w[-7] from w4/w5, and w7 supplies w[-2].
 *   k		N or C; pasted into CMP_K##k##_LOOP.  C additionally emits
 *		a compare on the constant pair just loaded into rT1 so the
 *		caller can branch on CR0 afterwards.
 *   off	byte offset of the 64-bit round-constant pair at off(rKP).
 *
 * The schedule update works on both 32-bit halves at once with SPE (ev*)
 * instructions and is interleaved with the scalar round arithmetic.
 * Comments without a "1:"/"2:" prefix belong to the schedule update.
 * SPE only rotates left, so "rotr n" is coded as evrlwi by 32 - n; the
 * second s0 rotate starts from the already rotated value (25 + 21 = 46,
 * i.e. rotl 14 = rotr 18).  maj is computed as
 * (a and b) + ((a xor b) and c), added to temp2 in two steps.
 *
 * Clobbers rT0-rT3, plus CR0 when k = C.  Instruction order is part of
 * the optimisation - do not reorder.  The last line has no trailing ';'.
 */
#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6		*/ \
	evmergelohi	rT0,w0,w1;	/*    w[-15]			*/ \
	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11		*/ \
	evsrwiu		rT1,rT0,3;	/*    s0 = w[-15] >> 3		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
	evrlwi		rT0,rT0,25;	/*    s0' = w[-15] rotr 7	*/ \
	rotrwi		rT3,e,25;	/* 1: S1" = e rotr 25		*/ \
	evxor		rT1,rT1,rT0;	/*    s0 = s0 xor s0'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1"		*/ \
	evrlwi		rT0,rT0,21;	/*    s0' = w[-15] rotr 18	*/ \
	add		h,h,rT2;	/* 1: temp1 = h + S1		*/ \
	evxor		rT0,rT0,rT1;	/*    s0 = s0 xor s0'		*/ \
	and		rT2,e,f;	/* 1: ch = e and f		*/ \
	evaddw		w0,w0,rT0;	/*    w = w[-16] + s0		*/ \
	andc		rT3,g,e;	/* 1: ch' = ~e and g		*/ \
	evsrwiu		rT0,w7,10;	/*    s1 = w[-2] >> 10		*/ \
	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch'		*/ \
	evrlwi		rT1,w7,15;	/*    s1' = w[-2] rotr 17	*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + ch	*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2		*/ \
	evrlwi		rT1,w7,13;	/*    s1' = w[-2] rotr 19	*/ \
	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13		*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
	evldw		rT1,off(rKP);	/*    k				*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + s1		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelohi	rT0,w4,w5;	/*    w[-7]			*/ \
	and		rT3,a,b;	/* 1: maj = a and b		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + w[-7]		*/ \
	CMP_K##k##_LOOP \
	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj		*/ \
	evaddw		rT1,rT1,w0;	/*    wk = w + k		*/ \
	xor		rT3,a,b;	/* 1: maj = a xor b		*/ \
	evmergehi	rT0,rT1,rT1;	/*    wk1/wk2			*/ \
	and		rT3,rT3,c;	/* 1: maj = maj and c		*/ \
	add		h,h,rT0;	/* 1: temp1 = temp1 + wk	*/ \
	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj	*/ \
	add		g,g,rT1;	/* 2: temp1 = temp1 + wk	*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		h,h,rT2;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + ch	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

/*
 * ppc_spe_sha256_transform - SHA-256 compression of consecutive blocks
 *
 * In:	r3 (rHP)  pointer to the eight 32-bit hash words; read on entry and
 *		  rewritten after every block
 *	r4 (rWP)  pointer to the input data, 64 bytes per block
 *	r5	  number of blocks.  It is moved to CTR before r5 is reused
 *		  as rH0.  Must not be zero: the block loop is closed by
 *		  bdnz, which decrements CTR before testing it.
 *
 * Out:	hash words at (rHP) updated; rWP advanced past the consumed input.
 *
 * Clobbers r0, r5-r12, CTR and CR0.  r14-r25 are preserved through
 * INITIALIZE/FINALIZE.
 *
 * Per block: 16 rounds via R_LOAD_W (message words come from the input),
 * then passes of 16 rounds via R_CALC_W until the compare emitted by the
 * last R_CALC_W of a pass (k = C) stops reporting "greater than".
 */
_GLOBAL(ppc_spe_sha256_transform)
	INITIALIZE

	mtctr		r5		/* block count; r5 becomes rH0 below */
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	lwz		rH2,8(rHP)
	lwz		rH3,12(rHP)
	lwz		rH4,16(rHP)
	lwz		rH5,20(rHP)
	lwz		rH6,24(rHP)
	lwz		rH7,28(rHP)

ppc_spe_sha256_main:
	/* (re)start at the beginning of the round-constant table */
	lis		rKP,PPC_SPE_SHA256_K@ha
	addi		rKP,rKP,PPC_SPE_SHA256_K@l

	/* rounds 0-15; the argument list rotates by two per invocation */
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)

ppc_spe_sha256_16_rounds:
	addi		rKP,rKP,64	/* next 16 round constants */

	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	/* k = C: this invocation also sets CR0 for the branch below */
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt		gt,ppc_spe_sha256_16_rounds

	/*
	 * The message schedule is no longer needed: reuse rW0-rW7 to fetch
	 * the previous hash words, add the working variables and store the
	 * new hash.
	 */
	lwz		rW0,0(rHP)
	NEXT_BLOCK
	lwz		rW1,4(rHP)
	lwz		rW2,8(rHP)
	lwz		rW3,12(rHP)
	lwz		rW4,16(rHP)
	lwz		rW5,20(rHP)
	lwz		rW6,24(rHP)
	lwz		rW7,28(rHP)

	add		rH0,rH0,rW0
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)
	add		rH5,rH5,rW5
	stw		rH5,20(rHP)
	add		rH6,rH6,rW6
	stw		rH6,24(rHP)
	add		rH7,rH7,rW7
	stw		rH7,28(rHP)

	bdnz		ppc_spe_sha256_main	/* next block while --CTR != 0 */

	FINALIZE
	blr

/*
 * SHA-256 round constants K[0..63], four per line.
 *
 * The table is only ever read (lwz / evldw through rKP), so it is placed
 * in .rodata rather than in writable .data.
 *
 * R_CALC_W fetches the constants in 64-bit pairs and uses the sign of an
 * entry as loop control (see CMP_KC_LOOP): the last entries of rows 8 and
 * 12 below (0x14292967, 0x106aa070) are positive as signed 32-bit values,
 * the very last entry (0xc67178f2) is negative.  Keep the table contiguous
 * and in this order.
 */
.section .rodata
.align	5
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2