2019-11-08 13:22:31 +01:00
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */
# include < l i n u x / l i n k a g e . h >
/*
 * IV: 32 bytes of constants, 32-byte aligned, in a mergeable read-only
 * section.  Laid out as eight little-endian 32-bit words; the compression
 * routines load words 0-3 from IV and words 4-7 from IV+16.
 */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.long	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
	.long	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
/*
 * ROT16: pshufb control mask.  Within each 4-byte lane, destination bytes
 * 0,1,2,3 take source bytes 2,3,0,1, i.e. every 32-bit word is rotated by
 * 16 bits.
 */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.byte	 2,  3,  0,  1,  6,  7,  4,  5
	.byte	10, 11,  8,  9, 14, 15, 12, 13
/*
 * ROR328: pshufb control mask.  Within each 4-byte lane, destination bytes
 * 0,1,2,3 take source bytes 1,2,3,0, i.e. every 32-bit word is rotated
 * right by 8 bits.
 */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.byte	 1,  2,  3,  0,  5,  6,  7,  4
	.byte	 9, 10, 11,  8, 13, 14, 15, 12
/*
 * SIGMA: message-word schedule for the SSSE3 routine, 10 rows of 16 bytes
 * (160 bytes total, 64-byte aligned).  One row is consumed per round; each
 * byte is used as a 32-bit word index into the current 64-byte input block.
 * Within a row, bytes 0-3, 4-7, 8-11 and 12-15 feed the four successive
 * message additions of the round, in that order.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
	.byte	 0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13	/* round 0 */
	.byte	14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7	/* round 1 */
	.byte	11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1	/* round 2 */
	.byte	 7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0	/* round 3 */
	.byte	 9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8	/* round 4 */
	.byte	 2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14	/* round 5 */
	.byte	12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2	/* round 6 */
	.byte	13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6	/* round 7 */
	.byte	 6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4	/* round 8 */
	.byte	10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12	/* round 9 */
#ifdef CONFIG_AS_AVX512
/*
 * SIGMA2: message-word schedule for the AVX-512 routine, 10 rows of sixteen
 * 32-bit indices (640 bytes total, 64-byte aligned).  Each row is loaded as
 * two 256-bit index vectors for vpermi2d.  Because that routine keeps the
 * permuted message in place between rounds, every row is expressed relative
 * to the word order left by the previous round, not relative to the
 * original block.
 */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
	.long	 0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13	/* round 0 */
	.long	 8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7	/* round 1 */
	.long	11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9	/* round 2 */
	.long	11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5	/* round 3 */
	.long	 4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12	/* round 4 */
	.long	 2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9	/* round 5 */
	.long	 4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0	/* round 6 */
	.long	 6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10	/* round 7 */
	.long	15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14	/* round 8 */
	.long	 8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9	/* round 9 */
#endif /* CONFIG_AS_AVX512 */
.text
2019-11-26 10:42:40 -08:00
/*
 * blake2s_compress_ssse3 - BLAKE2s compression of whole 64-byte blocks
 * using SSE2/SSSE3 instructions.
 *
 * Register use on entry, as read by the code below.  The C prototype is not
 * visible in this file; the argument meanings are inferred, so confirm them
 * against the caller.
 *   %rdi  base of a 48-byte area.  Bytes 0-31 are loaded as two 128-bit
 *         rows and stored back on exit; bytes 32-47 are loaded, bumped once
 *         per block (see %rcx) and stored back on exit.
 *   %rsi  input data, consumed 64 bytes per block.
 *   %rdx  number of blocks.  Zero returns at once without touching memory.
 *   %rcx  64-bit increment, added to the low quadword of bytes 32-47 before
 *         each block is mixed.
 *
 * Working registers:
 *   %xmm0-%xmm3    the four 4x32-bit rows a, b, c, d being mixed
 *   %xmm4-%xmm7    gathered message words
 *   %xmm8          scratch for the shift-based rotates
 *   %xmm10/%xmm11  rows a/b as they were at the start of the current block
 *   %xmm12/%xmm13  ROT16 / ROR328 shuffle masks
 *   %xmm14/%xmm15  running bytes 32-47 / the increment
 *   %rcx           cursor into SIGMA, %r8 = one past its last row
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8, %xmm0-%xmm8, %xmm10-%xmm15, flags.
 */
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lblake2s_compress_ssse3_done
	movdqu		(%rdi),%xmm0
	movdqu		16(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		32(%rdi),%xmm14
	movq		%rcx,%xmm15		/* upper quadword of %xmm15 is zeroed */
	leaq		SIGMA+160(%rip),%r8	/* end of the 10 x 16-byte table */
	jmp		.Lblake2s_compress_ssse3_mainloop	/* skip the alignment padding */
	.align		32
.Lblake2s_compress_ssse3_mainloop:
	/* Per-block setup: remember a/b, advance the counter, build rows c/d. */
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+16(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
.Lblake2s_compress_ssse3_roundloop:
	/* Gather message words selected by SIGMA bytes 0-3 into %xmm4. */
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	/* a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12) */
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1
	pslld		$20,%xmm8
	por		%xmm8,%xmm1

	/* Gather message words selected by SIGMA bytes 4-7 into %xmm5. */
	movzbl		4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	/* a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7) */
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1
	pslld		$25,%xmm8
	por		%xmm8,%xmm1

	/* Rotate the lanes of rows a, d and c for the second half-round. */
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2

	/* Gather message words selected by SIGMA bytes 8-11 into %xmm6. */
	movzbl		8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		10(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		11(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	/* a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12) */
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$12,%xmm1
	pslld		$20,%xmm8
	por		%xmm8,%xmm1

	/* Gather message words selected by SIGMA bytes 12-15 into %xmm7. */
	movzbl		12(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		13(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		14(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		15(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	/* a += m + b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7) */
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$7,%xmm1
	pslld		$25,%xmm8
	por		%xmm8,%xmm1

	/* Undo the lane rotation applied before the second half-round. */
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2

	/* Next SIGMA row; stop after the last one. */
	addq		$16,%rcx
	cmpq		%r8,%rcx
	jne		.Lblake2s_compress_ssse3_roundloop

	/* Fold rows c/d and the saved a/b back into a/b. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1

	/* Advance to the next 64-byte block. */
	addq		$64,%rsi
	decq		%rdx
	jne		.Lblake2s_compress_ssse3_mainloop

	/* Write rows a/b and the advanced bytes 32-47 back through %rdi. */
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,16(%rdi)
	movdqu		%xmm14,32(%rdi)
.Lblake2s_compress_ssse3_done:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
2019-11-08 13:22:31 +01:00
#ifdef CONFIG_AS_AVX512
/*
 * blake2s_compress_avx512 - BLAKE2s compression of whole 64-byte blocks
 * using AVX-512 (vpermi2d, vprord) on 128/256-bit registers.
 *
 * Register use on entry, as read by the code below.  The C prototype is not
 * visible in this file; the argument meanings are inferred, so confirm them
 * against the caller.
 *   %rdi  base of a 48-byte area.  Bytes 0-31 are loaded as two 128-bit
 *         rows and stored back on exit; bytes 32-47 are loaded, bumped once
 *         per block (see %rcx) and stored back on exit.
 *   %rsi  input data, consumed 64 bytes per block.
 *   %rdx  number of blocks.  Zero returns at once without touching memory,
 *         matching blake2s_compress_ssse3.
 *   %rcx  64-bit increment, added to the low quadword of bytes 32-47 before
 *         each block is mixed.
 *
 * Working registers:
 *   %xmm0-%xmm3    the four 4x32-bit rows a, b, c, d being mixed
 *   %xmm4/%xmm5    running bytes 32-47 / the increment
 *   %ymm6/%ymm7    the 16 message words, re-permuted in place every round
 *   %ymm8/%ymm9    SIGMA2 indices, then the permuted message for this round
 *   %xmm10/%xmm11  rows a/b as they were at the start of the current block
 *   %xmm14/%xmm15  IV words 0-3 / 4-7
 *   %rax           cursor into SIGMA2, %cl = rounds remaining
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %xmm0-%xmm11 (including the upper
 * halves of %ymm6-%ymm9), %xmm14, %xmm15, flags.
 */
SYM_FUNC_START(blake2s_compress_avx512)
	/*
	 * Without this guard a zero block count would still mix one block and
	 * then wrap %rdx in the decq below, so the loop would never end.
	 */
	testq		%rdx,%rdx
	je		.Lblake2s_compress_avx512_done
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5		/* upper quadword of %xmm5 is zeroed */
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop	/* skip the alignment padding */
.align 32
.Lblake2s_compress_avx512_mainloop:
	/* Per-block setup: remember a/b, advance the counter, build rows c/d. */
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	/* Load the whole 64-byte block and step %rsi past it. */
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl		/* 10 rounds; %rcx was consumed into %xmm5 above */
.Lblake2s_compress_avx512_roundloop:
	/*
	 * Fetch this round's 16 indices and permute the message with them.
	 * The result replaces %ymm6/%ymm7, so the next SIGMA2 row indexes
	 * this round's ordering.
	 */
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7

	/* a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12) */
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1

	/* Upper four words of %ymm8, then the 8/7 rotation step. */
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1

	/* Rotate the lanes of rows a, d and c for the second half-round. */
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2

	/* a += m + b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12) */
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1

	/* Upper four words of %ymm9, then the 8/7 rotation step. */
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1

	/* Undo the lane rotation applied before the second half-round. */
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2

	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop

	/* Fold the saved a/b and rows c/d back into a/b. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1

	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop

	/* Write rows a/b and the advanced bytes 32-47 back through %rdi. */
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	/* %ymm6-%ymm9 were written above; clear upper halves before returning. */
	vzeroupper
.Lblake2s_compress_avx512_done:
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */