62d9e47506
Improve overall performance of chacha20 encrypt and decrypt operations for Power10 or later CPU. Signed-off-by: Danny Tsen <dtsen@linux.ibm.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
843 lines
16 KiB
ArmAsm
843 lines
16 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
#
|
|
# Accelerated chacha20 implementation for ppc64le.
|
|
#
|
|
# Copyright 2023- IBM Corp. All rights reserved
|
|
#
|
|
#===================================================================================
|
|
# Written by Danny Tsen <dtsen@us.ibm.com>
|
|
#
|
|
# chacha_p10le_8x(u32 *state, byte *dst, const byte *src,
|
|
# size_t len, int nrounds);
|
|
#
|
|
# do rounds, 8 quarter rounds
|
|
# 1. a += b; d ^= a; d <<<= 16;
|
|
# 2. c += d; b ^= c; b <<<= 12;
|
|
# 3. a += b; d ^= a; d <<<= 8;
|
|
# 4. c += d; b ^= c; b <<<= 7
|
|
#
|
|
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16
|
|
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12
|
|
# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8
|
|
# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7
|
|
#
|
|
# 4 blocks (a b c d)
|
|
#
|
|
# a0 b0 c0 d0
|
|
# a1 b1 c1 d1
|
|
# ...
|
|
# a4 b4 c4 d4
|
|
# ...
|
|
# a8 b8 c8 d8
|
|
# ...
|
|
# a12 b12 c12 d12
|
|
# a13 ...
|
|
# a14 ...
|
|
# a15 b15 c15 d15
|
|
#
|
|
# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
|
|
# Diagonal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
|
|
#
|
|
|
|
#include <asm/ppc_asm.h>
|
|
#include <asm/asm-offsets.h>
|
|
#include <asm/asm-compat.h>
|
|
#include <linux/linkage.h>
|
|
|
|
.machine "any"
|
|
.text
|
|
|
|
# SAVE_GPR src, disp, base
#   Store the 64-bit general-purpose register \src at \disp(\base).
.macro SAVE_GPR src disp base
	std	\src, \disp(\base)
.endm
|
|
|
|
# SAVE_VRS src, disp, base
#   Store vector register \src (VMX numbering) at \base + \disp.
#   stvx only has an indexed form, so the displacement is first placed
#   in r16.  Clobbers: r16.
.macro SAVE_VRS src disp base
	li	16, \disp
	stvx	\src, 16, \base
.endm
|
|
|
|
# SAVE_VSX src, disp, base
#   Store vector-scalar register \src (VSX numbering) at \base + \disp
#   with the indexed stxvx form.  Clobbers: r16 (holds the displacement).
.macro SAVE_VSX src disp base
	li	16, \disp
	stxvx	\src, 16, \base
.endm
|
|
|
|
# RESTORE_GPR dst, disp, base
#   Reload the 64-bit general-purpose register \dst from \disp(\base).
.macro RESTORE_GPR dst disp base
	ld	\dst, \disp(\base)
.endm
|
|
|
|
# RESTORE_VRS dst, disp, base
#   Reload vector register \dst (VMX numbering) from \base + \disp.
#   Clobbers: r16 (holds the displacement for the indexed lvx).
.macro RESTORE_VRS dst disp base
	li	16, \disp
	lvx	\dst, 16, \base
.endm
|
|
|
|
# RESTORE_VSX dst, disp, base
#   Reload vector-scalar register \dst (VSX numbering) from \base + \disp.
#   Clobbers: r16 (holds the displacement for the indexed lxvx).
.macro RESTORE_VSX dst disp base
	li	16, \disp
	lxvx	\dst, 16, \base
.endm
|
|
|
|
# SAVE_REGS - function prologue.
#
#   Stores LR at 16(r1) of the caller's frame, allocates a 752-byte frame
#   and spills the registers this file modifies:
#
#     r14..r31    at  112(r1) .. 248(r1)     slot of rN  = 8 * N
#     v20..v31    at  256(r1) .. 432(r1)     slot of vN  = 256 + 16 * (N - 20)
#     vs14..vs31  at  448(r1) .. 720(r1)     slot of vsN = 256 + 16 * N - 32
#
#   Clobbers: r0, r9 (= r1 + 256 on exit) and r16 (after it has been saved).
.macro SAVE_REGS
	mflr	0
	std	0, 16(1)
	stdu	1, -752(1)

	.irp	idx, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
	SAVE_GPR \idx, 8*\idx, 1
	.endr

	addi	9, 1, 256		# r9 = base of the vector save area

	.irp	idx, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
	SAVE_VRS \idx, 16*\idx-320, 9
	.endr

	.irp	idx, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
	SAVE_VSX \idx, 16*\idx-32, 9
	.endr
.endm # SAVE_REGS
|
|
|
|
# RESTORE_REGS - function epilogue, the mirror image of SAVE_REGS.
#
#   Reloads v20..v31 and vs14..vs31 from the vector save area at r1 + 256,
#   then r14..r31 from 112(r1)..248(r1), pops the 752-byte frame and
#   restores LR from 16(r1).  The vector reloads come first because they
#   use r16 as a scratch index; r16 itself is reloaded afterwards.
#
#   Clobbers: r0, r9.
.macro RESTORE_REGS
	addi	9, 1, 256		# r9 = base of the vector save area

	.irp	idx, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
	RESTORE_VRS \idx, 16*\idx-320, 9
	.endr

	.irp	idx, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
	RESTORE_VSX \idx, 16*\idx-32, 9
	.endr

	.irp	idx, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
	RESTORE_GPR \idx, 8*\idx, 1
	.endr

	addi	1, 1, 752
	ld	0, 16(1)
	mtlr	0
.endm # RESTORE_REGS
|
|
|
|
# QR_8x - one set of four quarter rounds over two register banks
# (v0-v15 and v16-v31) at once.
#
#   The "a" rows are always v0-v3 / v16-v19.  The arguments name the
#   b, c and d registers of the first bank that pair with a0..a3; the
#   second bank uses the same numbers plus 16.
#
#   All 32 vector registers hold working state, so the rotate/permute
#   constants live in vs20..vs23 and are swapped into v25 (or v28) only
#   for the instructions that need them, with vs0 holding the displaced
#   value in the meantime.
#
#   Expects: vs20 = vpermxor control for <<<16, vs21 = rotate count 12,
#            vs22 = vpermxor control for <<<8,  vs23 = rotate count 7.
#   Clobbers: vs0.
.macro QR_8x b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3
	# 1. a += b; d ^= a; d <<<= 16
	xxlor	0, 32+25, 32+25
	xxlor	32+25, 20, 20
	vadduwm	0, 0, \b0
	vadduwm	1, 1, \b1
	vadduwm	2, 2, \b2
	vadduwm	3, 3, \b3
	vadduwm	16, 16, 16+\b0
	vadduwm	17, 17, 16+\b1
	vadduwm	18, 18, 16+\b2
	vadduwm	19, 19, 16+\b3

	vpermxor \d0, \d0, 0, 25
	vpermxor \d1, \d1, 1, 25
	vpermxor \d2, \d2, 2, 25
	vpermxor \d3, \d3, 3, 25
	vpermxor 16+\d0, 16+\d0, 16, 25
	vpermxor 16+\d1, 16+\d1, 17, 25
	vpermxor 16+\d2, 16+\d2, 18, 25
	vpermxor 16+\d3, 16+\d3, 19, 25
	xxlor	32+25, 0, 0

	# 2. c += d; b ^= c; b <<<= 12
	vadduwm	\c0, \c0, \d0
	vadduwm	\c1, \c1, \d1
	vadduwm	\c2, \c2, \d2
	vadduwm	\c3, \c3, \d3
	vadduwm	16+\c0, 16+\c0, 16+\d0
	vadduwm	16+\c1, 16+\c1, 16+\d1
	vadduwm	16+\c2, 16+\c2, 16+\d2
	vadduwm	16+\c3, 16+\c3, 16+\d3
	vxor	\b0, \b0, \c0
	vxor	\b1, \b1, \c1
	vxor	\b2, \b2, \c2
	vxor	\b3, \b3, \c3
	vxor	16+\b0, 16+\b0, 16+\c0
	vxor	16+\b1, 16+\b1, 16+\c1
	vxor	16+\b2, 16+\b2, 16+\c2
	vxor	16+\b3, 16+\b3, 16+\c3

	xxlor	0, 32+25, 32+25
	xxlor	32+25, 21, 21
	vrlw	\b0, \b0, 25
	vrlw	\b1, \b1, 25
	vrlw	\b2, \b2, 25
	vrlw	\b3, \b3, 25
	vrlw	16+\b0, 16+\b0, 25
	vrlw	16+\b1, 16+\b1, 25
	vrlw	16+\b2, 16+\b2, 25
	vrlw	16+\b3, 16+\b3, 25
	xxlor	32+25, 0, 0

	# 3. a += b; d ^= a; d <<<= 8
	vadduwm	0, 0, \b0
	vadduwm	1, 1, \b1
	vadduwm	2, 2, \b2
	vadduwm	3, 3, \b3
	vadduwm	16, 16, 16+\b0
	vadduwm	17, 17, 16+\b1
	vadduwm	18, 18, 16+\b2
	vadduwm	19, 19, 16+\b3

	xxlor	0, 32+25, 32+25
	xxlor	32+25, 22, 22
	vpermxor \d0, \d0, 0, 25
	vpermxor \d1, \d1, 1, 25
	vpermxor \d2, \d2, 2, 25
	vpermxor \d3, \d3, 3, 25
	vpermxor 16+\d0, 16+\d0, 16, 25
	vpermxor 16+\d1, 16+\d1, 17, 25
	vpermxor 16+\d2, 16+\d2, 18, 25
	vpermxor 16+\d3, 16+\d3, 19, 25
	xxlor	32+25, 0, 0

	# 4. c += d; b ^= c; b <<<= 7
	vadduwm	\c0, \c0, \d0
	vadduwm	\c1, \c1, \d1
	vadduwm	\c2, \c2, \d2
	vadduwm	\c3, \c3, \d3
	vadduwm	16+\c0, 16+\c0, 16+\d0
	vadduwm	16+\c1, 16+\c1, 16+\d1
	vadduwm	16+\c2, 16+\c2, 16+\d2
	vadduwm	16+\c3, 16+\c3, 16+\d3

	# v28 is only borrowed once the additions that read it are done
	xxlor	0, 32+28, 32+28
	xxlor	32+28, 23, 23
	vxor	\b0, \b0, \c0
	vxor	\b1, \b1, \c1
	vxor	\b2, \b2, \c2
	vxor	\b3, \b3, \c3
	vxor	16+\b0, 16+\b0, 16+\c0
	vxor	16+\b1, 16+\b1, 16+\c1
	vxor	16+\b2, 16+\b2, 16+\c2
	vxor	16+\b3, 16+\b3, 16+\c3
	vrlw	\b0, \b0, 28
	vrlw	\b1, \b1, 28
	vrlw	\b2, \b2, 28
	vrlw	\b3, \b3, 28
	vrlw	16+\b0, 16+\b0, 28
	vrlw	16+\b1, 16+\b1, 28
	vrlw	16+\b2, 16+\b2, 28
	vrlw	16+\b3, 16+\b3, 28
	xxlor	32+28, 0, 0
.endm

# QT_loop_8x - one ChaCha double round (column round followed by
# diagonal round) over both register banks.  See QR_8x for the register
# conventions and clobbers.
.macro QT_loop_8x
	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	QR_8x	4, 5, 6, 7,  8, 9, 10, 11,  12, 13, 14, 15

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
	QR_8x	5, 6, 7, 4,  10, 11, 8, 9,  15, 12, 13, 14
.endm
|
|
|
|
# QR_4x - one set of four quarter rounds over the single register bank
# v0-v15.
#
#   The "a" rows are always v0-v3.  The arguments name the b, c and d
#   registers that pair with a0..a3.
#
#   Expects: v20 = vpermxor control for <<<16, v21 = rotate count 12,
#            v22 = vpermxor control for <<<8,  v23 = rotate count 7.
.macro QR_4x b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3
	# 1. a += b; d ^= a; d <<<= 16
	vadduwm	0, 0, \b0
	vadduwm	1, 1, \b1
	vadduwm	2, 2, \b2
	vadduwm	3, 3, \b3
	vpermxor \d0, \d0, 0, 20
	vpermxor \d1, \d1, 1, 20
	vpermxor \d2, \d2, 2, 20
	vpermxor \d3, \d3, 3, 20

	# 2. c += d; b ^= c; b <<<= 12
	vadduwm	\c0, \c0, \d0
	vadduwm	\c1, \c1, \d1
	vadduwm	\c2, \c2, \d2
	vadduwm	\c3, \c3, \d3
	vxor	\b0, \b0, \c0
	vxor	\b1, \b1, \c1
	vxor	\b2, \b2, \c2
	vxor	\b3, \b3, \c3
	vrlw	\b0, \b0, 21
	vrlw	\b1, \b1, 21
	vrlw	\b2, \b2, 21
	vrlw	\b3, \b3, 21

	# 3. a += b; d ^= a; d <<<= 8
	vadduwm	0, 0, \b0
	vadduwm	1, 1, \b1
	vadduwm	2, 2, \b2
	vadduwm	3, 3, \b3
	vpermxor \d0, \d0, 0, 22
	vpermxor \d1, \d1, 1, 22
	vpermxor \d2, \d2, 2, 22
	vpermxor \d3, \d3, 3, 22

	# 4. c += d; b ^= c; b <<<= 7
	vadduwm	\c0, \c0, \d0
	vadduwm	\c1, \c1, \d1
	vadduwm	\c2, \c2, \d2
	vadduwm	\c3, \c3, \d3
	vxor	\b0, \b0, \c0
	vxor	\b1, \b1, \c1
	vxor	\b2, \b2, \c2
	vxor	\b3, \b3, \c3
	vrlw	\b0, \b0, 23
	vrlw	\b1, \b1, 23
	vrlw	\b2, \b2, 23
	vrlw	\b3, \b3, 23
.endm

# QT_loop_4x - one ChaCha double round (column round followed by
# diagonal round) over v0-v15.  See QR_4x for the constants it expects.
.macro QT_loop_4x
	# QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
	QR_4x	4, 5, 6, 7,  8, 9, 10, 11,  12, 13, 14, 15

	# QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
	QR_4x	5, 6, 7, 4,  10, 11, 8, 9,  15, 12, 13, 14
.endm
|
|
|
|
# TP_4x x0, x1, x2, x3 - transpose a 4x4 matrix of 32-bit words.
#
#   The arguments are VMX register numbers; the four registers are read
#   as the rows of the matrix and overwritten with its columns.
#   Clobbers: vs10, vs11, vs12, vs13.
.macro TP_4x x0 x1 x2 x3
	# interleave word pairs from rows (x0, x1) and (x2, x3)
	xxmrghw	10, 32+\x0, 32+\x1	# a0, a1, b0, b1
	xxmrghw	11, 32+\x2, 32+\x3	# a2, a3, b2, b3
	xxmrglw	12, 32+\x0, 32+\x1	# c0, c1, d0, d1
	xxmrglw	13, 32+\x2, 32+\x3	# c2, c3, d2, d3

	# join the matching doubleword halves
	xxpermdi 32+\x0, 10, 11, 0	# a0, a1, a2, a3
	xxpermdi 32+\x1, 10, 11, 3	# b0, b1, b2, b3
	xxpermdi 32+\x2, 12, 13, 0	# c0, c1, c2, c3
	xxpermdi 32+\x3, 12, 13, 3	# d0, d1, d2, d3
.endm
|
|
|
|
# Add_state S - key stream = working state + state
#
#   \S is the first VMX register of a 16-register working bank (0 or 16).
#   The four input-state rows are taken from v(16-S)..v(19-S), i.e. from
#   v16..v19 when S = 0 and from v0..v3 when S = 16.  For each lane the
#   registers S+lane, S+4+lane, S+8+lane and S+12+lane receive state
#   rows 0, 1, 2 and 3 respectively.
.macro Add_state S
	.irp	lane, 0, 1, 2, 3
	vadduwm	\S+\lane, \S+\lane, 16-\S
	vadduwm	\S+4+\lane, \S+4+\lane, 17-\S
	vadduwm	\S+8+\lane, \S+8+\lane, 18-\S
	vadduwm	\S+12+\lane, \S+12+\lane, 19-\S
	.endr
.endm
|
|
|
|
#
# Write_256 S - XOR 256 bytes of input with key stream and store them.
#
#   \S is the first VMX register of the 16-register key-stream bank
#   (0 or 16).  Input is read from r5 + r14 and output written to
#   r4 + r14.  r17..r31 must hold the byte offsets 16, 32, ..., 240.
#
#   Quadword 4*blk + q of the input is combined with register
#   S + 4*q + blk.  All 16 loads are issued before the first store.
#
#   Clobbers: r9, r16, vs0..vs15.
#
.macro Write_256 S
	add	9, 14, 5		# r9  = input  + offset
	add	16, 14, 4		# r16 = output + offset

	lxvw4x	0, 0, 9
	.irp	k, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	lxvw4x	\k, 16+\k, 9		# index register r(16+k) = 16 * k
	.endr

	.irp	blk, 0, 1, 2, 3
	.irp	q, 0, 1, 2, 3
	xxlxor	\S+32+4*\q+\blk, \S+32+4*\q+\blk, 4*\blk+\q
	.endr
	.endr

	# block 0
	stxvw4x	\S+32, 0, 16
	stxvw4x	\S+36, 17, 16
	stxvw4x	\S+40, 18, 16
	stxvw4x	\S+44, 19, 16

	# block 1
	stxvw4x	\S+33, 20, 16
	stxvw4x	\S+37, 21, 16
	stxvw4x	\S+41, 22, 16
	stxvw4x	\S+45, 23, 16

	# block 2
	stxvw4x	\S+34, 24, 16
	stxvw4x	\S+38, 25, 16
	stxvw4x	\S+42, 26, 16
	stxvw4x	\S+46, 27, 16

	# block 3
	stxvw4x	\S+35, 28, 16
	stxvw4x	\S+39, 29, 16
	stxvw4x	\S+43, 30, 16
	stxvw4x	\S+47, 31, 16
.endm
|
|
|
|
#
# chacha_p10le_8x(u32 *state, byte *dst, const byte *src, size_t len, int nrounds);
#
# In:   r3 = state   (4 rows of 16 bytes: constants, key 1, key 2,
#                     counter + nonce)
#       r4 = dst
#       r5 = src
#       r6 = len     (bytes)
#       r7 = nrounds (nrounds / 2 double rounds are executed)
#
# Data is processed in 256-byte chunks (4 blocks); while at least 512
# bytes remain, two chunks are produced per iteration.  A trailing
# remainder of fewer than 256 bytes is NOT processed: the routine
# returns without touching it, so the caller has to handle that tail.
# The state array itself is only read.
#
# Register use inside the function:
#       r8        = number of double rounds (reloaded into CTR per chunk)
#       r14       = running byte offset into src/dst
#       r15       = bytes remaining
#       r17..r31  = byte offsets 16..240 (index registers)
#       vs16..19  = copy of the four state rows
#       vs20..23  = round constants (perm <<<16, rot 12, perm <<<8, rot 7)
#       vs24/vs25 = splat 4 / splat 8 (counter steps)
#       vs30/vs31 = per-lane counter offsets for the two register banks
#
# NOTE(review): r3 is set to 0 only on the early-exit path (len <= 0);
# the main path returns with r3 unchanged.  Confirm callers ignore it.
#
SYM_FUNC_START(chacha_p10le_8x)
.align 5
	cmpdi	6, 0
	ble	Out_no_chacha

	SAVE_REGS

	# r17 - r31 mainly for Write_256 macro.
	li	17, 16
	li	18, 32
	li	19, 48
	li	20, 64
	li	21, 80
	li	22, 96
	li	23, 112
	li	24, 128
	li	25, 144
	li	26, 160
	li	27, 176
	li	28, 192
	li	29, 208
	li	30, 224
	li	31, 240

	mr	15, 6			# len
	li	14, 0			# offset to inp and outp

	lxvw4x	48, 0, 3		# vr16, constants
	lxvw4x	49, 17, 3		# vr17, key 1
	lxvw4x	50, 18, 3		# vr18, key 2
	lxvw4x	51, 19, 3		# vr19, counter, nonce

	# create (0, 1, 2, 3) counters
	vspltisw 0, 0
	vspltisw 1, 1
	vspltisw 2, 2
	vspltisw 3, 3
	vmrghw	4, 0, 1
	vmrglw	5, 2, 3
	vsldoi	30, 4, 5, 8		# vr30 counter, 4 (0, 1, 2, 3)

	vspltisw 21, 12			# rotate count for b <<<= 12
	vspltisw 23, 7			# rotate count for b <<<= 7

	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11		# vpermxor control for d <<<= 16
	lxvw4x	32+22, 17, 11		# vpermxor control for d <<<= 8

	sradi	8, 7, 1			# r8 = nrounds / 2

	mtctr	8

	# save constants to vsx
	xxlor	16, 48, 48
	xxlor	17, 49, 49
	xxlor	18, 50, 50
	xxlor	19, 51, 51

	vspltisw 25, 4
	vspltisw 26, 8

	xxlor	25, 32+26, 32+26	# vs25 = (8, 8, 8, 8)
	xxlor	24, 32+25, 32+25	# vs24 = (4, 4, 4, 4)

	vadduwm	31, 30, 25		# counter = (0, 1, 2, 3) + (4, 4, 4, 4)
	xxlor	30, 32+30, 32+30
	xxlor	31, 32+31, 32+31

	xxlor	20, 32+20, 32+20
	xxlor	21, 32+21, 32+21
	xxlor	22, 32+22, 32+22
	xxlor	23, 32+23, 32+23

	cmpdi	6, 512
	blt	Loop_last

	#
	# 8 blocks (512 bytes) per iteration, using both register banks.
	#
Loop_8x:
	# bank 0: broadcast every state word across the four lanes
	xxspltw	32+0, 16, 0
	xxspltw	32+1, 16, 1
	xxspltw	32+2, 16, 2
	xxspltw	32+3, 16, 3

	xxspltw	32+4, 17, 0
	xxspltw	32+5, 17, 1
	xxspltw	32+6, 17, 2
	xxspltw	32+7, 17, 3
	xxspltw	32+8, 18, 0
	xxspltw	32+9, 18, 1
	xxspltw	32+10, 18, 2
	xxspltw	32+11, 18, 3
	xxspltw	32+12, 19, 0
	xxspltw	32+13, 19, 1
	xxspltw	32+14, 19, 2
	xxspltw	32+15, 19, 3
	vadduwm	12, 12, 30		# increase counter

	# bank 1: v30/v31 still hold the counter offsets until they are
	# overwritten by the last two splats, hence the add in between
	xxspltw	32+16, 16, 0
	xxspltw	32+17, 16, 1
	xxspltw	32+18, 16, 2
	xxspltw	32+19, 16, 3

	xxspltw	32+20, 17, 0
	xxspltw	32+21, 17, 1
	xxspltw	32+22, 17, 2
	xxspltw	32+23, 17, 3
	xxspltw	32+24, 18, 0
	xxspltw	32+25, 18, 1
	xxspltw	32+26, 18, 2
	xxspltw	32+27, 18, 3
	xxspltw	32+28, 19, 0
	xxspltw	32+29, 19, 1
	vadduwm	28, 28, 31		# increase counter
	xxspltw	32+30, 19, 2
	xxspltw	32+31, 19, 3

.align 5
quarter_loop_8x:
	QT_loop_8x

	bdnz	quarter_loop_8x

	# bank 0: add the lane counter offsets (borrowing v30), then the state
	xxlor	0, 32+30, 32+30
	xxlor	32+30, 30, 30
	vadduwm	12, 12, 30
	xxlor	32+30, 0, 0
	TP_4x	0, 1, 2, 3
	TP_4x	4, 5, 6, 7
	TP_4x	8, 9, 10, 11
	TP_4x	12, 13, 14, 15

	# park bank-1 rows v16..v19 in vs0..vs3 while v16..v19 carry the state
	xxlor	0, 48, 48
	xxlor	1, 49, 49
	xxlor	2, 50, 50
	xxlor	3, 51, 51
	xxlor	48, 16, 16
	xxlor	49, 17, 17
	xxlor	50, 18, 18
	xxlor	51, 19, 19
	Add_state 0
	xxlor	48, 0, 0
	xxlor	49, 1, 1
	xxlor	50, 2, 2
	xxlor	51, 3, 3
	Write_256 0
	addi	14, 14, 256		# offset += 256
	addi	15, 15, -256		# len -= 256

	# bank 1: same again, borrowing v31 for the counter offsets
	xxlor	5, 32+31, 32+31
	xxlor	32+31, 31, 31
	vadduwm	28, 28, 31
	xxlor	32+31, 5, 5
	TP_4x	16+0, 16+1, 16+2, 16+3
	TP_4x	16+4, 16+5, 16+6, 16+7
	TP_4x	16+8, 16+9, 16+10, 16+11
	TP_4x	16+12, 16+13, 16+14, 16+15

	# bank 0 has been written out, so v0..v3 can carry the state
	xxlor	32, 16, 16
	xxlor	33, 17, 17
	xxlor	34, 18, 18
	xxlor	35, 19, 19
	Add_state 16
	Write_256 16
	addi	14, 14, 256		# offset += 256
	addi	15, 15, -256		# len -= 256

	# advance both sets of counter offsets by 8 blocks
	xxlor	32+24, 24, 24
	xxlor	32+25, 25, 25
	xxlor	32+30, 30, 30
	vadduwm	30, 30, 25
	vadduwm	31, 30, 24
	xxlor	30, 32+30, 32+30
	xxlor	31, 32+31, 32+31

	cmpdi	15, 0
	beq	Out_loop

	cmpdi	15, 512
	blt	Loop_last

	mtctr	8
	b	Loop_8x

	#
	# 4 blocks (256 bytes) per iteration, register bank 0 only.
	#
Loop_last:
	# Every pass below reads and writes a full 256 bytes, so never
	# start one with less than that left in the buffers.
	cmpdi	15, 256
	blt	Out_loop

	lxvw4x	48, 0, 3		# vr16, constants
	lxvw4x	49, 17, 3		# vr17, key 1
	lxvw4x	50, 18, 3		# vr18, key 2
	lxvw4x	51, 19, 3		# vr19, counter, nonce

	vspltisw 21, 12
	vspltisw 23, 7
	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxvw4x	32+20, 0, 11
	lxvw4x	32+22, 17, 11

	sradi	8, 7, 1
	mtctr	8

Loop_4x:
	vspltw	0, 16, 0
	vspltw	1, 16, 1
	vspltw	2, 16, 2
	vspltw	3, 16, 3

	vspltw	4, 17, 0
	vspltw	5, 17, 1
	vspltw	6, 17, 2
	vspltw	7, 17, 3
	vspltw	8, 18, 0
	vspltw	9, 18, 1
	vspltw	10, 18, 2
	vspltw	11, 18, 3
	vspltw	12, 19, 0
	vadduwm	12, 12, 30		# increase counter
	vspltw	13, 19, 1
	vspltw	14, 19, 2
	vspltw	15, 19, 3

.align 5
quarter_loop:
	QT_loop_4x

	bdnz	quarter_loop

	vadduwm	12, 12, 30
	TP_4x	0, 1, 2, 3
	TP_4x	4, 5, 6, 7
	TP_4x	8, 9, 10, 11
	TP_4x	12, 13, 14, 15

	Add_state 0
	Write_256 0
	addi	14, 14, 256		# offset += 256
	addi	15, 15, -256		# len -= 256

	# Update state counter
	vspltisw 25, 4
	vadduwm	30, 30, 25

	cmpdi	15, 0
	beq	Out_loop
	cmpdi	15, 256
	blt	Out_loop

	mtctr	8
	b	Loop_4x

Out_loop:
	RESTORE_REGS
	blr

Out_no_chacha:
	li	3, 0
	blr
SYM_FUNC_END(chacha_p10le_8x)
|
|
|
|
# vpermxor control vectors, loaded by chacha_p10le_8x:
#   bytes  0..15  -> v20, used for the "d ^= a; d <<<= 16" step
#   bytes 16..31  -> v22, used for the "d ^= a; d <<<= 8"  step
SYM_DATA_START_LOCAL(PERMX)
.align 5
permx:
	# d <<<= 16
	.long	0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd
	# d <<<= 8
	.long	0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc
SYM_DATA_END(PERMX)
|