@ Map the kernel's architecture-version macro onto the name this
@ CRYPTOGAMS-derived code tests (#if __ARM_ARCH__<7 below).
#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.
@
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ sha1_block procedure for ARMv4.
@
@ January 2007.
@ Size/performance trade-off
@ ====================================================================
@ impl size in bytes comp cycles[*] measured performance
@ ====================================================================
@ thumb 304 3212 4420
@ armv4-small 392/+29% 1958/+64% 2250/+96%
@ armv4-compact 740/+89% 1552/+26% 1840/+22%
@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
@ ====================================================================
@ thumb = same as 'small' but in Thumb instructions[**] and
@ with recurring code in two private functions;
@ small = detached Xload/update, loops are folded;
@ compact = detached Xload/update, 5x unroll;
@ large = interleaved Xload/update, 5x unroll;
@ full unroll = interleaved Xload/update, full unroll, estimated[!];
@
@ [*] Manually counted instructions in "grand" loop body. Measured
@ performance is affected by prologue and epilogue overhead,
@ i-cache availability, branch penalties, etc.
@ [**] While each Thumb instruction is twice smaller, they are not as
@ diverse as ARM ones: e.g., there are only two arithmetic
@ instructions with 3 arguments, no [fixed] rotate, addressing
@ modes are limited. As result it takes more instructions to do
@ the same job in Thumb, therefore the code is never twice as
@ small and always slower.
@ [***] which is also ~35% better than compiler generated code. Dual-
@ issue Cortex A8 core was measured to process input block in
@ ~990 cycles.
@ August 2010.
@
@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
@ Cortex A8 core and in absolute terms ~870 cycles per input block
@ [or 13.6 cycles per byte].
@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in 10%
@ improvement on Cortex A8 core and 12.2 cycles per byte.
#include <linux/linkage.h>

.text

@ ------------------------------------------------------------------------
@ void sha1_block_data_order(u32 *digest, const u8 *data, int blocks)
@
@ In:  r0 = digest state (5 words A,B,C,D,E), updated in place
@      r1 = input data
@      r2 = number of 64-byte input blocks (converted below into an
@           end-of-data pointer: r2 = r1 + blocks*64)
@
@ Register use: state lives in r3-r7 (B..E kept pre-rotated by 30, see
@ the mov ...,ror#30 at loop entry); r8 = current round constant;
@ r9-r12 scratch; r14 walks the X[] schedule frame carved off sp.
@ ------------------------------------------------------------------------
.align	2
ENTRY(sha1_block_data_order)
	stmdb	sp!,{r4-r12,lr}
	add	r2,r1,r2,lsl#6		@ r2 to point at the end of r1
	ldmia	r0,{r3,r4,r5,r6,r7}
.Lloop:
	ldr	r8,.LK_00_19
	mov	r14,sp
	sub	sp,sp,#15*4
	mov	r5,r5,ror#30
	mov	r6,r6,ror#30
	mov	r7,r7,ror#30		@ [6]
.L_00_15:
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2		@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6		@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27		@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4		@ handles unaligned
	add	r7,r8,r7,ror#2		@ E+=K_00_19
	eor	r10,r5,r6		@ F_xx_xx
	add	r7,r7,r3,ror#27		@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9			@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9		@ E+=X[i]
	eor	r10,r10,r6,ror#2	@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10		@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r6,r8,r6,ror#2		@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r4,r5		@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r6,r6,r7,ror#27		@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4		@ handles unaligned
	add	r6,r8,r6,ror#2		@ E+=K_00_19
	eor	r10,r4,r5		@ F_xx_xx
	add	r6,r6,r7,ror#27		@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9			@ byte swap
#endif
#endif
	and	r10,r3,r10,ror#2
	add	r6,r6,r9		@ E+=X[i]
	eor	r10,r10,r5,ror#2	@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r6,r6,r10		@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r5,r8,r5,ror#2		@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r3,r4		@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r5,r5,r6,ror#27		@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4		@ handles unaligned
	add	r5,r8,r5,ror#2		@ E+=K_00_19
	eor	r10,r3,r4		@ F_xx_xx
	add	r5,r5,r6,ror#27		@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9			@ byte swap
#endif
#endif
	and	r10,r7,r10,ror#2
	add	r5,r5,r9		@ E+=X[i]
	eor	r10,r10,r4,ror#2	@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r5,r5,r10		@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r4,r8,r4,ror#2		@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r7,r3		@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r4,r4,r5,ror#27		@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4		@ handles unaligned
	add	r4,r8,r4,ror#2		@ E+=K_00_19
	eor	r10,r7,r3		@ F_xx_xx
	add	r4,r4,r5,ror#27		@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9			@ byte swap
#endif
#endif
	and	r10,r6,r10,ror#2
	add	r4,r4,r9		@ E+=X[i]
	eor	r10,r10,r3,ror#2	@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r4,r4,r10		@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r3,r8,r3,ror#2		@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r6,r7		@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r3,r3,r4,ror#27		@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4		@ handles unaligned
	add	r3,r8,r3,ror#2		@ E+=K_00_19
	eor	r10,r6,r7		@ F_xx_xx
	add	r3,r3,r4,ror#27		@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9			@ byte swap
#endif
#endif
	and	r10,r5,r10,ror#2
	add	r3,r3,r9		@ E+=X[i]
	eor	r10,r10,r7,ror#2	@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r3,r3,r10		@ E+=F_00_19(B,C,D)
	cmp	r14,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2		@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6		@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27		@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4		@ handles unaligned
	add	r7,r8,r7,ror#2		@ E+=K_00_19
	eor	r10,r5,r6		@ F_xx_xx
	add	r7,r7,r3,ror#27		@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9			@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9		@ E+=X[i]
	eor	r10,r10,r6,ror#2	@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10		@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r4,r5		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2	@ F_xx_xx
					@ F_xx_xx
	add	r6,r6,r9		@ E+=X[i]
	eor	r10,r10,r5,ror#2	@ F_00_19(B,C,D)
	add	r6,r6,r10		@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r3,r4		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2	@ F_xx_xx
					@ F_xx_xx
	add	r5,r5,r9		@ E+=X[i]
	eor	r10,r10,r4,ror#2	@ F_00_19(B,C,D)
	add	r5,r5,r10		@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r7,r3		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2	@ F_xx_xx
					@ F_xx_xx
	add	r4,r4,r9		@ E+=X[i]
	eor	r10,r10,r3,ror#2	@ F_00_19(B,C,D)
	add	r4,r4,r10		@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r6,r7		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2	@ F_xx_xx
					@ F_xx_xx
	add	r3,r3,r9		@ E+=X[i]
	eor	r10,r10,r7,ror#2	@ F_00_19(B,C,D)
	add	r3,r3,r10		@ E+=F_00_19(B,C,D)
	ldr	r8,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r5,r6		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r4,r10,ror#2	@ F_xx_xx
					@ F_xx_xx
	add	r7,r7,r9		@ E+=X[i]
	add	r7,r7,r10		@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r4,r5		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r3,r10,ror#2	@ F_xx_xx
					@ F_xx_xx
	add	r6,r6,r9		@ E+=X[i]
	add	r6,r6,r10		@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r3,r4		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r7,r10,ror#2	@ F_xx_xx
					@ F_xx_xx
	add	r5,r5,r9		@ E+=X[i]
	add	r5,r5,r10		@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r7,r3		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r6,r10,ror#2	@ F_xx_xx
					@ F_xx_xx
	add	r4,r4,r9		@ E+=X[i]
	add	r4,r4,r10		@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r6,r7		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r5,r10,ror#2	@ F_xx_xx
					@ F_xx_xx
	add	r3,r3,r9		@ E+=X[i]
	add	r3,r3,r10		@ E+=F_20_39(B,C,D)
 ARM(	teq	r14,sp		)	@ preserve carry
 THUMB(	mov	r11,sp		)
 THUMB(	teq	r14,r11	)	@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
	ldr	r8,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r5,r6		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r4,r10,ror#2	@ F_xx_xx
	and	r11,r5,r6		@ F_xx_xx
	add	r7,r7,r9		@ E+=X[i]
	add	r7,r7,r10		@ E+=F_40_59(B,C,D)
	add	r7,r7,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r4,r5		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2	@ F_xx_xx
	and	r11,r4,r5		@ F_xx_xx
	add	r6,r6,r9		@ E+=X[i]
	add	r6,r6,r10		@ E+=F_40_59(B,C,D)
	add	r6,r6,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r3,r4		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2	@ F_xx_xx
	and	r11,r3,r4		@ F_xx_xx
	add	r5,r5,r9		@ E+=X[i]
	add	r5,r5,r10		@ E+=F_40_59(B,C,D)
	add	r5,r5,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r7,r3		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2	@ F_xx_xx
	and	r11,r7,r3		@ F_xx_xx
	add	r4,r4,r9		@ E+=X[i]
	add	r4,r4,r10		@ E+=F_40_59(B,C,D)
	add	r4,r4,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2		@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12		@ 1 cycle stall
	eor	r10,r6,r7		@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27		@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2	@ F_xx_xx
	and	r11,r6,r7		@ F_xx_xx
	add	r3,r3,r9		@ E+=X[i]
	add	r3,r3,r10		@ E+=F_40_59(B,C,D)
	add	r3,r3,r11,ror#2
	cmp	r14,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]
	ldr	r8,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	r0,{r8,r9,r10,r11,r12}
	add	r3,r8,r3
	add	r4,r9,r4
	add	r5,r10,r5,ror#2
	add	r6,r11,r6,ror#2
	add	r7,r12,r7,ror#2
	stmia	r0,{r3,r4,r5,r6,r7}
	teq	r1,r2
	bne	.Lloop			@ [+18], total 1307
	ldmia	sp!,{r4-r12,pc}
.align	2
@ SHA-1 round constants (FIPS 180-4)
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
ENDPROC(sha1_block_data_order)
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2