[ARM] 2930/1: optimized sha1 implementation for ARM
Patch from Nicolas Pitre
Here's an ARM assembly SHA1 implementation to replace the default C
version. It is approximately 50% faster than the generic C version. On
an XScale processor running at 400MHz:
generic C version: 9.8 MB/s
my version: 14.5 MB/s
This code is useful to quite a few callers in the tree:
crypto/sha1.c: sha_transform(sctx->state, sctx->buffer, temp);
crypto/sha1.c: sha_transform(sctx->state, &data[i], temp);
drivers/char/random.c: sha_transform(buf, (__u8 *)r->pool+i, buf + 5);
drivers/char/random.c: sha_transform(buf, (__u8 *)data, buf + 5);
net/ipv4/syncookies.c: sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
Signed-off-by: Nicolas Pitre <nico@cam.org>
Seems to work fine on big-endian as well.
Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-10-28 15:26:40 +01:00
/*
 *  linux/arch/arm/lib/sha1.S
 *
 *  SHA transform optimized for ARM
 *
2009-09-14 03:25:28 -04:00
 *  Copyright:	(C) 2005 by Nicolas Pitre <nico@fluxnic.net>
[ARM] 2930/1: optimized sha1 implementation for ARM
Patch from Nicolas Pitre
Here's an ARM assembly SHA1 implementation to replace the default C
version. It is approximately 50% faster than the generic C version. On
an XScale processor running at 400MHz:
generic C version: 9.8 MB/s
my version: 14.5 MB/s
This code is useful to quite a few callers in the tree:
crypto/sha1.c: sha_transform(sctx->state, sctx->buffer, temp);
crypto/sha1.c: sha_transform(sctx->state, &data[i], temp);
drivers/char/random.c: sha_transform(buf, (__u8 *)r->pool+i, buf + 5);
drivers/char/random.c: sha_transform(buf, (__u8 *)data, buf + 5);
net/ipv4/syncookies.c: sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
Signed-off-by: Nicolas Pitre <nico@cam.org>
Seems to work fine on big-endian as well.
Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-10-28 15:26:40 +01:00
 *  Created:	September 17, 2005
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  The reference implementation for this code is linux/lib/sha1.c
 */
#include <linux/linkage.h>

	.text
/*
 * void sha_transform(__u32 *digest, const char *in, __u32 *W)
 *
 * Process one 64-byte input block and fold the result into the digest.
 *
 *   r0 = digest	five 32-bit state words, read and updated in place
 *   r1 = in		64 bytes of input
 *   r2 = W		work array; W[0] .. W[79] are written by this routine
 *
 * r4 - r8 are saved on entry and restored on return; r1 - r3, ip and lr
 * are used as scratch.
 *
 * Note: the "in" ptr may be unaligned.
 */

ENTRY(sha_transform)

	stmfd	sp!, {r4 - r8, lr}

	@ for (i = 0; i < 16; i++)
	@         W[i] = be32_to_cpu(in[i]);

#ifdef __ARMEB__
	@ Big-endian build: the conversion is a no-op, so copy 64 bytes as is.
	mov	r4, r0			@ park digest in r4 across the call
	mov	r0, r2
	mov	r2, #64
	bl	memcpy			@ memcpy(W, in, 64)
	mov	r2, r0			@ r2 = W again (value returned by memcpy)
	mov	r0, r4			@ r0 = digest again
#else
	@ Little-endian build: read byte by byte (no alignment requirement)
	@ and assemble each word with the first byte most significant.
	mov	r3, r2			@ r3 = write cursor into W
	mov	lr, #16			@ 16 words to build
1:	ldrb	r4, [r1], #1
	ldrb	r5, [r1], #1
	ldrb	r6, [r1], #1
	ldrb	r7, [r1], #1
	subs	lr, lr, #1		@ flags consumed by the bne below
	orr	r5, r5, r4, lsl #8
	orr	r6, r6, r5, lsl #8
	orr	r7, r7, r6, lsl #8
	str	r7, [r3], #4
	bne	1b
#endif

	@ for (i = 0; i < 64; i++)
	@         W[i+16] = ror(W[i+13] ^ W[i+8] ^ W[i+2] ^ W[i], 31);

	sub	r3, r2, #4		@ pre-bias: the load below pre-increments
	mov	lr, #64
2:	ldr	r4, [r3, #4]!		@ r3 = &W[i], r4 = W[i]
	subs	lr, lr, #1		@ flags consumed by the bne below
	ldr	r5, [r3, #8]		@ W[i+2]
	ldr	r6, [r3, #32]		@ W[i+8]
	ldr	r7, [r3, #52]		@ W[i+13]
	eor	r4, r4, r5
	eor	r4, r4, r6
	eor	r4, r4, r7
	mov	r4, r4, ror #31
	str	r4, [r3, #64]		@ W[i+16]
	bne	2b

	/*
	 * The SHA functions are:
	 *
	 * f1(B,C,D) = (D ^ (B & (C ^ D)))
	 * f2(B,C,D) = (B ^ C ^ D)
	 * f3(B,C,D) = ((B & C) | (D & (B | C)))
	 *
	 * Then the sub-blocks are processed as follows:
	 *
	 * A' = ror(A, 27) + f(B,C,D) + E + K + *W++
	 * B' = A
	 * C' = ror(B, 2)
	 * D' = C
	 * E' = D
	 *
	 * We therefore unroll each loop 5 times to avoid register shuffling.
	 * Also the ror for C (and also D and E which are successively derived
	 * from it) is applied in place to cut on an additional mov insn for
	 * each round.
	 *
	 * Register use inside the round macros:
	 *   r1 = K for the current group of rounds
	 *   r2 = pointer to the next W word (post-incremented by each round)
	 *   r3, ip = scratch
	 * Each macro leaves the new A value in the register passed as E, which
	 * is why successive invocations rotate their argument lists.
	 */

	.macro	sha_f1, A, B, C, D, E
	ldr	r3, [r2], #4		@ r3 = *W++
	eor	ip, \C, \D
	add	\E, r1, \E, ror #2	@ E + K
	and	ip, \B, ip, ror #2	@ B & (C ^ D)
	add	\E, \E, \A, ror #27
	eor	ip, ip, \D, ror #2	@ ip = f1(B,C,D)
	add	\E, \E, r3
	add	\E, \E, ip
	.endm

	.macro	sha_f2, A, B, C, D, E
	ldr	r3, [r2], #4		@ r3 = *W++
	add	\E, r1, \E, ror #2	@ E + K
	eor	ip, \B, \C, ror #2
	add	\E, \E, \A, ror #27
	eor	ip, ip, \D, ror #2	@ ip = f2(B,C,D)
	add	\E, \E, r3
	add	\E, \E, ip
	.endm

	.macro	sha_f3, A, B, C, D, E
	ldr	r3, [r2], #4		@ r3 = *W++
	add	\E, r1, \E, ror #2	@ E + K
	orr	ip, \B, \C, ror #2
	add	\E, \E, \A, ror #27
	and	ip, ip, \D, ror #2	@ D & (B | C)
	add	\E, \E, r3		@ r3 is free for reuse after this
	and	r3, \B, \C, ror #2	@ B & C
	orr	ip, ip, r3		@ ip = f3(B,C,D)
	add	\E, \E, ip
	.endm

	ldmia	r0, {r4 - r8}		@ A..E = digest[0..4]

	mov	lr, #4			@ 4 passes x 5 unrolled rounds = 20 rounds
	ldr	r1, .L_sha_K + 0

	/* adjust initial values */
	@ C, D and E are kept pre-rotated so that "ror #2" yields the real value.
	mov	r6, r6, ror #30
	mov	r7, r7, ror #30
	mov	r8, r8, ror #30

3:	subs	lr, lr, #1
	sha_f1	r4, r5, r6, r7, r8
	sha_f1	r8, r4, r5, r6, r7
	sha_f1	r7, r8, r4, r5, r6
	sha_f1	r6, r7, r8, r4, r5
	sha_f1	r5, r6, r7, r8, r4
	bne	3b

	ldr	r1, .L_sha_K + 4
	mov	lr, #4
4:	subs	lr, lr, #1
	sha_f2	r4, r5, r6, r7, r8
	sha_f2	r8, r4, r5, r6, r7
	sha_f2	r7, r8, r4, r5, r6
	sha_f2	r6, r7, r8, r4, r5
	sha_f2	r5, r6, r7, r8, r4
	bne	4b

	ldr	r1, .L_sha_K + 8
	mov	lr, #4
5:	subs	lr, lr, #1
	sha_f3	r4, r5, r6, r7, r8
	sha_f3	r8, r4, r5, r6, r7
	sha_f3	r7, r8, r4, r5, r6
	sha_f3	r6, r7, r8, r4, r5
	sha_f3	r5, r6, r7, r8, r4
	bne	5b

	ldr	r1, .L_sha_K + 12
	mov	lr, #4
6:	subs	lr, lr, #1
	sha_f2	r4, r5, r6, r7, r8
	sha_f2	r8, r4, r5, r6, r7
	sha_f2	r7, r8, r4, r5, r6
	sha_f2	r6, r7, r8, r4, r5
	sha_f2	r5, r6, r7, r8, r4
	bne	6b

	@ digest[i] += working value; r6 - r8 still carry the in-place
	@ rotation, so it is undone with "ror #2" while adding.
	ldmia	r0, {r1, r2, r3, ip, lr}
	add	r4, r1, r4
	add	r5, r2, r5
	add	r6, r3, r6, ror #2
	add	r7, ip, r7, ror #2
	add	r8, lr, r8, ror #2
	stmia	r0, {r4 - r8}

	ldmfd	sp!, {r4 - r8, pc}	@ restore r4 - r8, return via saved lr

ENDPROC(sha_transform)
2009-07-24 12:32:52 +01:00
.align 2
[ARM] 2930/1: optimized sha1 implementation for ARM
Patch from Nicolas Pitre
Here's an ARM assembly SHA1 implementation to replace the default C
version. It is approximately 50% faster than the generic C version. On
an XScale processor running at 400MHz:
generic C version: 9.8 MB/s
my version: 14.5 MB/s
This code is useful to quite a few callers in the tree:
crypto/sha1.c: sha_transform(sctx->state, sctx->buffer, temp);
crypto/sha1.c: sha_transform(sctx->state, &data[i], temp);
drivers/char/random.c: sha_transform(buf, (__u8 *)r->pool+i, buf + 5);
drivers/char/random.c: sha_transform(buf, (__u8 *)data, buf + 5);
net/ipv4/syncookies.c: sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
Signed-off-by: Nicolas Pitre <nico@cam.org>
Seems to work fine on big-endian as well.
Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-10-28 15:26:40 +01:00
.L_sha_K :
.word 0 x5 a82 7 9 9 9 , 0 x6 e d9 e b a1 , 0 x8 f1 b b c d c , 0 x c a62 c1 d6
/*
 * void sha_init(__u32 *buf)
 */
2009-07-24 12:32:52 +01:00
.align 2
[ARM] 2930/1: optimized sha1 implementation for ARM
Patch from Nicolas Pitre
Here's an ARM assembly SHA1 implementation to replace the default C
version. It is approximately 50% faster than the generic C version. On
an XScale processor running at 400MHz:
generic C version: 9.8 MB/s
my version: 14.5 MB/s
This code is useful to quite a few callers in the tree:
crypto/sha1.c: sha_transform(sctx->state, sctx->buffer, temp);
crypto/sha1.c: sha_transform(sctx->state, &data[i], temp);
drivers/char/random.c: sha_transform(buf, (__u8 *)r->pool+i, buf + 5);
drivers/char/random.c: sha_transform(buf, (__u8 *)data, buf + 5);
net/ipv4/syncookies.c: sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
Signed-off-by: Nicolas Pitre <nico@cam.org>
Seems to work fine on big-endian as well.
Signed-off-by: Lennert Buytenhek <buytenh@wantstofly.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-10-28 15:26:40 +01:00
.L_sha_initial_digest :
.word 0 x6 7 4 5 2 3 0 1 , 0 x e f c d a b89 , 0 x98 b a d c f e , 0 x10 3 2 5 4 7 6 , 0 x c3 d2 e 1 f0
ENTRY( s h a _ i n i t )
str l r , [ s p , #- 4 ] !
adr r1 , . L _ s h a _ i n i t i a l _ d i g e s t
ldmia r1 , { r1 , r2 , r3 , i p , l r }
stmia r0 , { r1 , r2 , r3 , i p , l r }
ldr p c , [ s p ] , #4
2008-08-28 11:22:32 +01:00
ENDPROC( s h a _ i n i t )