2005-04-16 15:20:36 -07:00
/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
# include < l i n u x / l i n k a g e . h >
/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 */
!
! GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
!
! Size is 16 or greater, and may have trailing bytes
!
! Bulk copy selected by the dispatch code when (src - dst) has bit 0 set
! and bit 1 clear.  r0 is the descending destination pointer, r4 is its
! lower bound, and @(r0,r5) addresses the source.  Each destination
! longword is assembled from two adjacent source longwords, one shifted
! by 24 bits and the other by 8 bits in the opposite direction.
.balign 32
.Lcase1 :
! Read a longword and write a longword at once
! At the start of each iteration, r7 contains last long load
! r2 becomes r4 + 7: the longword loop keeps going while r0 > r2.
add #- 1 ,r5 ! 7 9 E X
mov r4 ,r2 ! 5 M T ( 0 c y c l e s l a t e n c y )
mov. l @(r0,r5),r7 ! 21 LS (2 cycles latency)
add #- 4 ,r5 ! 5 0 E X
add #7 ,r2 ! 7 9 E X
!
# ifdef C O N F I G _ C P U _ L I T T L E _ E N D I A N
! 6 cycles, 4 bytes per iteration
! r3 = previous load << 24, r6 = new load >> 8; the OR of the two is stored.
3 : mov. l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
mov r7 , r3 ! 5 M T ( l a t e n c y =0 ) ! R Q P O
cmp/ h i r2 ,r0 ! 5 7 M T
shll1 6 r3 ! 1 0 3 E X
mov r1 ,r6 ! 5 M T ( l a t e n c y =0 )
shll8 r3 ! 1 0 2 E X ! O x x x
shlr8 r6 ! 1 0 6 E X ! x N M L
mov r1 , r7 ! 5 M T ( l a t e n c y =0 )
or r6 ,r3 ! 8 2 E X ! O N M L
bt/ s 3 b ! 1 0 9 B R
mov. l r3 ,@-r0 ! 30 LS
# else
! Big-endian variant: r3 = previous load >> 24, r6 = new load << 8.
3 : mov. l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
mov r7 ,r3 ! 5 M T ( l a t e n c y =0 ) ! O P Q R
cmp/ h i r2 ,r0 ! 5 7 M T
shlr1 6 r3 ! 1 0 7 E X
shlr8 r3 ! 1 0 6 E X ! x x x O
mov r1 ,r6 ! 5 M T ( l a t e n c y =0 )
shll8 r6 ! 1 0 2 E X ! L M N x
mov r1 ,r7 ! 5 M T ( l a t e n c y =0 )
or r6 ,r3 ! 8 2 E X ! L M N O
bt/ s 3 b ! 1 0 9 B R
mov. l r3 ,@-r0 ! 30 LS
# endif
! Finally, copy a byte at once, if necessary
! Net change to r5 since entry is now -1, so @(r0,r5) is the source byte
! that belongs at r0 - 1.  r2 becomes r4 + 1, the bound for the byte loop.
add #4 ,r5 ! 5 0 E X
cmp/ e q r4 ,r0 ! 5 4 M T
add #- 6 ,r2 ! 5 0 E X
bt 9 f ! 1 0 9 B R
8 : cmp/ h i r2 ,r0 ! 5 7 M T
mov. b @(r0,r5),r1 ! 20 LS (latency=2)
bt/ s 8 b ! 1 0 9 B R
mov. b r1 ,@-r0 ! 29 LS
9 : rts
nop
!
! GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
!
! Size is 16 or greater, and may have trailing bytes
!
! Bulk copy selected by the dispatch code when (src - dst) has both bit 0
! and bit 1 set.  Same scheme as .Lcase1 with the shift amounts swapped:
! one source longword is shifted by 8 bits and the other by 24 bits.
.balign 32
.Lcase3 :
! Read a longword and write a longword at once
! At the start of each iteration, r7 contains last long load
! r2 becomes r4 + 7: the longword loop keeps going while r0 > r2.
add #- 3 ,r5 ! 7 9 E X
mov r4 ,r2 ! 5 M T ( 0 c y c l e s l a t e n c y )
mov. l @(r0,r5),r7 ! 21 LS (2 cycles latency)
add #- 4 ,r5 ! 5 0 E X
add #7 ,r2 ! 7 9 E X
!
# ifdef C O N F I G _ C P U _ L I T T L E _ E N D I A N
! 6 cycles, 4 bytes per iteration
! r3 = previous load << 8, r6 = new load >> 24; the OR of the two is stored.
3 : mov. l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
mov r7 , r3 ! 5 M T ( l a t e n c y =0 ) ! R Q P O
cmp/ h i r2 ,r0 ! 5 7 M T
shll8 r3 ! 1 0 2 E X ! Q P O x
mov r1 ,r6 ! 5 M T ( l a t e n c y =0 )
shlr1 6 r6 ! 1 0 7 E X
shlr8 r6 ! 1 0 6 E X ! x x x N
mov r1 , r7 ! 5 M T ( l a t e n c y =0 )
or r6 ,r3 ! 8 2 E X ! Q P O N
bt/ s 3 b ! 1 0 9 B R
mov. l r3 ,@-r0 ! 30 LS
# else
! Big-endian variant: r3 = previous load >> 8, r6 = new load << 24.  Here
! the new longword is loaded straight into r7 and then copied to r6.
! NOTE(review): the bare date/time lines inside this branch are not SH
! assembly; they look like version-control annotation residue - confirm
! and remove before assembling.
2008-05-15 13:28:46 +09:00
3 : mov r7 ,r3 ! O P Q R
2005-04-16 15:20:36 -07:00
shlr8 r3 ! x O P Q
2008-05-15 13:28:46 +09:00
mov. l @(r0,r5),r7 ! KLMN
mov r7 ,r6
2005-04-16 15:20:36 -07:00
shll1 6 r6
shll8 r6 ! N x x x
or r6 ,r3 ! N O P Q
cmp/ h i r2 ,r0
bt/ s 3 b
mov. l r3 ,@-r0
# endif
! Finally, copy a byte at once, if necessary
! Net change to r5 since entry is now -1 (-3 - 4 + 6), so @(r0,r5) is the
! source byte that belongs at r0 - 1.  r2 becomes r4 + 1, the bound for
! the byte loop.
add #6 ,r5 ! 5 0 E X
cmp/ e q r4 ,r0 ! 5 4 M T
add #- 6 ,r2 ! 5 0 E X
bt 9 f ! 1 0 9 B R
8 : cmp/ h i r2 ,r0 ! 5 7 M T
mov. b @(r0,r5),r1 ! 20 LS (latency=2)
bt/ s 8 b ! 1 0 9 B R
mov. b r1 ,@-r0 ! 29 LS
9 : rts
nop
! memcpy entry point.
!   In:  r4 = dst, r5 = src, r6 = n.  NOTE(review): this mapping is taken
!        from the DST/SRC diagram below and the register usage; it matches
!        the usual SH argument registers - confirm against the ABI.
!   Out: the zero-length path returns r4 in r0 explicitly; the copy paths
!        step r0 down from dst + n and stop when it reaches r4.
!   The copy runs from the end of the buffers towards the start: r0 is
!   set to dst + n, r5 to src - dst, and every store is a pre-decrement
!   store through r0, so @(r0,r5) always addresses the matching source.
ENTRY( m e m c p y )
! Calculate the invariants which will be used in the remainder
! of the code:
!
!      r4   -->  [ ...  ] DST             [ ...  ] SRC
!                [ ...  ]                 [ ...  ]
!                  :                        :
!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
!
!
! Short circuit the common case of src, dst and len being 32 bit aligned
! and test for zero length move
! r0 = n | dst | src, so "tst #3, r0" checks all three alignments at once.
mov r6 , r0 ! 5 M T ( 0 c y c l e l a t e n c y )
or r4 , r0 ! 8 2 E X
or r5 , r0 ! 8 2 E X
tst r6 , r6 ! 8 6 M T
bt/ s 9 9 f ! 1 1 1 B R ( z e r o l e n )
tst #3 , r0 ! 8 7 M T
mov r4 , r0 ! 5 M T ( 0 c y c l e l a t e n c y )
add r6 , r0 ! 4 9 E X
mov #16 , r1 ! 6 E X
bt/ s . L c a s e 0 0 ! 1 1 1 B R ( a l i g n e d )
sub r4 , r5 ! 7 5 E X
! Arguments are not nicely long word aligned or zero len.
! Check for small copies, and if so do a simple byte at a time copy.
!
! Deciding on an exact value of 'small' is not easy, as the point at which
! using the optimised routines become worthwhile varies (these are the
! cycle counts for different sizes using byte-at-a-time vs. optimised):
!	size	byte-at-time	long	word	byte
!	16	42		39-40	46-50	50-55
!	24	58		43-44	54-58	62-67
!	36	82		49-50	66-70	80-85
! However the penalty for getting it 'wrong' is much higher for long word
! aligned data (and this is more common), so use a value of 16.
! T is set when 16 > n (signed compare), i.e. the copy is 'small'.
! From here r5 holds the normal value - 1, and r3 (small path) one less.
cmp/ g t r6 ,r1 ! 5 6 M T
add #- 1 ,r5 ! 5 0 E X
bf/ s 6 f ! 1 0 8 B R ( n o t s m a l l )
mov r5 , r3 ! 5 M T ( l a t e n c y =0 )
! Small copy: r6 = n / 2 pairs, T = the bit shifted out (odd byte count).
! An odd count stores one byte first; a count of exactly 1 returns at 98.
shlr r6 ! 1 0 4 E X
mov. b @(r0,r5),r1 ! 20 LS (latency=2)
bf/ s 4 f ! 1 1 1 B R
add #- 1 ,r3 ! 5 0 E X
tst r6 , r6 ! 8 6 M T
bt/ s 9 8 f ! 1 1 0 B R
mov. b r1 ,@-r0 ! 29 LS
! 4 cycles, 2 bytes per iteration
3 : mov. b @(r0,r5),r1 ! 20 LS (latency=2)
4 : mov. b @(r0,r3),r2 ! 20 LS (latency=2)
dt r6 ! 6 7 E X
mov. b r1 ,@-r0 ! 29 LS
bf/ s 3 b ! 1 1 1 B R
mov. b r2 ,@-r0 ! 29 LS
98 :
rts
nop
! Zero-length copy: return r4 in r0 (the move sits in the rts delay slot).
99 : rts
mov r4 , r0
! Size is not small, so its worthwhile looking for optimisations.
! First align destination to a long word boundary.
!
! r5 = normal value -1
! r3 = r0 & 3 is the number of bytes copied singly; r6 is decremented
! once per byte so it stays equal to the remaining length.
6 : tst #3 , r0 ! 8 7 M T
mov #3 , r3 ! 6 E X
bt/ s 2 f ! 1 1 1 B R
and r0 ,r3 ! 7 8 E X
! 3 cycles, 1 byte per iteration
1 : dt r3 ! 6 7 E X
mov. b @(r0,r5),r1 ! 19 LS (latency=2)
add #- 1 , r6 ! 7 9 E X
bf/ s 1 b ! 1 0 9 B R
mov. b r1 ,@-r0 ! 28 LS
2 : add #1 , r5 ! 7 9 E X
! Now select the appropriate bulk transfer code based on relative
! alignment of src and dst.
! r0 is parked in r3 and r5 (src - dst) is tested in r0, because the
! immediate form of tst is applied to r0 here.  Every branch below
! restores r0 from r3 in its delay slot, or immediately before the bra.
mov r0 , r3 ! 5 M T ( l a t e n c y =0 )
mov r5 , r0 ! 5 M T ( l a t e n c y =0 )
tst #1 , r0 ! 8 7 M T
bf/ s 1 f ! 1 1 1 B R
mov #64 , r7 ! 6 E X
! bit 0 clear
! T = (r6 >= 64); the delay-slot tst then leaves T = (bit 1 clear) for
! the branch that follows at either destination.
cmp/ g e r7 , r6 ! 5 5 M T
bt/ s 2 f ! 1 1 1 B R
tst #2 , r0 ! 8 7 M T
! small
bt/ s . L c a s e 0
mov r3 , r0
bra . L c a s e 2
nop
! big
2 : bt/ s . L c a s e 0 b
mov r3 , r0
bra . L c a s e 2 b
nop
! bit 0 set
1 : tst #2 , r0 ! 8 7 M T
bt/ s . L c a s e 1
mov r3 , r0
bra . L c a s e 3
nop
!
! GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
!
! src, dst and size are all long word aligned
! size is non-zero
!
! On entry r1 was 16 but is reloaded with 64 here: lengths of 64 or more
! go to the cache-line loop at .Lcase00b, with r5 already lowered by 4.
! Otherwise r6 becomes the number of longword pairs (n / 8) and T holds
! the odd-longword bit; an odd longword is stored before the pair loop.
.balign 32
.Lcase00 :
mov #64 , r1 ! 6 E X
mov r5 , r3 ! 5 M T ( l a t e n c y =0 )
cmp/ g t r6 , r1 ! 5 6 M T
add #- 4 , r5 ! 5 0 E X
bf . L c a s e 0 0 b ! 1 0 8 B R ( b i g l o o p )
shlr2 r6 ! 1 0 5 E X
shlr r6 ! 1 0 4 E X
mov. l @(r0, r5), r1 ! 21 LS (latency=2)
bf/ s 4 f ! 1 1 1 B R
add #- 8 , r3 ! 5 0 E X
tst r6 , r6 ! 8 6 M T
bt/ s 5 f ! 1 1 0 B R
mov. l r1 ,@-r0 ! 30 LS
! 4 cycles, 2 long words per iteration
3 : mov. l @(r0, r5), r1 ! 21 LS (latency=2)
4 : mov. l @(r0, r3), r2 ! 21 LS (latency=2)
dt r6 ! 6 7 E X
mov. l r1 , @-r0 ! 30 LS
bf/ s 3 b ! 1 0 9 B R
mov. l r2 , @-r0 ! 30 LS
5 : rts
nop
! Size is 16 or greater and less than 64, but may have trailing bytes
!
! Reached from the dispatch code when (src - dst) has bits 0 and 1 clear,
! so with r0 longword aligned the source address @(r0,r5) is too.
! r5 and r3 are the offsets of the two longwords of a pair
! (normal - 4 and normal - 8); r7 = r4 + 11 bounds the pair loop.
! Bit 2 of the remaining length r6 decides whether a single longword is
! stored before the pair loop.
.balign 32
.Lcase0 :
add #- 4 , r5 ! 5 0 E X
mov r4 , r7 ! 5 M T ( l a t e n c y =0 )
mov. l @(r0, r5), r1 ! 21 LS (latency=2)
mov #4 , r2 ! 6 E X
add #11 , r7 ! 5 0 E X
tst r2 , r6 ! 8 6 M T
mov r5 , r3 ! 5 M T ( l a t e n c y =0 )
bt/ s 4 f ! 1 1 1 B R
add #- 4 , r3 ! 5 0 E X
mov. l r1 ,@-r0 ! 30 LS
! 4 cycles, 2 long words per iteration
3 : mov. l @(r0, r5), r1 ! 21 LS (latency=2)
4 : mov. l @(r0, r3), r2 ! 21 LS (latency=2)
cmp/ h i r7 , r0
mov. l r1 , @-r0 ! 30 LS
bt/ s 3 b ! 1 0 9 B R
mov. l r2 , @-r0 ! 30 LS
! Copy the final 0-3 bytes
! r5 goes back to normal - 1 for byte addressing; r7 becomes r4 + 1.
add #3 ,r5 ! 5 0 E X
cmp/ e q r0 , r4 ! 5 4 M T
add #- 10 , r7 ! 5 0 E X
bt 9 f ! 1 1 0 B R
! 3 cycles, 1 byte per iteration
1 : mov. b @(r0,r5),r1 ! 19 LS
cmp/ h i r7 ,r0 ! 5 7 M T
bt/ s 1 b ! 1 1 1 B R
mov. b r1 ,@-r0 ! 28 LS
9 : rts
nop
! Size is at least 64 bytes, so will be going round the big loop at least once.
!
!   r2 = rounded up r4
!   r3 = rounded down r0
!
! Two entry points: .Lcase0b (from the dispatch code, lowers r5 by 4
! first) and .Lcase00b (from .Lcase00, which has already done so).
! "Rounded" means to a 32-byte boundary (mask ~0x1f).
.balign 32
.Lcase0b :
add #- 4 , r5 ! 5 0 E X
.Lcase00b :
mov r0 , r3 ! 5 M T ( l a t e n c y =0 )
mov #( ~ 0x1f ) , r1 ! 6 E X
and r1 , r3 ! 7 8 E X
mov r4 , r2 ! 5 M T ( l a t e n c y =0 )
cmp/ e q r3 , r0 ! 5 4 M T
add #0x1f , r2 ! 5 0 E X
bt/ s 1 f ! 1 1 0 B R
and r1 , r2 ! 7 8 E X
! copy initial words until cache line aligned
! r3 is raised by 8 because the pair loop compares r0 against it before
! storing the two longwords of that iteration.
mov. l @(r0, r5), r1 ! 21 LS (latency=2)
tst #4 , r0 ! 8 7 M T
mov r5 , r6 ! 5 M T ( l a t e n c y =0 )
add #- 4 , r6 ! 5 0 E X
bt/ s 4 f ! 1 1 1 B R
add #8 , r3 ! 5 0 E X
tst #0x18 , r0 ! 8 7 M T
bt/ s 1 f ! 1 0 9 B R
mov. l r1 ,@-r0 ! 30 LS
! 4 cycles, 2 long words per iteration
3 : mov. l @(r0, r5), r1 ! 21 LS (latency=2)
4 : mov. l @(r0, r6), r7 ! 21 LS (latency=2)
cmp/ e q r3 , r0 ! 5 4 M T
mov. l r1 , @-r0 ! 30 LS
bf/ s 3 b ! 1 0 9 B R
mov. l r7 , @-r0 ! 30 LS
! Copy the cache line aligned blocks
!
! In use: r0, r2, r4, r5
! Scratch: r1, r3, r6, r7
!
! We could do this with the four scratch registers, but if src
! and dest hit the same cache line, this will thrash, so make
! use of additional registers.
!
! We also need r0 as a temporary (for movca), so 'undo' the invariant:
!   r5:	 src (was r0+r5)
!   r1:	 dest (was r0)
! this can be reversed at the end, so we don't need to save any extra
! state.
!
! r8-r11 are pushed on the stack (r15) here and popped after the loop.
1 : mov. l r8 , @-r15 ! 30 LS
add r0 , r5 ! 4 9 E X
mov. l r9 , @-r15 ! 30 LS
mov r0 , r1 ! 5 M T ( l a t e n c y =0 )
mov. l r10 , @-r15 ! 30 LS
add #- 0x1c , r5 ! 5 0 E X
mov. l r11 , @-r15 ! 30 LS
! 16 cycles, 32 bytes per iteration
! Eight longwords are loaded, then stored to the 32-byte block at r1; the
! first store is movca.l.  The loop ends when r1 reaches r2.
2 : mov. l @(0x00,r5),r0 ! 18 LS (latency=2)
add #- 0x20 , r1 ! 5 0 E X
mov. l @(0x04,r5),r3 ! 18 LS (latency=2)
mov. l @(0x08,r5),r6 ! 18 LS (latency=2)
mov. l @(0x0c,r5),r7 ! 18 LS (latency=2)
mov. l @(0x10,r5),r8 ! 18 LS (latency=2)
mov. l @(0x14,r5),r9 ! 18 LS (latency=2)
mov. l @(0x18,r5),r10 ! 18 LS (latency=2)
mov. l @(0x1c,r5),r11 ! 18 LS (latency=2)
movca. l r0 ,@r1 ! 40 LS (latency=3-7)
mov. l r3 ,@(0x04,r1) ! 33 LS
mov. l r6 ,@(0x08,r1) ! 33 LS
mov. l r7 ,@(0x0c,r1) ! 33 LS
mov. l r8 ,@(0x10,r1) ! 33 LS
add #- 0x20 , r5 ! 5 0 E X
mov. l r9 ,@(0x14,r1) ! 33 LS
cmp/ e q r2 ,r1 ! 5 4 M T
mov. l r10 ,@(0x18,r1) ! 33 LS
bf/ s 2 b ! 1 0 9 B R
mov. l r11 ,@(0x1c,r1) ! 33 LS
! Restore r0 and turn r5 back into an offset, popping r11..r9 on the way.
! The pop of r8 at label 1 is both the delay slot of the rts (nothing
! left to copy) and the first instruction of the trailing-bytes path.
mov r1 , r0 ! 5 M T ( l a t e n c y =0 )
mov. l @r15+, r11 ! 15 LS
sub r1 , r5 ! 7 5 E X
mov. l @r15+, r10 ! 15 LS
cmp/ e q r4 , r0 ! 5 4 M T
bf/ s 1 f ! 1 0 9 B R
mov. l @r15+, r9 ! 15 LS
rts
1 : mov. l @r15+, r8 ! 15 LS
sub r4 , r1 ! 7 5 E X ( l e n r e m a i n i n g )
! number of trailing bytes is non-zero
!
! invariants restored (r5 already decremented by 4)
! also r1=num bytes remaining
mov #4 , r2 ! 6 E X
mov r4 , r7 ! 5 M T ( l a t e n c y =0 )
add #0x1c , r5 ! 5 0 E X ( b a c k t o - 4 )
cmp/ h s r2 , r1 ! 5 8 M T
bf/ s 5 f ! 1 0 8 B R
add #11 , r7 ! 5 0 E X
mov. l @(r0, r5), r6 ! 21 LS (latency=2)
tst r2 , r1 ! 8 6 M T
mov r5 , r3 ! 5 M T ( l a t e n c y =0 )
bt/ s 4 f ! 1 1 1 B R
add #- 4 , r3 ! 5 0 E X
! NOTE(review): r1 >= r2 (4) was already established by the cmp/hs above
! and neither register has changed since, so this branch looks always
! taken: one longword is stored and whatever remains is copied by the
! byte loop at 5:.  Confirm whether that is intended.
cmp/ h s r2 , r1 ! 5 8 M T
bt/ s 5 f ! 1 1 1 B R
mov. l r6 ,@-r0 ! 30 LS
! 4 cycles, 2 long words per iteration
3 : mov. l @(r0, r5), r6 ! 21 LS (latency=2)
4 : mov. l @(r0, r3), r2 ! 21 LS (latency=2)
cmp/ h i r7 , r0
mov. l r6 , @-r0 ! 30 LS
bt/ s 3 b ! 1 0 9 B R
mov. l r2 , @-r0 ! 30 LS
! Copy the final 0-3 bytes
! r7 becomes r4 + 1 and r5 goes back to normal - 1 for byte addressing.
5 : cmp/ e q r0 , r4 ! 5 4 M T
add #- 10 , r7 ! 5 0 E X
bt 9 f ! 1 1 0 B R
add #3 ,r5 ! 5 0 E X
! 3 cycles, 1 byte per iteration
1 : mov. b @(r0,r5),r1 ! 19 LS
cmp/ h i r7 ,r0 ! 5 7 M T
bt/ s 1 b ! 1 1 1 B R
mov. b r1 ,@-r0 ! 28 LS
9 : rts
nop
!
! GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
!
! Reached from the dispatch code when (src - dst) has bit 0 clear and
! bit 1 set.  Copies two 16-bit words per iteration while r0 > r4 + 7,
! then branches to the shared word/byte tail at label 10 with r5 equal
! to its normal value - 2.
.balign 32
.Lcase2 :
! Size is 16 or greater and less than 64, but may have trailing bytes
2 : mov r5 , r6 ! 5 M T ( l a t e n c y =0 )
add #- 2 ,r5 ! 5 0 E X
mov r4 ,r2 ! 5 M T ( l a t e n c y =0 )
add #- 4 ,r6 ! 5 0 E X
add #7 ,r2 ! 5 0 E X
3 : mov. w @(r0,r5),r1 ! 20 LS (latency=2)
mov. w @(r0,r6),r3 ! 20 LS (latency=2)
cmp/ h i r2 ,r0 ! 5 7 M T
mov. w r1 ,@-r0 ! 29 LS
bt/ s 3 b ! 1 1 1 B R
mov. w r3 ,@-r0 ! 29 LS
bra 1 0 f
nop
! Large-copy counterpart of .Lcase2 (src - dst has bit 0 clear, bit 1 set),
! followed by the word/byte tail at label 10 that .Lcase2 also uses.
.balign 32
.Lcase2b :
! Size is at least 64 bytes, so will be going round the big loop at least once.
!
!   r2 = rounded up r4
!   r3 = rounded down r0
! ("rounded" means to a 32-byte boundary, mask ~0x1f)
mov r0 , r3 ! 5 M T ( l a t e n c y =0 )
mov #( ~ 0x1f ) , r1 ! 6 E X
and r1 , r3 ! 7 8 E X
mov r4 , r2 ! 5 M T ( l a t e n c y =0 )
cmp/ e q r3 , r0 ! 5 4 M T
add #0x1f , r2 ! 5 0 E X
add #- 2 , r5 ! 5 0 E X
bt/ s 1 f ! 1 1 0 B R
and r1 , r2 ! 7 8 E X
! Copy a short word one at a time until we are cache line aligned
!   Normal values: r0, r2, r3, r4
!   Unused: r1, r6, r7
!   Mod: r5 (=r5-2)
!
! r3 is raised by 2 because the loop compares r0 against it before the
! store in the delay slot lowers r0 by 2.
add #2 , r3 ! 5 0 E X
2 : mov. w @(r0,r5),r1 ! 20 LS (latency=2)
cmp/ e q r3 ,r0 ! 5 4 M T
bf/ s 2 b ! 1 1 1 B R
mov. w r1 ,@-r0 ! 29 LS
! Copy the cache line aligned blocks
!
! In use: r0, r2, r4, r5 (=r5-2)
! Scratch: r1, r3, r6, r7
!
! We could do this with the four scratch registers, but if src
! and dest hit the same cache line, this will thrash, so make
! use of additional registers.
!
! We also need r0 as a temporary (for movca), so 'undo' the invariant:
!   r5:	 src (was r0+r5)
!   r1:	 dest (was r0)
! this can be reversed at the end, so we don't need to save any extra
! state.
!
! r8-r12 are pushed on the stack (r15) here and popped after the loop.
1 : mov. l r8 , @-r15 ! 30 LS
add r0 , r5 ! 4 9 E X
mov. l r9 , @-r15 ! 30 LS
mov r0 , r1 ! 5 M T ( l a t e n c y =0 )
mov. l r10 , @-r15 ! 30 LS
add #- 0x1e , r5 ! 5 0 E X
mov. l r11 , @-r15 ! 30 LS
mov. l r12 , @-r15 ! 30 LS
! 17 cycles, 32 bytes per iteration
! A 16-bit word, seven longwords and a final load into r12 are read and
! stitched together with xtrct (middle 32 bits of a register pair) into
! eight longwords for the destination block.
# ifdef C O N F I G _ C P U _ L I T T L E _ E N D I A N
! Little-endian: post-increment loads walk r5 up by 0x20 in total, then
! "add #-0x40" leaves it 0x20 lower for the next block.
2 : mov. w @r5+, r0 ! 14 LS (latency=2) ..JI
add #- 0x20 , r1 ! 5 0 E X
mov. l @r5+, r3 ! 15 LS (latency=2) NMLK
mov. l @r5+, r6 ! 15 LS (latency=2) RQPO
shll1 6 r0 ! 1 0 3 E X J I . .
mov. l @r5+, r7 ! 15 LS (latency=2)
xtrct r3 , r0 ! 4 8 E X L K J I
mov. l @r5+, r8 ! 15 LS (latency=2)
xtrct r6 , r3 ! 4 8 E X P O N M
mov. l @r5+, r9 ! 15 LS (latency=2)
xtrct r7 , r6 ! 4 8 E X
mov. l @r5+, r10 ! 15 LS (latency=2)
xtrct r8 , r7 ! 4 8 E X
mov. l @r5+, r11 ! 15 LS (latency=2)
xtrct r9 , r8 ! 4 8 E X
mov. w @r5+, r12 ! 15 LS (latency=2)
xtrct r10 , r9 ! 4 8 E X
movca. l r0 ,@r1 ! 40 LS (latency=3-7)
xtrct r11 , r10 ! 4 8 E X
mov. l r3 , @(0x04,r1) ! 33 LS
xtrct r12 , r11 ! 4 8 E X
mov. l r6 , @(0x08,r1) ! 33 LS
mov. l r7 , @(0x0c,r1) ! 33 LS
mov. l r8 , @(0x10,r1) ! 33 LS
add #- 0x40 , r5 ! 5 0 E X
mov. l r9 , @(0x14,r1) ! 33 LS
cmp/ e q r2 ,r1 ! 5 4 M T
mov. l r10 , @(0x18,r1) ! 33 LS
bf/ s 2 b ! 1 0 9 B R
mov. l r11 , @(0x1c,r1) ! 33 LS
# else
! Big-endian: displacement loads from the top of the block downwards; r1
! is lowered by 4 then 0x1c (0x20 in total) and r5 by 2 then 0x1e.
! NOTE(review): the bare date/time lines inside this branch are not SH
! assembly; they look like version-control annotation residue - confirm
! and remove before assembling.
2 : mov. w @(0x1e,r5), r0 ! 17 LS (latency=2)
add #- 2 , r5 ! 5 0 E X
mov. l @(0x1c,r5), r3 ! 18 LS (latency=2)
add #- 4 , r1 ! 5 0 E X
mov. l @(0x18,r5), r6 ! 18 LS (latency=2)
shll1 6 r0 ! 1 0 3 E X
mov. l @(0x14,r5), r7 ! 18 LS (latency=2)
xtrct r3 , r0 ! 4 8 E X
mov. l @(0x10,r5), r8 ! 18 LS (latency=2)
xtrct r6 , r3 ! 4 8 E X
mov. l @(0x0c,r5), r9 ! 18 LS (latency=2)
xtrct r7 , r6 ! 4 8 E X
mov. l @(0x08,r5), r10 ! 18 LS (latency=2)
xtrct r8 , r7 ! 4 8 E X
mov. l @(0x04,r5), r11 ! 18 LS (latency=2)
xtrct r9 , r8 ! 4 8 E X
2006-09-27 17:50:03 +09:00
mov. l @(0x00,r5), r12 ! 18 LS (latency=2)
xtrct r10 , r9 ! 4 8 E X
2005-04-16 15:20:36 -07:00
movca. l r0 ,@r1 ! 40 LS (latency=3-7)
add #- 0x1c , r1 ! 5 0 E X
2008-05-15 13:28:46 +09:00
mov. l r3 , @(0x18,r1) ! 33 LS
2005-04-16 15:20:36 -07:00
xtrct r11 , r10 ! 4 8 E X
2008-05-15 13:28:46 +09:00
mov. l r6 , @(0x14,r1) ! 33 LS
2005-04-16 15:20:36 -07:00
xtrct r12 , r11 ! 4 8 E X
2008-05-15 13:28:46 +09:00
mov. l r7 , @(0x10,r1) ! 33 LS
2005-04-16 15:20:36 -07:00
2008-05-15 13:28:46 +09:00
mov. l r8 , @(0x0c,r1) ! 33 LS
add #- 0x1e , r5 ! 5 0 E X
2005-04-16 15:20:36 -07:00
2008-05-15 13:28:46 +09:00
mov. l r9 , @(0x08,r1) ! 33 LS
2005-04-16 15:20:36 -07:00
cmp/ e q r2 ,r1 ! 5 4 M T
2008-05-15 13:28:46 +09:00
mov. l r10 , @(0x04,r1) ! 33 LS
2005-04-16 15:20:36 -07:00
bf/ s 2 b ! 1 0 9 B R
2008-05-15 13:28:46 +09:00
mov. l r11 , @(0x00,r1) ! 33 LS
2005-04-16 15:20:36 -07:00
# endif
! Restore r0 and turn r5 back into an offset, popping r12..r9 on the way.
! The pop of r8 at label 1 is both the delay slot of the rts (nothing
! left to copy) and the first instruction of the tail path.
mov. l @r15+, r12
mov r1 , r0 ! 5 M T ( l a t e n c y =0 )
mov. l @r15+, r11 ! 15 LS
sub r1 , r5 ! 7 5 E X
mov. l @r15+, r10 ! 15 LS
cmp/ e q r4 , r0 ! 5 4 M T
bf/ s 1 f ! 1 0 9 B R
mov. l @r15+, r9 ! 15 LS
rts
1 : mov. l @r15+, r8 ! 15 LS
add #0x1e , r5 ! 5 0 E X
! Finish off a short word at a time
! r5 must be invariant - 2
! Skipped entirely unless r0 > r4 + 1; the loop then runs while
! r0 > r4 + 3 (r2 is raised by 2 in the delay slot).
10 : mov r4 ,r2 ! 5 M T ( l a t e n c y =0 )
add #1 ,r2 ! 5 0 E X
cmp/ h i r2 , r0 ! 5 7 M T
bf/ s 1 f ! 1 0 9 B R
add #2 , r2 ! 5 0 E X
3 : mov. w @(r0,r5),r1 ! 20 LS
cmp/ h i r2 ,r0 ! 5 7 M T
bt/ s 3 b ! 1 0 9 B R
mov. w r1 ,@-r0 ! 29 LS
1 :
!
! Finally, copy the last byte if necessary
! "9b" is the nearest preceding label 9 (an rts/nop pair); the delay-slot
! add moves r5 from normal - 2 to normal - 1 for the byte load.
cmp/ e q r4 ,r0 ! 5 4 M T
bt/ s 9 b
add #1 ,r5
mov. b @(r0,r5),r1
rts
mov. b r1 ,@-r0