2005-04-17 02:20:36 +04:00
!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
! SH5 code Copyright 2002 SuperH Ltd.
!
! Entry: ARG0: destination pointer
!        ARG1: source pointer
!        ARG2: byte count
!
! Exit:  RESULT: destination pointer
!        any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could be easily swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
!     bytes memory chunk to be copied, the rest of the word can be read
!     without side effects.
2011-03-31 05:57:33 +04:00
!     This could be easily changed by increasing the minimum size of
2005-04-17 02:20:36 +04:00
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
!     however, this would cost a few extra cycles on average.
!     For SHmedia, the assumption is that any quadword can be read in its
!     entirety if at least one byte is included in the copy.
/* Imported into Linux kernel by Richard Curnow.  This is used to implement the
   __copy_user function in the general case, so it has to be a distinct
   function from intra-kernel memcpy to allow for exception fix-ups in the
   event that the user pointer is bad somewhere in the copy (e.g. due to
   running off the end of the vma).
   Note, this algorithm will be slightly wasteful in the case where the source
   and destination pointers are equally aligned, because the stlo/sthi pairs
   could then be merged back into single stores.  If there are a lot of cache
   misses, this is probably offset by the stall lengths on the preloads.
*/
/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
 * erratum.  The first two prefetches are nop-ed out to avoid upsetting the
 * instruction counts used in the jump address calculation.
 * */
/*
 * copy_user_memcpy .. copy_user_memcpy_end
 *
 * Copies r4 bytes from the address in r3 to the address in r2.
 *
 * Register use, as established by the instructions below:
 *   r2   destination pointer; never written by this routine
 *   r3   source pointer
 *   r4   byte count
 *   r18  return address; moved into tr1 with ptabs and used by the final
 *        blink of every path
 * Registers written as scratch: r0, r1, r5-r9, r19-r25, r27, r36, tr0, tr1.
 *
 * Structure:
 *   - Counts of 25 or more branch to Large.
 *   - Smaller counts are dispatched through a table of 32-byte
 *     (8-instruction) slots that starts at L1.  The slot index is
 *     63 - nsb(count): slot 0 = 0 bytes, slot 1 = 1 byte, slot 2 = 2..3,
 *     slot 3 = 4..7, slot 4 = 8..15, slot 5 = 16..24.
 *
 * WARNING: the dispatch relies on every slot being exactly 8 instructions
 * long.  Do not add, remove or reorder instructions between L1 and Large
 * without re-checking the slot layout.  This is also why the disabled
 * prefetches are kept as nops instead of being deleted.
 */
	.section .text..SHmedia32,"ax"
	.little
	.balign 32
	.global copy_user_memcpy
	.global copy_user_memcpy_end
copy_user_memcpy:

/*
 * Unaligned access helpers.  Each expands to a lo/hi instruction pair that
 * together covers the (possibly unaligned) quadword (Q, 8 bytes) or
 * longword (L, 4 bytes) at P+O.  For the loads the caller must OR D0 and D1
 * to obtain the complete value.
 */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1

	nop ! ld.b r3,0,r63 ! TAKum03020
	pta/l Large,tr0
	movi 25,r0
	bgeu/u r4,r0,tr0	/* count >= 25: take the Large path */
	nsb r4,r0
	shlli r0,5,r0		/* r0 = nsb(count) * 32 (one slot = 32 bytes) */
	/*
	 * r0 = (L1 - L0) + (63 - nsb(count)) * 32 + 1, i.e. the offset from L0
	 * to the slot for this size class.  The "+ 1" is presumably the SHmedia
	 * mode bit expected in a branch target - confirm against the SH-5 manual.
	 */
	movi (L1-L0+63*32 + 1) & 0xffff,r1
	sub r1, r0, r0
L0:	ptrel r0,tr0
	add r2,r4,r5		/* r5 = destination end (dst + count) */
	ptabs r18,tr1		/* tr1 = return address */
	add r3,r4,r6		/* r6 = source end (src + count) */
	blink tr0,r63		/* jump into the size-class slot */

/* Rearranged to make cut2 safe */
	.balign 8
L4_7:	/* 4..7 byte memcpy cntd. */
	stlo.l r2, 0, r0
	or r6, r7, r6		/* r6 = last 4 source bytes */
	sthi.l r5, -1, r6
	stlo.l r5, -4, r6
	blink tr1,r63

	.balign 8
	/* ---- slot 0 (L1 + 0): also holds the L2_3 continuation ---- */
L1:	/* 0 byte memcpy */
	nop
	blink tr1,r63
	nop
	nop
	nop
	nop

L2_3:	/* 2 or 3 byte memcpy cntd. */
	st.b r5,-1,r6		/* store the last byte at dst + count - 1 */
	blink tr1,r63

	/* ---- slot 1 (L1 + 32): also holds the L8_15 continuation ---- */
	/* 1 byte memcpy */
	ld.b r3,0,r0
	st.b r2,0,r0
	blink tr1,r63

L8_15:	/* 8..15 byte memcpy cntd. */
	stlo.q r2, 0, r0
	or r6, r7, r6		/* r6 = last 8 source bytes */
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/* ---- slot 2 (L1 + 64) ---- */
	/* 2 or 3 byte memcpy */
	ld.b r3,0,r0
	nop ! ld.b r2,0,r63 ! TAKum03020
	ld.b r3,1,r1
	st.b r2,0,r0
	pta/l L2_3,tr0
	ld.b r6,-1,r6		/* last source byte (r6 was src + count) */
	st.b r2,1,r1
	blink tr0, r63

	/* ---- slot 3 (L1 + 96) ---- */
	/* 4 .. 7 byte memcpy: first 4 and last 4 bytes, possibly overlapping */
	LDUAL (r3, 0, r0, r1)
	pta L4_7, tr0
	ldlo.l r6, -4, r7
	or r0, r1, r0
	sthi.l r2, 3, r0
	ldhi.l r6, -1, r6
	blink tr0, r63

	/* ---- slot 4 (L1 + 128) ---- */
	/* 8 .. 15 byte memcpy: first 8 and last 8 bytes, possibly overlapping */
	LDUAQ (r3, 0, r0, r1)
	pta L8_15, tr0
	ldlo.q r6, -8, r7
	or r0, r1, r0
	sthi.q r2, 7, r0
	ldhi.q r6, -1, r6
	blink tr0, r63

	/* ---- slot 5 (L1 + 160): last slot, so it may run past 8 instructions ---- */
	/* 16 .. 24 byte memcpy: first 16 bytes plus the last 8 bytes */
	LDUAQ (r3, 0, r0, r1)
	LDUAQ (r3, 8, r8, r9)
	or r0, r1, r0
	sthi.q r2, 7, r0
	or r8, r9, r8
	sthi.q r2, 15, r8
	ldlo.q r6, -8, r7
	ldhi.q r6, -1, r6
	stlo.q r2, 8, r8
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

/*
 * Large: counts of 25 bytes or more.
 *
 * After the setup below:
 *   r22 = dst + 8 - (src & 7), the destination cursor
 *   r6  = src - dst, so r22 + r6 is always an 8-byte aligned source address
 *   r5  = dst + count - 16, the bound tested by Loop_ua
 *   r0  = source quadword that belongs at destination address r22
 */
Large:
	! ld.b r2, 0, r63 ! TAKum03020
	pta/l Loop_ua, tr1
	ori r3, -8, r7		/* r7 = (src & 7) - 8 */
	sub r2, r7, r22
	sub r3, r2, r6
	add r2, r4, r5
	ldlo.q r3, 0, r0
	addi r5, -16, r5
	movi 64+8, r27 ! could subtract r7 from that.
	/*
	 * Store 8 bytes at dst.  Only the leading bytes up to the next source
	 * quadword boundary were loaded by ldlo.q; presumably the rest is
	 * overwritten by the stores that start at r22 - TODO confirm.
	 */
	stlo.q r2, 0, r0
	sthi.q r2, 7, r0
	ldx.q r22, r6, r0
	bgtu/l r27, r4, tr1	/* 72 > count: skip the 32-byte loop */

	addi r5, -48, r27	/* r27 = dst + count - 64: Loop_line bound */
	pta/l Loop_line, tr0
	addi r6, 64, r36	/* only used by the disabled prefetch below */
	addi r6, -24, r19
	addi r6, -16, r20
	addi r6, -8, r21

	/* Copies 32 bytes per iteration; r0 carries one quadword across iterations. */
Loop_line:
	! ldx.q r22, r36, r63 ! TAKum03020
	alloco r22, 32
	synco			/* guards the alloco, see the erratum note in the file header */
	addi r22, 32, r22
	ldx.q r22, r19, r23
	sthi.q r22, -25, r0
	ldx.q r22, r20, r24
	ldx.q r22, r21, r25
	stlo.q r22, -32, r0
	ldx.q r22, r6,  r0
	sthi.q r22, -17, r23
	sthi.q r22,  -9, r24
	sthi.q r22,  -1, r25
	stlo.q r22, -24, r23
	stlo.q r22, -16, r24
	stlo.q r22,  -8, r25
	bgeu r27, r22, tr0	/* loop while r27 >= r22 */

	/* Copies 8 bytes per iteration until r22 reaches r5. */
Loop_ua:
	addi r22, 8, r22
	sthi.q r22, -1, r0
	stlo.q r22, -8, r0
	ldx.q r22, r6, r0
	bgtu/l r5, r22, tr1	/* loop while r5 > r22 */

	/* Tail: store the pending quadword, then the final 8 bytes of the buffer. */
	add r3, r4, r7		/* r7 = src + count */
	ldlo.q r7, -8, r1
	sthi.q r22, 7, r0
	ldhi.q r7, -1, r7
	ptabs r18,tr1		/* tr1 was the loop target; restore the return address */
	stlo.q r22, 0, r0
	or r1, r7, r1		/* r1 = last 8 source bytes */
	sthi.q r5, 15, r1	/* r5 + 15 = dst + count - 1 */
	stlo.q r5, 8, r1	/* r5 + 8  = dst + count - 8 */
	blink tr1, r63
copy_user_memcpy_end:
	nop