2005-04-16 15:20:36 -07:00
/*
 *
 * Optimized version of the standard memcpy() function
 *
 * Inputs:
 *	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 *	r8:	destination address (the code copies in0 into r8 before returning)
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 */
# include < a s m / a s m m a c r o . h >
2016-01-17 01:13:41 -05:00
# include < a s m / e x p o r t . h >
2005-04-16 15:20:36 -07:00
/*
 * memcpy: copy in2 bytes from in1 (source) to in0 (destination).
 *
 * r8 (retval) is set to in0 before any exit, so the destination address
 * is what the caller gets back.
 *
 * The routine picks one of three copy strategies:
 *
 *   1. in2 == 0:                      return immediately.
 *   2. in2 < 16:                      byte-at-a-time loop (.memcpy_short).
 *   3. (in0 | in1 | in2) & 7 != 0:    general routine (.memcpy_long).
 *   4. otherwise:                     8-byte load/store loop that falls
 *                                     through right after the checks.
 *
 * All loops are counted loops driven by ar.lc/ar.ec and br.ctop, using
 * rotating registers (val[], w[]) and rotating predicates (p[]).
 *
 * ar.pfs, ar.lc and pr are saved on entry (in saved_pfs, saved_lc and
 * saved_pr) and restored on every exit path that modified them.
 */
GLOBAL_ENTRY(memcpy)

#	define MEM_LAT	21		/* latency to memory */

	/* Static scratch registers; dst/src are copies of in0/in1 (see below). */
#	define dst	r2
#	define src	r3
#	define retval	r8
#	define saved_pfs r9
#	define saved_lc	r10
#	define saved_pr	r11
#	define cnt	r16
#	define src2	r17
#	define t0	r18
#	define t1	r19
#	define t2	r20
#	define t3	r21
#	define t4	r22
#	define src_end	r23

	/* Pipeline depth and rotating-register count for the aligned loop. */
#	define N	(MEM_LAT + 4)
#	define Nrot	((N + 7) & ~7)

	/*
	 * First, check if everything (src, dst, len) is a multiple of eight.  If
	 * so, we handle everything with no taken branches (other than the loop
	 * itself) and a small icache footprint.  Otherwise, we jump off to
	 * the more general copy routine handling arbitrary
	 * sizes/alignment etc.
	 */
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc
	or t0=in0,in1
	;;

	or t0=t0,in2			// t0 = dst | src | len; low 3 bits tested below
	.save pr, saved_pr
	mov saved_pr=pr

	.body

	cmp.eq p6,p0=in2,r0		// zero length?
	mov retval=in0			// return dst
(p6)	br.ret.spnt.many rp		// zero length, return immediately
	;;

	mov dst=in0			// copy because of rotation
	shr.u cnt=in2,3			// number of 8-byte words to copy
	mov pr.rot=1<<16		// rotating predicates: only p16 set
	;;

	adds cnt=-1,cnt			// br.ctop is repeat/until
	cmp.gtu p7,p0=16,in2		// copying less than 16 bytes?
	mov ar.ec=N
	;;

	and t0=0x7,t0			// t0 = (dst | src | len) & 7
	mov ar.lc=cnt
	;;
	cmp.ne p6,p0=t0,r0		// p6 = something is not a multiple of 8

	mov src=in1			// copy because of rotation
(p7)	br.cond.spnt.few .memcpy_short	// tested first: any length < 16 goes here
(p6)	br.cond.spnt.few .memcpy_long
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	.rotr val[N]
	.rotp p[N]
	.align 32
	/*
	 * Aligned loop: the load is issued in stage 0 and the matching
	 * store in stage N-1, one 8-byte word per iteration.
	 */
1: { .mib
(p[0])	ld8 val[0]=[src],8
	nop.i 0
	brp.loop.imp 1b, 2f
}
2: { .mfb
(p[N-1])st8 [dst]=val[N-1],8
	nop.f 0
	br.ctop.dptk.few 1b
}
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Small (<16 bytes) copying is done via a simple byte-at-a-time
	 * copy loop, whatever the alignment.  This performs relatively poorly
	 * on Itanium, but it doesn't get used very often (gcc inlines small
	 * copies) and due to atomicity issues, we want to avoid
	 * read-modify-write of entire words.
	 */
	.align 32
.memcpy_short:
	adds cnt=-1,in2			// br.ctop is repeat/until
	mov ar.ec=MEM_LAT
	brp.loop.imp 1f, 2f
	;;
	mov ar.lc=cnt
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
	nop.m	0
	;;
	/*
	 * It is faster to put a stop bit in the loop here because it makes
	 * the pipeline shorter (and latency is what matters on short copies).
	 */
	.align 32
1: { .mib
(p[0])	ld1 val[0]=[src],1
	nop.i 0
	brp.loop.imp 1b, 2f
} ;;
2: { .mfb
(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
	nop.f 0
	br.ctop.dptk.few 1b
} ;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,-1
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	/*
	 * Large (>= 16 bytes) copying is done in a fancy way.  Latency isn't
	 * an overriding concern here, but throughput is.  We first do
	 * sub-word copying until the destination is aligned, then we check
	 * if the source is also aligned.  If so, we do a simple load/store-loop
	 * until there are less than 8 bytes left over and then we do the tail,
	 * by storing the last few bytes using sub-word copying.  If the source
	 * is not aligned, we branch off to the non-congruent loop.
	 *
	 *   stage:   op:
	 *         0  ld
	 *	   :
	 * MEM_LAT+3  shrp
	 * MEM_LAT+4  st
	 *
	 * On Itanium, the pipeline itself runs without stalls.  However,  br.ctop
	 * seems to introduce an unavoidable bubble in the pipeline so the overall
	 * latency is 2 cycles/iteration.  This gives us a _copy_ throughput
	 * of 4 byte/cycle.  Still not bad.
	 */

#	undef N
#	undef Nrot
#	define N	(MEM_LAT + 5)		/* number of stages */
#	define Nrot	((N+1 + 2 + 7) & ~7)	/* number of rotating regs */

	/*
	 * log2 of the byte distance between consecutive COPY() expansions in
	 * .memcpy_loops; the computed branch below relies on every expansion
	 * occupying exactly (1 << LOG_LOOP_SIZE) bytes.
	 */
#define LOG_LOOP_SIZE	6

.memcpy_long:
	alloc t3=ar.pfs,3,Nrot,0,Nrot	// resize register frame
	and t0=-8,src			// t0 = src & ~7
	and t2=7,src			// t2 = src & 7
	;;
	ld8 t0=[t0]			// t0 = 1st source word
	adds src2=7,src			// src2 = (src + 7)
	sub t4=r0,dst			// t4 = -dst
	;;
	and src2=-8,src2		// src2 = (src + 7) & ~7
	shl t2=t2,3			// t2 = 8*(src & 7)
	shl t4=t4,3			// t4 = 8*(-dst); bits 3-5 now hold (-dst) & 7
	;;
	ld8 t1=[src2]			// t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
	sub t3=64,t2			// t3 = 64-8*(src & 7)
	shr.u t0=t0,t2
	;;
	add src_end=src,in2
	shl t1=t1,t3
	mov pr=t4,0x38			// (p5,p4,p3)=(-dst & 7), i.e. bytes needed to align dst
	;;
	or t0=t0,t1			// t0 = first 8 source bytes, starting at src
	mov cnt=r0
	adds src_end=-1,src_end
	;;
	// Store 1, 2 and/or 4 leading bytes so that dst becomes 8-byte aligned;
	// cnt accumulates how many bytes were stored.
(p3)	st1 [dst]=t0,1
(p3)	shr.u t0=t0,8
(p3)	adds cnt=1,cnt
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
(p4)	adds cnt=2,cnt
	;;
(p5)	st4 [dst]=t0,4
(p5)	adds cnt=4,cnt
	and src_end=-8,src_end		// src_end = last word of source buffer
	;;

	// At this point, dst is aligned to 8 bytes and there are at least 16-7=9 bytes left to copy:

1:{	add src=cnt,src			// make src point to remainder of source buffer
	sub cnt=in2,cnt			// cnt = number of bytes left to copy
	mov t4=ip			// t4 = address of this bundle (label 1)
  }	;;
	and src2=-8,src			// align source pointer
	adds t4=.memcpy_loops-1b,t4	// t4 = address of .memcpy_loops
	mov ar.ec=N

	and t0=7,src			// t0 = src & 7
	shr.u t2=cnt,3			// t2 = number of 8-byte words left to copy
	shl cnt=cnt,3			// move bits 0-2 to 3-5
	;;

	.rotr val[N+1], w[2]
	.rotp p[N]

	cmp.ne p6,p0=t0,r0		// is src aligned, too?
	shl t0=t0,LOG_LOOP_SIZE		// t0 = (src & 7) << LOG_LOOP_SIZE = offset of COPY loop to use
	adds t2=-1,t2			// br.ctop is repeat/until
	;;
	add t4=t0,t4			// t4 = address of the COPY loop for this src misalignment
	mov pr=cnt,0x38			// set (p5,p4,p3) to the number of last-word bytes to copy
	mov ar.lc=t2
	;;
	nop.m	0
	;;
	nop.m	0
	nop.i	0
	;;
	nop.m	0
	;;
(p6)	ld8 val[1]=[src2],8		// prime the pump...
	mov b6=t4
	br.sptk.few b6			// computed branch into .memcpy_loops
	;;

.memcpy_tail:
	// At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
	// less than 8) and t0 contains the last few bytes of the src buffer:
(p5)	st4 [dst]=t0,4
(p5)	shr.u t0=t0,32
	mov ar.lc=saved_lc
	;;
(p4)	st2 [dst]=t0,2
(p4)	shr.u t0=t0,16
	mov ar.pfs=saved_pfs
	;;
(p3)	st1 [dst]=t0
	mov pr=saved_pr,-1
	br.ret.sptk.many rp

///////////////////////////////////////////////////////
	.align 64

	/*
	 * COPY(shift, index): one word-copy loop plus the hand-off to
	 * .memcpy_tail, specialised for one source misalignment.
	 *
	 *   shift - bit count passed to shrp (8 * source byte offset in the
	 *           invocations below).
	 *   index - selects the second shrp operand: with index 0 the pair is
	 *           (val[MEM_LAT+3], val[MEM_LAT+4]); with index 1 both operands
	 *           are val[MEM_LAT+3].
	 *
	 * Loop body: stage 0 loads a source word, stage MEM_LAT+3 merges two
	 * loaded words with shrp into w[0], and stage MEM_LAT+4 stores that
	 * result (by then rotated into w[1]).  After the loop, the word at
	 * src_end is loaded and merged the same way into t0 for .memcpy_tail.
	 */
#define COPY(shift,index)									\
 1: { .mib											\
	(p[0])		ld8 val[0]=[src2],8;							\
	(p[MEM_LAT+3])	shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift;			\
			brp.loop.imp 1b, 2f							\
    };												\
 2: { .mfb											\
	(p[MEM_LAT+4])	st8 [dst]=w[1],8;							\
			nop.f 0;								\
			br.ctop.dptk.few 1b;							\
    };												\
			;;									\
			ld8 val[N-1]=[src_end];	/* load last word (may be same as val[N]) */	\
			;;									\
			shrp t0=val[N-1],val[N-index],shift;					\
			br .memcpy_tail

	/* One loop per value of (src & 7), in order; selected by the computed branch above. */
.memcpy_loops:
	COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
	COPY(8, 0)
	COPY(16, 0)
	COPY(24, 0)
	COPY(32, 0)
	COPY(40, 0)
	COPY(48, 0)
	COPY(56, 0)

END(memcpy)
2016-01-17 01:13:41 -05:00
/* Export the memcpy symbol (EXPORT_SYMBOL is presumably provided by <asm/export.h>). */
EXPORT_SYMBOL(memcpy)