2005-04-16 15:20:36 -07:00
/*
 * McKinley-optimized version of copy_page().
 *
 * Copyright (C) 2002 Hewlett-Packard Co
 *      David Mosberger <davidm@hpl.hp.com>
 *
 * Inputs:
 *      in0:    address of target page
 *      in1:    address of source page
 * Output:
 *      no return value
 *
 * General idea:
 *      - use regular loads and stores to prefetch data to avoid consuming M-slot just for
 *        lfetches => good for in-cache performance
 *      - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
 *        cycle
 *
 * Principle of operation:
 *      First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
 *      To avoid secondary misses in L2, we prefetch both source and destination with a line-size
 *      of 128 bytes.  When both of these lines are in the L2 and the first half of the
 *      source line is in L1, we start copying the remaining words.  The second half of the
 *      source line is prefetched in an earlier iteration, so that by the time we start
 *      accessing it, it's also present in the L1.
 *
 *      We use a software-pipelined loop to control the overall operation.  The pipeline
 *      has 2*PREFETCH_DIST+K stages.  The first PREFETCH_DIST stages are used for prefetching
 *      source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
 *      cache-lines, the last K stages are used to copy the cache-line words not copied by
 *      the prefetches.  The four relevant points in the pipeline are called A, B, C, D:
 *      p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
 *      should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
 *      into L1D and p[D] is TRUE if a cacheline needs to be copied.
 *
 *      This all sounds very complicated, but thanks to the modulo-scheduled loop support,
 *      the resulting code is very regular and quite easy to follow (once you get the idea).
 *
 *      As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
 *      as the separate .prefetch_loop.  Logically, this loop performs exactly like the
 *      main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
 *      so that each loop iteration is faster (again, good for cached case).
 *
 *      When reading the code, it helps to keep the following picture in mind:
 *
 *             word 0   word 1
 *            +------+------+---
 *            | v[x] |  t1  | ^
 *            | t2   |  t3  | |
 *            | t4   |  t5  | |
 *            | t6   |  t7  | | 128 bytes
 *            | n[y] |  t9  | | (L2 cache line)
 *            | t10  |  t11 | |
 *            | t12  |  t13 | |
 *            | t14  |  t15 | v
 *            +------+------+---
 *
 *      Here, v[x] is copied by the (memory) prefetch.  n[y] is loaded at p[C]
 *      to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
 *      an order that avoids bank conflicts.
 */
# include < a s m / a s m m a c r o . h >
# include < a s m / p a g e . h >
2016-01-17 01:13:41 -05:00
# include < a s m / e x p o r t . h >
2005-04-16 15:20:36 -07:00
// Register and pipeline-stage names used by copy_page.
//
// PREFETCH_DIST is the distance, in loop iterations, between the pipeline
// stages A, B and C defined at the end of this table.
# define P R E F E T C H _ D I S T 8 / / M c K i n l e y s u s t a i n s 1 6 o u t s t a n d i n g L 2 m i s s e s ( 8 l d , 8 s t )
// src0/src1 and dst0/dst1 are the two post-incremented pointer pairs that
// walk the tX words of each 128-byte line in the main copy loop.
# define s r c0 r2
# define s r c1 r3
# define d s t 0 r9
# define d s t 1 r10
// *_pre_mem step through the pages 128 bytes at a time starting at word 0 of
// each line; *_pre_l2 do the same starting 8*8 bytes into each line.
# define s r c _ p r e _ m e m r11
# define d s t _ p r e _ m e m r14
# define s r c _ p r e _ l 2 r15
# define d s t _ p r e _ l 2 r16
// Data temporaries.  Only six physical registers back the fourteen tX names:
//   r17: t1, t5, t9     r18: t2, t6, t12    r19: t3, t7, t11
//   r20: t4, t10, t14   r21: t13            r22: t15
// An aliased name may therefore only be loaded after the previous user of the
// same register has been stored.  There is no t8 define.
# define t 1 r17
# define t 2 r18
# define t 3 r19
# define t 4 r20
# define t 5 t 1 / / a l i a s !
# define t 6 t 2 / / a l i a s !
# define t 7 t 3 / / a l i a s !
# define t 9 t 5 / / a l i a s !
# define t 1 0 t 4 / / a l i a s !
# define t 1 1 t 7 / / a l i a s !
# define t 1 2 t 6 / / a l i a s !
# define t 1 4 t 1 0 / / a l i a s !
# define t 1 3 r21
# define t 1 5 r22
// Hold the caller's ar.lc and predicate registers for the duration of the
// routine; both are written back just before the return.
# define s a v e d _ l c r23
# define s a v e d _ p r r24
// Pipeline stage indices used as p[A], p[B], p[C], p[D].  With
// PREFETCH_DIST = 8 these evaluate to A = 0, B = 8, C = 16, D = 19.
// N = D + 1 = 20 is the number of pipeline stages, and Nrot is N rounded up
// to the next multiple of 8 (24); Nrot is the rotating-register count passed
// to the alloc instruction in copy_page.
# define A 0
# define B ( P R E F E T C H _ D I S T )
# define C ( B + P R E F E T C H _ D I S T )
# define D ( C + 3 )
# define N ( D + 1 )
# define N r o t ( ( N + 7 ) & ~ 7 )
// copy_page: copy one page from the source page to the target page.
//
//   in0 = address of target page
//   in1 = address of source page
//   no return value
//
// The page is processed in 128-byte lines.  A short warm-up loop
// (.prefetch_loop, 2*PREFETCH_DIST iterations) starts the word-0 loads and
// stores, then the software-pipelined main loop (.line_copy) copies the
// remaining words of each line.  ar.lc and the predicate registers are saved
// on entry and restored before returning.
GLOBAL_ E N T R Y ( c o p y _ p a g e )
.prologue
// Register frame: 2 inputs, Nrot-2 locals, 0 outputs, Nrot rotating
// registers; the previous ar.pfs value is placed in r8.
alloc r8 = a r . p f s , 2 , N r o t - 2 , 0 , N r o t
// Name the rotating registers: v[] carries word 0 of each line from its load
// to its store, n[] does the same for the word loaded through src_pre_l2,
// and p[] are the stage predicates.
.rotr v[ 2 * P R E F E T C H _ D I S T ] , n [ D - C + 1 ]
.rotp p[ N ]
.save ar. l c , s a v e d _ l c
mov s a v e d _ l c = a r . l c
.save pr, s a v e d _ p r
mov s a v e d _ p r = p r
.body
mov s r c _ p r e _ m e m = i n 1
// Only bit 16 is set, so of the rotating predicates only p16 (= p[A]) starts
// out true.
mov p r . r o t = 0 x10 0 0 0
mov a r . e c = 1 / / s p e c i a l u n r o l l e d l o o p
mov d s t _ p r e _ m e m = i n 0
// Loop count for .prefetch_loop: 2*PREFETCH_DIST iterations.
mov a r . l c = 2 * P R E F E T C H _ D I S T - 1
// The *_pre_l2 pointers start 64 bytes into the first line (the n[y] slot in
// the picture in the file header).
add s r c _ p r e _ l 2 = 8 * 8 , i n 1
add d s t _ p r e _ l 2 = 8 * 8 , i n 0
add s r c0 = 8 , i n 1 / / f i r s t t 1 s r c
add s r c1 = 3 * 8 , i n 1 / / f i r s t t 3 s r c
add d s t 0 = 8 , i n 0 / / f i r s t t 1 d s t
add d s t 1 = 3 * 8 , i n 0 / / f i r s t t 3 d s t
// t1 temporarily holds the loop count for .line_copy; it is moved into ar.lc
// after the prefetch loop, before t1 is first used as a data temporary.
mov t 1 = ( P A G E _ S I Z E / 1 2 8 ) - ( 2 * P R E F E T C H _ D I S T ) - 1
nop. m 0
nop. i 0
;;
/ / same a s . l i n e _ c o p y l o o p , b u t w i t h a l l p r e d i c a t e d - o f f i n s t r u c t i o n s r e m o v e d :
// Each iteration loads word 0 of the next source line into v[A]; once p[B]
// has become true, it also stores the word loaded PREFETCH_DIST iterations
// earlier (v[B]) to word 0 of the matching destination line.
.prefetch_loop :
( p[ A ] ) l d8 v [ A ] = [ s r c _ p r e _ m e m ] , 1 2 8 / / M 0
( p[ B ] ) s t 8 [ d s t _ p r e _ m e m ] = v [ B ] , 1 2 8 / / M 2
br. c t o p . s p t k . p r e f e t c h _ l o o p
;;
// Re-arm the pipeline for the main loop: set p16 again, load the main-loop
// count computed above, and set the epilogue count to the N pipeline stages.
cmp. e q p16 , p0 = r0 , r0 / / r e s e t p16 t o 1 ( b r . c t o p c l e a r e d i t t o z e r o )
mov a r . l c = t 1 / / w i t h 6 4 K B p a g e s , t 1 i s t o o b i g t o f i t i n 8 b i t s !
mov a r . e c = N / / # o f s t a g e s i n p i p e l i n e
;;
// Main loop: one 128-byte line per iteration, in eight instruction groups of
// two loads and two stores each.  src0/dst0 visit t1, t2, t5, t6, t9, t12,
// t13 and src1/dst1 visit t3, t4, t7, t10, t11, t14, t15 (see the picture in
// the file header); the post-increments of 8, 3*8 and 4*8 step each pointer
// to its next word.  Each store writes a value loaded in an earlier group, so
// the register aliases are never overwritten before they are stored.
.line_copy :
// Group 1: load t2/t4; continue the word-0 destination stores (p[B]) and
// store the n[] word that was loaded D-C iterations earlier (p[D]).
( p[ D ] ) l d8 t 2 = [ s r c0 ] , 3 * 8 / / M 0
( p[ D ] ) l d8 t 4 = [ s r c1 ] , 3 * 8 / / M 1
( p[ B ] ) s t 8 [ d s t _ p r e _ m e m ] = v [ B ] , 1 2 8 / / M 2 p r e f e t c h d s t f r o m m e m o r y
( p[ D ] ) s t 8 [ d s t _ p r e _ l 2 ] = n [ D - C ] , 1 2 8 / / M 3 p r e f e t c h d s t f r o m L 2
;;
// Group 2: continue the word-0 source loads (p[A]) and load the word at
// offset 64 of a line (p[C]); store t1/t3, which were loaded at the end of
// the previous iteration.
( p[ A ] ) l d8 v [ A ] = [ s r c _ p r e _ m e m ] , 1 2 8 / / M 0 p r e f e t c h s r c f r o m m e m o r y
( p[ C ] ) l d8 n [ 0 ] = [ s r c _ p r e _ l 2 ] , 1 2 8 / / M 1 p r e f e t c h s r c f r o m L 2
( p[ D ] ) s t 8 [ d s t 0 ] = t 1 , 8 / / M 2
( p[ D ] ) s t 8 [ d s t 1 ] = t 3 , 8 / / M 3
;;
// Groups 3-7: load the next pair of words while storing the previous pair.
( p[ D ] ) l d8 t 5 = [ s r c0 ] , 8
( p[ D ] ) l d8 t 7 = [ s r c1 ] , 3 * 8
( p[ D ] ) s t 8 [ d s t 0 ] = t 2 , 3 * 8
( p[ D ] ) s t 8 [ d s t 1 ] = t 4 , 3 * 8
;;
( p[ D ] ) l d8 t 6 = [ s r c0 ] , 3 * 8
( p[ D ] ) l d8 t 1 0 = [ s r c1 ] , 8
( p[ D ] ) s t 8 [ d s t 0 ] = t 5 , 8
( p[ D ] ) s t 8 [ d s t 1 ] = t 7 , 3 * 8
;;
( p[ D ] ) l d8 t 9 = [ s r c0 ] , 3 * 8
( p[ D ] ) l d8 t 1 1 = [ s r c1 ] , 3 * 8
( p[ D ] ) s t 8 [ d s t 0 ] = t 6 , 3 * 8
( p[ D ] ) s t 8 [ d s t 1 ] = t 1 0 , 8
;;
( p[ D ] ) l d8 t 1 2 = [ s r c0 ] , 8
( p[ D ] ) l d8 t 1 4 = [ s r c1 ] , 8
( p[ D ] ) s t 8 [ d s t 0 ] = t 9 , 3 * 8
( p[ D ] ) s t 8 [ d s t 1 ] = t 1 1 , 3 * 8
;;
// The 4*8 post-increments move src0/src1 past the end of the current line to
// the t1/t3 words of the next line.
( p[ D ] ) l d8 t 1 3 = [ s r c0 ] , 4 * 8
( p[ D ] ) l d8 t 1 5 = [ s r c1 ] , 4 * 8
( p[ D ] ) s t 8 [ d s t 0 ] = t 1 2 , 8
( p[ D ] ) s t 8 [ d s t 1 ] = t 1 4 , 8
;;
// Group 8: t1/t3 of the next line are loaded one stage early (p[D-1]) so
// they are ready for the stores in group 2 of the next iteration; the last
// two words of the current line are stored and dst0/dst1 advance by 4*8 to
// the next line as well.
( p[ D - 1 ] ) l d8 t 1 = [ s r c0 ] , 8
( p[ D - 1 ] ) l d8 t 3 = [ s r c1 ] , 8
( p[ D ] ) s t 8 [ d s t 0 ] = t 1 3 , 4 * 8
( p[ D ] ) s t 8 [ d s t 1 ] = t 1 5 , 4 * 8
br. c t o p . s p t k . l i n e _ c o p y
;;
// Restore the caller's loop counter and predicate registers (mask -1), then
// return.
mov a r . l c = s a v e d _ l c
mov p r = s a v e d _ p r , - 1
br. r e t . s p t k . m a n y r p
END( c o p y _ p a g e )
2016-01-17 01:13:41 -05:00
// NOTE(review): presumably makes copy_page usable from outside this file; the
// macro is not defined here (likely provided by <asm/export.h>) -- confirm.
EXPORT_ S Y M B O L ( c o p y _ p a g e )