2005-04-16 15:20:36 -07:00
/*
 * Itanium 2-optimized version of memcpy and copy_user function
 *
 * Inputs:
 *	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 *	0 if success, or number of bytes NOT copied if an error occurred.
 *
 * Copyright (C) 2002 Intel Corp.
 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
 */
# include < l i n u x / c o n f i g . h >
# include < a s m / a s m m a c r o . h >
# include < a s m / p a g e . h >
/* EK() is just EX() under another name; both take the handler label first. */
#define EK(y...)	EX(y)

/* McKinley specific optimization */

/* Return value and the registers that preserve caller/entry state. */
#define retval		r8
#define saved_pfs	r31
#define saved_lc	r10
#define saved_pr	r11
#define saved_in0	r14
#define saved_in1	r15
#define saved_in2	r16

/* Working source/destination pointer pairs and the loop counter. */
#define src0		r2
#define src1		r3
#define dst0		r17
#define dst1		r18
#define cnt		r9

/* r19-r30 are temp for each code section */
#define PREFETCH_DIST	8
#define src_pre_mem	r19
#define dst_pre_mem	r20
#define src_pre_l2	r21
#define dst_pre_l2	r22

/*
 * Data temporaries.  Several names deliberately share one physical
 * register (t5/t9 -> r23, t6/t12 -> r24, t7/t11 -> r25, t10/t14 -> r26).
 */
#define t1		r23
#define t2		r24
#define t3		r25
#define t4		r26
#define t5		t1	/* alias! */
#define t6		t2	/* alias! */
#define t7		t3	/* alias! */
#define n8		r27
#define t9		t5	/* alias! */
#define t10		t4	/* alias! */
#define t11		t7	/* alias! */
#define t12		t6	/* alias! */
#define t14		t10	/* alias! */
#define t13		r28
#define t15		r29
#define tmp		r30

/* defines for long_copy block: rotating-predicate stage indices */
#define A		0
#define B		(PREFETCH_DIST)
#define C		(B + PREFETCH_DIST)
#define D		(C + 1)
#define N		(D + 1)
#define Nrot		((N + 7) & ~7)	/* N rounded up to a multiple of 8 */

/* alias: the three input arguments */
#define in0		r32
#define in1		r33
#define in2		r34
/*
 * memcpy(in0 = dest, in1 = src, in2 = len)
 *
 * Returns dest in retval (r8), as the C memcpy() contract requires.
 * f6 is set to f0 so the exception handler can tell this entry point
 * apart from __copy_user (it tests f6 against f0).
 * r28/r29 receive the low three bits of dest/src for the alignment
 * checks in .common_code.
 */
GLOBAL_ENTRY(memcpy)
	and	r28=0x7,in0
	and	r29=0x7,in1
	mov	f6=f0
	mov	retval=in0		// memcpy returns the destination pointer
	br.cond.sptk .common_code
	;;
END(memcpy)

/*
 * __copy_user(in0 = dest, in1 = src, in2 = len)
 *
 * Returns 0 in retval on success; the exception handler overwrites
 * retval when a fault occurs.  f6 is set to f1 (i.e. "not memcpy").
 * The original dest/src pointers are saved for the exception handler.
 */
GLOBAL_ENTRY(__copy_user)
	.prologue
// check dest alignment
	and	r28=0x7,in0
	and	r29=0x7,in1
	mov	f6=f1
	mov	saved_in0=in0		// save dest pointer
	mov	saved_in1=in1		// save src pointer
	mov	retval=r0		// success value; must not be set on the shared path
	;;

/*
 * Shared by both entry points.  On arrival r28/r29 hold dest/src
 * misalignment and retval already holds the entry point's success value,
 * so nothing below may write retval.
 */
.common_code:
	cmp.gt	p15,p0=8,in2		// check for small size (len < 8)
	cmp.ne	p13,p0=0,r28		// check dest alignment
	cmp.ne	p14,p0=0,r29		// check src alignment
	add	src0=0,in1
	sub	r30=8,r28		// for .align_dest: distance to 8-byte boundary
	mov	saved_in2=in2		// save len (the handler loads retval from it)
	;;
	add	dst0=0,in0
	add	dst1=1,in0		// dest odd index
	cmp.le	p6,p0 = 1,r30		// for .align_dest
(p15)	br.cond.dpnt .memcpy_short
(p13)	br.cond.dpnt .align_dest
(p14)	br.cond.dpnt .unaligned_src
	;;
/*
 * Both dest and src are aligned on an 8-byte boundary.
 * Sets up a register frame with Nrot rotating registers, saves pr and
 * ar.lc, then either branches to .long_copy (more than 2*PREFETCH_DIST
 * 128-byte lines) or prefetches and falls into .medium_copy.
 */
.aligned_src:
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
	.save pr, saved_pr
	mov	saved_pr=pr
	shr.u	cnt=in2,7		// len / 128: number of lines
	;;
	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt
	cmp.lt	p7,p8=1,cnt
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.body
	add	cnt=-1,cnt
	add	src_pre_mem=0,in1	// src prefetch pointer
	add	dst_pre_mem=0,in0	// dest prefetch pointer
	;;
(p7)	mov	ar.lc=cnt		// prefetch loop count
(p8)	mov	ar.lc=r0
(p6)	br.cond.dpnt .long_copy
	;;

/* Touch one line of src and dest (dest exclusive) per iteration. */
.prefetch:
	lfetch.fault	  [src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few .prefetch
	;;

/*
 * Copy 32 bytes per loop iteration through two pointer pairs that are
 * 8 bytes apart; tmp keeps len % 32 for the tail.
 */
.medium_copy:
	and	tmp=31,in2		// bytes left once the loop is done
	shr.u	r29=in2,5		// count of 32-byte iterations
	add	dst1=8,dst0		// second dest pointer
	;;
	add	cnt=-1,r29		// br.ctop counts from n-1
	cmp.eq	p10,p0=r29,r0		// zero iterations: skip the loop
	add	src1=8,src0		// second src pointer
	cmp.le	p6,p0=8,tmp
	;;
	cmp.le	p7,p0=16,tmp
	mov	ar.lc=cnt		// loop setup
	cmp.eq	p16,p17 = r0,r0
	mov	ar.ec=2
(p10)	br.dpnt.few .aligned_src_tail
	;;
	TEXT_ALIGN(32)
1:
EX(.ex_handler, (p16)	ld8	r34=[src0],16)
EK(.ex_handler, (p16)	ld8	r38=[src1],16)
EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
	;;
EX(.ex_handler, (p16)	ld8	r32=[src0],16)
EK(.ex_handler, (p16)	ld8	r36=[src1],16)
EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
	br.ctop.dptk.few 1b
	;;

/*
 * Up to three remaining 8-byte words (p6: >= 8, p7: >= 16, p8: >= 24
 * bytes left in tmp).  Restores ar.lc, ar.pfs and pr, then hands the
 * last tmp % 8 bytes to .memcpy_short.
 */
.aligned_src_tail:
EX(.ex_handler, (p6)	ld8	t1=[src0])
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
	cmp.le	p8,p0=24,tmp
	and	r21=-8,tmp		// tail bytes rounded down to 8
	;;
EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
EX(.ex_handler, (p6)	st8	[dst0]=t1)	// 1st tail word
	and	in2=7,tmp		// length left for .memcpy_short
EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// 2nd tail word
	add	src0=src0,r21		// advance src past the tail words
	add	dst0=dst0,r21		// advance dest past the tail words
	;;
EX(.ex_handler, (p8)	st8	[dst1]=t3)	// 3rd tail word
	mov	pr=saved_pr,-1
	br.dptk.many .memcpy_short
	;;
/*
 * Long aligned copy, one 128-byte line per .line_copy iteration.
 * (code taken from copy_page_mck)
 *
 * The instruction order inside the loops is part of the software
 * pipeline; do not reorder.
 */
.long_copy:
	.rotr v[2*PREFETCH_DIST]
	.rotp p[N]

	mov	src_pre_mem = src0
	mov	pr.rot = 0x10000
	mov	ar.ec = 1			// special unrolled loop
	mov	dst_pre_mem = dst0
	add	src_pre_l2 = 8*8, src0
	add	dst_pre_l2 = 8*8, dst0
	;;
	add	src0 = 8, src_pre_mem		// first t1 src
	mov	ar.lc = 2*PREFETCH_DIST - 1
	shr.u	cnt=in2,7			// number of lines
	add	src1 = 3*8, src_pre_mem		// first t3 src
	add	dst0 = 8, dst_pre_mem		// first t1 dst
	add	dst1 = 3*8, dst_pre_mem		// first t3 dst
	;;
	and	tmp=127,in2			// bytes left over after whole lines
	add	cnt = -(2*PREFETCH_DIST) - 1, cnt

	// Same as the .line_copy loop with every predicated-off
	// instruction removed:
.prefetch_loop:
EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0
EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2
	br.ctop.sptk .prefetch_loop
	;;
	cmp.eq	p16, p0 = r0, r0		// set p16 back to 1
	mov	ar.lc = cnt
	mov	ar.ec = N			// number of pipeline stages
	;;

.line_copy:
EX(.ex_handler,	(p[D])	ld8 t2 = [src0], 3*8)			// M0
EK(.ex_handler,	(p[D])	ld8 t4 = [src1], 3*8)			// M1
EX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)		// M2 prefetch dst from memory
EK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)		// M3 prefetch dst from L2
	;;
EX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)		// M0 prefetch src from memory
EK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)		// M1 prefetch src from L2
EX(.ex_handler,	(p[D])	st8 [dst0] = t1, 8)			// M2
EK(.ex_handler,	(p[D])	st8 [dst1] = t3, 8)			// M3
	;;
EX(.ex_handler,	(p[D])	ld8 t5 = [src0], 8)
EK(.ex_handler,	(p[D])	ld8 t7 = [src1], 3*8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t2, 3*8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t4, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8 t6 = [src0], 3*8)
EK(.ex_handler,	(p[D])	ld8 t10 = [src1], 8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t5, 8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t7, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8 t9 = [src0], 3*8)
EK(.ex_handler,	(p[D])	ld8 t11 = [src1], 3*8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t6, 3*8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t10, 8)
	;;
EX(.ex_handler,	(p[D])	ld8 t12 = [src0], 8)
EK(.ex_handler,	(p[D])	ld8 t14 = [src1], 8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t9, 3*8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t11, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8 t13 = [src0], 4*8)
EK(.ex_handler,	(p[D])	ld8 t15 = [src1], 4*8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t12, 8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t14, 8)
	;;
EX(.ex_handler,	(p[C])	ld8 t1 = [src0], 8)
EK(.ex_handler,	(p[C])	ld8 t3 = [src1], 8)
EX(.ex_handler,	(p[D])	st8 [dst0] = t13, 4*8)
EK(.ex_handler,	(p[D])	st8 [dst1] = t15, 4*8)
	br.ctop.sptk .line_copy
	;;

	// Step both pointers back by 8 and let .medium_copy finish the
	// remaining tmp (< 128) bytes.
	add	dst0=-8,dst0
	add	src0=-8,src0
	mov	in2=tmp
	.restore sp
	br.sptk.many .medium_copy
	;;
#define BLOCK_SIZE	128*32
#define blocksize	r23
#define curlen		r24

/*
 * dest is on an 8-byte boundary, src is not.  The copy is done as
 * ld8 + ld8, shrp, st8 (at most 8 bytes per cycle), in chunks of at
 * most BLOCK_SIZE bytes per pass through .4k_block.
 */
.unaligned_src:
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,5,0,8
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.save pr, saved_pr
	mov	saved_pr=pr
	.body

.4k_block:
	mov	saved_in0=dst0		// every input argument must be saved
	mov	saved_in2=in2
	mov	blocksize=BLOCK_SIZE
	;;
	cmp.lt	p6,p7=blocksize,in2
	mov	saved_in1=src0
	;;
(p6)	mov	in2=blocksize		// clamp this pass to one block
	;;
	shr.u	r21=in2,7		// number of 128-byte lines
	shr.u	r22=in2,4		// number of 16-byte iterations
	and	curlen=15,in2		// bytes left after the iterations
	and	r30=7,src0		// source misalignment
	;;
	cmp.lt	p7,p8=1,r21
	add	cnt=-1,r21
	;;
	add	src_pre_mem=0,src0	// src prefetch pointer
	add	dst_pre_mem=0,dst0	// dest prefetch pointer
	and	src0=-8,src0		// first src pointer, rounded down to 8
(p7)	mov	ar.lc = cnt
(p8)	mov	ar.lc = r0
	;;
	TEXT_ALIGN(32)
1:
	lfetch.fault	  [src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few 1b
	;;

	shladd	dst1=r22,3,dst0		// second dest pointer
	shladd	src1=r22,3,src0		// second src pointer
	cmp.eq	p8,p9=r22,r0		// zero iterations: skip the loop
	cmp.le	p6,p7=8,curlen;		// at least 8 bytes left afterwards?
	add	cnt=-1,r22		// br.ctop counts from n-1
	;;
EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// prime the loop
EK(.ex_handler, (p9)	ld8	r37=[src1],8)
(p8)	br.dpnt.few .noloop
	;;

// The jump address is calculated from the src alignment.  Each COPYU
// expansion must occupy a power-of-two number of bytes, so that the
// entry can be computed with shl instead of an expensive multiply.
// That size is hard-coded by the #define below and has to be adjusted
// by hand whenever the COPYU macro changes.
#define LOOP_SIZE 6
1:
	mov	r29=ip			// jmp_table thread
	mov	ar.lc=cnt
	;;
	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
	shl	r28=r30, LOOP_SIZE	// jmp_table thread
	mov	ar.ec=2			// loop setup
	;;
	add	r29=r29,r28		// jmp_table thread
	cmp.eq	p16,p17=r0,r0
	;;
	mov	b6=r29			// jmp_table thread
	;;
	br.cond.sptk.few b6

// 8-15 byte case: the loop is skipped, but the side effects it would
// have produced are replicated here.
.noloop:
EX(.ex_handler, (p6)	ld8	r37=[src1],8)
	add	src0=8,src0
(p6)	shl	r25=r30,3		// misalignment in bits
	;;
EX(.ex_handler, (p6)	ld8	r27=[src1])
(p6)	shr.u	r28=r37,r25
(p6)	sub	r26=64,r25
	;;
(p6)	shl	r27=r27,r26
	;;
(p6)	or	r21=r28,r27		// r21 = merged 8 bytes for the tail store
.unaligned_src_tail:
	/* More than blocksize was requested for this pass: go around again. */
	cmp.gt	p8,p0=saved_in2,blocksize
	;;
(p8)	add	dst0=saved_in0,blocksize
(p8)	add	src0=saved_in1,blocksize
(p8)	sub	in2=saved_in2,blocksize
(p8)	br.dpnt	.4k_block
	;;

/*
 * Up to 15 bytes remain for the tail.  Part of the work was already
 * done by the jump table code, which leaves us in this state:
 *
 * src side:
 *
 *   xxxxxx xx                   <----- r21 has xxxxxxxx already
 * -------- -------- --------
 * 0        8        16
 *          ^
 *          |
 *          src1
 *
 * dst
 * -------- -------- --------
 * ^
 * |
 * dst1
 */
EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// 8 or more bytes were left
(p6)	add	curlen=-8,curlen		// account for the store above
	mov	ar.pfs=saved_pfs
	;;
	mov	ar.lc=saved_lc
	mov	pr=saved_pr,-1
	mov	in2=curlen		// length left for .memcpy_short
	mov	dst0=dst1		// dest pointer
	add	src0=src1,r30		// re-apply the src misalignment
	;;
/*
 * Copy in2 bytes, 7 or fewer, one byte at a time.  Bytes alternate
 * between the (src0, dst0) and (src1, dst1) pointer pairs, each pair
 * stepping by 2.  Returns to rp as soon as the requested count is done.
 */
.memcpy_short:
	cmp.le	p8,p9   = 1,in2
	cmp.le	p10,p11 = 2,in2
	cmp.le	p12,p13 = 3,in2
	cmp.le	p14,p15 = 4,in2
	add	src1=1,src0		// odd-byte src pointer
	add	dst1=1,dst0		// odd-byte dest pointer
	;;
EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
(p9)	br.ret.dpnt rp			// len == 0
	;;
EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
(p11)	br.ret.dpnt rp			// len == 1
EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
(p13)	br.ret.dpnt rp			// len == 2
	;;
	cmp.le	p6,p7   = 5,in2
	cmp.le	p8,p9   = 6,in2
	cmp.le	p10,p11 = 7,in2
EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
(p15)	br.ret.dpnt rp			// len == 3
	;;
EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
(p7)	br.ret.dpnt rp			// len == 4
	;;
EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
(p9)	br.ret.dptk rp			// len == 5
EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
(p11)	br.ret.dptk rp			// len == 6
	;;
EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
	br.ret.dptk rp			// len == 7: all cases done
/*
 * Byte-copy forward until dest reaches the next 8-byte boundary.
 * There are at least 7 bytes to copy here, which is enough to crawl to
 * the boundary; lengths of 7 or less are handled at .memcpy_short.
 * How many bytes are crawled depends on the dest alignment.
 *
 *	src0 - source even index
 *	src1 - source odd index
 *	dst0 - dest even index
 *	dst1 - dest odd index
 *	r30  - distance to 8-byte boundary
 *
 * On exit in2, src0 and dst0 are advanced by r30, and control goes to
 * .aligned_src when r28 == r29 (same misalignment), else .unaligned_src.
 */
.align_dest:
	add	src1=1,in1		// source odd index
	cmp.le	p7,p0 = 2,r30		// p6..p12: crawl at least 1..7 bytes
	cmp.le	p8,p0 = 3,r30
EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
	cmp.le	p9,p0 = 4,r30
	cmp.le	p10,p0 = 5,r30
	;;
EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
	cmp.le	p11,p0 = 6,r30
EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
	cmp.le	p12,p0 = 7,r30
	;;
EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
	;;
EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
	cmp.eq	p6,p7=r28,r29		// do src and dest share a misalignment?
EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
	sub	in2=in2,r30		// bytes left after the crawl
	;;
EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
	add	dst0=in0,r30		// set up arguments for the bulk copy
	add	src0=in1,r30
(p6)	br.cond.dptk .aligned_src
(p7)	br.cond.dpnt .unaligned_src
	;;
/*
 * Main loop body in jump table format.
 *
 * Each COPYU(shift) expansion is one entry of .jump_table; "shift" is
 * the shrp bit count (8 * source misalignment).  The number of
 * instruction slots per expansion must stay in step with LOOP_SIZE,
 * because the dispatch code computes the entry address with a shift.
 */
#define COPYU(shift)									\
1:											\
EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
		 nop.m	0;								\
		 (p16)	shrp	r38=r36,r37,shift;					\
EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);	/* 1 */					\
EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
		 br.ctop.dptk.few 1b;;							\
		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
		 shrp	r21=r22,r38,shift;	/* speculative work */			\
		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
		 ;;

	TEXT_ALIGN(32)
.jump_table:
	COPYU(8)	// unaligned cases
.jmp1:
	COPYU(16)
	COPYU(24)
	COPYU(32)
	COPYU(40)
	COPYU(48)
	COPYU(56)
/* A-D were pipeline stage indices above; from here on they name registers. */
#undef A
#undef B
#undef C
#undef D

/*
 * Due to lack of local tag support in gcc 2.x assembler, it is not clear
 * which instruction failed in the bundle.  The exception algorithm is
 * that we first figure out the faulting address, then detect if there is
 * any progress made on the copy; if so, redo the copy from the last known
 * copied location up to the faulting address (exclusive).  In the
 * copy_from_user case, the remaining bytes in the kernel buffer will be
 * zeroed.
 *
 * Take copy_from_user as an example: in the code there are multiple loads
 * in a bundle and those multiple loads could span over two pages, so the
 * faulting address is calculated as page_round_down(max(src0, src1)).
 * This is based on the knowledge that if we can access one byte in a
 * page, we can access any byte in that page.
 *
 * predicates used in the exception handler:
 *	p6-p7:   direction
 *	p10-p11: src faulting addr calculation
 *	p12-p13: dst faulting addr calculation
 */

#define A		r19
#define B		r20
#define C		r21
#define D		r22
#define F		r28

#define memset_arg0	r32
#define memset_arg2	r33

#define saved_retval	loc0
#define saved_rtlink	loc1
#define saved_pfs_stack	loc2

/* Fault entries that first bump src0 or dst0 by 8. */
.ex_hndlr_s:
	add	src0=8,src0
	br.sptk .ex_handler
	;;
.ex_hndlr_d:
	add	dst0=8,dst0
	br.sptk .ex_handler
	;;

/* Fault in .prefetch_loop: rebuild src0/dst0 from the saved originals. */
.ex_hndlr_lcpy_1:
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem
	cmp.gtu	p10,p11=src_pre_mem,saved_in1
	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
	;;
(p10)	add	src0=8,saved_in1
(p11)	mov	src0=saved_in1
(p12)	add	dst0=8,saved_in0
(p13)	mov	dst0=saved_in0
	br.sptk	.ex_handler

.ex_handler_lcpy:
	// In the line_copy block the preload addresses should always be
	// ahead of the other two src/dst pointers.  Furthermore, src1/dst1
	// should always be ahead of src0/dst0.
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem

.ex_handler:
	mov	pr=saved_pr,-1		// restore pr, lc and pfs first
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
	;;

.ex_handler_short:	// faults from sections that did not change pr, lc, pfs
	cmp.ltu	p6,p7=saved_in0, saved_in1	// copy direction
	cmp.ltu	p10,p11=src0,src1
	cmp.ltu	p12,p13=dst0,dst1
	fcmp.eq	p8,p0=f6,f0		// did we enter through memcpy?
	mov	tmp = dst0
	;;
(p11)	mov	src1 = src0		// src1 = larger of the two
(p13)	mov	dst0 = dst1		// dst0 = smaller of the two
(p13)	mov	dst1 = tmp		// dst1 = larger of the two
	;;
(p6)	dep	F = r0,dst1,0,PAGE_SHIFT	// usr dst rounded down to page boundary
(p7)	dep	F = r0,src1,0,PAGE_SHIFT	// usr src rounded down to page boundary
	;;
(p6)	cmp.le	p14,p0=dst0,saved_in0	// stores made no progress
(p7)	cmp.le	p14,p0=src0,saved_in1	// loads made no progress
	mov	retval=saved_in2
(p8)	ld1	tmp=[src1]		// memcpy call: force an oops
(p8)	st1	[dst1]=r0		// memcpy call: force an oops
(p14)	br.ret.sptk.many rp

/*
 * The remaining byte to copy is calculated as:
 *
 * A =	(faulting_addr - orig_src)	-> len to faulting ld address
 *	or
 *	(faulting_addr - orig_dst)	-> len to faulting st address
 * B =	(cur_dst - orig_dst)		-> len copied so far
 * C =	A - B				-> len need to be copied
 * D =	orig_len - A			-> len need to be zeroed
 */
(p6)	sub	A = F, saved_in0
(p7)	sub	A = F, saved_in1
	clrrrb
	;;
	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
	sub	B = dst0, saved_in0	// bytes copied so far
	;;
	sub	C = A, B
	sub	D = saved_in2, A
	;;
	cmp.gt	p8,p0=C,r0		// anything left to copy?
	add	memset_arg0=saved_in0, A
(p6)	mov	memset_arg2=0		// copy_to_user should not call memset
(p7)	mov	memset_arg2=D		// copy_from_user needs kbuf zeroed
	mov	r8=0
	mov	saved_retval = D
	mov	saved_rtlink = b0

	add	out0=saved_in0, B
	add	out1=saved_in1, B
	mov	out2=C
(p8)	br.call.sptk.few b0=__copy_user	// recursive call
	;;
	add	saved_retval=saved_retval,r8	// the call above may return non-zero
	cmp.gt	p8,p0=memset_arg2,r0	// anything to zero?
	mov	out0=memset_arg0	// *s
	mov	out1=r0			// c
	mov	out2=memset_arg2	// n
(p8)	br.call.sptk.few b0=memset
	;;
	mov	retval=saved_retval
	mov	ar.pfs=saved_pfs_stack
	mov	b0=saved_rtlink
	br.ret.sptk.many rp

/* end of McKinley specific optimization */
END(__copy_user)