2005-04-16 15:20:36 -07:00
/* Optimized version of the standard memset() function.

   Copyright (c) 2002 Hewlett-Packard Co/CERN
	Sverre Jarp <Sverre.Jarp@cern.ch>

   Return: dest

   Inputs:
	in0:	dest
	in1:	value
	in2:	count

   The algorithm is fairly straightforward: set byte by byte until we
   get to a 16B-aligned address, then loop on 128B chunks using an
   early store as prefetching, then loop on 32B chunks, then clear remaining
   words, finally clear remaining bytes.
   Since a stf.spill f0 can store 16B in one go, we use this instruction
   to get peak speed when value = 0.  */
#include <asm/asmmacro.h>
#undef ret

// Incoming arguments (stacked input registers).
#define dest		in0			// destination address
#define value		in1			// fill value; replicated in place by mux1 @brcst
#define	cnt		in2			// bytes still to be written

// General-register aliases used by memset.
#define tmp		r31			// short-lived temporary (alignment offset, compare constants, loop counts)
#define save_lc		r30			// copy of ar.lc taken on entry
#define ptr0		r29			// second store pointer in the cache-line fill loops
#define ptr1		r28			// main store pointer
#define ptr2		r27			// store pointer for head alignment and paired stores
#define ptr3		r26			// address of the final byte in the short (< 16 byte) path
#define ptr9 		r24			// pointer for the store-ahead ("prefetch") stores
#define	loopcnt		r23			// value loaded into ar.lc
#define linecnt		r22			// cnt >> LSIZE_SH: number of whole cache lines
#define bytecnt		r21			// (MIN1+1) - (dest & MIN1): bytes up to the next 16B boundary

#define fvalue		f6			// replicated fill value on the FP side (setf.sig)

// This routine uses only scratch predicate registers (p6 - p15)
#define p_scr		p6			// default register for same-cycle branches
#define p_nz		p7			// value != 0 (written together with p_zr; not used as a qualifier)
#define p_zr		p8			// value == 0: selects the stf.spill f0 loop
#define p_unalgn	p9			// dest is not 16-byte aligned
#define p_y		p11			// general "yes" / "no" pair
#define p_n		p12
#define p_yy		p13			// second "yes" / "no" pair
#define p_nn		p14

#define MIN1		15			// alignment mask: 16 - 1
#define MIN1P1HALF	8			// (MIN1 + 1) / 2
#define LINE_SIZE	128			// bytes handled per iteration of the line loops
#define LSIZE_SH        7			// shift amount
#define PREF_AHEAD	8			// max number of lines stored ahead before the fill loop starts
/*
 * memset(dest, value, count)
 *
 * In:   in0 = dest, in1 = value, in2 = count
 * Out:  ret0 = dest
 *
 * Phases:
 *   1. count == 0 returns at once; count < 16 is handled entirely by
 *      .move_bytes_unaligned.
 *   2. Otherwise the head is filled up to the next 16-byte boundary with
 *      st8/st4/st2/st1, selected by the bits of bytecnt.
 *   3. Whole 128-byte lines are filled by the L1A loop (stf8 of fvalue) or,
 *      when value == 0, by the L1B loop (stf.spill of f0).  Both first store
 *      once into each of up to PREF_AHEAD lines ahead, then fill the rest.
 *   4. The remainder is written in 32-byte steps (.l2), then 8-byte words
 *      (.store_words), then st4/st2/st1 (.move_bytes_from_alignment).
 *
 * ar.lc is copied to save_lc on entry and written back on every exit taken
 * after this code has modified it; the count == 0 return happens before
 * ar.lc is touched.
 */
GLOBAL_ENTRY(memset)
{ .mmi
	.prologue
	alloc	tmp = ar.pfs, 3, 0, 0, 0	// 3 inputs; tmp is reused below, old ar.pfs is not kept
	lfetch.nt1 [dest]			//
	.save   ar.lc, save_lc
	mov.i	save_lc = ar.lc
	.body
} { .mmi
	mov	ret0 = dest			// return value
	cmp.ne	p_nz, p_zr = value, r0		// use stf.spill if value is zero
	cmp.eq	p_scr, p0 = cnt, r0
;; }
{ .mmi
	and	ptr2 = -(MIN1+1), dest		// aligned address
	and	tmp = MIN1, dest		// prepare to check for correct alignment
	tbit.nz p_y, p_n = dest, 0		// Do we have an odd address? (M_B_U)
} { .mib
	mov	ptr1 = dest
	mux1	value = value, @brcst		// create 8 identical bytes in word
(p_scr)	br.ret.dpnt.many rp			// return immediately if count = 0
;; }
{ .mib
	cmp.ne	p_unalgn, p0 = tmp, r0		//
} { .mib
	sub	bytecnt = (MIN1+1), tmp		// NB: # of bytes to move is 1 higher than loopcnt
	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
;; }

	// Head alignment: ptr1 becomes the next 16-byte boundary, ptr2 starts
	// 8 bytes below it.  Each bit of bytecnt (8, 4, 2, 1) decides whether a
	// store of that size is issued; ptr2 then steps down after a store, or
	// up when the store is skipped, to reach the slot for the next size.
{ .mmi
(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
;; }
{ .mib
(p_y)	add	cnt = -8, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
} { .mib
(p_y)	st8	[ptr2] = value,-4		//
(p_n)	add	ptr2 = 4, ptr2			//
;; }
{ .mib
(p_yy)	add	cnt = -4, cnt			//
(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
} { .mib
(p_yy)	st4	[ptr2] = value,-2		//
(p_nn)	add	ptr2 = 2, ptr2			//
;; }
{ .mmi
	mov	tmp = LINE_SIZE+1		// for compare
(p_y)	add	cnt = -2, cnt			//
(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
} { .mmi
	setf.sig fvalue=value			// transfer value to FLP side
(p_y)	st2	[ptr2] = value,-1		//
(p_n)	add	ptr2 = 1, ptr2			//
;; }

{ .mmi
(p_yy)	st1	[ptr2] = value			//
	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
} { .mbb
(p_yy)	add	cnt = -1, cnt			//
(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
;; }

{ .mib
	nop.m 0
	shr.u	linecnt = cnt, LSIZE_SH
(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
;; }

	TEXT_ALIGN(32) // --------------------- //  L1A: store ahead into cache lines; fill later
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt		//
	add	ptr2 = 8, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt			//
;; }
.pref_l1a:
{ .mib
	stf8 [ptr9] = fvalue, 128		// Do stores one cache line apart
	nop.i	0
	br.cloop.dptk.few .pref_l1a
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp			//
;; }
	// Per iteration ptr2 issues 8 stores and ptr0 issues 7 (15 x 8 bytes);
	// the first 8 bytes of each line were written through ptr9.
.l1ax:
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 8
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 24
	stf8 [ptr0] = fvalue, 24
 ;; }
 { .mmi
	stf8 [ptr2] = fvalue, 8
	stf8 [ptr0] = fvalue, 32
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
 ;; }
{ .mmb
	stf8 [ptr2] = fvalue, 24
(p_scr)	stf8 [ptr9] = fvalue, 128
	br.cloop.dptk.few .l1ax
;; }
{ .mbb
	cmp.le  p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
;; }

	TEXT_ALIGN(32)
.l1b:	// ------------------------------------ //  L1B: store ahead into cache lines; fill later
{ .mmi
	and	tmp = -(LINE_SIZE), cnt		// compute end of range
	mov	ptr9 = ptr1			// used for prefetching
	and	cnt = (LINE_SIZE-1), cnt	// remainder
} { .mmi
	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
;; }
{ .mmi
(p_scr)	add	loopcnt = -1, linecnt
	add	ptr2 = 16, ptr1			// start of stores (beyond prefetch stores)
	add	ptr1 = tmp, ptr1		// first address beyond total range
;; }
{ .mmi
	add	tmp = -1, linecnt		// next loop count
	mov.i	ar.lc = loopcnt
;; }
.pref_l1b:
{ .mib
	stf.spill [ptr9] = f0, 128		// Do stores one cache line apart
	nop.i   0
	br.cloop.dptk.few .pref_l1b
;; }
{ .mmi
	add	ptr0 = 16, ptr2			// Two stores in parallel
	mov.i	ar.lc = tmp
;; }
	// Per iteration ptr2 issues 4 stores and ptr0 issues 3 (7 x 16 bytes);
	// the first 16 bytes of each line were written through ptr9.
.l1bx:
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 32
 ;; }
 { .mmi
	stf.spill [ptr2] = f0, 32
	stf.spill [ptr0] = f0, 64
	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
 ;; }
{ .mmb
	stf.spill [ptr2] = f0, 32
(p_scr)	stf.spill [ptr9] = f0, 128
	br.cloop.dptk.few .l1bx
;; }
{ .mib
	cmp.gt  p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment	//
;; }

	// Remainder after the line loops (or the whole job when it was too
	// small for them): 32-byte steps first.
.fraction_of_line:
{ .mib
	add	ptr2 = 16, ptr1
	shr.u	loopcnt = cnt, 5		// loopcnt = cnt / 32
;; }
{ .mib
	cmp.eq	p_scr, p0 = loopcnt, r0
	add	loopcnt = -1, loopcnt
(p_scr)	br.cond.dpnt.many .store_words
;; }
{ .mib
	and	cnt = 0x1f, cnt			// compute the remaining cnt
	mov.i   ar.lc = loopcnt
;; }
	TEXT_ALIGN(32)
.l2:	// ------------------------------------ //  L2A:  store 32B in 2 cycles
{ .mmb
	stf8	[ptr1] = fvalue, 8
	stf8	[ptr2] = fvalue, 8
;; } { .mmb
	stf8	[ptr1] = fvalue, 24
	stf8	[ptr2] = fvalue, 24
	br.cloop.dptk.many .l2
;; }
	// Up to three 8-byte words (8 <= cnt < 32 here when not branching away).
.store_words:
{ .mib
	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
;; }

{ .mmi
	stf8	[ptr1] = fvalue, 8		// store
	cmp.le	p_y, p_n = 16, cnt
	add	cnt = -8, cnt			// subtract
;; }
{ .mmi
(p_y)	stf8	[ptr1] = fvalue, 8		// store
(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt
(p_y)	add	cnt = -8, cnt			// subtract
;; }
{ .mmi						// store
(p_yy)	stf8	[ptr1] = fvalue, 8
(p_yy)	add	cnt = -8, cnt			// subtract
;; }

	// Tail: cnt < 8, so bits 2, 1 and 0 of cnt select st4, st2 and st1.
.move_bytes_from_alignment:
{ .mib
	cmp.eq	p_scr, p0 = cnt, r0
	tbit.nz.unc p_y, p0 = cnt, 2		// should we terminate with a st4 ?
(p_scr)	br.cond.dpnt.few .restore_and_exit
;; }
{ .mib
(p_y)	st4	[ptr1] = value,4
	tbit.nz.unc p_yy, p0 = cnt, 1		// should we terminate with a st2 ?
;; }
{ .mib
(p_yy)	st2	[ptr1] = value,2
	tbit.nz.unc p_y, p0 = cnt, 0		// should we terminate with a st1 ?
;; }

{ .mib
(p_y)	st1	[ptr1] = value
;; }
.restore_and_exit:
{ .mib
	nop.m	0
	mov.i	ar.lc = save_lc
	br.ret.sptk.many rp
;; }

	// Short path (cnt < 16).  p_y/p_n still hold the odd/even test of dest
	// made at entry.  An odd dest gets one st1 first; after that only st2
	// stores are used, through ptr1 and ptr2 (2 bytes apart), 4 bytes per
	// step, with an optional trailing st2 and a final st1 through ptr3.
.move_bytes_unaligned:
{ .mmi
       .pred.rel "mutex",p_y, p_n
       .pred.rel "mutex",p_yy, p_nn
(p_n)	cmp.le  p_yy, p_nn = 4, cnt
(p_y)	cmp.le  p_yy, p_nn = 5, cnt
(p_n)	add	ptr2 = 2, ptr1
} { .mmi
(p_y)	add	ptr2 = 3, ptr1
(p_y)	st1	[ptr1] = value, 1		// fill 1 (odd-aligned) byte [15, 14 (or less) left]
(p_y)	add	cnt = -1, cnt
;; }
{ .mmi
(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
	add	ptr3 = ptr1, cnt		// prepare last store
	mov.i	ar.lc = save_lc
} { .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [11, 10 (or less) left]
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmi
(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
	add	ptr3 = -1, ptr3			// last store
	tbit.nz p_scr, p0 = cnt, 1		// will there be a st2 at the end ?
} { .mmi
(p_y)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_y)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [7, 6 (or less) left]
(p_y)	add	cnt = -4, cnt
;; }
{ .mmi
(p_yy)	st2	[ptr1] = value, 4		// fill 2 (aligned) bytes
(p_yy)	st2	[ptr2] = value, 4		// fill 2 (aligned) bytes [3, 2 (or less) left]
	tbit.nz p_y, p0 = cnt, 0		// will there be a st1 at the end ?
} { .mmi
(p_yy)	add	cnt = -4, cnt
;; }
{ .mmb
(p_scr)	st2	[ptr1] = value			// fill 2 (aligned) bytes
(p_y)	st1	[ptr3] = value			// fill last byte (using ptr3)
	br.ret.sptk.many rp
}
END(memset)