/*
 * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 */
/*
 * Description
 *
 *   library function for memcpy where length bytes are copied from
 *   ptr_in to ptr_out. ptr_out is returned unchanged.
 *   Allows any combination of alignment on input and output pointers
 *   and length from 0 to 2^32-1
 *
 * Restrictions
 *   The arrays should not overlap, the program will produce undefined output
 *   if they do.
 *   For blocks less than 24 bytes a byte by byte copy is performed. For
 *   8byte alignments, and length multiples, a dword copy is performed up to
 *   96bytes
 * History
 *
 *   DJH  5/15/09 Initial version 1.0
 *   DJH  6/ 1/09 Version 1.1 modified ABI to include R16-R19
 *   DJH  7/12/09 Version 1.2 optimized codesize down to 760 was 840
 *   DJH 10/14/09 Version 1.3 added special loop for aligned case, was
 *                            overreading bloated codesize back up to 892
 *   DJH  4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads
 *                            occurring if only 1 left outstanding, fixes bug
 *                            # 3888, corrected for all alignments. Peeled off
 *                            1 32byte chunk from kernel loop and extended 8byte
 *                            loop at end to solve all combinations and prevent
 *                            over read.  Fixed Ldword_loop_prolog to prevent
 *                            overread for blocks less than 48bytes. Reduced
 *                            codesize to 752 bytes
 *   DJH  4/21/10 version 1.5 1.4 fix broke code for input block ends not
 *                            aligned to dword boundaries, underwriting by 1
 *                            byte, added detection for this and fixed. A
 *                            little bloat.
 *   DJH  4/23/10 version 1.6 corrected stack error, R20 was not being restored
 *                            always, fixed the error of R20 being modified
 *                            before it was being saved
 * Natural c model
 * ===============
 * void * memcpy(char * ptr_out, char * ptr_in, int length) {
 *   int i;
 *   if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
 *   return(ptr_out);
 * }
 *
 * Optimized memcpy function
 * =========================
 * void * memcpy(char * ptr_out, char * ptr_in, int len) {
 *   int i, prolog, kernel, epilog, mask;
 *   u8 offset;
 *   s64 data0, dataF8, data70;
 *
 *   s64 * ptr8_in;
 *   s64 * ptr8_out;
 *   s32 * ptr4;
 *   s16 * ptr2;
 *
 *   offset = ((int) ptr_in) & 7;
 *   ptr8_in = (s64 *) &ptr_in[-offset];   //read in the aligned pointers
 *
 *   data70 = *ptr8_in++;
 *   dataF8 = *ptr8_in++;
 *
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   prolog = 32 - ((int) ptr_out);
 *   mask  = 0x7fffffff >> HEXAGON_R_cl0_R(len);
 *   prolog = prolog & mask;
 *   kernel = len - prolog;
 *   epilog = kernel & 0x1F;
 *   kernel = kernel >> 5;
 *
 *   if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (prolog & 2) { ptr2[0] = (u16) data0;  data0 >>= 16; ptr_out += 2;}
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (prolog & 4) { ptr4[0] = (u32) data0;  data0 >>= 32; ptr_out += 4;}
 *
 *   offset = offset + (prolog & 7);
 *   if (offset >= 8) {
 *     data70 = dataF8;
 *     dataF8 = *ptr8_in++;
 *   }
 *   offset = offset & 0x7;
 *
 *   prolog = prolog >> 3;
 *   if (prolog) for (i=0; i < prolog; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   if(kernel) { kernel -= 1; epilog += 32; }
 *   if(kernel) for(i=0; i < kernel; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = *ptr8_in++;
 *
 *       data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   epilogdws = epilog >> 3;
 *   if (epilogdws) for (i=0; i < epilogdws; i++) {
 *       data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8;
 *       data70 = dataF8;
 *       dataF8 = *ptr8_in++;
 *   }
 *   data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset);
 *
 *   ptr4 = (s32 *) &ptr_out[0];
 *   if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;}
 *   ptr2 = (s16 *) &ptr_out[0];
 *   if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;}
 *   if (epilog & 1) { *ptr_out++ = (u8) data0; }
 *
 *   return(ptr_out - len);
 * }
 *
 * Codesize : 784 bytes
 */
/*
 * Register aliases used by memcpy.
 *
 * Several aliases name the same physical register (for example mask, shift
 * and rest are all R8, and ifbyte/ifhword/ifword/noprolog/noepilog/kernel1
 * are all p0), so an alias is only meaningful between the instruction that
 * sets it and the next write to the underlying register.
 */

/* Incoming arguments */
#define ptr_out			R0	/* destination pointer */
#define ptr_in			R1	/* source pointer */
#define len			R2	/* number of bytes to copy */

/* 64-bit staging pairs and their 32-bit halves */
#define data70			R13:12	/* low 8 bytes of the unaligned source window */
#define dataF8			R11:10	/* high 8 bytes of the unaligned source window */
#define ldata0			R7:6	/* realigned output dword, even chunks */
#define ldata1			R25:24	/* realigned output dword, odd chunks */
#define data1			R7	/* R7, the upper register of the ldata0 pair */
#define data0			R6	/* R6, the lower register of the ldata0 pair */

/* Predicates */
#define ifbyte			p0	/* a single byte must be stored in prolog/epilog */
#define ifhword			p0	/* a halfword must be stored in prolog/epilog */
#define ifword			p0	/* a word must be stored in prolog/epilog */
#define noprolog		p0	/* low three bits of the prolog count are clear */
#define nokernel		p1	/* kernel (32-byte chunk) count is zero */
#define noepilog		p0	/* epilog byte count is zero */
#define align			p2	/* byte rotate amount handed to valignb */
#define kernel1			p0	/* set by cmp.gtu(kernel, #1): more chunks follow */

/* Scalar working registers */
#define dalign			R25	/* ptr_out - ptr_in, later masked to 0..31 */
#define star3			R16	/* prolog & 7: bytes stored before the first dword */
#define rest			R8	/* remaining-length counter for the dword loops */
#define back			R7	/* cl0(len), then len + (ptr_in & 7) */
#define epilog			R3	/* bytes left for the epilog */
#define inc			R15:14	/* combine(#32, #-1): step applied by vaddw */
#define kernel			R4	/* number of 32-byte chunks in the kernel */
#define ptr_in_p_128		R5	/* running source prefetch (dcfetch) pointer */
#define mask			R8	/* 0x7fffffff, shifted to bound the prolog size */
#define shift			R8	/* prolog byte count * 8, tested bit by bit */
#define shift2			R5	/* epilog byte count * 8, used as shift source */
#define prolog			R15	/* bytes, then dwords, handled by the prolog */
#define epilogdws		R15	/* number of dwords in the epilog */
#define shiftb			R14	/* shift distance used to drop stored bytes */
#define offset			R9	/* ptr_in & 7, register copy of align */
#define ptr_out_p_32		R17	/* running destination pointer for dczeroa */
#define align888		R14	/* ptr_in | ptr_out | len, tested against 7 */
#define len8			R9	/* len >> 3: dword count for the aligned copy */
#define over			R20	/* (ptr_in + len) & 7: bytes past last src dword */

/* R5:4 viewed as one pair so vaddw updates both halves at once */
#define ptr_in_p_128kernel	R5:4	/* { ptr_in_p_128, kernel } */
/*
 * memcpy - copy len bytes from ptr_in to ptr_out.
 *
 * C model (see the file header):
 *	void *memcpy(char *ptr_out, char *ptr_in, int len)
 *
 * In:	ptr_out (R0) destination, ptr_in (R1) source, len (R2) byte count.
 * Out:	R0; every exit rewinds ptr_out by the bytes it advanced so that the
 *	destination pointer is handed back as the file header describes.
 *
 * Paths, in the order they are tested:
 *   1. len == 0 or ptr_in == ptr_out	-> return immediately.
 *   2. (ptr_in | ptr_out | len) has its low three bits clear and len <= 95
 *					-> .Ldwordaligned, plain dword loop.
 *   3. len <= 23			-> .Lbytes23orless, byte loop.
 *   4. anything else			-> general path below.
 *
 * Only path 4 builds a stack frame.  It stores R17:16, R25:24 and R21:20 in
 * the frame and points r31 at .Lmemcpy_return, so each "jumpr r31" on that
 * path runs the register restore and deallocframe before the real return.
 *
 * Instructions inside one { } packet read the register values from before
 * the packet; comments describe packets as a whole for that reason.
 */
	.section .text
	.p2align 4
	.global memcpy
	.type memcpy, @function
memcpy:
	/* Classify: empty copy, self copy, short copy, alignment bits. */
	{
		p2 = cmp.eq(len, #0)
		align888 = or(ptr_in, ptr_out)
		p0 = cmp.gtu(len, #23)
		p1 = cmp.eq(ptr_in, ptr_out)
	}
	/* p1 = nothing to do; p3 = too long for the simple dword loop. */
	{
		p1 = or(p2, p1)
		p3 = cmp.gtu(len, #95)
		align888 = or(align888, len)
		len8 = lsr(len, #3)
	}
	/* The dcfetch is issued even when the early return is taken. */
	{
		dcfetch(ptr_in)
		p2 = bitsclr(align888, #7)
		if (p1) jumpr r31
	}
	/*
	 * Everything 8-byte aligned and len <= 95: take the dword loop.
	 * len is pre-decremented by 8 because that loop's final store does
	 * not post-increment ptr_out.
	 */
	{
		p2 = and(p2, !p3)
		if (p2.new) len = add(len, #-8)
		if (p2.new) jump:NT .Ldwordaligned
	}
	/* len <= 23 goes to the byte loop; otherwise start sizing the prolog. */
	{
		if (!p0) jump .Lbytes23orless
		mask.l = #LO(0x7fffffff)
		prolog = sub(#0, ptr_out)
	}

	/*
	 * General path.  allocframe reserves 24 bytes, filled below with the
	 * three register pairs R17:16, R25:24 and R21:20.
	 */
	{
		allocframe(#24)
		mask.h = #HI(0x7fffffff)
		ptr_in_p_128 = add(ptr_in, #32)
		back = cl0(len)
	}
	/* prolog = (-ptr_out) & (0x7fffffff >> cl0(len)); masked to 31 later. */
	{
		memd(sp+#0) = R17:16
		r31.l = #LO(.Lmemcpy_return)
		prolog &= lsr(mask, back)
		offset = and(ptr_in, #7)
	}
	{
		memd(sp+#8) = R25:24
		dalign = sub(ptr_out, ptr_in)
		r31.h = #HI(.Lmemcpy_return)
	}
	/* over becomes the source end address, back = len + source misalignment. */
	{
		over = add(len, ptr_in)
		back = add(len, offset)
		memd(sp+#16) = R21:20
	}
	/* noprolog: no 1..7 byte fragment is needed before the first dword. */
	{
		noprolog = bitsclr(prolog, #7)
		prolog = and(prolog, #31)
		dcfetch(ptr_in_p_128)
		ptr_in_p_128 = add(ptr_in_p_128, #32)
	}
	/* Split len into prolog bytes and the rest; align ptr_in down to 8. */
	{
		kernel = sub(len, prolog)
		shift = asl(prolog, #3)
		star3 = and(prolog, #7)
		ptr_in = and(ptr_in, #-8)
	}
	/* ptr_out_p_32 uses the prolog byte count from before this packet. */
	{
		prolog = lsr(prolog, #3)
		epilog = and(kernel, #31)
		ptr_out_p_32 = add(ptr_out, prolog)
		over = and(over, #7)
	}
	/* p3: more than one aligned source dword is touched. */
	{
		p3 = cmp.gtu(back, #8)
		kernel = lsr(kernel, #5)
		dcfetch(ptr_in_p_128)
		ptr_in_p_128 = add(ptr_in_p_128, #32)
	}
	/* A non-zero prolog dword count gets one extra pass (pipelined loop). */
	{
		p1 = cmp.eq(prolog, #0)
		if (!p1.new) prolog = add(prolog, #1)
		dcfetch(ptr_in_p_128)
		ptr_in_p_128 = add(ptr_in_p_128, #32)
	}
	{
		nokernel = cmp.eq(kernel, #0)
		dcfetch(ptr_in_p_128)
		ptr_in_p_128 = add(ptr_in_p_128, #32)
		shiftb = and(shift, #8)
	}
	/* With no 32-byte chunks, skip the destination dczeroa setup. */
	{
		dcfetch(ptr_in_p_128)
		ptr_in_p_128 = add(ptr_in_p_128, #32)
		if (nokernel) jump .Lskip64
		p2 = cmp.eq(kernel, #1)
	}
	/* Pointer is advanced only when there is more than one chunk. */
	{
		dczeroa(ptr_out_p_32)
		if (!p2) ptr_out_p_32 = add(ptr_out_p_32, #32)
	}
	{
		dalign = and(dalign, #31)
		dczeroa(ptr_out_p_32)
	}

.Lskip64:
	/*
	 * Load the first aligned source dword and, when p3 says it is
	 * needed, the second one (addressed from the pre-increment ptr_in).
	 */
	{
		data70 = memd(ptr_in++#16)
		if (p3) dataF8 = memd(ptr_in+#8)
		if (noprolog) jump .Lnoprolog32
		align = offset
	}

	/* Prolog fragment: store up to 7 bytes as byte, halfword, word. */
	{
		ldata0 = valignb(dataF8, data70, align)
		ifbyte = tstbit(shift, #3)
		offset = add(offset, star3)
	}
	{
		if (ifbyte) memb(ptr_out++#1) = data0
		ldata0 = lsr(ldata0, shiftb)
		shiftb = and(shift, #16)
		ifhword = tstbit(shift, #4)
	}
	{
		if (ifhword) memh(ptr_out++#2) = data0
		ldata0 = lsr(ldata0, shiftb)
		ifword = tstbit(shift, #5)
		p2 = cmp.gtu(offset, #7)
	}
	/* If the read offset crossed a dword, slide the source window on. */
	{
		if (ifword) memw(ptr_out++#4) = data0
		if (p2) data70 = dataF8
		if (p2) dataF8 = memd(ptr_in++#8)
		align = offset
	}

.Lnoprolog32:
	/*
	 * Prolog dword loop.  It is set up with sp1loop0 and its store is
	 * guarded by p3, which keeps the first pass from storing.
	 * rest limits how far the loop keeps loading from the source.
	 */
	{
		p3 = sp1loop0(.Ldword_loop_prolog, prolog)
		rest = sub(len, star3)
		p0 = cmp.gt(over, #0)
	}
	/* Source end not dword aligned: allow for the extra partial dword. */
	if (p0) rest = add(rest, #16)
.Ldword_loop_prolog:
	{
		if (p3) memd(ptr_out++#8) = ldata0
		ldata0 = valignb(dataF8, data70, align)
		p0 = cmp.gt(rest, #16)
	}
	{
		data70 = dataF8
		if (p0) dataF8 = memd(ptr_in++#8)
		rest = add(rest, #-8)
	}:endloop0

.Lkernel:
	/* Peel one 32-byte chunk off the kernel and hand it to the epilog. */
	{
		p3 = cmp.gtu(kernel, #0)
		if (p3.new) kernel = add(kernel, #-1)
		if (p3.new) epilog = add(epilog, #32)
	}
	/* inc = { +32, -1 }: vaddw steps the prefetch pointer and the count. */
	{
		nokernel = cmp.eq(kernel, #0)
		if (nokernel.new) jump:NT .Lepilog
		inc = combine(#32, #-1)
		p3 = cmp.gtu(dalign, #24)
	}
	/* dalign > 24 selects the loop at .Lodd_alignment. */
	{
		if (p3) jump .Lodd_alignment
	}
	/* rest remembers the starting count so the first pass can be detected. */
	{
		loop0(.Loword_loop_25to31, kernel)
		kernel1 = cmp.gtu(kernel, #1)
		rest = kernel
	}
	.falign
.Loword_loop_25to31:
	/* 32 bytes per pass, alternating ldata1 / ldata0 as output dwords. */
	{
		dcfetch(ptr_in_p_128)
		if (kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32)
	}
	{
		dczeroa(ptr_out_p_32)
		p3 = cmp.eq(kernel, rest)
	}
	/* p3 (first pass) suppresses the ldata1 store: nothing is queued yet. */
	{
		ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc)
		if (!p3) memd(ptr_out++#8) = ldata1
		ldata1 = valignb(dataF8, data70, align)
		data70 = memd(ptr_in++#8)
	}
	{
		memd(ptr_out++#8) = ldata0
		ldata0 = valignb(data70, dataF8, align)
		dataF8 = memd(ptr_in++#8)
	}
	{
		memd(ptr_out++#8) = ldata1
		ldata1 = valignb(dataF8, data70, align)
		data70 = memd(ptr_in++#8)
	}
	{
		memd(ptr_out++#8) = ldata0
		ldata0 = valignb(data70, dataF8, align)
		dataF8 = memd(ptr_in++#8)
		kernel1 = cmp.gtu(kernel, #1)
	}:endloop0
	/* Flush the ldata1 dword still pending from the last pass. */
	{
		memd(ptr_out++#8) = ldata1
		jump .Lepilog
	}

.Lodd_alignment:
	{
		loop0(.Loword_loop_00to24, kernel)
		kernel1 = cmp.gtu(kernel, #1)
		rest = add(kernel, #-1)
	}
	.falign
.Loword_loop_00to24:
	/* 32 bytes per pass using ldata0 only; falls through to .Lepilog. */
	{
		dcfetch(ptr_in_p_128)
		ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc)
		if (kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32)
	}
	{
		dczeroa(ptr_out_p_32)
	}
	{
		memd(ptr_out++#8) = ldata0
		ldata0 = valignb(dataF8, data70, align)
		data70 = memd(ptr_in++#8)
	}
	{
		memd(ptr_out++#8) = ldata0
		ldata0 = valignb(data70, dataF8, align)
		dataF8 = memd(ptr_in++#8)
	}
	{
		memd(ptr_out++#8) = ldata0
		ldata0 = valignb(dataF8, data70, align)
		data70 = memd(ptr_in++#8)
	}
	{
		memd(ptr_out++#8) = ldata0
		ldata0 = valignb(data70, dataF8, align)
		dataF8 = memd(ptr_in++#8)
		kernel1 = cmp.gtu(kernel, #1)
	}:endloop0

.Lepilog:
	/* kernel (R4) is reused here for the sub-dword remainder, epilog & 7. */
	{
		noepilog = cmp.eq(epilog, #0)
		epilogdws = lsr(epilog, #3)
		kernel = and(epilog, #7)
	}
	/* Nothing left: rewind ptr_out by len and leave via r31. */
	{
		if (noepilog) jumpr r31
		if (noepilog) ptr_out = sub(ptr_out, len)
		p3 = cmp.eq(epilogdws, #0)
		shift2 = asl(epilog, #3)
	}
	/* No whole dwords left: go straight to the word/halfword/byte tail. */
	{
		shiftb = and(shift2, #32)
		ifword = tstbit(epilog, #2)
		if (p3) jump .Lepilog60
		if (!p3) epilog = add(epilog, #-16)
	}
	/*
	 * Epilog dword loop.  kernel now acts as the threshold that decides
	 * when source loads stop: the remainder, or 8 when the remainder is 0.
	 */
	{
		loop0(.Ldword_loop_epilog, epilogdws)
		p3 = cmp.eq(kernel, #0)
		if (p3.new) kernel = #8
		p1 = cmp.gt(over, #0)
	}
	/* Source end not dword aligned: threshold 0 permits one more load. */
	if (p1) kernel = #0
.Ldword_loop_epilog:
	{
		memd(ptr_out++#8) = ldata0
		ldata0 = valignb(dataF8, data70, align)
		p3 = cmp.gt(epilog, kernel)
	}
	{
		data70 = dataF8
		if (p3) dataF8 = memd(ptr_in++#8)
		epilog = add(epilog, #-8)
	}:endloop0

.Lepilog60:
	/* Tail: up to 7 bytes stored as word, halfword, byte from ldata0. */
	{
		if (ifword) memw(ptr_out++#4) = data0
		ldata0 = lsr(ldata0, shiftb)
		ifhword = tstbit(epilog, #1)
		shiftb = and(shift2, #16)
	}
	/* The byte store below does not post-increment, so len drops by one. */
	{
		if (ifhword) memh(ptr_out++#2) = data0
		ldata0 = lsr(ldata0, shiftb)
		ifbyte = tstbit(epilog, #0)
		if (ifbyte.new) len = add(len, #-1)
	}
	{
		if (ifbyte) memb(ptr_out) = data0
		ptr_out = sub(ptr_out, len)
		jumpr r31
	}

.Lbytes23orless:
	/*
	 * Short copy, one byte per pass.  The loop is set up with sp1loop0
	 * and its store is guarded by p3, so the store trails the load by one
	 * pass; the last byte is written after the loop.  len is decremented
	 * because that last store does not advance ptr_out.
	 */
	{
		p3 = sp1loop0(.Lbyte_copy, len)
		len = add(len, #-1)
	}
.Lbyte_copy:
	{
		data0 = memb(ptr_in++#1)
		if (p3) memb(ptr_out++#1) = data0
	}:endloop0
	{
		memb(ptr_out) = data0
		ptr_out = sub(ptr_out, len)
		jumpr r31
	}

.Ldwordaligned:
	/*
	 * Aligned copy, one dword per pass, same trailing-store scheme as the
	 * byte loop.  len was already reduced by 8 before the jump here.
	 */
	{
		p3 = sp1loop0(.Ldword_copy, len8)
	}
.Ldword_copy:
	{
		if (p3) memd(ptr_out++#8) = ldata0
		ldata0 = memd(ptr_in++#8)
	}:endloop0
	{
		memd(ptr_out) = ldata0
		ptr_out = sub(ptr_out, len)
		jumpr r31
	}

.Lmemcpy_return:
	/* General-path exit: reload the saved pairs, drop the frame, return. */
	r21:20 = memd(sp+#16)
	{
		r25:24 = memd(sp+#8)
		r17:16 = memd(sp+#0)
	}
	deallocframe
	jumpr r31