/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */
/* Big-endian only: the shift-merge code below assumes BE byte order. */
#ifdef __MICROBLAZEEL__
#error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM.
#endif

#include <linux/linkage.h>

	.text
	.globl	memcpy
	.type  memcpy, @function
	.ent	memcpy

/*
 * void *memcpy(void *d /* r5 */, const void *s /* r6 */, size_t c /* r7 */)
 * Returns d in r3.  Copies ascending.  Scratch: r4, r8-r12.
 * Strategy: byte-copy until d is word aligned, then 32-byte blocks
 * (word loads + shift/merge when s is misaligned), then words, then
 * trailing bytes.
 */
memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */

a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s+offset) */
	sw	r9, r5, r10		/* *(d+offset) = t1 */
	addi	r4, r4, -4		/* n-- */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset++ (IN DELAY SLOT) */
	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

	.size  memcpy, . - memcpy
	.end memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.type  memmove, @function
	.ent	memmove

/*
 * void *memmove(void *d /* r5 */, const void *s /* r6 */, size_t c /* r7 */)
 * Returns d in r3.  If d <= s the regions can safely be copied ascending,
 * so tail-branch into fast_memcpy_ascending; otherwise copy descending
 * (from the ends of the buffers downward) so overlap is handled.
 * Scratch: r4, r8-r12.
 */
memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4, fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_block_u2		/* t1 was 2 => 2 byte offset */

d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4, -4		/* n-- */
	lw	r9, r6, r4		/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */
	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9, d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	/* NOTE: exits via a_done in memcpy above; the epilogues are identical */
	beqi	r7, a_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

	.size  memmove, . - memmove
	.end memmove