/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
# include < a s m / p p c _ a s m . h >
2016-01-13 23:33:46 -05:00
# include < a s m / e x p o r t . h >
2018-06-07 09:57:53 +08:00
# include < a s m / p p c - o p c o d e . h >
2015-01-21 12:27:38 +11:00
# define o f f8 r6
# define o f f16 r7
# define o f f24 r8
# define r A r9
# define r B r10
# define r C r11
# define r D r27
# define r E r28
# define r F r29
# define r G r30
# define r H r31
# ifdef _ _ L I T T L E _ E N D I A N _ _
2018-06-07 09:57:51 +08:00
# define L H l h b r x
# define L W l w b r x
2015-01-21 12:27:38 +11:00
# define L D l d b r x
2018-06-07 09:57:53 +08:00
# define L V S l v s r
# define V P E R M ( _ V R T ,_ V R A ,_ V R B ,_ V R C ) \
vperm _ V R T ,_ V R B ,_ V R A ,_ V R C
2015-01-21 12:27:38 +11:00
# else
2018-06-07 09:57:51 +08:00
# define L H l h z x
# define L W l w z x
2015-01-21 12:27:38 +11:00
# define L D l d x
2018-06-07 09:57:53 +08:00
# define L V S l v s l
# define V P E R M ( _ V R T ,_ V R A ,_ V R B ,_ V R C ) \
vperm _ V R T ,_ V R A ,_ V R B ,_ V R C
2015-01-21 12:27:38 +11:00
# endif
2018-06-07 09:57:53 +08:00
# define V M X _ T H R E S H 4 0 9 6
# define E N T E R _ V M X _ O P S \
mflr r0 ; \
std r3 ,- S T A C K F R A M E S I Z E + S T K _ R E G ( R 3 1 ) ( r1 ) ; \
std r4 ,- S T A C K F R A M E S I Z E + S T K _ R E G ( R 3 0 ) ( r1 ) ; \
std r5 ,- S T A C K F R A M E S I Z E + S T K _ R E G ( R 2 9 ) ( r1 ) ; \
std r0 ,1 6 ( r1 ) ; \
stdu r1 ,- S T A C K F R A M E S I Z E ( r1 ) ; \
bl e n t e r _ v m x _ o p s ; \
cmpwi c r1 ,r3 ,0 ; \
ld r0 ,S T A C K F R A M E S I Z E + 1 6 ( r1 ) ; \
ld r3 ,S T K _ R E G ( R 3 1 ) ( r1 ) ; \
ld r4 ,S T K _ R E G ( R 3 0 ) ( r1 ) ; \
ld r5 ,S T K _ R E G ( R 2 9 ) ( r1 ) ; \
addi r1 ,r1 ,S T A C K F R A M E S I Z E ; \
mtlr r0
# define E X I T _ V M X _ O P S \
mflr r0 ; \
std r3 ,- S T A C K F R A M E S I Z E + S T K _ R E G ( R 3 1 ) ( r1 ) ; \
std r4 ,- S T A C K F R A M E S I Z E + S T K _ R E G ( R 3 0 ) ( r1 ) ; \
std r5 ,- S T A C K F R A M E S I Z E + S T K _ R E G ( R 2 9 ) ( r1 ) ; \
std r0 ,1 6 ( r1 ) ; \
stdu r1 ,- S T A C K F R A M E S I Z E ( r1 ) ; \
bl e x i t _ v m x _ o p s ; \
ld r0 ,S T A C K F R A M E S I Z E + 1 6 ( r1 ) ; \
ld r3 ,S T K _ R E G ( R 3 1 ) ( r1 ) ; \
ld r4 ,S T K _ R E G ( R 3 0 ) ( r1 ) ; \
ld r5 ,S T K _ R E G ( R 2 9 ) ( r1 ) ; \
addi r1 ,r1 ,S T A C K F R A M E S I Z E ; \
mtlr r0
/ *
* LD_ V S R _ C R O S S 1 6 B l o a d t h e 2 n d 1 6 b y t e s f o r _ v a d d r w h i c h i s u n a l i g n e d w i t h
* 1 6 bytes b o u n d a r y a n d p e r m u t e t h e r e s u l t w i t h t h e 1 s t 1 6 b y t e s .
* | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
* ^ ^ ^
* 0 xbbbb1 0 0 x b b b b20 0 x b b b30
* ^
* _ vaddr
*
*
* _ vmask i s t h e m a s k g e n e r a t e d b y L V S
* _ v1 s t _ q w i s t h e 1 s t a l i g n e d Q W o f c u r r e n t a d d r w h i c h i s a l r e a d y l o a d e d .
* for e x a m p l e : 0 x y y y y y y y y y y y y y 0 1 2 f o r b i g e n d i a n
* _ v2 n d _ q w i s t h e 2 n d a l i g n e d Q W o f c u r _ v a d d r t o b e l o a d e d .
* for e x a m p l e : 0 x34 5 6 7 8 9 a b c d e f z z z f o r b i g e n d i a n
* The p e r m u t e r e s u l t i s s a v e d i n _ v _ r e s .
* for e x a m p l e : 0 x01 2 3 4 5 6 7 8 9 a b c d e f f o r b i g e n d i a n .
* /
# define L D _ V S R _ C R O S S 1 6 B ( _ v a d d r ,_ v m a s k ,_ v1 s t _ q w ,_ v2 n d _ q w ,_ v _ r e s ) \
lvx _ v2 n d _ q w ,_ v a d d r ,o f f16 ; \
VPERM( _ v _ r e s ,_ v1 s t _ q w ,_ v2 n d _ q w ,_ v m a s k )
2018-06-07 09:57:51 +08:00
/ *
* There a r e 2 c a t e g o r i e s f o r m e m c m p :
* 1 ) src/ d s t h a s t h e s a m e o f f s e t t o t h e 8 b y t e s b o u n d a r y . T h e h a n d l e r s
* are n a m e d l i k e . L s a m e o f f s e t _ x x x x
* 2 ) src/ d s t h a s d i f f e r e n t o f f s e t t o t h e 8 b y t e s b o u n d a r y . T h e h a n d l e r s
* are n a m e d l i k e . L d i f f o f f s e t _ x x x x
* /
2018-06-07 09:57:53 +08:00
_ GLOBAL_ T O C ( m e m c m p )
2015-01-21 12:27:38 +11:00
cmpdi c r1 ,r5 ,0
2018-06-07 09:57:51 +08:00
/ * Use t h e s h o r t l o o p i f t h e s r c / d s t a d d r e s s e s a r e n o t
* with t h e s a m e o f f s e t o f 8 b y t e s a l i g n b o u n d a r y .
* /
xor r6 ,r3 ,r4
2015-01-21 12:27:38 +11:00
andi. r6 ,r6 ,7
2018-06-07 09:57:51 +08:00
/ * Fall b a c k t o s h o r t l o o p i f c o m p a r e a t a l i g n e d a d d r s
* with l e s s t h a n 8 b y t e s .
* /
cmpdi c r6 ,r5 ,7
2015-01-21 12:27:38 +11:00
beq c r1 ,. L z e r o
2018-06-07 09:57:51 +08:00
bgt c r6 ,. L n o _ s h o r t
2015-01-21 12:27:38 +11:00
.Lshort :
mtctr r5
1 : lbz r A ,0 ( r3 )
lbz r B ,0 ( r4 )
subf. r C ,r B ,r A
bne . L n o n _ z e r o
bdz . L z e r o
lbz r A ,1 ( r3 )
lbz r B ,1 ( r4 )
subf. r C ,r B ,r A
bne . L n o n _ z e r o
bdz . L z e r o
lbz r A ,2 ( r3 )
lbz r B ,2 ( r4 )
subf. r C ,r B ,r A
bne . L n o n _ z e r o
bdz . L z e r o
lbz r A ,3 ( r3 )
lbz r B ,3 ( r4 )
subf. r C ,r B ,r A
bne . L n o n _ z e r o
addi r3 ,r3 ,4
addi r4 ,r4 ,4
bdnz 1 b
.Lzero :
li r3 ,0
blr
2018-06-07 09:57:51 +08:00
.Lno_short :
dcbt 0 ,r3
dcbt 0 ,r4
bne . L d i f f o f f s e t _ 8 b y t e s _ m a k e _ a l i g n _ s t a r t
.Lsameoffset_8bytes_make_align_start :
/ * attempt t o c o m p a r e b y t e s n o t a l i g n e d w i t h 8 b y t e s s o t h a t
* rest c o m p a r i s o n c a n r u n b a s e d o n 8 b y t e s a l i g n m e n t .
* /
andi. r6 ,r3 ,7
/ * Try t o c o m p a r e t h e f i r s t d o u b l e w o r d w h i c h i s n o t 8 b y t e s a l i g n e d :
* load t h e f i r s t d o u b l e w o r d a t ( s r c & ~ 7 U L ) a n d s h i f t l e f t a p p r o p r i a t e
* bits b e f o r e c o m p a r i s i o n .
* /
rlwinm r6 ,r3 ,3 ,2 6 ,2 8
beq . L s a m e o f f s e t _ 8 b y t e s _ a l i g n e d
clrrdi r3 ,r3 ,3
clrrdi r4 ,r4 ,3
LD r A ,0 ,r3
LD r B ,0 ,r4
sld r A ,r A ,r6
sld r B ,r B ,r6
cmpld c r0 ,r A ,r B
srwi r6 ,r6 ,3
bne c r0 ,. L c m p A B _ l i g h t w e i g h t
subfic r6 ,r6 ,8
subf. r5 ,r6 ,r5
addi r3 ,r3 ,8
addi r4 ,r4 ,8
beq . L z e r o
.Lsameoffset_8bytes_aligned :
/ * now w e a r e a l i g n e d w i t h 8 b y t e s .
* Use . L l o n g l o o p i f l e f t c m p b y t e s a r e e q u a l o r g r e a t e r t h a n 3 2 B .
* /
cmpdi c r6 ,r5 ,3 1
bgt c r6 ,. L l o n g
.Lcmp_lt32bytes :
2018-06-07 09:57:53 +08:00
/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
2018-06-07 09:57:51 +08:00
cmpdi c r5 ,r5 ,7
srdi r0 ,r5 ,3
ble c r5 ,. L c m p _ r e s t _ l t 8 b y t e s
/* handle 8 ~ 31 bytes */
clrldi r5 ,r5 ,6 1
mtctr r0
2 :
LD r A ,0 ,r3
LD r B ,0 ,r4
cmpld c r0 ,r A ,r B
addi r3 ,r3 ,8
addi r4 ,r4 ,8
bne c r0 ,. L c m p A B _ l i g h t w e i g h t
bdnz 2 b
cmpwi r5 ,0
beq . L z e r o
.Lcmp_rest_lt8bytes :
/ * Here w e h a v e o n l y l e s s t h a n 8 b y t e s t o c o m p a r e w i t h . a t l e a s t s1
* Address i s a l i g n e d w i t h 8 b y t e s .
* The n e x t d o u b l e w o r d s a r e l o a d a n d s h i f t r i g h t w i t h a p p r o p r i a t e
* bits.
* /
subfic r6 ,r5 ,8
slwi r6 ,r6 ,3
LD r A ,0 ,r3
LD r B ,0 ,r4
srd r A ,r A ,r6
srd r B ,r B ,r6
cmpld c r0 ,r A ,r B
bne c r0 ,. L c m p A B _ l i g h t w e i g h t
b . L z e r o
2015-01-21 12:27:38 +11:00
.Lnon_zero :
mr r3 ,r C
blr
.Llong :
2018-06-07 09:57:53 +08:00
# ifdef C O N F I G _ A L T I V E C
BEGIN_ F T R _ S E C T I O N
/* Try to use vmx loop if length is equal or greater than 4K */
cmpldi c r6 ,r5 ,V M X _ T H R E S H
bge c r6 ,. L s a m e o f f s e t _ v m x _ c m p
END_ F T R _ S E C T I O N _ I F S E T ( C P U _ F T R _ A R C H _ 2 0 7 S )
.Llong_novmx_cmp :
# endif
2018-06-07 09:57:51 +08:00
/* At least s1 addr is aligned with 8 bytes */
2015-01-21 12:27:38 +11:00
li o f f8 ,8
li o f f16 ,1 6
li o f f24 ,2 4
std r31 ,- 8 ( r1 )
std r30 ,- 1 6 ( r1 )
std r29 ,- 2 4 ( r1 )
std r28 ,- 3 2 ( r1 )
std r27 ,- 4 0 ( r1 )
srdi r0 ,r5 ,5
mtctr r0
andi. r5 ,r5 ,3 1
LD r A ,0 ,r3
LD r B ,0 ,r4
LD r C ,o f f8 ,r3
LD r D ,o f f8 ,r4
LD r E ,o f f16 ,r3
LD r F ,o f f16 ,r4
LD r G ,o f f24 ,r3
LD r H ,o f f24 ,r4
cmpld c r0 ,r A ,r B
addi r3 ,r3 ,3 2
addi r4 ,r4 ,3 2
bdz . L f i r s t 3 2
LD r A ,0 ,r3
LD r B ,0 ,r4
cmpld c r1 ,r C ,r D
LD r C ,o f f8 ,r3
LD r D ,o f f8 ,r4
cmpld c r6 ,r E ,r F
LD r E ,o f f16 ,r3
LD r F ,o f f16 ,r4
cmpld c r7 ,r G ,r H
bne c r0 ,. L c m p A B
LD r G ,o f f24 ,r3
LD r H ,o f f24 ,r4
cmpld c r0 ,r A ,r B
bne c r1 ,. L c m p C D
addi r3 ,r3 ,3 2
addi r4 ,r4 ,3 2
bdz . L s e c o n d32
.balign 16
1 : LD r A ,0 ,r3
LD r B ,0 ,r4
cmpld c r1 ,r C ,r D
bne c r6 ,. L c m p E F
LD r C ,o f f8 ,r3
LD r D ,o f f8 ,r4
cmpld c r6 ,r E ,r F
bne c r7 ,. L c m p G H
LD r E ,o f f16 ,r3
LD r F ,o f f16 ,r4
cmpld c r7 ,r G ,r H
bne c r0 ,. L c m p A B
LD r G ,o f f24 ,r3
LD r H ,o f f24 ,r4
cmpld c r0 ,r A ,r B
bne c r1 ,. L c m p C D
addi r3 ,r3 ,3 2
addi r4 ,r4 ,3 2
bdnz 1 b
.Lsecond32 :
cmpld c r1 ,r C ,r D
bne c r6 ,. L c m p E F
cmpld c r6 ,r E ,r F
bne c r7 ,. L c m p G H
cmpld c r7 ,r G ,r H
bne c r0 ,. L c m p A B
bne c r1 ,. L c m p C D
bne c r6 ,. L c m p E F
bne c r7 ,. L c m p G H
.Ltail :
ld r31 ,- 8 ( r1 )
ld r30 ,- 1 6 ( r1 )
ld r29 ,- 2 4 ( r1 )
ld r28 ,- 3 2 ( r1 )
ld r27 ,- 4 0 ( r1 )
cmpdi r5 ,0
beq . L z e r o
b . L s h o r t
.Lfirst32 :
cmpld c r1 ,r C ,r D
cmpld c r6 ,r E ,r F
cmpld c r7 ,r G ,r H
bne c r0 ,. L c m p A B
bne c r1 ,. L c m p C D
bne c r6 ,. L c m p E F
bne c r7 ,. L c m p G H
b . L t a i l
.LcmpAB :
li r3 ,1
bgt c r0 ,. L o u t
li r3 ,- 1
b . L o u t
.LcmpCD :
li r3 ,1
bgt c r1 ,. L o u t
li r3 ,- 1
b . L o u t
.LcmpEF :
li r3 ,1
bgt c r6 ,. L o u t
li r3 ,- 1
b . L o u t
.LcmpGH :
li r3 ,1
bgt c r7 ,. L o u t
li r3 ,- 1
.Lout :
ld r31 ,- 8 ( r1 )
ld r30 ,- 1 6 ( r1 )
ld r29 ,- 2 4 ( r1 )
ld r28 ,- 3 2 ( r1 )
ld r27 ,- 4 0 ( r1 )
blr
2018-06-07 09:57:51 +08:00
.LcmpAB_lightweight : /* skip NV GPRS restore */
li r3 ,1
bgtlr
li r3 ,- 1
blr
2018-06-07 09:57:53 +08:00
# ifdef C O N F I G _ A L T I V E C
.Lsameoffset_vmx_cmp :
/ * Enter w i t h s r c / d s t a d d r s h a s t h e s a m e o f f s e t w i t h 8 b y t e s
2018-06-07 09:57:54 +08:00
* align b o u n d a r y .
*
* There i s a n o p t i m i z a t i o n b a s e d o n f o l l o w i n g f a c t : m e m c m p ( )
* prones t o f a i l e a r l y a t t h e f i r s t 3 2 b y t e s .
* Before a p p l y i n g V M X i n s t r u c t i o n s w h i c h w i l l l e a d t o 3 2 x12 8 b i t s
* VMX r e g s l o a d / r e s t o r e p e n a l t y , w e c o m p a r e t h e f i r s t 3 2 b y t e s
* so t h a t w e c a n c a t c h t h e ~ 8 0 % f a i l c a s e s .
2018-06-07 09:57:53 +08:00
* /
2018-06-07 09:57:54 +08:00
li r0 ,4
mtctr r0
.Lsameoffset_prechk_32B_loop :
LD r A ,0 ,r3
LD r B ,0 ,r4
cmpld c r0 ,r A ,r B
addi r3 ,r3 ,8
addi r4 ,r4 ,8
bne c r0 ,. L c m p A B _ l i g h t w e i g h t
addi r5 ,r5 ,- 8
bdnz . L s a m e o f f s e t _ p r e c h k _ 3 2 B _ l o o p
2018-06-07 09:57:53 +08:00
ENTER_ V M X _ O P S
beq c r1 ,. L l o n g _ n o v m x _ c m p
3 :
/ * need t o c h e c k w h e t h e r r4 h a s t h e s a m e o f f s e t w i t h r3
* for 1 6 b y t e s b o u n d a r y .
* /
xor r0 ,r3 ,r4
andi. r0 ,r0 ,0 x f
bne . L d i f f o f f s e t _ v m x _ c m p _ s t a r t
/ * len i s n o l e s s t h a n 4 K B . N e e d t o a l i g n w i t h 1 6 b y t e s f u r t h e r .
* /
andi. r A ,r3 ,8
LD r A ,0 ,r3
beq 4 f
LD r B ,0 ,r4
cmpld c r0 ,r A ,r B
addi r3 ,r3 ,8
addi r4 ,r4 ,8
addi r5 ,r5 ,- 8
beq c r0 ,4 f
/* save and restore cr0 */
mfocrf r5 ,1 2 8
EXIT_ V M X _ O P S
mtocrf 1 2 8 ,r5
b . L c m p A B _ l i g h t w e i g h t
4 :
/* compare 32 bytes for each loop */
srdi r0 ,r5 ,5
mtctr r0
clrldi r5 ,r5 ,5 9
li o f f16 ,1 6
.balign 16
5 :
lvx v0 ,0 ,r3
lvx v1 ,0 ,r4
VCMPEQUD_ R C ( v0 ,v0 ,v1 )
bnl c r6 ,7 f
lvx v0 ,o f f16 ,r3
lvx v1 ,o f f16 ,r4
VCMPEQUD_ R C ( v0 ,v0 ,v1 )
bnl c r6 ,6 f
addi r3 ,r3 ,3 2
addi r4 ,r4 ,3 2
bdnz 5 b
EXIT_ V M X _ O P S
cmpdi r5 ,0
beq . L z e r o
b . L c m p _ l t 3 2 b y t e s
6 :
addi r3 ,r3 ,1 6
addi r4 ,r4 ,1 6
7 :
/* diff the last 16 bytes */
EXIT_ V M X _ O P S
LD r A ,0 ,r3
LD r B ,0 ,r4
cmpld c r0 ,r A ,r B
li o f f8 ,8
bne c r0 ,. L c m p A B _ l i g h t w e i g h t
LD r A ,o f f8 ,r3
LD r B ,o f f8 ,r4
cmpld c r0 ,r A ,r B
bne c r0 ,. L c m p A B _ l i g h t w e i g h t
b . L z e r o
# endif
2018-06-07 09:57:51 +08:00
.Ldiffoffset_8bytes_make_align_start :
/* now try to align s1 with 8 bytes */
rlwinm r6 ,r3 ,3 ,2 6 ,2 8
beq . L d i f f o f f s e t _ a l i g n _ s1 _ 8 b y t e s
clrrdi r3 ,r3 ,3
LD r A ,0 ,r3
LD r B ,0 ,r4 / * u n a l i g n e d l o a d * /
sld r A ,r A ,r6
srd r A ,r A ,r6
srd r B ,r B ,r6
cmpld c r0 ,r A ,r B
srwi r6 ,r6 ,3
bne c r0 ,. L c m p A B _ l i g h t w e i g h t
subfic r6 ,r6 ,8
subf. r5 ,r6 ,r5
addi r3 ,r3 ,8
add r4 ,r4 ,r6
beq . L z e r o
.Ldiffoffset_align_s1_8bytes :
/* now s1 is aligned with 8 bytes. */
2018-06-07 09:57:54 +08:00
# ifdef C O N F I G _ A L T I V E C
BEGIN_ F T R _ S E C T I O N
/* only do vmx ops when the size equal or greater than 4K bytes */
cmpdi c r5 ,r5 ,V M X _ T H R E S H
bge c r5 ,. L d i f f o f f s e t _ v m x _ c m p
END_ F T R _ S E C T I O N _ I F S E T ( C P U _ F T R _ A R C H _ 2 0 7 S )
.Ldiffoffset_novmx_cmp :
# endif
2018-06-07 09:57:51 +08:00
cmpdi c r5 ,r5 ,3 1
ble c r5 ,. L c m p _ l t 3 2 b y t e s
2018-06-07 09:57:53 +08:00
# ifdef C O N F I G _ A L T I V E C
b . L l o n g _ n o v m x _ c m p
# else
2018-06-07 09:57:51 +08:00
b . L l o n g
2018-06-07 09:57:53 +08:00
# endif
# ifdef C O N F I G _ A L T I V E C
.Ldiffoffset_vmx_cmp :
2018-06-07 09:57:54 +08:00
/ * perform a 3 2 b y t e s p r e - c h e c k i n g b e f o r e
* enable V M X o p e r a t i o n s .
* /
li r0 ,4
mtctr r0
.Ldiffoffset_prechk_32B_loop :
LD r A ,0 ,r3
LD r B ,0 ,r4
cmpld c r0 ,r A ,r B
addi r3 ,r3 ,8
addi r4 ,r4 ,8
bne c r0 ,. L c m p A B _ l i g h t w e i g h t
addi r5 ,r5 ,- 8
bdnz . L d i f f o f f s e t _ p r e c h k _ 3 2 B _ l o o p
2018-06-07 09:57:53 +08:00
ENTER_ V M X _ O P S
beq c r1 ,. L d i f f o f f s e t _ n o v m x _ c m p
.Ldiffoffset_vmx_cmp_start :
/* Firstly try to align r3 with 16 bytes */
andi. r6 ,r3 ,0 x f
li o f f16 ,1 6
beq . L d i f f o f f s e t _ v m x _ s1 _ 1 6 b y t e s _ a l i g n
2018-06-07 09:57:51 +08:00
2018-06-07 09:57:53 +08:00
LVS v3 ,0 ,r3
LVS v4 ,0 ,r4
lvx v5 ,0 ,r3
lvx v6 ,0 ,r4
LD_ V S R _ C R O S S 1 6 B ( r3 ,v3 ,v5 ,v7 ,v9 )
LD_ V S R _ C R O S S 1 6 B ( r4 ,v4 ,v6 ,v8 ,v10 )
VCMPEQUB_ R C ( v7 ,v9 ,v10 )
bnl c r6 ,. L d i f f o f f s e t _ v m x _ d i f f _ f o u n d
subfic r6 ,r6 ,1 6
subf r5 ,r6 ,r5
add r3 ,r3 ,r6
add r4 ,r4 ,r6
.Ldiffoffset_vmx_s1_16bytes_align :
/* now s1 is aligned with 16 bytes */
lvx v6 ,0 ,r4
LVS v4 ,0 ,r4
srdi r6 ,r5 ,5 / * l o o p f o r 3 2 b y t e s e a c h * /
clrldi r5 ,r5 ,5 9
mtctr r6
.balign 16
.Ldiffoffset_vmx_32bytesloop :
/* the first qw of r4 was saved in v6 */
lvx v9 ,0 ,r3
LD_ V S R _ C R O S S 1 6 B ( r4 ,v4 ,v6 ,v8 ,v10 )
VCMPEQUB_ R C ( v7 ,v9 ,v10 )
vor v6 ,v8 ,v8
bnl c r6 ,. L d i f f o f f s e t _ v m x _ d i f f _ f o u n d
addi r3 ,r3 ,1 6
addi r4 ,r4 ,1 6
lvx v9 ,0 ,r3
LD_ V S R _ C R O S S 1 6 B ( r4 ,v4 ,v6 ,v8 ,v10 )
VCMPEQUB_ R C ( v7 ,v9 ,v10 )
vor v6 ,v8 ,v8
bnl c r6 ,. L d i f f o f f s e t _ v m x _ d i f f _ f o u n d
addi r3 ,r3 ,1 6
addi r4 ,r4 ,1 6
bdnz . L d i f f o f f s e t _ v m x _ 3 2 b y t e s l o o p
EXIT_ V M X _ O P S
cmpdi r5 ,0
beq . L z e r o
b . L c m p _ l t 3 2 b y t e s
.Ldiffoffset_vmx_diff_found :
EXIT_ V M X _ O P S
/* anyway, the diff will appear in next 16 bytes */
li r5 ,1 6
b . L c m p _ l t 3 2 b y t e s
# endif
2016-01-13 23:33:46 -05:00
EXPORT_ S Y M B O L ( m e m c m p )