2005-04-16 15:20:36 -07:00
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
2006-10-04 03:38:54 -04:00
2006-09-26 10:52:32 +02:00
#include <linux/linkage.h>
2016-01-26 22:12:04 +01:00
#include <asm/cpufeatures.h>
2011-07-13 09:24:10 -04:00
#include <asm/alternative-asm.h>
2016-01-11 11:04:34 -05:00
#include <asm/export.h>
2006-09-26 10:52:32 +02:00
x86/lib/copy_page_64.S: Use generic ALTERNATIVE macro
... instead of the semi-version with the spelled out sections.
What is more, make the REP_GOOD version be the default copy_page()
version as the majority of the relevant x86 CPUs do set
X86_FEATURE_REP_GOOD. Thus, copy_page gets compiled to:
ffffffff8130af80 <copy_page>:
ffffffff8130af80: e9 0b 00 00 00 jmpq ffffffff8130af90 <copy_page_regs>
ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx
ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
ffffffff8130af8d: c3 retq
ffffffff8130af8e: 66 90 xchg %ax,%ax
ffffffff8130af90 <copy_page_regs>:
...
and after the alternatives have run, the JMP to the old, unrolled
version gets NOPed out:
ffffffff8130af80 <copy_page>:
ffffffff8130af80: 66 66 90 xchg %ax,%ax
ffffffff8130af83: 66 90 xchg %ax,%ax
ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx
ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
ffffffff8130af8d: c3 retq
On modern uarches, those NOPs are cheaper than the unconditional JMP
previously.
Signed-off-by: Borislav Petkov <bp@suse.de>
2015-01-12 18:19:40 +01:00
/*
 * Some CPUs run faster using the string copy instructions (sane microcode).
 * It is also a lot simpler. Use this when possible. But, don't use streaming
 * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
 * prefetch distance based on SMP/UP.
 */
2006-09-26 10:52:32 +02:00
ALIGN
x86/lib/copy_page_64.S: Use generic ALTERNATIVE macro
... instead of the semi-version with the spelled out sections.
What is more, make the REP_GOOD version be the default copy_page()
version as the majority of the relevant x86 CPUs do set
X86_FEATURE_REP_GOOD. Thus, copy_page gets compiled to:
ffffffff8130af80 <copy_page>:
ffffffff8130af80: e9 0b 00 00 00 jmpq ffffffff8130af90 <copy_page_regs>
ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx
ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
ffffffff8130af8d: c3 retq
ffffffff8130af8e: 66 90 xchg %ax,%ax
ffffffff8130af90 <copy_page_regs>:
...
and after the alternatives have run, the JMP to the old, unrolled
version gets NOPed out:
ffffffff8130af80 <copy_page>:
ffffffff8130af80: 66 66 90 xchg %ax,%ax
ffffffff8130af83: 66 90 xchg %ax,%ax
ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx
ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
ffffffff8130af8d: c3 retq
On modern uarches, those NOPs are cheaper than the unconditional JMP
previously.
Signed-off-by: Borislav Petkov <bp@suse.de>
2015-01-12 18:19:40 +01:00
ENTRY( c o p y _ p a g e )
ALTERNATIVE " j m p c o p y _ p a g e _ r e g s " , " " , X 8 6 _ F E A T U R E _ R E P _ G O O D
2012-10-18 03:52:45 +08:00
movl $ 4 0 9 6 / 8 , % e c x
rep m o v s q
2006-09-26 10:52:32 +02:00
ret
x86/lib/copy_page_64.S: Use generic ALTERNATIVE macro
... instead of the semi-version with the spelled out sections.
What is more, make the REP_GOOD version be the default copy_page()
version as the majority of the relevant x86 CPUs do set
X86_FEATURE_REP_GOOD. Thus, copy_page gets compiled to:
ffffffff8130af80 <copy_page>:
ffffffff8130af80: e9 0b 00 00 00 jmpq ffffffff8130af90 <copy_page_regs>
ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx
ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
ffffffff8130af8d: c3 retq
ffffffff8130af8e: 66 90 xchg %ax,%ax
ffffffff8130af90 <copy_page_regs>:
...
and after the alternatives have run, the JMP to the old, unrolled
version gets NOPed out:
ffffffff8130af80 <copy_page>:
ffffffff8130af80: 66 66 90 xchg %ax,%ax
ffffffff8130af83: 66 90 xchg %ax,%ax
ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx
ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
ffffffff8130af8d: c3 retq
On modern uarches, those NOPs are cheaper than the unconditional JMP
previously.
Signed-off-by: Borislav Petkov <bp@suse.de>
2015-01-12 18:19:40 +01:00
ENDPROC( c o p y _ p a g e )
2016-01-11 11:04:34 -05:00
EXPORT_ S Y M B O L ( c o p y _ p a g e )
2005-04-16 15:20:36 -07:00
x86/lib/copy_page_64.S: Use generic ALTERNATIVE macro
... instead of the semi-version with the spelled out sections.
What is more, make the REP_GOOD version be the default copy_page()
version as the majority of the relevant x86 CPUs do set
X86_FEATURE_REP_GOOD. Thus, copy_page gets compiled to:
ffffffff8130af80 <copy_page>:
ffffffff8130af80: e9 0b 00 00 00 jmpq ffffffff8130af90 <copy_page_regs>
ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx
ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
ffffffff8130af8d: c3 retq
ffffffff8130af8e: 66 90 xchg %ax,%ax
ffffffff8130af90 <copy_page_regs>:
...
and after the alternatives have run, the JMP to the old, unrolled
version gets NOPed out:
ffffffff8130af80 <copy_page>:
ffffffff8130af80: 66 66 90 xchg %ax,%ax
ffffffff8130af83: 66 90 xchg %ax,%ax
ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx
ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi)
ffffffff8130af8d: c3 retq
On modern uarches, those NOPs are cheaper than the unconditional JMP
previously.
Signed-off-by: Borislav Petkov <bp@suse.de>
2015-01-12 18:19:40 +01:00
/*
 * copy_page_regs - unrolled 64-bytes-per-iteration page copy, used on
 * CPUs without X86_FEATURE_REP_GOOD (reached via the ALTERNATIVE jump
 * in copy_page).
 *
 * In:	%rdi = to, %rsi = from
 * Clobbers %rax, %rcx, %rdx, %r8-%r11 and flags; the callee-saved
 * %rbx and %r12 are spilled to the stack around their use.
 */
ENTRY(copy_page_regs)
	subq	$2*8,	%rsp
	movq	%rbx,	(%rsp)
	movq	%r12,	1*8(%rsp)

	/* 64 iterations total; the last 5 run without prefetch (.Loop2) */
	movl	$(4096/64)-5,	%ecx
	.p2align 4
.Loop64:
	dec	%rcx
	movq	0x8*0(%rsi), %rax
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12

	prefetcht0 5*64(%rsi)		/* stay 5 cache lines ahead of the loads */

	movq	%rax, 0x8*0(%rdi)
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8,  0x8*3(%rdi)
	movq	%r9,  0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)

	leaq	64(%rsi), %rsi
	leaq	64(%rdi), %rdi

	jnz	.Loop64

	/* tail: last 5*64 bytes, no prefetch (would read past the page) */
	movl	$5, %ecx
	.p2align 4
.Loop2:
	decl	%ecx

	movq	0x8*0(%rsi), %rax
	movq	0x8*1(%rsi), %rbx
	movq	0x8*2(%rsi), %rdx
	movq	0x8*3(%rsi), %r8
	movq	0x8*4(%rsi), %r9
	movq	0x8*5(%rsi), %r10
	movq	0x8*6(%rsi), %r11
	movq	0x8*7(%rsi), %r12
	movq	%rax, 0x8*0(%rdi)
	movq	%rbx, 0x8*1(%rdi)
	movq	%rdx, 0x8*2(%rdi)
	movq	%r8,  0x8*3(%rdi)
	movq	%r9,  0x8*4(%rdi)
	movq	%r10, 0x8*5(%rdi)
	movq	%r11, 0x8*6(%rdi)
	movq	%r12, 0x8*7(%rdi)
	leaq	64(%rdi), %rdi
	leaq	64(%rsi), %rsi

	jnz	.Loop2

	/* restore callee-saved registers and release the spill slots */
	movq	(%rsp), %rbx
	movq	1*8(%rsp), %r12
	addq	$2*8, %rsp
	ret
ENDPROC(copy_page_regs)