8c9b6a88b7
The old version was oddly written to have the repeat count in multiple registers. So instead of taking advantage of %rax being zero, it had some sub-counts in it. All just for a "single word clearing" loop, which isn't even efficient to begin with. So get rid of those games, and just keep all the state in the same registers we got it in (and that we should return things in). That not only makes this act much more like 'rep stos' (which this function is replacing), but makes it much easier to actually do the obvious loop unrolling. Also rename the function from the now nonsensical 'clear_user_original' to what it now clearly is: 'rep_stos_alternative'. End result: if we don't have a fast 'rep stosb', at least we can have a fast fallback for it. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
142 lines
2.8 KiB
ArmAsm
142 lines
2.8 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
#include <linux/linkage.h>
|
|
#include <asm/asm.h>
|
|
#include <asm/export.h>
|
|
|
|
/*
 * Most CPUs support the enhanced REP MOVSB/STOSB instructions. Using them
 * is recommended whenever possible, and we do use them by default
 * (see clear_page_erms). If enhanced REP MOVSB/STOSB is not available,
 * try the fast string variant (see clear_page_rep). Otherwise, fall back
 * to the original unrolled store loop (see clear_page_orig).
 */
|
|
|
|
/*
 * clear_page_rep - zero one page using 'rep stosq'.
 *
 * Input:
 *   %rdi - page
 *
 * The page is written as 4096/8 quadwords of zero.
 * Modifies %rax, %rcx and %rdi.
 */
SYM_FUNC_START(clear_page_rep)
	xorl	%eax,%eax		/* store value: zero */
	movl	$4096/8,%ecx		/* count in quadwords */
	rep stosq
	RET
SYM_FUNC_END(clear_page_rep)
EXPORT_SYMBOL_GPL(clear_page_rep)
|
|
|
|
/* Store the zero in %rax into quadword slot 'x' of the current 64-byte chunk. */
#define PUT(x) movq %rax,x*8(%rdi)

/*
 * clear_page_orig - zero one page with an unrolled loop of plain stores.
 *
 * Input:
 *   %rdi - page
 *
 * Runs 4096/64 iterations, each writing eight quadwords (64 bytes).
 * Modifies %rax, %rcx and %rdi.
 */
SYM_FUNC_START(clear_page_orig)
	movl	$4096/64,%ecx		/* number of 64-byte chunks */
	xorl	%eax,%eax		/* store value: zero */
	.p2align 4
.Lclear_chunk:
	/*
	 * The counter is decremented first; none of the movq/leaq
	 * instructions below touch the flags, so the jnz at the bottom
	 * still tests the result of this decl.
	 */
	decl	%ecx
	movq	%rax,(%rdi)
	PUT(1)
	PUT(2)
	PUT(3)
	PUT(4)
	PUT(5)
	PUT(6)
	PUT(7)
	leaq	64(%rdi),%rdi		/* advance; lea keeps the flags intact */
	jnz	.Lclear_chunk
	nop
	RET
SYM_FUNC_END(clear_page_orig)
EXPORT_SYMBOL_GPL(clear_page_orig)
|
|
|
|
/*
 * clear_page_erms - zero one page using 'rep stosb'.
 *
 * Input:
 *   %rdi - page
 *
 * The page is written as 4096 single bytes of zero.
 * Modifies %rax, %rcx and %rdi.
 */
SYM_FUNC_START(clear_page_erms)
	xorl	%eax,%eax		/* store value: zero */
	movl	$4096,%ecx		/* count in bytes */
	rep stosb
	RET
SYM_FUNC_END(clear_page_erms)
EXPORT_SYMBOL_GPL(clear_page_erms)
|
|
|
|
/*
 * rep_stos_alternative - default clear of user-space memory.
 *
 * Input:
 *   %rdi - destination
 *   %rcx - count in bytes
 *   %rax - zero (supplied by the caller)
 *
 * Output:
 *   %rcx - uncleared bytes, or 0 if successful
 *
 * The cursor stays in %rdi and the remaining count in %rcx throughout.
 */
SYM_FUNC_START(rep_stos_alternative)
	cmpq	$64,%rcx
	jae	.Lblock64		/* 64 bytes or more: unrolled loop */

	cmpl	$8,%ecx
	jae	.Lqword			/* 8..63 bytes: quadword loop */

	testl	%ecx,%ecx
	jz	.Ldone			/* nothing to clear */

	/* Byte loop: %rcx is non-zero whenever we get here. */
.Lbyte_tail:
0:	movb	%al,(%rdi)
	incq	%rdi
	decq	%rcx
	jnz	.Lbyte_tail
.Ldone:
	RET

	/*
	 * Quadword loop: only entered with 8 <= %rcx < 64, so the
	 * 32-bit arithmetic on %ecx covers the whole count.
	 */
.Lqword:
1:	movq	%rax,(%rdi)
	addq	$8,%rdi
	subl	$8,%ecx
	jz	.Ldone
	cmpl	$8,%ecx
	jae	.Lqword
	jmp	.Lbyte_tail

	/* Unrolled loop: 64 bytes per iteration while %rcx >= 64. */
	.p2align 4
.Lblock64:
10:	movq	%rax,(%rdi)
11:	movq	%rax,8(%rdi)
12:	movq	%rax,16(%rdi)
13:	movq	%rax,24(%rdi)
14:	movq	%rax,32(%rdi)
15:	movq	%rax,40(%rdi)
16:	movq	%rax,48(%rdi)
17:	movq	%rax,56(%rdi)
	addq	$64,%rdi
	subq	$64,%rcx
	cmpq	$64,%rcx
	jae	.Lblock64
	cmpl	$8,%ecx
	jae	.Lqword
	testl	%ecx,%ecx
	jnz	.Lbyte_tail
	RET

	/*
	 * Exception fixups.
	 *
	 * A fault on the byte store simply returns: %rcx already holds
	 * the number of bytes that were not cleared.
	 *
	 * A fault on any quadword store happens with %rcx non-zero, so
	 * the fixup just enters the byte loop and lets it work out the
	 * exact count. For the unrolled loop this may clear some bytes
	 * a second time, which is harmless.
	 *
	 * %rdi could be used to avoid taking a second fault in the exact
	 * count case, and %rdi could be aligned before the unrolled loop,
	 * but neither is worth the trouble: unaligned stores just aren't
	 * that common or expensive.
	 */
	_ASM_EXTABLE_UA( 0b, .Ldone)
	_ASM_EXTABLE_UA( 1b, .Lbyte_tail)
	_ASM_EXTABLE_UA(10b, .Lbyte_tail)
	_ASM_EXTABLE_UA(11b, .Lbyte_tail)
	_ASM_EXTABLE_UA(12b, .Lbyte_tail)
	_ASM_EXTABLE_UA(13b, .Lbyte_tail)
	_ASM_EXTABLE_UA(14b, .Lbyte_tail)
	_ASM_EXTABLE_UA(15b, .Lbyte_tail)
	_ASM_EXTABLE_UA(16b, .Lbyte_tail)
	_ASM_EXTABLE_UA(17b, .Lbyte_tail)
SYM_FUNC_END(rep_stos_alternative)
EXPORT_SYMBOL(rep_stos_alternative)
|