44922150d8
If we have a series of events from userpsace, with %fprs=FPRS_FEF, like follows: ETRAP ETRAP VIS_ENTRY(fprs=0x4) VIS_EXIT RTRAP (kernel FPU restore with fpu_saved=0x4) RTRAP We will not restore the user registers that were clobbered by the FPU using kernel code in the inner-most trap. Traps allocate FPU save slots in the thread struct, and FPU using sequences save the "dirty" FPU registers only. This works at the initial trap level because all of the registers get recorded into the top-level FPU save area, and we'll return to userspace with the FPU disabled so that any FPU use by the user will take an FPU disabled trap wherein we'll load the registers back up properly. But this is not how trap returns from kernel to kernel operate. The simplest fix for this bug is to always save all FPU register state for anything other than the top-most FPU save area. Getting rid of the optimized inner-slot FPU saving code ends up making VISEntryHalf degenerate into plain VISEntry. Longer term we need to do something smarter to reinstate the partial save optimizations. Perhaps the fundament error is having trap entry and exit allocate FPU save slots and restore register state. Instead, the VISEntry et al. calls should be doing that work. This bug is about two decades old. Reported-by: James Y Knight <jyknight@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
376 lines
8.4 KiB
ArmAsm
376 lines
8.4 KiB
ArmAsm
/* NG4memcpy.S: Niagara-4 optimized memcpy.
|
|
*
|
|
* Copyright (C) 2012 David S. Miller (davem@davemloft.net)
|
|
*/
|
|
|
|
#ifdef __KERNEL__
|
|
#include <asm/visasm.h>
|
|
#include <asm/asi.h>
|
|
#define GLOBAL_SPARE %g7
|
|
#else
|
|
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
|
|
#define FPRS_FEF 0x04
|
|
|
|
/* On T4 it is very expensive to access ASRs like %fprs and
|
|
* %asi, avoiding a read or a write can save ~50 cycles.
|
|
*/
|
|
#define FPU_ENTER \
|
|
rd %fprs, %o5; \
|
|
andcc %o5, FPRS_FEF, %g0; \
|
|
be,a,pn %icc, 999f; \
|
|
wr %g0, FPRS_FEF, %fprs; \
|
|
999:
|
|
|
|
#ifdef MEMCPY_DEBUG
|
|
#define VISEntryHalf FPU_ENTER; \
|
|
clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
|
|
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
|
|
#else
|
|
#define VISEntryHalf FPU_ENTER
|
|
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
|
|
#endif
|
|
|
|
#define GLOBAL_SPARE %g5
|
|
#endif
|
|
|
|
#ifndef STORE_ASI
|
|
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
|
|
#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
|
|
#else
|
|
#define STORE_ASI 0x80 /* ASI_P */
|
|
#endif
|
|
#endif
|
|
|
|
#if !defined(EX_LD) && !defined(EX_ST)
|
|
#define NON_USER_COPY
|
|
#endif
|
|
|
|
#ifndef EX_LD
|
|
#define EX_LD(x) x
|
|
#endif
|
|
|
|
#ifndef EX_ST
|
|
#define EX_ST(x) x
|
|
#endif
|
|
|
|
#ifndef EX_RETVAL
|
|
#define EX_RETVAL(x) x
|
|
#endif
|
|
|
|
#ifndef LOAD
|
|
#define LOAD(type,addr,dest) type [addr], dest
|
|
#endif
|
|
|
|
#ifndef STORE
|
|
#ifndef MEMCPY_DEBUG
|
|
#define STORE(type,src,addr) type src, [addr]
|
|
#else
|
|
#define STORE(type,src,addr) type##a src, [addr] %asi
|
|
#endif
|
|
#endif
|
|
|
|
#ifndef STORE_INIT
|
|
#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
|
|
#endif
|
|
|
|
#ifndef FUNC_NAME
|
|
#define FUNC_NAME NG4memcpy
|
|
#endif
|
|
#ifndef PREAMBLE
|
|
#define PREAMBLE
|
|
#endif
|
|
|
|
#ifndef XCC
|
|
#define XCC xcc
|
|
#endif
|
|
|
|
.register %g2,#scratch
|
|
.register %g3,#scratch
|
|
|
|
.text
|
|
.align 64
|
|
|
|
.globl FUNC_NAME
|
|
.type FUNC_NAME,#function
|
|
FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
|
|
#ifdef MEMCPY_DEBUG
|
|
wr %g0, 0x80, %asi
|
|
#endif
|
|
srlx %o2, 31, %g2
|
|
cmp %g2, 0
|
|
tne %XCC, 5
|
|
PREAMBLE
|
|
mov %o0, %o3
|
|
brz,pn %o2, .Lexit
|
|
cmp %o2, 3
|
|
ble,pn %icc, .Ltiny
|
|
cmp %o2, 19
|
|
ble,pn %icc, .Lsmall
|
|
or %o0, %o1, %g2
|
|
cmp %o2, 128
|
|
bl,pn %icc, .Lmedium
|
|
nop
|
|
|
|
.Llarge:/* len >= 0x80 */
|
|
/* First get dest 8 byte aligned. */
|
|
sub %g0, %o0, %g1
|
|
and %g1, 0x7, %g1
|
|
brz,pt %g1, 51f
|
|
sub %o2, %g1, %o2
|
|
|
|
1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
|
|
add %o1, 1, %o1
|
|
subcc %g1, 1, %g1
|
|
add %o0, 1, %o0
|
|
bne,pt %icc, 1b
|
|
EX_ST(STORE(stb, %g2, %o0 - 0x01))
|
|
|
|
51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
|
|
LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
|
|
LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
|
|
LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
|
|
LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
|
|
LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
|
|
LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
|
|
LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
|
|
|
|
/* Check if we can use the straight fully aligned
|
|
* loop, or we require the alignaddr/faligndata variant.
|
|
*/
|
|
andcc %o1, 0x7, %o5
|
|
bne,pn %icc, .Llarge_src_unaligned
|
|
sub %g0, %o0, %g1
|
|
|
|
/* Legitimize the use of initializing stores by getting dest
|
|
* to be 64-byte aligned.
|
|
*/
|
|
and %g1, 0x3f, %g1
|
|
brz,pt %g1, .Llarge_aligned
|
|
sub %o2, %g1, %o2
|
|
|
|
1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
|
|
add %o1, 8, %o1
|
|
subcc %g1, 8, %g1
|
|
add %o0, 8, %o0
|
|
bne,pt %icc, 1b
|
|
EX_ST(STORE(stx, %g2, %o0 - 0x08))
|
|
|
|
.Llarge_aligned:
|
|
/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
|
|
andn %o2, 0x3f, %o4
|
|
sub %o2, %o4, %o2
|
|
|
|
1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
|
|
add %o1, 0x40, %o1
|
|
EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
|
|
subcc %o4, 0x40, %o4
|
|
EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
|
|
EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
|
|
EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
|
|
EX_ST(STORE_INIT(%g1, %o0))
|
|
add %o0, 0x08, %o0
|
|
EX_ST(STORE_INIT(%g2, %o0))
|
|
add %o0, 0x08, %o0
|
|
EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
|
|
EX_ST(STORE_INIT(%g3, %o0))
|
|
add %o0, 0x08, %o0
|
|
EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
|
|
EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
|
|
add %o0, 0x08, %o0
|
|
EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
|
|
EX_ST(STORE_INIT(%o5, %o0))
|
|
add %o0, 0x08, %o0
|
|
EX_ST(STORE_INIT(%g2, %o0))
|
|
add %o0, 0x08, %o0
|
|
EX_ST(STORE_INIT(%g3, %o0))
|
|
add %o0, 0x08, %o0
|
|
EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
|
|
add %o0, 0x08, %o0
|
|
bne,pt %icc, 1b
|
|
LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
|
|
|
|
membar #StoreLoad | #StoreStore
|
|
|
|
brz,pn %o2, .Lexit
|
|
cmp %o2, 19
|
|
ble,pn %icc, .Lsmall_unaligned
|
|
nop
|
|
ba,a,pt %icc, .Lmedium_noprefetch
|
|
|
|
.Lexit: retl
|
|
mov EX_RETVAL(%o3), %o0
|
|
|
|
.Llarge_src_unaligned:
|
|
#ifdef NON_USER_COPY
|
|
VISEntryHalfFast(.Lmedium_vis_entry_fail)
|
|
#else
|
|
VISEntryHalf
|
|
#endif
|
|
andn %o2, 0x3f, %o4
|
|
sub %o2, %o4, %o2
|
|
alignaddr %o1, %g0, %g1
|
|
add %o1, %o4, %o1
|
|
EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
|
|
1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
|
|
subcc %o4, 0x40, %o4
|
|
EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
|
|
EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
|
|
EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
|
|
EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
|
|
EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
|
|
EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
|
|
faligndata %f0, %f2, %f16
|
|
EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
|
|
faligndata %f2, %f4, %f18
|
|
add %g1, 0x40, %g1
|
|
faligndata %f4, %f6, %f20
|
|
faligndata %f6, %f8, %f22
|
|
faligndata %f8, %f10, %f24
|
|
faligndata %f10, %f12, %f26
|
|
faligndata %f12, %f14, %f28
|
|
faligndata %f14, %f0, %f30
|
|
EX_ST(STORE(std, %f16, %o0 + 0x00))
|
|
EX_ST(STORE(std, %f18, %o0 + 0x08))
|
|
EX_ST(STORE(std, %f20, %o0 + 0x10))
|
|
EX_ST(STORE(std, %f22, %o0 + 0x18))
|
|
EX_ST(STORE(std, %f24, %o0 + 0x20))
|
|
EX_ST(STORE(std, %f26, %o0 + 0x28))
|
|
EX_ST(STORE(std, %f28, %o0 + 0x30))
|
|
EX_ST(STORE(std, %f30, %o0 + 0x38))
|
|
add %o0, 0x40, %o0
|
|
bne,pt %icc, 1b
|
|
LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
|
|
#ifdef NON_USER_COPY
|
|
VISExitHalfFast
|
|
#else
|
|
VISExitHalf
|
|
#endif
|
|
brz,pn %o2, .Lexit
|
|
cmp %o2, 19
|
|
ble,pn %icc, .Lsmall_unaligned
|
|
nop
|
|
ba,a,pt %icc, .Lmedium_unaligned
|
|
|
|
#ifdef NON_USER_COPY
|
|
.Lmedium_vis_entry_fail:
|
|
or %o0, %o1, %g2
|
|
#endif
|
|
.Lmedium:
|
|
LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
|
|
andcc %g2, 0x7, %g0
|
|
bne,pn %icc, .Lmedium_unaligned
|
|
nop
|
|
.Lmedium_noprefetch:
|
|
andncc %o2, 0x20 - 1, %o5
|
|
be,pn %icc, 2f
|
|
sub %o2, %o5, %o2
|
|
1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
|
|
EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
|
|
EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
|
|
EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
|
|
add %o1, 0x20, %o1
|
|
subcc %o5, 0x20, %o5
|
|
EX_ST(STORE(stx, %g1, %o0 + 0x00))
|
|
EX_ST(STORE(stx, %g2, %o0 + 0x08))
|
|
EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
|
|
EX_ST(STORE(stx, %o4, %o0 + 0x18))
|
|
bne,pt %icc, 1b
|
|
add %o0, 0x20, %o0
|
|
2: andcc %o2, 0x18, %o5
|
|
be,pt %icc, 3f
|
|
sub %o2, %o5, %o2
|
|
1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
|
|
add %o1, 0x08, %o1
|
|
add %o0, 0x08, %o0
|
|
subcc %o5, 0x08, %o5
|
|
bne,pt %icc, 1b
|
|
EX_ST(STORE(stx, %g1, %o0 - 0x08))
|
|
3: brz,pt %o2, .Lexit
|
|
cmp %o2, 0x04
|
|
bl,pn %icc, .Ltiny
|
|
nop
|
|
EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
|
|
add %o1, 0x04, %o1
|
|
add %o0, 0x04, %o0
|
|
subcc %o2, 0x04, %o2
|
|
bne,pn %icc, .Ltiny
|
|
EX_ST(STORE(stw, %g1, %o0 - 0x04))
|
|
ba,a,pt %icc, .Lexit
|
|
.Lmedium_unaligned:
|
|
/* First get dest 8 byte aligned. */
|
|
sub %g0, %o0, %g1
|
|
and %g1, 0x7, %g1
|
|
brz,pt %g1, 2f
|
|
sub %o2, %g1, %o2
|
|
|
|
1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
|
|
add %o1, 1, %o1
|
|
subcc %g1, 1, %g1
|
|
add %o0, 1, %o0
|
|
bne,pt %icc, 1b
|
|
EX_ST(STORE(stb, %g2, %o0 - 0x01))
|
|
2:
|
|
and %o1, 0x7, %g1
|
|
brz,pn %g1, .Lmedium_noprefetch
|
|
sll %g1, 3, %g1
|
|
mov 64, %g2
|
|
sub %g2, %g1, %g2
|
|
andn %o1, 0x7, %o1
|
|
EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
|
|
sllx %o4, %g1, %o4
|
|
andn %o2, 0x08 - 1, %o5
|
|
sub %o2, %o5, %o2
|
|
1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
|
|
add %o1, 0x08, %o1
|
|
subcc %o5, 0x08, %o5
|
|
srlx %g3, %g2, GLOBAL_SPARE
|
|
or GLOBAL_SPARE, %o4, GLOBAL_SPARE
|
|
EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
|
|
add %o0, 0x08, %o0
|
|
bne,pt %icc, 1b
|
|
sllx %g3, %g1, %o4
|
|
srl %g1, 3, %g1
|
|
add %o1, %g1, %o1
|
|
brz,pn %o2, .Lexit
|
|
nop
|
|
ba,pt %icc, .Lsmall_unaligned
|
|
|
|
.Ltiny:
|
|
EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
|
|
subcc %o2, 1, %o2
|
|
be,pn %icc, .Lexit
|
|
EX_ST(STORE(stb, %g1, %o0 + 0x00))
|
|
EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
|
|
subcc %o2, 1, %o2
|
|
be,pn %icc, .Lexit
|
|
EX_ST(STORE(stb, %g1, %o0 + 0x01))
|
|
EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
|
|
ba,pt %icc, .Lexit
|
|
EX_ST(STORE(stb, %g1, %o0 + 0x02))
|
|
|
|
.Lsmall:
|
|
andcc %g2, 0x3, %g0
|
|
bne,pn %icc, .Lsmall_unaligned
|
|
andn %o2, 0x4 - 1, %o5
|
|
sub %o2, %o5, %o2
|
|
1:
|
|
EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
|
|
add %o1, 0x04, %o1
|
|
subcc %o5, 0x04, %o5
|
|
add %o0, 0x04, %o0
|
|
bne,pt %icc, 1b
|
|
EX_ST(STORE(stw, %g1, %o0 - 0x04))
|
|
brz,pt %o2, .Lexit
|
|
nop
|
|
ba,a,pt %icc, .Ltiny
|
|
|
|
.Lsmall_unaligned:
|
|
1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
|
|
add %o1, 1, %o1
|
|
add %o0, 1, %o0
|
|
subcc %o2, 1, %o2
|
|
bne,pt %icc, 1b
|
|
EX_ST(STORE(stb, %g1, %o0 - 0x01))
|
|
ba,a,pt %icc, .Lexit
|
|
.size FUNC_NAME, .-FUNC_NAME
|