powerpc/64s: Improve RFI L1-D cache flush fallback
The fallback RFI flush is used when firmware does not provide a way to flush the cache. It's a "displacement flush" that evicts useful data by displacing it with an uninteresting buffer. The flush has to take care to work with implementation specific cache replacment policies, so the recipe has been in flux. The initial slow but conservative approach is to touch all lines of a congruence class, with dependencies between each load. It has since been determined that a linear pattern of loads without dependencies is sufficient, and is significantly faster. Measuring the speed of a null syscall with RFI fallback flush enabled gives the relative improvement: P8 - 1.83x P9 - 1.75x The flush also becomes simpler and more adaptable to different cache geometries. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
parent
35adacd6fc
commit
bdcb1aefc5
@ -239,8 +239,7 @@ struct paca_struct {
|
||||
*/
|
||||
u64 exrfi[EX_SIZE] __aligned(0x80);
|
||||
void *rfi_flush_fallback_area;
|
||||
u64 l1d_flush_congruence;
|
||||
u64 l1d_flush_sets;
|
||||
u64 l1d_flush_size;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
@ -239,8 +239,7 @@ int main(void)
|
||||
OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
|
||||
OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
|
||||
OFFSET(PACA_EXRFI, paca_struct, exrfi);
|
||||
OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence);
|
||||
OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets);
|
||||
OFFSET(PACA_L1D_FLUSH_SIZE, paca_struct, l1d_flush_size);
|
||||
|
||||
#endif
|
||||
OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
|
||||
|
@ -1461,39 +1461,37 @@ TRAMP_REAL_BEGIN(rfi_flush_fallback)
|
||||
std r9,PACA_EXRFI+EX_R9(r13)
|
||||
std r10,PACA_EXRFI+EX_R10(r13)
|
||||
std r11,PACA_EXRFI+EX_R11(r13)
|
||||
std r12,PACA_EXRFI+EX_R12(r13)
|
||||
std r8,PACA_EXRFI+EX_R13(r13)
|
||||
mfctr r9
|
||||
ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
|
||||
ld r11,PACA_L1D_FLUSH_SETS(r13)
|
||||
ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
|
||||
/*
|
||||
* The load adresses are at staggered offsets within cachelines,
|
||||
* which suits some pipelines better (on others it should not
|
||||
* hurt).
|
||||
*/
|
||||
addi r12,r12,8
|
||||
ld r11,PACA_L1D_FLUSH_SIZE(r13)
|
||||
srdi r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
|
||||
mtctr r11
|
||||
DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
|
||||
|
||||
/* order ld/st prior to dcbt stop all streams with flushing */
|
||||
sync
|
||||
1: li r8,0
|
||||
.rept 8 /* 8-way set associative */
|
||||
ldx r11,r10,r8
|
||||
add r8,r8,r12
|
||||
xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not
|
||||
add r8,r8,r11 // Add 0, this creates a dependency on the ldx
|
||||
.endr
|
||||
addi r10,r10,128 /* 128 byte cache line */
|
||||
|
||||
/*
|
||||
* The load adresses are at staggered offsets within cachelines,
|
||||
* which suits some pipelines better (on others it should not
|
||||
* hurt).
|
||||
*/
|
||||
1:
|
||||
ld r11,(0x80 + 8)*0(r10)
|
||||
ld r11,(0x80 + 8)*1(r10)
|
||||
ld r11,(0x80 + 8)*2(r10)
|
||||
ld r11,(0x80 + 8)*3(r10)
|
||||
ld r11,(0x80 + 8)*4(r10)
|
||||
ld r11,(0x80 + 8)*5(r10)
|
||||
ld r11,(0x80 + 8)*6(r10)
|
||||
ld r11,(0x80 + 8)*7(r10)
|
||||
addi r10,r10,0x80*8
|
||||
bdnz 1b
|
||||
|
||||
mtctr r9
|
||||
ld r9,PACA_EXRFI+EX_R9(r13)
|
||||
ld r10,PACA_EXRFI+EX_R10(r13)
|
||||
ld r11,PACA_EXRFI+EX_R11(r13)
|
||||
ld r12,PACA_EXRFI+EX_R12(r13)
|
||||
ld r8,PACA_EXRFI+EX_R13(r13)
|
||||
GET_SCRATCH0(r13);
|
||||
rfid
|
||||
|
||||
@ -1503,39 +1501,37 @@ TRAMP_REAL_BEGIN(hrfi_flush_fallback)
|
||||
std r9,PACA_EXRFI+EX_R9(r13)
|
||||
std r10,PACA_EXRFI+EX_R10(r13)
|
||||
std r11,PACA_EXRFI+EX_R11(r13)
|
||||
std r12,PACA_EXRFI+EX_R12(r13)
|
||||
std r8,PACA_EXRFI+EX_R13(r13)
|
||||
mfctr r9
|
||||
ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
|
||||
ld r11,PACA_L1D_FLUSH_SETS(r13)
|
||||
ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
|
||||
/*
|
||||
* The load adresses are at staggered offsets within cachelines,
|
||||
* which suits some pipelines better (on others it should not
|
||||
* hurt).
|
||||
*/
|
||||
addi r12,r12,8
|
||||
ld r11,PACA_L1D_FLUSH_SIZE(r13)
|
||||
srdi r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
|
||||
mtctr r11
|
||||
DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
|
||||
|
||||
/* order ld/st prior to dcbt stop all streams with flushing */
|
||||
sync
|
||||
1: li r8,0
|
||||
.rept 8 /* 8-way set associative */
|
||||
ldx r11,r10,r8
|
||||
add r8,r8,r12
|
||||
xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not
|
||||
add r8,r8,r11 // Add 0, this creates a dependency on the ldx
|
||||
.endr
|
||||
addi r10,r10,128 /* 128 byte cache line */
|
||||
|
||||
/*
|
||||
* The load adresses are at staggered offsets within cachelines,
|
||||
* which suits some pipelines better (on others it should not
|
||||
* hurt).
|
||||
*/
|
||||
1:
|
||||
ld r11,(0x80 + 8)*0(r10)
|
||||
ld r11,(0x80 + 8)*1(r10)
|
||||
ld r11,(0x80 + 8)*2(r10)
|
||||
ld r11,(0x80 + 8)*3(r10)
|
||||
ld r11,(0x80 + 8)*4(r10)
|
||||
ld r11,(0x80 + 8)*5(r10)
|
||||
ld r11,(0x80 + 8)*6(r10)
|
||||
ld r11,(0x80 + 8)*7(r10)
|
||||
addi r10,r10,0x80*8
|
||||
bdnz 1b
|
||||
|
||||
mtctr r9
|
||||
ld r9,PACA_EXRFI+EX_R9(r13)
|
||||
ld r10,PACA_EXRFI+EX_R10(r13)
|
||||
ld r11,PACA_EXRFI+EX_R11(r13)
|
||||
ld r12,PACA_EXRFI+EX_R12(r13)
|
||||
ld r8,PACA_EXRFI+EX_R13(r13)
|
||||
GET_SCRATCH0(r13);
|
||||
hrfid
|
||||
|
||||
|
@ -875,19 +875,8 @@ static void init_fallback_flush(void)
|
||||
memset(l1d_flush_fallback_area, 0, l1d_size * 2);
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
/*
|
||||
* The fallback flush is currently coded for 8-way
|
||||
* associativity. Different associativity is possible, but it
|
||||
* will be treated as 8-way and may not evict the lines as
|
||||
* effectively.
|
||||
*
|
||||
* 128 byte lines are mandatory.
|
||||
*/
|
||||
u64 c = l1d_size / 8;
|
||||
|
||||
paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
|
||||
paca[cpu].l1d_flush_congruence = c;
|
||||
paca[cpu].l1d_flush_sets = c / 128;
|
||||
paca[cpu].l1d_flush_size = l1d_size;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2377,8 +2377,6 @@ static void dump_one_paca(int cpu)
|
||||
printf(" slb_cache[%d]: = 0x%016lx\n", i, p->slb_cache[i]);
|
||||
|
||||
DUMP(p, rfi_flush_fallback_area, "px");
|
||||
DUMP(p, l1d_flush_congruence, "llx");
|
||||
DUMP(p, l1d_flush_sets, "llx");
|
||||
#endif
|
||||
DUMP(p, dscr_default, "llx");
|
||||
#ifdef CONFIG_PPC_BOOK3E
|
||||
|
Loading…
Reference in New Issue
Block a user