sparc64: sun4v TLB error power off events
We've witnessed a few TLB events causing the machine to power off because of prom_halt. In one case it was an NFS-related area during rmmod. Another was an mmapper of /dev/mem. A more recent one is an ITLB issue with a bad pagesize which could be a hardware bug. Bugs happen but we should try not to power off the machine and/or hang it when possible. This is a DTLB error from an mmapper of /dev/mem: [root@sparcie ~]# SUN4V-DTLB: Error at TPC[fffff80100903e6c], tl 1 SUN4V-DTLB: TPC<0xfffff80100903e6c> SUN4V-DTLB: O7[fffff801081979d0] SUN4V-DTLB: O7<0xfffff801081979d0> SUN4V-DTLB: vaddr[fffff80100000000] ctx[1250] pte[98000000000f0610] error[2] . This is recent mainline for ITLB: [ 3708.179864] SUN4V-ITLB: TPC<0xfffffc010071cefc> [ 3708.188866] SUN4V-ITLB: O7[fffffc010071cee8] [ 3708.197377] SUN4V-ITLB: O7<0xfffffc010071cee8> [ 3708.206539] SUN4V-ITLB: vaddr[e0003] ctx[1a3c] pte[2900000dcc800eeb] error[4] . Normally sun4v_itlb_error_report() and sun4v_dtlb_error_report() would call prom_halt() and drop us to the OF command prompt "ok". This isn't the case for LDOMs and the machine powers off. For the HV-reported error of HV_ENORADDR for HV HV_MMU_MAP_ADDR_TRAP, we cause a SIGBUS error by qualifying it within do_sparc64_fault() for a fault code mask of FAULT_CODE_BAD_RA. This is done when the trap level (%tl) is less than or equal to one ("1"). Otherwise, for %tl > 1, we proceed eventually to die_if_kernel(). The logic of this patch was partially inspired by David Miller's feedback. Power off of large sparc64 machines is painful. Plus die_if_kernel provides more context. A reset sequence isn't brief on large sparc64 machines, but it is better than a power-off/power-on sequence. Cc: sparclinux@vger.kernel.org Signed-off-by: Bob Picco <bob.picco@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
d1105287aa
commit
4ccb927289
@ -102,6 +102,7 @@ struct thread_info {
|
||||
#define FAULT_CODE_ITLB 0x04 /* Miss happened in I-TLB */
|
||||
#define FAULT_CODE_WINFIXUP 0x08 /* Miss happened during spill/fill */
|
||||
#define FAULT_CODE_BLKCOMMIT 0x10 /* Use blk-commit ASI in copy_page */
|
||||
#define FAULT_CODE_BAD_RA 0x20 /* Bad RA for sun4v */
|
||||
|
||||
#if PAGE_SHIFT == 13
|
||||
#define THREAD_SIZE (2*PAGE_SIZE)
|
||||
|
@ -195,6 +195,11 @@ sun4v_tsb_miss_common:
|
||||
ldx [%g2 + TRAP_PER_CPU_PGD_PADDR], %g7
|
||||
|
||||
sun4v_itlb_error:
|
||||
rdpr %tl, %g1
|
||||
cmp %g1, 1
|
||||
ble,pt %icc, sun4v_bad_ra
|
||||
or %g0, FAULT_CODE_BAD_RA | FAULT_CODE_ITLB, %g1
|
||||
|
||||
sethi %hi(sun4v_err_itlb_vaddr), %g1
|
||||
stx %g4, [%g1 + %lo(sun4v_err_itlb_vaddr)]
|
||||
sethi %hi(sun4v_err_itlb_ctx), %g1
|
||||
@ -206,15 +211,10 @@ sun4v_itlb_error:
|
||||
sethi %hi(sun4v_err_itlb_error), %g1
|
||||
stx %o0, [%g1 + %lo(sun4v_err_itlb_error)]
|
||||
|
||||
sethi %hi(1f), %g7
|
||||
rdpr %tl, %g4
|
||||
cmp %g4, 1
|
||||
ble,pt %icc, 1f
|
||||
sethi %hi(2f), %g7
|
||||
ba,pt %xcc, etraptl1
|
||||
or %g7, %lo(2f), %g7
|
||||
|
||||
1: ba,pt %xcc, etrap
|
||||
2: or %g7, %lo(2b), %g7
|
||||
1: or %g7, %lo(1f), %g7
|
||||
mov %l4, %o1
|
||||
call sun4v_itlb_error_report
|
||||
add %sp, PTREGS_OFF, %o0
|
||||
@ -222,6 +222,11 @@ sun4v_itlb_error:
|
||||
/* NOTREACHED */
|
||||
|
||||
sun4v_dtlb_error:
|
||||
rdpr %tl, %g1
|
||||
cmp %g1, 1
|
||||
ble,pt %icc, sun4v_bad_ra
|
||||
or %g0, FAULT_CODE_BAD_RA | FAULT_CODE_DTLB, %g1
|
||||
|
||||
sethi %hi(sun4v_err_dtlb_vaddr), %g1
|
||||
stx %g4, [%g1 + %lo(sun4v_err_dtlb_vaddr)]
|
||||
sethi %hi(sun4v_err_dtlb_ctx), %g1
|
||||
@ -233,21 +238,23 @@ sun4v_dtlb_error:
|
||||
sethi %hi(sun4v_err_dtlb_error), %g1
|
||||
stx %o0, [%g1 + %lo(sun4v_err_dtlb_error)]
|
||||
|
||||
sethi %hi(1f), %g7
|
||||
rdpr %tl, %g4
|
||||
cmp %g4, 1
|
||||
ble,pt %icc, 1f
|
||||
sethi %hi(2f), %g7
|
||||
ba,pt %xcc, etraptl1
|
||||
or %g7, %lo(2f), %g7
|
||||
|
||||
1: ba,pt %xcc, etrap
|
||||
2: or %g7, %lo(2b), %g7
|
||||
1: or %g7, %lo(1f), %g7
|
||||
mov %l4, %o1
|
||||
call sun4v_dtlb_error_report
|
||||
add %sp, PTREGS_OFF, %o0
|
||||
|
||||
/* NOTREACHED */
|
||||
|
||||
/* Shared exit for sun4v_itlb_error / sun4v_dtlb_error when %tl <= 1.
 * Both handlers branch here with the fault code
 * (FAULT_CODE_BAD_RA | FAULT_CODE_{ITLB,DTLB}) already placed in %g1 by
 * the instruction in their branch delay slot.  %g4 appears to hold the
 * faulting virtual address (it is what the error handlers store into
 * sun4v_err_*tlb_vaddr) -- TODO confirm against the TSB miss path.
 */
sun4v_bad_ra:
|
||||
/* Copy %g4 into %g5 before %g4 is overwritten below. */
or %g0, %g4, %g5
|
||||
ba,pt %xcc, sparc64_realfault_common
|
||||
/* Branch delay slot: move the fault code from %g1 into %g4.
 * NOTE(review): presumably sparc64_realfault_common expects the fault
 * code in %g4 and the address in %g5 -- verify at its definition.
 */
or %g1, %g0, %g4
|
||||
|
||||
/* NOTREACHED */
|
||||
|
||||
/* Instruction Access Exception, tl0. */
|
||||
sun4v_iacc:
|
||||
ldxa [%g0] ASI_SCRATCHPAD, %g2
|
||||
|
@ -2104,6 +2104,11 @@ void sun4v_nonresum_overflow(struct pt_regs *regs)
|
||||
atomic_inc(&sun4v_nonresum_oflow_cnt);
|
||||
}
|
||||
|
||||
/*
 * Final step shared by sun4v_itlb_error_report() and
 * sun4v_dtlb_error_report(): pass the trap-time register state to
 * die_if_kernel() with the message "TLB/TSB error".
 *
 * @regs: register state that is forwarded unchanged to die_if_kernel().
 */
static void sun4v_tlb_error(struct pt_regs *regs)
|
||||
{
|
||||
die_if_kernel("TLB/TSB error", regs);
|
||||
}
|
||||
|
||||
unsigned long sun4v_err_itlb_vaddr;
|
||||
unsigned long sun4v_err_itlb_ctx;
|
||||
unsigned long sun4v_err_itlb_pte;
|
||||
@ -2111,8 +2116,7 @@ unsigned long sun4v_err_itlb_error;
|
||||
|
||||
void sun4v_itlb_error_report(struct pt_regs *regs, int tl)
|
||||
{
|
||||
if (tl > 1)
|
||||
dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
|
||||
dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
|
||||
|
||||
printk(KERN_EMERG "SUN4V-ITLB: Error at TPC[%lx], tl %d\n",
|
||||
regs->tpc, tl);
|
||||
@ -2125,7 +2129,7 @@ void sun4v_itlb_error_report(struct pt_regs *regs, int tl)
|
||||
sun4v_err_itlb_vaddr, sun4v_err_itlb_ctx,
|
||||
sun4v_err_itlb_pte, sun4v_err_itlb_error);
|
||||
|
||||
prom_halt();
|
||||
sun4v_tlb_error(regs);
|
||||
}
|
||||
|
||||
unsigned long sun4v_err_dtlb_vaddr;
|
||||
@ -2135,8 +2139,7 @@ unsigned long sun4v_err_dtlb_error;
|
||||
|
||||
void sun4v_dtlb_error_report(struct pt_regs *regs, int tl)
|
||||
{
|
||||
if (tl > 1)
|
||||
dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
|
||||
dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
|
||||
|
||||
printk(KERN_EMERG "SUN4V-DTLB: Error at TPC[%lx], tl %d\n",
|
||||
regs->tpc, tl);
|
||||
@ -2149,7 +2152,7 @@ void sun4v_dtlb_error_report(struct pt_regs *regs, int tl)
|
||||
sun4v_err_dtlb_vaddr, sun4v_err_dtlb_ctx,
|
||||
sun4v_err_dtlb_pte, sun4v_err_dtlb_error);
|
||||
|
||||
prom_halt();
|
||||
sun4v_tlb_error(regs);
|
||||
}
|
||||
|
||||
void hypervisor_tlbop_error(unsigned long err, unsigned long op)
|
||||
|
@ -346,6 +346,9 @@ retry:
|
||||
down_read(&mm->mmap_sem);
|
||||
}
|
||||
|
||||
if (fault_code & FAULT_CODE_BAD_RA)
|
||||
goto do_sigbus;
|
||||
|
||||
vma = find_vma(mm, address);
|
||||
if (!vma)
|
||||
goto bad_area;
|
||||
|
Loading…
Reference in New Issue
Block a user