Benjamin Herrenschmidt 7230c56441 powerpc: Rework lazy-interrupt handling
The current implementation of lazy interrupts handling has some
issues that this tries to address.

We don't do the various workarounds we need to do when re-enabling
interrupts in some cases such as when returning from an interrupt
and thus we may still lose or get delayed decrementer or doorbell
interrupts.

The current scheme also makes it much harder to handle the external
"edge" interrupts provided by some BookE processors when using the
EPR facility (External Proxy) and the Freescale Hypervisor.

Additionally, we tend to keep interrupts hard disabled in a number
of cases, such as decrementer interrupts, external interrupts, or
when a masked decrementer interrupt is pending. This is sub-optimal.

This is an attempt at fixing it all in one go by reworking the way
we do the lazy interrupt disabling from the ground up.

The base idea is to replace the "hard_enabled" field with a
"irq_happened" field in which we store a bit mask of what interrupt
occurred while soft-disabled.

When re-enabling, either via arch_local_irq_restore() or when returning
from an interrupt, we can now decide what to do by testing bits in that
field.

We then implement replaying of the missed interrupts either by
re-using the existing exception frame (in exception exit case) or via
the creation of a new one from an assembly trampoline (in the
arch_local_irq_enable case).

This removes the need to play with the decrementer to try to create
fake interrupts, among others.

In addition, this adds a few refinements:

 - We no longer  hard disable decrementer interrupts that occur
while soft-disabled. We now simply bump the decrementer back to max
(on BookS) or leave it stopped (on BookE) and continue with hard interrupts
enabled, which means that we'll potentially get better sample quality from
performance monitor interrupts.

 - Timer, decrementer and doorbell interrupts now hard-enable
shortly after removing the source of the interrupt, which means
they no longer run entirely hard disabled. Again, this will improve
perf sample quality.

 - On Book3E 64-bit, we now make the performance monitor interrupt
act as an NMI like Book3S (the necessary C code for that to work
appear to already be present in the FSL perf code, notably calling
nmi_enter instead of irq_enter). (This also fixes a bug where BookE
perfmon interrupts could clobber r14 ... oops)

 - We could make "masked" decrementer interrupts act as NMIs when doing
timer-based perf sampling to improve the sample quality.

Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---

v2:

- Add hard-enable to decrementer, timer and doorbells
- Fix CR clobber in masked irq handling on BookE
- Make embedded perf interrupt act as an NMI
- Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want
  to retrigger an interrupt without preventing hard-enable

v3:

 - Fix or vs. ori bug on Book3E
 - Fix enabling of interrupts for some exceptions on Book3E

v4:

 - Fix resend of doorbells on return from interrupt on Book3E

v5:

 - Rebased on top of my latest series, which involves some significant
rework of some aspects of the patch.

v6:
 - 32-bit compile fix
 - more compile fixes with various .config combos
 - factor out the asm code to soft-disable interrupts
 - remove the C wrapper around preempt_schedule_irq

v7:
 - Fix a bug with hard irq state tracking on native power7
2012-03-09 13:25:06 +11:00

184 lines
6.0 KiB
C

/*
* This control block defines the PACA which defines the processor
* specific data for each logical processor on the system.
* There are some pointers defined that are utilized by PLIC.
*
* C 2001 PPC 64 Team, IBM Corp
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _ASM_POWERPC_PACA_H
#define _ASM_POWERPC_PACA_H
#ifdef __KERNEL__
#ifdef CONFIG_PPC64
#include <linux/init.h>
#include <asm/types.h>
#include <asm/lppaca.h>
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/exception-64e.h>
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
#include <asm/kvm_book3s_asm.h>
#endif
register struct paca_struct *local_paca asm("r13");
#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_SMP)
extern unsigned int debug_smp_processor_id(void); /* from linux/smp.h */
/*
* Add standard checks that preemption cannot occur when using get_paca():
* otherwise the paca_struct it points to may be the wrong one just after.
*/
#define get_paca() ((void) debug_smp_processor_id(), local_paca)
#else
#define get_paca() local_paca
#endif
#define get_lppaca() (get_paca()->lppaca_ptr)
#define get_slb_shadow() (get_paca()->slb_shadow_ptr)
struct task_struct;
struct opal_machine_check_event;
/*
* Defines the layout of the paca.
*
* This structure is not directly accessed by firmware or the service
* processor.
*/
struct paca_struct {
#ifdef CONFIG_PPC_BOOK3S
/*
* Because hw_cpu_id, unlike other paca fields, is accessed
* routinely from other CPUs (from the IRQ code), we stick to
* read-only (after boot) fields in the first cacheline to
* avoid cacheline bouncing.
*/
struct lppaca *lppaca_ptr; /* Pointer to LpPaca for PLIC */
#endif /* CONFIG_PPC_BOOK3S */
/*
* MAGIC: the spinlock functions in arch/powerpc/lib/locks.c
* load lock_token and paca_index with a single lwz
* instruction. They must travel together and be properly
* aligned.
*/
u16 lock_token; /* Constant 0x8000, used in locks */
u16 paca_index; /* Logical processor number */
u64 kernel_toc; /* Kernel TOC address */
u64 kernelbase; /* Base address of kernel */
u64 kernel_msr; /* MSR while running in kernel */
#ifdef CONFIG_PPC_STD_MMU_64
u64 stab_real; /* Absolute address of segment table */
u64 stab_addr; /* Virtual address of segment table */
#endif /* CONFIG_PPC_STD_MMU_64 */
void *emergency_sp; /* pointer to emergency stack */
u64 data_offset; /* per cpu data offset */
s16 hw_cpu_id; /* Physical processor number */
u8 cpu_start; /* At startup, processor spins until */
/* this becomes non-zero. */
u8 kexec_state; /* set when kexec down has irqs off */
#ifdef CONFIG_PPC_STD_MMU_64
struct slb_shadow *slb_shadow_ptr;
struct dtl_entry *dispatch_log;
struct dtl_entry *dispatch_log_end;
/*
* Now, starting in cacheline 2, the exception save areas
*/
/* used for most interrupts/exceptions */
u64 exgen[11] __attribute__((aligned(0x80)));
u64 exmc[11]; /* used for machine checks */
u64 exslb[11]; /* used for SLB/segment table misses
* on the linear mapping */
/* SLB related definitions */
u16 vmalloc_sllp;
u16 slb_cache_ptr;
u16 slb_cache[SLB_CACHE_ENTRIES];
#endif /* CONFIG_PPC_STD_MMU_64 */
#ifdef CONFIG_PPC_BOOK3E
u64 exgen[8] __attribute__((aligned(0x80)));
/* Keep pgd in the same cacheline as the start of extlb */
pgd_t *pgd __attribute__((aligned(0x80))); /* Current PGD */
pgd_t *kernel_pgd; /* Kernel PGD */
/* We can have up to 3 levels of reentrancy in the TLB miss handler */
u64 extlb[3][EX_TLB_SIZE / sizeof(u64)];
u64 exmc[8]; /* used for machine checks */
u64 excrit[8]; /* used for crit interrupts */
u64 exdbg[8]; /* used for debug interrupts */
/* Kernel stack pointers for use by special exceptions */
void *mc_kstack;
void *crit_kstack;
void *dbg_kstack;
#endif /* CONFIG_PPC_BOOK3E */
mm_context_t context;
/*
* then miscellaneous read-write fields
*/
struct task_struct *__current; /* Pointer to current */
u64 kstack; /* Saved Kernel stack addr */
u64 stab_rr; /* stab/slb round-robin counter */
u64 saved_r1; /* r1 save for RTAS calls or PM */
u64 saved_msr; /* MSR saved here by enter_rtas */
u16 trap_save; /* Used when bad stack is encountered */
u8 soft_enabled; /* irq soft-enable flag */
u8 irq_happened; /* irq happened while soft-disabled */
u8 io_sync; /* writel() needs spin_unlock sync */
u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */
u8 nap_state_lost; /* NV GPR values lost in power7_idle */
#ifdef CONFIG_PPC_POWERNV
/* Pointer to OPAL machine check event structure set by the
* early exception handler for use by high level C handler
*/
struct opal_machine_check_event *opal_mc_evt;
#endif
/* Stuff for accurate time accounting */
u64 user_time; /* accumulated usermode TB ticks */
u64 system_time; /* accumulated system TB ticks */
u64 user_time_scaled; /* accumulated usermode SPURR ticks */
u64 starttime; /* TB value snapshot */
u64 starttime_user; /* TB value on exit to usermode */
u64 startspurr; /* SPURR value snapshot */
u64 utime_sspurr; /* ->user_time when ->startspurr set */
u64 stolen_time; /* TB ticks taken by hypervisor */
u64 dtl_ridx; /* read index in dispatch log */
struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */
#ifdef CONFIG_KVM_BOOK3S_HANDLER
#ifdef CONFIG_KVM_BOOK3S_PR
/* We use this to store guest state in */
struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
#endif
struct kvmppc_host_state kvm_hstate;
#endif
};
extern struct paca_struct *paca;
extern __initdata struct paca_struct boot_paca;
extern void initialise_paca(struct paca_struct *new_paca, int cpu);
extern void setup_paca(struct paca_struct *new_paca);
extern void allocate_pacas(void);
extern void free_unused_pacas(void);
#else /* CONFIG_PPC64 */
static inline void allocate_pacas(void) { };
static inline void free_unused_pacas(void) { };
#endif /* CONFIG_PPC64 */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_PACA_H */