7230c56441
The current implementation of lazy interrupts handling has some issues that this tries to address. We don't do the various workarounds we need to do when re-enabling interrupts in some cases such as when returning from an interrupt and thus we may still lose or get delayed decrementer or doorbell interrupts. The current scheme also makes it much harder to handle the external "edge" interrupts provided by some BookE processors when using the EPR facility (External Proxy) and the Freescale Hypervisor. Additionally, we tend to keep interrupts hard disabled in a number of cases, such as decrementer interrupts, external interrupts, or when a masked decrementer interrupt is pending. This is sub-optimal. This is an attempt at fixing it all in one go by reworking the way we do the lazy interrupt disabling from the ground up. The base idea is to replace the "hard_enabled" field with a "irq_happened" field in which we store a bit mask of what interrupt occurred while soft-disabled. When re-enabling, either via arch_local_irq_restore() or when returning from an interrupt, we can now decide what to do by testing bits in that field. We then implement replaying of the missed interrupts either by re-using the existing exception frame (in exception exit case) or via the creation of a new one from an assembly trampoline (in the arch_local_irq_enable case). This removes the need to play with the decrementer to try to create fake interrupts, among others. In addition, this adds a few refinements: - We no longer hard disable decrementer interrupts that occur while soft-disabled. We now simply bump the decrementer back to max (on BookS) or leave it stopped (on BookE) and continue with hard interrupts enabled, which means that we'll potentially get better sample quality from performance monitor interrupts. - Timer, decrementer and doorbell interrupts now hard-enable shortly after removing the source of the interrupt, which means they no longer run entirely hard disabled. 
Again, this will improve perf sample quality. - On Book3E 64-bit, we now make the performance monitor interrupt act as an NMI like Book3S (the necessary C code for that to work appears to already be present in the FSL perf code, notably calling nmi_enter instead of irq_enter). (This also fixes a bug where BookE perfmon interrupts could clobber r14 ... oops) - We could make "masked" decrementer interrupts act as NMIs when doing timer-based perf sampling to improve the sample quality. Signed-off-by-yet: Benjamin Herrenschmidt <benh@kernel.crashing.org> --- v2: - Add hard-enable to decrementer, timer and doorbells - Fix CR clobber in masked irq handling on BookE - Make embedded perf interrupt act as an NMI - Add a PACA_HAPPENED_EE_EDGE for use by FSL if they want to retrigger an interrupt without preventing hard-enable v3: - Fix or vs. ori bug on Book3E - Fix enabling of interrupts for some exceptions on Book3E v4: - Fix resend of doorbells on return from interrupt on Book3E v5: - Rebased on top of my latest series, which involves some significant rework of some aspects of the patch. v6: - 32-bit compile fix - more compile fixes with various .config combos - factor out the asm code to soft-disable interrupts - remove the C wrapper around preempt_schedule_irq v7: - Fix a bug with hard irq state tracking on native power7
173 lines
3.8 KiB
C
173 lines
3.8 KiB
C
/*
|
|
* Idle daemon for PowerPC. Idle daemon will handle any action
|
|
* that needs to be taken when the system becomes idle.
|
|
*
|
|
* Originally written by Cort Dougan (cort@cs.nmt.edu).
|
|
* Subsequent 32-bit hacking by Tom Rini, Armin Kuster,
|
|
* Paul Mackerras and others.
|
|
*
|
|
* iSeries support added by Mike Corrigan <mikejc@us.ibm.com>
|
|
*
|
|
* Additional shared processor, SMT, and firmware support
|
|
* Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
|
|
*
|
|
* 32-bit and 64-bit versions merged by Paul Mackerras <paulus@samba.org>
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/smp.h>
|
|
#include <linux/cpu.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/tick.h>
|
|
|
|
#include <asm/system.h>
|
|
#include <asm/processor.h>
|
|
#include <asm/cputable.h>
|
|
#include <asm/time.h>
|
|
#include <asm/machdep.h>
|
|
#include <asm/smp.h>
|
|
|
|
/*
 * cpu_should_die() - with CPU hotplug configured, evaluates cpu_is_offline()
 * for the CPU reported by smp_processor_id(); otherwise it is the constant 0.
 */
#if defined(CONFIG_HOTPLUG_CPU)
#define cpu_should_die()	cpu_is_offline(smp_processor_id())
#else /* !CONFIG_HOTPLUG_CPU */
#define cpu_should_die()	0
#endif /* CONFIG_HOTPLUG_CPU */
|
|
|
|
/*
 * Idle override state. Starts out as IDLE_NO_OVERRIDE; the "powersave=off"
 * boot parameter handler in this file switches it to IDLE_POWERSAVE_OFF.
 * Exported via EXPORT_SYMBOL().
 */
unsigned long cpuidle_disable = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(cpuidle_disable);
|
|
|
|
static int __init powersave_off(char *arg)
|
|
{
|
|
ppc_md.power_save = NULL;
|
|
cpuidle_disable = IDLE_POWERSAVE_OFF;
|
|
return 0;
|
|
}
|
|
__setup("powersave=off", powersave_off);
|
|
|
|
/*
|
|
* The body of the idle task.
|
|
*/
|
|
void cpu_idle(void)
|
|
{
|
|
if (ppc_md.idle_loop)
|
|
ppc_md.idle_loop(); /* doesn't return */
|
|
|
|
set_thread_flag(TIF_POLLING_NRFLAG);
|
|
while (1) {
|
|
tick_nohz_idle_enter();
|
|
rcu_idle_enter();
|
|
|
|
while (!need_resched() && !cpu_should_die()) {
|
|
ppc64_runlatch_off();
|
|
|
|
if (ppc_md.power_save) {
|
|
clear_thread_flag(TIF_POLLING_NRFLAG);
|
|
/*
|
|
* smp_mb is so clearing of TIF_POLLING_NRFLAG
|
|
* is ordered w.r.t. need_resched() test.
|
|
*/
|
|
smp_mb();
|
|
local_irq_disable();
|
|
|
|
/* Don't trace irqs off for idle */
|
|
stop_critical_timings();
|
|
|
|
/* check again after disabling irqs */
|
|
if (!need_resched() && !cpu_should_die())
|
|
ppc_md.power_save();
|
|
|
|
start_critical_timings();
|
|
|
|
/* Some power_save functions return with
|
|
* interrupts enabled, some don't.
|
|
*/
|
|
if (irqs_disabled())
|
|
local_irq_enable();
|
|
set_thread_flag(TIF_POLLING_NRFLAG);
|
|
|
|
} else {
|
|
/*
|
|
* Go into low thread priority and possibly
|
|
* low power mode.
|
|
*/
|
|
HMT_low();
|
|
HMT_very_low();
|
|
}
|
|
}
|
|
|
|
HMT_medium();
|
|
ppc64_runlatch_on();
|
|
rcu_idle_exit();
|
|
tick_nohz_idle_exit();
|
|
preempt_enable_no_resched();
|
|
if (cpu_should_die())
|
|
cpu_die();
|
|
schedule();
|
|
preempt_disable();
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* cpu_idle_wait - Used to ensure that all the CPUs come out of the old
|
|
* idle loop and start using the new idle loop.
|
|
* Required while changing idle handler on SMP systems.
|
|
* Caller must have changed idle handler to the new value before the call.
|
|
* This window may be larger on shared systems.
|
|
*/
|
|
void cpu_idle_wait(void)
|
|
{
|
|
int cpu;
|
|
smp_mb();
|
|
|
|
/* kick all the CPUs so that they exit out of old idle routine */
|
|
get_online_cpus();
|
|
for_each_online_cpu(cpu) {
|
|
if (cpu != smp_processor_id())
|
|
smp_send_reschedule(cpu);
|
|
}
|
|
put_online_cpus();
|
|
}
|
|
EXPORT_SYMBOL_GPL(cpu_idle_wait);
|
|
|
|
int powersave_nap;
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
/*
|
|
* Register the sysctl to set/clear powersave_nap.
|
|
*/
|
|
static ctl_table powersave_nap_ctl_table[]={
|
|
{
|
|
.procname = "powersave-nap",
|
|
.data = &powersave_nap,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{}
|
|
};
|
|
static ctl_table powersave_nap_sysctl_root[] = {
|
|
{
|
|
.procname = "kernel",
|
|
.mode = 0555,
|
|
.child = powersave_nap_ctl_table,
|
|
},
|
|
{}
|
|
};
|
|
|
|
static int __init
|
|
register_powersave_nap_sysctl(void)
|
|
{
|
|
register_sysctl_table(powersave_nap_sysctl_root);
|
|
|
|
return 0;
|
|
}
|
|
__initcall(register_powersave_nap_sysctl);
|
|
#endif
|