9fb1b36ca1
We have been observing hangs, both of KVM guest vcpu tasks and more generally, where a process that is woken doesn't properly wake up and continue to run, but instead sticks in TASK_WAKING state. This happens because the update of rq->wake_list in ttwu_queue_remote() is not ordered with the update of ipi_message in smp_muxed_ipi_message_pass(), and the reading of rq->wake_list in scheduler_ipi() is not ordered with the reading of ipi_message in smp_ipi_demux(). Thus it is possible for the IPI receiver not to see the updated rq->wake_list and therefore conclude that there is nothing for it to do. In order to make sure that anything done before smp_send_reschedule() is ordered before anything done in the resulting call to scheduler_ipi(), this adds barriers in smp_muxed_ipi_message_pass() and smp_ipi_demux(). The barrier in smp_muxed_ipi_message_pass() is a full barrier to ensure that there is a full ordering between the smp_send_reschedule() caller and scheduler_ipi(). In smp_ipi_demux(), we use xchg() rather than xchg_local() because xchg() includes release and acquire barriers. Using xchg() rather than xchg_local() makes sense given that ipi_message is not just accessed locally. This moves the barrier between setting the message and calling the cause_ipi() function into the individual cause_ipi implementations. Most of them -- those that used outb, out_8 or similar -- already had a full barrier because out_8 etc. include a sync before the MMIO store. This adds an explicit barrier in the two remaining cases. These changes made no measurable difference to the speed of IPIs as measured using a simple ping-pong latency test across two CPUs on different cores of a POWER7 machine. The analysis of the reason why processes were not waking up properly is due to Milton Miller. Cc: stable@vger.kernel.org # v3.0+ Reported-by: Milton Miller <miltonm@bga.com> Signed-off-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
186 lines
4.0 KiB
C
186 lines
4.0 KiB
C
/*
|
|
* Copyright 2011 IBM Corporation.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*
|
|
*/
|
|
#include <linux/types.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/irq.h>
|
|
#include <linux/smp.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/init.h>
|
|
#include <linux/cpu.h>
|
|
#include <linux/of.h>
|
|
|
|
#include <asm/smp.h>
|
|
#include <asm/irq.h>
|
|
#include <asm/errno.h>
|
|
#include <asm/xics.h>
|
|
#include <asm/io.h>
|
|
#include <asm/hvcall.h>
|
|
|
|
static inline unsigned int icp_hv_get_xirr(unsigned char cppr)
|
|
{
|
|
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
|
|
long rc;
|
|
unsigned int ret = XICS_IRQ_SPURIOUS;
|
|
|
|
rc = plpar_hcall(H_XIRR, retbuf, cppr);
|
|
if (rc == H_SUCCESS) {
|
|
ret = (unsigned int)retbuf[0];
|
|
} else {
|
|
pr_err("%s: bad return code xirr cppr=0x%x returned %ld\n",
|
|
__func__, cppr, rc);
|
|
WARN_ON_ONCE(1);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Issue the H_CPPR hypercall with @value.
 *
 * A return code other than H_SUCCESS is logged and triggers a one-time
 * warning; nothing is reported back to the caller.
 */
static inline void icp_hv_set_cppr(u8 value)
{
	long rc;

	rc = plpar_hcall_norets(H_CPPR, value);
	if (rc == H_SUCCESS)
		return;

	pr_err("%s: bad return code cppr cppr=0x%x returned %ld\n",
		__func__, value, rc);
	WARN_ON_ONCE(1);
}
|
|
|
|
static inline void icp_hv_set_xirr(unsigned int value)
|
|
{
|
|
long rc = plpar_hcall_norets(H_EOI, value);
|
|
if (rc != H_SUCCESS) {
|
|
pr_err("%s: bad return code eoi xirr=0x%x returned %ld\n",
|
|
__func__, value, rc);
|
|
WARN_ON_ONCE(1);
|
|
icp_hv_set_cppr(value >> 24);
|
|
}
|
|
}
|
|
|
|
/*
 * Issue the H_IPI hypercall for logical CPU @n_cpu with @value.
 *
 * @n_cpu is translated to its hardware id first.  A full memory barrier
 * precedes the hcall so that earlier accesses are ordered before the IPI
 * is sent.  A return code other than H_SUCCESS is logged and triggers a
 * one-time warning.
 */
static inline void icp_hv_set_qirr(int n_cpu, u8 value)
{
	int hw_cpu = get_hard_smp_processor_id(n_cpu);
	long rc;

	/* Make sure all previous accesses are ordered before IPI sending */
	mb();

	rc = plpar_hcall_norets(H_IPI, hw_cpu, value);
	if (rc == H_SUCCESS)
		return;

	pr_err("%s: bad return code qirr cpu=%d hw_cpu=%d mfrr=0x%x "
		"returned %ld\n", __func__, n_cpu, hw_cpu, value, rc);
	WARN_ON_ONCE(1);
}
|
|
|
|
/*
 * End-of-interrupt handler for the interrupt described by @d.
 *
 * Builds an xirr word from the value returned by xics_pop_cppr() (top
 * byte) and the hardware irq number of @d (low bits), then hands it to
 * icp_hv_set_xirr().  iosync() is called before the xirr word is built.
 */
static void icp_hv_eoi(struct irq_data *d)
{
	unsigned int hwirq = (unsigned int)irqd_to_hwirq(d);
	unsigned int xirr;

	iosync();

	xirr = (xics_pop_cppr() << 24) | hwirq;
	icp_hv_set_xirr(xirr);
}
|
|
|
|
/* Tear down ICP state on the calling CPU. */
static void icp_hv_teardown_cpu(void)
{
	/* Clear any pending IPI */
	icp_hv_set_qirr(smp_processor_id(), 0xff);
}
|
|
|
|
static void icp_hv_flush_ipi(void)
|
|
{
|
|
/* We take the ipi irq but and never return so we
|
|
* need to EOI the IPI, but want to leave our priority 0
|
|
*
|
|
* should we check all the other interrupts too?
|
|
* should we be flagging idle loop instead?
|
|
* or creating some task to be scheduled?
|
|
*/
|
|
|
|
icp_hv_set_xirr((0x00 << 24) | XICS_IPI);
|
|
}
|
|
|
|
static unsigned int icp_hv_get_irq(void)
|
|
{
|
|
unsigned int xirr = icp_hv_get_xirr(xics_cppr_top());
|
|
unsigned int vec = xirr & 0x00ffffff;
|
|
unsigned int irq;
|
|
|
|
if (vec == XICS_IRQ_SPURIOUS)
|
|
return NO_IRQ;
|
|
|
|
irq = irq_find_mapping(xics_host, vec);
|
|
if (likely(irq != NO_IRQ)) {
|
|
xics_push_cppr(vec);
|
|
return irq;
|
|
}
|
|
|
|
/* We don't have a linux mapping, so have rtas mask it. */
|
|
xics_mask_unknown_vec(vec);
|
|
|
|
/* We might learn about it later, so EOI it */
|
|
icp_hv_set_xirr(xirr);
|
|
|
|
return NO_IRQ;
|
|
}
|
|
|
|
/*
 * Set the calling CPU's priority to @cppr.
 *
 * The value is recorded with xics_set_base_cppr(), then pushed to the
 * hypervisor through icp_hv_set_cppr(), and finally followed by an
 * iosync().
 */
static void icp_hv_set_cpu_priority(unsigned char cppr)
{
	/* record the new base priority before issuing the hcall */
	xics_set_base_cppr(cppr);

	icp_hv_set_cppr(cppr);
	iosync();
}
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
static void icp_hv_cause_ipi(int cpu, unsigned long data)
|
|
{
|
|
icp_hv_set_qirr(cpu, IPI_PRIORITY);
|
|
}
|
|
|
|
/*
 * Interrupt handler for the IPI.
 *
 * Writes 0xff to the calling CPU's qirr (the same value
 * icp_hv_teardown_cpu() uses to clear a pending IPI) and then returns
 * whatever smp_ipi_demux() returns.  @irq and @dev_id are not used.
 */
static irqreturn_t icp_hv_ipi_action(int irq, void *dev_id)
{
	icp_hv_set_qirr(smp_processor_id(), 0xff);

	return smp_ipi_demux();
}
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
static const struct icp_ops icp_hv_ops = {
|
|
.get_irq = icp_hv_get_irq,
|
|
.eoi = icp_hv_eoi,
|
|
.set_priority = icp_hv_set_cpu_priority,
|
|
.teardown_cpu = icp_hv_teardown_cpu,
|
|
.flush_ipi = icp_hv_flush_ipi,
|
|
#ifdef CONFIG_SMP
|
|
.ipi_action = icp_hv_ipi_action,
|
|
.cause_ipi = icp_hv_cause_ipi,
|
|
#endif
|
|
};
|
|
|
|
int icp_hv_init(void)
|
|
{
|
|
struct device_node *np;
|
|
|
|
np = of_find_compatible_node(NULL, NULL, "ibm,ppc-xicp");
|
|
if (!np)
|
|
np = of_find_node_by_type(NULL,
|
|
"PowerPC-External-Interrupt-Presentation");
|
|
if (!np)
|
|
return -ENODEV;
|
|
|
|
icp_ops = &icp_hv_ops;
|
|
|
|
return 0;
|
|
}
|
|
|