6f1a4891a5
Evan tracked down a subtle race between the update of the MSI message and the device raising an interrupt internally on PCI devices which do not support MSI masking. The update of the MSI message is non-atomic and consists of either 2 or 3 sequential 32bit wide writes to the PCI config space. - Write address low 32bits - Write address high 32bits (If supported by device) - Write data When an interrupt is migrated then both address and data might change, so the kernel attempts to mask the MSI interrupt first. But for MSI masking is optional, so there exist devices which do not provide it. That means that if the device raises an interrupt internally between the writes then a MSI message is sent built from half updated state. On x86 this can lead to spurious interrupts on the wrong interrupt vector when the affinity setting changes both address and data. As a consequence the device interrupt can be lost causing the device to become stuck or malfunctioning. Evan tried to handle that by disabling MSI accross an MSI message update. That's not feasible because disabling MSI has issues on its own: If MSI is disabled the PCI device is routing an interrupt to the legacy INTx mechanism. The INTx delivery can be disabled, but the disablement is not working on all devices. Some devices lose interrupts when both MSI and INTx delivery are disabled. Another way to solve this would be to enforce the allocation of the same vector on all CPUs in the system for this kind of screwed devices. That could be done, but it would bring back the vector space exhaustion problems which got solved a few years ago. Fortunately the high address (if supported by the device) is only relevant when X2APIC is enabled which implies interrupt remapping. In the interrupt remapping case the affinity setting is happening at the interrupt remapping unit and the PCI MSI message is programmed only once when the PCI device is initialized. That makes it possible to solve it with a two step update: 1) Target the MSI msg to the new vector on the current target CPU 2) Target the MSI msg to the new vector on the new target CPU In both cases writing the MSI message is only changing a single 32bit word which prevents the issue of inconsistency. After writing the final destination it is necessary to check whether the device issued an interrupt while the intermediate state #1 (new vector, current CPU) was in effect. This is possible because the affinity change is always happening on the current target CPU. The code runs with interrupts disabled, so the interrupt can be detected by checking the IRR of the local APIC. If the vector is pending in the IRR then the interrupt is retriggered on the new target CPU by sending an IPI for the associated vector on the target CPU. This can cause spurious interrupts on both the local and the new target CPU. 1) If the new vector is not in use on the local CPU and the device affected by the affinity change raised an interrupt during the transitional state (step #1 above) then interrupt entry code will ignore that spurious interrupt. The vector is marked so that the 'No irq handler for vector' warning is supressed once. 2) If the new vector is in use already on the local CPU then the IRR check might see an pending interrupt from the device which is using this vector. The IPI to the new target CPU will then invoke the handler of the device, which got the affinity change, even if that device did not issue an interrupt 3) If the new vector is in use already on the local CPU and the device affected by the affinity change raised an interrupt during the transitional state (step #1 above) then the handler of the device which uses that vector on the local CPU will be invoked. expose issues in device driver interrupt handlers which are not prepared to handle a spurious interrupt correctly. This not a regression, it's just exposing something which was already broken as spurious interrupts can happen for a lot of reasons and all driver handlers need to be able to deal with them. Reported-by: Evan Green <evgreen@chromium.org> Debugged-by: Evan Green <evgreen@chromium.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Evan Green <evgreen@chromium.org> Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87imkr4s7n.fsf@nanos.tec.linutronix.de
275 lines
7.3 KiB
C
275 lines
7.3 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
// Copyright 2017 Thomas Gleixner <tglx@linutronix.de>
|
|
|
|
#include <linux/irqdomain.h>
|
|
#include <linux/irq.h>
|
|
#include <linux/uaccess.h>
|
|
|
|
#include "internals.h"
|
|
|
|
static struct dentry *irq_dir;
|
|
|
|
struct irq_bit_descr {
|
|
unsigned int mask;
|
|
char *name;
|
|
};
|
|
#define BIT_MASK_DESCR(m) { .mask = m, .name = #m }
|
|
|
|
static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
|
|
const struct irq_bit_descr *sd, int size)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < size; i++, sd++) {
|
|
if (state & sd->mask)
|
|
seq_printf(m, "%*s%s\n", ind + 12, "", sd->name);
|
|
}
|
|
}
|
|
|
|
#ifdef CONFIG_SMP
|
|
static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
|
|
{
|
|
struct irq_data *data = irq_desc_get_irq_data(desc);
|
|
struct cpumask *msk;
|
|
|
|
msk = irq_data_get_affinity_mask(data);
|
|
seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
|
|
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
|
|
msk = irq_data_get_effective_affinity_mask(data);
|
|
seq_printf(m, "effectiv: %*pbl\n", cpumask_pr_args(msk));
|
|
#endif
|
|
#ifdef CONFIG_GENERIC_PENDING_IRQ
|
|
msk = desc->pending_mask;
|
|
seq_printf(m, "pending: %*pbl\n", cpumask_pr_args(msk));
|
|
#endif
|
|
}
|
|
#else
|
|
static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) { }
|
|
#endif
|
|
|
|
static const struct irq_bit_descr irqchip_flags[] = {
|
|
BIT_MASK_DESCR(IRQCHIP_SET_TYPE_MASKED),
|
|
BIT_MASK_DESCR(IRQCHIP_EOI_IF_HANDLED),
|
|
BIT_MASK_DESCR(IRQCHIP_MASK_ON_SUSPEND),
|
|
BIT_MASK_DESCR(IRQCHIP_ONOFFLINE_ENABLED),
|
|
BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE),
|
|
BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),
|
|
BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
|
|
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
|
|
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
|
|
};
|
|
|
|
static void
|
|
irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
|
|
{
|
|
struct irq_chip *chip = data->chip;
|
|
|
|
if (!chip) {
|
|
seq_printf(m, "chip: None\n");
|
|
return;
|
|
}
|
|
seq_printf(m, "%*schip: %s\n", ind, "", chip->name);
|
|
seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
|
|
irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
|
|
ARRAY_SIZE(irqchip_flags));
|
|
}
|
|
|
|
static void
|
|
irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind)
|
|
{
|
|
seq_printf(m, "%*sdomain: %s\n", ind, "",
|
|
data->domain ? data->domain->name : "");
|
|
seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq);
|
|
irq_debug_show_chip(m, data, ind + 1);
|
|
if (data->domain && data->domain->ops && data->domain->ops->debug_show)
|
|
data->domain->ops->debug_show(m, NULL, data, ind + 1);
|
|
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
|
|
if (!data->parent_data)
|
|
return;
|
|
seq_printf(m, "%*sparent:\n", ind + 1, "");
|
|
irq_debug_show_data(m, data->parent_data, ind + 4);
|
|
#endif
|
|
}
|
|
|
|
static const struct irq_bit_descr irqdata_states[] = {
|
|
BIT_MASK_DESCR(IRQ_TYPE_EDGE_RISING),
|
|
BIT_MASK_DESCR(IRQ_TYPE_EDGE_FALLING),
|
|
BIT_MASK_DESCR(IRQ_TYPE_LEVEL_HIGH),
|
|
BIT_MASK_DESCR(IRQ_TYPE_LEVEL_LOW),
|
|
BIT_MASK_DESCR(IRQD_LEVEL),
|
|
|
|
BIT_MASK_DESCR(IRQD_ACTIVATED),
|
|
BIT_MASK_DESCR(IRQD_IRQ_STARTED),
|
|
BIT_MASK_DESCR(IRQD_IRQ_DISABLED),
|
|
BIT_MASK_DESCR(IRQD_IRQ_MASKED),
|
|
BIT_MASK_DESCR(IRQD_IRQ_INPROGRESS),
|
|
|
|
BIT_MASK_DESCR(IRQD_PER_CPU),
|
|
BIT_MASK_DESCR(IRQD_NO_BALANCING),
|
|
|
|
BIT_MASK_DESCR(IRQD_SINGLE_TARGET),
|
|
BIT_MASK_DESCR(IRQD_MOVE_PCNTXT),
|
|
BIT_MASK_DESCR(IRQD_AFFINITY_SET),
|
|
BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
|
|
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
|
|
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
|
|
BIT_MASK_DESCR(IRQD_CAN_RESERVE),
|
|
BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
|
|
|
|
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
|
|
|
|
BIT_MASK_DESCR(IRQD_WAKEUP_STATE),
|
|
BIT_MASK_DESCR(IRQD_WAKEUP_ARMED),
|
|
};
|
|
|
|
static const struct irq_bit_descr irqdesc_states[] = {
|
|
BIT_MASK_DESCR(_IRQ_NOPROBE),
|
|
BIT_MASK_DESCR(_IRQ_NOREQUEST),
|
|
BIT_MASK_DESCR(_IRQ_NOTHREAD),
|
|
BIT_MASK_DESCR(_IRQ_NOAUTOEN),
|
|
BIT_MASK_DESCR(_IRQ_NESTED_THREAD),
|
|
BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID),
|
|
BIT_MASK_DESCR(_IRQ_IS_POLLED),
|
|
BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
|
|
};
|
|
|
|
static const struct irq_bit_descr irqdesc_istates[] = {
|
|
BIT_MASK_DESCR(IRQS_AUTODETECT),
|
|
BIT_MASK_DESCR(IRQS_SPURIOUS_DISABLED),
|
|
BIT_MASK_DESCR(IRQS_POLL_INPROGRESS),
|
|
BIT_MASK_DESCR(IRQS_ONESHOT),
|
|
BIT_MASK_DESCR(IRQS_REPLAY),
|
|
BIT_MASK_DESCR(IRQS_WAITING),
|
|
BIT_MASK_DESCR(IRQS_PENDING),
|
|
BIT_MASK_DESCR(IRQS_SUSPENDED),
|
|
BIT_MASK_DESCR(IRQS_NMI),
|
|
};
|
|
|
|
|
|
static int irq_debug_show(struct seq_file *m, void *p)
|
|
{
|
|
struct irq_desc *desc = m->private;
|
|
struct irq_data *data;
|
|
|
|
raw_spin_lock_irq(&desc->lock);
|
|
data = irq_desc_get_irq_data(desc);
|
|
seq_printf(m, "handler: %ps\n", desc->handle_irq);
|
|
seq_printf(m, "device: %s\n", desc->dev_name);
|
|
seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
|
|
irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
|
|
ARRAY_SIZE(irqdesc_states));
|
|
seq_printf(m, "istate: 0x%08x\n", desc->istate);
|
|
irq_debug_show_bits(m, 0, desc->istate, irqdesc_istates,
|
|
ARRAY_SIZE(irqdesc_istates));
|
|
seq_printf(m, "ddepth: %u\n", desc->depth);
|
|
seq_printf(m, "wdepth: %u\n", desc->wake_depth);
|
|
seq_printf(m, "dstate: 0x%08x\n", irqd_get(data));
|
|
irq_debug_show_bits(m, 0, irqd_get(data), irqdata_states,
|
|
ARRAY_SIZE(irqdata_states));
|
|
seq_printf(m, "node: %d\n", irq_data_get_node(data));
|
|
irq_debug_show_masks(m, desc);
|
|
irq_debug_show_data(m, data, 0);
|
|
raw_spin_unlock_irq(&desc->lock);
|
|
return 0;
|
|
}
|
|
|
|
static int irq_debug_open(struct inode *inode, struct file *file)
|
|
{
|
|
return single_open(file, irq_debug_show, inode->i_private);
|
|
}
|
|
|
|
static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
|
|
size_t count, loff_t *ppos)
|
|
{
|
|
struct irq_desc *desc = file_inode(file)->i_private;
|
|
char buf[8] = { 0, };
|
|
size_t size;
|
|
|
|
size = min(sizeof(buf) - 1, count);
|
|
if (copy_from_user(buf, user_buf, size))
|
|
return -EFAULT;
|
|
|
|
if (!strncmp(buf, "trigger", size)) {
|
|
unsigned long flags;
|
|
int err;
|
|
|
|
/* Try the HW interface first */
|
|
err = irq_set_irqchip_state(irq_desc_get_irq(desc),
|
|
IRQCHIP_STATE_PENDING, true);
|
|
if (!err)
|
|
return count;
|
|
|
|
/*
|
|
* Otherwise, try to inject via the resend interface,
|
|
* which may or may not succeed.
|
|
*/
|
|
chip_bus_lock(desc);
|
|
raw_spin_lock_irqsave(&desc->lock, flags);
|
|
|
|
if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) {
|
|
/* Can't do level nor NMIs, sorry */
|
|
err = -EINVAL;
|
|
} else {
|
|
desc->istate |= IRQS_PENDING;
|
|
check_irq_resend(desc);
|
|
err = 0;
|
|
}
|
|
|
|
raw_spin_unlock_irqrestore(&desc->lock, flags);
|
|
chip_bus_sync_unlock(desc);
|
|
|
|
return err ? err : count;
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
static const struct file_operations dfs_irq_ops = {
|
|
.open = irq_debug_open,
|
|
.write = irq_debug_write,
|
|
.read = seq_read,
|
|
.llseek = seq_lseek,
|
|
.release = single_release,
|
|
};
|
|
|
|
void irq_debugfs_copy_devname(int irq, struct device *dev)
|
|
{
|
|
struct irq_desc *desc = irq_to_desc(irq);
|
|
const char *name = dev_name(dev);
|
|
|
|
if (name)
|
|
desc->dev_name = kstrdup(name, GFP_KERNEL);
|
|
}
|
|
|
|
void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
|
|
{
|
|
char name [10];
|
|
|
|
if (!irq_dir || !desc || desc->debugfs_file)
|
|
return;
|
|
|
|
sprintf(name, "%d", irq);
|
|
desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc,
|
|
&dfs_irq_ops);
|
|
}
|
|
|
|
static int __init irq_debugfs_init(void)
|
|
{
|
|
struct dentry *root_dir;
|
|
int irq;
|
|
|
|
root_dir = debugfs_create_dir("irq", NULL);
|
|
|
|
irq_domain_debugfs_init(root_dir);
|
|
|
|
irq_dir = debugfs_create_dir("irqs", root_dir);
|
|
|
|
irq_lock_sparse();
|
|
for_each_active_irq(irq)
|
|
irq_add_debugfs_entry(irq, irq_to_desc(irq));
|
|
irq_unlock_sparse();
|
|
|
|
return 0;
|
|
}
|
|
__initcall(irq_debugfs_init);
|