25a068b8e9
Jan Kiszka reported that the x2apic_wrmsr_fence() function uses a plain MFENCE while the Intel SDM (10.12.3 MSR Access in x2APIC Mode) calls for MFENCE; LFENCE. Short summary: we have special MSRs that have weaker ordering than all the rest. Add fencing consistent with current SDM recommendations. This is not known to cause any issues in practice, only in theory. Longer story below: The reason the kernel uses a different semantic is that the SDM changed (roughly in late 2017). The SDM changed because folks at Intel were auditing all of the recommended fences in the SDM and realized that the x2apic fences were insufficient. Why was the plain MFENCE judged insufficient? WRMSR itself is normally a serializing instruction. No fences are needed because the instruction itself serializes everything. But, there are explicit exceptions for this serializing behavior written into the WRMSR instruction documentation for two classes of MSRs: IA32_TSC_DEADLINE and the X2APIC MSRs. Back to x2apic: WRMSR is *not* serializing in this specific case. But why is MFENCE insufficient? MFENCE makes writes visible, but only affects load/store instructions. WRMSR is unfortunately not a load/store instruction and is unaffected by MFENCE. This means that a non-serializing WRMSR could be reordered by the CPU to execute before the writes made visible by the MFENCE have even occurred in the first place. This means that an x2apic IPI could theoretically be triggered before there is any (visible) data to process. Does this affect anything in practice? I honestly don't know. It seems quite possible that by the time an interrupt gets to consume the (not yet) MFENCE'd data, it has become visible, mostly by accident. To be safe, add the SDM-recommended fences for all x2apic WRMSRs. This also leaves open the question of the _other_ weakly-ordered WRMSR: MSR_IA32_TSC_DEADLINE. While it has the same ordering architecture as the x2APIC MSRs, it seems substantially less likely to be a problem in practice. 
While writes to the in-memory Local Vector Table (LVT) might theoretically be reordered with respect to a weakly-ordered WRMSR like TSC_DEADLINE, the SDM has this to say: In x2APIC mode, the WRMSR instruction is used to write to the LVT entry. The processor ensures the ordering of this write and any subsequent WRMSR to the deadline; no fencing is required. But, that might still leave xAPIC exposed. The safest thing to do for now is to add the extra, recommended LFENCE. [ bp: Massage commit message, fix typos, drop accidentally added newline to tools/arch/x86/include/asm/barrier.h. ] Reported-by: Jan Kiszka <jan.kiszka@siemens.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Signed-off-by: Borislav Petkov <bp@suse.de> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Thomas Gleixner <tglx@linutronix.de> Cc: <stable@vger.kernel.org> Link: https://lkml.kernel.org/r/20200305174708.F77040DD@viggo.jf.intel.com
227 lines
5.7 KiB
C
227 lines
5.7 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/cpuhotplug.h>
|
|
#include <linux/cpumask.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/mm.h>
|
|
|
|
#include <asm/apic.h>
|
|
|
|
#include "local.h"
|
|
|
|
struct cluster_mask {
|
|
unsigned int clusterid;
|
|
int node;
|
|
struct cpumask mask;
|
|
};
|
|
|
|
static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
|
|
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
|
|
static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
|
|
static struct cluster_mask *cluster_hotplug_mask;
|
|
|
|
/*
 * ->acpi_madt_oem_check() callback. Both OEM strings are ignored; the
 * result is simply whatever x2apic_enabled() reports.
 */
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
	int enabled = x2apic_enabled();

	return enabled;
}
|
|
|
|
static void x2apic_send_IPI(int cpu, int vector)
|
|
{
|
|
u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
|
|
|
|
/* x2apic MSRs are special and need a special fence: */
|
|
weak_wrmsr_fence();
|
|
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
|
|
}
|
|
|
|
static void
|
|
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
|
|
{
|
|
unsigned int cpu, clustercpu;
|
|
struct cpumask *tmpmsk;
|
|
unsigned long flags;
|
|
u32 dest;
|
|
|
|
/* x2apic MSRs are special and need a special fence: */
|
|
weak_wrmsr_fence();
|
|
local_irq_save(flags);
|
|
|
|
tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
|
|
cpumask_copy(tmpmsk, mask);
|
|
/* If IPI should not be sent to self, clear current CPU */
|
|
if (apic_dest != APIC_DEST_ALLINC)
|
|
__cpumask_clear_cpu(smp_processor_id(), tmpmsk);
|
|
|
|
/* Collapse cpus in a cluster so a single IPI per cluster is sent */
|
|
for_each_cpu(cpu, tmpmsk) {
|
|
struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu);
|
|
|
|
dest = 0;
|
|
for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
|
|
dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
|
|
|
|
if (!dest)
|
|
continue;
|
|
|
|
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
|
|
/* Remove cluster CPUs from tmpmask */
|
|
cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
|
|
}
|
|
|
|
local_irq_restore(flags);
|
|
}
|
|
|
|
static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
|
|
{
|
|
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
|
|
}
|
|
|
|
static void
|
|
x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
|
|
{
|
|
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
|
|
}
|
|
|
|
static void x2apic_send_IPI_allbutself(int vector)
|
|
{
|
|
__x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
|
|
}
|
|
|
|
static void x2apic_send_IPI_all(int vector)
|
|
{
|
|
__x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
|
|
}
|
|
|
|
static u32 x2apic_calc_apicid(unsigned int cpu)
|
|
{
|
|
return per_cpu(x86_cpu_to_logical_apicid, cpu);
|
|
}
|
|
|
|
static void init_x2apic_ldr(void)
|
|
{
|
|
struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
|
|
u32 cluster, apicid = apic_read(APIC_LDR);
|
|
unsigned int cpu;
|
|
|
|
this_cpu_write(x86_cpu_to_logical_apicid, apicid);
|
|
|
|
if (cmsk)
|
|
goto update;
|
|
|
|
cluster = apicid >> 16;
|
|
for_each_online_cpu(cpu) {
|
|
cmsk = per_cpu(cluster_masks, cpu);
|
|
/* Matching cluster found. Link and update it. */
|
|
if (cmsk && cmsk->clusterid == cluster)
|
|
goto update;
|
|
}
|
|
cmsk = cluster_hotplug_mask;
|
|
cmsk->clusterid = cluster;
|
|
cluster_hotplug_mask = NULL;
|
|
update:
|
|
this_cpu_write(cluster_masks, cmsk);
|
|
cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
|
|
}
|
|
|
|
static int alloc_clustermask(unsigned int cpu, int node)
|
|
{
|
|
if (per_cpu(cluster_masks, cpu))
|
|
return 0;
|
|
/*
|
|
* If a hotplug spare mask exists, check whether it's on the right
|
|
* node. If not, free it and allocate a new one.
|
|
*/
|
|
if (cluster_hotplug_mask) {
|
|
if (cluster_hotplug_mask->node == node)
|
|
return 0;
|
|
kfree(cluster_hotplug_mask);
|
|
}
|
|
|
|
cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
|
|
GFP_KERNEL, node);
|
|
if (!cluster_hotplug_mask)
|
|
return -ENOMEM;
|
|
cluster_hotplug_mask->node = node;
|
|
return 0;
|
|
}
|
|
|
|
static int x2apic_prepare_cpu(unsigned int cpu)
|
|
{
|
|
if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
|
|
return -ENOMEM;
|
|
if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
|
|
return -ENOMEM;
|
|
return 0;
|
|
}
|
|
|
|
static int x2apic_dead_cpu(unsigned int dead_cpu)
|
|
{
|
|
struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
|
|
|
|
if (cmsk)
|
|
cpumask_clear_cpu(dead_cpu, &cmsk->mask);
|
|
free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
|
|
return 0;
|
|
}
|
|
|
|
static int x2apic_cluster_probe(void)
|
|
{
|
|
if (!x2apic_mode)
|
|
return 0;
|
|
|
|
if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
|
|
x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
|
|
pr_err("Failed to register X2APIC_PREPARE\n");
|
|
return 0;
|
|
}
|
|
init_x2apic_ldr();
|
|
return 1;
|
|
}
|
|
|
|
/*
 * APIC driver description for x2apic in cluster (logical destination)
 * mode. Callbacks that are not defined in this file are expected to come
 * from the headers included above.
 */
static struct apic apic_x2apic_cluster __ro_after_init = {

	/* Identification and probing */
	.name				= "cluster x2apic",
	.probe				= x2apic_cluster_probe,
	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
	.apic_id_valid			= x2apic_apic_id_valid,
	.apic_id_registered		= x2apic_apic_id_registered,

	/* Interrupt delivery: fixed mode, logical destinations */
	.delivery_mode			= APIC_DELIVERY_MODE_FIXED,
	.dest_mode_logical		= true,

	.disable_esr			= 0,

	/* APIC ID handling and routing setup */
	.check_apicid_used		= NULL,
	.init_apic_ldr			= init_x2apic_ldr,
	.ioapic_phys_id_map		= NULL,
	.setup_apic_routing		= NULL,
	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
	.apicid_to_cpu_present		= NULL,
	.check_phys_apicid_present	= default_check_phys_apicid_present,
	.phys_pkg_id			= x2apic_phys_pkg_id,

	.get_apic_id			= x2apic_get_apic_id,
	.set_apic_id			= x2apic_set_apic_id,

	.calc_dest_apicid		= x2apic_calc_apicid,

	/* IPI senders */
	.send_IPI			= x2apic_send_IPI,
	.send_IPI_mask			= x2apic_send_IPI_mask,
	.send_IPI_mask_allbutself	= x2apic_send_IPI_mask_allbutself,
	.send_IPI_allbutself		= x2apic_send_IPI_allbutself,
	.send_IPI_all			= x2apic_send_IPI_all,
	.send_IPI_self			= x2apic_send_IPI_self,

	.inquire_remote_apic		= NULL,

	/* Register accessors (MSR based) */
	.read				= native_apic_msr_read,
	.write				= native_apic_msr_write,
	.eoi_write			= native_apic_msr_eoi_write,
	.icr_read			= native_x2apic_icr_read,
	.icr_write			= native_x2apic_icr_write,
	.wait_icr_idle			= native_x2apic_wait_icr_idle,
	.safe_wait_icr_idle		= native_safe_x2apic_wait_icr_idle,
};

/* Register the driver via the apic_driver() macro. */
apic_driver(apic_x2apic_cluster);
|