ca7e917769
Merge tag 'x86-apic-2024-03-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 APIC updates from Thomas Gleixner:
 "Rework of APIC enumeration and topology evaluation.

  The current implementation has a couple of shortcomings:

   - It fails to handle hybrid systems correctly.

   - The APIC registration code which handles CPU number assignments is
     in the middle of the APIC code and detached from the topology
     evaluation.

   - The various mechanisms which enumerate APICs (ACPI, MPPARSE and
     guest specific ones) tweak global variables as they see fit, or in
     the case of XENPV just hack around the generic mechanisms
     completely.

   - The CPUID topology evaluation code is sprinkled all over the
     vendor code and re-evaluates global variables on every hotplug
     operation.

   - There is no way to analyze topology on the boot CPU before
     bringing up the APs. This causes problems for infrastructure like
     PERF, which needs to size certain aspects upfront, or which could
     be simplified if that were possible.

   - The APIC admission and CPU number association logic is
     incomprehensible and overly complex, and has to be kept around
     after boot instead of being completed right after the APIC
     enumeration.

  This update addresses these shortcomings with the following changes:

   - Rework the CPUID evaluation code so it is common for all vendors
     and provides information about the APIC ID segments in a uniform
     way, independent of the number of segments (Thread, Core, Module,
     ..., Die, Package), so that this information can be computed
     instead of rewriting global variables of dubious value over and
     over. (An illustrative sketch of the segment decomposition follows
     the commit list below.)

   - A few cleanups and simplifications of the APIC, IO/APIC and
     related interfaces to prepare for the topology evaluation changes.

   - Separation of the parser stages, so that the early evaluation
     which tries to find the APIC address can be overridden separately
     from the late evaluation which enumerates and registers the local
     APIC, as further preparation for sanitizing the topology
     evaluation.

   - A new registration and admission logic which:

       - encapsulates the inner workings so that parsers and guest
         logic can no longer fiddle with it

       - uses the APIC ID segments to build topology bitmaps at
         registration time

       - provides a sane admission logic

       - automatically detects the crash kernel case, where CPU0 does
         not run on the real BSP. This is required to prevent sending
         INIT/SIPI sequences to the real BSP, which would reset the
         whole machine. This was so far handled by a tedious command
         line parameter, which does not even work in nested crash
         scenarios.

       - associates CPU numbers after the enumeration has completed and
         prevents the late registration of APICs, which was somehow
         tolerated before.
   - Converting all parsers and guest enumeration mechanisms over to
     the new interfaces. This allows getting rid of all global variable
     tweaking from the parsers and enumeration mechanisms, and
     sanitizes the XEN[PV] handling so it can use CPUID evaluation for
     the first time.

   - Mopping up existing sins by taking the information from the APIC
     ID segment bitmaps. This evaluates hybrid systems correctly on the
     boot CPU and allows for cleanups and fixes in the related drivers,
     e.g. PERF.

  The series has been extensively tested, and the minimal late fallout
  due to a broken ACPI/MADT table has been addressed by tightening the
  admission logic further"

* tag 'x86-apic-2024-03-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (76 commits)
  x86/topology: Ignore non-present APIC IDs in a present package
  x86/apic: Build the x86 topology enumeration functions on UP APIC builds too
  smp: Provide 'setup_max_cpus' definition on UP too
  smp: Avoid 'setup_max_cpus' namespace collision/shadowing
  x86/bugs: Use fixed addressing for VERW operand
  x86/cpu/topology: Get rid of cpuinfo::x86_max_cores
  x86/cpu/topology: Provide __num_[cores|threads]_per_package
  x86/cpu/topology: Rename topology_max_die_per_package()
  x86/cpu/topology: Rename smp_num_siblings
  x86/cpu/topology: Retrieve cores per package from topology bitmaps
  x86/cpu/topology: Use topology logical mapping mechanism
  x86/cpu/topology: Provide logical pkg/die mapping
  x86/cpu/topology: Simplify cpu_mark_primary_thread()
  x86/cpu/topology: Mop up primary thread mask handling
  x86/cpu/topology: Use topology bitmaps for sizing
  x86/cpu/topology: Let XEN/PV use topology from CPUID/MADT
  x86/xen/smp_pv: Count number of vCPUs early
  x86/cpu/topology: Assign hotpluggable CPUIDs during init
  x86/cpu/topology: Reject unknown APIC IDs on ACPI hotplug
  x86/topology: Add a mechanism to track topology via APIC IDs
  ...
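As a rough illustration of the "APIC ID segments" mentioned above: each topology level occupies a contiguous bit range of the APIC ID, with per-level shift positions of the kind CPUID leaves 0xB/0x1F report. The sketch below is a minimal standalone model of that decomposition; the level names, shift table and helper are hypothetical, not the kernel's actual interfaces.

/* Illustrative sketch only: hypothetical shift values, not the kernel's real tables. */
#include <stdio.h>

enum topo_level { TOPO_SMT, TOPO_CORE, TOPO_DIE, TOPO_PKG, TOPO_MAX };

/* Example per-level start bits: 1 SMT bit, 3 core bits, 1 die bit. */
static const unsigned int topo_shift[TOPO_MAX] = { 0, 1, 4, 5 };

/* Extract the APIC ID segment belonging to one topology level. */
static unsigned int topo_segment(unsigned int apicid, enum topo_level lvl)
{
        unsigned int low = topo_shift[lvl];
        unsigned int high = (lvl + 1 < TOPO_MAX) ? topo_shift[lvl + 1] : 32;

        if (high - low >= 32)           /* avoid an undefined full-width shift */
                return apicid >> low;
        return (apicid >> low) & ((1u << (high - low)) - 1);
}

int main(void)
{
        unsigned int apicid = 0x2b;     /* package 1, die 0, core 5, thread 1 */

        printf("thread %u core %u die %u pkg %u\n",
               topo_segment(apicid, TOPO_SMT),
               topo_segment(apicid, TOPO_CORE),
               topo_segment(apicid, TOPO_DIE),
               topo_segment(apicid, TOPO_PKG));
        return 0;
}

Because every level is just a (shift, width) pair, the same code handles any number of segments, which is what lets the reworked evaluation treat hybrid and homogeneous systems uniformly.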
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023, Microsoft Corporation.
 *
 * Author:
 *	Saurabh Sengar <ssengar@microsoft.com>
 */

#include <asm/apic.h>
#include <asm/boot.h>
#include <asm/desc.h>
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/realmode.h>

extern struct boot_params boot_params;
static struct real_mode_header hv_vtl_real_mode_header;

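/*
 * Report MSI extended destination ID support, which allows MSIs to target
 * APIC IDs above 255 without interrupt remapping. (Descriptive comment;
 * rationale inferred from the callback's generic x86 semantics.)
 */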
static bool __init hv_vtl_msi_ext_dest_id(void)
{
        return true;
}

void __init hv_vtl_init_platform(void)
{
        pr_info("Linux runs in Hyper-V Virtual Trust Level\n");

        x86_platform.realmode_reserve = x86_init_noop;
        x86_platform.realmode_init = x86_init_noop;
        x86_init.irqs.pre_vector_init = x86_init_noop;
        x86_init.timers.timer_init = x86_init_noop;

        /* Avoid searching for BIOS MP tables */
        x86_init.mpparse.find_mptable = x86_init_noop;
        x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
        x86_init.mpparse.parse_smp_cfg = x86_init_noop;

        x86_platform.get_wallclock = get_rtc_noop;
        x86_platform.set_wallclock = set_rtc_noop;
        x86_platform.get_nmi_reason = hv_get_nmi_reason;

        x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
        x86_platform.legacy.rtc = 0;
        x86_platform.legacy.warm_reset = 0;
        x86_platform.legacy.reserve_bios_regions = 0;
        x86_platform.legacy.devices.pnpbios = 0;

        x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id;
}

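/*
 * Helpers to reassemble the 64-bit base address and 20-bit limit from the
 * scattered fields of a 16-byte LDT/TSS system descriptor.
 */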
static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc)
{
        return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) |
                (desc->base1 << 16) | desc->base0;
}

static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc)
{
        return ((u32)desc->limit1 << 16) | (u32)desc->limit0;
}

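/*
 * 64-bit entry point for APs started via the hypervisor: jump straight to
 * the kernel's common secondary startup code. The typedef cast lets C code
 * call the asm entry point; both arguments point at boot_params.
 */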
typedef void (*secondary_startup_64_fn)(void*, void*);
static void hv_vtl_ap_entry(void)
{
        ((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
}

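/*
 * Bring up a VP: snapshot the boot CPU's GDT/IDT, control registers and
 * segment state into a struct hv_enable_vp_vtl context, then issue
 * HVCALL_ENABLE_VP_VTL followed by HVCALL_START_VP so the AP starts
 * directly at hv_vtl_ap_entry() in 64-bit mode.
 */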
static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
{
        u64 status;
        int ret = 0;
        struct hv_enable_vp_vtl *input;
        unsigned long irq_flags;

        struct desc_ptr gdt_ptr;
        struct desc_ptr idt_ptr;

        struct ldttss_desc *tss;
        struct ldttss_desc *ldt;
        struct desc_struct *gdt;

        u64 rsp = current->thread.sp;
        u64 rip = (u64)&hv_vtl_ap_entry;

        native_store_gdt(&gdt_ptr);
        store_idt(&idt_ptr);

        gdt = (struct desc_struct *)((void *)(gdt_ptr.address));
        tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS);
        ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT);

        local_irq_save(irq_flags);

        input = *this_cpu_ptr(hyperv_pcpu_input_arg);
        memset(input, 0, sizeof(*input));

        input->partition_id = HV_PARTITION_ID_SELF;
        input->vp_index = target_vp_index;
        input->target_vtl.target_vtl = HV_VTL_MGMT;

        /*
         * The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit
         * mode transition sequence after waking up an AP with SIPI whose
         * vector points to the 16-bit AP startup trampoline code. Here in
         * VTL2, we can't perform that sequence as the AP has to start in
         * 64-bit mode.
         *
         * To make this happen, we tell the hypervisor to load a valid 64-bit
         * context (most of which is just magic numbers from the CPU manual)
         * so that the AP jumps right to the 64-bit entry of the kernel, and
         * the control registers are loaded with values that let the AP fetch
         * the code and data and carry on with the work it gets assigned.
         */

        input->vp_context.rip = rip;
        input->vp_context.rsp = rsp;
        input->vp_context.rflags = 0x0000000000000002;
        input->vp_context.efer = __rdmsr(MSR_EFER);
        input->vp_context.cr0 = native_read_cr0();
        input->vp_context.cr3 = __native_read_cr3();
        input->vp_context.cr4 = native_read_cr4();
        input->vp_context.msr_cr_pat = __rdmsr(MSR_IA32_CR_PAT);
        input->vp_context.idtr.limit = idt_ptr.size;
        input->vp_context.idtr.base = idt_ptr.address;
        input->vp_context.gdtr.limit = gdt_ptr.size;
        input->vp_context.gdtr.base = gdt_ptr.address;

        /* Non-system desc (64bit), long, code, present */
        input->vp_context.cs.selector = __KERNEL_CS;
        input->vp_context.cs.base = 0;
        input->vp_context.cs.limit = 0xffffffff;
        input->vp_context.cs.attributes = 0xa09b;
        /* Non-system desc (64bit), data, present, granularity, default */
        input->vp_context.ss.selector = __KERNEL_DS;
        input->vp_context.ss.base = 0;
        input->vp_context.ss.limit = 0xffffffff;
        input->vp_context.ss.attributes = 0xc093;

        /* System desc (128bit), present, LDT */
        input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8;
        input->vp_context.ldtr.base = hv_vtl_system_desc_base(ldt);
        input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(ldt);
        input->vp_context.ldtr.attributes = 0x82;

        /* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */
        input->vp_context.tr.selector = GDT_ENTRY_TSS * 8;
        input->vp_context.tr.base = hv_vtl_system_desc_base(tss);
        input->vp_context.tr.limit = hv_vtl_system_desc_limit(tss);
        input->vp_context.tr.attributes = 0x8b;

        status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, input, NULL);

        if (!hv_result_success(status) &&
            hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) {
                pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %d ! [Err: %#llx]\n",
                       target_vp_index, status);
                ret = -EINVAL;
                goto free_lock;
        }

        status = hv_do_hypercall(HVCALL_START_VP, input, NULL);

        if (!hv_result_success(status)) {
                pr_err("HVCALL_START_VP failed for VP : %d ! [Err: %#llx]\n",
                       target_vp_index, status);
                ret = -EINVAL;
        }

free_lock:
        local_irq_restore(irq_flags);

        return ret;
}

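/*
 * Translate an APIC ID to a Hyper-V VP index using a single-element
 * HVCALL_GET_VP_ID_FROM_APIC_ID rep hypercall. Note that the output buffer
 * aliases the per-CPU input page; the result is read back from output[0].
 */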
static int hv_vtl_apicid_to_vp_id(u32 apic_id)
{
        u64 control;
        u64 status;
        unsigned long irq_flags;
        struct hv_get_vp_from_apic_id_in *input;
        u32 *output, ret;

        local_irq_save(irq_flags);

        input = *this_cpu_ptr(hyperv_pcpu_input_arg);
        memset(input, 0, sizeof(*input));
        input->partition_id = HV_PARTITION_ID_SELF;
        input->apic_ids[0] = apic_id;

        output = (u32 *)input;

        control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
        status = hv_do_hypercall(control, input, output);
        ret = output[0];

        local_irq_restore(irq_flags);

        if (!hv_result_success(status)) {
                pr_err("failed to get vp id from apic id %d, status %#llx\n",
                       apic_id, status);
                return -EINVAL;
        }

        return ret;
}

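/*
 * wakeup_secondary_cpu_64 APIC callback: start_eip (the address of the
 * 16-bit startup trampoline) is ignored here, since VTL APs are brought up
 * directly in 64-bit mode via hv_vtl_bringup_vcpu().
 */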
static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
{
        int vp_id;

        pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
        vp_id = hv_vtl_apicid_to_vp_id(apicid);

        if (vp_id < 0) {
                pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
                return -EINVAL;
        }
        if (vp_id > ms_hyperv.max_vp_index) {
                pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid);
                return -EINVAL;
        }

        return hv_vtl_bringup_vcpu(vp_id, start_eip);
}

int __init hv_vtl_early_init(void)
{
        /*
         * `cpu_feature_enabled` returns the runtime feature support,
         * and here is the earliest it can be used.
         */
        if (cpu_feature_enabled(X86_FEATURE_XSAVE))
                panic("XSAVE has to be disabled as it is not supported by this module.\n"
                      "Please add 'noxsave' to the kernel command line.\n");

        real_mode_header = &hv_vtl_real_mode_header;
        apic_update_callback(wakeup_secondary_cpu_64, hv_vtl_wakeup_secondary_cpu);

        return 0;
}