linux/arch/x86/hyperv/hv_vtl.c
Linus Torvalds ca7e917769 Rework of APIC enumeration and topology evaluation:
The current implementation has a couple of shortcomings:
 
   - It fails to handle hybrid systems correctly.
 
   - The APIC registration code which handles CPU number assignents is in
     the middle of the APIC code and detached from the topology evaluation.
 
   - The various mechanisms which enumerate APICs, ACPI, MPPARSE and guest
     specific ones, tweak global variables as they see fit or in case of
     XENPV just hack around the generic mechanisms completely.
 
   - The CPUID topology evaluation code is sprinkled all over the vendor
     code and reevaluates global variables on every hotplug operation.
 
   - There is no way to analyze topology on the boot CPU before bringing up
     the APs. This causes problems for infrastructure like PERF which needs
     to size certain aspects upfront or could be simplified if that would be
     possible.
 
   - The APIC admission and CPU number association logic is incomprehensible
     and overly complex and needs to be kept around after boot instead of
     completing this right after the APIC enumeration.
 
 This update addresses these shortcomings with the following changes:
 
   - Rework the CPUID evaluation code so it is common for all vendors and
     provides information about the APIC ID segments in a uniform way
     independent of the number of segments (Thread, Core, Module, ..., Die,
     Package) so that this information can be computed instead of rewriting
     global variables of dubious value over and over.
 
   - A few cleanups and simplifcations of the APIC, IO/APIC and related
     interfaces to prepare for the topology evaluation changes.
 
   - Seperation of the parser stages so the early evaluation which tries to
     find the APIC address can be seperately overridden from the late
     evaluation which enumerates and registers the local APIC as further
     preparation for sanitizing the topology evaluation.
 
   - A new registration and admission logic which
 
      - encapsulates the inner workings so that parsers and guest logic
        cannot longer fiddle in it
 
      - uses the APIC ID segments to build topology bitmaps at registration
        time
 
      - provides a sane admission logic
 
      - allows to detect the crash kernel case, where CPU0 does not run on
        the real BSP, automatically. This is required to prevent sending
        INIT/SIPI sequences to the real BSP which would reset the whole
        machine. This was so far handled by a tedious command line
        parameter, which does not even work in nested crash scenarios.
 
      - Associates CPU number after the enumeration completed and prevents
        the late registration of APICs, which was somehow tolerated before.
 
   - Converting all parsers and guest enumeration mechanisms over to the
     new interfaces.
 
     This allows to get rid of all global variable tweaking from the parsers
     and enumeration mechanisms and sanitizes the XEN[PV] handling so it can
     use CPUID evaluation for the first time.
 
   - Mopping up existing sins by taking the information from the APIC ID
     segment bitmaps.
 
     This evaluates hybrid systems correctly on the boot CPU and allows for
     cleanups and fixes in the related drivers, e.g. PERF.
 
 The series has been extensively tested and the minimal late fallout due to
 a broken ACPI/MADT table has been addressed by tightening the admission
 logic further.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmXuDawTHHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYobE7EACngItF+UOTCoCV6och2lL6HVoIdZD1
 Y5oaAgD+WzQSz/lBkH6b9kZSyvjlMo6O9GlnGX+ii+VUnijDp4VrspnxbJDaKEq3
 gOfsSg2Tk+ps50HqMcZawjjBYJb/TmvKwEV2XuzIBPOONSWLNjvN7nBSzLl1eF9/
 8uCE39/8aB5K3GXryRyXdo2uLu6eHTVC0aYFu/kLX1/BbVqF5NMD3sz9E9w8+D/U
 MIIMEMXy4Fn+P2o0vVH+gjUlwI76mJbB1WqCX/sqbVacXrjl3KfNJRiisTFIOOYV
 8o+rIV0ef5X9xmZqtOXAdyZQzj++Gwmz9+4TU1M4YHtS7UkYn6AluOjvVekCc+gc
 qXE3WhqKfCK2/carRMLQxAMxNeRylkZG+Wuv1Qtyjpe9JX2dTqtems0f4DMp9DKf
 b7InO3z39kJanpqcUG2Sx+GWanetfnX+0Ho2Moqu6Xi+2ATr1PfMG/Wyr5/WWOfV
 qApaHSTwa+J43mSzP6BsXngEv085EHSGM5tPe7u46MCYFqB21+bMl+qH82KjMkOe
 c6uZovFQMmX2WBlqJSYGVCH+Jhgvqq8HFeRs19Hd4enOt3e6LE3E74RBVD1AyfLV
 1b/m8tYB/o871ZlEZwDCGVrV/LNnA7PxmFpq5ZHLpUt39g2/V0RH1puBVz1e97pU
 YsTT7hBCUYzgjQ==
 =/5oR
 -----END PGP SIGNATURE-----

Merge tag 'x86-apic-2024-03-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 APIC updates from Thomas Gleixner:
 "Rework of APIC enumeration and topology evaluation.

  The current implementation has a couple of shortcomings:

   - It fails to handle hybrid systems correctly.

   - The APIC registration code which handles CPU number assignents is
     in the middle of the APIC code and detached from the topology
     evaluation.

   - The various mechanisms which enumerate APICs, ACPI, MPPARSE and
     guest specific ones, tweak global variables as they see fit or in
     case of XENPV just hack around the generic mechanisms completely.

   - The CPUID topology evaluation code is sprinkled all over the vendor
     code and reevaluates global variables on every hotplug operation.

   - There is no way to analyze topology on the boot CPU before bringing
     up the APs. This causes problems for infrastructure like PERF which
     needs to size certain aspects upfront or could be simplified if
     that would be possible.

   - The APIC admission and CPU number association logic is
     incomprehensible and overly complex and needs to be kept around
     after boot instead of completing this right after the APIC
     enumeration.

  This update addresses these shortcomings with the following changes:

   - Rework the CPUID evaluation code so it is common for all vendors
     and provides information about the APIC ID segments in a uniform
     way independent of the number of segments (Thread, Core, Module,
     ..., Die, Package) so that this information can be computed instead
     of rewriting global variables of dubious value over and over.

   - A few cleanups and simplifcations of the APIC, IO/APIC and related
     interfaces to prepare for the topology evaluation changes.

   - Seperation of the parser stages so the early evaluation which tries
     to find the APIC address can be seperately overridden from the late
     evaluation which enumerates and registers the local APIC as further
     preparation for sanitizing the topology evaluation.

   - A new registration and admission logic which

       - encapsulates the inner workings so that parsers and guest logic
         cannot longer fiddle in it

       - uses the APIC ID segments to build topology bitmaps at
         registration time

       - provides a sane admission logic

       - allows to detect the crash kernel case, where CPU0 does not run
         on the real BSP, automatically. This is required to prevent
         sending INIT/SIPI sequences to the real BSP which would reset
         the whole machine. This was so far handled by a tedious command
         line parameter, which does not even work in nested crash
         scenarios.

       - Associates CPU number after the enumeration completed and
         prevents the late registration of APICs, which was somehow
         tolerated before.

   - Converting all parsers and guest enumeration mechanisms over to the
     new interfaces.

     This allows to get rid of all global variable tweaking from the
     parsers and enumeration mechanisms and sanitizes the XEN[PV]
     handling so it can use CPUID evaluation for the first time.

   - Mopping up existing sins by taking the information from the APIC ID
     segment bitmaps.

     This evaluates hybrid systems correctly on the boot CPU and allows
     for cleanups and fixes in the related drivers, e.g. PERF.

  The series has been extensively tested and the minimal late fallout
  due to a broken ACPI/MADT table has been addressed by tightening the
  admission logic further"

* tag 'x86-apic-2024-03-10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (76 commits)
  x86/topology: Ignore non-present APIC IDs in a present package
  x86/apic: Build the x86 topology enumeration functions on UP APIC builds too
  smp: Provide 'setup_max_cpus' definition on UP too
  smp: Avoid 'setup_max_cpus' namespace collision/shadowing
  x86/bugs: Use fixed addressing for VERW operand
  x86/cpu/topology: Get rid of cpuinfo::x86_max_cores
  x86/cpu/topology: Provide __num_[cores|threads]_per_package
  x86/cpu/topology: Rename topology_max_die_per_package()
  x86/cpu/topology: Rename smp_num_siblings
  x86/cpu/topology: Retrieve cores per package from topology bitmaps
  x86/cpu/topology: Use topology logical mapping mechanism
  x86/cpu/topology: Provide logical pkg/die mapping
  x86/cpu/topology: Simplify cpu_mark_primary_thread()
  x86/cpu/topology: Mop up primary thread mask handling
  x86/cpu/topology: Use topology bitmaps for sizing
  x86/cpu/topology: Let XEN/PV use topology from CPUID/MADT
  x86/xen/smp_pv: Count number of vCPUs early
  x86/cpu/topology: Assign hotpluggable CPUIDs during init
  x86/cpu/topology: Reject unknown APIC IDs on ACPI hotplug
  x86/topology: Add a mechanism to track topology via APIC IDs
  ...
2024-03-11 15:45:55 -07:00

241 lines
6.9 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023, Microsoft Corporation.
*
* Author:
* Saurabh Sengar <ssengar@microsoft.com>
*/
#include <asm/apic.h>
#include <asm/boot.h>
#include <asm/desc.h>
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/realmode.h>
extern struct boot_params boot_params;
static struct real_mode_header hv_vtl_real_mode_header;
static bool __init hv_vtl_msi_ext_dest_id(void)
{
return true;
}
void __init hv_vtl_init_platform(void)
{
pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
x86_platform.realmode_reserve = x86_init_noop;
x86_platform.realmode_init = x86_init_noop;
x86_init.irqs.pre_vector_init = x86_init_noop;
x86_init.timers.timer_init = x86_init_noop;
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_mptable = x86_init_noop;
x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
x86_init.mpparse.parse_smp_cfg = x86_init_noop;
x86_platform.get_wallclock = get_rtc_noop;
x86_platform.set_wallclock = set_rtc_noop;
x86_platform.get_nmi_reason = hv_get_nmi_reason;
x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
x86_platform.legacy.rtc = 0;
x86_platform.legacy.warm_reset = 0;
x86_platform.legacy.reserve_bios_regions = 0;
x86_platform.legacy.devices.pnpbios = 0;
x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id;
}
static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc)
{
return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) |
(desc->base1 << 16) | desc->base0;
}
static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc)
{
return ((u32)desc->limit1 << 16) | (u32)desc->limit0;
}
typedef void (*secondary_startup_64_fn)(void*, void*);
static void hv_vtl_ap_entry(void)
{
((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
}
static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
{
u64 status;
int ret = 0;
struct hv_enable_vp_vtl *input;
unsigned long irq_flags;
struct desc_ptr gdt_ptr;
struct desc_ptr idt_ptr;
struct ldttss_desc *tss;
struct ldttss_desc *ldt;
struct desc_struct *gdt;
u64 rsp = current->thread.sp;
u64 rip = (u64)&hv_vtl_ap_entry;
native_store_gdt(&gdt_ptr);
store_idt(&idt_ptr);
gdt = (struct desc_struct *)((void *)(gdt_ptr.address));
tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS);
ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT);
local_irq_save(irq_flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->partition_id = HV_PARTITION_ID_SELF;
input->vp_index = target_vp_index;
input->target_vtl.target_vtl = HV_VTL_MGMT;
/*
* The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit
* mode transition sequence after waking up an AP with SIPI whose
* vector points to the 16-bit AP startup trampoline code. Here in
* VTL2, we can't perform that sequence as the AP has to start in
* the 64-bit mode.
*
* To make this happen, we tell the hypervisor to load a valid 64-bit
* context (most of which is just magic numbers from the CPU manual)
* so that AP jumps right to the 64-bit entry of the kernel, and the
* control registers are loaded with values that let the AP fetch the
* code and data and carry on with work it gets assigned.
*/
input->vp_context.rip = rip;
input->vp_context.rsp = rsp;
input->vp_context.rflags = 0x0000000000000002;
input->vp_context.efer = __rdmsr(MSR_EFER);
input->vp_context.cr0 = native_read_cr0();
input->vp_context.cr3 = __native_read_cr3();
input->vp_context.cr4 = native_read_cr4();
input->vp_context.msr_cr_pat = __rdmsr(MSR_IA32_CR_PAT);
input->vp_context.idtr.limit = idt_ptr.size;
input->vp_context.idtr.base = idt_ptr.address;
input->vp_context.gdtr.limit = gdt_ptr.size;
input->vp_context.gdtr.base = gdt_ptr.address;
/* Non-system desc (64bit), long, code, present */
input->vp_context.cs.selector = __KERNEL_CS;
input->vp_context.cs.base = 0;
input->vp_context.cs.limit = 0xffffffff;
input->vp_context.cs.attributes = 0xa09b;
/* Non-system desc (64bit), data, present, granularity, default */
input->vp_context.ss.selector = __KERNEL_DS;
input->vp_context.ss.base = 0;
input->vp_context.ss.limit = 0xffffffff;
input->vp_context.ss.attributes = 0xc093;
/* System desc (128bit), present, LDT */
input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8;
input->vp_context.ldtr.base = hv_vtl_system_desc_base(ldt);
input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(ldt);
input->vp_context.ldtr.attributes = 0x82;
/* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */
input->vp_context.tr.selector = GDT_ENTRY_TSS * 8;
input->vp_context.tr.base = hv_vtl_system_desc_base(tss);
input->vp_context.tr.limit = hv_vtl_system_desc_limit(tss);
input->vp_context.tr.attributes = 0x8b;
status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, input, NULL);
if (!hv_result_success(status) &&
hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) {
pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %d ! [Err: %#llx\n]",
target_vp_index, status);
ret = -EINVAL;
goto free_lock;
}
status = hv_do_hypercall(HVCALL_START_VP, input, NULL);
if (!hv_result_success(status)) {
pr_err("HVCALL_START_VP failed for VP : %d ! [Err: %#llx]\n",
target_vp_index, status);
ret = -EINVAL;
}
free_lock:
local_irq_restore(irq_flags);
return ret;
}
static int hv_vtl_apicid_to_vp_id(u32 apic_id)
{
u64 control;
u64 status;
unsigned long irq_flags;
struct hv_get_vp_from_apic_id_in *input;
u32 *output, ret;
local_irq_save(irq_flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
input->partition_id = HV_PARTITION_ID_SELF;
input->apic_ids[0] = apic_id;
output = (u32 *)input;
control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
status = hv_do_hypercall(control, input, output);
ret = output[0];
local_irq_restore(irq_flags);
if (!hv_result_success(status)) {
pr_err("failed to get vp id from apic id %d, status %#llx\n",
apic_id, status);
return -EINVAL;
}
return ret;
}
static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
{
int vp_id;
pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
vp_id = hv_vtl_apicid_to_vp_id(apicid);
if (vp_id < 0) {
pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
return -EINVAL;
}
if (vp_id > ms_hyperv.max_vp_index) {
pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid);
return -EINVAL;
}
return hv_vtl_bringup_vcpu(vp_id, start_eip);
}
int __init hv_vtl_early_init(void)
{
/*
* `boot_cpu_has` returns the runtime feature support,
* and here is the earliest it can be used.
*/
if (cpu_feature_enabled(X86_FEATURE_XSAVE))
panic("XSAVE has to be disabled as it is not supported by this module.\n"
"Please add 'noxsave' to the kernel command line.\n");
real_mode_header = &hv_vtl_real_mode_header;
apic_update_callback(wakeup_secondary_cpu_64, hv_vtl_wakeup_secondary_cpu);
return 0;
}