From 1d6664fadda3e028b8220ba1ba487a928e801825 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Sun, 25 Jun 2023 15:34:38 +0800 Subject: [PATCH 01/50] KVM: x86: Use sysfs_emit() instead of sprintf() Use sysfs_emit() instead of sprintf() for sysfs entries. sysfs_emit() knows the size of the temporary buffer used for outputting sysfs content and avoids overrunning it. Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230625073438.57427-1-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ec169f5c7dce..31d9cbe817a2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6862,7 +6862,7 @@ static void mmu_destroy_caches(void) static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) - return sprintf(buffer, "never\n"); + return sysfs_emit(buffer, "never\n"); return param_get_bool(buffer, kp); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0ecf4be2c6af..254f2296e549 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -366,9 +366,9 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) { if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) - return sprintf(s, "???\n"); + return sysfs_emit(s, "???\n"); - return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); + return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } static void vmx_setup_fb_clear_ctrl(void) From 7f717f54845c518a64fe5464d855ddeca272483a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 18 Jul 2023 12:15:44 +0200 Subject: [PATCH 02/50] KVM: x86: Remove x86_emulate_ops::guest_has_long_mode Remove x86_emulate_ops::guest_has_long_mode along with its implementation, emulator_guest_has_long_mode(). It has been unused since commit 1d0da94cdafe ("KVM: x86: do not go through ctxt->ops when emulating rsm"). No functional change intended. 
Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230718101809.1249769-1-mhal@rbox.co Signed-off-by: Sean Christopherson --- arch/x86/kvm/kvm_emulate.h | 1 - arch/x86/kvm/x86.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index ab65f3a47dfd..be7aeb9b8ea3 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -213,7 +213,6 @@ struct x86_emulate_ops { bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); - bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a6b9bea62fb8..0fca1546e029 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8229,11 +8229,6 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); } -static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) -{ - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM); -} - static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) { return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); @@ -8335,7 +8330,6 @@ static const struct x86_emulate_ops emulate_ops = { .fix_hypercall = emulator_fix_hypercall, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, - .guest_has_long_mode = emulator_guest_has_long_mode, .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, .guest_has_rdpid = emulator_guest_has_rdpid, From af8e2ccfa6f101f505add076c1a4d56c718e0d50 Mon Sep 17 00:00:00 2001 From: Takahiro Itazuri Date: Wed, 12 Jul 2023 19:31:36 +0100 Subject: [PATCH 03/50] KVM: x86: Advertise host CPUID 0x80000005 in KVM_GET_SUPPORTED_CPUID Advertise CPUID 0x80000005 (L1 cache and TLB info) to userspace so that VMMs that reflect KVM_GET_SUPPORTED_CPUID into KVM_SET_CPUID2 will enumerate sane cache/TLB information to the guest. CPUID 0x80000006 (L2 cache and TLB and L3 cache info) has been returned since commit 43d05de2bee7 ("KVM: pass through CPUID(0x80000006)"). Enumerating both 0x80000005 and 0x80000006 with KVM_GET_SUPPORTED_CPUID is better than reporting one or the other, and 0x80000005 could be helpful for a VMM to pass to KVM_SET_CPUID{,2} for the same reasons as 0x80000006. Signed-off-by: Takahiro Itazuri Link: https://lore.kernel.org/all/ZK7NmfKI9xur%2FMop@google.com Link: https://lore.kernel.org/r/20230712183136.85561-1-itazur@amazon.com [sean: add link, massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7f4d13383cf2..ad6566636c22 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1151,6 +1151,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; + case 0x80000005: + /* Pass host L1 cache and TLB info. */ + break; case 0x80000006: /* Drop reserved bits, pass host L2 cache and TLB info. 
*/ entry->edx &= ~GENMASK(17, 16); From a2fd5d02bad6d63daaaf4a8bb19c2400387aca61 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 6 Jun 2023 17:43:09 -0700 Subject: [PATCH 04/50] KVM: x86: Snapshot host's MSR_IA32_ARCH_CAPABILITIES Snapshot the host's MSR_IA32_ARCH_CAPABILITIES, if it's supported, instead of reading the MSR every time KVM wants to query the host state, e.g. when initializing the default value during vCPU creation. The paths that query ARCH_CAPABILITIES aren't particularly performance sensitive, but creating vCPUs is a frequent enough operation that burning 8 bytes is a good trade-off. Alternatively, KVM could add a field in kvm_caps and thus skip the on-demand calculations entirely, but a pure snapshot isn't possible due to the way KVM handles the l1tf_vmx_mitigation module param. And unlike the other "supported" fields in kvm_caps, KVM doesn't enforce the "supported" value, i.e. KVM treats ARCH_CAPABILITIES like a CPUID leaf and lets userspace advertise whatever it wants. Those problems are solvable, but it's not clear there is real benefit versus snapshotting the host value, and grabbing the host value will allow additional cleanup of KVM's FB_CLEAR_CTRL code. Link: https://lore.kernel.org/all/20230524061634.54141-2-chao.gao@intel.com Cc: Chao Gao Cc: Xiaoyao Li Reviewed-by: Chao Gao Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20230607004311.1420507-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 22 ++++++---------------- arch/x86/kvm/x86.c | 13 +++++++------ arch/x86/kvm/x86.h | 1 + 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 254f2296e549..f0ec9acae86c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -255,14 +255,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - u64 msr; - - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; - return 0; - } + if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; } /* If set to auto use the default l1tf mitigation method */ @@ -373,15 +368,10 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) static void vmx_setup_fb_clear_ctrl(void) { - u64 msr; - - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) && + if ((host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && !boot_cpu_has_bug(X86_BUG_MDS) && - !boot_cpu_has_bug(X86_BUG_TAA)) { - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); - if (msr & ARCH_CAP_FB_CLEAR_CTRL) - vmx_fb_clear_ctrl_available = true; - } + !boot_cpu_has_bug(X86_BUG_TAA)) + vmx_fb_clear_ctrl_available = true; } static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0fca1546e029..a1b13d2d1d71 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -237,6 +237,9 @@ EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); +u64 __read_mostly host_arch_capabilities; +EXPORT_SYMBOL_GPL(host_arch_capabilities); + const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, mmu_shadow_zapped), @@ -1611,12 +1614,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr) static u64 kvm_get_arch_capabilities(void) { - u64 data = 0; - - if 
(boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); - data &= KVM_SUPPORTED_ARCH_CAP; - } + u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP; /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -9490,6 +9488,9 @@ static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_init_pmu_capability(ops->pmu_ops); + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities); + r = ops->hardware_setup(); if (r != 0) goto out_mmu_exit; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 82e3dafc5453..1e7be1f6ab29 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -323,6 +323,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; extern u64 host_xss; +extern u64 host_arch_capabilities; extern struct kvm_caps kvm_caps; From 550ba57faa04042956079b8d09ec0f83bef8817f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 6 Jun 2023 17:43:10 -0700 Subject: [PATCH 05/50] KVM: VMX: Drop unnecessary vmx_fb_clear_ctrl_available "cache" Now that KVM snapshots the host's MSR_IA32_ARCH_CAPABILITIES, drop the similar snapshot/cache of whether or not KVM is allowed to manipulate MSR_IA32_MCU_OPT_CTRL.FB_CLEAR_DIS. The motivation for the cache was presumably to avoid the RDMSR, e.g. boot_cpu_has_bug() is quite cheap, and modifying the vCPU's MSR_IA32_ARCH_CAPABILITIES is an infrequent operation and a relatively slow path. Cc: Pawan Gupta Reviewed-by: Pawan Gupta Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20230607004311.1420507-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f0ec9acae86c..e6d1ce2d230c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -237,9 +237,6 @@ static const struct { #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; -/* Control for disabling CPU Fill buffer clear */ -static bool __read_mostly vmx_fb_clear_ctrl_available; - static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; @@ -366,14 +363,6 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } -static void vmx_setup_fb_clear_ctrl(void) -{ - if ((host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && - !boot_cpu_has_bug(X86_BUG_MDS) && - !boot_cpu_has_bug(X86_BUG_TAA)) - vmx_fb_clear_ctrl_available = true; -} - static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) { u64 msr; @@ -399,7 +388,9 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->disable_fb_clear = vmx_fb_clear_ctrl_available; + vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && + !boot_cpu_has_bug(X86_BUG_MDS) && + !boot_cpu_has_bug(X86_BUG_TAA); /* * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS @@ -8626,8 +8617,6 @@ static int __init vmx_init(void) if (r) goto err_l1d_flush; - vmx_setup_fb_clear_ctrl(); - for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); From 41e90a69a49be2e3467e811ae8fdc17456af1b02 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jun 2023 16:32:49 -0700 Subject: [PATCH 06/50] KVM: x86: Retry APIC optimized map recalc if 
vCPU is added/enabled Retry the optimized APIC map recalculation if an APIC-enabled vCPU shows up between allocating the map and filling in the map data. Conditionally reschedule before retrying even though the number of vCPUs that can be created is bounded by KVM. Retrying a few thousand times isn't so slow as to be hugely problematic, but it's not blazing fast either. Reset xapic_id_mismatch on each retry as a vCPU could change its xAPIC ID between loops, but do NOT reset max_id. The map size also factors in whether or not a vCPU's local APIC is hardware-enabled, i.e. userspace and/or the guest can theoretically keep KVM retrying indefinitely. The only downside is that KVM will allocate more memory than is strictly necessary if the vCPU with the highest x2APIC ID disabled its APIC while the recalculation was in-progress. Refresh kvm->arch.apic_map_dirty to opportunistically change it from DIRTY => UPDATE_IN_PROGRESS to avoid an unnecessary recalc from a different task, i.e. if another task is waiting to attempt an update (which is likely since a retry happens if and only if an update is required). Link: https://lore.kernel.org/r/20230602233250.1014316-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 113ca9661ab2..673880bc0762 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -376,7 +376,8 @@ void kvm_recalculate_apic_map(struct kvm *kvm) struct kvm_vcpu *vcpu; unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ - bool xapic_id_mismatch = false; + bool xapic_id_mismatch; + int r; /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) @@ -386,9 +387,14 @@ void kvm_recalculate_apic_map(struct kvm *kvm) "Dirty APIC map without an in-kernel local APIC"); mutex_lock(&kvm->arch.apic_map_lock); + +retry: /* - * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map - * (if clean) or the APIC registers (if dirty). + * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean) + * or the APIC registers (if dirty). Note, on retry the map may have + * not yet been marked dirty by whatever task changed a vCPU's x2APIC + * ID, i.e. the map may still show up as in-progress. In that case + * this task still needs to retry and complete its calculation. */ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty, DIRTY, UPDATE_IN_PROGRESS) == CLEAN) { @@ -397,6 +403,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm) return; } + /* + * Reset the mismatch flag between attempts so that KVM does the right + * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e. + * keep max_id strictly increasing. Disallowing max_id from shrinking + * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU + * with the highest x2APIC ID is toggling its APIC on and off. 
+ */ + xapic_id_mismatch = false; + kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); @@ -415,9 +430,15 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (!kvm_apic_present(vcpu)) continue; - if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) { + r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch); + if (r) { kvfree(new); new = NULL; + if (r == -E2BIG) { + cond_resched(); + goto retry; + } + goto out; } From b23c83ad2c638420ec0608a9de354507c41bec29 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:41 -0700 Subject: [PATCH 07/50] x86/reboot: VMCLEAR active VMCSes before emergency reboot VMCLEAR active VMCSes before any emergency reboot, not just if the kernel may kexec into a new kernel after a crash. Per Intel's SDM, the VMX architecture doesn't require the CPU to flush the VMCS cache on INIT. If an emergency reboot doesn't RESET CPUs, cached VMCSes could theoretically be kept and only be written back to memory after the new kernel is booted, i.e. could effectively corrupt memory after reboot. Opportunistically remove the setting of the global pointer to NULL to make checkpatch happy. Cc: Andrew Cooper Link: https://lore.kernel.org/r/20230721201859.2307736-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kexec.h | 2 -- arch/x86/include/asm/reboot.h | 2 ++ arch/x86/kernel/crash.c | 31 ------------------------------- arch/x86/kernel/reboot.c | 22 ++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 10 +++------- 5 files changed, 27 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5b77bbc28f96..819046974b99 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -205,8 +205,6 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image); #endif #endif -typedef void crash_vmclear_fn(void); -extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; extern void kdump_nmi_shootdown_cpus(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 9177b4354c3f..dc201724a643 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +typedef void crash_vmclear_fn(void); +extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; void cpu_emergency_disable_virtualization(void); typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index cdd92ab43cda..54cd959cb316 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -48,38 +48,12 @@ struct crash_memmap_data { unsigned int type; }; -/* - * This is used to VMCLEAR all VMCSs loaded on the - * processor. And when loading kvm_intel module, the - * callback function pointer will be assigned. - * - * protected by rcu. 
- */ -crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; -EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); - -static inline void cpu_crash_vmclear_loaded_vmcss(void) -{ - crash_vmclear_fn *do_vmclear_operation = NULL; - - rcu_read_lock(); - do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); - if (do_vmclear_operation) - do_vmclear_operation(); - rcu_read_unlock(); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) { crash_save_cpu(regs, cpu); - /* - * VMCLEAR VMCSs loaded on all cpus if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - /* * Disable Intel PT to stop its logging */ @@ -133,11 +107,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_smp_send_stop(); - /* - * VMCLEAR VMCSs loaded on this cpu if needed. - */ - cpu_crash_vmclear_loaded_vmcss(); - cpu_emergency_disable_virtualization(); /* diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3adbe97015c1..3fa4c6717a1d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -787,6 +787,26 @@ void machine_crash_shutdown(struct pt_regs *regs) } #endif +/* + * This is used to VMCLEAR all VMCSs loaded on the + * processor. And when loading kvm_intel module, the + * callback function pointer will be assigned. + * + * protected by rcu. + */ +crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; +EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); + +static inline void cpu_crash_vmclear_loaded_vmcss(void) +{ + crash_vmclear_fn *do_vmclear_operation = NULL; + + rcu_read_lock(); + do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); + if (do_vmclear_operation) + do_vmclear_operation(); + rcu_read_unlock(); +} /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; @@ -798,6 +818,8 @@ int crashing_cpu = -1; */ void cpu_emergency_disable_virtualization(void) { + cpu_crash_vmclear_loaded_vmcss(); + cpu_emergency_vmxoff(); cpu_emergency_svm_disable(); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e6d1ce2d230c..75351477f090 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include @@ -725,7 +725,6 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, return ret; } -#ifdef CONFIG_KEXEC_CORE static void crash_vmclear_local_loaded_vmcss(void) { int cpu = raw_smp_processor_id(); @@ -735,7 +734,6 @@ static void crash_vmclear_local_loaded_vmcss(void) loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); } -#endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) { @@ -8573,10 +8571,9 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; -#ifdef CONFIG_KEXEC_CORE RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); synchronize_rcu(); -#endif + vmx_cleanup_l1d_flush(); } @@ -8623,10 +8620,9 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } -#ifdef CONFIG_KEXEC_CORE rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); -#endif + vmx_check_vmcs12_offsets(); /* From 5e408396c60cd0f0b53a43713016b6d6af8d69e0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:42 -0700 Subject: [PATCH 08/50] x86/reboot: Harden virtualization hooks for emergency reboot Provide dedicated helpers to (un)register virt hooks used during an emergency crash/reboot, and WARN if there is an attempt to overwrite the registered callback, or an attempt to do an unpaired unregister. 
Opportunistically use rcu_assign_pointer() instead of RCU_INIT_POINTER(), mainly so that the set/unset paths are more symmetrical, but also because any performance gains from using RCU_INIT_POINTER() are meaningless for this code. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/reboot.h | 5 +++-- arch/x86/kernel/reboot.c | 30 ++++++++++++++++++++++++------ arch/x86/kvm/vmx/vmx.c | 6 ++---- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index dc201724a643..74c6a624d166 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,8 +25,9 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 -typedef void crash_vmclear_fn(void); -extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; +typedef void (cpu_emergency_virt_cb)(void); +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3fa4c6717a1d..62ccedeb5e2b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -794,17 +794,35 @@ void machine_crash_shutdown(struct pt_regs *regs) * * protected by rcu. */ -crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; -EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); +static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; + +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, callback); +} +EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); + +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); static inline void cpu_crash_vmclear_loaded_vmcss(void) { - crash_vmclear_fn *do_vmclear_operation = NULL; + cpu_emergency_virt_cb *callback; rcu_read_lock(); - do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); - if (do_vmclear_operation) - do_vmclear_operation(); + callback = rcu_dereference(cpu_emergency_virt_callback); + if (callback) + callback(); rcu_read_unlock(); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 75351477f090..661ba09685b8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8571,8 +8571,7 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); + cpu_emergency_unregister_virt_callback(crash_vmclear_local_loaded_vmcss); vmx_cleanup_l1d_flush(); } @@ -8620,8 +8619,7 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - rcu_assign_pointer(crash_vmclear_loaded_vmcss, - crash_vmclear_local_loaded_vmcss); + cpu_emergency_register_virt_callback(crash_vmclear_local_loaded_vmcss); vmx_check_vmcs12_offsets(); From 119b5cb4ffd0166f3e98e9ee042f5046f7744f28 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:43 -0700 Subject: [PATCH 09/50] x86/reboot: KVM: Handle 
VMXOFF in KVM's reboot callback Use KVM VMX's reboot/crash callback to do VMXOFF in an emergency instead of manually and blindly doing VMXOFF. There's no need to attempt VMXOFF if a hypervisor, i.e. KVM, isn't loaded/active, i.e. if the CPU can't possibly be post-VMXON. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/virtext.h | 10 ---------- arch/x86/kernel/reboot.c | 29 +++++++++-------------------- arch/x86/kvm/vmx/vmx.c | 8 +++++--- 3 files changed, 14 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 3b12e6b99412..5bc29fab15da 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -70,16 +70,6 @@ static inline void __cpu_emergency_vmxoff(void) cpu_vmxoff(); } -/** Disable VMX if it is supported and enabled on the current CPU - */ -static inline void cpu_emergency_vmxoff(void) -{ - if (cpu_has_vmx()) - __cpu_emergency_vmxoff(); -} - - - /* * SVM functions: diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 62ccedeb5e2b..d2d0f2672a64 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -787,13 +787,7 @@ void machine_crash_shutdown(struct pt_regs *regs) } #endif -/* - * This is used to VMCLEAR all VMCSs loaded on the - * processor. And when loading kvm_intel module, the - * callback function pointer will be assigned. - * - * protected by rcu. - */ +/* RCU-protected callback to disable virtualization prior to reboot. */ static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) @@ -815,17 +809,6 @@ void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) } EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); -static inline void cpu_crash_vmclear_loaded_vmcss(void) -{ - cpu_emergency_virt_cb *callback; - - rcu_read_lock(); - callback = rcu_dereference(cpu_emergency_virt_callback); - if (callback) - callback(); - rcu_read_unlock(); -} - /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; @@ -836,9 +819,15 @@ int crashing_cpu = -1; */ void cpu_emergency_disable_virtualization(void) { - cpu_crash_vmclear_loaded_vmcss(); + cpu_emergency_virt_cb *callback; - cpu_emergency_vmxoff(); + rcu_read_lock(); + callback = rcu_dereference(cpu_emergency_virt_callback); + if (callback) + callback(); + rcu_read_unlock(); + + /* KVM_AMD doesn't yet utilize the common callback. 
*/ cpu_emergency_svm_disable(); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 661ba09685b8..df991f15baf4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -725,7 +725,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, return ret; } -static void crash_vmclear_local_loaded_vmcss(void) +static void vmx_emergency_disable(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; @@ -733,6 +733,8 @@ static void crash_vmclear_local_loaded_vmcss(void) list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); + + __cpu_emergency_vmxoff(); } static void __loaded_vmcs_clear(void *arg) @@ -8571,7 +8573,7 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; - cpu_emergency_unregister_virt_callback(crash_vmclear_local_loaded_vmcss); + cpu_emergency_unregister_virt_callback(vmx_emergency_disable); vmx_cleanup_l1d_flush(); } @@ -8619,7 +8621,7 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - cpu_emergency_register_virt_callback(crash_vmclear_local_loaded_vmcss); + cpu_emergency_register_virt_callback(vmx_emergency_disable); vmx_check_vmcs12_offsets(); From baeb4de7ad12b700a91f2a7be8e1c0389a5c8fd4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:44 -0700 Subject: [PATCH 10/50] x86/reboot: KVM: Disable SVM during reboot via virt/KVM reboot callback Use the virt callback to disable SVM (and set GIF=1) during an emergency instead of blindly attempting to disable SVM. Like the VMX case, if a hypervisor, i.e. KVM, isn't loaded/active, SVM can't be in use. Acked-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/virtext.h | 8 -------- arch/x86/kernel/reboot.c | 3 --- arch/x86/kvm/svm/svm.c | 19 +++++++++++++++++-- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 5bc29fab15da..aaed66249ccf 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -133,12 +133,4 @@ fault: } } -/** Makes sure SVM is disabled, if it is supported on the CPU - */ -static inline void cpu_emergency_svm_disable(void) -{ - if (cpu_has_svm(NULL)) - cpu_svm_disable(); -} - #endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d2d0f2672a64..48ad2d1ff83d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -826,9 +826,6 @@ void cpu_emergency_disable_virtualization(void) if (callback) callback(); rcu_read_unlock(); - - /* KVM_AMD doesn't yet utilize the common callback. 
*/ - cpu_emergency_svm_disable(); } #if defined(CONFIG_SMP) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d381ad424554..1ae9c2c7eacb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -563,6 +564,11 @@ out: preempt_enable(); } +static void svm_emergency_disable(void) +{ + cpu_svm_disable(); +} + static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ @@ -5209,6 +5215,13 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { .pmu_ops = &amd_pmu_ops, }; +static void __svm_exit(void) +{ + kvm_x86_vendor_exit(); + + cpu_emergency_unregister_virt_callback(svm_emergency_disable); +} + static int __init svm_init(void) { int r; @@ -5222,6 +5235,8 @@ static int __init svm_init(void) if (r) return r; + cpu_emergency_register_virt_callback(svm_emergency_disable); + /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! @@ -5234,14 +5249,14 @@ static int __init svm_init(void) return 0; err_kvm_init: - kvm_x86_vendor_exit(); + __svm_exit(); return r; } static void __exit svm_exit(void) { kvm_exit(); - kvm_x86_vendor_exit(); + __svm_exit(); } module_init(svm_init) From ad93c1a7c0102c93e92bf0c06412a1f588e015ab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:45 -0700 Subject: [PATCH 11/50] x86/reboot: Assert that IRQs are disabled when turning off virtualization Assert that IRQs are disabled when turning off virtualization in an emergency. KVM enables hardware via on_each_cpu(), i.e. could re-enable hardware if a pending IPI were delivered after disabling virtualization. Remove a misleading comment from emergency_reboot_disable_virtualization() about "just" needing to guarantee the CPU is stable (see above). Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kernel/reboot.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 48ad2d1ff83d..4cad7183b89e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -532,7 +532,6 @@ static inline void nmi_shootdown_cpus_on_restart(void); static void emergency_reboot_disable_virtualization(void) { - /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); /* @@ -821,6 +820,13 @@ void cpu_emergency_disable_virtualization(void) { cpu_emergency_virt_cb *callback; + /* + * IRQs must be disabled as KVM enables virtualization in hardware via + * function call IPIs, i.e. IRQs need to be disabled to guarantee + * virtualization stays disabled. + */ + lockdep_assert_irqs_disabled(); + rcu_read_lock(); callback = rcu_dereference(cpu_emergency_virt_callback); if (callback) From edc8deb087d884bac2f7013f0c23af73042b23a7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:46 -0700 Subject: [PATCH 12/50] x86/reboot: Hoist "disable virt" helpers above "emergency reboot" path Move the various "disable virtualization" helpers above the emergency reboot path so that emergency_reboot_disable_virtualization() can be stubbed out in a future patch if neither KVM_INTEL nor KVM_AMD is enabled, i.e. if there is no in-tree user of CPU virtualization. No functional change intended. 
Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kernel/reboot.c | 90 ++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 4cad7183b89e..85cb2dfcb67b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -530,6 +530,51 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); +/* RCU-protected callback to disable virtualization prior to reboot. */ +static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; + +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, callback); +} +EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); + +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) +{ + if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) + return; + + rcu_assign_pointer(cpu_emergency_virt_callback, NULL); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); + +/* + * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during + * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if + * GIF=0, i.e. if the crash occurred between CLGI and STGI. + */ +void cpu_emergency_disable_virtualization(void) +{ + cpu_emergency_virt_cb *callback; + + /* + * IRQs must be disabled as KVM enables virtualization in hardware via + * function call IPIs, i.e. IRQs need to be disabled to guarantee + * virtualization stays disabled. + */ + lockdep_assert_irqs_disabled(); + + rcu_read_lock(); + callback = rcu_dereference(cpu_emergency_virt_callback); + if (callback) + callback(); + rcu_read_unlock(); +} + static void emergency_reboot_disable_virtualization(void) { local_irq_disable(); @@ -786,54 +831,9 @@ void machine_crash_shutdown(struct pt_regs *regs) } #endif -/* RCU-protected callback to disable virtualization prior to reboot. */ -static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; - -void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) -{ - if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback))) - return; - - rcu_assign_pointer(cpu_emergency_virt_callback, callback); -} -EXPORT_SYMBOL_GPL(cpu_emergency_register_virt_callback); - -void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) -{ - if (WARN_ON_ONCE(rcu_access_pointer(cpu_emergency_virt_callback) != callback)) - return; - - rcu_assign_pointer(cpu_emergency_virt_callback, NULL); - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(cpu_emergency_unregister_virt_callback); - /* This is the CPU performing the emergency shutdown work. */ int crashing_cpu = -1; -/* - * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during - * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if - * GIF=0, i.e. if the crash occurred between CLGI and STGI. - */ -void cpu_emergency_disable_virtualization(void) -{ - cpu_emergency_virt_cb *callback; - - /* - * IRQs must be disabled as KVM enables virtualization in hardware via - * function call IPIs, i.e. IRQs need to be disabled to guarantee - * virtualization stays disabled. 
- */ - lockdep_assert_irqs_disabled(); - - rcu_read_lock(); - callback = rcu_dereference(cpu_emergency_virt_callback); - if (callback) - callback(); - rcu_read_unlock(); -} - #if defined(CONFIG_SMP) static nmi_shootdown_cb shootdown_callback; From 59765db5fc82726b32876b794667e2c6936a98ab Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:47 -0700 Subject: [PATCH 13/50] x86/reboot: Disable virtualization during reboot iff callback is registered Attempt to disable virtualization during an emergency reboot if and only if there is a registered virt callback, i.e. iff a hypervisor (KVM) is active. If there's no active hypervisor, then the CPU can't be operating with VMX or SVM enabled (barring an egregious bug). Checking for a valid callback instead of simply for SVM or VMX support can also eliminate spurious NMIs by avoiding the unnecessary call to nmi_shootdown_cpus_on_restart(). Note, IRQs are disabled, which prevents KVM from coming along and enabling virtualization after the fact. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kernel/reboot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 85cb2dfcb67b..98e5db3fd7f4 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -589,7 +588,7 @@ static void emergency_reboot_disable_virtualization(void) * Do the NMI shootdown even if virtualization is off on _this_ CPU, as * other CPUs may have virtualization enabled. */ - if (cpu_has_vmx() || cpu_has_svm(NULL)) { + if (rcu_access_pointer(cpu_emergency_virt_callback)) { /* Safely force _this_ CPU out of VMX/SVM operation. */ cpu_emergency_disable_virtualization(); From 261cd5ed934e6923187cf1c9eaa6cb63f2b81212 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:48 -0700 Subject: [PATCH 14/50] x86/reboot: Expose VMCS crash hooks if and only if KVM_{INTEL,AMD} is enabled Expose the crash/reboot hooks used by KVM to disable virtualization in hardware and unblock INIT only if there's a potential in-tree user, i.e. either KVM_INTEL or KVM_AMD is enabled. 
Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/reboot.h | 4 ++++ arch/x86/kernel/reboot.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 74c6a624d166..6536873f8fc0 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,10 +25,14 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) typedef void (cpu_emergency_virt_cb)(void); void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); +#else +static inline void cpu_emergency_disable_virtualization(void) {} +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 98e5db3fd7f4..830425e6d38e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -529,6 +529,7 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); +#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) /* RCU-protected callback to disable virtualization prior to reboot. */ static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; @@ -596,7 +597,9 @@ static void emergency_reboot_disable_virtualization(void) nmi_shootdown_cpus_on_restart(); } } - +#else +static void emergency_reboot_disable_virtualization(void) { } +#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ void __attribute__((weak)) mach_reboot_fixups(void) { From b6a6af0d19cea17a3c1e34f81f0d521775f94ab5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:49 -0700 Subject: [PATCH 15/50] x86/virt: KVM: Open code cpu_has_vmx() in KVM VMX Fold the raw CPUID check for VMX into kvm_is_vmx_supported(), its sole user. Keep the check even though KVM also checks X86_FEATURE_VMX, as the intent is to provide a unique error message if VMX is unsupported by hardware, whereas X86_FEATURE_VMX may be clear due to firmware and/or kernel actions. No functional change intended. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/virtext.h | 10 ---------- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index aaed66249ccf..b1171a5ad452 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -22,14 +22,6 @@ /* * VMX functions: */ - -static inline int cpu_has_vmx(void) -{ - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ -} - - /** * cpu_vmxoff() - Disable VMX on the current CPU * @@ -61,8 +53,6 @@ static inline int cpu_vmx_enabled(void) } /** Disable VMX if it is enabled on the current CPU - * - * You shouldn't call this if cpu_has_vmx() returns 0. 
*/ static inline void __cpu_emergency_vmxoff(void) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index df991f15baf4..4b9d68826172 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2699,7 +2699,7 @@ static bool kvm_is_vmx_supported(void) { int cpu = raw_smp_processor_id(); - if (!cpu_has_vmx()) { + if (!(cpuid_ecx(1) & feature_bit(VMX))) { pr_err("VMX not supported by CPU %d\n", cpu); return false; } From 22e420e127399f2e75c8efddff763e92247f3da7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:50 -0700 Subject: [PATCH 16/50] x86/virt: KVM: Move VMXOFF helpers into KVM VMX Now that VMX is disabled in emergencies via the virt callbacks, move the VMXOFF helpers into KVM, the only remaining user. No functional change intended. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/virtext.h | 42 ---------------------------------- arch/x86/kvm/vmx/vmx.c | 29 ++++++++++++++++++++--- 2 files changed, 26 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index b1171a5ad452..a27801f2bc71 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -19,48 +19,6 @@ #include #include -/* - * VMX functions: - */ -/** - * cpu_vmxoff() - Disable VMX on the current CPU - * - * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) - * - * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to - * atomically track post-VMXON state, e.g. this may be called in NMI context. - * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. - * faults are guaranteed to be due to the !post-VMXON check unless the CPU is - * magically in RM, VM86, compat mode, or at CPL>0. - */ -static inline int cpu_vmxoff(void) -{ - asm_volatile_goto("1: vmxoff\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "cc", "memory" : fault); - - cr4_clear_bits(X86_CR4_VMXE); - return 0; - -fault: - cr4_clear_bits(X86_CR4_VMXE); - return -EIO; -} - -static inline int cpu_vmx_enabled(void) -{ - return __read_cr4() & X86_CR4_VMXE; -} - -/** Disable VMX if it is enabled on the current CPU - */ -static inline void __cpu_emergency_vmxoff(void) -{ - if (cpu_vmx_enabled()) - cpu_vmxoff(); -} - - /* * SVM functions: */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4b9d68826172..64377fe3a990 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include "capabilities.h" @@ -725,6 +724,29 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, return ret; } +/* + * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) + * + * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to + * atomically track post-VMXON state, e.g. this may be called in NMI context. + * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. + * faults are guaranteed to be due to the !post-VMXON check unless the CPU is + * magically in RM, VM86, compat mode, or at CPL>0. 
+ */ +static int kvm_cpu_vmxoff(void) +{ + asm_volatile_goto("1: vmxoff\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "cc", "memory" : fault); + + cr4_clear_bits(X86_CR4_VMXE); + return 0; + +fault: + cr4_clear_bits(X86_CR4_VMXE); + return -EIO; +} + static void vmx_emergency_disable(void) { int cpu = raw_smp_processor_id(); @@ -734,7 +756,8 @@ static void vmx_emergency_disable(void) loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); - __cpu_emergency_vmxoff(); + if (__read_cr4() & X86_CR4_VMXE) + kvm_cpu_vmxoff(); } static void __loaded_vmcs_clear(void *arg) @@ -2799,7 +2822,7 @@ static void vmx_hardware_disable(void) { vmclear_local_loaded_vmcss(); - if (cpu_vmxoff()) + if (kvm_cpu_vmxoff()) kvm_spurious_fault(); hv_reset_evmcs(); From 554856b69e3d1faae6f056f75ed6262b5eb01546 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:51 -0700 Subject: [PATCH 17/50] KVM: SVM: Make KVM_AMD depend on CPU_SUP_AMD or CPU_SUP_HYGON Make building KVM SVM support depend on support for AMD or Hygon. KVM already effectively restricts SVM support to AMD and Hygon by virtue of the vendor string checks in cpu_has_svm(), and KVM VMX support depends on one of its three known vendors (Intel, Centaur, or Zhaoxin). Add the CPU_SUP_HYGON clause even though CPU_SUP_HYGON selects CPU_SUP_AMD to document that KVM SVM support isn't just for AMD CPUs, and to prevent breakage should Hygon support ever become a standalone thing. Link: https://lore.kernel.org/r/20230721201859.2307736-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 89ca7f4c1464..66dbd1f4d57d 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -101,7 +101,7 @@ config X86_SGX_KVM config KVM_AMD tristate "KVM for AMD processors support" - depends on KVM + depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) help Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. From 5df8ecfe3632d5879d1f154f7aa8de441b5d1c89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:52 -0700 Subject: [PATCH 18/50] x86/virt: Drop unnecessary check on extended CPUID level in cpu_has_svm() Drop the explicit check on the extended CPUID level in cpu_has_svm(); the kernel's cached CPUID info will leave the entire SVM leaf unset if said leaf is not supported by hardware. Prior to using cached information, the check was needed to avoid false positives due to Intel's rather crazy CPUID behavior of returning the values of the maximum supported leaf if the specified leaf is unsupported. 
Fixes: 682a8108872f ("x86/kvm/svm: Simplify cpu_has_svm()") Link: https://lore.kernel.org/r/20230721201859.2307736-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/virtext.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index a27801f2bc71..be50c414efe4 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -39,12 +39,6 @@ static inline int cpu_has_svm(const char **msg) return 0; } - if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { - if (msg) - *msg = "can't execute cpuid_8000000a"; - return 0; - } - if (!boot_cpu_has(X86_FEATURE_SVM)) { if (msg) *msg = "svm not available"; return 0; } From 85fd29dd5fe459b732e46c7a9782fc403c5604ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:53 -0700 Subject: [PATCH 19/50] x86/virt: KVM: Open code cpu_has_svm() into kvm_is_svm_supported() Fold the guts of cpu_has_svm() into kvm_is_svm_supported(), its sole remaining user. No functional change intended. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/virtext.h | 28 ---------------------------- arch/x86/kvm/svm/svm.c | 11 ++++++++--- 2 files changed, 8 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index be50c414efe4..632575e257d8 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -22,35 +22,7 @@ /* * SVM functions: */ - -/** Check if the CPU has SVM support * - * You can use the 'msg' arg to get a message describing the problem, - * if the function returns zero. Simply pass NULL if you are not interested - * on the messages; gcc should take care of not generating code for - * the messages on this case. - */ -static inline int cpu_has_svm(const char **msg) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { - if (msg) - *msg = "not amd or hygon"; - return 0; - } - - if (!boot_cpu_has(X86_FEATURE_SVM)) { - if (msg) - *msg = "svm not available"; - return 0; - } - return 1; -} - - /** Disable SVM on the current CPU - * - * You should call this only if cpu_has_svm() returned true. */ static inline void cpu_svm_disable(void) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1ae9c2c7eacb..ff6c769aafb2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -521,11 +521,16 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) static bool kvm_is_svm_supported(void) { int cpu = raw_smp_processor_id(); - const char *msg; u64 vm_cr; - if (!cpu_has_svm(&msg)) { - pr_err("SVM not supported by CPU %d, %s\n", cpu, msg); + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { + pr_err("CPU %d isn't AMD or Hygon\n", cpu); + return false; + } + + if (!boot_cpu_has(X86_FEATURE_SVM)) { + pr_err("SVM not supported by CPU %d\n", cpu); return false; } From c4db4f20f3bf619bacd3d705bf427016453eedb1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:54 -0700 Subject: [PATCH 20/50] KVM: SVM: Check that the current CPU supports SVM in kvm_is_svm_supported() Check "this" CPU instead of the boot CPU when querying SVM support so that the per-CPU checks done during hardware enabling actually function as intended, i.e. will detect issues where SVM isn't supported on all CPUs. 
Disable migration for the use from svm_init() mostly so that the standard accessors for the per-CPU data can be used without getting yelled at by CONFIG_DEBUG_PREEMPT=y sanity checks. Preventing the "disabled by BIOS" error message from reporting the wrong CPU is largely a bonus, as ensuring a stable CPU during module load is a non-goal for KVM. Link: https://lore.kernel.org/all/ZAdxNgv0M6P63odE@google.com Cc: Kai Huang Cc: Chao Gao Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ff6c769aafb2..9e449167e71b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -518,18 +518,20 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu) vcpu->arch.osvw.status |= 1; } -static bool kvm_is_svm_supported(void) +static bool __kvm_is_svm_supported(void) { - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu); + u64 vm_cr; - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { + if (c->x86_vendor != X86_VENDOR_AMD && + c->x86_vendor != X86_VENDOR_HYGON) { pr_err("CPU %d isn't AMD or Hygon\n", cpu); return false; } - if (!boot_cpu_has(X86_FEATURE_SVM)) { + if (!cpu_has(c, X86_FEATURE_SVM)) { pr_err("SVM not supported by CPU %d\n", cpu); return false; } @@ -548,9 +550,20 @@ static bool kvm_is_svm_supported(void) return true; } +static bool kvm_is_svm_supported(void) +{ + bool supported; + + migrate_disable(); + supported = __kvm_is_svm_supported(); + migrate_enable(); + + return supported; +} + static int svm_check_processor_compat(void) { - if (!kvm_is_svm_supported()) + if (!__kvm_is_svm_supported()) return -EIO; return 0; From f9a8866040fcf8672f20c67e07e67ba5a00caba4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:55 -0700 Subject: [PATCH 21/50] KVM: VMX: Ensure CPU is stable when probing basic VMX support Disable migration when probing VMX support during module load to ensure the CPU is stable, mostly to match similar SVM logic, where allowing migration effectively requires deliberately writing buggy code. As a bonus, KVM won't report the wrong CPU to userspace if VMX is unsupported, but in practice that is a very, very minor bonus as the only way that reporting the wrong CPU would actually matter is if hardware is broken or if the system is misconfigured, i.e. if KVM gets migrated from a CPU that _does_ support VMX to a CPU that does _not_ support VMX. 
Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 64377fe3a990..ff0eced7d2ab 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2718,9 +2718,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } -static bool kvm_is_vmx_supported(void) +static bool __kvm_is_vmx_supported(void) { - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); if (!(cpuid_ecx(1) & feature_bit(VMX))) { pr_err("VMX not supported by CPU %d\n", cpu); @@ -2736,13 +2736,24 @@ static bool kvm_is_vmx_supported(void) return true; } +static bool kvm_is_vmx_supported(void) +{ + bool supported; + + migrate_disable(); + supported = __kvm_is_vmx_supported(); + migrate_enable(); + + return supported; +} + static int vmx_check_processor_compat(void) { int cpu = raw_smp_processor_id(); struct vmcs_config vmcs_conf; struct vmx_capability vmx_cap; - if (!kvm_is_vmx_supported()) + if (!__kvm_is_vmx_supported()) return -EIO; if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { From 76ab8161083bfd0ae4de9b93e68d639da6e1c726 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:56 -0700 Subject: [PATCH 22/50] x86/virt: KVM: Move "disable SVM" helper into KVM SVM Move cpu_svm_disable() into KVM proper now that all hardware virtualization management is routed through KVM. Remove the now-empty virtext.h. No functional change intended. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/virtext.h | 50 ---------------------------------- arch/x86/kvm/svm/svm.c | 29 +++++++++++++++++--- 2 files changed, 25 insertions(+), 54 deletions(-) delete mode 100644 arch/x86/include/asm/virtext.h diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h deleted file mode 100644 index 632575e257d8..000000000000 --- a/arch/x86/include/asm/virtext.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* CPU virtualization extensions handling - * - * This should carry the code for handling CPU virtualization extensions - * that needs to live in the kernel core. - * - * Author: Eduardo Habkost - * - * Copyright (C) 2008, Red Hat Inc. - * - * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. - */ -#ifndef _ASM_X86_VIRTEX_H -#define _ASM_X86_VIRTEX_H - -#include - -#include -#include -#include - -/* - * SVM functions: - */ -/** Disable SVM on the current CPU - */ -static inline void cpu_svm_disable(void) -{ - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - if (efer & EFER_SVME) { - /* - * Force GIF=1 prior to disabling SVM to ensure INIT and NMI - * aren't blocked, e.g. if a fatal error occurred between CLGI - * and STGI. Note, STGI may #UD if SVM is disabled from NMI - * context between reading EFER and executing STGI. In that - * case, GIF must already be set, otherwise the NMI would have - * been blocked, so just eat the fault. 
- */ - asm_volatile_goto("1: stgi\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "memory" : fault); -fault: - wrmsrl(MSR_EFER, efer & ~EFER_SVME); - } -} - -#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9e449167e71b..47f9c7156609 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -42,8 +42,6 @@ #include #include -#include - #include #include "trace.h" @@ -582,9 +580,32 @@ out: preempt_enable(); } +static inline void kvm_cpu_svm_disable(void) +{ + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + if (efer & EFER_SVME) { + /* + * Force GIF=1 prior to disabling SVM to ensure INIT and NMI + * aren't blocked, e.g. if a fatal error occurred between CLGI + * and STGI. Note, STGI may #UD if SVM is disabled from NMI + * context between reading EFER and executing STGI. In that + * case, GIF must already be set, otherwise the NMI would have + * been blocked, so just eat the fault. + */ + asm_volatile_goto("1: stgi\n\t" + _ASM_EXTABLE(1b, %l[fault]) + ::: "memory" : fault); +fault: + wrmsrl(MSR_EFER, efer & ~EFER_SVME); + } +} + static void svm_emergency_disable(void) { - cpu_svm_disable(); + kvm_cpu_svm_disable(); } static void svm_hardware_disable(void) @@ -593,7 +614,7 @@ static void svm_hardware_disable(void) if (tsc_scaling) __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); - cpu_svm_disable(); + kvm_cpu_svm_disable(); amd_pmu_disable_virt(); } From 6ae44e012f4c35fb67bc61bd0bf1b3c5f504d931 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:57 -0700 Subject: [PATCH 23/50] KVM: x86: Force kvm_rebooting=true during emergency reboot/crash Set kvm_rebooting when virtualization is disabled in an emergency so that KVM eats faults on virtualization instructions even if kvm_reboot() isn't reached. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 47f9c7156609..8d1b3c801629 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -605,6 +605,8 @@ fault: static void svm_emergency_disable(void) { + kvm_rebooting = true; + kvm_cpu_svm_disable(); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ff0eced7d2ab..415665c40a39 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -752,6 +752,8 @@ static void vmx_emergency_disable(void) int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; + kvm_rebooting = true; + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); From 2e6b9bd49b702c4a52278e35202354b709021116 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:58 -0700 Subject: [PATCH 24/50] KVM: SVM: Use "standard" stgi() helper when disabling SVM Now that kvm_rebooting is guaranteed to be true prior to disabling SVM in an emergency, use the existing stgi() helper instead of open coding STGI. In effect, eat faults on STGI if and only if kvm_rebooting==true. 
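For reference, the "standard" helpers are built on an exception-fixup macro that punts faults to kvm_spurious_fault(), which tolerates the fault only when kvm_rebooting is set. A paraphrased sketch of that machinery (see arch/x86/kvm/svm/svm_ops.h and kvm_spurious_fault(); exact clobbers and annotations differ):

	/* Paraphrased sketch, not a verbatim copy of the kernel macro. */
	#define example_svm_asm(insn)					\
	do {								\
		asm_volatile_goto("1: " __stringify(insn) "\n\t"	\
				  _ASM_EXTABLE(1b, %l[fault])		\
				  ::: "memory" : fault);		\
		return;							\
	fault:								\
		kvm_spurious_fault();					\
	} while (0)

	static __always_inline void stgi(void)
	{
		example_svm_asm(stgi);
	}

	/* The fault handler eats the fault if and only if KVM is rebooting. */
	void kvm_spurious_fault(void)
	{
		/* Fault while not rebooting.  We want the trace. */
		BUG_ON(!kvm_rebooting);
	}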
Link: https://lore.kernel.org/r/20230721201859.2307736-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8d1b3c801629..4785d780cce3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -588,17 +588,10 @@ static inline void kvm_cpu_svm_disable(void) rdmsrl(MSR_EFER, efer); if (efer & EFER_SVME) { /* - * Force GIF=1 prior to disabling SVM to ensure INIT and NMI - * aren't blocked, e.g. if a fatal error occurred between CLGI - * and STGI. Note, STGI may #UD if SVM is disabled from NMI - * context between reading EFER and executing STGI. In that - * case, GIF must already be set, otherwise the NMI would have - * been blocked, so just eat the fault. + * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and + * NMI aren't blocked. */ - asm_volatile_goto("1: stgi\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "memory" : fault); -fault: + stgi(); wrmsrl(MSR_EFER, efer & ~EFER_SVME); } } From a788fbb763b5001f67f2ecdf207a357f9b12838f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 21 Jul 2023 13:18:59 -0700 Subject: [PATCH 25/50] KVM: VMX: Skip VMCLEAR logic during emergency reboots if CR4.VMXE=0 Bail from vmx_emergency_disable() without processing the list of loaded VMCSes if CR4.VMXE=0, i.e. if the CPU can't be post-VMXON. It should be impossible for the list to have entries if VMX is already disabled, and even if that invariant doesn't hold, VMCLEAR will #UD anyways, i.e. processing the list is pointless even if it somehow isn't empty. Assuming no existing KVM bugs, this should be a glorified nop. The primary motivation for the change is to avoid having code that looks like it does VMCLEAR, but then skips VMXOFF, which is nonsensical. Suggested-by: Kai Huang Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230721201859.2307736-20-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 415665c40a39..83338872a39f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -754,12 +754,20 @@ static void vmx_emergency_disable(void) kvm_rebooting = true; + /* + * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be + * set in task context. If this races with VMX being disabled by an + * NMI, VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due + * to kvm_rebooting set. + */ + if (!(__read_cr4() & X86_CR4_VMXE)) + return; + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); - if (__read_cr4() & X86_CR4_VMXE) - kvm_cpu_vmxoff(); + kvm_cpu_vmxoff(); } static void __loaded_vmcs_clear(void *arg) From 99b668545356115e11a6cca519361c8cfd02569e Mon Sep 17 00:00:00 2001 From: Tao Su Date: Wed, 2 Aug 2023 10:29:54 +0800 Subject: [PATCH 26/50] KVM: x86: Advertise AMX-COMPLEX CPUID to userspace Latest Intel platform GraniteRapids-D introduces AMX-COMPLEX, which adds two instructions to perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. AMX-COMPLEX is enumerated via CPUID.(EAX=7,ECX=1):EDX[bit 8]. Advertise AMX_COMPLEX if it's supported in hardware. There are no VMX controls for the feature, i.e.
the instructions can't be intercepted, and KVM advertises base AMX in CPUID if AMX is supported in hardware, even if KVM doesn't advertise AMX as being supported in XCR0, e.g. because the process didn't opt in to allocating tile data. Signed-off-by: Tao Su Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20230802022954.193843-1-tao1.su@linux.intel.com [sean: tweak last paragraph of changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/reverse_cpuid.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ad6566636c22..5a88affb2e1a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -647,7 +647,8 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, - F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) + F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | + F(AMX_COMPLEX) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index 56cbdb24400a..b81650678375 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -43,6 +43,7 @@ enum kvm_only_cpuid_leafs { /* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */ #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) +#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) /* CPUID level 0x80000007 (EDX). */ From 7cafe9b8e22bb3d77f130c461aedf6868c4aaf58 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jul 2023 18:15:48 -0700 Subject: [PATCH 27/50] KVM: nSVM: Check instead of asserting on nested TSC scaling support Check for nested TSC scaling support on nested SVM VMRUN instead of asserting that TSC scaling is exposed to L1 if L1's MSR_AMD64_TSC_RATIO has diverged from KVM's default. Userspace can trigger the WARN at will by writing the MSR and then updating guest CPUID to hide the feature (modifying guest CPUID is allowed anytime before KVM_RUN). E.g. hacking KVM's state_test selftest to do vcpu_set_msr(vcpu, MSR_AMD64_TSC_RATIO, 0); vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_TSCRATEMSR); after restoring state in a new VM+vCPU yields an endless supply of: ------------[ cut here ]------------ WARNING: CPU: 164 PID: 62565 at arch/x86/kvm/svm/nested.c:699 nested_vmcb02_prepare_control+0x3d6/0x3f0 [kvm_amd] Call Trace: enter_svm_guest_mode+0x114/0x560 [kvm_amd] nested_svm_vmrun+0x260/0x330 [kvm_amd] vmrun_interception+0x29/0x30 [kvm_amd] svm_invoke_exit_handler+0x35/0x100 [kvm_amd] svm_handle_exit+0xe7/0x180 [kvm_amd] kvm_arch_vcpu_ioctl_run+0x1eab/0x2570 [kvm] kvm_vcpu_ioctl+0x4c9/0x5b0 [kvm] __se_sys_ioctl+0x7a/0xc0 __x64_sys_ioctl+0x21/0x30 do_syscall_64+0x41/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x45ca1b Note, the nested #VMEXIT path has the same flaw, but needs a different fix and will be handled separately.
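Fleshed out, the hack above amounts to the following sequence in KVM selftest style (a rough sketch; assumes a vCPU restored from state that was saved with TSC scaling exposed):

	/*
	 * Sketch of the repro: stuff a non-default TSC ratio, then hide the
	 * feature from guest CPUID before re-running the vCPU. Both steps
	 * are legal for userspace, so KVM must not WARN on the combination.
	 */
	vcpu_set_msr(vcpu, MSR_AMD64_TSC_RATIO, 0);
	vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_TSCRATEMSR);
	vcpu_run(vcpu);		/* L1's VMRUN now reaches the (former) WARN */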
Fixes: 5228eb96a487 ("KVM: x86: nSVM: implement nested TSC scaling") Cc: Maxim Levitsky Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230729011608.1065019-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 96936ddf1b3c..0b90f5cf9df3 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -695,10 +695,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { - WARN_ON(!svm->tsc_scaling_enabled); + if (svm->tsc_scaling_enabled && + svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu); - } vmcb02->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | From 0c94e2468491cbf0754f49a5136ab51294a96b69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jul 2023 18:15:49 -0700 Subject: [PATCH 28/50] KVM: nSVM: Load L1's TSC multiplier based on L1 state, not L2 state When emulating nested VM-Exit, load L1's TSC multiplier if L1's desired ratio doesn't match the current ratio, not if the ratio L1 is using for L2 diverges from the default. Functionally, the end result is the same as KVM will run L2 with L1's multiplier if L2's multiplier is the default, i.e. checking that L1's multiplier is loaded is equivalent to checking if L2 has a non-default multiplier. However, the assertion that TSC scaling is exposed to L1 is flawed, as userspace can trigger the WARN at will by writing the MSR and then updating guest CPUID to hide the feature (modifying guest CPUID is allowed anytime before KVM_RUN). E.g. hacking KVM's state_test selftest to do vcpu_set_msr(vcpu, MSR_AMD64_TSC_RATIO, 0); vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_TSCRATEMSR); after restoring state in a new VM+vCPU yields an endless supply of: ------------[ cut here ]------------ WARNING: CPU: 10 PID: 206939 at arch/x86/kvm/svm/nested.c:1105 nested_svm_vmexit+0x6af/0x720 [kvm_amd] Call Trace: nested_svm_exit_handled+0x102/0x1f0 [kvm_amd] svm_handle_exit+0xb9/0x180 [kvm_amd] kvm_arch_vcpu_ioctl_run+0x1eab/0x2570 [kvm] kvm_vcpu_ioctl+0x4c9/0x5b0 [kvm] ? trace_hardirqs_off+0x4d/0xa0 __se_sys_ioctl+0x7a/0xc0 __x64_sys_ioctl+0x21/0x30 do_syscall_64+0x41/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Unlike the nested VMRUN path, hoisting the svm->tsc_scaling_enabled check into the if-statement is wrong as KVM needs to ensure L1's multiplier is loaded in the above scenario. Alternatively, the WARN_ON() could simply be deleted, but that would make KVM's behavior even more subtle, e.g. it's not immediately obvious why it's safe to write MSR_AMD64_TSC_RATIO when checking only tsc_ratio_msr. 
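For background, the multiplier KVM loads while L2 runs is the fixed-point product of L1's and L2's multipliers; on SVM, MSR_AMD64_TSC_RATIO is an 8.32 fixed-point value, so the fractional width is 32 bits. A rough sketch of the math, modeled on kvm_calc_nested_tsc_multiplier():

	#include <linux/math64.h>

	/*
	 * Illustrative: combine L1's and L2's TSC multipliers.  In KVM,
	 * frac_bits comes from kvm_caps.tsc_scaling_ratio_frac_bits.
	 */
	static u64 example_combine_tsc_ratio(u64 l1_ratio, u64 l2_ratio,
					     unsigned int frac_bits)
	{
		/* (l1 * l2) >> frac_bits, with a 128-bit intermediate. */
		return mul_u64_u64_shr(l1_ratio, l2_ratio, frac_bits);
	}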
Fixes: 5228eb96a487 ("KVM: x86: nSVM: implement nested TSC scaling") Cc: Maxim Levitsky Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230729011608.1065019-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 0b90f5cf9df3..c66c823ae222 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1100,8 +1100,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { - WARN_ON(!svm->tsc_scaling_enabled); + if (kvm_caps.has_tsc_control && + vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) { vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); } From c0dc39bd2c584879748dfd1321f8541d1cbeb9b8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jul 2023 18:15:50 -0700 Subject: [PATCH 29/50] KVM: nSVM: Use the "outer" helper for writing multiplier to MSR_AMD64_TSC_RATIO When emulating nested SVM transitions, use the outer helper for writing the TSC multiplier for L2. Using the inner helper only for one-off cases, i.e. for paths where KVM is NOT emulating or modifying vCPU state, will allow for multiple cleanups: - Explicitly disabling preemption only in the outer helper - Getting the multiplier from the vCPU field in the outer helper - Skipping the WRMSR in the outer helper if guest state isn't loaded Opportunistically delete an extra newline. No functional change intended. Link: https://lore.kernel.org/r/20230729011608.1065019-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 4 ++-- arch/x86/kvm/svm/svm.c | 5 ++--- arch/x86/kvm/svm/svm.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c66c823ae222..5d5a1d7832fb 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1103,7 +1103,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (kvm_caps.has_tsc_control && vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) { vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; - __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); } svm->nested.ctl.nested_cr3 = 0; @@ -1536,7 +1536,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr); - __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); } /* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. 
*/ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4785d780cce3..7217f7532df9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -567,7 +567,7 @@ static int svm_check_processor_compat(void) return 0; } -void __svm_write_tsc_multiplier(u64 multiplier) +static void __svm_write_tsc_multiplier(u64 multiplier) { preempt_disable(); @@ -1150,12 +1150,11 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) { __svm_write_tsc_multiplier(multiplier); } - /* Evaluate instruction intercepts that depend on guest CPUID features. */ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 18af7e712a5a..7132c0a04817 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -658,7 +658,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); -void __svm_write_tsc_multiplier(u64 multiplier); +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control); void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, From 229725acfaea0dd81f245aec40986dc766de40fa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jul 2023 18:15:51 -0700 Subject: [PATCH 30/50] KVM: SVM: Clean up preemption toggling related to MSR_AMD64_TSC_RATIO Explicitly disable preemption when writing MSR_AMD64_TSC_RATIO only in the "outer" helper, as all direct callers of the "inner" helper now run with preemption already disabled. And that isn't a coincidence, as the outer helper requires a vCPU and is intended to be used when modifying guest state and/or emulating guest instructions, which are typically done with preemption enabled. Direct use of the inner helper should be extremely limited, as the only time KVM should modify MSR_AMD64_TSC_RATIO without a vCPU is when sanitizing the MSR for a specific pCPU (currently done when {en,dis}abling SVM). The other direct caller is svm_prepare_switch_to_guest(), which does have a vCPU, but is a one-off special case: KVM is about to enter the guest on a specific pCPU and thus must have preemption disabled.
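After the cleanup, the shape of the two helpers is as follows (an illustrative condensation of the resulting code, not a literal excerpt):

	/* Inner: requires a stable pCPU for the per-CPU cache check. */
	static void __example_write_tsc_ratio(u64 mult)
	{
		if (mult == __this_cpu_read(current_tsc_ratio))
			return;		/* already loaded on this pCPU */

		wrmsrl(MSR_AMD64_TSC_RATIO, mult);
		__this_cpu_write(current_tsc_ratio, mult);
	}

	/* Outer: callers may be preemptible, so pin the pCPU here. */
	static void example_write_tsc_ratio(struct kvm_vcpu *vcpu)
	{
		preempt_disable();
		__example_write_tsc_ratio(vcpu->arch.tsc_scaling_ratio);
		preempt_enable();
	}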
Link: https://lore.kernel.org/r/20230729011608.1065019-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7217f7532df9..b66af2966ec7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -569,15 +569,11 @@ static int svm_check_processor_compat(void) static void __svm_write_tsc_multiplier(u64 multiplier) { - preempt_disable(); - if (multiplier == __this_cpu_read(current_tsc_ratio)) - goto out; + return; wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); -out: - preempt_enable(); } static inline void kvm_cpu_svm_disable(void) @@ -1152,7 +1148,9 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) { + preempt_disable(); __svm_write_tsc_multiplier(multiplier); + preempt_enable(); } /* Evaluate instruction intercepts that depend on guest CPUID features. */ From 2d63699099ac1fad13a0dbf8538faf4127c062e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jul 2023 18:15:52 -0700 Subject: [PATCH 31/50] KVM: x86: Always write vCPU's current TSC offset/ratio in vendor hooks Drop the @offset and @multiplier params from the kvm_x86_ops hooks for propagating TSC offsets/multipliers into hardware, and instead have the vendor implementations pull the information directly from the vCPU structure. The respective vCPU fields _must_ be written at the same time in order to maintain consistent state, i.e. it's not random luck that the value passed in by all callers is grabbed from the vCPU. Explicitly grabbing the value from the vCPU field in SVM's implementation in particular will allow for additional cleanup without introducing even more subtle dependencies. Specifically, SVM can skip the WRMSR if guest state isn't loaded, i.e. svm_prepare_switch_to_guest() will load the correct value for the vCPU prior to entering the guest. This also reconciles KVM's handling of related values that are stored in the vCPU, as svm_write_tsc_offset() already assumes/requires the caller to have updated l1_tsc_offset. Link: https://lore.kernel.org/r/20230729011608.1065019-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/svm/nested.c | 4 ++-- arch/x86/kvm/svm/svm.c | 8 ++++---- arch/x86/kvm/svm/svm.h | 2 +- arch/x86/kvm/vmx/vmx.c | 8 ++++---- arch/x86/kvm/x86.c | 5 ++--- 6 files changed, 15 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 28bd38303d70..dad9331c5270 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1654,8 +1654,8 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu); /* * Retrieve somewhat arbitrary exit information. 
Intended to diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5d5a1d7832fb..3342cc4a5189 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1103,7 +1103,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (kvm_caps.has_tsc_control && vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) { vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; - svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + svm_write_tsc_multiplier(vcpu); } svm->nested.ctl.nested_cr3 = 0; @@ -1536,7 +1536,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr); - svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + svm_write_tsc_multiplier(vcpu); } /* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b66af2966ec7..2542066652a3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1137,19 +1137,19 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return svm->tsc_ratio_msr; } -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; - svm->vmcb->control.tsc_offset = offset; + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) { preempt_disable(); - __svm_write_tsc_multiplier(multiplier); + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); preempt_enable(); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7132c0a04817..5829a1801862 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -658,7 +658,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); -void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu); void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, struct vmcb_control_area *control); void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 83338872a39f..3235998ca261 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1898,14 +1898,14 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_caps.default_tsc_scaling_ratio; } -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) { - vmcs_write64(TSC_OFFSET, offset); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); } -static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) { - vmcs_write64(TSC_MULTIPLIER, multiplier); + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1b13d2d1d71..80ec33fc823b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2613,7 +2613,7 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) else vcpu->arch.tsc_offset = l1_offset; - 
static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset); + static_call(kvm_x86_write_tsc_offset)(vcpu); } static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) @@ -2629,8 +2629,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli vcpu->arch.tsc_scaling_ratio = l1_multiplier; if (kvm_caps.has_tsc_control) - static_call(kvm_x86_write_tsc_multiplier)( - vcpu, vcpu->arch.tsc_scaling_ratio); + static_call(kvm_x86_write_tsc_multiplier)(vcpu); } static inline bool kvm_check_tsc_unstable(void) From 223f93d4d88aedc02bdcc92b3734e76f707812f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 28 Jul 2023 18:15:53 -0700 Subject: [PATCH 32/50] KVM: nSVM: Skip writes to MSR_AMD64_TSC_RATIO if guest state isn't loaded Skip writes to MSR_AMD64_TSC_RATIO that are done in the context of a vCPU if guest state isn't loaded, i.e. if KVM will update MSR_AMD64_TSC_RATIO during svm_prepare_switch_to_guest() before entering the guest. Checking guest_state_loaded may or may not be a net positive for performance as the current_tsc_ratio cache will optimize away duplicate WRMSRs in the vast majority of scenarios. However, the cost of the check is negligible, and the real motivation is to document that KVM needs to load the vCPU's value only when running the vCPU. Link: https://lore.kernel.org/r/20230729011608.1065019-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2542066652a3..6d8932e05753 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1149,7 +1149,8 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu) void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) { preempt_disable(); - __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + if (to_svm(vcpu)->guest_state_loaded) + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); preempt_enable(); } From 765da7fe0e76ed41eea9514433f4ca8cdb5312b1 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 7 Aug 2023 17:42:43 +0800 Subject: [PATCH 33/50] KVM: x86: Remove break statements that will never be executed Fix compiler warnings when compiling KVM with [-Wunreachable-code-break]. No functional change intended. 
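A minimal illustration of the warning class being silenced (hypothetical function, not from the patch):

	/* clang's -Wunreachable-code-break flags a break after a return. */
	static int example_lookup(int type)
	{
		switch (type) {
		case 0:
			return 1;
			break;	/* unreachable, hence removed below */
		default:
			return -ENXIO;
		}
	}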
Cc: Vitaly Kuznetsov Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230807094243.32516-1-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 2 -- arch/x86/kvm/hyperv.c | 1 - arch/x86/kvm/x86.c | 1 - 3 files changed, 4 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 936a397a08cd..2673cd5c46cb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1799,13 +1799,11 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->addr.mem, &op->val, op->bytes); - break; case OP_MEM_STR: return segmented_write(ctxt, op->addr.mem, op->data, op->bytes * op->count); - break; case OP_XMM: kvm_write_sse_reg(op->addr.xmm, &op->vec_val); break; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b28fd020066f..7c2dac6824e2 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1293,7 +1293,6 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_VP_ASSIST_PAGE: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_APIC_ACCESS_AVAILABLE; - break; case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: return hv_vcpu->cpuid_cache.features_eax & diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 80ec33fc823b..5ff77a70b382 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4646,7 +4646,6 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) return 0; default: return -ENXIO; - break; } } From 7b0151caf73a656b75b550e361648430233455a0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 8 Aug 2023 16:20:57 -0700 Subject: [PATCH 34/50] KVM: x86: Remove WARN sanity check on hypervisor timer vs. UNINITIALIZED vCPU Drop the WARN in KVM_RUN that asserts that KVM isn't using the hypervisor timer, a.k.a. the VMX preemption timer, for a vCPU that is in the UNINITIALIZED activity state. The intent of the WARN is to sanity check that KVM won't drop a timer interrupt due to an unexpected transition to UNINITIALIZED, but unfortunately userspace can use various ioctl()s to force the unexpected state. Drop the sanity check instead of switching from the hypervisor timer to a software-based timer, as the only reason to switch to a software timer when a vCPU is blocking is to ensure the timer interrupt wakes the vCPU, but said interrupt isn't a valid wake event for vCPUs in UNINITIALIZED state *and* the interrupt will be dropped in the end. Reported-by: Yikebaer Aizezi Closes: https://lore.kernel.org/all/CALcu4rbFrU4go8sBHk3FreP+qjgtZCGcYNpSiEXOLm==qFv7iQ@mail.gmail.com Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20230808232057.2498287-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5ff77a70b382..d293adc05145 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11084,12 +11084,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = -EINTR; goto out; } - /* - * It should be impossible for the hypervisor timer to be in - * use before KVM has ever run the vCPU. - */ - WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); + /* + * Don't bother switching APIC timer emulation from the + * hypervisor timer to the software timer, the only way for the + * APIC timer to be active is if userspace stuffed vCPU state, + * i.e. put the vCPU into a nonsensical state.
Only an INIT + * will transition the vCPU out of UNINITIALIZED (without more + * state stuffing from userspace), which will reset the local + * APIC and thus cancel the timer or drop the IRQ (if the timer + * already expired). + */ kvm_vcpu_srcu_read_unlock(vcpu); kvm_vcpu_block(vcpu); kvm_vcpu_srcu_read_lock(vcpu); From 392a53246257a9f89ef421b60af62056ce374285 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Thu, 17 Aug 2023 08:26:31 +0800 Subject: [PATCH 35/50] x86: kvm: x86: Remove unnecessary initial values of variables The bitmap and khz variables are assigned before they are ever read, so their initial values are unnecessary. Signed-off-by: Li zeming Link: https://lore.kernel.org/r/20230817002631.2885-1-zeming@nfschina.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d293adc05145..c91de9897e32 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6512,7 +6512,7 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, struct kvm_msr_filter_range *user_range) { - unsigned long *bitmap = NULL; + unsigned long *bitmap; size_t bitmap_size; if (!user_range->nmsrs) @@ -9146,7 +9146,7 @@ static int kvmclock_cpu_down_prep(unsigned int cpu) static void tsc_khz_changed(void *data) { struct cpufreq_freqs *freq = data; - unsigned long khz = 0; + unsigned long khz; WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); From 42764413d19585caff1321823b376c0bd0a597af Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:39 -0700 Subject: [PATCH 36/50] KVM: x86: Add a framework for enabling KVM-governed x86 features Introduce yet another X86_FEATURE flag framework to manage and cache KVM governed features (for lack of a better name). "Governed" in this case means that KVM has some level of involvement and/or vested interest in whether or not an X86_FEATURE can be used by the guest. The intent of the framework is twofold: to simplify caching of guest CPUID flags that KVM needs to frequently query, and to add clarity to such caching, e.g. it isn't immediately obvious that SVM's bundle of flags for "optional nested SVM features" track whether or not a flag is exposed to L1. Begrudgingly define KVM_MAX_NR_GOVERNED_FEATURES for the size of the bitmap to avoid exposing governed_features.h in arch/x86/include/asm/, but add a FIXME to call out that it can and should be cleaned up once "struct kvm_vcpu_arch" is no longer exposed to the kernel at large. Cc: Zeng Guang Reviewed-by: Binbin Wu Reviewed-by: Kai Huang Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20230815203653.519297-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 19 +++++++++++++ arch/x86/kvm/cpuid.c | 4 +++ arch/x86/kvm/cpuid.h | 46 ++++++++++++++++++++++++++++++++ arch/x86/kvm/governed_features.h | 9 +++++++ 4 files changed, 78 insertions(+) create mode 100644 arch/x86/kvm/governed_features.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dad9331c5270..007fa8bfd634 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -831,6 +831,25 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; + /* + * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly + * when "struct kvm_vcpu_arch" is no longer defined in an + * arch/x86/include/asm header.
The max is mostly arbitrary, i.e. + * can be increased as necessary. + */ +#define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG + + /* + * Track whether or not the guest is allowed to use features that are + * governed by KVM, where "governed" means KVM needs to manage state + * and/or explicitly enable the feature in hardware. Typically, but + * not always, governed features can be used by the guest if and only + * if both KVM and userspace want to expose the feature to the guest. + */ + struct { + DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES); + } governed_features; + u64 reserved_gpa_bits; int maxphyaddr; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 5a88affb2e1a..4ba43ae008cb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -313,6 +313,10 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; + BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES); + bitmap_zero(vcpu->arch.governed_features.enabled, + KVM_MAX_NR_GOVERNED_FEATURES); + best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index b1658c0de847..284fa4704553 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -232,4 +232,50 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); } +enum kvm_governed_features { +#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x, +#include "governed_features.h" + KVM_NR_GOVERNED_FEATURES +}; + +static __always_inline int kvm_governed_feature_index(unsigned int x86_feature) +{ + switch (x86_feature) { +#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x; +#include "governed_features.h" + default: + return -1; + } +} + +static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature) +{ + return kvm_governed_feature_index(x86_feature) >= 0; +} + +static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + + __set_bit(kvm_governed_feature_index(x86_feature), + vcpu->arch.governed_features.enabled); +} + +static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature)) + kvm_governed_feature_set(vcpu, x86_feature); +} + +static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + + return test_bit(kvm_governed_feature_index(x86_feature), + vcpu->arch.governed_features.enabled); +} + #endif diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h new file mode 100644 index 000000000000..40ce8e6608cd --- /dev/null +++ b/arch/x86/kvm/governed_features.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE) +BUILD_BUG() +#endif + +#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x) + +#undef KVM_GOVERNED_X86_FEATURE +#undef KVM_GOVERNED_FEATURE From ccf31d6e6cc53e50cc42845061174082fd229c79 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:40 -0700 Subject: [PATCH 37/50] KVM: x86/mmu: Use KVM-governed feature framework to track "GBPAGES enabled" Use the governed feature framework to 
track whether or not the guest can use 1GiB pages, and drop the one-off helper that wraps the surprisingly non-trivial logic surrounding 1GiB page usage in the guest. No functional change intended. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20230815203653.519297-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 17 +++++++++++++++++ arch/x86/kvm/governed_features.h | 2 ++ arch/x86/kvm/mmu/mmu.c | 20 +++----------------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 4ba43ae008cb..67e9f79fe059 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -312,11 +312,28 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; + bool allow_gbpages; BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES); bitmap_zero(vcpu->arch.governed_features.enabled, KVM_MAX_NR_GOVERNED_FEATURES); + /* + * If TDP is enabled, let the guest use GBPAGES if they're supported in + * hardware. The hardware page walker doesn't let KVM disable GBPAGES, + * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA + * walk for performance and complexity reasons. Not to mention KVM + * _can't_ solve the problem because GVA->GPA walks aren't visible to + * KVM once a TDP translation is installed. Mimic hardware behavior so + * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. + * If TDP is disabled, honor *only* guest CPUID as KVM has full control + * and can install smaller shadow pages if the host lacks 1GiB support. + */ + allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); + if (allow_gbpages) + kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES); + best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index 40ce8e6608cd..b29c15d5e038 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -5,5 +5,7 @@ BUILD_BUG() #define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x) +KVM_GOVERNED_X86_FEATURE(GBPAGES) + #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 31d9cbe817a2..a0c2acb323eb 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4808,28 +4808,13 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, } } -static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) -{ - /* - * If TDP is enabled, let the guest use GBPAGES if they're supported in - * hardware. The hardware page walker doesn't let KVM disable GBPAGES, - * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA - * walk for performance and complexity reasons. Not to mention KVM - * _can't_ solve the problem because GVA->GPA walks aren't visible to - * KVM once a TDP translation is installed. Mimic hardware behavior so - * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. - */ - return tdp_enabled ? 
boot_cpu_has(X86_FEATURE_GBPAGES) : - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); -} - static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), - guest_can_use_gbpages(vcpu), + guest_can_use(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); } @@ -4906,7 +4891,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, - guest_can_use_gbpages(vcpu), is_pse, is_amd); + guest_can_use(vcpu, X86_FEATURE_GBPAGES), + is_pse, is_amd); if (!shadow_me_mask) return; From 1143c0b85c07fc45daf1c33b26ba9320becd6748 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:41 -0700 Subject: [PATCH 38/50] KVM: VMX: Recompute "XSAVES enabled" only after CPUID update Recompute whether or not XSAVES is enabled for the guest only if the guest's CPUID model changes instead of redoing the computation every time KVM generates vmcs01's secondary execution controls. The boot_cpu_has() and cpu_has_vmx_xsaves() checks should never change after KVM is loaded, and if they do the kernel/KVM is hosed. Opportunistically add a comment explaining _why_ XSAVES is effectively exposed to the guest if and only if XSAVE is also exposed to the guest. Practically speaking, no functional change intended (KVM will do fewer computations, but should still see the same xsaves_enabled value whenever KVM looks at it). Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20230815203653.519297-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3235998ca261..159a58c12115 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4612,19 +4612,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - if (cpu_has_vmx_xsaves()) { - /* Exposing XSAVES only when XSAVE is exposed */ - bool xsaves_enabled = - boot_cpu_has(X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); - - vcpu->arch.xsaves_enabled = xsaves_enabled; - + if (cpu_has_vmx_xsaves()) vmx_adjust_secondary_exec_control(vmx, &exec_control, SECONDARY_EXEC_XSAVES, - xsaves_enabled, false); - } + vcpu->arch.xsaves_enabled, false); /* * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either @@ -7747,8 +7738,15 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ - vcpu->arch.xsaves_enabled = false; + /* + * XSAVES is effectively enabled if and only if XSAVE is also exposed + * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be + * set if and only if XSAVE is supported. 
+ */ + vcpu->arch.xsaves_enabled = cpu_has_vmx_xsaves() && + boot_cpu_has(X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); vmx_setup_uret_msrs(vmx); From 0497d2ac9b26d015cfafc4655ad9308c30a61810 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:42 -0700 Subject: [PATCH 39/50] KVM: VMX: Check KVM CPU caps, not just VMX MSR support, for XSAVE enabling Check KVM CPU capabilities instead of raw VMX support for XSAVES when determining whether or not XSAVES can/should be exposed to the guest. Practically speaking, it's nonsensical/impossible for a CPU to support "enable XSAVES" without XSAVES being supported natively. The real motivation for checking kvm_cpu_cap_has() is to allow using the governed feature's standard check-and-set logic. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20230815203653.519297-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 159a58c12115..968401b2efcd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7743,7 +7743,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be * set if and only if XSAVE is supported. */ - vcpu->arch.xsaves_enabled = cpu_has_vmx_xsaves() && + vcpu->arch.xsaves_enabled = kvm_cpu_cap_has(X86_FEATURE_XSAVES) && boot_cpu_has(X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); From 662f6815786efc9a60ee374142b3fd70db4637be Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:43 -0700 Subject: [PATCH 40/50] KVM: VMX: Rename XSAVES control to follow KVM's preferred "ENABLE_XYZ" Rename the XSAVES secondary execution control to follow KVM's preferred style so that XSAVES-related logic can use the common macros that depend on that style. No functional change intended.
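The common macros in question synthesize control names by token-pasting "ENABLE_" onto the feature name; paraphrasing the helper from vmx.c (see the next patch for the XSAVES usage):

	/*
	 * Paraphrased: the opt-in helper pastes "ENABLE_" onto the control
	 * name, so the control must be named SECONDARY_EXEC_ENABLE_XSAVES
	 * for XSAVES to fit the convention.
	 */
	#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
		vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname,  \
					    ENABLE_##uname, false)

	/* With the rename in place, XSAVES can use the common helper: */
	vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);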
Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20230815203653.519297-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/kvm/vmx/capabilities.h | 2 +- arch/x86/kvm/vmx/hyperv.c | 2 +- arch/x86/kvm/vmx/nested.c | 6 +++--- arch/x86/kvm/vmx/nested.h | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/vmx/vmx.h | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 0d02c4aafa6f..0e73616b82f3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -71,7 +71,7 @@ #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) -#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES) +#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES) #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index d0abee35d7ba..41a4533f9989 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -252,7 +252,7 @@ static inline bool cpu_has_vmx_pml(void) static inline bool cpu_has_vmx_xsaves(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_ENABLE_XSAVES; } static inline bool cpu_has_vmx_waitpkg(void) diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index 79450e1ed7cf..313b8bb5b8a7 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -78,7 +78,7 @@ SECONDARY_EXEC_DESC | \ SECONDARY_EXEC_ENABLE_RDTSCP | \ SECONDARY_EXEC_ENABLE_INVPCID | \ - SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ SECONDARY_EXEC_RDSEED_EXITING | \ SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_TSC_SCALING | \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 516391cc0d64..22e08d30baef 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2307,7 +2307,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | - SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -6331,7 +6331,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, * If it were, XSS would have to be checked against * the XSS exit bitmap in vmcs12.
*/ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, @@ -6874,7 +6874,7 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps, SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_TSC_SCALING | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 96952263b029..b4b9d51438c6 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -168,7 +168,7 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) { - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); } static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 968401b2efcd..d0bba4a34824 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4614,7 +4614,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (cpu_has_vmx_xsaves()) vmx_adjust_secondary_exec_control(vmx, &exec_control, - SECONDARY_EXEC_XSAVES, + SECONDARY_EXEC_ENABLE_XSAVES, vcpu->arch.xsaves_enabled, false); /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 32384ba38499..cde902b44d97 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -562,7 +562,7 @@ static inline u8 vmx_get_rvi(void) SECONDARY_EXEC_APIC_REGISTER_VIRT | \ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_SHADOW_VMCS | \ - SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_ENABLE_XSAVES | \ SECONDARY_EXEC_RDSEED_EXITING | \ SECONDARY_EXEC_RDRAND_EXITING | \ SECONDARY_EXEC_ENABLE_PML | \ From fe60e8f65f79138ffdcc8a9155036cc35f6079ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:44 -0700 Subject: [PATCH 41/50] KVM: x86: Use KVM-governed feature framework to track "XSAVES enabled" Use the governed feature framework to track if XSAVES is "enabled", i.e. if XSAVES can be used by the guest. Add a comment in the SVM code to explain the very unintuitive logic of deliberately NOT checking if XSAVES is enumerated in the guest CPUID model. No functional change intended. 
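As a reminder of the plumbing added earlier in the series, each KVM_GOVERNED_X86_FEATURE(x) entry expands via the X-macro includes in cpuid.h; an illustrative (not literal) expansion with the entries present as of this patch:

	/* Expansion of the enum-building include of governed_features.h: */
	enum kvm_governed_features {
		KVM_GOVERNED_X86_FEATURE_GBPAGES,
		KVM_GOVERNED_X86_FEATURE_XSAVES,
		KVM_NR_GOVERNED_FEATURES
	};

	/* ...and of the index-lookup include used by guest_can_use(): */
	static __always_inline int kvm_governed_feature_index(unsigned int x86_feature)
	{
		switch (x86_feature) {
		case X86_FEATURE_GBPAGES: return KVM_GOVERNED_X86_FEATURE_GBPAGES;
		case X86_FEATURE_XSAVES:  return KVM_GOVERNED_X86_FEATURE_XSAVES;
		default:
			return -1;
		}
	}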
Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20230815203653.519297-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/governed_features.h | 1 + arch/x86/kvm/svm/svm.c | 17 ++++++++++++--- arch/x86/kvm/vmx/vmx.c | 36 ++++++++++++++++---------------- arch/x86/kvm/x86.c | 4 ++-- 5 files changed, 35 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 007fa8bfd634..771adf2438bc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -746,7 +746,6 @@ struct kvm_vcpu_arch { u64 smi_count; bool at_instruction_boundary; bool tpr_access_reporting; - bool xsaves_enabled; bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index b29c15d5e038..b896a64e4ac3 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -6,6 +6,7 @@ BUILD_BUG() #define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x) KVM_GOVERNED_X86_FEATURE(GBPAGES) +KVM_GOVERNED_X86_FEATURE(XSAVES) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6d8932e05753..1c6b1175016d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4244,9 +4244,20 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct kvm_cpuid_entry2 *best; - vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVES); + /* + * SVM doesn't provide a way to disable just XSAVES in the guest, KVM + * can only disable all variants of XSAVE by disallowing CR4.OSXSAVE + * from being set. As a result, if the host has XSAVE and XSAVES, and + * the guest has XSAVE enabled, the guest can execute XSAVES without + * faulting. Treat XSAVES as enabled in this case regardless of + * whether it's advertised to the guest so that KVM context switches + * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give + * the guest read/write access to the host's XSS. + */ + if (boot_cpu_has(X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVES) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) + kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); /* Update nrips enabled cache */ svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d0bba4a34824..55cd17231867 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4543,16 +4543,19 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, * based on a single guest CPUID bit, with a dedicated feature bit. This also * verifies that the control is actually supported by KVM and hardware.
*/ -#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ -({ \ - bool __enabled; \ - \ - if (cpu_has_vmx_##name()) { \ - __enabled = guest_cpuid_has(&(vmx)->vcpu, \ - X86_FEATURE_##feat_name); \ - vmx_adjust_secondary_exec_control(vmx, exec_control, \ - SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ - } \ +#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ +({ \ + struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ + bool __enabled; \ + \ + if (cpu_has_vmx_##name()) { \ + if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \ + __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \ + else \ + __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \ + vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ + __enabled, exiting); \ + } \ }) /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */ @@ -4612,10 +4615,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - if (cpu_has_vmx_xsaves()) - vmx_adjust_secondary_exec_control(vmx, &exec_control, - SECONDARY_EXEC_ENABLE_XSAVES, - vcpu->arch.xsaves_enabled, false); + vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); /* * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either @@ -4634,6 +4634,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_RDTSCP, rdpid_or_rdtscp_enabled, false); } + vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); @@ -7743,10 +7744,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be * set if and only if XSAVE is supported. */ - vcpu->arch.xsaves_enabled = kvm_cpu_cap_has(X86_FEATURE_XSAVES) && - boot_cpu_has(X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); + if (boot_cpu_has(X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); vmx_setup_uret_msrs(vmx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c91de9897e32..7849ea0b0bf7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1015,7 +1015,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - if (vcpu->arch.xsaves_enabled && + if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } @@ -1046,7 +1046,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - if (vcpu->arch.xsaves_enabled && + if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, host_xss); } From 1c18efdaa3148605071ae8b3551ee86c95c8807b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:45 -0700 Subject: [PATCH 42/50] KVM: nVMX: Use KVM-governed feature framework to track "nested VMX enabled" Track "VMX exposed to L1" via a governed feature flag instead of using a dedicated helper to provide the same functionality. 
The main goal is to drive convergence between VMX and SVM with respect to querying features that are controllable via module param (SVM likes to cache nested features); avoiding the guest CPUID lookups at runtime is just a bonus and unlikely to provide any meaningful performance benefits. Note, X86_FEATURE_VMX is set in kvm_cpu_caps if and only if "nested" is true, and the CPU obviously supports VMX if KVM+VMX is running. I.e. the check on "nested" is now implicitly done by the kvm_cpu_cap_has() check in kvm_governed_feature_check_and_set(). No functional change intended. Reviewed-by: Yuan Yao Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230815203653.519297-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/governed_features.h | 1 + arch/x86/kvm/vmx/nested.c | 7 ++++--- arch/x86/kvm/vmx/vmx.c | 21 ++++++--------------- arch/x86/kvm/vmx/vmx.h | 1 - 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index b896a64e4ac3..22446614bf49 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -7,6 +7,7 @@ BUILD_BUG() KVM_GOVERNED_X86_FEATURE(GBPAGES) KVM_GOVERNED_X86_FEATURE(XSAVES) +KVM_GOVERNED_X86_FEATURE(VMX) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 22e08d30baef..c5ec0ef51ff7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6426,7 +6426,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_allowed(vcpu) && + if (guest_can_use(vcpu, X86_FEATURE_VMX) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; @@ -6567,7 +6567,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { - if (!nested_vmx_allowed(vcpu)) + if (!guest_can_use(vcpu, X86_FEATURE_VMX)) return -EINVAL; if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) @@ -6601,7 +6601,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && - (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) + (!guest_can_use(vcpu, X86_FEATURE_VMX) || + !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; vmx_leave_nested(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 55cd17231867..e9386afd1521 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1908,17 +1908,6 @@ static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); } -/* - * nested_vmx_allowed() checks whether a guest should be allowed to use VMX - * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for - * all guests if the "nested" module option is off, and can also be disabled - * for a single guest by disabling its VMX cpuid bit. - */ -bool nested_vmx_allowed(struct kvm_vcpu *vcpu) -{ - return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); -} - /* * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of * guest CPUID.
Note, KVM allows userspace to set "VMX in SMX" to maintain @@ -2046,7 +2035,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: - if (!nested_vmx_allowed(vcpu)) + if (!guest_can_use(vcpu, X86_FEATURE_VMX)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) @@ -2354,7 +2343,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ - if (!nested_vmx_allowed(vcpu)) + if (!guest_can_use(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: @@ -7748,13 +7737,15 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); + vmx_setup_uret_msrs(vmx); if (cpu_has_secondary_exec_ctrls()) vmcs_set_secondary_exec_control(vmx, vmx_secondary_exec_control(vmx)); - if (nested_vmx_allowed(vcpu)) + if (guest_can_use(vcpu, X86_FEATURE_VMX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; @@ -7763,7 +7754,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (nested_vmx_allowed(vcpu)) + if (guest_can_use(vcpu, X86_FEATURE_VMX)) nested_vmx_cr_fixed1_bits_update(vcpu); if (boot_cpu_has(X86_FEATURE_INTEL_PT) && diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index cde902b44d97..c2130d2c8e24 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -374,7 +374,6 @@ struct kvm_vmx { u64 *pid_table; }; -bool nested_vmx_allowed(struct kvm_vcpu *vcpu); void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy); int allocate_vpid(void); From 7a6a6a3bf5d8c32a91d8fabb0e2d31f3bd23a412 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:46 -0700 Subject: [PATCH 43/50] KVM: nSVM: Use KVM-governed feature framework to track "NRIPS enabled" Track "NRIPS exposed to L1" via a governed feature flag instead of using a dedicated bit/flag in vcpu_svm. No functional change intended. Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20230815203653.519297-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/governed_features.h | 1 + arch/x86/kvm/svm/nested.c | 6 +++--- arch/x86/kvm/svm/svm.c | 4 +--- arch/x86/kvm/svm/svm.h | 1 - 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index 22446614bf49..722b66af412c 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -8,6 +8,7 @@ BUILD_BUG() KVM_GOVERNED_X86_FEATURE(GBPAGES) KVM_GOVERNED_X86_FEATURE(XSAVES) KVM_GOVERNED_X86_FEATURE(VMX) +KVM_GOVERNED_X86_FEATURE(NRIPS) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3342cc4a5189..9092f3f8dccf 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -716,7 +716,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * what a nrips=0 CPU would do (L1 is responsible for advancing RIP * prior to injecting the event). 
*/ - if (svm->nrips_enabled) + if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) vmcb02->control.next_rip = svm->nested.ctl.next_rip; else if (boot_cpu_has(X86_FEATURE_NRIPS)) vmcb02->control.next_rip = vmcb12_rip; @@ -726,7 +726,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_injected = true; svm->soft_int_csbase = vmcb12_csbase; svm->soft_int_old_rip = vmcb12_rip; - if (svm->nrips_enabled) + if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) svm->soft_int_next_rip = svm->nested.ctl.next_rip; else svm->soft_int_next_rip = vmcb12_rip; @@ -1026,7 +1026,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); - if (svm->nrips_enabled) + if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1c6b1175016d..de69e98a7c6a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4259,9 +4259,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); - /* Update nrips enabled cache */ - svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && - guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5829a1801862..c06de55425d4 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -259,7 +259,6 @@ struct vcpu_svm { bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ - bool nrips_enabled : 1; bool tsc_scaling_enabled : 1; bool v_vmload_vmsave_enabled : 1; bool lbrv_enabled : 1; From 4365a45571c791a2fbeb81cf27738960c5456f57 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:47 -0700 Subject: [PATCH 44/50] KVM: nSVM: Use KVM-governed feature framework to track "TSC scaling enabled" Track "TSC scaling exposed to L1" via a governed feature flag instead of using a dedicated bit/flag in vcpu_svm. Note, this fixes a benign bug where KVM would mark TSC scaling as exposed to L1 even if overall nested SVM support is disabled, i.e. KVM would let L1 write MSR_AMD64_TSC_RATIO even when KVM didn't advertise TSCRATEMSR support to userspace.
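Concretely, the fix falls out of the conversion in svm_vcpu_after_set_cpuid(); a condensed before/after sketch (the hunks in the diff below are authoritative):

	/* Before: gated only on the "tsc_scaling" module param and guest CPUID,
	 * so nested=0 did not clear the flag.
	 */
	svm->tsc_scaling_enabled = tsc_scaling &&
				   guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);

	/* After: kvm_governed_feature_check_and_set() additionally requires
	 * kvm_cpu_cap_has(X86_FEATURE_TSCRATEMSR), and that capability is
	 * advertised only when nested SVM (and TSC scaling) is enabled.
	 */
	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR);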
Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20230815203653.519297-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/governed_features.h | 1 + arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.c | 10 ++++++---- arch/x86/kvm/svm/svm.h | 1 - 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index 722b66af412c..32c0469cf952 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -9,6 +9,7 @@ KVM_GOVERNED_X86_FEATURE(GBPAGES) KVM_GOVERNED_X86_FEATURE(XSAVES) KVM_GOVERNED_X86_FEATURE(VMX) KVM_GOVERNED_X86_FEATURE(NRIPS) +KVM_GOVERNED_X86_FEATURE(TSCRATEMSR) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9092f3f8dccf..da65948064dc 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -695,7 +695,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_scaling_enabled && + if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index de69e98a7c6a..0d5cca0bb5fb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2795,7 +2795,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: - if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) + if (!msr_info->host_initiated && + !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) return 1; msr_info->data = svm->tsc_ratio_msr; break; @@ -2937,7 +2938,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!svm->tsc_scaling_enabled) { + if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) { if (!msr->host_initiated) return 1; @@ -2959,7 +2960,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_ratio_msr = data; - if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) + if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + is_guest_mode(vcpu)) nested_svm_update_tsc_ratio_msr(vcpu); break; @@ -4260,8 +4262,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); - svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index c06de55425d4..4e7332f77702 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -259,7 +259,6 @@ struct vcpu_svm { bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ - bool tsc_scaling_enabled : 1; bool v_vmload_vmsave_enabled : 1; bool lbrv_enabled : 1; bool pause_filter_enabled : 1; From 4d2a1560ffc29525493829ee31dc069c00c52c69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:48 -0700 Subject: [PATCH 45/50] KVM: nSVM: Use KVM-governed feature framework to track "vVM{SAVE,LOAD} enabled" Track "virtual VMSAVE/VMLOAD exposed to L1" via a governed feature flag instead 
of using a dedicated bit/flag in vcpu_svm. Opportunistically add a comment explaining why KVM disallows virtual VMLOAD/VMSAVE when the vCPU model is Intel. No functional change intended. Link: https://lore.kernel.org/r/20230815203653.519297-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/governed_features.h | 1 + arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.c | 10 +++++++--- arch/x86/kvm/svm/svm.h | 1 - 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index 32c0469cf952..f01a95fd0071 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -10,6 +10,7 @@ KVM_GOVERNED_X86_FEATURE(XSAVES) KVM_GOVERNED_X86_FEATURE(VMX) KVM_GOVERNED_X86_FEATURE(NRIPS) KVM_GOVERNED_X86_FEATURE(TSCRATEMSR) +KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index da65948064dc..24d47ebeb0e0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -107,7 +107,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) { - if (!svm->v_vmload_vmsave_enabled) + if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) return true; if (!nested_npt_enabled(svm)) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0d5cca0bb5fb..78d53ea513f4 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1194,8 +1194,6 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); - - svm->v_vmload_vmsave_enabled = false; } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -4266,7 +4264,13 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); - svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); + /* + * Intercept VMLOAD if the vCPU model is Intel in order to emulate that + * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing + * SVM on Intel is bonkers and extremely unlikely to work). + */ + if (!guest_cpuid_is_intel(vcpu)) + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) && guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 4e7332f77702..b475241df6dc 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -259,7 +259,6 @@ struct vcpu_svm { bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ - bool v_vmload_vmsave_enabled : 1; bool lbrv_enabled : 1; bool pause_filter_enabled : 1; bool pause_threshold_enabled : 1; From e183d17ac362655e5ef061760245b4402b9c04f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:49 -0700 Subject: [PATCH 46/50] KVM: nSVM: Use KVM-governed feature framework to track "LBRv enabled" Track "LBR virtualization exposed to L1" via a governed feature flag instead of using a dedicated bit/flag in vcpu_svm.
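In code, the svm_vcpu_after_set_cpuid() change condenses to the sketch below (the diff that follows is authoritative):

	/* Before: gated on the "lbrv" module param plus guest CPUID. */
	svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);

	/* After: gated on kvm_cpu_caps plus guest CPUID. */
	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV);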
Note, checking KVM's capabilities instead of the "lbrv" param means that the code isn't strictly equivalent, as lbrv_enabled could have been set if nested=false whereas the governed feature cannot. But that's a glorified nop as the feature/flag is consumed only by paths that are gated by nSVM being enabled. Link: https://lore.kernel.org/r/20230815203653.519297-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/governed_features.h | 1 + arch/x86/kvm/svm/nested.c | 23 +++++++++++++---------- arch/x86/kvm/svm/svm.c | 5 ++--- arch/x86/kvm/svm/svm.h | 1 - 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index f01a95fd0071..3a4c0e40e1e0 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -11,6 +11,7 @@ KVM_GOVERNED_X86_FEATURE(VMX) KVM_GOVERNED_X86_FEATURE(NRIPS) KVM_GOVERNED_X86_FEATURE(TSCRATEMSR) KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD) +KVM_GOVERNED_X86_FEATURE(LBRV) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 24d47ebeb0e0..f50f74b1a04e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -552,6 +552,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 bool new_vmcb12 = false; struct vmcb *vmcb01 = svm->vmcb01.ptr; struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; + struct kvm_vcpu *vcpu = &svm->vcpu; nested_vmcb02_compute_g_pat(svm); @@ -577,18 +578,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DT); } - kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); + kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, svm->nested.save.efer); + svm_set_efer(vcpu, svm->nested.save.efer); - svm_set_cr0(&svm->vcpu, svm->nested.save.cr0); - svm_set_cr4(&svm->vcpu, svm->nested.save.cr4); + svm_set_cr0(vcpu, svm->nested.save.cr0); + svm_set_cr4(vcpu, svm->nested.save.cr4); svm->vcpu.arch.cr2 = vmcb12->save.cr2; - kvm_rax_write(&svm->vcpu, vmcb12->save.rax); - kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp); - kvm_rip_write(&svm->vcpu, vmcb12->save.rip); + kvm_rax_write(vcpu, vmcb12->save.rax); + kvm_rsp_write(vcpu, vmcb12->save.rsp); + kvm_rip_write(vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ vmcb02->save.rax = vmcb12->save.rax; @@ -602,7 +603,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DR); } - if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { /* * Reserved bits of DEBUGCTL are ignored. Be consistent with * svm_set_msr's definition of reserved bits.
@@ -734,7 +736,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; - if (svm->lbrv_enabled) + if (guest_can_use(vcpu, X86_FEATURE_LBRV)) vmcb02->control.virt_ext |= (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); @@ -1065,7 +1067,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (!nested_exit_on_intr(svm)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { svm_copy_lbrs(vmcb12, vmcb02); svm_update_lbrv(vcpu); } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 78d53ea513f4..3b8e412238d3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1024,7 +1024,7 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu) bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); - if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled)) + if (unlikely(is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV))) if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)) enable_lbrv = true; @@ -4261,8 +4261,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); - - svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); /* * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index b475241df6dc..0e21823e8a19 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -259,7 +259,6 @@ struct vcpu_svm { bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ - bool lbrv_enabled : 1; bool pause_filter_enabled : 1; bool pause_threshold_enabled : 1; bool vgif_enabled : 1; From 59d67fc1f0dbb5ee8841d62140efe0fdc4fbabf5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:50 -0700 Subject: [PATCH 47/50] KVM: nSVM: Use KVM-governed feature framework to track "Pause Filter enabled" Track "Pause Filtering is exposed to L1" via governed feature flags instead of using dedicated bits/flags in vcpu_svm. No functional change intended. 
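At this point the conversion recipe is mechanical; sketched here for PAUSEFILTER (PFTHRESHOLD follows the same shape, and the hunks below are the authoritative version):

	/* 1) Enumerate the feature in arch/x86/kvm/governed_features.h. */
	KVM_GOVERNED_X86_FEATURE(PAUSEFILTER)

	/* 2) Cache the flag whenever guest CPUID changes, in
	 *    svm_vcpu_after_set_cpuid().
	 */
	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER);

	/* 3) Query the cached flag at runtime instead of guest CPUID, e.g. in
	 *    nested_vmcb02_prepare_control().
	 */
	if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER))
		pause_count12 = svm->nested.ctl.pause_filter_count;
	else
		pause_count12 = 0;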
Reviewed-by: Yuan Yao Link: https://lore.kernel.org/r/20230815203653.519297-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/governed_features.h | 2 ++ arch/x86/kvm/svm/nested.c | 10 ++++++++-- arch/x86/kvm/svm/svm.c | 7 ++----- arch/x86/kvm/svm/svm.h | 2 -- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index 3a4c0e40e1e0..9afd34f30599 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -12,6 +12,8 @@ KVM_GOVERNED_X86_FEATURE(NRIPS) KVM_GOVERNED_X86_FEATURE(TSCRATEMSR) KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD) KVM_GOVERNED_X86_FEATURE(LBRV) +KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) +KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f50f74b1a04e..ac03b2bc5b2c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -743,8 +743,14 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0; - pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0; + if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER)) + pause_count12 = svm->nested.ctl.pause_filter_count; + else + pause_count12 = 0; + if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD)) + pause_thresh12 = svm->nested.ctl.pause_filter_thresh; + else + pause_thresh12 = 0; if (kvm_pause_in_guest(svm->vcpu.kvm)) { /* use guest values since host doesn't intercept PAUSE */ vmcb02->control.pause_filter_count = pause_count12; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3b8e412238d3..0dc70ad88d29 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4271,11 +4271,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (!guest_cpuid_is_intel(vcpu)) kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) && - guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER); - - svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) && - guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 0e21823e8a19..fb438439b61e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -259,8 +259,6 @@ struct vcpu_svm { bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ - bool pause_filter_enabled : 1; - bool pause_threshold_enabled : 1; bool vgif_enabled : 1; bool vnmi_enabled : 1; From b89456aee78d22b20c6c83c4d75af7985ae5be8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:51 -0700 Subject: [PATCH 48/50] KVM: nSVM: Use KVM-governed feature framework to track "vGIF enabled" Track "virtual GIF exposed to L1" via a governed feature flag instead of using a dedicated bit/flag in vcpu_svm. Note, checking KVM's capabilities instead of the "vgif" param means that the code isn't strictly equivalent, as vgif_enabled could have been set if nested=false whereas the governed feature cannot.
But that's a glorified nop as the feature/flag is consumed only by paths that are gated by nSVM being enabled. Link: https://lore.kernel.org/r/20230815203653.519297-14-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/governed_features.h | 1 + arch/x86/kvm/svm/nested.c | 3 ++- arch/x86/kvm/svm/svm.c | 3 +-- arch/x86/kvm/svm/svm.h | 5 +++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index 9afd34f30599..368696c2e96b 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -14,6 +14,7 @@ KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD) KVM_GOVERNED_X86_FEATURE(LBRV) KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) +KVM_GOVERNED_X86_FEATURE(VGIF) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ac03b2bc5b2c..dd496c9e5f91 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -660,7 +660,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) + if (guest_can_use(vcpu, X86_FEATURE_VGIF) && + (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); else int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0dc70ad88d29..d8b7e0c71313 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4273,8 +4273,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); - - svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index fb438439b61e..6eb5877cc6c3 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -22,6 +22,7 @@ #include #include +#include "cpuid.h" #include "kvm_cache_regs.h" #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) @@ -259,7 +260,6 @@ struct vcpu_svm { bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ - bool vgif_enabled : 1; bool vnmi_enabled : 1; u32 ldr_reg; u32 dfr_reg; struct page *avic_backing_page; @@ -485,7 +485,8 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) static inline bool nested_vgif_enabled(struct vcpu_svm *svm) { - return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); + return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) && + (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); } static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm) From ee785c870d6f80380be6549772137a41c527d0ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:52 -0700 Subject: [PATCH 49/50] KVM: nSVM: Use KVM-governed feature framework to track "vNMI enabled" Track "virtual NMI exposed to L1" via a governed feature flag instead of using a dedicated bit/flag in vcpu_svm. Note, checking KVM's capabilities instead of the "vnmi" param means that the code isn't strictly equivalent, as vnmi_enabled could have been set if nested=false whereas the governed feature cannot.
But that's a glorified nop as the feature/flag is consumed only by paths that are gated by nSVM being enabled. Link: https://lore.kernel.org/r/20230815203653.519297-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/governed_features.h | 1 + arch/x86/kvm/svm/svm.c | 3 +-- arch/x86/kvm/svm/svm.h | 5 +---- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h index 368696c2e96b..423a73395c10 100644 --- a/arch/x86/kvm/governed_features.h +++ b/arch/x86/kvm/governed_features.h @@ -15,6 +15,7 @@ KVM_GOVERNED_X86_FEATURE(LBRV) KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) KVM_GOVERNED_X86_FEATURE(VGIF) +KVM_GOVERNED_X86_FEATURE(VNMI) #undef KVM_GOVERNED_X86_FEATURE #undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d8b7e0c71313..226b3a780d0f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4274,8 +4274,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); - - svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI); + kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI); svm_recalc_instruction_intercepts(vcpu, svm); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6eb5877cc6c3..06400cfe2244 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -259,9 +259,6 @@ struct vcpu_svm { unsigned long soft_int_next_rip; bool soft_int_injected; - /* optional nested SVM features that are enabled for this guest */ - bool vnmi_enabled : 1; - u32 ldr_reg; u32 dfr_reg; struct page *avic_backing_page; @@ -537,7 +534,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) { - return svm->vnmi_enabled && + return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) && (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK); } From 9717efbe5ba3f52d4b3cc637c7f7f6149ea264bb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Aug 2023 13:36:53 -0700 Subject: [PATCH 50/50] KVM: x86: Disallow guest CPUID lookups when IRQs are disabled Now that KVM has a framework for caching guest CPUID feature flags, add a "rule" that IRQs must be enabled when doing guest CPUID lookups, and enforce the rule via a lockdep assertion. CPUID lookups are slow, and within KVM, IRQs are only ever disabled in hot paths, e.g. the core run loop, fast page fault handling, etc. I.e. querying guest CPUID with IRQs disabled, especially in the run loop, should be avoided. Link: https://lore.kernel.org/r/20230815203653.519297-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 67e9f79fe059..e961e9a05847 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include <linux/lockdep.h> #include #include #include @@ -84,6 +85,18 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *e; int i; + /* + * KVM has a semi-arbitrary rule that querying the guest's CPUID model + * with IRQs disabled is disallowed. The CPUID model can legitimately + * have over one hundred entries, i.e.
the lookup is slow, and IRQs are + * typically disabled in KVM only when KVM is in a performance critical + * path, e.g. the core VM-Enter/VM-Exit run loop. Nothing will break + * if this rule is violated; this assertion is purely to flag potential + * performance issues. If this fires, consider moving the lookup out + * of the hotpath, e.g. by caching information during CPUID updates. + */ + lockdep_assert_irqs_enabled(); + for (i = 0; i < nent; i++) { e = &entries[i];