diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..fc34332c8d9a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2413,8 +2413,12 @@ Default is 1 (enabled) kvm-intel.emulate_invalid_guest_state= - [KVM,Intel] Enable emulation of invalid guest states - Default is 0 (disabled) + [KVM,Intel] Disable emulation of invalid guest state. + Ignored if kvm-intel.enable_unrestricted_guest=1, as + guest state is never invalid for unrestricted guests. + This param doesn't apply to nested guests (L2), as KVM + never emulates invalid L2 guest state. + Default is 1 (enabled) kvm-intel.flexpriority= [KVM,Intel] Disable FlexPriority feature (TPR shadow). diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index cefe1d81e2e8..9e50da3ed01a 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -47,6 +47,7 @@ KVM_X86_OP(set_dr7) KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) KVM_X86_OP(set_rflags) +KVM_X86_OP(get_if_flag) KVM_X86_OP(tlb_flush_all) KVM_X86_OP(tlb_flush_current) KVM_X86_OP_NULL(tlb_remote_flush) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2164b9f4c7b0..555f4de47ef2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1349,6 +1349,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + bool (*get_if_flag)(struct kvm_vcpu *vcpu); void (*tlb_flush_all)(struct kvm_vcpu *vcpu); void (*tlb_flush_current)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index b3ed302c1a35..caa96c270b95 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) */ void tdp_iter_restart(struct tdp_iter *iter) { + iter->yielded = false; iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; @@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter) */ void tdp_iter_next(struct tdp_iter *iter) { + if (iter->yielded) { + tdp_iter_restart(iter); + return; + } + if (try_step_down(iter)) return; diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index b1748b988d3a..e19cabbcb65c 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -45,6 +45,12 @@ struct tdp_iter { * iterator walks off the end of the paging structure. */ bool valid; + /* + * True if KVM dropped mmu_lock and yielded in the middle of a walk, in + * which case tdp_iter_next() needs to restart the walk at the root + * level instead of advancing to the next entry. + */ + bool yielded; }; /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1db8496259ad..1beb4ca90560 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -502,6 +502,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_read(&kvm->mmu_lock); /* @@ -575,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -640,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, * If this function should yield and flush is set, it will perform a remote * TLB flush before yielding. * - * If this function yields, it will also reset the tdp_iter's walk over the - * paging structure and the calling function should skip to the next - * iteration to allow the iterator to continue its traversal from the - * paging structure root. + * If this function yields, iter->yielded is set and the caller must skip to + * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk + * over the paging structures to allow the iterator to continue its traversal + * from the paging structure root. * - * Return true if this function yielded and the iterator's traversal was reset. - * Return false if a yield was not needed. + * Returns true if this function yielded. */ -static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, - struct tdp_iter *iter, bool flush, - bool shared) +static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, + struct tdp_iter *iter, + bool flush, bool shared) { + WARN_ON(iter->yielded); + /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) return false; @@ -671,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_restart(iter); - - return true; + iter->yielded = true; } - return false; + return iter->yielded; } /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d0f68d11ec70..5151efa424ac 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1585,6 +1585,15 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static bool svm_get_if_flag(struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = to_svm(vcpu)->vmcb; + + return sev_es_guest(vcpu->kvm) + ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK + : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -3568,14 +3577,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask - * bit to determine the state of the IF flag. - */ - if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) - return true; - } else if (is_guest_mode(vcpu)) { + if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) @@ -3586,7 +3588,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (nested_exit_on_intr(svm)) return false; } else { - if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + if (!svm_get_if_flag(vcpu)) return true; } @@ -4621,6 +4623,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .get_if_flag = svm_get_if_flag, .tlb_flush_all = svm_flush_tlb, .tlb_flush_current = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5aadad3e7367..0dbf94eb954f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1363,6 +1363,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmx->emulation_required = vmx_emulation_required(vcpu); } +static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) +{ + return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; +} + u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -3959,8 +3964,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; - if (vcpu != kvm_get_running_vcpu() && - !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); return 0; @@ -5877,18 +5881,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vmx_flush_pml_buffer(vcpu); /* - * We should never reach this point with a pending nested VM-Enter, and - * more specifically emulation of L2 due to invalid guest state (see - * below) should never happen as that means we incorrectly allowed a - * nested VM-Enter with an invalid vmcs12. + * KVM should never reach this point with a pending nested VM-Enter. + * More specifically, short-circuiting VM-Entry to emulate L2 due to + * invalid guest state should never happen as that means KVM knowingly + * allowed a nested VM-Enter with an invalid vmcs12. More below. */ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) return -EIO; - /* If guest state is invalid, start emulating */ - if (vmx->emulation_required) - return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu)) { /* * PML is never enabled when running L2, bail immediately if a @@ -5910,10 +5910,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) */ nested_mark_vmcs12_pages_dirty(vcpu); + /* + * Synthesize a triple fault if L2 state is invalid. In normal + * operation, nested VM-Enter rejects any attempt to enter L2 + * with invalid state. However, those checks are skipped if + * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If + * L2 state is invalid, it means either L1 modified SMRAM state + * or userspace provided bad state. Synthesize TRIPLE_FAULT as + * doing so is architecturally allowed in the RSM case, and is + * the least awful solution for the userspace case without + * risking false positives. + */ + if (vmx->emulation_required) { + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); + return 1; + } + if (nested_vmx_reflect_vmexit(vcpu)) return 1; } + /* If guest state is invalid, start emulating. L2 is handled above. */ + if (vmx->emulation_required) + return handle_invalid_guest_state(vcpu); + if (exit_reason.failed_vmentry) { dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -6608,9 +6628,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * consistency check VM-Exit due to invalid guest state and bail. */ if (unlikely(vmx->emulation_required)) { - - /* We don't emulate invalid state of a nested guest */ - vmx->fail = is_guest_mode(vcpu); + vmx->fail = 0; vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; vmx->exit_reason.failed_vmentry = 1; @@ -7579,6 +7597,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .get_if_flag = vmx_get_if_flag, .tlb_flush_all = vmx_flush_tlb_all, .tlb_flush_current = vmx_flush_tlb_current, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9a2972fdae82..e50e97ac4408 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1331,7 +1331,7 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_UMWAIT_CONTROL, MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, @@ -9001,14 +9001,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - /* - * if_flag is obsolete and useless, so do not bother - * setting it for SEV-ES guests. Userspace can just - * use kvm_run->ready_for_interrupt_injection. - */ - kvm_run->if_flag = !vcpu->arch.guest_state_protected - && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - + kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu); kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 00814c0f87a6..3cb5ac5da087 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -35,6 +35,7 @@ /x86_64/vmx_apic_access_test /x86_64/vmx_close_while_nested_test /x86_64/vmx_dirty_log_test +/x86_64/vmx_invalid_nested_guest_state /x86_64/vmx_preemption_timer_test /x86_64/vmx_set_nested_state_test /x86_64/vmx_tsc_adjust_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index f307c9f61981..17342b575e85 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -64,6 +64,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index da2b702da71a..2d62edc49d67 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -71,15 +71,6 @@ enum vm_guest_mode { #endif -#if defined(__x86_64__) -unsigned long vm_compute_max_gfn(struct kvm_vm *vm); -#else -static inline unsigned long vm_compute_max_gfn(struct kvm_vm *vm) -{ - return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; -} -#endif - #define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) #define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) @@ -330,6 +321,7 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm); unsigned int vm_get_page_size(struct kvm_vm *vm); unsigned int vm_get_page_shift(struct kvm_vm *vm); +unsigned long vm_compute_max_gfn(struct kvm_vm *vm); uint64_t vm_get_max_gfn(struct kvm_vm *vm); int vm_get_fd(struct kvm_vm *vm); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index daf6fdb217a7..53d2b5d04b82 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -2328,6 +2328,11 @@ unsigned int vm_get_page_shift(struct kvm_vm *vm) return vm->page_shift; } +unsigned long __attribute__((weak)) vm_compute_max_gfn(struct kvm_vm *vm) +{ + return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1; +} + uint64_t vm_get_max_gfn(struct kvm_vm *vm) { return vm->max_gfn; diff --git a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c new file mode 100644 index 000000000000..489fbed4ca6f --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include +#include + +#include "kselftest.h" + +#define VCPU_ID 0 +#define ARBITRARY_IO_PORT 0x2000 + +static struct kvm_vm *vm; + +static void l2_guest_code(void) +{ + /* + * Generate an exit to L0 userspace, i.e. main(), via I/O to an + * arbitrary port. + */ + asm volatile("inb %%dx, %%al" + : : [port] "d" (ARBITRARY_IO_PORT) : "rax"); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * L2 must be run without unrestricted guest, verify that the selftests + * library hasn't enabled it. Because KVM selftests jump directly to + * 64-bit mode, unrestricted guest support isn't required. + */ + GUEST_ASSERT(!(vmreadz(CPU_BASED_VM_EXEC_CONTROL) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) || + !(vmreadz(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_UNRESTRICTED_GUEST)); + + GUEST_ASSERT(!vmlaunch()); + + /* L2 should triple fault after main() stuffs invalid guest state. */ + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + struct kvm_sregs sregs; + struct kvm_run *run; + struct ucall uc; + + nested_vmx_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + + /* Allocate VMX pages and shared descriptors (vmx_pages). */ + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); + + vcpu_run(vm, VCPU_ID); + + run = vcpu_state(vm, VCPU_ID); + + /* + * The first exit to L0 userspace should be an I/O access from L2. + * Running L1 should launch L2 without triggering an exit to userspace. + */ + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Expected KVM_EXIT_IO, got: %u (%s)\n", + run->exit_reason, exit_reason_str(run->exit_reason)); + + TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT, + "Expected IN from port %d from L2, got port %d", + ARBITRARY_IO_PORT, run->io.port); + + /* + * Stuff invalid guest state for L2 by making TR unusuable. The next + * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support + * emulating invalid guest state for L2. + */ + memset(&sregs, 0, sizeof(sregs)); + vcpu_sregs_get(vm, VCPU_ID, &sregs); + sregs.tr.unusable = 1; + vcpu_sregs_set(vm, VCPU_ID, &sregs); + + vcpu_run(vm, VCPU_ID); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } +}