From a867e9d0cc15039a6ef72e17e2603303dcd1783f Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 17 Feb 2022 10:12:42 +0000 Subject: [PATCH 1/4] KVM: arm64: Don't miss pending interrupts for suspended vCPU In order to properly emulate the WFI instruction, KVM reads back ICH_VMCR_EL2 and enables doorbells for GICv4. These preparations are necessary in order to recognize pending interrupts in kvm_arch_vcpu_runnable() and return to the guest. Until recently, this work was done by kvm_arch_vcpu_{blocking,unblocking}(). Since commit 6109c5a6ab7f ("KVM: arm64: Move vGIC v4 handling for WFI out arch callback hook"), these callbacks were gutted and superseded by kvm_vcpu_wfi(). It is important to note that KVM implements PSCI CPU_SUSPEND calls as a WFI within the guest. However, the implementation calls directly into kvm_vcpu_halt(), which skips the needed work done in kvm_vcpu_wfi() to detect pending interrupts. Fix the issue by calling the WFI helper. Fixes: 6109c5a6ab7f ("KVM: arm64: Move vGIC v4 handling for WFI out arch callback hook") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220217101242.3013716-1-oupton@google.com --- arch/arm64/kvm/psci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 3eae32876897..2ce60fecd861 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -46,8 +46,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) * specification (ARM DEN 0022A). This means all suspend states * for KVM will preserve the register state. */ - kvm_vcpu_halt(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); + kvm_vcpu_wfi(vcpu); return PSCI_RET_SUCCESS; } From bca06b85fcaf866602e328b3bcd86f74180eca14 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Feb 2022 19:19:16 +0000 Subject: [PATCH 2/4] Revert "KVM: VMX: Save HOST_CR3 in vmx_set_host_fs_gs()" Undo a nested VMX fix as a step toward reverting the commit it fixed, 15ad9762d69f ("KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()"), as the underlying premise that "host CR3 in the vcpu thread can only be changed when scheduling" is wrong. This reverts commit a9f2705ec84449e3b8d70c804766f8e97e23080d. Signed-off-by: Sean Christopherson Message-Id: <20220224191917.3508476-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 20 +++++++++++--------- arch/x86/kvm/vmx/vmx.h | 5 ++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ba34e94049c7..c12f95004a72 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -246,8 +246,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, src = &prev->host_state; dest = &vmx->loaded_vmcs->host_state; - vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel, - src->fs_base, src->gs_base); + vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); dest->ldt_sel = src->ldt_sel; #ifdef CONFIG_X86_64 dest->ds_sel = src->ds_sel; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index efda5e4d6247..beb68cd28aca 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1080,14 +1080,9 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } -void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, - u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base) +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base) { - if (unlikely(cr3 != host->cr3)) { - vmcs_writel(HOST_CR3, cr3); - host->cr3 = cr3; - } if (unlikely(fs_sel != host->fs_sel)) { if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); @@ -1119,6 +1114,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif + unsigned long cr3; unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; @@ -1182,8 +1178,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(), - fs_sel, gs_sel, fs_base, gs_base); + vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); + + /* Host CR3 including its PCID is stable when guest state is loaded. */ + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != host_state->cr3)) { + vmcs_writel(HOST_CR3, cr3); + host_state->cr3 = cr3; + } vmx->guest_state_loaded = true; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 7f2c82e7f38f..9c6bfcd84008 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -374,9 +374,8 @@ int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); -void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, - u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base); +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); From 1a71581012ddf1f465040ef3d9f700341fa3cf04 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Feb 2022 19:19:17 +0000 Subject: [PATCH 3/4] Revert "KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()" Revert back to refreshing vmcs.HOST_CR3 immediately prior to VM-Enter. The PCID (ASID) part of CR3 can be bumped without KVM being scheduled out, as the kernel will switch CR3 during __text_poke(), e.g. in response to a static key toggling. If switch_mm_irqs_off() chooses a new ASID for the mm associate with KVM, KVM will do VM-Enter => VM-Exit with a stale vmcs.HOST_CR3. Add a comment to explain why KVM must wait until VM-Enter is imminent to refresh vmcs.HOST_CR3. The following splat was captured by stashing vmcs.HOST_CR3 in kvm_vcpu and adding a WARN in load_new_mm_cr3() to fire if a new ASID is being loaded for the KVM-associated mm while KVM has a "running" vCPU: static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); ... WARN(vcpu && (vcpu->cr3 & GENMASK(11, 0)) != (new_mm_cr3 & GENMASK(11, 0)) && (vcpu->cr3 & PHYSICAL_PAGE_MASK) == (new_mm_cr3 & PHYSICAL_PAGE_MASK), "KVM is hosed, loading CR3 = %lx, vmcs.HOST_CR3 = %lx", new_mm_cr3, vcpu->cr3); } ------------[ cut here ]------------ KVM is hosed, loading CR3 = 8000000105393004, vmcs.HOST_CR3 = 105393003 WARNING: CPU: 4 PID: 20717 at arch/x86/mm/tlb.c:291 load_new_mm_cr3+0x82/0xe0 Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel CPU: 4 PID: 20717 Comm: stable Tainted: G W 5.17.0-rc3+ #747 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:load_new_mm_cr3+0x82/0xe0 RSP: 0018:ffffc9000489fa98 EFLAGS: 00010082 RAX: 0000000000000000 RBX: 8000000105393004 RCX: 0000000000000027 RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff888277d1b788 RBP: 0000000000000004 R08: ffff888277d1b780 R09: ffffc9000489f8b8 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: ffff88810678a800 R14: 0000000000000004 R15: 0000000000000c33 FS: 00007fa9f0e72700(0000) GS:ffff888277d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001001b5003 CR4: 0000000000172ea0 Call Trace: switch_mm_irqs_off+0x1cb/0x460 __text_poke+0x308/0x3e0 text_poke_bp_batch+0x168/0x220 text_poke_finish+0x1b/0x30 arch_jump_label_transform_apply+0x18/0x30 static_key_slow_inc_cpuslocked+0x7c/0x90 static_key_slow_inc+0x16/0x20 kvm_lapic_set_base+0x116/0x190 kvm_set_apic_base+0xa5/0xe0 kvm_set_msr_common+0x2f4/0xf60 vmx_set_msr+0x355/0xe70 [kvm_intel] kvm_set_msr_ignored_check+0x91/0x230 kvm_emulate_wrmsr+0x36/0x120 vmx_handle_exit+0x609/0x6c0 [kvm_intel] kvm_arch_vcpu_ioctl_run+0x146f/0x1b80 kvm_vcpu_ioctl+0x279/0x690 __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae ---[ end trace 0000000000000000 ]--- This reverts commit 15ad9762d69fd8e40a4a51828c1d6b0c1b8fbea0. Fixes: 15ad9762d69f ("KVM: VMX: Save HOST_CR3 in vmx_prepare_switch_to_guest()") Reported-by: Wanpeng Li Cc: Lai Jiangshan Signed-off-by: Sean Christopherson Acked-by: Lai Jiangshan Message-Id: <20220224191917.3508476-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 +++++++- arch/x86/kvm/vmx/vmx.c | 24 ++++++++++++++---------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c12f95004a72..dc822a1d403d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3055,7 +3055,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr4; + unsigned long cr3, cr4; bool vm_fail; if (!nested_early_check) @@ -3078,6 +3078,12 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) */ vmcs_writel(GUEST_RFLAGS, 0); + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index beb68cd28aca..b730d799c26e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1114,7 +1114,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif - unsigned long cr3; unsigned long fs_base, gs_base; u16 fs_sel, gs_sel; int i; @@ -1179,14 +1178,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); - - /* Host CR3 including its PCID is stable when guest state is loaded. */ - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != host_state->cr3)) { - vmcs_writel(HOST_CR3, cr3); - host_state->cr3 = cr3; - } - vmx->guest_state_loaded = true; } @@ -6793,7 +6784,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr4; + unsigned long cr3, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -6836,6 +6827,19 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); vcpu->arch.regs_dirty = 0; + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time + * it switches back to the current->mm, which can occur in KVM context + * when switching to a temporary mm to patch kernel code, e.g. if KVM + * toggles a static key while handling a VM-Exit. + */ + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->host_state.cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); From 456f89e0928ab938122a40e9f094a6524cc158b4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Feb 2022 13:16:24 +0000 Subject: [PATCH 4/4] KVM: selftests: aarch64: Skip tests if we can't create a vgic-v3 The arch_timer and vgic_irq kselftests assume that they can create a vgic-v3, using the library function vgic_v3_setup() which aborts with a test failure if it is not possible to do so. Since vgic-v3 can only be instantiated on systems where the host has GICv3 this leads to false positives on older systems where that is not the case. Fix this by changing vgic_v3_setup() to return an error if the vgic can't be instantiated and have the callers skip if this happens. We could also exit flagging a skip in vgic_v3_setup() but this would prevent future test cases conditionally deciding which GIC to use or generally doing more complex output. Signed-off-by: Mark Brown Reviewed-by: Andrew Jones Tested-by: Ricardo Koller Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220223131624.1830351-1-broonie@kernel.org --- tools/testing/selftests/kvm/aarch64/arch_timer.c | 7 ++++++- tools/testing/selftests/kvm/aarch64/vgic_irq.c | 4 ++++ tools/testing/selftests/kvm/lib/aarch64/vgic.c | 4 +++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/arch_timer.c b/tools/testing/selftests/kvm/aarch64/arch_timer.c index 9ad38bd360a4..b08d30bf71c5 100644 --- a/tools/testing/selftests/kvm/aarch64/arch_timer.c +++ b/tools/testing/selftests/kvm/aarch64/arch_timer.c @@ -366,6 +366,7 @@ static struct kvm_vm *test_vm_create(void) { struct kvm_vm *vm; unsigned int i; + int ret; int nr_vcpus = test_args.nr_vcpus; vm = vm_create_default_with_vcpus(nr_vcpus, 0, 0, guest_code, NULL); @@ -382,7 +383,11 @@ static struct kvm_vm *test_vm_create(void) ucall_init(vm, NULL); test_init_timer_irq(vm); - vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + ret = vgic_v3_setup(vm, nr_vcpus, 64, GICD_BASE_GPA, GICR_BASE_GPA); + if (ret < 0) { + print_skip("Failed to create vgic-v3"); + exit(KSFT_SKIP); + } /* Make all the test's cmdline args visible to the guest */ sync_global_to_guest(vm, test_args); diff --git a/tools/testing/selftests/kvm/aarch64/vgic_irq.c b/tools/testing/selftests/kvm/aarch64/vgic_irq.c index e6c7d7f8fbd1..7eca97799917 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_irq.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_irq.c @@ -761,6 +761,10 @@ static void test_vgic(uint32_t nr_irqs, bool level_sensitive, bool eoi_split) gic_fd = vgic_v3_setup(vm, 1, nr_irqs, GICD_BASE_GPA, GICR_BASE_GPA); + if (gic_fd < 0) { + print_skip("Failed to create vgic-v3, skipping"); + exit(KSFT_SKIP); + } vm_install_exception_handler(vm, VECTOR_IRQ_CURRENT, guest_irq_handlers[args.eoi_split][args.level_sensitive]); diff --git a/tools/testing/selftests/kvm/lib/aarch64/vgic.c b/tools/testing/selftests/kvm/lib/aarch64/vgic.c index b3a0fca0d780..f5cd0c536d85 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/vgic.c +++ b/tools/testing/selftests/kvm/lib/aarch64/vgic.c @@ -52,7 +52,9 @@ int vgic_v3_setup(struct kvm_vm *vm, unsigned int nr_vcpus, uint32_t nr_irqs, nr_vcpus, nr_vcpus_created); /* Distributor setup */ - gic_fd = kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); + if (_kvm_create_device(vm, KVM_DEV_TYPE_ARM_VGIC_V3, + false, &gic_fd) != 0) + return -1; kvm_device_access(gic_fd, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, 0, &nr_irqs, true);