- Fix guest vtime accounting so that ticks happening while the guest is running
can also be accounted to it. Along with a consolidation to the guest-specific context tracking helpers. - Provide for the host NMI handler running after a VMX VMEXIT to be able to run on the kernel stack correctly. - Initialize MSR_TSC_AUX when RDPID is supported and not RDTSCP (virt relevant - real hw supports both) - A code generation improvement to TASK_SIZE_MAX through the use of alternatives - The usual misc. and related cleanups and improvements -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmCXqpcACgkQEsHwGGHe VUp6Kg/+O0y2PvL6dhfYnUvTmQD7be0DOfWeFSLfBBA0c6yaHL1INbFHWDDptNuJ ZV50V+vyqXWV9q0AWF94fYHBs2kB0S79/En0Pwt1a3kb/xlfVTh8VAMPr36utnTY VWvOwHgixfPbY+8g1AoqIm/IeFuYWubXQ9CyBrLx/zkJjszfot1eooGRYKDPc2qi dNEqBO4IKzw24OdO+oIzW1/owLfnBF+GnXrwCb8fFC2U7luyFAJmp9c1bYnyNuCm BdQySOTfm8nnE2RpN4wfc8Akvu/ETKHOPSQOqHIb5glzv6lVfRKXu3CgpYbzoCNl Iohb6z8xmgAG29g2VpBjNvCWyyO79y4Ckf94ibWl+qt01EdeYefcP0euK+MGi85A cN/MrMt7QjHHEO7ok5J9rBSeKobOtng6A4MHenSOLvjifOYoupRFijaLVxRluATW 3NsC2IhL10u1c69Zsq6JJFJKoAytInKSigEN9VFZp+4NdE/FzDxfebC/6rSKznGi XoaEjOOX0JQ5TXM1gDoyzowAvt2vgndvldpwJTnPY5NP3X9fdiHhoOF9cU2yvl+x ZjgD1VxRWLGZKBojNfAa+0oDMZ/cTwPoeZ5Rr5p7SMr/Xw2fsUQ68KVjhOR7ZbaU 8zEV//JtetwGSN86NhQ/V32hqiF2fni62yBZjYGZ8XM/AnDqaMQ= =O3BS -----END PGP SIGNATURE----- Merge tag 'x86_urgent_for_v5.13_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull x86 fixes from Borislav Petkov: "A bunch of things accumulated for x86 in the last two weeks: - Fix guest vtime accounting so that ticks happening while the guest is running can also be accounted to it. Along with a consolidation to the guest-specific context tracking helpers. - Provide for the host NMI handler running after a VMX VMEXIT to be able to run on the kernel stack correctly. 
- Initialize MSR_TSC_AUX when RDPID is supported and not RDTSCP (virt relevant - real hw supports both)
- A code generation improvement to TASK_SIZE_MAX through the use of alternatives
- The usual misc and related cleanups and improvements"

* tag 'x86_urgent_for_v5.13_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  KVM: x86: Consolidate guest enter/exit logic to common helpers
  context_tracking: KVM: Move guest enter/exit wrappers to KVM's domain
  context_tracking: Consolidate guest enter/exit wrappers
  sched/vtime: Move guest enter/exit vtime accounting to vtime.h
  sched/vtime: Move vtime accounting external declarations above inlines
  KVM: x86: Defer vtime accounting 'til after IRQ handling
  context_tracking: Move guest exit vtime accounting to separate helpers
  context_tracking: Move guest exit context tracking to separate helpers
  KVM/VMX: Invoke NMI non-IST entry instead of IST entry
  x86/cpu: Remove write_tsc() and write_rdtscp_aux() wrappers
  x86/cpu: Initialize MSR_TSC_AUX if RDTSCP *or* RDPID is supported
  x86/resctrl: Fix init const confusion
  x86: Delete UD0, UD1 traces
  x86/smpboot: Remove duplicate includes
  x86/cpu: Use alternative to generate the TASK_SIZE_MAX constant
This commit is contained in:
commit
dd3e4012dd
@ -7,18 +7,9 @@
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Despite that some emulators terminate on UD2, we use it for WARN().
|
* Despite that some emulators terminate on UD2, we use it for WARN().
|
||||||
*
|
|
||||||
* Since various instruction decoders/specs disagree on the encoding of
|
|
||||||
* UD0/UD1.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */
|
|
||||||
#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */
|
|
||||||
#define ASM_UD2 ".byte 0x0f, 0x0b"
|
#define ASM_UD2 ".byte 0x0f, 0x0b"
|
||||||
|
|
||||||
#define INSN_UD0 0xff0f
|
|
||||||
#define INSN_UD2 0x0b0f
|
#define INSN_UD2 0x0b0f
|
||||||
|
|
||||||
#define LEN_UD2 2
|
#define LEN_UD2 2
|
||||||
|
|
||||||
#ifdef CONFIG_GENERIC_BUG
|
#ifdef CONFIG_GENERIC_BUG
|
||||||
|
@ -588,6 +588,21 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* NMI */
|
/* NMI */
|
||||||
|
|
||||||
|
#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
|
||||||
|
/*
|
||||||
|
* Special NOIST entry point for VMX which invokes this on the kernel
|
||||||
|
* stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
|
||||||
|
* 'executing' marker.
|
||||||
|
*
|
||||||
|
* On 32bit this just uses the regular NMI entry point because 32-bit does
|
||||||
|
* not have ISTs.
|
||||||
|
*/
|
||||||
|
DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
|
||||||
|
#else
|
||||||
|
#define asm_exc_nmi_noist asm_exc_nmi
|
||||||
|
#endif
|
||||||
|
|
||||||
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
|
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
|
||||||
#ifdef CONFIG_XEN_PV
|
#ifdef CONFIG_XEN_PV
|
||||||
DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
|
DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi);
|
||||||
|
@ -324,10 +324,6 @@ static inline int wrmsrl_safe(u32 msr, u64 val)
|
|||||||
return wrmsr_safe(msr, (u32)val, (u32)(val >> 32));
|
return wrmsr_safe(msr, (u32)val, (u32)(val >> 32));
|
||||||
}
|
}
|
||||||
|
|
||||||
#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
|
|
||||||
|
|
||||||
#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
|
|
||||||
|
|
||||||
struct msr *msrs_alloc(void);
|
struct msr *msrs_alloc(void);
|
||||||
void msrs_free(struct msr *msrs);
|
void msrs_free(struct msr *msrs);
|
||||||
int msr_set_bit(u32 msr, u8 bit);
|
int msr_set_bit(u32 msr, u8 bit);
|
||||||
|
@ -56,6 +56,39 @@ static inline void clear_page(void *page)
|
|||||||
|
|
||||||
void copy_page(void *to, void *from);
|
void copy_page(void *to, void *from);
|
||||||
|
|
||||||
|
#ifdef CONFIG_X86_5LEVEL
|
||||||
|
/*
|
||||||
|
* User space process size. This is the first address outside the user range.
|
||||||
|
* There are a few constraints that determine this:
|
||||||
|
*
|
||||||
|
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
|
||||||
|
* address, then that syscall will enter the kernel with a
|
||||||
|
* non-canonical return address, and SYSRET will explode dangerously.
|
||||||
|
* We avoid this particular problem by preventing anything
|
||||||
|
* from being mapped at the maximum canonical address.
|
||||||
|
*
|
||||||
|
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
|
||||||
|
* CPUs malfunction if they execute code from the highest canonical page.
|
||||||
|
* They'll speculate right off the end of the canonical space, and
|
||||||
|
* bad things happen. This is worked around in the same way as the
|
||||||
|
* Intel problem.
|
||||||
|
*
|
||||||
|
* With page table isolation enabled, we map the LDT in ... [stay tuned]
|
||||||
|
*/
|
||||||
|
static inline unsigned long task_size_max(void)
|
||||||
|
{
|
||||||
|
unsigned long ret;
|
||||||
|
|
||||||
|
alternative_io("movq %[small],%0","movq %[large],%0",
|
||||||
|
X86_FEATURE_LA57,
|
||||||
|
"=r" (ret),
|
||||||
|
[small] "i" ((1ul << 47)-PAGE_SIZE),
|
||||||
|
[large] "i" ((1ul << 56)-PAGE_SIZE));
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
#endif /* CONFIG_X86_5LEVEL */
|
||||||
|
|
||||||
#endif /* !__ASSEMBLY__ */
|
#endif /* !__ASSEMBLY__ */
|
||||||
|
|
||||||
#ifdef CONFIG_X86_VSYSCALL_EMULATION
|
#ifdef CONFIG_X86_VSYSCALL_EMULATION
|
||||||
|
@ -55,30 +55,13 @@
|
|||||||
|
|
||||||
#ifdef CONFIG_X86_5LEVEL
|
#ifdef CONFIG_X86_5LEVEL
|
||||||
#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
|
#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
|
||||||
|
/* See task_size_max() in <asm/page_64.h> */
|
||||||
#else
|
#else
|
||||||
#define __VIRTUAL_MASK_SHIFT 47
|
#define __VIRTUAL_MASK_SHIFT 47
|
||||||
|
#define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
#define TASK_SIZE_MAX task_size_max()
|
||||||
* User space process size. This is the first address outside the user range.
|
|
||||||
* There are a few constraints that determine this:
|
|
||||||
*
|
|
||||||
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
|
|
||||||
* address, then that syscall will enter the kernel with a
|
|
||||||
* non-canonical return address, and SYSRET will explode dangerously.
|
|
||||||
* We avoid this particular problem by preventing anything
|
|
||||||
* from being mapped at the maximum canonical address.
|
|
||||||
*
|
|
||||||
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
|
|
||||||
* CPUs malfunction if they execute code from the highest canonical page.
|
|
||||||
* They'll speculate right off the end of the canonical space, and
|
|
||||||
* bad things happen. This is worked around in the same way as the
|
|
||||||
* Intel problem.
|
|
||||||
*
|
|
||||||
* With page table isolation enabled, we map the LDT in ... [stay tuned]
|
|
||||||
*/
|
|
||||||
#define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
|
|
||||||
|
|
||||||
#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
|
#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
|
||||||
|
|
||||||
/* This decides where the kernel will search for a free chunk of vm
|
/* This decides where the kernel will search for a free chunk of vm
|
||||||
|
@ -1851,8 +1851,8 @@ static inline void setup_getcpu(int cpu)
|
|||||||
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
|
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
|
||||||
struct desc_struct d = { };
|
struct desc_struct d = { };
|
||||||
|
|
||||||
if (boot_cpu_has(X86_FEATURE_RDTSCP))
|
if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
|
||||||
write_rdtscp_aux(cpudata);
|
wrmsr(MSR_TSC_AUX, cpudata, 0);
|
||||||
|
|
||||||
/* Store CPU and node number in limit. */
|
/* Store CPU and node number in limit. */
|
||||||
d.limit0 = cpudata;
|
d.limit0 = cpudata;
|
||||||
|
@ -84,7 +84,7 @@ unsigned int resctrl_cqm_threshold;
|
|||||||
static const struct mbm_correction_factor_table {
|
static const struct mbm_correction_factor_table {
|
||||||
u32 rmidthreshold;
|
u32 rmidthreshold;
|
||||||
u64 cf;
|
u64 cf;
|
||||||
} mbm_cf_table[] __initdata = {
|
} mbm_cf_table[] __initconst = {
|
||||||
{7, CF(1.000000)},
|
{7, CF(1.000000)},
|
||||||
{15, CF(1.000000)},
|
{15, CF(1.000000)},
|
||||||
{15, CF(0.969650)},
|
{15, CF(0.969650)},
|
||||||
|
@ -524,6 +524,16 @@ nmi_restart:
|
|||||||
mds_user_clear_cpu_buffers();
|
mds_user_clear_cpu_buffers();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
|
||||||
|
DEFINE_IDTENTRY_RAW(exc_nmi_noist)
|
||||||
|
{
|
||||||
|
exc_nmi(regs);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if IS_MODULE(CONFIG_KVM_INTEL)
|
||||||
|
EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
|
||||||
|
#endif
|
||||||
|
|
||||||
void stop_nmi(void)
|
void stop_nmi(void)
|
||||||
{
|
{
|
||||||
ignore_nmis++;
|
ignore_nmis++;
|
||||||
|
@ -1865,9 +1865,6 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#include <asm/cpu_device_id.h>
|
|
||||||
#include <asm/intel-family.h>
|
|
||||||
|
|
||||||
#define X86_MATCH(model) \
|
#define X86_MATCH(model) \
|
||||||
X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
|
X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
|
||||||
INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
|
INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
|
||||||
|
@ -3710,25 +3710,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
|
|||||||
struct vcpu_svm *svm = to_svm(vcpu);
|
struct vcpu_svm *svm = to_svm(vcpu);
|
||||||
unsigned long vmcb_pa = svm->current_vmcb->pa;
|
unsigned long vmcb_pa = svm->current_vmcb->pa;
|
||||||
|
|
||||||
/*
|
kvm_guest_enter_irqoff();
|
||||||
* VMENTER enables interrupts (host state), but the kernel state is
|
|
||||||
* interrupts disabled when this is invoked. Also tell RCU about
|
|
||||||
* it. This is the same logic as for exit_to_user_mode().
|
|
||||||
*
|
|
||||||
* This ensures that e.g. latency analysis on the host observes
|
|
||||||
* guest mode as interrupt enabled.
|
|
||||||
*
|
|
||||||
* guest_enter_irqoff() informs context tracking about the
|
|
||||||
* transition to guest mode and if enabled adjusts RCU state
|
|
||||||
* accordingly.
|
|
||||||
*/
|
|
||||||
instrumentation_begin();
|
|
||||||
trace_hardirqs_on_prepare();
|
|
||||||
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
|
|
||||||
instrumentation_end();
|
|
||||||
|
|
||||||
guest_enter_irqoff();
|
|
||||||
lockdep_hardirqs_on(CALLER_ADDR0);
|
|
||||||
|
|
||||||
if (sev_es_guest(vcpu->kvm)) {
|
if (sev_es_guest(vcpu->kvm)) {
|
||||||
__svm_sev_es_vcpu_run(vmcb_pa);
|
__svm_sev_es_vcpu_run(vmcb_pa);
|
||||||
@ -3748,24 +3730,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
|
|||||||
vmload(__sme_page_pa(sd->save_area));
|
vmload(__sme_page_pa(sd->save_area));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
kvm_guest_exit_irqoff();
|
||||||
* VMEXIT disables interrupts (host state), but tracing and lockdep
|
|
||||||
* have them in state 'on' as recorded before entering guest mode.
|
|
||||||
* Same as enter_from_user_mode().
|
|
||||||
*
|
|
||||||
* guest_exit_irqoff() restores host context and reinstates RCU if
|
|
||||||
* enabled and required.
|
|
||||||
*
|
|
||||||
* This needs to be done before the below as native_read_msr()
|
|
||||||
* contains a tracepoint and x86_spec_ctrl_restore_host() calls
|
|
||||||
* into world and some more.
|
|
||||||
*/
|
|
||||||
lockdep_hardirqs_off(CALLER_ADDR0);
|
|
||||||
guest_exit_irqoff();
|
|
||||||
|
|
||||||
instrumentation_begin();
|
|
||||||
trace_hardirqs_off_finish();
|
|
||||||
instrumentation_end();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
|
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
|
||||||
|
@ -36,6 +36,7 @@
|
|||||||
#include <asm/debugreg.h>
|
#include <asm/debugreg.h>
|
||||||
#include <asm/desc.h>
|
#include <asm/desc.h>
|
||||||
#include <asm/fpu/internal.h>
|
#include <asm/fpu/internal.h>
|
||||||
|
#include <asm/idtentry.h>
|
||||||
#include <asm/io.h>
|
#include <asm/io.h>
|
||||||
#include <asm/irq_remapping.h>
|
#include <asm/irq_remapping.h>
|
||||||
#include <asm/kexec.h>
|
#include <asm/kexec.h>
|
||||||
@ -6415,18 +6416,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
|
|||||||
|
|
||||||
void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
|
void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
|
||||||
|
|
||||||
static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
|
static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
|
||||||
|
unsigned long entry)
|
||||||
{
|
{
|
||||||
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
|
|
||||||
gate_desc *desc = (gate_desc *)host_idt_base + vector;
|
|
||||||
|
|
||||||
kvm_before_interrupt(vcpu);
|
kvm_before_interrupt(vcpu);
|
||||||
vmx_do_interrupt_nmi_irqoff(gate_offset(desc));
|
vmx_do_interrupt_nmi_irqoff(entry);
|
||||||
kvm_after_interrupt(vcpu);
|
kvm_after_interrupt(vcpu);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
|
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
|
||||||
{
|
{
|
||||||
|
const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
|
||||||
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
|
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
|
||||||
|
|
||||||
/* if exit due to PF check for async PF */
|
/* if exit due to PF check for async PF */
|
||||||
@ -6437,18 +6437,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
|
|||||||
kvm_machine_check();
|
kvm_machine_check();
|
||||||
/* We need to handle NMIs before interrupts are enabled */
|
/* We need to handle NMIs before interrupts are enabled */
|
||||||
else if (is_nmi(intr_info))
|
else if (is_nmi(intr_info))
|
||||||
handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info);
|
handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
|
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
|
||||||
{
|
{
|
||||||
u32 intr_info = vmx_get_intr_info(vcpu);
|
u32 intr_info = vmx_get_intr_info(vcpu);
|
||||||
|
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
|
||||||
|
gate_desc *desc = (gate_desc *)host_idt_base + vector;
|
||||||
|
|
||||||
if (WARN_ONCE(!is_external_intr(intr_info),
|
if (WARN_ONCE(!is_external_intr(intr_info),
|
||||||
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
|
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
handle_interrupt_nmi_irqoff(vcpu, intr_info);
|
handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
|
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
|
||||||
@ -6662,25 +6664,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
|
|||||||
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
|
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
|
||||||
struct vcpu_vmx *vmx)
|
struct vcpu_vmx *vmx)
|
||||||
{
|
{
|
||||||
/*
|
kvm_guest_enter_irqoff();
|
||||||
* VMENTER enables interrupts (host state), but the kernel state is
|
|
||||||
* interrupts disabled when this is invoked. Also tell RCU about
|
|
||||||
* it. This is the same logic as for exit_to_user_mode().
|
|
||||||
*
|
|
||||||
* This ensures that e.g. latency analysis on the host observes
|
|
||||||
* guest mode as interrupt enabled.
|
|
||||||
*
|
|
||||||
* guest_enter_irqoff() informs context tracking about the
|
|
||||||
* transition to guest mode and if enabled adjusts RCU state
|
|
||||||
* accordingly.
|
|
||||||
*/
|
|
||||||
instrumentation_begin();
|
|
||||||
trace_hardirqs_on_prepare();
|
|
||||||
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
|
|
||||||
instrumentation_end();
|
|
||||||
|
|
||||||
guest_enter_irqoff();
|
|
||||||
lockdep_hardirqs_on(CALLER_ADDR0);
|
|
||||||
|
|
||||||
/* L1D Flush includes CPU buffer clear to mitigate MDS */
|
/* L1D Flush includes CPU buffer clear to mitigate MDS */
|
||||||
if (static_branch_unlikely(&vmx_l1d_should_flush))
|
if (static_branch_unlikely(&vmx_l1d_should_flush))
|
||||||
@ -6696,24 +6680,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
|
|||||||
|
|
||||||
vcpu->arch.cr2 = native_read_cr2();
|
vcpu->arch.cr2 = native_read_cr2();
|
||||||
|
|
||||||
/*
|
kvm_guest_exit_irqoff();
|
||||||
* VMEXIT disables interrupts (host state), but tracing and lockdep
|
|
||||||
* have them in state 'on' as recorded before entering guest mode.
|
|
||||||
* Same as enter_from_user_mode().
|
|
||||||
*
|
|
||||||
* guest_exit_irqoff() restores host context and reinstates RCU if
|
|
||||||
* enabled and required.
|
|
||||||
*
|
|
||||||
* This needs to be done before the below as native_read_msr()
|
|
||||||
* contains a tracepoint and x86_spec_ctrl_restore_host() calls
|
|
||||||
* into world and some more.
|
|
||||||
*/
|
|
||||||
lockdep_hardirqs_off(CALLER_ADDR0);
|
|
||||||
guest_exit_irqoff();
|
|
||||||
|
|
||||||
instrumentation_begin();
|
|
||||||
trace_hardirqs_off_finish();
|
|
||||||
instrumentation_end();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
||||||
|
@ -9315,6 +9315,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
|
|||||||
local_irq_disable();
|
local_irq_disable();
|
||||||
kvm_after_interrupt(vcpu);
|
kvm_after_interrupt(vcpu);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Wait until after servicing IRQs to account guest time so that any
|
||||||
|
* ticks that occurred while running the guest are properly accounted
|
||||||
|
* to the guest. Waiting until IRQs are enabled degrades the accuracy
|
||||||
|
* of accounting via context tracking, but the loss of accuracy is
|
||||||
|
* acceptable for all known use cases.
|
||||||
|
*/
|
||||||
|
vtime_account_guest_exit();
|
||||||
|
|
||||||
if (lapic_in_kernel(vcpu)) {
|
if (lapic_in_kernel(vcpu)) {
|
||||||
s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
|
s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
|
||||||
if (delta != S64_MIN) {
|
if (delta != S64_MIN) {
|
||||||
|
@ -8,6 +8,51 @@
|
|||||||
#include "kvm_cache_regs.h"
|
#include "kvm_cache_regs.h"
|
||||||
#include "kvm_emulate.h"
|
#include "kvm_emulate.h"
|
||||||
|
|
||||||
|
static __always_inline void kvm_guest_enter_irqoff(void)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* VMENTER enables interrupts (host state), but the kernel state is
|
||||||
|
* interrupts disabled when this is invoked. Also tell RCU about
|
||||||
|
* it. This is the same logic as for exit_to_user_mode().
|
||||||
|
*
|
||||||
|
* This ensures that e.g. latency analysis on the host observes
|
||||||
|
* guest mode as interrupt enabled.
|
||||||
|
*
|
||||||
|
* guest_enter_irqoff() informs context tracking about the
|
||||||
|
* transition to guest mode and if enabled adjusts RCU state
|
||||||
|
* accordingly.
|
||||||
|
*/
|
||||||
|
instrumentation_begin();
|
||||||
|
trace_hardirqs_on_prepare();
|
||||||
|
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
|
||||||
|
instrumentation_end();
|
||||||
|
|
||||||
|
guest_enter_irqoff();
|
||||||
|
lockdep_hardirqs_on(CALLER_ADDR0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __always_inline void kvm_guest_exit_irqoff(void)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* VMEXIT disables interrupts (host state), but tracing and lockdep
|
||||||
|
* have them in state 'on' as recorded before entering guest mode.
|
||||||
|
* Same as enter_from_user_mode().
|
||||||
|
*
|
||||||
|
* context_tracking_guest_exit() restores host context and reinstates
|
||||||
|
* RCU if enabled and required.
|
||||||
|
*
|
||||||
|
* This needs to be done immediately after VM-Exit, before any code
|
||||||
|
* that might contain tracepoints or call out to the greater world,
|
||||||
|
* e.g. before x86_spec_ctrl_restore_host().
|
||||||
|
*/
|
||||||
|
lockdep_hardirqs_off(CALLER_ADDR0);
|
||||||
|
context_tracking_guest_exit();
|
||||||
|
|
||||||
|
instrumentation_begin();
|
||||||
|
trace_hardirqs_off_finish();
|
||||||
|
instrumentation_end();
|
||||||
|
}
|
||||||
|
|
||||||
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
|
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
|
||||||
({ \
|
({ \
|
||||||
bool failed = (consistency_check); \
|
bool failed = (consistency_check); \
|
||||||
|
@ -71,6 +71,19 @@ static inline void exception_exit(enum ctx_state prev_ctx)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __always_inline bool context_tracking_guest_enter(void)
|
||||||
|
{
|
||||||
|
if (context_tracking_enabled())
|
||||||
|
__context_tracking_enter(CONTEXT_GUEST);
|
||||||
|
|
||||||
|
return context_tracking_enabled_this_cpu();
|
||||||
|
}
|
||||||
|
|
||||||
|
static __always_inline void context_tracking_guest_exit(void)
|
||||||
|
{
|
||||||
|
if (context_tracking_enabled())
|
||||||
|
__context_tracking_exit(CONTEXT_GUEST);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ct_state() - return the current context tracking state if known
|
* ct_state() - return the current context tracking state if known
|
||||||
@ -92,6 +105,9 @@ static inline void user_exit_irqoff(void) { }
|
|||||||
static inline enum ctx_state exception_enter(void) { return 0; }
|
static inline enum ctx_state exception_enter(void) { return 0; }
|
||||||
static inline void exception_exit(enum ctx_state prev_ctx) { }
|
static inline void exception_exit(enum ctx_state prev_ctx) { }
|
||||||
static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
|
static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
|
||||||
|
static inline bool context_tracking_guest_enter(void) { return false; }
|
||||||
|
static inline void context_tracking_guest_exit(void) { }
|
||||||
|
|
||||||
#endif /* !CONFIG_CONTEXT_TRACKING */
|
#endif /* !CONFIG_CONTEXT_TRACKING */
|
||||||
|
|
||||||
#define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
|
#define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
|
||||||
@ -102,80 +118,4 @@ extern void context_tracking_init(void);
|
|||||||
static inline void context_tracking_init(void) { }
|
static inline void context_tracking_init(void) { }
|
||||||
#endif /* CONFIG_CONTEXT_TRACKING_FORCE */
|
#endif /* CONFIG_CONTEXT_TRACKING_FORCE */
|
||||||
|
|
||||||
|
|
||||||
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
|
|
||||||
/* must be called with irqs disabled */
|
|
||||||
static __always_inline void guest_enter_irqoff(void)
|
|
||||||
{
|
|
||||||
instrumentation_begin();
|
|
||||||
if (vtime_accounting_enabled_this_cpu())
|
|
||||||
vtime_guest_enter(current);
|
|
||||||
else
|
|
||||||
current->flags |= PF_VCPU;
|
|
||||||
instrumentation_end();
|
|
||||||
|
|
||||||
if (context_tracking_enabled())
|
|
||||||
__context_tracking_enter(CONTEXT_GUEST);
|
|
||||||
|
|
||||||
/* KVM does not hold any references to rcu protected data when it
|
|
||||||
* switches CPU into a guest mode. In fact switching to a guest mode
|
|
||||||
* is very similar to exiting to userspace from rcu point of view. In
|
|
||||||
* addition CPU may stay in a guest mode for quite a long time (up to
|
|
||||||
* one time slice). Let's treat guest mode as quiescent state, just like
|
|
||||||
* we do with user-mode execution.
|
|
||||||
*/
|
|
||||||
if (!context_tracking_enabled_this_cpu()) {
|
|
||||||
instrumentation_begin();
|
|
||||||
rcu_virt_note_context_switch(smp_processor_id());
|
|
||||||
instrumentation_end();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static __always_inline void guest_exit_irqoff(void)
|
|
||||||
{
|
|
||||||
if (context_tracking_enabled())
|
|
||||||
__context_tracking_exit(CONTEXT_GUEST);
|
|
||||||
|
|
||||||
instrumentation_begin();
|
|
||||||
if (vtime_accounting_enabled_this_cpu())
|
|
||||||
vtime_guest_exit(current);
|
|
||||||
else
|
|
||||||
current->flags &= ~PF_VCPU;
|
|
||||||
instrumentation_end();
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
static __always_inline void guest_enter_irqoff(void)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* This is running in ioctl context so it's safe
|
|
||||||
* to assume that it's the stime pending cputime
|
|
||||||
* to flush.
|
|
||||||
*/
|
|
||||||
instrumentation_begin();
|
|
||||||
vtime_account_kernel(current);
|
|
||||||
current->flags |= PF_VCPU;
|
|
||||||
rcu_virt_note_context_switch(smp_processor_id());
|
|
||||||
instrumentation_end();
|
|
||||||
}
|
|
||||||
|
|
||||||
static __always_inline void guest_exit_irqoff(void)
|
|
||||||
{
|
|
||||||
instrumentation_begin();
|
|
||||||
/* Flush the guest cputime we spent on the guest */
|
|
||||||
vtime_account_kernel(current);
|
|
||||||
current->flags &= ~PF_VCPU;
|
|
||||||
instrumentation_end();
|
|
||||||
}
|
|
||||||
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
|
|
||||||
|
|
||||||
static inline void guest_exit(void)
|
|
||||||
{
|
|
||||||
unsigned long flags;
|
|
||||||
|
|
||||||
local_irq_save(flags);
|
|
||||||
guest_exit_irqoff();
|
|
||||||
local_irq_restore(flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -338,6 +338,51 @@ struct kvm_vcpu {
|
|||||||
struct kvm_dirty_ring dirty_ring;
|
struct kvm_dirty_ring dirty_ring;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* must be called with irqs disabled */
|
||||||
|
static __always_inline void guest_enter_irqoff(void)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* This is running in ioctl context so it's safe to assume that it's the
|
||||||
|
* stime pending cputime to flush.
|
||||||
|
*/
|
||||||
|
instrumentation_begin();
|
||||||
|
vtime_account_guest_enter();
|
||||||
|
instrumentation_end();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* KVM does not hold any references to rcu protected data when it
|
||||||
|
* switches CPU into a guest mode. In fact switching to a guest mode
|
||||||
|
* is very similar to exiting to userspace from rcu point of view. In
|
||||||
|
* addition CPU may stay in a guest mode for quite a long time (up to
|
||||||
|
* one time slice). Let's treat guest mode as quiescent state, just like
|
||||||
|
* we do with user-mode execution.
|
||||||
|
*/
|
||||||
|
if (!context_tracking_guest_enter()) {
|
||||||
|
instrumentation_begin();
|
||||||
|
rcu_virt_note_context_switch(smp_processor_id());
|
||||||
|
instrumentation_end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static __always_inline void guest_exit_irqoff(void)
|
||||||
|
{
|
||||||
|
context_tracking_guest_exit();
|
||||||
|
|
||||||
|
instrumentation_begin();
|
||||||
|
/* Flush the guest cputime we spent on the guest */
|
||||||
|
vtime_account_guest_exit();
|
||||||
|
instrumentation_end();
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void guest_exit(void)
|
||||||
|
{
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
|
local_irq_save(flags);
|
||||||
|
guest_exit_irqoff();
|
||||||
|
local_irq_restore(flags);
|
||||||
|
}
|
||||||
|
|
||||||
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
|
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
@ -3,12 +3,46 @@
|
|||||||
#define _LINUX_KERNEL_VTIME_H
|
#define _LINUX_KERNEL_VTIME_H
|
||||||
|
|
||||||
#include <linux/context_tracking_state.h>
|
#include <linux/context_tracking_state.h>
|
||||||
|
#include <linux/sched.h>
|
||||||
|
|
||||||
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
|
||||||
#include <asm/vtime.h>
|
#include <asm/vtime.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Common vtime APIs
|
||||||
|
*/
|
||||||
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
|
||||||
|
extern void vtime_account_kernel(struct task_struct *tsk);
|
||||||
|
extern void vtime_account_idle(struct task_struct *tsk);
|
||||||
|
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
|
||||||
|
|
||||||
struct task_struct;
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
|
||||||
|
extern void arch_vtime_task_switch(struct task_struct *tsk);
|
||||||
|
extern void vtime_user_enter(struct task_struct *tsk);
|
||||||
|
extern void vtime_user_exit(struct task_struct *tsk);
|
||||||
|
extern void vtime_guest_enter(struct task_struct *tsk);
|
||||||
|
extern void vtime_guest_exit(struct task_struct *tsk);
|
||||||
|
extern void vtime_init_idle(struct task_struct *tsk, int cpu);
|
||||||
|
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
|
||||||
|
static inline void vtime_user_enter(struct task_struct *tsk) { }
|
||||||
|
static inline void vtime_user_exit(struct task_struct *tsk) { }
|
||||||
|
static inline void vtime_guest_enter(struct task_struct *tsk) { }
|
||||||
|
static inline void vtime_guest_exit(struct task_struct *tsk) { }
|
||||||
|
static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
|
||||||
|
extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
|
||||||
|
extern void vtime_account_softirq(struct task_struct *tsk);
|
||||||
|
extern void vtime_account_hardirq(struct task_struct *tsk);
|
||||||
|
extern void vtime_flush(struct task_struct *tsk);
|
||||||
|
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
|
||||||
|
static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
|
||||||
|
static inline void vtime_account_softirq(struct task_struct *tsk) { }
|
||||||
|
static inline void vtime_account_hardirq(struct task_struct *tsk) { }
|
||||||
|
static inline void vtime_flush(struct task_struct *tsk) { }
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* vtime_accounting_enabled_this_cpu() definitions/declarations
|
* vtime_accounting_enabled_this_cpu() definitions/declarations
|
||||||
@ -18,6 +52,18 @@ struct task_struct;
|
|||||||
static inline bool vtime_accounting_enabled_this_cpu(void) { return true; }
|
static inline bool vtime_accounting_enabled_this_cpu(void) { return true; }
|
||||||
extern void vtime_task_switch(struct task_struct *prev);
|
extern void vtime_task_switch(struct task_struct *prev);
|
||||||
|
|
||||||
|
static __always_inline void vtime_account_guest_enter(void)
|
||||||
|
{
|
||||||
|
vtime_account_kernel(current);
|
||||||
|
current->flags |= PF_VCPU;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __always_inline void vtime_account_guest_exit(void)
|
||||||
|
{
|
||||||
|
vtime_account_kernel(current);
|
||||||
|
current->flags &= ~PF_VCPU;
|
||||||
|
}
|
||||||
|
|
||||||
#elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN)
|
#elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -49,49 +95,37 @@ static inline void vtime_task_switch(struct task_struct *prev)
|
|||||||
vtime_task_switch_generic(prev);
|
vtime_task_switch_generic(prev);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __always_inline void vtime_account_guest_enter(void)
|
||||||
|
{
|
||||||
|
if (vtime_accounting_enabled_this_cpu())
|
||||||
|
vtime_guest_enter(current);
|
||||||
|
else
|
||||||
|
current->flags |= PF_VCPU;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __always_inline void vtime_account_guest_exit(void)
|
||||||
|
{
|
||||||
|
if (vtime_accounting_enabled_this_cpu())
|
||||||
|
vtime_guest_exit(current);
|
||||||
|
else
|
||||||
|
current->flags &= ~PF_VCPU;
|
||||||
|
}
|
||||||
|
|
||||||
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
|
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
|
||||||
|
|
||||||
static inline bool vtime_accounting_enabled_cpu(int cpu) {return false; }
|
|
||||||
static inline bool vtime_accounting_enabled_this_cpu(void) { return false; }
|
static inline bool vtime_accounting_enabled_this_cpu(void) { return false; }
|
||||||
static inline void vtime_task_switch(struct task_struct *prev) { }
|
static inline void vtime_task_switch(struct task_struct *prev) { }
|
||||||
|
|
||||||
#endif
|
static __always_inline void vtime_account_guest_enter(void)
|
||||||
|
{
|
||||||
|
current->flags |= PF_VCPU;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
static __always_inline void vtime_account_guest_exit(void)
|
||||||
* Common vtime APIs
|
{
|
||||||
*/
|
current->flags &= ~PF_VCPU;
|
||||||
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
|
}
|
||||||
extern void vtime_account_kernel(struct task_struct *tsk);
|
|
||||||
extern void vtime_account_idle(struct task_struct *tsk);
|
|
||||||
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
|
|
||||||
static inline void vtime_account_kernel(struct task_struct *tsk) { }
|
|
||||||
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
|
|
||||||
|
|
||||||
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
|
|
||||||
extern void arch_vtime_task_switch(struct task_struct *tsk);
|
|
||||||
extern void vtime_user_enter(struct task_struct *tsk);
|
|
||||||
extern void vtime_user_exit(struct task_struct *tsk);
|
|
||||||
extern void vtime_guest_enter(struct task_struct *tsk);
|
|
||||||
extern void vtime_guest_exit(struct task_struct *tsk);
|
|
||||||
extern void vtime_init_idle(struct task_struct *tsk, int cpu);
|
|
||||||
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
|
|
||||||
static inline void vtime_user_enter(struct task_struct *tsk) { }
|
|
||||||
static inline void vtime_user_exit(struct task_struct *tsk) { }
|
|
||||||
static inline void vtime_guest_enter(struct task_struct *tsk) { }
|
|
||||||
static inline void vtime_guest_exit(struct task_struct *tsk) { }
|
|
||||||
static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
|
|
||||||
extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
|
|
||||||
extern void vtime_account_softirq(struct task_struct *tsk);
|
|
||||||
extern void vtime_account_hardirq(struct task_struct *tsk);
|
|
||||||
extern void vtime_flush(struct task_struct *tsk);
|
|
||||||
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
|
|
||||||
static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
|
|
||||||
static inline void vtime_account_softirq(struct task_struct *tsk) { }
|
|
||||||
static inline void vtime_account_hardirq(struct task_struct *tsk) { }
|
|
||||||
static inline void vtime_flush(struct task_struct *tsk) { }
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user