// SPDX-License-Identifier: GPL-2.0-only
#ifndef KVM_X86_MMU_SPTE_H
#define KVM_X86_MMU_SPTE_H
# include "mmu.h"
# include "mmu_internal.h"
/*
 * A MMU present SPTE is backed by actual memory and may or may not be present
 * in hardware.  E.g. MMIO SPTEs are not considered present.  Use bit 11, as it
 * is ignored by all flavors of SPTEs and checking a low bit often generates
 * better code than for a high bit, e.g. 56+.  MMU present checks are pervasive
 * enough that the improved code generation is noticeable in KVM's footprint.
 */
#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
/*
 * TDP SPTEs (more specifically, EPT SPTEs) may not have A/D bits, and may also
 * be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
 * PML, is enabled).  Use bits 52 and 53 to hold the type of A/D tracking that
 * must be employed for a given TDP SPTE.
 *
 * Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
 * paging, including NPT PAE.  This scheme works because legacy shadow paging
 * is guaranteed to have A/D bits and write-protection is forced only for
 * TDP with CPU dirty logging (PML).  If NPT ever gains PML-like support, it
 * must be restricted to 64-bit KVM.
 */
#define SPTE_TDP_AD_SHIFT		52
#define SPTE_TDP_AD_MASK		(3ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_ENABLED		(0ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_DISABLED		(1ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_WRPROT_ONLY		(2ULL << SPTE_TDP_AD_SHIFT)
static_assert(SPTE_TDP_AD_ENABLED == 0);
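
/*
 * Illustrative note (a sketch, not an upstream KVM helper): because the
 * "enabled" encoding is all zeros, a legacy/PAE shadow SPTE, which never
 * touches bits 53:52, trivially satisfies
 *
 *	(spte & SPTE_TDP_AD_MASK) == SPTE_TDP_AD_ENABLED
 *
 * with no special casing, which is what allows the A/D type checks further
 * down in this header to run unconditionally on all SPTE flavors.
 */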
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
#else
#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
#define ACC_EXEC_MASK    1
#define ACC_WRITE_MASK   PT_WRITABLE_MASK
#define ACC_USER_MASK    PT_USER_MASK
#define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
/* The mask for the R/X bits in EPT PTEs */
#define SPTE_EPT_READABLE_MASK		0x1ull
#define SPTE_EPT_EXECUTABLE_MASK	0x4ull
#define SPTE_LEVEL_BITS			9
#define SPTE_LEVEL_SHIFT(level)		__PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
#define SPTE_INDEX(address, level)	__PT_INDEX(address, level, SPTE_LEVEL_BITS)
#define SPTE_ENT_PER_PAGE		__PT_ENT_PER_PAGE(SPTE_LEVEL_BITS)
/*
 * The mask/shift to use for saving the original R/X bits when marking the PTE
 * as not-present for access tracking purposes.  We do not save the W bit as the
 * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 * restored only when a write is attempted to the page.  This mask obviously
 * must not overlap the A/D type mask.
 */
#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
					  SPTE_EPT_EXECUTABLE_MASK)
#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
#define SHADOW_ACC_TRACK_SAVED_MASK	(SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
					 SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
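
/*
 * Illustrative sketch of the save side of access tracking (not upstream KVM
 * code; roughly how mark_spte_for_access_track(), declared later in this
 * header, uses these masks, with the restore side in restore_acc_track_spte()
 * near the end of the file):
 *
 *	// stash the original R/X bits in the high ignored bits ...
 *	spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
 *		SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
 *	// ... then strip the bits that make the SPTE "accessible"
 *	spte &= ~shadow_acc_track_mask;
 */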
/*
 * {DEFAULT,EPT}_SPTE_{HOST,MMU}_WRITABLE are used to keep track of why a given
 * SPTE is write-protected.  See is_writable_pte() for details.
 */

/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
#define DEFAULT_SPTE_HOST_WRITABLE	BIT_ULL(9)
#define DEFAULT_SPTE_MMU_WRITABLE	BIT_ULL(10)
/*
 * Low ignored bits are at a premium for EPT, use high ignored bits, taking care
 * to not overlap the A/D type mask or the saved access bits of access-tracked
 * SPTEs when A/D bits are disabled.
 */
#define EPT_SPTE_HOST_WRITABLE		BIT_ULL(57)
#define EPT_SPTE_MMU_WRITABLE		BIT_ULL(58)

static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
static_assert(!(EPT_SPTE_HOST_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));

/* Defined only to keep the above static asserts readable. */
#undef SHADOW_ACC_TRACK_SAVED_MASK
/*
 * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
 * the memslots generation and is derived as follows:
 *
 * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
 * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
 *
 * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
 * the MMIO generation number, as doing so would require stealing a bit from
 * the "real" generation number and thus effectively halve the maximum number
 * of MMIO generations that can be handled before encountering a wrap (which
 * requires a full MMU zap).  The flag is instead explicitly queried when
 * checking for MMIO spte cache hits.
 */
#define MMIO_SPTE_GEN_LOW_START		3
#define MMIO_SPTE_GEN_LOW_END		10

#define MMIO_SPTE_GEN_HIGH_START	52
#define MMIO_SPTE_GEN_HIGH_END		62

#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
						    MMIO_SPTE_GEN_LOW_START)
#define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
						    MMIO_SPTE_GEN_HIGH_START)
2021-02-25 12:47:45 -08:00
static_assert(!(SPTE_MMU_PRESENT_MASK &
		(MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
/*
 * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the
 * MMU-present bit.  The generation obviously co-exists with the magic MMIO
 * mask/value, and MMIO SPTEs are considered !MMU-present.
 *
 * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT
 * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO
 * and so they're off-limits for generation; additional checks ensure the mask
 * doesn't overlap legal PA bits), and bit 63 (carved out for future usage).
 */
#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0))
static_assert(!(SPTE_MMIO_ALLOWED_MASK &
		(SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
#define MMIO_SPTE_GEN_LOW_BITS		(MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
#define MMIO_SPTE_GEN_HIGH_BITS		(MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)

/* remember to adjust the comment above as well if you change these */
static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_LOW_SHIFT		(MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT	(MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)

#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
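
/*
 * Illustrative sketch (not upstream KVM code) of how a memslots generation,
 * already truncated to MMIO_SPTE_GEN_MASK, would be scattered into an SPTE,
 * i.e. the inverse of get_mmio_spte_generation() below:
 *
 *	spte |= (gen << MMIO_SPTE_GEN_LOW_SHIFT)  & MMIO_SPTE_GEN_LOW_MASK;
 *	spte |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
 */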
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
extern u64 __read_mostly shadow_x_mask; /* mutually exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
extern u64 __read_mostly shadow_mmio_mask;
extern u64 __read_mostly shadow_mmio_access_mask;
extern u64 __read_mostly shadow_present_mask;
extern u64 __read_mostly shadow_memtype_mask;
extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;
/*
 * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED;
 * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
 * pages.
 */
extern u64 __read_mostly shadow_acc_track_mask;
/*
 * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 * to guard against L1TF attacks.
 */
extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/*
 * The number of high-order 1 bits to use in the mask above.
 */
#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
/*
 * If a thread running without exclusive control of the MMU lock must perform a
 * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
 * non-present intermediate value.  Other threads which encounter this value
 * should not modify the SPTE.
 *
 * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
 * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create an
 * L1TF vulnerability.  Use only low bits to avoid 64-bit immediates.
 *
 * Only used by the TDP MMU.
 */
#define REMOVED_SPTE	0x5a0ULL
/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
static inline bool is_removed_spte(u64 spte)
{
        return spte == REMOVED_SPTE;
}
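
/*
 * Illustrative usage sketch (not upstream KVM code): a lockless TDP MMU
 * walker that observes REMOVED_SPTE treats the entry as "hands off" and
 * retries, e.g.
 *
 *	old_spte = READ_ONCE(*sptep);
 *	if (is_removed_spte(old_spte))
 *		goto retry;
 *
 * while the thread performing the multi-part update eventually replaces
 * REMOVED_SPTE with the final value.
 */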
/* Get an SPTE's index into its parent's page table (and the spt array). */
static inline int spte_index(u64 *sptep)
{
        return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1);
}
/*
 * In some cases, we need to preserve the GFN of a non-present or reserved
 * SPTE when we usurp the upper five bits of the physical address space to
 * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
 * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 * high and low parts.  This mask covers the lower bits of the GFN.
 */
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
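
/*
 * Illustrative sketch of the split described above (not the exact upstream
 * code; 'gpa' is a hypothetical guest physical address being stashed in a
 * non-present SPTE):
 *
 *	// creation: keep the low GFN bits in place, set the mitigation bits,
 *	// and relocate the colliding high GFN bits above them
 *	spte |= gpa | shadow_nonpresent_or_rsvd_mask;
 *	spte |= (gpa & shadow_nonpresent_or_rsvd_mask) <<
 *		SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
 *
 *	// recovery: low bits are read in place, high bits shifted back down
 *	gpa  = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
 *	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN) &
 *	       shadow_nonpresent_or_rsvd_mask;
 */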
static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
{
        struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);

        return (struct kvm_mmu_page *)page_private(page);
}

static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
{
        return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
}

static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
{
        return to_shadow_page(__pa(sptep));
}
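
/*
 * Illustrative note (a sketch, not upstream KVM code): the lookups above work
 * because every shadow page table occupies exactly one page whose private
 * field is pointed back at its kvm_mmu_page when the shadow page is created,
 * roughly:
 *
 *	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 */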
static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
{
        if (kvm_mmu_is_dummy_root(root))
                return NULL;

        /*
         * The "root" may be a special root, e.g. a PAE entry, treat it as a
         * SPTE to ensure any non-PA bits are dropped.
         */
        return spte_to_child_sp(root);
}
static inline bool is_mmio_spte(u64 spte)
{
        return (spte & shadow_mmio_mask) == shadow_mmio_value &&
               likely(enable_mmio_caching);
}
static inline bool is_shadow_present_pte(u64 pte)
{
        return !!(pte & SPTE_MMU_PRESENT_MASK);
}
/*
 * Returns true if A/D bits are supported in hardware and are enabled by KVM.
 * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
 * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
 * scenario where KVM is using A/D bits for L1, but not L2.
 */
static inline bool kvm_ad_enabled(void)
{
        return !!shadow_accessed_mask;
}
static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
{
        return sp->role.ad_disabled;
}
static inline bool spte_ad_enabled(u64 spte)
{
        KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
        return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
        KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
        /*
         * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
         * and non-TDP SPTEs will never set these bits.  Optimize for 64-bit
         * TDP and do the A/D type check unconditionally.
         */
        return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
        KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
        KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
        return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}
static inline bool is_access_track_spte(u64 spte)
{
        return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
static inline bool is_large_pte(u64 pte)
{
        return pte & PT_PAGE_SIZE_MASK;
}
2021-01-22 16:30:03 -08:00
static inline bool is_last_spte ( u64 pte , int level )
2020-10-16 10:29:37 -04:00
{
2021-01-22 16:30:03 -08:00
return ( level = = PG_LEVEL_4K ) | | is_large_pte ( pte ) ;
2020-10-16 10:29:37 -04:00
}
static inline bool is_executable_pte(u64 spte)
{
        return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
}
static inline kvm_pfn_t spte_to_pfn(u64 pte)
{
        return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
static inline bool is_accessed_spte(u64 spte)
{
        u64 accessed_mask = spte_shadow_accessed_mask(spte);

        return accessed_mask ? spte & accessed_mask
                             : !is_access_track_spte(spte);
}
static inline bool is_dirty_spte(u64 spte)
{
        u64 dirty_mask = spte_shadow_dirty_mask(spte);

        return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}
static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
                                int level)
{
        int bit7 = (pte >> 7) & 1;

        return rsvd_check->rsvd_bits_mask[bit7][level-1];
}
static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
                                      u64 pte, int level)
{
        return pte & get_rsvd_bits(rsvd_check, pte, level);
}
static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
                                   u64 pte)
{
        return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
}
static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
                                         u64 spte, int level)
{
        return __is_bad_mt_xwr(rsvd_check, spte) ||
               __is_rsvd_bits_set(rsvd_check, spte, level);
}
/*
 * A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
 *
 *  1. To intercept writes for dirty logging.  KVM write-protects huge pages
 *     so that they can be split down into the dirty logging
 *     granularity (4KiB) whenever the guest writes to them.  KVM also
 *     write-protects 4KiB pages so that writes can be recorded in the dirty log
 *     (e.g. if not using PML).  SPTEs are write-protected for dirty logging
 *     during the VM-ioctls that enable dirty logging.
 *
 *  2. To intercept writes to guest page tables that KVM is shadowing.  When a
 *     guest writes to its page table the corresponding shadow page table will
 *     be marked "unsync".  That way KVM knows which shadow page tables need to
 *     be updated on the next TLB flush, INVLPG, etc. and which do not.
 *
 *  3. To prevent guest writes to read-only memory, such as for memory in a
 *     read-only memslot or guest memory backed by a read-only VMA.  Writes to
 *     such pages are disallowed entirely.
 *
 *  4. To emulate the Accessed bit for SPTEs without A/D bits.  Note, in this
 *     case, the SPTE is access-protected, not just write-protected!
 *
 * For cases #1 and #4, KVM can safely make such SPTEs writable without taking
 * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it.
 * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits
 * in the SPTE:
 *
 *  shadow_mmu_writable_mask, aka MMU-writable -
 *    Cleared on SPTEs that KVM is currently write-protecting for shadow paging
 *    purposes (case 2 above).
 *
 *  shadow_host_writable_mask, aka Host-writable -
 *    Cleared on SPTEs that are not host-writable (case 3 above).
 *
 * Note, not all possible combinations of PT_WRITABLE_MASK,
 * shadow_mmu_writable_mask, and shadow_host_writable_mask are valid.  A given
 * SPTE can be in only one of the following states, which map to the
 * aforementioned cases:
 *
 *   shadow_host_writable_mask | shadow_mmu_writable_mask | PT_WRITABLE_MASK
 *   ------------------------- | ------------------------ | ----------------
 *   1                         | 1                        | 1    (writable)
 *   1                         | 1                        | 0    (case 1)
 *   1                         | 0                        | 0    (case 2)
 *   0                         | 0                        | 0    (case 3)
 *
 * The valid combinations of these bits are checked by
 * check_spte_writable_invariants() whenever an SPTE is modified.
 *
 * Clearing the MMU-writable bit is always done under the MMU lock and always
 * accompanied by a TLB flush before dropping the lock to avoid corrupting the
 * shadow page tables between vCPUs.  Write-protecting an SPTE for dirty logging
 * (which does not clear the MMU-writable bit) does not flush TLBs before
 * dropping the lock, as it only needs to synchronize guest writes with the
 * dirty bitmap.  Similarly, making the SPTE inaccessible (and non-writable) for
 * access-tracking via the clear_young() MMU notifier also does not flush TLBs.
 *
 * So, there is the problem: clearing the MMU-writable bit can encounter a
 * write-protected SPTE while CPUs still have writable mappings for that SPTE
 * cached in their TLB.  To address this, KVM always flushes TLBs when
 * write-protecting SPTEs if the MMU-writable bit is set on the old SPTE.
 *
 * The Host-writable bit is not modified on present SPTEs, it is only set or
 * cleared when an SPTE is first faulted in from non-present and then remains
 * immutable.
 */
static inline bool is_writable_pte(unsigned long pte)
{
        return pte & PT_WRITABLE_MASK;
}
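
/*
 * Illustrative sketch (not upstream KVM code) of the lockless repair allowed
 * for case #1 above: an SPTE that is still MMU-writable but currently
 * !writable can have PT_WRITABLE_MASK restored without taking mmu_lock,
 * roughly
 *
 *	if (is_mmu_writable_spte(old_spte) && !is_writable_pte(old_spte))
 *		new_spte = old_spte | PT_WRITABLE_MASK;
 *
 * with the actual SPTE update done atomically (e.g. via cmpxchg) so that a
 * racing zap or write-protect is detected.
 */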
/* Note: spte must be a shadow-present leaf SPTE. */
static inline void check_spte_writable_invariants(u64 spte)
{
        if (spte & shadow_mmu_writable_mask)
                WARN_ONCE(!(spte & shadow_host_writable_mask),
                          KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx",
                          spte);
        else
                WARN_ONCE(is_writable_pte(spte),
                          KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte);
}
static inline bool is_mmu_writable_spte(u64 spte)
{
        return spte & shadow_mmu_writable_mask;
}
static inline u64 get_mmio_spte_generation(u64 spte)
{
        u64 gen;

        gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
        gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
        return gen;
}
bool spte_has_volatile_bits(u64 spte);
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
               const struct kvm_memory_slot *slot,
               unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
               u64 old_spte, bool prefetch, bool can_unsync,
               bool host_writable, u64 *new_spte);
u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte,
                              union kvm_mmu_page_role role, int index);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
/* Restore an acc-track PTE back to a regular PTE */
static inline u64 restore_acc_track_spte(u64 spte)
{
        u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
                         & SHADOW_ACC_TRACK_SAVED_BITS_MASK;

        spte &= ~shadow_acc_track_mask;
        spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
                  SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
        spte |= saved_bits;

        return spte;
}
void __init kvm_mmu_spte_module_init(void);
void kvm_mmu_reset_all_pte_masks(void);

#endif