KVM x86 MMU changes for 6.8:

- Fix a relatively benign off-by-one error when splitting huge pages during
    CLEAR_DIRTY_LOG.
 
  - Fix a bug where KVM could incorrectly test-and-clear dirty bits in non-leaf
    TDP MMU SPTEs if a racing thread replaces a huge SPTE with a non-huge SPTE.
 
  - Relax the TDP MMU's lockdep assertions related to holding mmu_lock for read
    versus write so that KVM doesn't pass "bool shared" all over the place just
    to have precise assertions in paths that don't actually care about whether
    the caller is a reader or a writer.
 -----BEGIN PGP SIGNATURE-----
 
 iQJGBAABCgAwFiEEMHr+pfEFOIzK+KY1YJEiAU0MEvkFAmWW/W4SHHNlYW5qY0Bn
 b29nbGUuY29tAAoJEGCRIgFNDBL5pAQP/27OuHVhtcao9m8GAsm55tMEIDh2n6Ih
 Yb74SOQhprbaGWyx/J6QdKABzcQX86L3CvCMSy6Ssoz2FeDM8Qs2a80/0jkK0qtI
 TPXnOsvxh/MgEsC55LOSJZeWM+xwEqaTA2wuPAbvAcJLdAZsSLQQV1XwH6nl4l0l
 LhcdkUeAKqQ8DbeBvTDyz80zuWaKhQMm5DJOBM6HbqaAdmMpw/hPDtu9b654jC2R
 Z6Rs5590OFrR13cBAvgCBayvb54pjW9dbm8lfZ4Grcq1I+C0I7Y5jF+DYFjKuTUi
 N7t+3HrpAM54DWwD95qkTO73g8tMSvwKGKq+OhylZrbPjeqgyYApcUvbxgKmhgGu
 hBRoAS9D2goJQtFv4TehpAiJIoR6YGX5cCyIIFI50EfsWkifED/b7zUBf2jg0UnV
 Z7IzvGLQtMAHj6daDcgXxaLVAXQ3v4aQK7DiPuCIiFGcXvedgcGIFjZH4NaVvK2D
 Y9ygO8PSbN0MbGUSkaS8Urrxmx7l1tv05cardxjSVxnKEx01nYugB0vTM0/o7gSq
 Bs2gnf+UJrazeabqFuVBYzzuHH7QyjsiQQ0HWoQiq04v8+C5VldwQKk+g8vxLXG1
 iYNXW7qgueLxYxRNleLiyfw1HIyspR09d6e67n2729Xxtay0cvL8dC68AhEenb+N
 fZ2lKFhnZ2Vs
 =3f2t
 -----END PGP SIGNATURE-----

Merge tag 'kvm-x86-mmu-6.8' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MMU changes for 6.8:

 - Fix a relatively benign off-by-one error when splitting huge pages during
   CLEAR_DIRTY_LOG.

 - Fix a bug where KVM could incorrectly test-and-clear dirty bits in non-leaf
   TDP MMU SPTEs if a racing thread replaces a huge SPTE with a non-huge SPTE.

 - Relax the TDP MMU's lockdep assertions related to holding mmu_lock for read
   versus write so that KVM doesn't pass "bool shared" all over the place just
   to have precise assertions in paths that don't actually care about whether
   the caller is a reader or a writer.
This commit is contained in:
Paolo Bonzini 2024-01-08 08:10:32 -05:00
commit 7f26fea9bc
5 changed files with 57 additions and 67 deletions

View File

@ -43,10 +43,9 @@ On x86:
- vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock and kvm->arch.xen.xen_lock
- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and
kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and
cannot be taken without already holding kvm->arch.mmu_lock (typically with
``read_lock`` for the TDP MMU, thus the need for additional spinlocks).
- kvm->arch.mmu_lock is an rwlock; critical sections for
kvm->arch.tdp_mmu_pages_lock and kvm->arch.mmu_unsync_pages_lock must
also take kvm->arch.mmu_lock
Everything else is a leaf: no other lock is taken inside the critical
sections.

View File

@ -1433,9 +1433,8 @@ struct kvm_arch {
* the MMU lock in read mode + RCU or
* the MMU lock in write mode
*
* For writes, this list is protected by:
* the MMU lock in read mode + the tdp_mmu_pages_lock or
* the MMU lock in write mode
* For writes, this list is protected by tdp_mmu_pages_lock; see
* below for the details.
*
* Roots will remain in the list until their tdp_mmu_root_count
* drops to zero, at which point the thread that decremented the
@ -1452,8 +1451,10 @@ struct kvm_arch {
* - possible_nx_huge_pages;
* - the possible_nx_huge_page_link field of kvm_mmu_page structs used
* by the TDP MMU
* It is acceptable, but not necessary, to acquire this lock when
* the thread holds the MMU lock in write mode.
* Because the lock is only taken within the MMU lock, strictly
* speaking it is redundant to acquire this lock when the thread
* holds the MMU lock in write mode. However it often simplifies
* the code to do so.
*/
spinlock_t tdp_mmu_pages_lock;
#endif /* CONFIG_X86_64 */

View File

@ -1388,7 +1388,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
if (READ_ONCE(eager_page_split))
kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K);
kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
@ -2846,9 +2846,9 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
/*
* Recheck after taking the spinlock, a different vCPU
* may have since marked the page unsync. A false
* positive on the unprotected check above is not
* negative on the unprotected check above is not
* possible as clearing sp->unsync _must_ hold mmu_lock
* for write, i.e. unsync cannot transition from 0->1
* for write, i.e. unsync cannot transition from 1->0
* while this CPU holds mmu_lock for read (or write).
*/
if (READ_ONCE(sp->unsync))
@ -3576,7 +3576,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
return;
if (is_tdp_mmu_page(sp))
kvm_tdp_mmu_put_root(kvm, sp, false);
kvm_tdp_mmu_put_root(kvm, sp);
else if (!--sp->root_count && sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);

View File

@ -73,11 +73,8 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
tdp_mmu_free_sp(sp);
}
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
@ -106,10 +103,16 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
*/
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
struct kvm_mmu_page *prev_root,
bool shared, bool only_valid)
bool only_valid)
{
struct kvm_mmu_page *next_root;
/*
* While the roots themselves are RCU-protected, fields such as
* role.invalid are protected by mmu_lock.
*/
lockdep_assert_held(&kvm->mmu_lock);
rcu_read_lock();
if (prev_root)
@ -132,7 +135,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
rcu_read_unlock();
if (prev_root)
kvm_tdp_mmu_put_root(kvm, prev_root, shared);
kvm_tdp_mmu_put_root(kvm, prev_root);
return next_root;
}
@ -144,26 +147,22 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* recent root. (Unless keeping a live reference is desirable.)
*
* If shared is set, this function is operating under the MMU lock in read
* mode. In the unlikely event that this thread must free a root, the lock
* will be temporarily dropped and reacquired in write mode.
* mode.
*/
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
_root; \
_root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \
kvm_mmu_page_as_id(_root) != _as_id) { \
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
_root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
if (kvm_mmu_page_as_id(_root) != _as_id) { \
} else
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _shared) \
for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, false); \
_root; \
_root = tdp_mmu_next_root(_kvm, _root, _shared, false)) \
if (!kvm_lockdep_assert_mmu_lock_held(_kvm, _shared)) { \
} else
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \
for (_root = tdp_mmu_next_root(_kvm, NULL, false); \
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
_root = tdp_mmu_next_root(_kvm, _root, false))
/*
* Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
@ -276,28 +275,18 @@ static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
*
* @kvm: kvm instance
* @sp: the page to be removed
* @shared: This operation may not be running under the exclusive use of
* the MMU lock and the operation must synchronize with other
* threads that might be adding or removing pages.
*/
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
bool shared)
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
tdp_unaccount_mmu_page(kvm, sp);
if (!sp->nx_huge_page_disallowed)
return;
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
sp->nx_huge_page_disallowed = false;
untrack_possible_nx_huge_page(kvm, sp);
if (shared)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
/**
@ -326,7 +315,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
trace_kvm_mmu_prepare_zap_page(sp);
tdp_mmu_unlink_sp(kvm, sp, shared);
tdp_mmu_unlink_sp(kvm, sp);
for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
tdp_ptep_t sptep = pt + i;
@ -832,7 +821,8 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
struct kvm_mmu_page *root;
for_each_tdp_mmu_root_yield_safe(kvm, root, false)
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root)
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
return flush;
@ -854,7 +844,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
* is being destroyed or the userspace VMM has exited. In both cases,
* KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
*/
for_each_tdp_mmu_root_yield_safe(kvm, root, false)
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root)
tdp_mmu_zap_root(kvm, root, false);
}
@ -868,7 +859,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
read_lock(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root, true) {
for_each_tdp_mmu_root_yield_safe(kvm, root) {
if (!root->tdp_mmu_scheduled_root_to_zap)
continue;
@ -891,7 +882,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* the root must be reachable by mmu_notifiers while it's being
* zapped
*/
kvm_tdp_mmu_put_root(kvm, root, true);
kvm_tdp_mmu_put_root(kvm, root);
}
read_unlock(&kvm->mmu_lock);
@ -1125,7 +1116,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
{
struct kvm_mmu_page *root;
__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false, false)
__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
range->may_block, flush);
@ -1314,7 +1305,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages, min_level);
@ -1346,6 +1337,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
{
struct kvm_mmu_page *sp;
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
/*
* Since we are allocating while under the MMU lock we have to be
* careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
@ -1496,11 +1489,10 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
int r = 0;
kvm_lockdep_assert_mmu_lock_held(kvm, shared);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
if (r) {
kvm_tdp_mmu_put_root(kvm, root, shared);
kvm_tdp_mmu_put_root(kvm, root);
break;
}
}
@ -1522,12 +1514,13 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, start, end) {
tdp_root_for_each_pte(iter, root, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
if (!is_shadow_present_pte(iter.old_spte))
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
KVM_MMU_WARN_ON(kvm_ad_enabled() &&
@ -1560,8 +1553,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
bool spte_set = false;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
slot->base_gfn + slot->npages);
@ -1695,8 +1687,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
zap_collapsible_spte_range(kvm, root, slot);
}

View File

@ -17,8 +17,7 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared);
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush);
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);