Merge branch 'kvm-x86-mmu-6.6' into HEAD

KVM x86 MMU changes for 6.6:

 - Rip out the ancient MMU_DEBUG crud and replace the useful bits with
   CONFIG_KVM_PROVE_MMU

 - Overhaul KVM's page-track APIs, and KVMGT's usage, to reduce the API surface
   that is needed by external users (currently only KVMGT), and fix a variety
   of issues in the process

 - Fix KVM's handling of !visible guest roots to avoid premature triple fault
   injection by loading a dummy root backed by the zero page
This commit is contained in:
Paolo Bonzini 2023-09-01 15:49:45 -04:00
commit d011151616
20 changed files with 584 additions and 600 deletions

View File

@ -288,13 +288,13 @@ struct kvm_kernel_irq_routing_entry;
* kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
* also includes TDP pages) to determine whether or not a page can be used in
* the given MMU context. This is a subset of the overall kvm_cpu_role to
* minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
* 2 bytes per gfn instead of 4 bytes per gfn.
* minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows
* allocating 2 bytes per gfn instead of 4 bytes per gfn.
*
* Upper-level shadow pages having gptes are tracked for write-protection via
* gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create
* more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
* gfn_track will overflow and explosions will ensure.
* gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must
* not create more than 2^16-1 upper-level shadow pages at a single gfn,
* otherwise gfn_write_track will overflow and explosions will ensue.
*
* A unique shadow page (SP) for a gfn is created if and only if an existing SP
* cannot be reused. The ability to reuse a SP is tracked by its role, which
@ -1023,7 +1023,7 @@ struct kvm_lpage_info {
struct kvm_arch_memory_slot {
struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
unsigned short *gfn_write_track;
};
/*
@ -1265,8 +1265,9 @@ struct kvm_arch {
* create an NX huge page (without hanging the guest).
*/
struct list_head possible_nx_huge_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
struct kvm_page_track_notifier_head track_notifier_head;
#endif
/*
* Protects marking pages unsync during page faults, as TDP MMU page
* faults only take mmu_lock for read. For simplicity, the unsync
@ -1853,7 +1854,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);

View File

@ -2,11 +2,9 @@
#ifndef _ASM_X86_KVM_PAGE_TRACK_H
#define _ASM_X86_KVM_PAGE_TRACK_H
enum kvm_page_track_mode {
KVM_PAGE_TRACK_WRITE,
KVM_PAGE_TRACK_MAX,
};
#include <linux/kvm_types.h>
#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
/*
* The notifier represented by @kvm_page_track_notifier_node is linked into
* the head which will be notified when guest is triggering the track event.
@ -26,54 +24,39 @@ struct kvm_page_track_notifier_node {
* It is called when guest is writing the write-tracked page
* and write emulation is finished at that time.
*
* @vcpu: the vcpu where the write access happened.
* @gpa: the physical address written by guest.
* @new: the data was written to the address.
* @bytes: the written length.
* @node: this node
*/
void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
int bytes, struct kvm_page_track_notifier_node *node);
void (*track_write)(gpa_t gpa, const u8 *new, int bytes,
struct kvm_page_track_notifier_node *node);
/*
* It is called when memory slot is being moved or removed
* users can drop write-protection for the pages in that memory slot
* Invoked when a memory region is removed from the guest. Or in KVM
* terms, when a memslot is deleted.
*
* @kvm: the kvm where memory slot being moved or removed
* @slot: the memory slot being moved or removed
* @gfn: base gfn of the region being removed
* @nr_pages: number of pages in the to-be-removed region
* @node: this node
*/
void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot,
void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages,
struct kvm_page_track_notifier_node *node);
};
int kvm_page_track_init(struct kvm *kvm);
void kvm_page_track_cleanup(struct kvm *kvm);
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
int kvm_page_track_create_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
unsigned long npages);
void kvm_slot_page_track_add_page(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode);
void kvm_slot_page_track_remove_page(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode);
bool kvm_slot_page_track_is_active(struct kvm *kvm,
const struct kvm_memory_slot *slot,
gfn_t gfn, enum kvm_page_track_mode mode);
void
kvm_page_track_register_notifier(struct kvm *kvm,
int kvm_page_track_register_notifier(struct kvm *kvm,
struct kvm_page_track_notifier_node *n);
void
kvm_page_track_unregister_notifier(struct kvm *kvm,
void kvm_page_track_unregister_notifier(struct kvm *kvm,
struct kvm_page_track_notifier_node *n);
void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
int bytes);
void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn);
int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn);
#else
/*
* Allow defining a node in a structure even if page tracking is disabled, e.g.
* to play nice with testing headers via direct inclusion from the command line.
*/
struct kvm_page_track_notifier_node {};
#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
#endif

View File

@ -138,6 +138,19 @@ config KVM_XEN
If in doubt, say "N".
config KVM_PROVE_MMU
bool "Prove KVM MMU correctness"
depends on DEBUG_KERNEL
depends on KVM
depends on EXPERT
help
Enables runtime assertions in KVM's MMU that are too costly to enable
in anything remotely resembling a production environment, e.g. this
gates code that verifies a to-be-freed page table doesn't have any
present SPTEs.
If in doubt, say "N".
config KVM_EXTERNAL_WRITE_TRACKING
bool

View File

@ -121,6 +121,8 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
int bytes);
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{

View File

@ -25,6 +25,7 @@
#include "kvm_cache_regs.h"
#include "smm.h"
#include "kvm_emulate.h"
#include "page_track.h"
#include "cpuid.h"
#include "spte.h"
@ -53,7 +54,7 @@
#include <asm/io.h>
#include <asm/set_memory.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"
extern bool itlb_multihit_kvm_mitigation;
@ -115,11 +116,6 @@ static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
#ifdef MMU_DEBUG
bool dbg = 0;
module_param(dbg, bool, 0644);
#endif
#define PTE_PREFETCH_NUM 8
#include <trace/events/kvm.h>
@ -486,7 +482,7 @@ retry:
*/
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
WARN_ON(is_shadow_present_pte(*sptep));
WARN_ON_ONCE(is_shadow_present_pte(*sptep));
__set_spte(sptep, new_spte);
}
@ -498,7 +494,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
WARN_ON(!is_shadow_present_pte(new_spte));
WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
check_spte_writable_invariants(new_spte);
if (!is_shadow_present_pte(old_spte)) {
@ -511,7 +507,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
return old_spte;
}
@ -593,7 +589,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
* by a refcounted page, the refcount is elevated.
*/
page = kvm_pfn_to_refcounted_page(pfn);
WARN_ON(page && !page_count(page));
WARN_ON_ONCE(page && !page_count(page));
if (is_accessed_spte(old_spte))
kvm_set_pfn_accessed(pfn);
@ -808,7 +804,7 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->disallow_lpage += count;
WARN_ON(linfo->disallow_lpage < 0);
WARN_ON_ONCE(linfo->disallow_lpage < 0);
}
}
@ -835,8 +831,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
/* the non-leaf shadow pages are keeping readonly. */
if (sp->role.level > PG_LEVEL_4K)
return kvm_slot_page_track_add_page(kvm, slot, gfn,
KVM_PAGE_TRACK_WRITE);
return __kvm_write_track_add_gfn(kvm, slot, gfn);
kvm_mmu_gfn_disallow_lpage(slot, gfn);
@ -882,8 +877,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, gfn);
if (sp->role.level > PG_LEVEL_4K)
return kvm_slot_page_track_remove_page(kvm, slot, gfn,
KVM_PAGE_TRACK_WRITE);
return __kvm_write_track_remove_gfn(kvm, slot, gfn);
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
@ -937,10 +931,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
int count = 0;
if (!rmap_head->val) {
rmap_printk("%p %llx 0->1\n", spte, *spte);
rmap_head->val = (unsigned long)spte;
} else if (!(rmap_head->val & 1)) {
rmap_printk("%p %llx 1->many\n", spte, *spte);
desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
@ -949,7 +941,6 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
count = desc->tail_count + desc->spte_count;
@ -969,7 +960,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
return count;
}
static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
static void pte_list_desc_remove_entry(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i)
{
struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
@ -980,7 +972,7 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
* when adding an entry and the previous head is full, and heads are
* removed (this flow) when they become empty.
*/
BUG_ON(j < 0);
KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
/*
* Replace the to-be-freed SPTE with the last valid entry from the head
@ -1005,35 +997,34 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
mmu_free_pte_list_desc(head_desc);
}
static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
static void pte_list_remove(struct kvm *kvm, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
int i;
if (!rmap_head->val) {
pr_err("%s: %p 0->BUG\n", __func__, spte);
BUG();
} else if (!(rmap_head->val & 1)) {
rmap_printk("%p 1->0\n", spte);
if ((u64 *)rmap_head->val != spte) {
pr_err("%s: %p 1->BUG\n", __func__, spte);
BUG();
}
if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
return;
if (!(rmap_head->val & 1)) {
if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
return;
rmap_head->val = 0;
} else {
rmap_printk("%p many->many\n", spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
pte_list_desc_remove_entry(rmap_head, desc, i);
pte_list_desc_remove_entry(kvm, rmap_head,
desc, i);
return;
}
}
desc = desc->more;
}
pr_err("%s: %p many->many\n", __func__, spte);
BUG();
KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
}
}
@ -1041,7 +1032,7 @@ static void kvm_zap_one_rmap_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head, u64 *sptep)
{
mmu_spte_clear_track_bits(kvm, sptep);
pte_list_remove(sptep, rmap_head);
pte_list_remove(kvm, sptep, rmap_head);
}
/* Return true if at least one SPTE was zapped, false otherwise */
@ -1116,7 +1107,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
slot = __gfn_to_memslot(slots, gfn);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
pte_list_remove(spte, rmap_head);
pte_list_remove(kvm, spte, rmap_head);
}
/*
@ -1208,7 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
struct kvm_mmu_page *sp;
sp = sptep_to_sp(sptep);
WARN_ON(sp->role.level == PG_LEVEL_4K);
WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
drop_spte(kvm, sptep);
@ -1237,8 +1228,6 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
!(pt_protect && is_mmu_writable_spte(spte)))
return false;
rmap_printk("spte %p %llx\n", sptep, *sptep);
if (pt_protect)
spte &= ~shadow_mmu_writable_mask;
spte = spte & ~PT_WRITABLE_MASK;
@ -1263,9 +1252,7 @@ static bool spte_clear_dirty(u64 *sptep)
{
u64 spte = *sptep;
rmap_printk("spte %p %llx\n", sptep, *sptep);
MMU_WARN_ON(!spte_ad_enabled(spte));
KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
spte &= ~shadow_dirty_mask;
return mmu_spte_update(sptep, spte);
}
@ -1471,14 +1458,11 @@ static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
u64 new_spte;
kvm_pfn_t new_pfn;
WARN_ON(pte_huge(pte));
WARN_ON_ONCE(pte_huge(pte));
new_pfn = pte_pfn(pte);
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
rmap_printk("spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
need_flush = true;
if (pte_write(pte)) {
@ -1706,21 +1690,19 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return young;
}
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
{
u64 *pos;
u64 *end;
#ifdef CONFIG_KVM_PROVE_MMU
int i;
for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
if (is_shadow_present_pte(*pos)) {
printk(KERN_ERR "%s: %p %llx\n", __func__,
pos, *pos);
return 0;
}
return 1;
for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
sp->spt[i], &sp->spt[i],
kvm_mmu_page_get_gfn(sp, i));
}
#endif
}
/*
* This value is the sum of all of the kvm instances's
@ -1748,7 +1730,8 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
{
MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
kvm_mmu_check_sptes_at_free(sp);
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
@ -1771,16 +1754,16 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
pte_list_add(cache, parent_pte, &sp->parent_ptes);
}
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
u64 *parent_pte)
{
pte_list_remove(parent_pte, &sp->parent_ptes);
pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
}
static void drop_parent_pte(struct kvm_mmu_page *sp,
static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
u64 *parent_pte)
{
mmu_page_remove_parent_pte(sp, parent_pte);
mmu_page_remove_parent_pte(kvm, sp, parent_pte);
mmu_spte_clear_no_track(parent_pte);
}
@ -1836,7 +1819,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
--sp->unsync_children;
WARN_ON((int)sp->unsync_children < 0);
WARN_ON_ONCE((int)sp->unsync_children < 0);
__clear_bit(idx, sp->unsync_child_bitmap);
}
@ -1894,7 +1877,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
WARN_ON_ONCE(!sp->unsync);
trace_kvm_mmu_sync_page(sp);
sp->unsync = 0;
--kvm->stat.mmu_unsync;
@ -2069,11 +2052,11 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
if (pvec->nr == 0)
return 0;
WARN_ON(pvec->page[0].idx != INVALID_INDEX);
WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
sp = pvec->page[0].sp;
level = sp->role.level;
WARN_ON(level == PG_LEVEL_4K);
WARN_ON_ONCE(level == PG_LEVEL_4K);
parents->parent[level-2] = sp;
@ -2095,7 +2078,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
if (!sp)
return;
WARN_ON(idx == INVALID_INDEX);
WARN_ON_ONCE(idx == INVALID_INDEX);
clear_unsync_child_bit(sp, idx);
level++;
} while (!sp->unsync_children);
@ -2216,7 +2199,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
if (ret < 0)
break;
WARN_ON(!list_empty(&invalid_list));
WARN_ON_ONCE(!list_empty(&invalid_list));
if (ret > 0)
kvm_flush_remote_tlbs(kvm);
}
@ -2495,7 +2478,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (child->role.access == direct_access)
return;
drop_parent_pte(child, sptep);
drop_parent_pte(vcpu->kvm, child, sptep);
kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
}
}
@ -2513,7 +2496,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
drop_spte(kvm, spte);
} else {
child = spte_to_child_sp(pte);
drop_parent_pte(child, spte);
drop_parent_pte(kvm, child, spte);
/*
* Recursively zap nested TDP SPs, parentless SPs are
@ -2544,13 +2527,13 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm,
return zapped;
}
static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
drop_parent_pte(sp, sptep);
drop_parent_pte(kvm, sp, sptep);
}
static int mmu_zap_unsync_children(struct kvm *kvm,
@ -2589,7 +2572,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
kvm_mmu_unlink_parents(sp);
kvm_mmu_unlink_parents(kvm, sp);
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
@ -2671,7 +2654,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
list_for_each_entry_safe(sp, nsp, invalid_list, link) {
WARN_ON(!sp->role.invalid || sp->root_count);
WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
kvm_mmu_free_shadow_page(sp);
}
}
@ -2771,12 +2754,9 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
LIST_HEAD(invalid_list);
int r;
pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
write_lock(&kvm->mmu_lock);
for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
sp->role.word);
r = 1;
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
@ -2827,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
* track machinery is used to write-protect upper-level shadow pages,
* i.e. this guards the role.level == 4K assertion below!
*/
if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
return -EPERM;
/*
@ -2869,7 +2849,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
continue;
}
WARN_ON(sp->role.level != PG_LEVEL_4K);
WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
kvm_unsync_page(kvm, sp);
}
if (locked)
@ -2934,9 +2914,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
bool prefetch = !fault || fault->prefetch;
bool write_fault = fault && fault->write;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
if (unlikely(is_noslot_pfn(pfn))) {
vcpu->stat.pf_mmio_spte_created++;
mark_mmio_spte(vcpu, sptep, gfn, pte_access);
@ -2953,11 +2930,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
u64 pte = *sptep;
child = spte_to_child_sp(pte);
drop_parent_pte(child, sptep);
drop_parent_pte(vcpu->kvm, child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %llx new %llx\n",
spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
@ -2982,8 +2957,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
if (flush)
kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
if (!was_rmapped) {
WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
rmap_add(vcpu, slot, sptep, gfn, pte_access);
@ -3029,7 +3002,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
u64 *spte, *start = NULL;
int i;
WARN_ON(!sp->role.direct);
WARN_ON_ONCE(!sp->role.direct);
i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
spte = sp->spt + i;
@ -3570,12 +3543,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
/*
* The "root" may be a special root, e.g. a PAE entry, treat it as a
* SPTE to ensure any non-PA bits are dropped.
*/
sp = spte_to_child_sp(*root_hpa);
if (WARN_ON(!sp))
sp = root_to_sp(*root_hpa);
if (WARN_ON_ONCE(!sp))
return;
if (is_tdp_mmu_page(sp))
@ -3620,7 +3589,9 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
&invalid_list);
if (free_active_root) {
if (to_shadow_page(mmu->root.hpa)) {
if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
/* Nothing to cleanup for dummy roots. */
} else if (root_to_sp(mmu->root.hpa)) {
mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
} else if (mmu->pae_root) {
for (i = 0; i < 4; ++i) {
@ -3644,6 +3615,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
{
unsigned long roots_to_free = 0;
struct kvm_mmu_page *sp;
hpa_t root_hpa;
int i;
@ -3658,8 +3630,8 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
if (!VALID_PAGE(root_hpa))
continue;
if (!to_shadow_page(root_hpa) ||
to_shadow_page(root_hpa)->role.guest_mode)
sp = root_to_sp(root_hpa);
if (!sp || sp->role.guest_mode)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
}
@ -3667,19 +3639,6 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
int ret = 0;
if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
ret = 1;
}
return ret;
}
static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
u8 level)
{
@ -3817,8 +3776,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
root_gfn = root_pgd >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
return 1;
if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
mmu->root.hpa = kvm_mmu_get_dummy_root();
return 0;
}
/*
* On SVM, reading PDPTRs might access guest memory, which might fault
@ -3830,8 +3791,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (!(pdptrs[i] & PT_PRESENT_MASK))
continue;
if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
return 1;
if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
pdptrs[i] = 0;
}
}
@ -3998,7 +3959,7 @@ static bool is_unsync_root(hpa_t root)
{
struct kvm_mmu_page *sp;
if (!VALID_PAGE(root))
if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
return false;
/*
@ -4014,7 +3975,7 @@ static bool is_unsync_root(hpa_t root)
* requirement isn't satisfied.
*/
smp_rmb();
sp = to_shadow_page(root);
sp = root_to_sp(root);
/*
* PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
@ -4044,11 +4005,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root.hpa;
sp = to_shadow_page(root);
if (!is_unsync_root(root))
return;
sp = root_to_sp(root);
write_lock(&vcpu->kvm->mmu_lock);
mmu_sync_children(vcpu, sp, true);
write_unlock(&vcpu->kvm->mmu_lock);
@ -4190,7 +4152,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return RET_PF_EMULATE;
reserved = get_mmio_spte(vcpu, addr, &spte);
if (WARN_ON(reserved))
if (WARN_ON_ONCE(reserved))
return -EINVAL;
if (is_mmio_spte(spte)) {
@ -4228,7 +4190,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
* guest is writing the page which is write tracked which can
* not be fixed by page fault handler.
*/
if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
return true;
return false;
@ -4378,7 +4340,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
/* Special roots, e.g. pae_root, are not backed by shadow pages. */
if (sp && is_obsolete_sp(vcpu->kvm, sp))
@ -4403,6 +4365,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
{
int r;
/* Dummy roots are used only for shadowing bad guest roots. */
if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
if (page_fault_handle_page_track(vcpu, fault))
return RET_PF_EMULATE;
@ -4439,8 +4405,6 @@ out_unlock:
static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
fault->max_level = PG_LEVEL_2M;
return direct_page_fault(vcpu, fault);
@ -4558,9 +4522,19 @@ static void nonpaging_init_context(struct kvm_mmu *context)
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
union kvm_mmu_page_role role)
{
return (role.direct || pgd == root->pgd) &&
VALID_PAGE(root->hpa) &&
role.word == to_shadow_page(root->hpa)->role.word;
struct kvm_mmu_page *sp;
if (!VALID_PAGE(root->hpa))
return false;
if (!role.direct && pgd != root->pgd)
return false;
sp = root_to_sp(root->hpa);
if (WARN_ON_ONCE(!sp))
return false;
return role.word == sp->role.word;
}
/*
@ -4630,11 +4604,10 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
gpa_t new_pgd, union kvm_mmu_page_role new_role)
{
/*
* For now, limit the caching to 64-bit hosts+VMs in order to avoid
* having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
* later if necessary.
* Limit reuse to 64-bit hosts+VMs without "special" roots in order to
* avoid having to deal with PDPTEs and other complexities.
*/
if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
if (VALID_PAGE(mmu->root.hpa))
@ -4680,9 +4653,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
* If this is a direct root page, it doesn't have a write flooding
* count. Otherwise, clear the write flooding count.
*/
if (!new_role.direct)
__clear_sp_write_flooding_count(
to_shadow_page(vcpu->arch.mmu->root.hpa));
if (!new_role.direct) {
struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
if (!WARN_ON_ONCE(!sp))
__clear_sp_write_flooding_count(sp);
}
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
@ -5449,8 +5425,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
* physical address properties) in a single VM would require tracking
* all relevant CPUID information in kvm_mmu_page_role. That is very
* undesirable as it would increase the memory requirements for
* gfn_track (see struct kvm_mmu_page_role comments). For now that
* problem is swept under the rug; KVM's CPUID API is horrific and
* gfn_write_track (see struct kvm_mmu_page_role comments). For now
* that problem is swept under the rug; KVM's CPUID API is horrific and
* it's all but impossible to solve it without introducing a new API.
*/
vcpu->arch.root_mmu.root_role.word = 0;
@ -5513,9 +5489,9 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
}
@ -5528,16 +5504,21 @@ static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
/*
* When freeing obsolete roots, treat roots as obsolete if they don't
* have an associated shadow page. This does mean KVM will get false
* have an associated shadow page, as it's impossible to determine if
* such roots are fresh or stale. This does mean KVM will get false
* positives and free roots that don't strictly need to be freed, but
* such false positives are relatively rare:
*
* (a) only PAE paging and nested NPT has roots without shadow pages
* (a) only PAE paging and nested NPT have roots without shadow pages
* (or any shadow paging flavor with a dummy root, see note below)
* (b) remote reloads due to a memslot update obsoletes _all_ roots
* (c) KVM doesn't track previous roots for PAE paging, and the guest
* is unlikely to zap an in-use PGD.
*
* Note! Dummy roots are unique in that they are obsoleted by memslot
* _creation_! See also FNAME(fetch).
*/
sp = to_shadow_page(root_hpa);
sp = root_to_sp(root_hpa);
return !sp || is_obsolete_sp(kvm, sp);
}
@ -5616,9 +5597,6 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
{
unsigned offset, pte_size, misaligned;
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
offset = offset_in_page(gpa);
pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
@ -5666,9 +5644,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
return spte;
}
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes,
struct kvm_page_track_notifier_node *node)
void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
@ -5684,8 +5661,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
write_lock(&vcpu->kvm->mmu_lock);
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
@ -5724,7 +5699,18 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->root_role.direct;
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
/*
* IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
* checks when emulating instructions that triggers implicit access.
* WARN if hardware generates a fault with an error code that collides
* with the KVM-defined value. Clear the flag and continue on, i.e.
* don't terminate the VM, as KVM can't possibly be relying on a flag
* that KVM doesn't know about.
*/
if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
error_code &= ~PFERR_IMPLICIT_ACCESS;
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
r = RET_PF_INVALID;
@ -6081,7 +6067,7 @@ restart:
* pages. Skip the bogus page, otherwise we'll get stuck in an
* infinite loop if the page gets put back on the list (again).
*/
if (WARN_ON(sp->role.invalid))
if (WARN_ON_ONCE(sp->role.invalid))
continue;
/*
@ -6181,16 +6167,8 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node)
{
kvm_mmu_zap_all_fast(kvm);
}
int kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
int r;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
@ -6204,10 +6182,6 @@ int kvm_mmu_init_vm(struct kvm *kvm)
return r;
}
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@ -6228,10 +6202,6 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
kvm_page_track_unregister_notifier(kvm, node);
if (tdp_mmu_enabled)
kvm_mmu_uninit_tdp_mmu(kvm);
@ -6700,7 +6670,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
*/
}
void kvm_mmu_zap_all(struct kvm *kvm)
static void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list);
@ -6709,7 +6679,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
write_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
if (WARN_ON(sp->role.invalid))
if (WARN_ON_ONCE(sp->role.invalid))
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
@ -6725,9 +6695,20 @@ restart:
write_unlock(&kvm->mmu_lock);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
kvm_mmu_zap_all(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
kvm_mmu_zap_all_fast(kvm);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
gen &= MMIO_SPTE_GEN_MASK;

View File

@ -6,18 +6,10 @@
#include <linux/kvm_host.h>
#include <asm/kvm_host.h>
#undef MMU_DEBUG
#ifdef MMU_DEBUG
extern bool dbg;
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#ifdef CONFIG_KVM_PROVE_MMU
#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x)
#endif
/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */
@ -44,6 +36,16 @@ extern bool dbg;
#define INVALID_PAE_ROOT 0
#define IS_VALID_PAE_ROOT(x) (!!(x))
static inline hpa_t kvm_mmu_get_dummy_root(void)
{
return my_zero_pfn(0) << PAGE_SHIFT;
}
static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page)
{
return is_zero_pfn(shadow_page >> PAGE_SHIFT);
}
typedef u64 __rcu *tdp_ptep_t;
struct kvm_mmu_page {

View File

@ -12,13 +12,13 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/lockdep.h>
#include <linux/kvm_host.h>
#include <linux/rculist.h>
#include <asm/kvm_page_track.h>
#include "mmu.h"
#include "mmu_internal.h"
#include "page_track.h"
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
{
@ -28,103 +28,64 @@ bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
{
int i;
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
kvfree(slot->arch.gfn_track[i]);
slot->arch.gfn_track[i] = NULL;
kvfree(slot->arch.gfn_write_track);
slot->arch.gfn_write_track = NULL;
}
static int __kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot,
unsigned long npages)
{
const size_t size = sizeof(*slot->arch.gfn_write_track);
if (!slot->arch.gfn_write_track)
slot->arch.gfn_write_track = __vcalloc(npages, size,
GFP_KERNEL_ACCOUNT);
return slot->arch.gfn_write_track ? 0 : -ENOMEM;
}
int kvm_page_track_create_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
unsigned long npages)
{
int i;
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
if (i == KVM_PAGE_TRACK_WRITE &&
!kvm_page_track_write_tracking_enabled(kvm))
continue;
slot->arch.gfn_track[i] =
__vcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
GFP_KERNEL_ACCOUNT);
if (!slot->arch.gfn_track[i])
goto track_free;
}
if (!kvm_page_track_write_tracking_enabled(kvm))
return 0;
track_free:
kvm_page_track_free_memslot(slot);
return -ENOMEM;
}
static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
{
if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX)
return false;
return true;
return __kvm_page_track_write_tracking_alloc(slot, npages);
}
int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
{
unsigned short *gfn_track;
if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE])
return 0;
gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track),
GFP_KERNEL_ACCOUNT);
if (gfn_track == NULL)
return -ENOMEM;
slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track;
return 0;
return __kvm_page_track_write_tracking_alloc(slot, slot->npages);
}
static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode, short count)
static void update_gfn_write_track(struct kvm_memory_slot *slot, gfn_t gfn,
short count)
{
int index, val;
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
val = slot->arch.gfn_track[mode][index];
val = slot->arch.gfn_write_track[index];
if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
if (WARN_ON_ONCE(val + count < 0 || val + count > USHRT_MAX))
return;
slot->arch.gfn_track[mode][index] += count;
slot->arch.gfn_write_track[index] += count;
}
/*
* add guest page to the tracking pool so that corresponding access on that
* page will be intercepted.
*
* It should be called under the protection both of mmu-lock and kvm->srcu
* or kvm->slots_lock.
*
* @kvm: the guest instance we are interested in.
* @slot: the @gfn belongs to.
* @gfn: the guest page.
* @mode: tracking mode, currently only write track is supported.
*/
void kvm_slot_page_track_add_page(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode)
void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn)
{
lockdep_assert_held_write(&kvm->mmu_lock);
if (WARN_ON(!page_track_mode_is_valid(mode)))
lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
srcu_read_lock_held(&kvm->srcu));
if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
return;
if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
!kvm_page_track_write_tracking_enabled(kvm)))
return;
update_gfn_track(slot, gfn, mode, 1);
update_gfn_write_track(slot, gfn, 1);
/*
* new track stops large page mapping for the
@ -132,37 +93,22 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
*/
kvm_mmu_gfn_disallow_lpage(slot, gfn);
if (mode == KVM_PAGE_TRACK_WRITE)
if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
kvm_flush_remote_tlbs(kvm);
}
EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
/*
* remove the guest page from the tracking pool which stops the interception
* of corresponding access on that page. It is the opposed operation of
* kvm_slot_page_track_add_page().
*
* It should be called under the protection both of mmu-lock and kvm->srcu
* or kvm->slots_lock.
*
* @kvm: the guest instance we are interested in.
* @slot: the @gfn belongs to.
* @gfn: the guest page.
* @mode: tracking mode, currently only write track is supported.
*/
void kvm_slot_page_track_remove_page(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode)
void __kvm_write_track_remove_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
if (WARN_ON(!page_track_mode_is_valid(mode)))
lockdep_assert_held_write(&kvm->mmu_lock);
lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) ||
srcu_read_lock_held(&kvm->srcu));
if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm))
return;
if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
!kvm_page_track_write_tracking_enabled(kvm)))
return;
update_gfn_track(slot, gfn, mode, -1);
update_gfn_write_track(slot, gfn, -1);
/*
* allow large page mapping for the tracked page
@ -170,31 +116,26 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
*/
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
/*
* check if the corresponding access on the specified guest page is tracked.
*/
bool kvm_slot_page_track_is_active(struct kvm *kvm,
const struct kvm_memory_slot *slot,
gfn_t gfn, enum kvm_page_track_mode mode)
bool kvm_gfn_is_write_tracked(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn)
{
int index;
if (WARN_ON(!page_track_mode_is_valid(mode)))
return false;
if (!slot)
return false;
if (mode == KVM_PAGE_TRACK_WRITE &&
!kvm_page_track_write_tracking_enabled(kvm))
if (!kvm_page_track_write_tracking_enabled(kvm))
return false;
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
return !!READ_ONCE(slot->arch.gfn_write_track[index]);
}
#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
void kvm_page_track_cleanup(struct kvm *kvm)
{
struct kvm_page_track_notifier_head *head;
@ -216,17 +157,22 @@ int kvm_page_track_init(struct kvm *kvm)
* register the notifier so that event interception for the tracked guest
* pages can be received.
*/
void
kvm_page_track_register_notifier(struct kvm *kvm,
int kvm_page_track_register_notifier(struct kvm *kvm,
struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
if (!kvm || kvm->mm != current->mm)
return -ESRCH;
kvm_get_kvm(kvm);
head = &kvm->arch.track_notifier_head;
write_lock(&kvm->mmu_lock);
hlist_add_head_rcu(&n->node, &head->track_notifier_list);
write_unlock(&kvm->mmu_lock);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
@ -234,8 +180,7 @@ EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
* stop receiving the event interception. It is the opposed operation of
* kvm_page_track_register_notifier().
*/
void
kvm_page_track_unregister_notifier(struct kvm *kvm,
void kvm_page_track_unregister_notifier(struct kvm *kvm,
struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
@ -246,6 +191,8 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
hlist_del_rcu(&n->node);
write_unlock(&kvm->mmu_lock);
synchronize_srcu(&head->track_srcu);
kvm_put_kvm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
@ -256,34 +203,7 @@ EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
* The node should figure out if the written page is the one that node is
* interested in by itself.
*/
void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
int bytes)
{
struct kvm_page_track_notifier_head *head;
struct kvm_page_track_notifier_node *n;
int idx;
head = &vcpu->kvm->arch.track_notifier_head;
if (hlist_empty(&head->track_notifier_list))
return;
idx = srcu_read_lock(&head->track_srcu);
hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
srcu_read_lock_held(&head->track_srcu))
if (n->track_write)
n->track_write(vcpu, gpa, new, bytes, n);
srcu_read_unlock(&head->track_srcu, idx);
}
/*
* Notify the node that memory slot is being removed or moved so that it can
* drop write-protection for the pages in the memory slot.
*
* The node should figure out it has any write-protected pages in this slot
* by itself.
*/
void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes)
{
struct kvm_page_track_notifier_head *head;
struct kvm_page_track_notifier_node *n;
@ -297,7 +217,91 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
idx = srcu_read_lock(&head->track_srcu);
hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
srcu_read_lock_held(&head->track_srcu))
if (n->track_flush_slot)
n->track_flush_slot(kvm, slot, n);
if (n->track_write)
n->track_write(gpa, new, bytes, n);
srcu_read_unlock(&head->track_srcu, idx);
}
/*
* Notify external page track nodes that a memory region is being removed from
* the VM, e.g. so that users can free any associated metadata.
*/
void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
struct kvm_page_track_notifier_head *head;
struct kvm_page_track_notifier_node *n;
int idx;
head = &kvm->arch.track_notifier_head;
if (hlist_empty(&head->track_notifier_list))
return;
idx = srcu_read_lock(&head->track_srcu);
hlist_for_each_entry_srcu(n, &head->track_notifier_list, node,
srcu_read_lock_held(&head->track_srcu))
if (n->track_remove_region)
n->track_remove_region(slot->base_gfn, slot->npages, n);
srcu_read_unlock(&head->track_srcu, idx);
}
/*
* add guest page to the tracking pool so that corresponding access on that
* page will be intercepted.
*
* @kvm: the guest instance we are interested in.
* @gfn: the guest page.
*/
int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
int idx;
idx = srcu_read_lock(&kvm->srcu);
slot = gfn_to_memslot(kvm, gfn);
if (!slot) {
srcu_read_unlock(&kvm->srcu, idx);
return -EINVAL;
}
write_lock(&kvm->mmu_lock);
__kvm_write_track_add_gfn(kvm, slot, gfn);
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_track_add_gfn);
/*
* remove the guest page from the tracking pool which stops the interception
* of corresponding access on that page.
*
* @kvm: the guest instance we are interested in.
* @gfn: the guest page.
*/
int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
int idx;
idx = srcu_read_lock(&kvm->srcu);
slot = gfn_to_memslot(kvm, gfn);
if (!slot) {
srcu_read_unlock(&kvm->srcu, idx);
return -EINVAL;
}
write_lock(&kvm->mmu_lock);
__kvm_write_track_remove_gfn(kvm, slot, gfn);
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_track_remove_gfn);
#endif

View File

@ -0,0 +1,58 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_PAGE_TRACK_H
#define __KVM_X86_PAGE_TRACK_H
#include <linux/kvm_host.h>
#include <asm/kvm_page_track.h>
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm);
int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot);
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot);
int kvm_page_track_create_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
unsigned long npages);
void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn);
void __kvm_write_track_remove_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_gfn_is_write_tracked(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn);
#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
int kvm_page_track_init(struct kvm *kvm);
void kvm_page_track_cleanup(struct kvm *kvm);
void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes);
void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
static inline bool kvm_page_track_has_external_user(struct kvm *kvm)
{
return !hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list);
}
#else
static inline int kvm_page_track_init(struct kvm *kvm) { return 0; }
static inline void kvm_page_track_cleanup(struct kvm *kvm) { }
static inline void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa,
const u8 *new, int bytes) { }
static inline void kvm_page_track_delete_slot(struct kvm *kvm,
struct kvm_memory_slot *slot) { }
static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; }
#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */
static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes)
{
__kvm_page_track_write(vcpu->kvm, gpa, new, bytes);
kvm_mmu_track_write(vcpu, gpa, new, bytes);
}
#endif /* __KVM_X86_PAGE_TRACK_H */

View File

@ -338,7 +338,6 @@ retry_walk:
}
#endif
walker->max_level = walker->level;
ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
/*
* FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
@ -348,9 +347,21 @@ retry_walk:
nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
pte_access = ~0;
/*
* Queue a page fault for injection if this assertion fails, as callers
* assume that walker.fault contains sane info on a walk failure. I.e.
* avoid making the situation worse by inducing even worse badness
* between when the assertion fails and when KVM kicks the vCPU out to
* userspace (because the VM is bugged).
*/
if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm))
goto error;
++walker->level;
do {
struct kvm_memory_slot *slot;
unsigned long host_addr;
pt_access = pte_access;
@ -381,7 +392,11 @@ retry_walk:
if (unlikely(real_gpa == INVALID_GPA))
return 0;
host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa),
slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa));
if (!kvm_is_visible_memslot(slot))
goto error;
host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa),
&walker->pte_writable[walker->level - 1]);
if (unlikely(kvm_is_error_hva(host_addr)))
goto error;
@ -456,9 +471,6 @@ retry_walk:
goto retry_walk;
}
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
__func__, (u64)pte, walker->pte_access,
walker->pt_access[walker->level - 1]);
return 1;
error:
@ -529,8 +541,6 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
@ -638,9 +648,20 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
goto out_gpte_changed;
/*
* Load a new root and retry the faulting instruction in the extremely
* unlikely scenario that the guest root gfn became visible between
* loading a dummy root and handling the resulting page fault, e.g. if
* userspace create a memslot in the interim.
*/
if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
goto out_gpte_changed;
}
for_each_shadow_entry(vcpu, fault->addr, it) {
gfn_t table_gfn;
@ -758,7 +779,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
struct guest_walker walker;
int r;
pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
WARN_ON_ONCE(fault->is_tdp);
/*
@ -773,7 +793,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
if (!fault->prefetch)
kvm_inject_emulated_page_fault(vcpu, &walker.fault);
@ -837,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
int offset = 0;
WARN_ON(sp->role.level != PG_LEVEL_4K);
WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
if (PTTYPE == 32)
offset = sp->role.quadrant << SPTE_LEVEL_BITS;

View File

@ -61,7 +61,7 @@ static u64 generation_mmio_spte_mask(u64 gen)
{
u64 mask;
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK);
mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
@ -221,8 +221,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* shadow pages and unsync'ing pages is not allowed.
*/
if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
wrprot = true;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
@ -242,7 +240,7 @@ out:
if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
/* Enforced by kvm_mmu_hugepage_adjust. */
WARN_ON(level > PG_LEVEL_4K);
WARN_ON_ONCE(level > PG_LEVEL_4K);
mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
}

View File

@ -3,6 +3,7 @@
#ifndef KVM_X86_MMU_SPTE_H
#define KVM_X86_MMU_SPTE_H
#include "mmu.h"
#include "mmu_internal.h"
/*
@ -236,6 +237,18 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
}
static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
{
if (kvm_mmu_is_dummy_root(root))
return NULL;
/*
* The "root" may be a special root, e.g. a PAE entry, treat it as a
* SPTE to ensure any non-PA bits are dropped.
*/
return spte_to_child_sp(root);
}
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
@ -265,13 +278,13 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
/*
* This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
* and non-TDP SPTEs will never set these bits. Optimize for 64-bit
@ -282,13 +295,13 @@ static inline bool spte_ad_need_write_protect(u64 spte)
static inline u64 spte_shadow_accessed_mask(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
}
static inline u64 spte_shadow_dirty_mask(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
KVM_MMU_WARN_ON(!is_shadow_present_pte(spte));
return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
}

View File

@ -39,13 +39,14 @@ void tdp_iter_restart(struct tdp_iter *iter)
void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn)
{
int root_level = root->role.level;
WARN_ON(root_level < 1);
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
if (WARN_ON_ONCE(!root || (root->role.level < 1) ||
(root->role.level > PT64_ROOT_MAX_LEVEL))) {
iter->valid = false;
return;
}
iter->next_last_level_gfn = next_last_level_gfn;
iter->root_level = root_level;
iter->root_level = root->role.level;
iter->min_level = min_level;
iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt;
iter->as_id = kvm_mmu_page_as_id(root);

View File

@ -475,9 +475,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
WARN_ON(level > PT64_ROOT_MAX_LEVEL);
WARN_ON(level < PG_LEVEL_4K);
WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
WARN_ON_ONCE(level < PG_LEVEL_4K);
WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
/*
* If this warning were to trigger it would indicate that there was a
@ -522,7 +522,7 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
if (WARN_ON(!is_mmio_spte(old_spte) &&
if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
!is_mmio_spte(new_spte) &&
!is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
@ -661,7 +661,7 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
* should be used. If operating under the MMU lock in write mode, the
* use of the removed SPTE should not be necessary.
*/
WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
@ -689,7 +689,7 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
else
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)
/*
* Yield if the MMU lock is contended or this thread needs to return control
@ -709,7 +709,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
struct tdp_iter *iter,
bool flush, bool shared)
{
WARN_ON(iter->yielded);
WARN_ON_ONCE(iter->yielded);
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
@ -728,7 +728,7 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
rcu_read_lock();
WARN_ON(iter->gfn > iter->next_last_level_gfn);
WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
iter->yielded = true;
}
@ -1241,7 +1241,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte;
/* Huge pages aren't expected to be modified without first being zapped. */
WARN_ON(pte_huge(range->arg.pte) || range->start + 1 != range->end);
WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
if (iter->level != PG_LEVEL_4K ||
!is_shadow_present_pte(iter->old_spte))
@ -1548,7 +1548,7 @@ retry:
if (!is_shadow_present_pte(iter.old_spte))
continue;
MMU_WARN_ON(kvm_ad_enabled() &&
KVM_MMU_WARN_ON(kvm_ad_enabled() &&
spte_ad_need_write_protect(iter.old_spte));
if (!(iter.old_spte & dbit))
@ -1600,6 +1600,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
shadow_dirty_mask;
struct tdp_iter iter;
lockdep_assert_held_write(&kvm->mmu_lock);
rcu_read_lock();
tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
@ -1607,7 +1609,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
if (!mask)
break;
MMU_WARN_ON(kvm_ad_enabled() &&
KVM_MMU_WARN_ON(kvm_ad_enabled() &&
spte_ad_need_write_protect(iter.old_spte));
if (iter.level > PG_LEVEL_4K ||
@ -1646,7 +1648,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

View File

@ -25,6 +25,7 @@
#include "tss.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "mmu/page_track.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
@ -12632,6 +12633,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
/*
* KVM doesn't support moving memslots when there are external page
* trackers attached to the VM, i.e. if KVMGT is in use.
*/
if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm))
return -EINVAL;
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
return -EINVAL;
@ -12786,6 +12794,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
if (change == KVM_MR_DELETE)
kvm_page_track_delete_slot(kvm, old);
if (!kvm->arch.n_requested_mmu_pages &&
(change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
unsigned long nr_mmu_pages;
@ -12802,17 +12813,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_arch_free_memslot(kvm, old);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
kvm_mmu_zap_all(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
kvm_page_track_flush_slot(kvm, slot);
}
static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&

View File

@ -49,22 +49,6 @@
static bool enable_out_of_sync = false;
static int preallocated_oos_pages = 8192;
static bool intel_gvt_is_valid_gfn(struct intel_vgpu *vgpu, unsigned long gfn)
{
struct kvm *kvm = vgpu->vfio_device.kvm;
int idx;
bool ret;
if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
return false;
idx = srcu_read_lock(&kvm->srcu);
ret = kvm_is_visible_gfn(kvm, gfn);
srcu_read_unlock(&kvm->srcu, idx);
return ret;
}
/*
* validate a gm address and related range size,
* translate it to host gm address
@ -1161,31 +1145,6 @@ static inline void ppgtt_generate_shadow_entry(struct intel_gvt_gtt_entry *se,
ops->set_pfn(se, s->shadow_page.mfn);
}
/*
* Check if can do 2M page
* @vgpu: target vgpu
* @entry: target pfn's gtt entry
*
* Return 1 if 2MB huge gtt shadowing is possible, 0 if miscondition,
* negative if found err.
*/
static int is_2MB_gtt_possible(struct intel_vgpu *vgpu,
struct intel_gvt_gtt_entry *entry)
{
const struct intel_gvt_gtt_pte_ops *ops = vgpu->gvt->gtt.pte_ops;
kvm_pfn_t pfn;
if (!HAS_PAGE_SIZES(vgpu->gvt->gt->i915, I915_GTT_PAGE_SIZE_2M))
return 0;
if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
return -EINVAL;
pfn = gfn_to_pfn(vgpu->vfio_device.kvm, ops->get_pfn(entry));
if (is_error_noslot_pfn(pfn))
return -EINVAL;
return PageTransHuge(pfn_to_page(pfn));
}
static int split_2MB_gtt_entry(struct intel_vgpu *vgpu,
struct intel_vgpu_ppgtt_spt *spt, unsigned long index,
struct intel_gvt_gtt_entry *se)
@ -1279,7 +1238,7 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
{
const struct intel_gvt_gtt_pte_ops *pte_ops = vgpu->gvt->gtt.pte_ops;
struct intel_gvt_gtt_entry se = *ge;
unsigned long gfn, page_size = PAGE_SIZE;
unsigned long gfn;
dma_addr_t dma_addr;
int ret;
@ -1291,6 +1250,9 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
switch (ge->type) {
case GTT_TYPE_PPGTT_PTE_4K_ENTRY:
gvt_vdbg_mm("shadow 4K gtt entry\n");
ret = intel_gvt_dma_map_guest_page(vgpu, gfn, PAGE_SIZE, &dma_addr);
if (ret)
return -ENXIO;
break;
case GTT_TYPE_PPGTT_PTE_64K_ENTRY:
gvt_vdbg_mm("shadow 64K gtt entry\n");
@ -1302,25 +1264,20 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
return split_64KB_gtt_entry(vgpu, spt, index, &se);
case GTT_TYPE_PPGTT_PTE_2M_ENTRY:
gvt_vdbg_mm("shadow 2M gtt entry\n");
ret = is_2MB_gtt_possible(vgpu, ge);
if (ret == 0)
if (!HAS_PAGE_SIZES(vgpu->gvt->gt->i915, I915_GTT_PAGE_SIZE_2M) ||
intel_gvt_dma_map_guest_page(vgpu, gfn,
I915_GTT_PAGE_SIZE_2M, &dma_addr))
return split_2MB_gtt_entry(vgpu, spt, index, &se);
else if (ret < 0)
return ret;
page_size = I915_GTT_PAGE_SIZE_2M;
break;
case GTT_TYPE_PPGTT_PTE_1G_ENTRY:
gvt_vgpu_err("GVT doesn't support 1GB entry\n");
return -EINVAL;
default:
GEM_BUG_ON(1);
return -EINVAL;
}
/* direct shadow */
ret = intel_gvt_dma_map_guest_page(vgpu, gfn, page_size, &dma_addr);
if (ret)
return -ENXIO;
/* Successfully shadowed a 4K or 2M page (without splitting). */
pte_ops->set_pfn(&se, dma_addr >> PAGE_SHIFT);
ppgtt_set_shadow_entry(spt, &se, index);
return 0;
@ -1329,11 +1286,9 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
static int ppgtt_populate_spt(struct intel_vgpu_ppgtt_spt *spt)
{
struct intel_vgpu *vgpu = spt->vgpu;
struct intel_gvt *gvt = vgpu->gvt;
const struct intel_gvt_gtt_pte_ops *ops = gvt->gtt.pte_ops;
struct intel_vgpu_ppgtt_spt *s;
struct intel_gvt_gtt_entry se, ge;
unsigned long gfn, i;
unsigned long i;
int ret;
trace_spt_change(spt->vgpu->id, "born", spt,
@ -1350,13 +1305,6 @@ static int ppgtt_populate_spt(struct intel_vgpu_ppgtt_spt *spt)
ppgtt_generate_shadow_entry(&se, s, &ge);
ppgtt_set_shadow_entry(spt, &se, i);
} else {
gfn = ops->get_pfn(&ge);
if (!intel_gvt_is_valid_gfn(vgpu, gfn)) {
ops->set_pfn(&se, gvt->gtt.scratch_mfn);
ppgtt_set_shadow_entry(spt, &se, i);
continue;
}
ret = ppgtt_populate_shadow_entry(vgpu, spt, i, &ge);
if (ret)
goto fail;
@ -1845,6 +1793,9 @@ static int shadow_ppgtt_mm(struct intel_vgpu_mm *mm)
if (mm->ppgtt_mm.shadowed)
return 0;
if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
return -EINVAL;
mm->ppgtt_mm.shadowed = true;
for (index = 0; index < ARRAY_SIZE(mm->ppgtt_mm.guest_pdps); index++) {
@ -2331,14 +2282,6 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
m.val64 = e.val64;
m.type = e.type;
/* one PTE update may be issued in multiple writes and the
* first write may not construct a valid gfn
*/
if (!intel_gvt_is_valid_gfn(vgpu, gfn)) {
ops->set_pfn(&m, gvt->gtt.scratch_mfn);
goto out;
}
ret = intel_gvt_dma_map_guest_page(vgpu, gfn, PAGE_SIZE,
&dma_addr);
if (ret) {
@ -2355,7 +2298,6 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
ops->clear_present(&m);
}
out:
ggtt_set_guest_entry(ggtt_mm, &e, g_gtt_index);
ggtt_get_host_entry(ggtt_mm, &e, g_gtt_index);
@ -2875,24 +2817,6 @@ void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu, bool invalidate_old)
ggtt_invalidate(gvt->gt);
}
/**
* intel_vgpu_reset_gtt - reset the all GTT related status
* @vgpu: a vGPU
*
* This function is called from vfio core to reset reset all
* GTT related status, including GGTT, PPGTT, scratch page.
*
*/
void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu)
{
/* Shadow pages are only created when there is no page
* table tracking data, so remove page tracking data after
* removing the shadow pages.
*/
intel_vgpu_destroy_all_ppgtt_mm(vgpu);
intel_vgpu_reset_ggtt(vgpu, true);
}
/**
* intel_gvt_restore_ggtt - restore all vGPU's ggtt entries
* @gvt: intel gvt device

View File

@ -224,7 +224,6 @@ void intel_vgpu_reset_ggtt(struct intel_vgpu *vgpu, bool invalidate_old);
void intel_vgpu_invalidate_ppgtt(struct intel_vgpu *vgpu);
int intel_gvt_init_gtt(struct intel_gvt *gvt);
void intel_vgpu_reset_gtt(struct intel_vgpu *vgpu);
void intel_gvt_clean_gtt(struct intel_gvt *gvt);
struct intel_vgpu_mm *intel_gvt_find_ppgtt_mm(struct intel_vgpu *vgpu,

View File

@ -34,10 +34,11 @@
#define _GVT_H_
#include <uapi/linux/pci_regs.h>
#include <linux/kvm_host.h>
#include <linux/vfio.h>
#include <linux/mdev.h>
#include <asm/kvm_page_track.h>
#include "i915_drv.h"
#include "intel_gvt.h"

View File

@ -106,11 +106,9 @@ struct gvt_dma {
#define vfio_dev_to_vgpu(vfio_dev) \
container_of((vfio_dev), struct intel_vgpu, vfio_device)
static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *val, int len,
static void kvmgt_page_track_write(gpa_t gpa, const u8 *val, int len,
struct kvm_page_track_notifier_node *node);
static void kvmgt_page_track_flush_slot(struct kvm *kvm,
struct kvm_memory_slot *slot,
static void kvmgt_page_track_remove_region(gfn_t gfn, unsigned long nr_pages,
struct kvm_page_track_notifier_node *node);
static ssize_t intel_vgpu_show_description(struct mdev_type *mtype, char *buf)
@ -161,8 +159,7 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
if (npage == 0)
base_page = cur_page;
else if (base_page + npage != cur_page) {
gvt_vgpu_err("The pages are not continuous\n");
else if (page_to_pfn(base_page) + npage != page_to_pfn(cur_page)) {
ret = -EINVAL;
npage++;
goto err;
@ -172,6 +169,7 @@ static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
*page = base_page;
return 0;
err:
if (npage)
gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
return ret;
}
@ -352,6 +350,8 @@ __kvmgt_protect_table_find(struct intel_vgpu *info, gfn_t gfn)
{
struct kvmgt_pgfn *p, *res = NULL;
lockdep_assert_held(&info->vgpu_lock);
hash_for_each_possible(info->ptable, p, hnode, gfn) {
if (gfn == p->gfn) {
res = p;
@ -654,21 +654,19 @@ out:
static int intel_vgpu_open_device(struct vfio_device *vfio_dev)
{
struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
if (!vgpu->vfio_device.kvm ||
vgpu->vfio_device.kvm->mm != current->mm) {
gvt_vgpu_err("KVM is required to use Intel vGPU\n");
return -ESRCH;
}
int ret;
if (__kvmgt_vgpu_exist(vgpu))
return -EEXIST;
vgpu->track_node.track_write = kvmgt_page_track_write;
vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
kvm_get_kvm(vgpu->vfio_device.kvm);
kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
vgpu->track_node.track_remove_region = kvmgt_page_track_remove_region;
ret = kvm_page_track_register_notifier(vgpu->vfio_device.kvm,
&vgpu->track_node);
if (ret) {
gvt_vgpu_err("KVM is required to use Intel vGPU\n");
return ret;
}
set_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status);
@ -703,7 +701,6 @@ static void intel_vgpu_close_device(struct vfio_device *vfio_dev)
kvm_page_track_unregister_notifier(vgpu->vfio_device.kvm,
&vgpu->track_node);
kvm_put_kvm(vgpu->vfio_device.kvm);
kvmgt_protect_table_destroy(vgpu);
gvt_cache_destroy(vgpu);
@ -1546,95 +1543,70 @@ static struct mdev_driver intel_vgpu_mdev_driver = {
int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn)
{
struct kvm *kvm = info->vfio_device.kvm;
struct kvm_memory_slot *slot;
int idx;
int r;
if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, info->status))
return -ESRCH;
idx = srcu_read_lock(&kvm->srcu);
slot = gfn_to_memslot(kvm, gfn);
if (!slot) {
srcu_read_unlock(&kvm->srcu, idx);
return -EINVAL;
}
write_lock(&kvm->mmu_lock);
if (kvmgt_gfn_is_write_protected(info, gfn))
goto out;
return 0;
r = kvm_write_track_add_gfn(info->vfio_device.kvm, gfn);
if (r)
return r;
kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
kvmgt_protect_table_add(info, gfn);
out:
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return 0;
}
int intel_gvt_page_track_remove(struct intel_vgpu *info, u64 gfn)
{
struct kvm *kvm = info->vfio_device.kvm;
struct kvm_memory_slot *slot;
int idx;
int r;
if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, info->status))
return -ESRCH;
idx = srcu_read_lock(&kvm->srcu);
slot = gfn_to_memslot(kvm, gfn);
if (!slot) {
srcu_read_unlock(&kvm->srcu, idx);
return -EINVAL;
}
write_lock(&kvm->mmu_lock);
if (!kvmgt_gfn_is_write_protected(info, gfn))
goto out;
return 0;
r = kvm_write_track_remove_gfn(info->vfio_device.kvm, gfn);
if (r)
return r;
kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
kvmgt_protect_table_del(info, gfn);
out:
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
return 0;
}
static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *val, int len,
static void kvmgt_page_track_write(gpa_t gpa, const u8 *val, int len,
struct kvm_page_track_notifier_node *node)
{
struct intel_vgpu *info =
container_of(node, struct intel_vgpu, track_node);
if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
mutex_lock(&info->vgpu_lock);
if (kvmgt_gfn_is_write_protected(info, gpa >> PAGE_SHIFT))
intel_vgpu_page_track_handler(info, gpa,
(void *)val, len);
mutex_unlock(&info->vgpu_lock);
}
static void kvmgt_page_track_flush_slot(struct kvm *kvm,
struct kvm_memory_slot *slot,
static void kvmgt_page_track_remove_region(gfn_t gfn, unsigned long nr_pages,
struct kvm_page_track_notifier_node *node)
{
int i;
gfn_t gfn;
unsigned long i;
struct intel_vgpu *info =
container_of(node, struct intel_vgpu, track_node);
write_lock(&kvm->mmu_lock);
for (i = 0; i < slot->npages; i++) {
gfn = slot->base_gfn + i;
if (kvmgt_gfn_is_write_protected(info, gfn)) {
kvm_slot_page_track_remove_page(kvm, slot, gfn,
KVM_PAGE_TRACK_WRITE);
kvmgt_protect_table_del(info, gfn);
mutex_lock(&info->vgpu_lock);
for (i = 0; i < nr_pages; i++) {
if (kvmgt_gfn_is_write_protected(info, gfn + i))
kvmgt_protect_table_del(info, gfn + i);
}
}
write_unlock(&kvm->mmu_lock);
mutex_unlock(&info->vgpu_lock);
}
void intel_vgpu_detach_regions(struct intel_vgpu *vgpu)

View File

@ -162,13 +162,9 @@ int intel_vgpu_page_track_handler(struct intel_vgpu *vgpu, u64 gpa,
struct intel_vgpu_page_track *page_track;
int ret = 0;
mutex_lock(&vgpu->vgpu_lock);
page_track = intel_vgpu_find_page_track(vgpu, gpa >> PAGE_SHIFT);
if (!page_track) {
ret = -ENXIO;
goto out;
}
if (!page_track)
return -ENXIO;
if (unlikely(vgpu->failsafe)) {
/* Remove write protection to prevent furture traps. */
@ -179,7 +175,5 @@ int intel_vgpu_page_track_handler(struct intel_vgpu *vgpu, u64 gpa,
gvt_err("guest page write error, gpa %llx\n", gpa);
}
out:
mutex_unlock(&vgpu->vgpu_lock);
return ret;
}

View File

@ -867,6 +867,25 @@ static inline void kvm_vm_bugged(struct kvm *kvm)
unlikely(__ret); \
})
/*
* Note, "data corruption" refers to corruption of host kernel data structures,
* not guest data. Guest data corruption, suspected or confirmed, that is tied
* and contained to a single VM should *never* BUG() and potentially panic the
* host, i.e. use this variant of KVM_BUG() if and only if a KVM data structure
* is corrupted and that corruption can have a cascading effect to other parts
* of the hosts and/or to other VMs.
*/
#define KVM_BUG_ON_DATA_CORRUPTION(cond, kvm) \
({ \
bool __ret = !!(cond); \
\
if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) \
BUG_ON(__ret); \
else if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \
kvm_vm_bugged(kvm); \
unlikely(__ret); \
})
static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_PROVE_RCU