mirror of
https://github.com/torvalds/linux.git
synced 2026-05-31 10:33:41 +02:00
KVM x86 MMU changes for 6.9:
- Clean up code related to unprotecting shadow pages when retrying a guest
instruction after failed #PF-induced emulation.
- Zap TDP MMU roots at 4KiB granularity to minimize the delay in yielding if
a reschedule is needed, e.g. if a high priority task needs to run. Because
KVM doesn't support yielding in the middle of processing a zapped non-leaf
SPTE, zapping at 1GiB granularity can result in multi-millisecond lag when
attempting to schedule in a high priority task.
- Rework TDP MMU root unload, free, and alloc to run with mmu_lock held for
read, e.g. to avoid serializing vCPUs when userspace deletes a memslot.
- Allocate write-tracking metadata on-demand to avoid the memory overhead when
running kernels built with KVMGT support (external write-tracking enabled),
but for workloads that don't use nested virtualization (shadow paging) or
KVMGT.
-----BEGIN PGP SIGNATURE-----
iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAmXrTH4ACgkQOlYIJqCj
N/1q3xAAh3wpUDzRfkNkgGUbulhuJmQ72PiaW3NRoMo/3Rowegsdgt1N3/ec+fcJ
Awx0KUM8Cju8O2Zqp6NzKwUkddCni8dHmOa55NJQuK2M1OpnE0RjBB94n+AFJZki
mm8wKSKNgjlVeJDG87+RLPnbaeEvqYPp22oNKJyAPsimTbxvmhIqtg8qdyujGPXA
Jke7LXgtVGav+nEzXiLh86VU/agoBJc/zt+hiuLvamU5Y8so+zReqFbrDtvsgtpV
ryvMbDZxcPXKrsBP+B7syqUAbODcmh/wkzOCZ4Tby5yurEaw1rwpZIH0BRKRgGx2
F2JqWayYsCOsrJ4DwQre8RfLMtbEKB2BBWkZlYyblAy0++1LcTP9pSk5YC5lSL71
5Oszql9DKi10Vq5IfR/ehsr6mHXFr3AB7C7QefiXpytGbObQs8/f/OxinxaEajcs
ERBgh+rcQ5p3kfdiHzuQjn7y45J7z21CKVhka4iKJtTxypBK4ZvkDOVqHuHppb5O
aw6rC5HR1EKhSW4jz7QWrDExtDZ2X5HeYl8TgfHncSSJRc7urKYcSCHhXJsB6BPs
iQf0xbHaIOyH9jmoqLZjz0QZmXB9fydQ/zAlFVXZsrNHvomayVjqrpl8UFTMdhuI
zll9ynfRRHMUkIi1YubUlmFMgBeqOXGkfBFh8QUH3+YiI7Cwzh4=
=SgFo
-----END PGP SIGNATURE-----
Merge tag 'kvm-x86-mmu-6.9' of https://github.com/kvm-x86/linux into HEAD
KVM x86 MMU changes for 6.9:
- Clean up code related to unprotecting shadow pages when retrying a guest
instruction after failed #PF-induced emulation.
- Zap TDP MMU roots at 4KiB granularity to minimize the delay in yielding if
a reschedule is needed, e.g. if a high priority task needs to run. Because
KVM doesn't support yielding in the middle of processing a zapped non-leaf
SPTE, zapping at 1GiB granularity can result in multi-millisecond lag when
attempting to schedule in a high priority task.
- Rework TDP MMU root unload, free, and alloc to run with mmu_lock held for
read, e.g. to avoid serializing vCPUs when userspace deletes a memslot.
- Allocate write-tracking metadata on-demand to avoid the memory overhead when
running kernels built with KVMGT support (external write-tracking enabled),
but for workloads that don't use nested virtualization (shadow paging) or
KVMGT.
This commit is contained in:
commit
41ebae2ecd
|
|
@ -1468,6 +1468,15 @@ struct kvm_arch {
|
|||
*/
|
||||
bool shadow_root_allocated;
|
||||
|
||||
#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
|
||||
/*
|
||||
* If set, the VM has (or had) an external write tracking user, and
|
||||
* thus all write tracking metadata has been allocated, even if KVM
|
||||
* itself isn't using write tracking.
|
||||
*/
|
||||
bool external_write_tracking_enabled;
|
||||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_HYPERV)
|
||||
hpa_t hv_root_tdp;
|
||||
spinlock_t hv_root_tdp_lock;
|
||||
|
|
|
|||
|
|
@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
|
|||
if (WARN_ON_ONCE(!sp))
|
||||
return;
|
||||
|
||||
if (is_tdp_mmu_page(sp))
|
||||
if (is_tdp_mmu_page(sp)) {
|
||||
lockdep_assert_held_read(&kvm->mmu_lock);
|
||||
kvm_tdp_mmu_put_root(kvm, sp);
|
||||
else if (!--sp->root_count && sp->role.invalid)
|
||||
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
|
||||
} else {
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
if (!--sp->root_count && sp->role.invalid)
|
||||
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
|
||||
}
|
||||
|
||||
*root_hpa = INVALID_PAGE;
|
||||
}
|
||||
|
|
@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
|
|||
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
|
||||
ulong roots_to_free)
|
||||
{
|
||||
bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
|
||||
int i;
|
||||
LIST_HEAD(invalid_list);
|
||||
bool free_active_root;
|
||||
|
|
@ -3609,7 +3614,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
|
|||
return;
|
||||
}
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
if (is_tdp_mmu)
|
||||
read_lock(&kvm->mmu_lock);
|
||||
else
|
||||
write_lock(&kvm->mmu_lock);
|
||||
|
||||
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
|
||||
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
|
||||
|
|
@ -3635,8 +3643,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
|
|||
mmu->root.pgd = 0;
|
||||
}
|
||||
|
||||
kvm_mmu_commit_zap_page(kvm, &invalid_list);
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
if (is_tdp_mmu) {
|
||||
read_unlock(&kvm->mmu_lock);
|
||||
WARN_ON_ONCE(!list_empty(&invalid_list));
|
||||
} else {
|
||||
kvm_mmu_commit_zap_page(kvm, &invalid_list);
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
|
||||
|
||||
|
|
@ -3693,15 +3706,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
|
|||
unsigned i;
|
||||
int r;
|
||||
|
||||
if (tdp_mmu_enabled)
|
||||
return kvm_tdp_mmu_alloc_root(vcpu);
|
||||
|
||||
write_lock(&vcpu->kvm->mmu_lock);
|
||||
r = make_mmu_pages_available(vcpu);
|
||||
if (r < 0)
|
||||
goto out_unlock;
|
||||
|
||||
if (tdp_mmu_enabled) {
|
||||
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
|
||||
mmu->root.hpa = root;
|
||||
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
|
||||
if (shadow_root_level >= PT64_ROOT_4LEVEL) {
|
||||
root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
|
||||
mmu->root.hpa = root;
|
||||
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
|
||||
|
|
@ -6997,9 +7010,7 @@ int kvm_mmu_vendor_module_init(void)
|
|||
|
||||
kvm_mmu_reset_all_pte_masks();
|
||||
|
||||
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
|
||||
sizeof(struct pte_list_desc),
|
||||
0, SLAB_ACCOUNT, NULL);
|
||||
pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
|
||||
if (!pte_list_desc_cache)
|
||||
goto out;
|
||||
|
||||
|
|
|
|||
|
|
@ -20,10 +20,23 @@
|
|||
#include "mmu_internal.h"
|
||||
#include "page_track.h"
|
||||
|
||||
static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
|
||||
{
|
||||
#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
|
||||
/*
|
||||
* Read external_write_tracking_enabled before related pointers. Pairs
|
||||
* with the smp_store_release in kvm_page_track_write_tracking_enable().
|
||||
*/
|
||||
return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
|
||||
{
|
||||
return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
|
||||
!tdp_enabled || kvm_shadow_root_allocated(kvm);
|
||||
return kvm_external_write_tracking_enabled(kvm) ||
|
||||
kvm_shadow_root_allocated(kvm) || !tdp_enabled;
|
||||
}
|
||||
|
||||
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
|
||||
|
|
@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm)
|
|||
return init_srcu_struct(&head->track_srcu);
|
||||
}
|
||||
|
||||
static int kvm_enable_external_write_tracking(struct kvm *kvm)
|
||||
{
|
||||
struct kvm_memslots *slots;
|
||||
struct kvm_memory_slot *slot;
|
||||
int r = 0, i, bkt;
|
||||
|
||||
mutex_lock(&kvm->slots_arch_lock);
|
||||
|
||||
/*
|
||||
* Check for *any* write tracking user (not just external users) under
|
||||
* lock. This avoids unnecessary work, e.g. if KVM itself is using
|
||||
* write tracking, or if two external users raced when registering.
|
||||
*/
|
||||
if (kvm_page_track_write_tracking_enabled(kvm))
|
||||
goto out_success;
|
||||
|
||||
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
|
||||
slots = __kvm_memslots(kvm, i);
|
||||
kvm_for_each_memslot(slot, bkt, slots) {
|
||||
/*
|
||||
* Intentionally do NOT free allocations on failure to
|
||||
* avoid having to track which allocations were made
|
||||
* now versus when the memslot was created. The
|
||||
* metadata is guaranteed to be freed when the slot is
|
||||
* freed, and will be kept/used if userspace retries
|
||||
* the failed ioctl() instead of killing the VM.
|
||||
*/
|
||||
r = kvm_page_track_write_tracking_alloc(slot);
|
||||
if (r)
|
||||
goto out_unlock;
|
||||
}
|
||||
}
|
||||
|
||||
out_success:
|
||||
/*
|
||||
* Ensure that external_write_tracking_enabled becomes true strictly
|
||||
* after all the related pointers are set.
|
||||
*/
|
||||
smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
|
||||
out_unlock:
|
||||
mutex_unlock(&kvm->slots_arch_lock);
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
|
||||
* register the notifier so that event interception for the tracked guest
|
||||
* pages can be received.
|
||||
|
|
@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm,
|
|||
struct kvm_page_track_notifier_node *n)
|
||||
{
|
||||
struct kvm_page_track_notifier_head *head;
|
||||
int r;
|
||||
|
||||
if (!kvm || kvm->mm != current->mm)
|
||||
return -ESRCH;
|
||||
|
||||
if (!kvm_external_write_tracking_enabled(kvm)) {
|
||||
r = kvm_enable_external_write_tracking(kvm);
|
||||
if (r)
|
||||
return r;
|
||||
}
|
||||
|
||||
kvm_get_kvm(kvm);
|
||||
|
||||
head = &kvm->arch.track_notifier_head;
|
||||
|
|
|
|||
|
|
@ -149,11 +149,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
|
|||
* If shared is set, this function is operating under the MMU lock in read
|
||||
* mode.
|
||||
*/
|
||||
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
|
||||
for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
|
||||
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
|
||||
_root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
|
||||
if (kvm_mmu_page_as_id(_root) != _as_id) { \
|
||||
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
|
||||
for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
|
||||
({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
|
||||
_root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
|
||||
if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
|
||||
} else
|
||||
|
||||
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
|
||||
|
|
@ -171,12 +171,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
|
|||
* Holding mmu_lock for write obviates the need for RCU protection as the list
|
||||
* is guaranteed to be stable.
|
||||
*/
|
||||
#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
|
||||
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
|
||||
if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
|
||||
kvm_mmu_page_as_id(_root) != _as_id) { \
|
||||
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \
|
||||
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
|
||||
if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
|
||||
((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
|
||||
((_only_valid) && (_root)->role.invalid))) { \
|
||||
} else
|
||||
|
||||
#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
|
||||
__for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
|
||||
|
||||
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
|
||||
__for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
|
||||
|
||||
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct kvm_mmu_page *sp;
|
||||
|
|
@ -216,22 +223,41 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
|
|||
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
|
||||
}
|
||||
|
||||
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
|
||||
int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
|
||||
struct kvm_mmu *mmu = vcpu->arch.mmu;
|
||||
union kvm_mmu_page_role role = mmu->root_role;
|
||||
int as_id = kvm_mmu_role_as_id(role);
|
||||
struct kvm *kvm = vcpu->kvm;
|
||||
struct kvm_mmu_page *root;
|
||||
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
/*
|
||||
* Check for an existing root before acquiring the pages lock to avoid
|
||||
* unnecessary serialization if multiple vCPUs are loading a new root.
|
||||
* E.g. when bringing up secondary vCPUs, KVM will already have created
|
||||
* a valid root on behalf of the primary vCPU.
|
||||
*/
|
||||
read_lock(&kvm->mmu_lock);
|
||||
|
||||
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
|
||||
if (root->role.word == role.word)
|
||||
goto out_read_unlock;
|
||||
}
|
||||
|
||||
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
|
||||
/*
|
||||
* Check for an existing root before allocating a new one. Note, the
|
||||
* role check prevents consuming an invalid root.
|
||||
* Recheck for an existing root after acquiring the pages lock, another
|
||||
* vCPU may have raced ahead and created a new usable root. Manually
|
||||
* walk the list of roots as the standard macros assume that the pages
|
||||
* lock is *not* held. WARN if grabbing a reference to a usable root
|
||||
* fails, as the last reference to a root can only be put *after* the
|
||||
* root has been invalidated, which requires holding mmu_lock for write.
|
||||
*/
|
||||
for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
|
||||
list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
|
||||
if (root->role.word == role.word &&
|
||||
kvm_tdp_mmu_get_root(root))
|
||||
goto out;
|
||||
!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
|
||||
goto out_spin_unlock;
|
||||
}
|
||||
|
||||
root = tdp_mmu_alloc_sp(vcpu);
|
||||
|
|
@ -245,13 +271,20 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
|
|||
* is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
|
||||
*/
|
||||
refcount_set(&root->tdp_mmu_root_count, 2);
|
||||
|
||||
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
|
||||
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
|
||||
out:
|
||||
return __pa(root->spt);
|
||||
out_spin_unlock:
|
||||
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
|
||||
out_read_unlock:
|
||||
read_unlock(&kvm->mmu_lock);
|
||||
/*
|
||||
* Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
|
||||
* and actually consuming the root if it's invalidated after dropping
|
||||
* mmu_lock, and the root can't be freed as this vCPU holds a reference.
|
||||
*/
|
||||
mmu->root.hpa = __pa(root->spt);
|
||||
mmu->root.pgd = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
|
||||
|
|
@ -734,15 +767,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
|
|||
rcu_read_lock();
|
||||
|
||||
/*
|
||||
* To avoid RCU stalls due to recursively removing huge swaths of SPs,
|
||||
* split the zap into two passes. On the first pass, zap at the 1gb
|
||||
* level, and then zap top-level SPs on the second pass. "1gb" is not
|
||||
* arbitrary, as KVM must be able to zap a 1gb shadow page without
|
||||
* inducing a stall to allow in-place replacement with a 1gb hugepage.
|
||||
* Zap roots in multiple passes of decreasing granularity, i.e. zap at
|
||||
* 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
|
||||
* preempt models) or mmu_lock contention (full or real-time models).
|
||||
* Zapping at finer granularity marginally increases the total time of
|
||||
* the zap, but in most cases the zap itself isn't latency sensitive.
|
||||
*
|
||||
* Because zapping a SP recurses on its children, stepping down to
|
||||
* PG_LEVEL_4K in the iterator itself is unnecessary.
|
||||
* If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
|
||||
* in order to mimic the page fault path, which can replace a 1GiB page
|
||||
* table with an equivalent 1GiB hugepage, i.e. can get saddled with
|
||||
* zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
|
||||
* allows verifying that KVM can safely zap 1GiB regions, e.g. without
|
||||
* inducing RCU stalls, without relying on a relatively rare event
|
||||
* (zapping roots is orders of magnitude more common). Note, because
|
||||
* zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
|
||||
* in the iterator itself is unnecessary.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
|
||||
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
|
||||
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
|
||||
}
|
||||
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
|
||||
__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
|
||||
|
||||
|
|
@ -800,7 +844,13 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
|
|||
continue;
|
||||
|
||||
tdp_mmu_iter_set_spte(kvm, &iter, 0);
|
||||
flush = true;
|
||||
|
||||
/*
|
||||
* Zapping SPTEs in invalid roots doesn't require a TLB flush,
|
||||
* see kvm_tdp_mmu_zap_invalidated_roots() for details.
|
||||
*/
|
||||
if (!root->role.invalid)
|
||||
flush = true;
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
|
@ -813,16 +863,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
|
|||
}
|
||||
|
||||
/*
|
||||
* Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
|
||||
* true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
|
||||
* more SPTEs were zapped since the MMU lock was last acquired.
|
||||
* Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
|
||||
* Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
|
||||
* one or more SPTEs were zapped since the MMU lock was last acquired.
|
||||
*/
|
||||
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
|
||||
{
|
||||
struct kvm_mmu_page *root;
|
||||
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
for_each_tdp_mmu_root_yield_safe(kvm, root)
|
||||
for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
|
||||
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
|
||||
|
||||
return flush;
|
||||
|
|
@ -896,7 +946,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
|
|||
* the VM is being destroyed).
|
||||
*
|
||||
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
|
||||
* See kvm_tdp_mmu_get_vcpu_root_hpa().
|
||||
* See kvm_tdp_mmu_alloc_root().
|
||||
*/
|
||||
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
|
||||
{
|
||||
|
|
@ -1622,7 +1672,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
|
|||
{
|
||||
struct kvm_mmu_page *root;
|
||||
|
||||
for_each_tdp_mmu_root(kvm, root, slot->as_id)
|
||||
for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
|
||||
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
|
||||
}
|
||||
|
||||
|
|
@ -1740,7 +1790,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
|
|||
bool spte_set = false;
|
||||
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
for_each_tdp_mmu_root(kvm, root, slot->as_id)
|
||||
for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
|
||||
spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
|
||||
|
||||
return spte_set;
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@
|
|||
void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
|
||||
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
|
||||
|
||||
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
|
||||
int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);
|
||||
|
||||
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -8792,31 +8792,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
|
|||
|
||||
kvm_release_pfn_clean(pfn);
|
||||
|
||||
/* The instructions are well-emulated on direct mmu. */
|
||||
if (vcpu->arch.mmu->root_role.direct) {
|
||||
unsigned int indirect_shadow_pages;
|
||||
|
||||
write_lock(&vcpu->kvm->mmu_lock);
|
||||
indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
|
||||
write_unlock(&vcpu->kvm->mmu_lock);
|
||||
|
||||
if (indirect_shadow_pages)
|
||||
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* if emulation was due to access to shadowed page table
|
||||
* and it failed try to unshadow page and re-enter the
|
||||
* guest to let CPU execute the instruction.
|
||||
* If emulation may have been triggered by a write to a shadowed page
|
||||
* table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
|
||||
* guest to let the CPU re-execute the instruction in the hope that the
|
||||
* CPU can cleanly execute the instruction that KVM failed to emulate.
|
||||
*/
|
||||
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
|
||||
if (vcpu->kvm->arch.indirect_shadow_pages)
|
||||
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
|
||||
|
||||
/*
|
||||
* If the access faults on its page table, it can not
|
||||
* be fixed by unprotecting shadow page and it should
|
||||
* be reported to userspace.
|
||||
* If the failed instruction faulted on an access to page tables that
|
||||
* are used to translate any part of the instruction, KVM can't resolve
|
||||
* the issue by unprotecting the gfn, as zapping the shadow page will
|
||||
* result in the instruction taking a !PRESENT page fault and thus put
|
||||
* the vCPU into an infinite loop of page faults. E.g. KVM will create
|
||||
* a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
|
||||
* then zap the SPTE to unprotect the gfn, and then do it all over
|
||||
* again. Report the error to userspace.
|
||||
*/
|
||||
return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user