KVM: x86/mmu: Age TDP MMU SPTEs without holding mmu_lock

Walk the TDP MMU in an RCU read-side critical section without holding
mmu_lock when harvesting and potentially updating age information on
TDP MMU SPTEs.  Add a new macro to do RCU-safe walking of TDP MMU roots,
and do all SPTE aging with atomic updates; while clobbering Accessed
information is ok, KVM must not corrupt other bits, e.g. must not drop
a Dirty or Writable bit when making a SPTE young.

If updating a SPTE to mark it for access tracking fails, leave it as is
and treat it as if it were young.  If the spte is being actively modified,
it is most likely young.

Acquire and release mmu_lock for write when harvesting age information
from the shadow MMU, as the shadow MMU doesn't yet support aging outside
of mmu_lock.

Suggested-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
Link: https://lore.kernel.org/r/20250204004038.1680123-5-jthoughton@google.com
[sean: massage changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
This commit is contained in:
Sean Christopherson 2025-02-12 12:30:12 -08:00
parent 928c54b1c4
commit b146a9b34a
4 changed files with 35 additions and 13 deletions

View File

@ -1478,6 +1478,7 @@ struct kvm_arch {
* tdp_mmu_page set.
*
* For reads, this list is protected by:
* RCU alone or
* the MMU lock in read mode + RCU or
* the MMU lock in write mode
*

View File

@ -22,6 +22,7 @@ config KVM_X86
select KVM_COMMON
select KVM_GENERIC_MMU_NOTIFIER
select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
select KVM_MMU_LOCKLESS_AGING
select HAVE_KVM_IRQCHIP
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO

View File

@ -1592,8 +1592,11 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
young = kvm_rmap_age_gfn_range(kvm, range, false);
write_unlock(&kvm->mmu_lock);
}
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@ -1605,8 +1608,11 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
young = kvm_rmap_age_gfn_range(kvm, range, true);
write_unlock(&kvm->mmu_lock);
}
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);

View File

@ -193,6 +193,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
!tdp_mmu_root_match((_root), (_types)))) { \
} else
/*
* Iterate over all TDP MMU roots in an RCU read-side critical section.
* It is safe to iterate over the SPTEs under the root, but their values will
* be unstable, so all writes must be atomic. As this routine is meant to be
* used without holding the mmu_lock at all, any bits that are flipped must
* be reflected in kvm_tdp_mmu_spte_need_atomic_write().
*/
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \
list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \
if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
!tdp_mmu_root_match((_root), (_types))) { \
} else
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
@ -1332,21 +1345,22 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
* from the clear_young() or clear_flush_young() notifier, which uses the
* return value to determine if the page has been accessed.
*/
static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
u64 new_spte;
if (spte_ad_enabled(iter->old_spte)) {
iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
iter->old_spte,
shadow_accessed_mask,
iter->level);
iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
shadow_accessed_mask);
new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
new_spte = mark_spte_for_access_track(iter->old_spte);
iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
iter->old_spte, new_spte,
iter->level);
/*
* It is safe for the following cmpxchg to fail. Leave the
* Accessed bit set, as the spte is most likely young anyway.
*/
if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
return;
}
trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
@ -1371,9 +1385,9 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
* valid roots!
*/
WARN_ON(types & ~KVM_VALID_ROOTS);
__for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
guard(rcu)();
guard(rcu)();
for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
if (!is_accessed_spte(iter.old_spte))
continue;
@ -1382,7 +1396,7 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
return true;
ret = true;
kvm_tdp_mmu_age_spte(&iter);
kvm_tdp_mmu_age_spte(kvm, &iter);
}
}