mirror of
https://github.com/torvalds/linux.git
synced 2026-05-25 15:41:52 +02:00
KVM: x86/mmu: Add support for lockless walks of rmap SPTEs
Add a lockless version of for_each_rmap_spte(), which is pretty much the same as the normal version, except that it doesn't BUG() the host if a non-present SPTE is encountered. When mmu_lock is held, it should be impossible for a different task to zap a SPTE, _and_ zapped SPTEs must be removed from their rmap chain prior to dropping mmu_lock. Thus, the normal walker BUG()s if a non-present SPTE is encountered as something is wildly broken. When walking rmaps without holding mmu_lock, the SPTEs pointed at by the rmap chain can be zapped/dropped, and so a lockless walk can observe a non-present SPTE if it runs concurrently with a different operation that is zapping SPTEs. Signed-off-by: James Houghton <jthoughton@google.com> Link: https://lore.kernel.org/r/20250204004038.1680123-11-jthoughton@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
This commit is contained in:
parent
4834eaded9
commit
bb6c7749cc
|
|
@ -876,7 +876,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
|
|||
*/
|
||||
#define KVM_RMAP_LOCKED BIT(1)
|
||||
|
||||
static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
|
||||
static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
|
||||
{
|
||||
unsigned long old_val, new_val;
|
||||
|
||||
|
|
@ -914,7 +914,7 @@ static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
|
|||
/*
|
||||
* Use try_cmpxchg_acquire() to prevent reads and writes to the rmap
|
||||
* from being reordered outside of the critical section created by
|
||||
* kvm_rmap_lock().
|
||||
* __kvm_rmap_lock().
|
||||
*
|
||||
* Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
|
||||
*
|
||||
|
|
@ -923,21 +923,42 @@ static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
|
|||
*/
|
||||
} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
|
||||
|
||||
/* Return the old value, i.e. _without_ the LOCKED bit set. */
|
||||
/*
|
||||
* Return the old value, i.e. _without_ the LOCKED bit set. It's
|
||||
* impossible for the return value to be 0 (see above), i.e. the read-
|
||||
* only unlock flow can't get a false positive and fail to unlock.
|
||||
*/
|
||||
return old_val;
|
||||
}
|
||||
|
||||
static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
|
||||
unsigned long new_val)
|
||||
static unsigned long kvm_rmap_lock(struct kvm *kvm,
|
||||
struct kvm_rmap_head *rmap_head)
|
||||
{
|
||||
WARN_ON_ONCE(new_val & KVM_RMAP_LOCKED);
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
|
||||
return __kvm_rmap_lock(rmap_head);
|
||||
}
|
||||
|
||||
static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
|
||||
unsigned long val)
|
||||
{
|
||||
KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED);
|
||||
/*
|
||||
* Ensure that all accesses to the rmap have completed before unlocking
|
||||
* the rmap.
|
||||
*
|
||||
* Pairs with the atomic_long_try_cmpxchg_acquire() in kvm_rmap_lock.
|
||||
* Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
|
||||
*/
|
||||
atomic_long_set_release(&rmap_head->val, new_val);
|
||||
atomic_long_set_release(&rmap_head->val, val);
|
||||
}
|
||||
|
||||
static void kvm_rmap_unlock(struct kvm *kvm,
|
||||
struct kvm_rmap_head *rmap_head,
|
||||
unsigned long new_val)
|
||||
{
|
||||
lockdep_assert_held_write(&kvm->mmu_lock);
|
||||
|
||||
__kvm_rmap_unlock(rmap_head, new_val);
|
||||
}
|
||||
|
||||
static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
|
||||
|
|
@ -945,17 +966,49 @@ static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
|
|||
return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
|
||||
}
|
||||
|
||||
/*
|
||||
* If mmu_lock isn't held, rmaps can only be locked in read-only mode. The
|
||||
* actual locking is the same, but the caller is disallowed from modifying the
|
||||
* rmap, and so the unlock flow is a nop if the rmap is/was empty.
|
||||
*/
|
||||
__maybe_unused
|
||||
static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
|
||||
{
|
||||
unsigned long rmap_val;
|
||||
|
||||
preempt_disable();
|
||||
rmap_val = __kvm_rmap_lock(rmap_head);
|
||||
|
||||
if (!rmap_val)
|
||||
preempt_enable();
|
||||
|
||||
return rmap_val;
|
||||
}
|
||||
|
||||
__maybe_unused
|
||||
static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head,
|
||||
unsigned long old_val)
|
||||
{
|
||||
if (!old_val)
|
||||
return;
|
||||
|
||||
KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head));
|
||||
|
||||
__kvm_rmap_unlock(rmap_head, old_val);
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the number of pointers in the rmap chain, not counting the new one.
|
||||
*/
|
||||
static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
|
||||
struct kvm_rmap_head *rmap_head)
|
||||
static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
|
||||
u64 *spte, struct kvm_rmap_head *rmap_head)
|
||||
{
|
||||
unsigned long old_val, new_val;
|
||||
struct pte_list_desc *desc;
|
||||
int count = 0;
|
||||
|
||||
old_val = kvm_rmap_lock(rmap_head);
|
||||
old_val = kvm_rmap_lock(kvm, rmap_head);
|
||||
|
||||
if (!old_val) {
|
||||
new_val = (unsigned long)spte;
|
||||
|
|
@ -987,7 +1040,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
|
|||
desc->sptes[desc->spte_count++] = spte;
|
||||
}
|
||||
|
||||
kvm_rmap_unlock(rmap_head, new_val);
|
||||
kvm_rmap_unlock(kvm, rmap_head, new_val);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
|
@ -1035,7 +1088,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
|
|||
unsigned long rmap_val;
|
||||
int i;
|
||||
|
||||
rmap_val = kvm_rmap_lock(rmap_head);
|
||||
rmap_val = kvm_rmap_lock(kvm, rmap_head);
|
||||
if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
|
||||
goto out;
|
||||
|
||||
|
|
@ -1061,7 +1114,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
|
|||
}
|
||||
|
||||
out:
|
||||
kvm_rmap_unlock(rmap_head, rmap_val);
|
||||
kvm_rmap_unlock(kvm, rmap_head, rmap_val);
|
||||
}
|
||||
|
||||
static void kvm_zap_one_rmap_spte(struct kvm *kvm,
|
||||
|
|
@ -1079,7 +1132,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
|
|||
unsigned long rmap_val;
|
||||
int i;
|
||||
|
||||
rmap_val = kvm_rmap_lock(rmap_head);
|
||||
rmap_val = kvm_rmap_lock(kvm, rmap_head);
|
||||
if (!rmap_val)
|
||||
return false;
|
||||
|
||||
|
|
@ -1098,7 +1151,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
|
|||
}
|
||||
out:
|
||||
/* rmap_head is meaningless now, remember to reset it */
|
||||
kvm_rmap_unlock(rmap_head, 0);
|
||||
kvm_rmap_unlock(kvm, rmap_head, 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -1171,23 +1224,18 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
|
|||
struct rmap_iterator *iter)
|
||||
{
|
||||
unsigned long rmap_val = kvm_rmap_get(rmap_head);
|
||||
u64 *sptep;
|
||||
|
||||
if (!rmap_val)
|
||||
return NULL;
|
||||
|
||||
if (!(rmap_val & KVM_RMAP_MANY)) {
|
||||
iter->desc = NULL;
|
||||
sptep = (u64 *)rmap_val;
|
||||
goto out;
|
||||
return (u64 *)rmap_val;
|
||||
}
|
||||
|
||||
iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
|
||||
iter->pos = 0;
|
||||
sptep = iter->desc->sptes[iter->pos];
|
||||
out:
|
||||
BUG_ON(!is_shadow_present_pte(*sptep));
|
||||
return sptep;
|
||||
return iter->desc->sptes[iter->pos];
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -1197,14 +1245,11 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
|
|||
*/
|
||||
static u64 *rmap_get_next(struct rmap_iterator *iter)
|
||||
{
|
||||
u64 *sptep;
|
||||
|
||||
if (iter->desc) {
|
||||
if (iter->pos < PTE_LIST_EXT - 1) {
|
||||
++iter->pos;
|
||||
sptep = iter->desc->sptes[iter->pos];
|
||||
if (sptep)
|
||||
goto out;
|
||||
if (iter->desc->sptes[iter->pos])
|
||||
return iter->desc->sptes[iter->pos];
|
||||
}
|
||||
|
||||
iter->desc = iter->desc->more;
|
||||
|
|
@ -1212,20 +1257,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
|
|||
if (iter->desc) {
|
||||
iter->pos = 0;
|
||||
/* desc->sptes[0] cannot be NULL */
|
||||
sptep = iter->desc->sptes[iter->pos];
|
||||
goto out;
|
||||
return iter->desc->sptes[iter->pos];
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
out:
|
||||
BUG_ON(!is_shadow_present_pte(*sptep));
|
||||
return sptep;
|
||||
}
|
||||
|
||||
#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
|
||||
for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
|
||||
_spte_; _spte_ = rmap_get_next(_iter_))
|
||||
#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
|
||||
for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \
|
||||
_sptep_; _sptep_ = rmap_get_next(_iter_))
|
||||
|
||||
#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
|
||||
__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
|
||||
if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \
|
||||
|
||||
#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \
|
||||
__for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \
|
||||
if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep)))
|
||||
|
||||
static void drop_spte(struct kvm *kvm, u64 *sptep)
|
||||
{
|
||||
|
|
@ -1311,12 +1360,13 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
|
|||
struct rmap_iterator iter;
|
||||
bool flush = false;
|
||||
|
||||
for_each_rmap_spte(rmap_head, &iter, sptep)
|
||||
for_each_rmap_spte(rmap_head, &iter, sptep) {
|
||||
if (spte_ad_need_write_protect(*sptep))
|
||||
flush |= test_and_clear_bit(PT_WRITABLE_SHIFT,
|
||||
(unsigned long *)sptep);
|
||||
else
|
||||
flush |= spte_clear_dirty(sptep);
|
||||
}
|
||||
|
||||
return flush;
|
||||
}
|
||||
|
|
@ -1637,7 +1687,7 @@ static void __rmap_add(struct kvm *kvm,
|
|||
kvm_update_page_stats(kvm, sp->role.level, 1);
|
||||
|
||||
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
|
||||
rmap_count = pte_list_add(cache, spte, rmap_head);
|
||||
rmap_count = pte_list_add(kvm, cache, spte, rmap_head);
|
||||
|
||||
if (rmap_count > kvm->stat.max_mmu_rmap_size)
|
||||
kvm->stat.max_mmu_rmap_size = rmap_count;
|
||||
|
|
@ -1771,13 +1821,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
|
|||
return hash_64(gfn, KVM_MMU_HASH_SHIFT);
|
||||
}
|
||||
|
||||
static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
|
||||
static void mmu_page_add_parent_pte(struct kvm *kvm,
|
||||
struct kvm_mmu_memory_cache *cache,
|
||||
struct kvm_mmu_page *sp, u64 *parent_pte)
|
||||
{
|
||||
if (!parent_pte)
|
||||
return;
|
||||
|
||||
pte_list_add(cache, parent_pte, &sp->parent_ptes);
|
||||
pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes);
|
||||
}
|
||||
|
||||
static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
|
||||
|
|
@ -2467,7 +2518,7 @@ static void __link_shadow_page(struct kvm *kvm,
|
|||
|
||||
mmu_spte_set(sptep, spte);
|
||||
|
||||
mmu_page_add_parent_pte(cache, sp, sptep);
|
||||
mmu_page_add_parent_pte(kvm, cache, sp, sptep);
|
||||
|
||||
/*
|
||||
* The non-direct sub-pagetable must be updated before linking. For
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user