Merge tag 'kvm-x86-mmu-6.17' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MMU changes for 6.17

 - Exempt nested EPT from the the !USER + CR0.WP logic, as EPT doesn't interact
   with CR0.WP.

 - Move the TDX hardware setup code to tdx.c to better co-locate TDX code
   and eliminate a few global symbols.

 - Dynamically allocation the shadow MMU's hashed page list, and defer
   allocating the hashed list until it's actually needed (the TDP MMU doesn't
   use the list).
This commit is contained in:
Paolo Bonzini 2025-07-28 11:14:40 -04:00
commit d7f4aac280
11 changed files with 145 additions and 69 deletions

View File

@ -1358,7 +1358,7 @@ struct kvm_arch {
bool has_private_mem;
bool has_protected_state;
bool pre_fault_allowed;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct hlist_head *mmu_page_hash;
struct list_head active_mmu_pages;
/*
* A list of kvm_mmu_page structs that, if zapped, could possibly be
@ -1985,7 +1985,7 @@ void kvm_x86_vendor_exit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT);
}
#define __KVM_HAVE_ARCH_VM_FREE
@ -2030,7 +2030,7 @@ void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
int kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,

View File

@ -1983,14 +1983,35 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
return true;
}
static __ro_after_init HLIST_HEAD(empty_page_hash);
static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
{
/*
* Ensure the load of the hash table pointer itself is ordered before
* loads to walk the table. The pointer is set at runtime outside of
* mmu_lock when the TDP MMU is enabled, i.e. when the hash table of
* shadow pages becomes necessary only when KVM needs to shadow L1's
* TDP for an L2 guest. Pairs with the smp_store_release() in
* kvm_mmu_alloc_page_hash().
*/
struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
lockdep_assert_held(&kvm->mmu_lock);
if (!page_hash)
return &empty_page_hash;
return &page_hash[kvm_page_table_hashfn(gfn)];
}
#define for_each_valid_sp(_kvm, _sp, _list) \
hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
for_each_valid_sp(_kvm, _sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn)) \
if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
@ -2358,6 +2379,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
struct kvm_mmu_page *sp;
bool created = false;
/*
* No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
* mmu_page_hash must be set prior to creating the first shadow root,
* i.e. reaching this point is fully serialized by slots_arch_lock.
*/
BUG_ON(!kvm->arch.mmu_page_hash);
sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
@ -3882,6 +3909,28 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
return r;
}
static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
{
struct hlist_head *h;
if (kvm->arch.mmu_page_hash)
return 0;
h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT);
if (!h)
return -ENOMEM;
/*
* Ensure the hash table pointer is set only after all stores to zero
* the memory are retired. Pairs with the smp_load_acquire() in
* kvm_get_mmu_page_hash(). Note, mmu_lock must be held for write to
* add (or remove) shadow pages, and so readers are guaranteed to see
* an empty list for their current mmu_lock critical section.
*/
smp_store_release(&kvm->arch.mmu_page_hash, h);
return 0;
}
static int mmu_first_shadow_root_alloc(struct kvm *kvm)
{
struct kvm_memslots *slots;
@ -3901,9 +3950,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
if (kvm_shadow_root_allocated(kvm))
goto out_unlock;
r = kvm_mmu_alloc_page_hash(kvm);
if (r)
goto out_unlock;
/*
* Check if anything actually needs to be allocated, e.g. all metadata
* will be allocated upfront if TDP is disabled.
* Check if memslot metadata actually needs to be allocated, e.g. all
* metadata will be allocated upfront if TDP is disabled.
*/
if (kvm_memslots_have_rmaps(kvm) &&
kvm_page_track_write_tracking_enabled(kvm))
@ -6682,15 +6735,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
void kvm_mmu_init_vm(struct kvm *kvm)
int kvm_mmu_init_vm(struct kvm *kvm)
{
int r;
kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
if (tdp_mmu_enabled)
if (tdp_mmu_enabled) {
kvm_mmu_init_tdp_mmu(kvm);
} else {
r = kvm_mmu_alloc_page_hash(kvm);
if (r)
return r;
}
kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@ -6699,6 +6759,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
return 0;
}
static void mmu_free_vm_memory_caches(struct kvm *kvm)
@ -6710,6 +6771,8 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
void kvm_mmu_uninit_vm(struct kvm *kvm)
{
kvfree(kvm->arch.mmu_page_hash);
if (tdp_mmu_enabled)
kvm_mmu_uninit_tdp_mmu(kvm);

View File

@ -804,9 +804,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r != RET_PF_CONTINUE)
return r;
#if PTTYPE != PTTYPE_EPT
/*
* Do not change pte_access if the pfn is a mmio page, otherwise
* we will cache the incorrect access into mmio spte.
* Treat the guest PTE protections as writable, supervisor-only if this
* is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
* PTE.W if CR0.WP=0). Don't change the access type for emulated MMIO,
* otherwise KVM will cache incorrect access information in the SPTE.
*/
if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
!is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
@ -822,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (is_cr4_smep(vcpu->arch.mmu))
walker.pte_access &= ~ACC_EXEC_MASK;
}
#endif
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);

View File

@ -5494,6 +5494,8 @@ static int __init svm_init(void)
{
int r;
KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);
__unused_size_checks();
if (!kvm_is_svm_supported())

View File

@ -29,40 +29,8 @@ static __init int vt_hardware_setup(void)
if (ret)
return ret;
/*
* Update vt_x86_ops::vm_size here so it is ready before
* kvm_ops_update() is called in kvm_x86_vendor_init().
*
* Note, the actual bringing up of TDX must be done after
* kvm_ops_update() because enabling TDX requires enabling
* hardware virtualization first, i.e., all online CPUs must
* be in post-VMXON state. This means the @vm_size here
* may be updated to TDX's size but TDX may fail to enable
* at later time.
*
* The VMX/VT code could update kvm_x86_ops::vm_size again
* after bringing up TDX, but this would require exporting
* either kvm_x86_ops or kvm_ops_update() from the base KVM
* module, which looks overkill. Anyway, the worst case here
* is KVM may allocate couple of more bytes than needed for
* each VM.
*/
if (enable_tdx) {
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
sizeof(struct kvm_tdx));
/*
* Note, TDX may fail to initialize in a later time in
* vt_init(), in which case it is not necessary to setup
* those callbacks. But making them valid here even
* when TDX fails to init later is fine because those
* callbacks won't be called if the VM isn't TDX guest.
*/
vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
}
if (enable_tdx)
tdx_hardware_setup();
return 0;
}

View File

@ -743,7 +743,7 @@ bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
!to_tdx(vcpu)->vp_enter_args.r12;
}
bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
u64 vcpu_state_details;
@ -1638,8 +1638,8 @@ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
return 0;
}
int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn)
static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
struct page *page = pfn_to_page(pfn);
@ -1719,8 +1719,8 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
return 0;
}
int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
enum pg_level level, void *private_spt)
static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
enum pg_level level, void *private_spt)
{
int tdx_level = pg_level_to_tdx_sept_level(level);
gpa_t gpa = gfn_to_gpa(gfn);
@ -1855,8 +1855,8 @@ static void tdx_track(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}
int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
enum pg_level level, void *private_spt)
static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
enum pg_level level, void *private_spt)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@ -1878,8 +1878,8 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
return tdx_reclaim_page(virt_to_page(private_spt));
}
int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn)
static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn)
{
struct page *page = pfn_to_page(pfn);
int ret;
@ -3603,10 +3603,14 @@ int __init tdx_bringup(void)
r = __tdx_bringup();
if (r) {
/*
* Disable TDX only but don't fail to load module if
* the TDX module could not be loaded. No need to print
* message saying "module is not loaded" because it was
* printed when the first SEAMCALL failed.
* Disable TDX only but don't fail to load module if the TDX
* module could not be loaded. No need to print message saying
* "module is not loaded" because it was printed when the first
* SEAMCALL failed. Don't bother unwinding the S-EPT hooks or
* vm_size, as kvm_x86_ops have already been finalized (and are
* intentionally not exported). The S-EPT code is unreachable,
* and allocating a few more bytes per VM in a should-be-rare
* failure scenario is a non-issue.
*/
if (r == -ENODEV)
goto success_disable_tdx;
@ -3620,3 +3624,20 @@ int __init tdx_bringup(void)
enable_tdx = 0;
return 0;
}
void __init tdx_hardware_setup(void)
{
KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
/*
* Note, if the TDX module can't be loaded, KVM TDX support will be
* disabled but KVM will continue loading (see tdx_bringup()).
*/
vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
}

View File

@ -8,6 +8,7 @@
#ifdef CONFIG_KVM_INTEL_TDX
#include "common.h"
void tdx_hardware_setup(void);
int tdx_bringup(void);
void tdx_cleanup(void);

View File

@ -8552,6 +8552,8 @@ int __init vmx_init(void)
{
int r, cpu;
KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
if (!kvm_is_vmx_supported())
return -EOPNOTSUPP;

View File

@ -136,7 +136,6 @@ int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void tdx_vcpu_put(struct kvm_vcpu *vcpu);
bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu);
int tdx_handle_exit(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion fastpath);
@ -151,15 +150,6 @@ int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
enum pg_level level, void *private_spt);
int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
enum pg_level level, void *private_spt);
int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn);
int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
enum pg_level level, kvm_pfn_t pfn);
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);

View File

@ -12699,7 +12699,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out;
kvm_mmu_init_vm(kvm);
ret = kvm_mmu_init_vm(kvm);
if (ret)
goto out_cleanup_page_track;
ret = kvm_x86_call(vm_init)(kvm);
if (ret)
@ -12745,6 +12747,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
out_uninit_mmu:
kvm_mmu_uninit_vm(kvm);
out_cleanup_page_track:
kvm_page_track_cleanup(kvm);
out:
return ret;

View File

@ -55,6 +55,28 @@ struct kvm_host_values {
void kvm_spurious_fault(void);
#define SIZE_OF_MEMSLOTS_HASHTABLE \
(sizeof(((struct kvm_memslots *)0)->id_hash) * 2 * KVM_MAX_NR_ADDRESS_SPACES)
/* Sanity check the size of the memslot hash tables. */
static_assert(SIZE_OF_MEMSLOTS_HASHTABLE ==
(1024 * (1 + IS_ENABLED(CONFIG_X86_64)) * (1 + IS_ENABLED(CONFIG_KVM_SMM))));
/*
* Assert that "struct kvm_{svm,vmx,tdx}" is an order-0 or order-1 allocation.
* Spilling over to an order-2 allocation isn't fundamentally problematic, but
* isn't expected to happen in the foreseeable future (O(years)). Assert that
* the size is an order-0 allocation when ignoring the memslot hash tables, to
* help detect and debug unexpected size increases.
*/
#define KVM_SANITY_CHECK_VM_STRUCT_SIZE(x) \
do { \
BUILD_BUG_ON(get_order(sizeof(struct x) - SIZE_OF_MEMSLOTS_HASHTABLE) && \
!IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
BUILD_BUG_ON(get_order(sizeof(struct x)) > 1 && \
!IS_ENABLED(CONFIG_DEBUG_KERNEL) && !IS_ENABLED(CONFIG_KASAN)); \
} while (0)
#define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \
({ \
bool failed = (consistency_check); \