From dfcbcd864edc694a8efdd20179dc29cb561207ac Mon Sep 17 00:00:00 2001 From: Ted Chen Date: Fri, 24 Jan 2025 15:50:55 +0800 Subject: [PATCH 01/26] KVM: x86: Remove unused iommu_domain and iommu_noncoherent from kvm_arch Remove the "iommu_domain" and "iommu_noncoherent" fields from struct kvm_arch, which are no longer used since commit ad6260da1e23 ("KVM: x86: drop legacy device assignment"). Signed-off-by: Ted Chen Link: https://lore.kernel.org/r/20250124075055.97158-1-znscnchen@gmail.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b15cde0a9b5c..c4e844b60b0f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1353,8 +1353,6 @@ struct kvm_arch { u64 shadow_mmio_value; - struct iommu_domain *iommu_domain; - bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA atomic_t noncoherent_dma_count; #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE From 4cad9f87876a943d018ad73ec3919215fb756d2d Mon Sep 17 00:00:00 2001 From: Liam Ni Date: Tue, 30 Jul 2024 21:59:41 +0800 Subject: [PATCH 02/26] KVM: x86: Wake vCPU for PIC interrupt injection iff a valid IRQ was found When updating the emulated PIC IRQ status, set "wakeup_needed" if and only if a new interrupt was found, i.e. if the incoming level is non-zero and an IRQ is being raised. The bug is relatively benign, as KVM will signal a spurious wakeup, e.g. set KVM_REQ_EVENT and kick target vCPUs, but KVM will never actually inject a spurious IRQ as kvm_cpu_has_extint() cares only about the "output" field. Fixes: 7049467b5383 ("KVM: remove isr_ack logic from PIC") Signed-off-by: Liam Ni Link: https://lore.kernel.org/r/CACZJ9cX2R_=qgvLdaqbB_DUJhv08c674b67Ln_Qb9yyVwgE16w@mail.gmail.com [sean: reconstruct patch, rewrite changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/i8259.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 8dec646e764b..a8fb19940975 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -567,7 +567,7 @@ static void pic_irq_request(struct kvm *kvm, int level) { struct kvm_pic *s = kvm->arch.vpic; - if (!s->output) + if (!s->output && level) s->wakeup_needed = true; s->output = level; } From 82c470121c7ba970ad866a08a78eae067b523993 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 22 Jan 2025 15:34:56 +0800 Subject: [PATCH 03/26] KVM: x86: Use kvfree_rcu() to free old optimized APIC map Use kvfree_rcu() to free the old optimized APIC instead of open coding a rough equivalent via call_rcu() and a callback function. Note, there is a subtle function change as rcu_barrier() doesn't wait on kvfree_rcu(), but does wait on call_rcu(). Not forcing rcu_barrier() to wait is safe and desirable in this case, as KVM doesn't care when an old map is actually freed. In fact, using kvfree_rcu() fixes a largely theoretical use-after-free. Because KVM _doesn't_ do rcu_barrier() to wait for kvm_apic_map_free() to complete, if KVM-the-module is unloaded in the RCU grace period before kvm_apic_map_free() is invoked, KVM's callback could run after module unload. Signed-off-by: Li RongQing Reviewed-by: Neeraj Upadhyay Link: https://lore.kernel.org/r/20250122073456.2950-1-lirongqing@baidu.com [sean: rework changelog, call out rcu_barrier() interaction] Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a009c94c26c2..0b5e1ce49402 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -221,13 +221,6 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, } } -static void kvm_apic_map_free(struct rcu_head *rcu) -{ - struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - - kvfree(map); -} - static int kvm_recalculate_phys_map(struct kvm_apic_map *new, struct kvm_vcpu *vcpu, bool *xapic_id_mismatch) @@ -489,7 +482,7 @@ static void kvm_recalculate_apic_map(struct kvm *kvm) mutex_unlock(&kvm->arch.apic_map_lock); if (old) - call_rcu(&old->rcu, kvm_apic_map_free); + kvfree_rcu(old, rcu); kvm_make_scan_ioapic_request(kvm); } From c9e5f3fa903961131a832c2593022bc6a5229cf5 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 13 Jan 2025 12:01:43 -0800 Subject: [PATCH 04/26] KVM: x86: Introduce kvm_set_mp_state() Replace all open-coded assignments to vcpu->arch.mp_state with calls to a new helper, kvm_set_mp_state(), to centralize all changes to mp_state. No functional change intended. Signed-off-by: Jim Mattson Link: https://lore.kernel.org/r/20250113200150.487409-2-jmattson@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 6 +++--- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/sev.c | 4 ++-- arch/x86/kvm/vmx/nested.c | 4 ++-- arch/x86/kvm/x86.c | 17 ++++++++--------- arch/x86/kvm/x86.h | 5 +++++ arch/x86/kvm/xen.c | 4 ++-- 7 files changed, 23 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0b5e1ce49402..750bb89b20ac 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -3390,9 +3390,9 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); } if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) { if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { @@ -3401,7 +3401,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) sipi_vector = apic->sipi_vector; kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu, sipi_vector); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } } return 0; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index d77b094d9a4d..c6b79c2e1e05 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -994,7 +994,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* in case we halted in L2 */ - svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); /* Give the current vmcb to the guest */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a2a794c32050..87d2840da6af 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3845,7 +3845,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) /* Mark the vCPU as offline and not runnable */ vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); /* Clear use of the VMSA */ svm->vmcb->control.vmsa_pa = INVALID_PAGE; @@ -3884,7 +3884,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) /* Mark the vCPU as runnable */ vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8a7af02d466e..bca2575837ce 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3771,7 +3771,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) break; case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: break; @@ -5071,7 +5071,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); if (likely(!vmx->fail)) { if (vm_exit_reason != -1) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e77e61d4fbd..3041b8d8b59f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11216,8 +11216,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) case KVM_MP_STATE_HALTED: case KVM_MP_STATE_AP_RESET_HOLD: vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); fallthrough; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; @@ -11296,7 +11295,7 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) if (kvm_vcpu_has_events(vcpu)) vcpu->arch.pv.pv_unhalted = false; else - vcpu->arch.mp_state = state; + kvm_set_mp_state(vcpu, state); return 1; } else { vcpu->run->exit_reason = reason; @@ -11816,10 +11815,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, goto out; if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); } else - vcpu->arch.mp_state = mp_state->mp_state; + kvm_set_mp_state(vcpu, mp_state->mp_state); kvm_make_request(KVM_REQ_EVENT, vcpu); ret = 0; @@ -11946,7 +11945,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); return 0; } @@ -12249,9 +12248,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED); r = kvm_mmu_create(vcpu); if (r < 0) @@ -13469,7 +13468,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, } vcpu->arch.apf.halted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 91e50a513100..34ca87049845 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -121,6 +121,11 @@ static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) return vcpu->arch.last_vmentry_cpu != -1; } +static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) +{ + vcpu->arch.mp_state = mp_state; +} + static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu) { return vcpu->arch.exception.pending || diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index a909b817b9c0..f65ca27888e9 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1480,7 +1480,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); if (sched_poll.timeout) mod_timer(&vcpu->arch.xen.poll_timer, @@ -1491,7 +1491,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, if (sched_poll.timeout) del_timer(&vcpu->arch.xen.poll_timer); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } vcpu->arch.xen.poll_evtchn = 0; From e9cb61055fee5f973984e2b98edd3bbc356f9c89 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 13 Jan 2025 12:01:44 -0800 Subject: [PATCH 05/26] KVM: x86: Clear pv_unhalted on all transitions to KVM_MP_STATE_RUNNABLE In kvm_set_mp_state(), ensure that vcpu->arch.pv.pv_unhalted is always cleared on a transition to KVM_MP_STATE_RUNNABLE, so that the next HLT instruction will be respected. Fixes: 6aef266c6e17 ("kvm hypervisor : Add a hypercall to KVM hypervisor to support pv-ticketlocks") Fixes: b6b8a1451fc4 ("KVM: nVMX: Rework interception of IRQs and NMIs") Fixes: 38c0b192bd6d ("KVM: SVM: leave halted state on vmexit") Fixes: 1a65105a5aba ("KVM: x86/xen: handle PV spinlocks slowpath") Signed-off-by: Jim Mattson Link: https://lore.kernel.org/r/20250113200150.487409-3-jmattson@google.com [sean: add Xen PV spinlocks to the list of Fixes, tweak changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 1 - arch/x86/kvm/x86.c | 1 - arch/x86/kvm/x86.h | 2 ++ 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 87d2840da6af..8bc62e994138 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3883,7 +3883,6 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); /* Mark the vCPU as runnable */ - vcpu->arch.pv.pv_unhalted = false; kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3041b8d8b59f..0aca2a5dac7e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11215,7 +11215,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: case KVM_MP_STATE_AP_RESET_HOLD: - vcpu->arch.pv.pv_unhalted = false; kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); fallthrough; case KVM_MP_STATE_RUNNABLE: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 34ca87049845..5c1fd5230cee 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -124,6 +124,8 @@ static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) { vcpu->arch.mp_state = mp_state; + if (mp_state == KVM_MP_STATE_RUNNABLE) + vcpu->arch.pv.pv_unhalted = false; } static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu) From a11128ce16366d455a6f937e2a2d1aa2a8b2bd45 Mon Sep 17 00:00:00 2001 From: Ethan Zhao Date: Mon, 27 Jan 2025 09:38:37 +0800 Subject: [PATCH 06/26] KVM: x86/cpuid: add type suffix to decimal const 48 fix building warning The default type of a decimal constant is determined by the magnitude of its value. If the value falls within the range of int, its type is int; otherwise, if it falls within the range of unsigned int, its type is unsigned int. This results in the constant 48 being of type int. In the following min call, g_phys_as = min(g_phys_as, 48); This leads to a building warning/error (CONFIG_KVM_WERROR=y) caused by the mismatch between the types of the two arguments to macro min. By adding the suffix U to explicitly declare the type of the constant, this issue is fixed. Signed-off-by: Ethan Zhao Link: https://lore.kernel.org/r/20250127013837.12983-1-haifeng.zhao@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8eb3a88707f2..269ffbd5c2e0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1704,7 +1704,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) phys_as = entry->eax & 0xff; g_phys_as = phys_as; if (kvm_mmu_get_max_tdp_level() < 5) - g_phys_as = min(g_phys_as, 48); + g_phys_as = min(g_phys_as, 48U); } entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16); From aa93b6f96f6486fa1a78b7b92ebe43f49f35d8bf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2024 17:32:59 -0800 Subject: [PATCH 07/26] KVM: x86: Use for-loop to iterate over XSTATE size entries Rework xstate_required_size() to use a for-loop and continue, to make it more obvious that the xstate_sizes[] lookups are indeed correctly bounded, and to make it (hopefully) easier to understand that the loop is iterating over supported XSAVE features. Link: https://lore.kernel.org/r/20241211013302.1347853-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 269ffbd5c2e0..a57fa0123c77 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -58,25 +58,24 @@ void __init kvm_init_xstate_sizes(void) u32 xstate_required_size(u64 xstate_bv, bool compacted) { - int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + int i; xstate_bv &= XFEATURE_MASK_EXTEND; - while (xstate_bv) { - if (xstate_bv & 0x1) { - struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit]; - u32 offset; + for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) { + struct cpuid_xstate_sizes *xs = &xstate_sizes[i]; + u32 offset; - /* ECX[1]: 64B alignment in compacted form */ - if (compacted) - offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; - else - offset = xs->ebx; - ret = max(ret, offset + xs->eax); - } + if (!(xstate_bv & BIT_ULL(i))) + continue; - xstate_bv >>= 1; - feature_bit++; + /* ECX[1]: 64B alignment in compacted form */ + if (compacted) + offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; + else + offset = xs->ebx; + ret = max(ret, offset + xs->eax); + xstate_bv &= ~BIT_ULL(i); } return ret; From 7e9f735e7ac4b95207e108ec67d04a9c4cb9cced Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2024 17:33:00 -0800 Subject: [PATCH 08/26] KVM: x86: Apply TSX_CTRL_CPUID_CLEAR if and only if the vCPU has RTM or HLE When emulating CPUID, retrieve MSR_IA32_TSX_CTRL.TSX_CTRL_CPUID_CLEAR if and only if RTM and/or HLE feature bits need to be cleared. Getting the MSR value is unnecessary if neither bit is set, and avoiding the lookup saves ~80 cycles for vCPUs without RTM or HLE. Cc: Jim Mattson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20241211013302.1347853-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a57fa0123c77..0de3f581ef12 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1999,7 +1999,8 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, *edx = entry->edx; if (function == 7 && index == 0) { u64 data; - if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) && + !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) *ebx &= ~(feature_bit(RTM) | feature_bit(HLE)); } else if (function == 0x80000007) { From a487f6797c881460024ab45f2fea562bc377f02a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2024 17:33:01 -0800 Subject: [PATCH 09/26] KVM: x86: Query X86_FEATURE_MWAIT iff userspace owns the CPUID feature bit Rework MONITOR/MWAIT emulation to query X86_FEATURE_MWAIT if and only if the MISC_ENABLE_NO_MWAIT quirk is enabled, in which case MWAIT is not a dynamic, KVM-controlled CPUID feature. KVM's funky ABI for that quirk is to emulate MONITOR/MWAIT as nops if userspace sets MWAIT in guest CPUID. For the case where KVM owns the MWAIT feature bit, check MISC_ENABLES itself, i.e. check the actual control, not its reflection in guest CPUID. Avoiding consumption of dynamic CPUID features will allow KVM to defer runtime CPUID updates until kvm_emulate_cpuid(), i.e. until the updates become visible to the guest. Alternatively, KVM could play other games with runtime CPUID updates, e.g. by precisely specifying which feature bits to update, but doing so adds non-trivial complexity and doesn't solve the underlying issue of unnecessary updates causing meaningful overhead for nested virtualization roundtrips. Link: https://lore.kernel.org/r/20241211013302.1347853-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0aca2a5dac7e..9883f0565baf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2080,10 +2080,20 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && - !guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT)) + bool enabled; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)) + goto emulate_as_nop; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) + enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT); + else + enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT; + + if (!enabled) return kvm_handle_invalid_op(vcpu); +emulate_as_nop: pr_warn_once("%s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } From 93da6af3ae563b7dbc60d97b5872109822d7bf0a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Dec 2024 17:33:02 -0800 Subject: [PATCH 10/26] KVM: x86: Defer runtime updates of dynamic CPUID bits until CPUID emulation Defer runtime CPUID updates until the next non-faulting CPUID emulation or KVM_GET_CPUID2, which are the only paths in KVM that consume the dynamic entries. Deferring the updates is especially beneficial to nested VM-Enter/VM-Exit, as KVM will almost always detect multiple state changes, not to mention the updates don't need to be realized while L2 is active if CPUID is being intercepted by L1 (CPUID is a mandatory intercept on Intel, but not AMD). Deferring CPUID updates shaves several hundred cycles from nested VMX roundtrips, as measured from L2 executing CPUID in a tight loop: SKX 6850 => 6450 ICX 9000 => 8800 EMR 7900 => 7700 Alternatively, KVM could update only the CPUID leaves that are affected by the state change, e.g. update XSAVE info only if XCR0 or XSS changes, but that adds non-trivial complexity and doesn't solve the underlying problem of nested transitions potentially changing both XCR0 and XSS, on both nested VM-Enter and VM-Exit. Skipping updates entirely if L2 is active and CPUID is being intercepted by L1 could work for the common case. However, simply skipping updates if L2 is active is *very* subtly dangerous and complex. Most KVM updates are triggered by changes to the current vCPU state, which may be L2 state, whereas performing updates only for L1 would requiring detecting changes to L1 state. KVM would need to either track relevant L1 state, or defer runtime CPUID updates until the next nested VM-Exit. The former is ugly and complex, while the latter comes with similar dangers to deferring all CPUID updates, and would only address the nested VM-Enter path. To guard against using stale data, disallow querying dynamic CPUID feature bits, i.e. features that KVM updates at runtime, via a compile-time assertion in guest_cpu_cap_has(). Exempt MWAIT from the rule, as the MISC_ENABLE_NO_MWAIT means that MWAIT is _conditionally_ a dynamic CPUID feature. Note, the rule could be enforced for MWAIT as well, e.g. by querying guest CPUID in kvm_emulate_monitor_mwait, but there's no obvious advtantage to doing so, and allowing MWAIT for guest_cpuid_has() opens up a different can of worms. MONITOR/MWAIT can't be virtualized (for a reasonable definition), and the nature of the MWAIT_NEVER_UD_FAULTS and MISC_ENABLE_NO_MWAIT quirks means checking X86_FEATURE_MWAIT outside of kvm_emulate_monitor_mwait() is wrong for other reasons. Beyond the aforementioned feature bits, the only other dynamic CPUID (sub)leaves are the XSAVE sizes, and similar to MWAIT, consuming those CPUID entries in KVM is all but guaranteed to be a bug. The layout for an actual XSAVE buffer depends on the format (compacted or not) and potentially the features that are actually enabled. E.g. see the logic in fpstate_clear_xstate_component() needed to poke into the guest's effective XSAVE state to clear MPX state on INIT. KVM does consume CPUID.0xD.0.{EAX,EDX} in kvm_check_cpuid() and cpuid_get_supported_xcr0(), but not EBX, which is the only dynamic output register in the leaf. Link: https://lore.kernel.org/r/20241211013302.1347853-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 12 ++++++++++-- arch/x86/kvm/cpuid.h | 9 ++++++++- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/smm.c | 2 +- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 6 +++--- 9 files changed, 27 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c4e844b60b0f..94b14b3a3b8d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -879,6 +879,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; + bool cpuid_dynamic_bits_dirty; bool is_amd_compatible; /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0de3f581ef12..97a90689a9dc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -195,6 +195,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu) } static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu); +static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, @@ -299,10 +300,12 @@ static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu, guest_cpu_cap_change(vcpu, x86_feature, has_feature); } -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + vcpu->arch.cpuid_dynamic_bits_dirty = false; + best = kvm_find_cpuid_entry(vcpu, 1); if (best) { kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE, @@ -332,7 +335,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); } -EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) { @@ -645,6 +647,9 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, if (cpuid->nent < vcpu->arch.cpuid_nent) return -E2BIG; + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; @@ -1984,6 +1989,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 67d80aa72d50..d2884162a46a 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -11,7 +11,6 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, @@ -232,6 +231,14 @@ static __always_inline bool guest_cpu_cap_has(struct kvm_vcpu *vcpu, { unsigned int x86_leaf = __feature_leaf(x86_feature); + /* + * Except for MWAIT, querying dynamic feature bits is disallowed, so + * that KVM can defer runtime updates until the next CPUID emulation. + */ + BUILD_BUG_ON(x86_feature == X86_FEATURE_APIC || + x86_feature == X86_FEATURE_OSXSAVE || + x86_feature == X86_FEATURE_OSPKE); + return vcpu->arch.cpu_caps[x86_leaf] & __feature_bit(x86_feature); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 750bb89b20ac..9dbc0f5d9865 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2586,7 +2586,7 @@ static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; if (!apic) return; diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index e0ab7df27b66..699e551ec93b 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -358,7 +358,7 @@ void enter_smm(struct kvm_vcpu *vcpu) goto error; #endif - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; kvm_mmu_reset_context(vcpu); return; error: diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8bc62e994138..42a6d8bb1024 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3261,7 +3261,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) if (kvm_ghcb_xcr0_is_valid(svm)) { vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } /* Copy the GHCB exit information into the VMCB fields */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7640a84e554a..88dd61389a2d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1932,7 +1932,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } static void svm_set_segment(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f72835e85b6d..b60de7f8e563 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3523,7 +3523,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9883f0565baf..263ab2ec6f71 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1264,7 +1264,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; return 0; } @@ -3899,7 +3899,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } else { vcpu->arch.ia32_misc_enable_msr = data; } @@ -3934,7 +3934,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) From f002a97ec8c977ce64a23d68bed8c7a2cec9d96e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:08 -0800 Subject: [PATCH 11/26] KVM: nVMX: Check PAUSE_EXITING, not BUS_LOCK_DETECTION, on PAUSE emulation When emulating PAUSE on behalf of L2, check for interception in vmcs12 by looking at primary execution controls, not secondary execution controls. Checking for PAUSE_EXITING in secondary execution controls effectively results in KVM looking for BUS_LOCK_DETECTION, which KVM doesn't expose to L1, i.e. is always off in vmcs12, and ultimately results in KVM failing to "intercept" PAUSE. Because KVM doesn't handle interception during emulation correctly on VMX, i.e. the "fixed" code is still quite broken, and not intercepting PAUSE is relatively benign, for all intents and purposes the bug means that L2 gets to live when it would otherwise get an unexpected #UD. Fixes: 4984563823f0 ("KVM: nVMX: Emulate NOPs in L2, and PAUSE if it's not intercepted") Link: https://lore.kernel.org/r/20250201015518.689704-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b60de7f8e563..e8672c4cb625 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8092,7 +8092,7 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, * the PAUSE. */ if ((info->rep_prefix != REPE_PREFIX) || - !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) return X86EMUL_CONTINUE; break; From c8e612bfedff67da0dd4abdf4de9ce7671b89f71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:09 -0800 Subject: [PATCH 12/26] KVM: nSVM: Pass next RIP, not current RIP, for nested VM-Exit on emulation Set "next_rip" in the emulation interception info passed to vendor code using the emulator context's "_eip", not "eip". "eip" holds RIP from the start of emulation, i.e. the RIP of the instruction that's being emulated, whereas _eip tracks the context's current position in decoding the code stream, which at the time of the intercept checks is effectively the RIP of the next instruction. Passing the current RIP as next_rip causes SVM to stuff the wrong value value into vmcb12->control.next_rip if a nested VM-Exit is generated, i.e. if L1 wants to intercept the instruction, and could result in L1 putting L2 into an infinite loop due to restarting L2 with the same RIP over and over. Fixes: 8a76d7f25f8f ("KVM: x86: Add x86 callback for intercept check") Link: https://lore.kernel.org/r/20250201015518.689704-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 60986f67c35a..0915b5e8aa71 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -478,7 +478,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, .ad_bytes = ctxt->ad_bytes, - .next_rip = ctxt->eip, + .next_rip = ctxt->_eip, }; return ctxt->ops->intercept(ctxt, &info, stage); From 3244616aac8dc69b3d7e4a4ee541e767b974af43 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:10 -0800 Subject: [PATCH 13/26] KVM: nVMX: Allow emulating RDPID on behalf of L2 Return X86EMUL_CONTINUE instead X86EMUL_UNHANDLEABLE when emulating RDPID on behalf of L2 and L1 _does_ expose RDPID/RDTSCP to L2. When RDPID emulation was added by commit fb6d4d340e05 ("KVM: x86: emulate RDPID"), KVM incorrectly allowed emulation by default. Commit 07721feee46b ("KVM: nVMX: Don't emulate instructions in guest mode") fixed that flaw, but missed that RDPID emulation was relying on the common return path to allow emulation on behalf of L2. Fixes: 07721feee46b ("KVM: nVMX: Don't emulate instructions in guest mode") Link: https://lore.kernel.org/r/20250201015518.689704-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e8672c4cb625..f03d1dc63de0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8049,18 +8049,19 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12 = get_vmcs12(vcpu); switch (info->intercept) { - /* - * RDPID causes #UD if disabled through secondary execution controls. - * Because it is marked as EmulateOnUD, we need to intercept it here. - * Note, RDPID is hidden behind ENABLE_RDTSCP. - */ case x86_intercept_rdpid: + /* + * RDPID causes #UD if not enabled through secondary execution + * controls (ENABLE_RDTSCP). Note, the implicit MSR access to + * TSC_AUX is NOT subject to interception, i.e. checking only + * the dedicated execution control is architecturally correct. + */ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } - break; + return X86EMUL_CONTINUE; case x86_intercept_in: case x86_intercept_ins: From f43f7a215af0586e196253555fc614efb0588858 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:11 -0800 Subject: [PATCH 14/26] KVM: nVMX: Emulate HLT in L2 if it's not intercepted Extend VMX's nested intercept logic for emulated instructions to handle HLT interception, primarily for testing purposes. Failure to allow emulation of HLT isn't all that interesting, as emulating HLT while L2 is active either requires forced emulation (and no #UD intercept in L1), TLB games in the guest to coerce KVM into emulating the wrong instruction, or a bug elsewhere in KVM. E.g. without commit 47ef3ef843c0 ("KVM: VMX: Handle event vectoring error in check_emulate_instruction()"), KVM can end up trying to emulate HLT if RIP happens to point at a HLT when a vectored event arrives with L2's IDT pointing at emulated MMIO. Note, vmx_check_intercept() is still broken when L1 wants to intercept an instruction, as KVM injects a #UD instead of synthesizing a nested VM-Exit. That issue extends far beyond HLT, punt on it for now. Link: https://lore.kernel.org/r/20250201015518.689704-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f03d1dc63de0..c844107e0466 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8083,6 +8083,11 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ break; + case x86_intercept_hlt: + if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) + return X86EMUL_CONTINUE; + break; + case x86_intercept_pause: /* * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides From 08e3d89eb330ef579a8a494264e6f192b4434fc0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:12 -0800 Subject: [PATCH 15/26] KVM: nVMX: Consolidate missing X86EMUL_INTERCEPTED logic in L2 emulation Refactor the handling of port I/O interception checks when emulating on behalf of L2 in anticipation of synthesizing a nested VM-Exit to L1 instead of injecting a #UD into L2. No functional change intended. Link: https://lore.kernel.org/r/20250201015518.689704-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c844107e0466..bf5a700ef064 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8007,12 +8007,11 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } -static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, +static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, struct x86_instruction_info *info) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned short port; - bool intercept; int size; if (info->intercept == x86_intercept_in || @@ -8032,13 +8031,9 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. */ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - intercept = nested_cpu_has(vmcs12, - CPU_BASED_UNCOND_IO_EXITING); - else - intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ - return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; + return nested_vmx_check_io_bitmaps(vcpu, port, size); } int vmx_check_intercept(struct kvm_vcpu *vcpu, @@ -8067,7 +8062,9 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, case x86_intercept_ins: case x86_intercept_out: case x86_intercept_outs: - return vmx_check_intercept_io(vcpu, info); + if (!vmx_is_io_intercepted(vcpu, info)) + return X86EMUL_CONTINUE; + break; case x86_intercept_lgdt: case x86_intercept_lidt: @@ -8079,8 +8076,6 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, case x86_intercept_str: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) return X86EMUL_CONTINUE; - - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ break; case x86_intercept_hlt: @@ -8108,6 +8103,7 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, break; } + /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ return X86EMUL_UNHANDLEABLE; } From 407d03fe924c69d04bca980402fb11d1542cd74c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:13 -0800 Subject: [PATCH 16/26] KVM: x86: Plumb the src/dst operand types through to .check_intercept() When checking for intercept when emulating an instruction on behalf of L2, forward the source and destination operand types to vendor code so that VMX can synthesize the correct EXIT_QUALIFICATION for port I/O VM-Exits. Link: https://lore.kernel.org/r/20250201015518.689704-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 2 ++ arch/x86/kvm/kvm_emulate.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0915b5e8aa71..ca613796b5af 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -477,6 +477,8 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, + .src_type = ctxt->src.type, + .dst_type = ctxt->dst.type, .ad_bytes = ctxt->ad_bytes, .next_rip = ctxt->_eip, }; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 73072585e164..49ab8b060137 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -44,6 +44,8 @@ struct x86_instruction_info { u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ + u8 src_type; /* type of source operand */ + u8 dst_type; /* type of destination operand */ u8 ad_bytes; /* size of src/dst address */ u64 next_rip; /* rip following the instruction */ }; From 9aeb9d8a67389caa97545987adc79a25a734b149 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:14 -0800 Subject: [PATCH 17/26] KVM: x86: Plumb the emulator's starting RIP into nested intercept checks When checking for intercept when emulating an instruction on behalf of L2, pass the emulator's view of the RIP of the instruction being emulated to vendor code. Unlike SVM, which communicates the next RIP on VM-Exit, VMX communicates the length of the instruction that generated the VM-Exit, i.e. requires the current and next RIPs. Note, unless userspace modifies RIP during a userspace exit that requires completion, kvm_rip_read() will contain the same information. Pass the emulator's view largely out of a paranoia, and because there is no meaningful cost in doing so. Link: https://lore.kernel.org/r/20250201015518.689704-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/kvm_emulate.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ca613796b5af..1349e278cd2a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -480,6 +480,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .src_type = ctxt->src.type, .dst_type = ctxt->dst.type, .ad_bytes = ctxt->ad_bytes, + .rip = ctxt->eip, .next_rip = ctxt->_eip, }; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 49ab8b060137..35029b12667f 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -47,6 +47,7 @@ struct x86_instruction_info { u8 src_type; /* type of source operand */ u8 dst_type; /* type of destination operand */ u8 ad_bytes; /* size of src/dst address */ + u64 rip; /* rip of the instruction */ u64 next_rip; /* rip following the instruction */ }; From d4aea23fd0ff6d792f66886bcb158e52a095b3bc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:15 -0800 Subject: [PATCH 18/26] KVM: x86: Add a #define for the architectural max instruction length Add a #define to capture x86's architecturally defined max instruction length instead of open coding the literal in a variety of places. No functional change intended. Link: https://lore.kernel.org/r/20250201015518.689704-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/kvm_emulate.h | 4 +++- arch/x86/kvm/trace.h | 14 +++++++------- arch/x86/kvm/vmx/nested.c | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 35029b12667f..c1df5acfacaf 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -275,8 +275,10 @@ struct operand { }; }; +#define X86_MAX_INSTRUCTION_LENGTH 15 + struct fetch_cache { - u8 data[15]; + u8 data[X86_MAX_INSTRUCTION_LENGTH]; u8 *ptr; u8 *end; }; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 0b844cb97978..ccda95e53f62 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -830,12 +830,12 @@ TRACE_EVENT(kvm_emulate_insn, TP_ARGS(vcpu, failed), TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, csbase ) - __field( __u8, len ) - __array( __u8, insn, 15 ) - __field( __u8, flags ) - __field( __u8, failed ) + __field( __u64, rip ) + __field( __u32, csbase ) + __field( __u8, len ) + __array( __u8, insn, X86_MAX_INSTRUCTION_LENGTH ) + __field( __u8, flags ) + __field( __u8, failed ) ), TP_fast_assign( @@ -846,7 +846,7 @@ TRACE_EVENT(kvm_emulate_insn, __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len; memcpy(__entry->insn, vcpu->arch.emulate_ctxt->fetch.data, - 15); + X86_MAX_INSTRUCTION_LENGTH); __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode); __entry->failed = failed; ), diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bca2575837ce..e5c17fba6507 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2970,7 +2970,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: - if (CC(vmcs12->vm_entry_instruction_len > 15) || + if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || CC(vmcs12->vm_entry_instruction_len == 0 && CC(!nested_cpu_has_zero_length_injection(vcpu)))) return -EINVAL; From fbd1e0f195464a362c91e16b5c327db0e89612ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:16 -0800 Subject: [PATCH 19/26] KVM: nVMX: Allow the caller to provide instruction length on nested VM-Exit Rework the nested VM-Exit helper to take the instruction length as a parameter, and convert nested_vmx_vmexit() into a "default" wrapper that grabs the length from vmcs02 as appropriate. This will allow KVM to set the correct instruction length when synthesizing a nested VM-Exit when emulating an instruction that L1 wants to intercept. No functional change intended, as the path to prepare_vmcs12()'s reading of vmcs02.VM_EXIT_INSTRUCTION_LEN is gated on the same set of conditions as the VMREAD in the new nested_vmx_vmexit(). Link: https://lore.kernel.org/r/20250201015518.689704-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 12 +++++++----- arch/x86/kvm/vmx/nested.h | 22 ++++++++++++++++++++-- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e5c17fba6507..edc2d2039508 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4618,7 +4618,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 vm_exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + unsigned long exit_qualification, u32 exit_insn_len) { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; @@ -4646,7 +4646,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vm_exit_reason, exit_intr_info); vmcs12->vm_exit_intr_info = exit_intr_info; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vm_exit_instruction_len = exit_insn_len; vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); /* @@ -4930,8 +4930,9 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification) +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -4981,7 +4982,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (vm_exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, - exit_intr_info, exit_qualification); + exit_intr_info, exit_qualification, + exit_insn_len); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 2c296b6abb8c..6eedcfc91070 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -26,8 +26,26 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu); -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification); +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len); + +static inline void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) +{ + u32 exit_insn_len; + + if (to_vmx(vcpu)->fail || vm_exit_reason == -1 || + (vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_insn_len = 0; + else + exit_insn_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + __nested_vmx_vmexit(vcpu, vm_exit_reason, exit_intr_info, + exit_qualification, exit_insn_len); +} + void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); From 79a14afc60904cdb2b4288fd00c65b8159e0049a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:17 -0800 Subject: [PATCH 20/26] KVM: nVMX: Synthesize nested VM-Exit for supported emulation intercepts When emulating an instruction on behalf of L2 that L1 wants to intercept, generate a nested VM-Exit instead of injecting a #UD into L2. Now that (most of) the necessary information is available, synthesizing a VM-Exit isn't terribly difficult. Punt on decoding the ModR/M for descriptor table exits for now. There is no evidence that any hypervisor intercepts descriptor table accesses *and* uses the EXIT_QUALIFICATION to expedite emulation, i.e. it's not worth delaying basic support for. To avoid doing more harm than good, e.g. by putting L2 into an infinite or effectively corrupting its code stream, inject #UD if the instruction length is nonsensical. Link: https://lore.kernel.org/r/20250201015518.689704-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 70 +++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bf5a700ef064..1e5f73a7f8f0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8008,20 +8008,13 @@ static __init void vmx_set_cpu_caps(void) } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info) + struct x86_instruction_info *info, + unsigned long *exit_qualification) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned short port; int size; - - if (info->intercept == x86_intercept_in || - info->intercept == x86_intercept_ins) { - port = info->src_val; - size = info->dst_bytes; - } else { - port = info->dst_val; - size = info->src_bytes; - } + bool imm; /* * If the 'use IO bitmaps' VM-execution control is 0, IO instruction @@ -8033,6 +8026,30 @@ static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); + if (info->intercept == x86_intercept_in || + info->intercept == x86_intercept_ins) { + port = info->src_val; + size = info->dst_bytes; + imm = info->src_type == OP_IMM; + } else { + port = info->dst_val; + size = info->src_bytes; + imm = info->dst_type == OP_IMM; + } + + + *exit_qualification = ((unsigned long)port << 16) | (size - 1); + + if (info->intercept == x86_intercept_ins || + info->intercept == x86_intercept_outs) + *exit_qualification |= BIT(4); + + if (info->rep_prefix) + *exit_qualification |= BIT(5); + + if (imm) + *exit_qualification |= BIT(6); + return nested_vmx_check_io_bitmaps(vcpu, port, size); } @@ -8042,6 +8059,9 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long exit_qualification = 0; + u32 vm_exit_reason; + u64 exit_insn_len; switch (info->intercept) { case x86_intercept_rdpid: @@ -8062,8 +8082,10 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, case x86_intercept_ins: case x86_intercept_out: case x86_intercept_outs: - if (!vmx_is_io_intercepted(vcpu, info)) + if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; break; case x86_intercept_lgdt: @@ -8076,11 +8098,25 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, case x86_intercept_str: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) return X86EMUL_CONTINUE; + + if (info->intercept == x86_intercept_lldt || + info->intercept == x86_intercept_ltr || + info->intercept == x86_intercept_sldt || + info->intercept == x86_intercept_str) + vm_exit_reason = EXIT_REASON_LDTR_TR; + else + vm_exit_reason = EXIT_REASON_GDTR_IDTR; + /* + * FIXME: Decode the ModR/M to generate the correct exit + * qualification for memory operands. + */ break; case x86_intercept_hlt: if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_HLT; break; case x86_intercept_pause: @@ -8096,15 +8132,21 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu, !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) return X86EMUL_CONTINUE; + vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; break; /* TODO: check more intercepts... */ default: - break; + return X86EMUL_UNHANDLEABLE; } - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ - return X86EMUL_UNHANDLEABLE; + exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); + if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) + return X86EMUL_UNHANDLEABLE; + + __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, + exit_insn_len); + return X86EMUL_INTERCEPTED; } #ifdef CONFIG_X86_64 From 2428865bf0af18f7a0349aee804c21d9bac69bd1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 31 Jan 2025 17:55:18 -0800 Subject: [PATCH 21/26] KVM: selftests: Add a nested (forced) emulation intercept test for x86 Add a rudimentary test for validating KVM's handling of L1 hypervisor intercepts during instruction emulation on behalf of L2. To minimize complexity and avoid overlap with other tests, only validate KVM's handling of instructions that L1 wants to intercept, i.e. that generate a nested VM-Exit. Full testing of emulation on behalf of L2 is better achieved by running existing (forced) emulation tests in a VM, (although on VMX, getting L0 to emulate on #UD requires modifying either L1 KVM to not intercept #UD, or modifying L0 KVM to prioritize L0's exception intercepts over L1's intercepts, as is done by KVM for SVM). Since emulation should never be successful, i.e. L2 always exits to L1, dynamically generate the L2 code stream instead of adding a helper for each instruction. Doing so requires hand coding instruction opcodes, but makes it significantly easier for the test to compute the expected "next RIP" and instruction length. Link: https://lore.kernel.org/r/20250201015518.689704-12-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/x86/nested_emulation_test.c | 146 ++++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/nested_emulation_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 4277b983cace..f773f8f99249 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -69,6 +69,7 @@ TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush TEST_GEN_PROGS_x86 += x86/kvm_clock_test TEST_GEN_PROGS_x86 += x86/kvm_pv_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test +TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test diff --git a/tools/testing/selftests/kvm/x86/nested_emulation_test.c b/tools/testing/selftests/kvm/x86/nested_emulation_test.c new file mode 100644 index 000000000000..abc824dba04f --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_emulation_test.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +enum { + SVM_F, + VMX_F, + NR_VIRTUALIZATION_FLAVORS, +}; + +struct emulated_instruction { + const char name[32]; + uint8_t opcode[15]; + uint32_t exit_reason[NR_VIRTUALIZATION_FLAVORS]; +}; + +static struct emulated_instruction instructions[] = { + { + .name = "pause", + .opcode = { 0xf3, 0x90 }, + .exit_reason = { SVM_EXIT_PAUSE, + EXIT_REASON_PAUSE_INSTRUCTION, } + }, + { + .name = "hlt", + .opcode = { 0xf4 }, + .exit_reason = { SVM_EXIT_HLT, + EXIT_REASON_HLT, } + }, +}; + +static uint8_t kvm_fep[] = { 0x0f, 0x0b, 0x6b, 0x76, 0x6d }; /* ud2 ; .ascii "kvm" */ +static uint8_t l2_guest_code[sizeof(kvm_fep) + 15]; +static uint8_t *l2_instruction = &l2_guest_code[sizeof(kvm_fep)]; + +static uint32_t get_instruction_length(struct emulated_instruction *insn) +{ + uint32_t i; + + for (i = 0; i < ARRAY_SIZE(insn->opcode) && insn->opcode[i]; i++) + ; + + return i; +} + +static void guest_code(void *test_data) +{ + int f = this_cpu_has(X86_FEATURE_SVM) ? SVM_F : VMX_F; + int i; + + memcpy(l2_guest_code, kvm_fep, sizeof(kvm_fep)); + + if (f == SVM_F) { + struct svm_test_data *svm = test_data; + struct vmcb *vmcb = svm->vmcb; + + generic_svm_setup(svm, NULL, NULL); + vmcb->save.idtr.limit = 0; + vmcb->save.rip = (u64)l2_guest_code; + + vmcb->control.intercept |= BIT_ULL(INTERCEPT_SHUTDOWN) | + BIT_ULL(INTERCEPT_PAUSE) | + BIT_ULL(INTERCEPT_HLT); + vmcb->control.intercept_exceptions = 0; + } else { + GUEST_ASSERT(prepare_for_vmx_operation(test_data)); + GUEST_ASSERT(load_vmcs(test_data)); + + prepare_vmcs(test_data, NULL, NULL); + GUEST_ASSERT(!vmwrite(GUEST_IDTR_LIMIT, 0)); + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code)); + GUEST_ASSERT(!vmwrite(EXCEPTION_BITMAP, 0)); + + vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) | + CPU_BASED_PAUSE_EXITING | + CPU_BASED_HLT_EXITING); + } + + for (i = 0; i < ARRAY_SIZE(instructions); i++) { + struct emulated_instruction *insn = &instructions[i]; + uint32_t insn_len = get_instruction_length(insn); + uint32_t exit_insn_len; + u32 exit_reason; + + /* + * Copy the target instruction to the L2 code stream, and fill + * the remaining bytes with INT3s so that a missed intercept + * results in a consistent failure mode (SHUTDOWN). + */ + memcpy(l2_instruction, insn->opcode, insn_len); + memset(l2_instruction + insn_len, 0xcc, sizeof(insn->opcode) - insn_len); + + if (f == SVM_F) { + struct svm_test_data *svm = test_data; + struct vmcb *vmcb = svm->vmcb; + + run_guest(vmcb, svm->vmcb_gpa); + exit_reason = vmcb->control.exit_code; + exit_insn_len = vmcb->control.next_rip - vmcb->save.rip; + GUEST_ASSERT_EQ(vmcb->save.rip, (u64)l2_instruction); + } else { + GUEST_ASSERT_EQ(i ? vmresume() : vmlaunch(), 0); + exit_reason = vmreadz(VM_EXIT_REASON); + exit_insn_len = vmreadz(VM_EXIT_INSTRUCTION_LEN); + GUEST_ASSERT_EQ(vmreadz(GUEST_RIP), (u64)l2_instruction); + } + + __GUEST_ASSERT(exit_reason == insn->exit_reason[f], + "Wanted exit_reason '0x%x' for '%s', got '0x%x'", + insn->exit_reason[f], insn->name, exit_reason); + + __GUEST_ASSERT(exit_insn_len == insn_len, + "Wanted insn_len '%u' for '%s', got '%u'", + insn_len, insn->name, exit_insn_len); + } + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_test_data_gva; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(is_forced_emulation_enabled); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vm_enable_cap(vm, KVM_CAP_EXCEPTION_PAYLOAD, -2ul); + + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_test_data_gva); + else + vcpu_alloc_vmx(vm, &nested_test_data_gva); + + vcpu_args_set(vcpu, 1, nested_test_data_gva); + + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); +} From b9595d1ddef883f538563c779fff2151ee040aeb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Feb 2025 17:06:08 -0800 Subject: [PATCH 22/26] KVM: x86: Don't inject PV async #PF if SEND_ALWAYS=0 and guest state is protected Don't inject PV async #PFs into guests with protected register state, i.e. SEV-ES and SEV-SNP guests, unless the guest has opted-in to receiving #PFs at CPL0. For protected guests, the actual CPL of the guest is unknown. Note, no sane CoCo guest should enable PV async #PF, but the current state of Linux-as-a-CoCo-guest isn't entirely sane. Fixes: add5e2f04541 ("KVM: SVM: Add support for the SEV-ES VMSA") Link: https://lore.kernel.org/r/20250215010609.1199982-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 263ab2ec6f71..3d84a79583b1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13387,7 +13387,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) return false; if (vcpu->arch.apf.send_user_only && - kvm_x86_call(get_cpl)(vcpu) == 0) + (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu))) return false; if (is_guest_mode(vcpu)) { From 4fa0efb43a781055d146298a7ae886d529488d82 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 14 Feb 2025 17:06:09 -0800 Subject: [PATCH 23/26] KVM: x86: Rename and invert async #PF's send_user_only flag to send_always Rename send_user_only to avoid "user", because KVM's ABI is to not inject page faults into CPL0, whereas "user" in x86 is specifically CPL3. Invert the polarity to keep the naming simple and unambiguous. E.g. while KVM often refers to CPL0 as "kernel", that terminology isn't ubiquitous, and "send_kernel" could be misconstrued as "send only to kernel". Link: https://lore.kernel.org/r/20250215010609.1199982-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 94b14b3a3b8d..fef8f07cdc82 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -997,8 +997,8 @@ struct kvm_vcpu_arch { u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */ u16 vec; u32 id; - bool send_user_only; u32 host_apf_flags; + bool send_always; bool delivery_as_pf_vmexit; bool pageready_pending; } apf; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d84a79583b1..a13e98f90a0e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3554,7 +3554,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) sizeof(u64))) return 1; - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); @@ -13386,7 +13386,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) if (!kvm_pv_async_pf_enabled(vcpu)) return false; - if (vcpu->arch.apf.send_user_only && + if (!vcpu->arch.apf.send_always && (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu))) return false; From b50cb2b1555d8714e12a566b9b49fcac56a04a3f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 30 Sep 2024 22:00:44 -0700 Subject: [PATCH 24/26] KVM: x86: Use a dedicated flow for queueing re-injected exceptions Open code the filling of vcpu->arch.exception in kvm_requeue_exception() instead of bouncing through kvm_multiple_exception(), as re-injection doesn't actually share that much code with "normal" injection, e.g. the VM-Exit interception check, payload delivery, and nested exception code is all bypassed as those flows only apply during initial injection. When FRED comes along, the special casing will only get worse, as FRED explicitly tracks nested exceptions and essentially delivers the payload on the stack frame, i.e. re-injection will need more inputs, and normal injection will have yet more code that needs to be bypassed when KVM is re-injecting an exception. No functional change intended. Signed-off-by: Xin Li (Intel) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20241001050110.3643764-2-xin@zytor.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 4 +- arch/x86/kvm/svm/svm.c | 15 +++--- arch/x86/kvm/vmx/vmx.c | 16 +++--- arch/x86/kvm/x86.c | 89 ++++++++++++++++----------------- 4 files changed, 63 insertions(+), 61 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fef8f07cdc82..97978ca62b7c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2161,8 +2161,8 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error_code, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 88dd61389a2d..6bd7e44380e0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4116,20 +4116,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; - case SVM_EXITINTINFO_TYPE_EXEPT: + case SVM_EXITINTINFO_TYPE_EXEPT: { + u32 error_code = 0; + /* * Never re-inject a #VC exception. */ if (vector == X86_TRAP_VC) break; - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { - u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(vcpu, vector, err); + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) + error_code = svm->vmcb->control.exit_int_info_err; - } else - kvm_requeue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector, + exitintinfo & SVM_EXITINTINFO_VALID_ERR, + error_code); break; + } case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1e5f73a7f8f0..374f5a2ca38e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7155,13 +7155,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; - case INTR_TYPE_HARD_EXCEPTION: - if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); - kvm_requeue_exception_e(vcpu, vector, err); - } else - kvm_requeue_exception(vcpu, vector); + case INTR_TYPE_HARD_EXCEPTION: { + u32 error_code = 0; + + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(error_code_field); + + kvm_requeue_exception(vcpu, vector, + idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, + error_code); break; + } case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a13e98f90a0e..2231a2cd8489 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -800,9 +800,9 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto ex->payload = payload; } -static void kvm_multiple_exception(struct kvm_vcpu *vcpu, - unsigned nr, bool has_error, u32 error_code, - bool has_payload, unsigned long payload, bool reinject) +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error, u32 error_code, + bool has_payload, unsigned long payload) { u32 prev_nr; int class1, class2; @@ -810,13 +810,10 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); /* - * If the exception is destined for L2 and isn't being reinjected, - * morph it to a VM-Exit if L1 wants to intercept the exception. A - * previously injected exception is not checked because it was checked - * when it was original queued, and re-checking is incorrect if _L1_ - * injected the exception, in which case it's exempt from interception. + * If the exception is destined for L2, morph it to a VM-Exit if L1 + * wants to intercept the exception. */ - if (!reinject && is_guest_mode(vcpu) && + if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, has_payload, payload); @@ -825,28 +822,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: - if (reinject) { - /* - * On VM-Entry, an exception can be pending if and only - * if event injection was blocked by nested_run_pending. - * In that case, however, vcpu_enter_guest() requests an - * immediate exit, and the guest shouldn't proceed far - * enough to need reinjection. - */ - WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); - vcpu->arch.exception.injected = true; - if (WARN_ON_ONCE(has_payload)) { - /* - * A reinjected event has already - * delivered its payload. - */ - has_payload = false; - payload = 0; - } - } else { - vcpu->arch.exception.pending = true; - vcpu->arch.exception.injected = false; - } + vcpu->arch.exception.pending = true; + vcpu->arch.exception.injected = false; + vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.vector = nr; vcpu->arch.exception.error_code = error_code; @@ -887,30 +865,53 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); + kvm_multiple_exception(vcpu, nr, false, 0, true, payload); } EXPORT_SYMBOL_GPL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, true, error_code, - true, payload, false); + kvm_multiple_exception(vcpu, nr, true, error_code, true, payload); } +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error_code, u32 error_code) +{ + + /* + * On VM-Entry, an exception can be pending if and only if event + * injection was blocked by nested_run_pending. In that case, however, + * vcpu_enter_guest() requests an immediate exit, and the guest + * shouldn't proceed far enough to need reinjection. + */ + WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); + + /* + * Do not check for interception when injecting an event for L2, as the + * exception was checked for intercept when it was original queued, and + * re-checking is incorrect if _L1_ injected the exception, in which + * case it's exempt from interception. + */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vcpu->arch.exception.injected = true; + vcpu->arch.exception.has_error_code = has_error_code; + vcpu->arch.exception.vector = nr; + vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; +} +EXPORT_SYMBOL_GPL(kvm_requeue_exception); + int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) @@ -980,16 +981,10 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu) void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); - /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue * a #GP and return false. From 2a289aed3fcd7fdd6d5c8def0f992d31a0754094 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 24 Feb 2025 09:41:56 -0800 Subject: [PATCH 25/26] KVM: x86: Always set mp_state to RUNNABLE on wakeup from HLT When emulating HLT and a wake event is already pending, explicitly mark the vCPU RUNNABLE (via kvm_set_mp_state()) instead of assuming the vCPU is already in the appropriate state. Barring a KVM bug, it should be impossible for the vCPU to be in a non-RUNNABLE state, but there is no advantage to relying on that to hold true, and ensuring the vCPU is made RUNNABLE avoids non-deterministic behavior with respect to pv_unhalted. E.g. if the vCPU is not already RUNNABLE, then depending on when pv_unhalted is set, KVM could either leave the vCPU in the non-RUNNABLE state (set before __kvm_emulate_halt()), or transition the vCPU to HALTED and then RUNNABLE (pv_unhalted set after the kvm_vcpu_has_events() check). Link: https://lore.kernel.org/r/20250224174156.2362059-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2231a2cd8489..bef63fd46d5a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11297,9 +11297,8 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { if (kvm_vcpu_has_events(vcpu)) - vcpu->arch.pv.pv_unhalted = false; - else - kvm_set_mp_state(vcpu, state); + state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, state); return 1; } else { vcpu->run->exit_reason = reason; From e6c8728a8e2d20b262209c70a8ca67719a628833 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Tue, 4 Mar 2025 03:23:13 -0500 Subject: [PATCH 26/26] KVM: x86: Remove the unreachable case for 0x80000022 leaf in __do_cpuid_func() Remove dead/unreachable (and misguided) code in KVM's processing of 0x80000022. The case statement breaks early if PERFMON_V2 isnt supported, i.e. kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2) must be true when KVM reaches the code code to setup EBX. Note, early versions of the patch that became commit 94cdeebd8211 ("KVM: x86/cpuid: Add AMD CPUID ExtPerfMonAndDbg leaf 0x80000022") didn't break early on lack of PERFMON_V2 support, and instead enumerated the effective number of counters KVM could emulate. All of that code was flawed, e.g. the APM explicitly states EBX is valid only for v2. Performance Monitoring Version 2 supported. When set, CPUID_Fn8000_0022_EBX reports the number of available performance counters. When the flaw of not respecting v2 support was addressed, the misguided stuffing of the number of counters got left behind. Link: https://lore.kernel.org/all/20220919093453.71737-4-likexu@tencent.com Fixes: 94cdeebd8211 ("KVM: x86/cpuid: Add AMD CPUID ExtPerfMonAndDbg leaf 0x80000022") Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250304082314.472202-2-xiaoyao.li@intel.com [sean: elaborate on the situation a bit more, add Fixes] Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 97a90689a9dc..668e8fac1f6e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1773,13 +1773,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) cpuid_entry_override(entry, CPUID_8000_0022_EAX); - if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) - ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; - else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) - ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; - else - ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; - + ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; entry->ebx = ebx.full; break; }