From 309d28576f0a23931a36bf67324868448f79a77e Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 27 Mar 2025 12:39:56 -0500 Subject: [PATCH 01/16] KVM: SVM: Fix SNP AP destroy race with VMRUN An AP destroy request for a target vCPU is typically followed by an RMPADJUST to remove the VMSA attribute from the page currently being used as the VMSA for the target vCPU. This can result in a vCPU that is about to VMRUN to exit with #VMEXIT_INVALID. This usually does not happen as APs are typically sitting in HLT when being destroyed and therefore the vCPU thread is not running at the time. However, if HLT is allowed inside the VM, then the vCPU could be about to VMRUN when the VMSA attribute is removed from the VMSA page, resulting in a #VMEXIT_INVALID when the vCPU actually issues the VMRUN and causing the guest to crash. An RMPADJUST against an in-use (already running) VMSA results in a #NPF for the vCPU issuing the RMPADJUST, so the VMSA attribute cannot be changed until the VMRUN for target vCPU exits. The Qemu command line option '-overcommit cpu-pm=on' is an example of allowing HLT inside the guest. Update the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event to include the KVM_REQUEST_WAIT flag. The kvm_vcpu_kick() function will not wait for requests to be honored, so create kvm_make_request_and_kick() that will add a new event request and honor the KVM_REQUEST_WAIT flag. This will ensure that the target vCPU sees the AP destroy request before returning to the initiating vCPU should the target vCPU be in guest mode. Fixes: e366f92ea99e ("KVM: SEV: Support SEV-SNP AP Creation NAE event") Signed-off-by: Tom Lendacky Link: https://lore.kernel.org/r/fe2c885bf35643dd224e91294edb6777d5df23a4.1743097196.git.thomas.lendacky@amd.com [sean: add a comment explaining the use of smp_send_reschedule()] Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/svm/sev.c | 6 ++---- include/linux/kvm_host.h | 19 ++++++++++++++++++- virt/kvm/kvm_main.c | 19 +++++++++++++++---- 4 files changed, 37 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6c06f3d6e081..c5e80131626d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -125,7 +125,8 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34) +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \ + KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0bc708ee2788..7d11e82e4fa4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3988,10 +3988,8 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) * Unless Creation is deferred until INIT, signal the vCPU to update * its state. */ - if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) { - kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); - kvm_vcpu_kick(target_vcpu); - } + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); return 0; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1dedc421b3e3..c685fb417e92 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1505,7 +1505,16 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); -void kvm_vcpu_kick(struct kvm_vcpu *vcpu); + +#ifndef CONFIG_S390 +void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait); + +static inline void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + __kvm_vcpu_kick(vcpu, false); +} +#endif + int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode); @@ -2253,6 +2262,14 @@ static __always_inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) __kvm_make_request(req, vcpu); } +#ifndef CONFIG_S390 +static inline void kvm_make_request_and_kick(int req, struct kvm_vcpu *vcpu) +{ + kvm_make_request(req, vcpu); + __kvm_vcpu_kick(vcpu, req & KVM_REQUEST_WAIT); +} +#endif + static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) { return READ_ONCE(vcpu->requests); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 69782df3617f..16fe54cf2808 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3739,7 +3739,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. */ -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) { int me, cpu; @@ -3768,13 +3768,24 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) */ if (kvm_arch_vcpu_should_kick(vcpu)) { cpu = READ_ONCE(vcpu->cpu); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - smp_send_reschedule(cpu); + if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) { + /* + * Use a reschedule IPI to kick the vCPU if the caller + * doesn't need to wait for a response, as KVM allows + * kicking vCPUs while IRQs are disabled, but using the + * SMP function call framework with IRQs disabled can + * deadlock due to taking cross-CPU locks. + */ + if (wait) + smp_call_function_single(cpu, ack_kick, NULL, wait); + else + smp_send_reschedule(cpu); + } } out: put_cpu(); } -EXPORT_SYMBOL_GPL(kvm_vcpu_kick); +EXPORT_SYMBOL_GPL(__kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) From 962e2b6152ef1264224a9e1793ad2d5b5f3311a8 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 20 Mar 2025 08:26:49 -0500 Subject: [PATCH 02/16] KVM: SVM: Decrypt SEV VMSA in dump_vmcb() if debugging is enabled An SEV-ES/SEV-SNP VM save area (VMSA) can be decrypted if the guest policy allows debugging. Update the dump_vmcb() routine to output some of the SEV VMSA contents if possible. This can be useful for debug purposes. Signed-off-by: Tom Lendacky Acked-by: Borislav Petkov (AMD) Tested-by: Kim Phillips Link: https://lore.kernel.org/r/ea3b852c295b6f4b200925ed6b6e2c90d9475e71.1742477213.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 98 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 13 ++++++ arch/x86/kvm/svm/svm.h | 11 +++++ 3 files changed, 122 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7d11e82e4fa4..11695362b6b2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -560,6 +560,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; + sev->policy = params.policy; + memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -2199,6 +2201,8 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) return -EINVAL; + sev->policy = params.policy; + sev->snp_context = snp_context_create(kvm, argp); if (!sev->snp_context) return -ENOTTY; @@ -4922,3 +4926,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return level; } + +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area *vmsa; + struct kvm_sev_info *sev; + int error = 0; + int ret; + + if (!sev_es_guest(vcpu->kvm)) + return NULL; + + /* + * If the VMSA has not yet been encrypted, return a pointer to the + * current un-encrypted VMSA. + */ + if (!vcpu->arch.guest_state_protected) + return (struct vmcb_save_area *)svm->sev_es.vmsa; + + sev = to_kvm_sev_info(vcpu->kvm); + + /* Check if the SEV policy allows debugging */ + if (sev_snp_guest(vcpu->kvm)) { + if (!(sev->policy & SNP_POLICY_DEBUG)) + return NULL; + } else { + if (sev->policy & SEV_POLICY_NODBG) + return NULL; + } + + if (sev_snp_guest(vcpu->kvm)) { + struct sev_data_snp_dbg dbg = {0}; + + vmsa = snp_alloc_firmware_page(__GFP_ZERO); + if (!vmsa) + return NULL; + + dbg.gctx_paddr = __psp_pa(sev->snp_context); + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); + + /* + * Return the target page to a hypervisor page no matter what. + * If this fails, the page can't be used, so leak it and don't + * try to use it. + */ + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) + return NULL; + + if (ret) { + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + free_page((unsigned long)vmsa); + + return NULL; + } + } else { + struct sev_data_dbg dbg = {0}; + struct page *vmsa_page; + + vmsa_page = alloc_page(GFP_KERNEL); + if (!vmsa_page) + return NULL; + + vmsa = page_address(vmsa_page); + + dbg.handle = sev->handle; + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + dbg.len = PAGE_SIZE; + + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); + if (ret) { + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", + ret, error, error); + __free_page(vmsa_page); + + return NULL; + } + } + + return vmsa; +} + +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) +{ + /* If the VMSA has not yet been encrypted, nothing was allocated */ + if (!vcpu->arch.guest_state_protected || !vmsa) + return; + + free_page((unsigned long)vmsa); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cc1c721ba067..36e2199fe483 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3442,6 +3442,15 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3512,6 +3521,10 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d4490eaed55d..5ce34a1faf6a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -98,6 +98,7 @@ struct kvm_sev_info { unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long policy; unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ @@ -114,6 +115,9 @@ struct kvm_sev_info { struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ }; +#define SEV_POLICY_NODBG BIT_ULL(0) +#define SNP_POLICY_DEBUG BIT_ULL(19) + struct kvm_svm { struct kvm kvm; @@ -783,6 +787,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { @@ -814,6 +820,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return 0; } +static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + return NULL; +} +static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {} #endif /* vmenter.S */ From 22f5c2003a18300f03cd1dd0b63b4df0ce7fed17 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 20 Mar 2025 08:26:50 -0500 Subject: [PATCH 03/16] KVM: SVM: Dump guest register state in dump_vmcb() Guest register state can be useful when debugging, include it as part of dump_vmcb(). Signed-off-by: Tom Lendacky Acked-by: Borislav Petkov (AMD) Tested-by: Kim Phillips Link: https://lore.kernel.org/r/a4131a10c082a93610cac12b35dca90292e50f50.1742477213.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 36e2199fe483..0ee507e65c05 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3522,6 +3522,59 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + "r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + no_vmsa: if (sev_es_guest(vcpu->kvm)) sev_free_decrypted_vmsa(vcpu, save); From db264509610588dea5fa1dbe28914d39b688d190 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 20 Mar 2025 08:26:51 -0500 Subject: [PATCH 04/16] KVM: SVM: Add the type of VM for which the VMCB/VMSA is being dumped Add the type of VM (SVM, SEV, SEV-ES, or SEV-SNP) being dumped to the dump_vmcb() function. Signed-off-by: Tom Lendacky Acked-by: Borislav Petkov (AMD) Tested-by: Kim Phillips Link: https://lore.kernel.org/r/7a183a8beedf4ee26c42001160e073a884fe466e.1742477213.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0ee507e65c05..71a5d96f9a04 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3397,14 +3397,19 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? "SEV" : "SVM"; + + pr_err("%s VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); From 0e6b677de730bec4113c466861968e38c4018c50 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 20 Mar 2025 08:26:52 -0500 Subject: [PATCH 05/16] KVM: SVM: Include the vCPU ID when dumping a VMCB Provide the vCPU ID of the VMCB in dump_vmcb(). Signed-off-by: Tom Lendacky Acked-by: Borislav Petkov (AMD) Tested-by: Kim Phillips Link: https://lore.kernel.org/r/ee0af5a6c1a49aebb4a8291071c3f68cacf107b2.1742477213.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 71a5d96f9a04..272ce4bdbf11 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3408,8 +3408,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) sev_es_guest(vcpu->kvm) ? "SEV-ES" : sev_guest(vcpu->kvm) ? "SEV" : "SVM"; - pr_err("%s VMCB %p, last attempted VMRUN on CPU %d\n", - vm_type, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); From 468c27ae0215c14bc62c90fa7b2ce2bc9eb4564b Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 20 Mar 2025 08:26:53 -0500 Subject: [PATCH 06/16] KVM: SVM: Add a mutex to dump_vmcb() to prevent concurrent output If multiple VMRUN instructions fail, resulting in calls to dump_vmcb(), the output can become interleaved and it is impossible to identify which line of output belongs to which VMCB. Add a mutex to dump_vmcb() so that the output is serialized. Signed-off-by: Tom Lendacky Acked-by: Borislav Petkov (AMD) Tested-by: Kim Phillips Link: https://lore.kernel.org/r/a880678afd9488e1dd6017445802712f7c02cc6d.1742477213.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 272ce4bdbf11..919f98479774 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -249,6 +250,8 @@ static unsigned long iopm_base; DEFINE_PER_CPU(struct svm_cpu_data, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); + /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. @@ -3404,6 +3407,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) return; } + guard(mutex)(&vmcb_dump_mutex); + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : sev_es_guest(vcpu->kvm) ? "SEV-ES" : sev_guest(vcpu->kvm) ? "SEV" : "SVM"; From f9f27c4a377a8b45d6ece79279846b4fb3e27c96 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 10 Mar 2025 15:16:02 -0500 Subject: [PATCH 07/16] x86/cpufeatures: Add "Allowed SEV Features" Feature Add CPU feature detection for "Allowed SEV Features" to allow the Hypervisor to enforce that SEV-ES and SEV-SNP guest VMs cannot enable features (via SEV_FEATURES) that the Hypervisor does not support or wish to be enabled. Signed-off-by: Kishon Vijay Abraham I Reviewed-by: Tom Lendacky Reviewed-by: Pankaj Gupta Acked-by: Borislav Petkov (AMD) Signed-off-by: Kim Phillips Link: https://lore.kernel.org/r/20250310201603.1217954-2-kim.phillips@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6c2c152d8a67..e95a8e9ef22b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -446,6 +446,7 @@ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ #define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */ #define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */ +#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ #define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */ From b6bc164f41dbee11a4f2913f8fda22066689aa95 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Mon, 10 Mar 2025 15:16:03 -0500 Subject: [PATCH 08/16] KVM: SEV: Configure "ALLOWED_SEV_FEATURES" VMCB Field AMD EPYC 5th generation processors have introduced a feature that allows the hypervisor to control the SEV_FEATURES that are set for, or by, a guest [1]. ALLOWED_SEV_FEATURES can be used by the hypervisor to enforce that SEV-ES and SEV-SNP guests cannot enable features that the hypervisor does not want to be enabled. Always enable ALLOWED_SEV_FEATURES. A VMRUN will fail if any non-reserved bits are 1 in SEV_FEATURES but are 0 in ALLOWED_SEV_FEATURES. Some SEV_FEATURES - currently PmcVirtualization and SecureAvic (see Appendix B, Table B-4) - require an opt-in via ALLOWED_SEV_FEATURES, i.e. are off-by-default, whereas all other features are effectively on-by-default, but still honor ALLOWED_SEV_FEATURES. [1] Section 15.36.20 "Allowed SEV Features", AMD64 Architecture Programmer's Manual, Pub. 24593 Rev. 3.42 - March 2024: https://bugzilla.kernel.org/attachment.cgi?id=306250 Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Reviewed-by: Pankaj Gupta Signed-off-by: Kim Phillips Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/r/20250310201603.1217954-3-kim.phillips@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 7 ++++++- arch/x86/kvm/svm/sev.c | 5 +++++ arch/x86/kvm/svm/svm.c | 2 ++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 9b7fa99ae951..b382fd251e5b 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -159,7 +159,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 avic_physical_id; /* Offset 0xf8 */ u8 reserved_7[8]; u64 vmsa_pa; /* Used for an SEV-ES guest */ - u8 reserved_8[720]; + u8 reserved_8[40]; + u64 allowed_sev_features; /* Offset 0x138 */ + u64 guest_sev_features; /* Offset 0x140 */ + u8 reserved_9[664]; /* * Offset 0x3e0, 32 bytes reserved * for use by hypervisor/software. @@ -291,6 +294,8 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63) + struct vmcb_seg { u16 selector; u16 attrib; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 11695362b6b2..99e4dcf429e4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4451,6 +4451,7 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; @@ -4466,6 +4467,10 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | + VMCB_ALLOWED_SEV_FEATURES_VALID; + /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); svm_clr_intercept(svm, INTERCEPT_CR4_READ); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 919f98479774..6de35274d278 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3452,6 +3452,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); if (sev_es_guest(vcpu->kvm)) { save = sev_decrypt_vmsa(vcpu); From bb5081f4abf2d91ae3ed22d7d1157c91d23db5ed Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Thu, 6 Mar 2025 15:54:25 +0800 Subject: [PATCH 09/16] KVM: SVM: avoid frequency indirect calls When retpoline is enabled, indirect function calls introduce additional performance overhead. Avoid frequent indirect calls to VMGEXIT when SEV is enabled. Signed-off-by: Peng Hao Link: https://lore.kernel.org/r/20250306075425.66693-1-flyingpeng@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6de35274d278..dca87d9e5850 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3626,6 +3626,10 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } From 5ecdb48dd9188c355ddde5c3686c0835a223ca21 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 27 Feb 2025 14:24:11 -0800 Subject: [PATCH 10/16] KVM: SVM: Treat DEBUGCTL[5:2] as reserved Stop ignoring DEBUGCTL[5:2] on AMD CPUs and instead treat them as reserved. KVM has never properly virtualized AMD's legacy PBi bits, but did allow the guest (and host userspace) to set the bits. To avoid breaking guests when running on CPUs with BusLockTrap, which redefined bit 2 to BLCKDB and made bits 5:3 reserved, a previous KVM change ignored bits 5:3, e.g. so that legacy guest software wouldn't inadvertently enable BusLockTrap or hit a VMRUN failure due to setting reserved. To allow for virtualizing BusLockTrap and whatever future features may use bits 5:3, treat bits 5:2 as reserved (and hope that doing so doesn't break any existing guests). Reviewed-and-tested-by: Ravi Bangoria Link: https://lore.kernel.org/r/20250227222411.3490595-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dca87d9e5850..1ed5aa76e06c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3176,17 +3176,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; } - /* - * AMD changed the architectural behavior of bits 5:2. On CPUs - * without BusLockTrap, bits 5:2 control "external pins", but - * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap - * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed - * the guest to set bits 5:2 despite not actually virtualizing - * Performance-Monitoring/Breakpoint external pins. Drop bits - * 5:2 for backwards compatibility. - */ - data &= ~GENMASK(5, 2); - /* * Suppress BTF as KVM doesn't virtualize BTF, but there's no * way to communicate lack of support to the guest. From e0136112e99d63d87a18bfec3ffb16c414f903a1 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Mon, 28 Apr 2025 14:30:13 +0800 Subject: [PATCH 11/16] x86/sev: Remove unnecessary GFP_KERNEL_ACCOUNT for temporary variables Some variables allocated in sev_send_update_data are released when the function exits, so there is no need to set GFP_KERNEL_ACCOUNT. Signed-off-by: Peng Hao Link: https://lore.kernel.org/r/20250428063013.62311-1-flyingpeng@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 99e4dcf429e4..c6c2975e25ed 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1594,11 +1594,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL); if (!hdr) goto e_unpin; - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = kzalloc(params.trans_len, GFP_KERNEL); if (!trans_data) goto e_free_hdr; From e9628b011bbd7f9cc13fb9c15267695c996338c7 Mon Sep 17 00:00:00 2001 From: Manali Shukla Date: Fri, 2 May 2025 05:03:42 +0000 Subject: [PATCH 12/16] KVM: x86: Make kvm_pio_request.linear_rip a common field for user exits Move and rename kvm_pio_request.linear_rip to kvm_vcpu_arch.cui_linear_rip so that the field can be used by other userspace exit completion flows that need to take action if and only if userspace has not modified RIP. No functional changes intended. Suggested-by: Sean Christopherson Signed-off-by: Manali Shukla Link: https://lore.kernel.org/r/20250502050346.14274-2-manali.shukla@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c5e80131626d..61d4809418cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -412,7 +412,6 @@ struct kvm_rmap_head { }; struct kvm_pio_request { - unsigned long linear_rip; unsigned long count; int in; int port; @@ -918,6 +917,7 @@ struct kvm_vcpu_arch { bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); + unsigned long cui_linear_rip; gpa_t time; s8 pvclock_tsc_shift; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f6ce044b090a..b41090a7faac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9359,7 +9359,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) return 1; return kvm_skip_emulated_instruction(vcpu); @@ -9384,7 +9384,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, complete_fast_pio_out_port_0x7e; kvm_skip_emulated_instruction(vcpu); } else { - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } return 0; @@ -9397,7 +9397,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* We should only ever be called with arch.pio.count equal to 1 */ BUG_ON(vcpu->arch.pio.count != 1); - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) { vcpu->arch.pio.count = 0; return 1; } @@ -9426,7 +9426,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, return ret; } - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_in; return 0; From faad6645e1128ec2187d56009e2b76d603772d5f Mon Sep 17 00:00:00 2001 From: Manali Shukla Date: Fri, 2 May 2025 05:03:43 +0000 Subject: [PATCH 13/16] x86/cpufeatures: Add CPUID feature bit for the Bus Lock Threshold Misbehaving guests can cause bus locks to degrade the performance of the system. The Bus Lock Threshold feature can be used to address this issue by providing capability to the hypervisor to limit guest's ability to generate bus lock, thereby preventing system slowdown due to performance penalities. When the Bus Lock Threshold feature is enabled, the processor checks the bus lock threshold count before executing the buslock and decides whether to trigger bus lock exit or not. The value of the bus lock threshold count '0' generates bus lock exits, and if the value is greater than '0', the bus lock is executed successfully and the bus lock threshold count is decremented. Presence of the Bus Lock threshold feature is indicated via CPUID function 0x8000000A_EDX[29]. Signed-off-by: Manali Shukla Reviewed-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250502050346.14274-3-manali.shukla@amd.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index e95a8e9ef22b..b51f3ce84032 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -378,6 +378,7 @@ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ #define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ +#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */ #define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ From 827547bc3a2a2af6391260de8874008c5a52f238 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Fri, 2 May 2025 05:03:44 +0000 Subject: [PATCH 14/16] KVM: SVM: Add architectural definitions/assets for Bus Lock Threshold Virtual machines can exploit bus locks to degrade the performance of the system. Bus locks can be caused by Non-WB(Write back) and misaligned locked RMW (Read-modify-Write) instructions and require systemwide synchronization among all processors which can result into significant performance penalties. To address this issue, the Bus Lock Threshold feature is introduced to provide ability to hypervisor to restrict guests' capability of initiating mulitple buslocks, thereby preventing system slowdowns. Support for the buslock threshold is indicated via CPUID function 0x8000000A_EDX[29]. On the processors that support the Bus Lock Threshold feature, the VMCB provides a Bus Lock Threshold enable bit and an unsigned 16-bit Bus Lock threshold count. VMCB intercept bit VMCB Offset Bits Function 14h 5 Intercept bus lock operations Bus lock threshold count VMCB Offset Bits Function 120h 15:0 Bus lock counter When a VMRUN instruction is executed, the bus lock threshold count is loaded into an internal count register. Before the processor executes a bus lock in the guest, it checks the value of this register: - If the value is greater than '0', the processor successfully executes the bus lock and decrements the count. - If the value is '0', the bus lock is not executed, and a #VMEXIT to the VMM is taken. The bus lock threshold #VMEXIT is reported to the VMM with the VMEXIT code A5h, SVM_EXIT_BUS_LOCK. Signed-off-by: Nikunj A Dadhania Co-developed-by: Manali Shukla Signed-off-by: Manali Shukla Link: https://lore.kernel.org/r/20250502050346.14274-4-manali.shukla@amd.com [sean: rewrite shortlog] Signed-off-by: Sean Christopherson --- arch/x86/include/asm/svm.h | 7 +++++-- arch/x86/include/uapi/asm/svm.h | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index b382fd251e5b..ad954a1a6656 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -116,6 +116,7 @@ enum { INTERCEPT_INVPCID, INTERCEPT_MCOMMIT, INTERCEPT_TLBSYNC, + INTERCEPT_BUSLOCK, INTERCEPT_IDLE_HLT = 166, }; @@ -159,10 +160,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 avic_physical_id; /* Offset 0xf8 */ u8 reserved_7[8]; u64 vmsa_pa; /* Used for an SEV-ES guest */ - u8 reserved_8[40]; + u8 reserved_8[16]; + u16 bus_lock_counter; /* Offset 0x120 */ + u8 reserved_9[22]; u64 allowed_sev_features; /* Offset 0x138 */ u64 guest_sev_features; /* Offset 0x140 */ - u8 reserved_9[664]; + u8 reserved_10[664]; /* * Offset 0x3e0, 32 bytes reserved * for use by hypervisor/software. diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index ec1321248dac..9c640a521a67 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -95,6 +95,7 @@ #define SVM_EXIT_CR14_WRITE_TRAP 0x09e #define SVM_EXIT_CR15_WRITE_TRAP 0x09f #define SVM_EXIT_INVPCID 0x0a2 +#define SVM_EXIT_BUS_LOCK 0x0a5 #define SVM_EXIT_IDLE_HLT 0x0a6 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 @@ -225,6 +226,7 @@ { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ { SVM_EXIT_INVPCID, "invpcid" }, \ + { SVM_EXIT_BUS_LOCK, "buslock" }, \ { SVM_EXIT_IDLE_HLT, "idle-halt" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ From 89f9edf4c69ddd345b07be414e6a99ace0757ff5 Mon Sep 17 00:00:00 2001 From: Manali Shukla Date: Fri, 2 May 2025 05:03:45 +0000 Subject: [PATCH 15/16] KVM: SVM: Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM CPUs Add support for KVM_CAP_X86_BUS_LOCK_EXIT on SVM CPUs with Bus Lock Threshold, which is close enough to VMX's Bus Lock Detection VM-Exit to allow reusing KVM_CAP_X86_BUS_LOCK_EXIT. The biggest difference between the two features is that Threshold is fault-like, whereas Detection is trap-like. To allow the guest to make forward progress, Threshold provides a per-VMCB counter which is decremented every time a bus lock occurs, and a VM-Exit is triggered if and only if the counter is '0'. To provide Detection-like semantics, initialize the counter to '0', i.e. exit on every bus lock, and when re-executing the guilty instruction, set the counter to '1' to effectively step past the instruction. Note, in the unlikely scenario that re-executing the instruction doesn't trigger a bus lock, e.g. because the guest has changed memory types or patched the guilty instruction, the bus lock counter will be left at '1', i.e. the guest will be able to do a bus lock on a different instruction. In a perfect world, KVM would ensure the counter is '0' if the guest has made forward progress, e.g. if RIP has changed. But trying to close that hole would incur non-trivial complexity, for marginal benefit; the intent of KVM_CAP_X86_BUS_LOCK_EXIT is to allow userspace rate-limit bus locks, not to allow for precise detection of problematic guest code. And, it's simply not feasible to fully close the hole, e.g. if an interrupt arrives before the original instruction can re-execute, the guest could step past a different bus lock. Suggested-by: Sean Christopherson Signed-off-by: Manali Shukla Link: https://lore.kernel.org/r/20250502050346.14274-5-manali.shukla@amd.com [sean: fix typo in comment] Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 5 +++++ arch/x86/kvm/svm/nested.c | 34 ++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 38 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 1 + 4 files changed, 78 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index ad1859f4699e..f7d2d477c3cf 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7989,6 +7989,11 @@ apply some other policy-based mitigation. When exiting to userspace, KVM sets KVM_RUN_X86_BUS_LOCK in vcpu-run->flags, and conditionally sets the exit_reason to KVM_EXIT_X86_BUS_LOCK. +Due to differences in the underlying hardware implementation, the vCPU's RIP at +the time of exit diverges between Intel and AMD. On Intel hosts, RIP points at +the next instruction, i.e. the exit is trap-like. On AMD hosts, RIP points at +the offending instruction, i.e. the exit is fault-like. + Note! Detected bus locks may be coincident with other exits to userspace, i.e. KVM_RUN_X86_BUS_LOCK should be checked regardless of the primary exit reason if userspace wants to take action on all detected bus locks. diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 834b67672d50..6221a341f500 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -678,6 +678,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + /* + * Stash vmcb02's counter if the guest hasn't moved past the guilty + * instruction; otherwise, reset the counter to '0'. + * + * In order to detect if L2 has made forward progress or not, track the + * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP + * is changed, guest has clearly made forward progress, bus_lock_counter + * still remained '1', so reset bus_lock_counter to '0'. Eg. In the + * scenario, where a buslock happened in L1 before VMRUN, the bus lock + * firmly happened on an instruction in the past. Even if vmcb01's + * counter is still '1', (because the guilty instruction got patched), + * the vCPU has clearly made forward progress and so KVM should reset + * vmcb02's counter to '0'. + * + * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN + * to prevent the same guilty instruction from triggering a VM-Exit. Eg. + * if userspace rate-limits the vCPU, then it's entirely possible that + * L1's tick interrupt is pending by the time userspace re-runs the + * vCPU. If KVM unconditionally clears the counter on VMRUN, then when + * L1 re-enters L2, the same instruction will trigger a VM-Exit and the + * entire cycle start over. + */ + if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) + vmcb02->control.bus_lock_counter = 1; + else + vmcb02->control.bus_lock_counter = 0; + /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ @@ -1039,6 +1066,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) } + /* + * Invalidate bus_lock_rip unless KVM is still waiting for the guest + * to make forward progress before re-enabling bus lock detection. + */ + if (!vmcb02->control.bus_lock_counter) + svm->nested.ctl.bus_lock_rip = INVALID_GPA; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1ed5aa76e06c..d4a0843d0ec9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1384,6 +1384,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) sev_init_vmcb(svm); @@ -3306,6 +3309,37 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. + */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3375,6 +3409,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, @@ -5377,6 +5412,9 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5ce34a1faf6a..a991a839bb6d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -173,6 +173,7 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; + u64 bus_lock_rip; union { #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; From 72df72e1c6ddfb6e0c2bce174d5879bc095540c8 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Fri, 2 May 2025 05:03:46 +0000 Subject: [PATCH 16/16] KVM: selftests: Add test to verify KVM_CAP_X86_BUS_LOCK_EXIT Add a test case to verify x86's bus lock exit functionality, which is now supported on both Intel and AMD. Trigger bus lock exits by performing a split-lock access, i.e. an atomic access that splits two cache lines. Verify that the correct number of bus lock exits are generated, and that the counter is incremented correctly and at the appropriate time based on the underlying architecture. Generate bus locks in both L1 and L2 (if nested virtualization is enabled), as SVM's functionality in particular requires non-trivial logic to do the right thing when running nested VMs. Signed-off-by: Nikunj A Dadhania Co-developed-by: Manali Shukla Signed-off-by: Manali Shukla Link: https://lore.kernel.org/r/20250502050346.14274-6-manali.shukla@amd.com Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/x86/kvm_buslock_test.c | 135 ++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/kvm_buslock_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index f62b0a5aba35..fa65b82c1f53 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -78,6 +78,7 @@ TEST_GEN_PROGS_x86 += x86/hyperv_svm_test TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush TEST_GEN_PROGS_x86 += x86/kvm_clock_test TEST_GEN_PROGS_x86 += x86/kvm_pv_test +TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test diff --git a/tools/testing/selftests/kvm/x86/kvm_buslock_test.c b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c new file mode 100644 index 000000000000..d88500c118eb --- /dev/null +++ b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024 Advanced Micro Devices, Inc. + */ +#include + +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "vmx.h" +#include "test_util.h" + +#define NR_BUS_LOCKS_PER_LEVEL 100 +#define CACHE_LINE_SIZE 64 + +/* + * To generate a bus lock, carve out a buffer that precisely occupies two cache + * lines and perform an atomic access that splits the two lines. + */ +static u8 buffer[CACHE_LINE_SIZE * 2] __aligned(CACHE_LINE_SIZE); +static atomic_t *val = (void *)&buffer[CACHE_LINE_SIZE - (sizeof(*val) / 2)]; + +static void guest_generate_buslocks(void) +{ + for (int i = 0; i < NR_BUS_LOCKS_PER_LEVEL; i++) + atomic_inc(val); +} + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code(void) +{ + guest_generate_buslocks(); + GUEST_DONE(); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + run_guest(vmcb, svm->vmcb_gpa); +} + +static void l1_vmx_code(struct vmx_pages *vmx) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); + GUEST_ASSERT_EQ(load_vmcs(vmx), true); + + prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code)); + GUEST_ASSERT(!vmlaunch()); +} + +static void guest_code(void *test_data) +{ + guest_generate_buslocks(); + + if (this_cpu_has(X86_FEATURE_SVM)) + l1_svm_code(test_data); + else if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(test_data); + else + GUEST_DONE(); + + TEST_FAIL("L2 should have signaled 'done'"); +} + +int main(int argc, char *argv[]) +{ + const bool has_nested = kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX); + vm_vaddr_t nested_test_data_gva; + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + int i, bus_locks = 0; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_BUS_LOCK_EXIT)); + + vm = vm_create(1); + vm_enable_cap(vm, KVM_CAP_X86_BUS_LOCK_EXIT, KVM_BUS_LOCK_DETECTION_EXIT); + vcpu = vm_vcpu_add(vm, 0, guest_code); + + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_test_data_gva); + else + vcpu_alloc_vmx(vm, &nested_test_data_gva); + + vcpu_args_set(vcpu, 1, nested_test_data_gva); + + run = vcpu->run; + + for (i = 0; i <= NR_BUS_LOCKS_PER_LEVEL * (1 + has_nested); i++) { + struct ucall uc; + + vcpu_run(vcpu); + + if (run->exit_reason == KVM_EXIT_IO) { + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + goto done; + case UCALL_SYNC: + continue; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_BUS_LOCK); + + /* + * Verify the counter is actually getting incremented, e.g. that + * KVM isn't skipping the instruction. On Intel, the exit is + * trap-like, i.e. the counter should already have been + * incremented. On AMD, it's fault-like, i.e. the counter will + * be incremented when the guest re-executes the instruction. + */ + sync_global_from_guest(vm, *val); + TEST_ASSERT_EQ(atomic_read(val), bus_locks + host_cpu_is_intel); + + bus_locks++; + } + TEST_FAIL("Didn't receive UCALL_DONE, took %u bus lock exits\n", bus_locks); +done: + TEST_ASSERT_EQ(i, bus_locks); + kvm_vm_free(vm); + return 0; +}