From 6d3790bc689de9f18fae01c21f02e7d6d425534c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 27 Apr 2026 18:25:03 -0700 Subject: [PATCH 01/22] KVM: selftests: Include sys/mman.h *and* linux/mman.h, via kvm_syscalls.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include both linux/mman.h (the kernel provided version) and sys/mman.h (the libc provided version) throughout KVM selftests, by way of kvm_syscalls.h (which should have been including sys/mman.h anyways). Pulling in the kernel's version fixes compilation errors with the guest_memfd test on older versions of libc due to a recent commit adding MADV_COLLAPSE testing. In file included from include/kvm_util.h:8, from guest_memfd_test.c:21: guest_memfd_test.c: In function ‘test_collapse’: guest_memfd_test.c:219:47: error: ‘MADV_COLLAPSE’ undeclared (first use in this function); did you mean ‘MADV_COLD’? 219 | TEST_ASSERT_EQ(madvise(mem, pmd_size, MADV_COLLAPSE), -1); | ^~~~~~~~~~~~~ include/test_util.h:62:16: note: in definition of macro ‘TEST_ASSERT_EQ’ 62 | typeof(a) __a = (a); \ | ^ guest_memfd_test.c:219:47: note: each undeclared identifier is reported only once for each function it appears in 219 | TEST_ASSERT_EQ(madvise(mem, pmd_size, MADV_COLLAPSE), -1); | ^~~~~~~~~~~~~ include/test_util.h:62:16: note: in definition of macro ‘TEST_ASSERT_EQ’ 62 | typeof(a) __a = (a); \ | ^ Route the includes through kvm_syscalls.h to try and avoid a future game of whack-a-mole, i.e. so that future expansion of test coverage doesn't run into the same problem. To discourage use of sys/mman.h, opportunistically include the kernel's version of mman.h in test_util.h as it only needs MAP_SHARED, i.e. only needs the full set of kernel defs, not the libc syscall wrappers. Fixes: 9830209b4ae8 ("KVM: selftests: Test MADV_COLLAPSE on guest_memfd") Reported-by: Rick Edgecombe Closes: https://lore.kernel.org/all/20260427204313.50741-1-rick.p.edgecombe@intel.com Link: https://patch.msgid.link/20260428012503.1213654-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/access_tracking_perf_test.c | 2 +- tools/testing/selftests/kvm/guest_memfd_test.c | 2 +- tools/testing/selftests/kvm/include/kvm_syscalls.h | 10 ++++++++++ tools/testing/selftests/kvm/include/test_util.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 2 +- tools/testing/selftests/kvm/memslot_perf_test.c | 2 +- .../testing/selftests/kvm/s390/shared_zeropage_test.c | 3 +-- tools/testing/selftests/kvm/s390/tprot.c | 2 +- tools/testing/selftests/kvm/set_memory_region_test.c | 2 +- 9 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index e5bbdb5bbdc3..4415c94b2866 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -41,10 +41,10 @@ #include #include #include -#include #include #include +#include "kvm_syscalls.h" #include "kvm_util.h" #include "test_util.h" #include "memstress.h" diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 253e748c1d4a..832ef4dfb99f 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -14,10 +14,10 @@ #include #include #include -#include #include #include +#include "kvm_syscalls.h" #include "kvm_util.h" #include "numaif.h" #include "test_util.h" diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h index 843c9904c46f..067a4c9cf452 100644 --- a/tools/testing/selftests/kvm/include/kvm_syscalls.h +++ b/tools/testing/selftests/kvm/include/kvm_syscalls.h @@ -2,8 +2,18 @@ #ifndef SELFTEST_KVM_SYSCALLS_H #define SELFTEST_KVM_SYSCALLS_H +/* + * Include both the kernel and libc versions of mman.h. The kernel provides + * the most up-to-date flags and definitions, while libc provides the syscall + * wrappers tests expect. + */ +#include + +#include #include +#include + #define MAP_ARGS0(m,...) #define MAP_ARGS1(m,t,a,...) m(t,a) #define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__) diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index d9b433b834f1..a56271c237ae 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -19,9 +19,9 @@ #include #include #include -#include #include "kselftest.h" +#include #include #define msecs_to_usecs(msec) ((msec) * 1000ULL) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 2a76eca7029d..e08967ef7b7b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -5,13 +5,13 @@ * Copyright (C) 2018, Google LLC. */ #include "test_util.h" +#include "kvm_syscalls.h" #include "kvm_util.h" #include "processor.h" #include "ucall_common.h" #include #include -#include #include #include #include diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 3d02db371422..e977e979470f 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -23,6 +22,7 @@ #include #include +#include #include #include #include diff --git a/tools/testing/selftests/kvm/s390/shared_zeropage_test.c b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c index a9e5a01200b8..478381e6f84e 100644 --- a/tools/testing/selftests/kvm/s390/shared_zeropage_test.c +++ b/tools/testing/selftests/kvm/s390/shared_zeropage_test.c @@ -4,11 +4,10 @@ * * Copyright (C) 2024, Red Hat, Inc. */ -#include - #include #include "test_util.h" +#include "kvm_syscalls.h" #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" diff --git a/tools/testing/selftests/kvm/s390/tprot.c b/tools/testing/selftests/kvm/s390/tprot.c index 8054d2b178f0..d86179827a18 100644 --- a/tools/testing/selftests/kvm/s390/tprot.c +++ b/tools/testing/selftests/kvm/s390/tprot.c @@ -4,8 +4,8 @@ * * Copyright IBM Corp. 2021 */ -#include #include "test_util.h" +#include "kvm_syscalls.h" #include "kvm_util.h" #include "kselftest.h" #include "ucall_common.h" diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 9b919a231c93..e639a9db51ee 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -8,11 +8,11 @@ #include #include #include -#include #include #include +#include #include #include From fff82ea9d900b6bbebc58d34b7a63789de1ad10d Mon Sep 17 00:00:00 2001 From: Mikhail Gavrilov Date: Tue, 5 May 2026 04:54:35 +0500 Subject: [PATCH 02/22] x86/virt: Silence RCU lockdep splat in emergency virt callback path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit x86_virt_invoke_kvm_emergency_callback() reaches rcu_dereference() through machine_crash_shutdown() with IRQs disabled but with RCU not necessarily watching the crashing CPU, which triggers a suspicious RCU usage splat on debug kernels (CONFIG_PROVE_RCU=y) during panic/kdump: WARNING: suspicious RCU usage arch/x86/virt/hw.c:52 suspicious rcu_dereference_check() usage! rcu_scheduler_active = 2, debug_locks = 1 1 lock held by tee/11119: #0: ffff8881fa32c440 (sb_writers#3){.+.+}-{0:0}, at: ksys_write Call Trace: dump_stack_lvl+0x84/0xd0 lockdep_rcu_suspicious.cold+0x37/0x8f x86_virt_invoke_kvm_emergency_callback+0x5f/0x70 x86_svm_emergency_disable_virtualization_cpu+0x2a/0x30 x86_virt_emergency_disable_virtualization_cpu+0x6b/0x90 native_machine_crash_shutdown+0x72/0x170 __crash_kexec+0x137/0x280 panic+0xce/0xd0 sysrq_handle_crash+0x1f/0x20 __handle_sysrq.cold+0x192/0x335 write_sysrq_trigger+0x8c/0xc0 proc_reg_write+0x1c3/0x3c0 vfs_write+0x1d0/0xf80 ksys_write+0x116/0x250 do_syscall_64+0x11c/0x1480 entry_SYSCALL_64_after_hwframe+0x76/0x7e A truly correct fix is non-trivial: the RCU usage genuinely is wrong in panic context (RCU may ignore the crashing CPU during synchronization), and a concurrent KVM module unload could in principle race with the callback read; see commit 2baa33a8ddd6 ("KVM: x86: Leave user-return notifier registered on reboot/shutdown") which notes that nothing prevents module unload during panic/reboot. However, the alternatives are worse: - smp_store_release()/smp_load_acquire() handles ordering but not liveness; the kernel still needs to keep the module text alive while the callback is in flight. - Taking a lock in the panic path is risky — any lock could be held by a CPU that has already been NMI'd to a halt. Use rcu_dereference_raw() to silence the splat and accept the vanishingly small remaining race. Panic context inherently cannot guarantee complete correctness; the goal here is to keep debug builds quiet on the kdump path so the splat doesn't obscure the actual kernel state being captured. Reproducible on a debug kernel (CONFIG_PROVE_LOCKING=y, CONFIG_PROVE_RCU=y) with kvm_amd or kvm_intel loaded by triggering kdump: echo c > /proc/sysrq-trigger Suggested-by: Sean Christopherson Fixes: 428afac5a8ea ("KVM: x86: Move bulk of emergency virtualizaton logic to virt subsystem") Signed-off-by: Mikhail Gavrilov Acked-by: Sean Christopherson Link: https://patch.msgid.link/20260504235435.90957-1-mikhail.v.gavrilov@gmail.com Signed-off-by: Sean Christopherson --- arch/x86/virt/hw.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c index f647557d38ac..7e9091c640be 100644 --- a/arch/x86/virt/hw.c +++ b/arch/x86/virt/hw.c @@ -49,7 +49,20 @@ static void x86_virt_invoke_kvm_emergency_callback(void) { cpu_emergency_virt_cb *kvm_callback; - kvm_callback = rcu_dereference(kvm_emergency_callback); + /* + * RCU may not be watching the crashing CPU here, so rcu_dereference() + * triggers a suspicious-RCU-usage splat. In principle, a concurrent + * KVM module unload could race with this read; see commit 2baa33a8ddd6 + * ("KVM: x86: Leave user-return notifier registered on reboot/shutdown") + * which notes that nothing prevents module unload during panic/reboot. + * + * However, taking a lock here would be riskier than the current race: + * the system is going down via NMI shootdown, and any lock could be + * held by an already-stopped CPU. Use rcu_dereference_raw() to silence + * the lockdep splat and accept the comically small remaining race; + * panic context inherently cannot guarantee complete correctness. + */ + kvm_callback = rcu_dereference_raw(kvm_emergency_callback); if (kvm_callback) kvm_callback(); } From 8fe2e698fce4a95a3ac2c25fe59832a3c22534c6 Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Thu, 9 Apr 2026 22:22:26 +0800 Subject: [PATCH 03/22] KVM: x86: Rate-limit global clock updates on vCPU load commit 446fcce2a52b ("Revert "x86: kvm: rate-limit global clock updates"") dropped the rate limiting for KVM_REQ_GLOBAL_CLOCK_UPDATE. As a result, kvm_arch_vcpu_load() can queue global clock update requests every time a vCPU is scheduled when the master clock is disabled or when the vCPU is loaded for the first time. Restore the throttling with a per-VM ratelimit state and gate KVM_REQ_GLOBAL_CLOCK_UPDATE through __ratelimit(), so frequent vCPU scheduling does not generate a steady stream of redundant clock update requests. Fixes: 446fcce2a52b ("Revert "x86: kvm: rate-limit global clock updates"") Signed-off-by: Lei Chen Reported-by: Jaroslav Pulchart Closes: https://lore.kernel.org/all/CAK8fFZ5gY8_Mw2A=iZVFNVKQNrXQzVsn-HTd+Me9K6ZfmdgA+Q@mail.gmail.com/ Link: https://patch.msgid.link/20260409142226.2581-1-lei.chen@smartx.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c470e40a00aa..f14009f25a3b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1504,6 +1504,7 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; + struct ratelimit_state kvmclock_update_rs; #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a1b63c63d1a..e01d6984ed04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5227,8 +5227,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration */ - if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) { + if (__ratelimit(&vcpu->kvm->arch.kvmclock_update_rs)) + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + else + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; @@ -13366,6 +13371,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); + ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10); + ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, RATELIMIT_MSG_ON_RELEASE); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); From 34065a5f3cf94886e59e2a8b5db00515f32d6cf2 Mon Sep 17 00:00:00 2001 From: Hisam Mehboob Date: Thu, 9 Apr 2026 20:38:47 +0500 Subject: [PATCH 04/22] KVM: selftests: Guard execinfo.h inclusion for non-glibc builds The backtrace() function and execinfo.h are GNU extensions available in glibc but not in non-glibc C libraries such as musl. Building KVM selftests with musl-gcc fails with: lib/assert.c:9:10: fatal error: execinfo.h: No such file or directory Fix this by guarding the inclusion of execinfo.h and the stack dumping logic under #ifdef __GLIBC__. For non-glibc builds, provide a local stub for test_dump_stack(). Suggested-by: Aqib Faruqui Suggested-by: Sean Christopherson Signed-off-by: Hisam Mehboob Link: https://patch.msgid.link/20260409153846.1502656-2-hisamshar@gmail.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/assert.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c index b49690658c60..8be0d09ecf0f 100644 --- a/tools/testing/selftests/kvm/lib/assert.c +++ b/tools/testing/selftests/kvm/lib/assert.c @@ -6,11 +6,14 @@ */ #include "test_util.h" -#include + #include #include "kselftest.h" +#ifdef __GLIBC__ +#include + /* Dumps the current stack trace to stderr. */ static void __attribute__((noinline)) test_dump_stack(void); static void test_dump_stack(void) @@ -57,6 +60,9 @@ static void test_dump_stack(void) system(cmd); #pragma GCC diagnostic pop } +#else +static void test_dump_stack(void) {} +#endif static pid_t _gettid(void) { From 373452ac0649846431ca0f88574a2fa6382d2045 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 20 May 2026 23:08:30 +0100 Subject: [PATCH 05/22] KVM: arm64: Fix CONFIG_PKVM_DISABLE_STAGE2_ON_PANIC A typo in the config guard in __hyp_do_panic broke the stage-2 disabling and made backtraces for pKVM quite unreliable. Fix that typo. Fixes: 9019e82c7e46 ("KVM: arm64: Add PKVM_DISABLE_STAGE2_ON_PANIC") Signed-off-by: Vincent Donnefort Link: https://patch.msgid.link/20260520220830.273289-1-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/host.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index f337770ec459..9393fe3ea6a1 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -120,7 +120,7 @@ SYM_FUNC_START(__hyp_do_panic) mov x29, x0 -#ifdef PKVM_DISABLE_STAGE2_ON_PANIC +#ifdef CONFIG_PKVM_DISABLE_STAGE2_ON_PANIC /* Ensure host stage-2 is disabled */ mrs x0, hcr_el2 bic x0, x0, #HCR_VM From b60621c5121c9435eda99af7dc2100f5c0f88695 Mon Sep 17 00:00:00 2001 From: Emily Ehlert Date: Mon, 18 May 2026 13:59:56 +0000 Subject: [PATCH 06/22] KVM: x86: Fix ERAPS RAP clear on INVPCID single-context invalidation Use kvm_register_mark_dirty() instead of kvm_register_is_dirty() to actually mark VCPU_EXREG_ERAPS as dirty when emulating INVPCID_TYPE_SINGLE_CTXT. kvm_register_is_dirty() is a read-only predicate whose return value is discarded, making the call a no-op. Without this fix, a single-context INVPCID will not trigger a RAP clear on the next VMRUN, breaking the ERAPS security guarantee. Fixes: db5e82496492 ("KVM: SVM: Virtualize and advertise support for ERAPS") Signed-off-by: Emily Ehlert Link: https://patch.msgid.link/20260518135956.82569-1-ehemily@amazon.de Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e01d6984ed04..108318e1b3f0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -14330,7 +14330,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) * the RAP (Return Address Predicator). */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) - kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS); kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); From a9e18aa3263f356edae305e29830e5fe63d8597a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 15 May 2026 10:15:36 -0700 Subject: [PATCH 07/22] KVM: SVM: Flush the current TLB when transitioning from xAVIC => x2AVIC Flush the current TLB when xAVIC *or* x2AVIC is activated, as KVM is (apparently) responsible for purging TLB entries when transitioning from xAVIC to x2AVIC. The APM says a whole lot of nothing about TLB flushing with respect to (x2)AVIC, but empirical data strongly suggests hardware also does a whole lot of nothing. Failure to flush the TLB when enabling x2AVIC can lead to guest accesses to the APIC base address getting incorrectly redirected to the virtual APIC page. The flaw most visibly manifests as failures in KVM-Unit-Test's verify_disabled_apic_mmio() testcase when x2APIC is enabled (though for reasons unknown, the test only reliably fails with EFI builds). Fixes: 0ccf3e7cb95a ("KVM: SVM: Flush the "current" TLB when activating AVIC") Fixes: 4d1d7942e36a ("KVM: SVM: Introduce logic to (de)activate x2AVIC mode") Cc: stable@vger.kernel.org Cc: Naveen N Rao (AMD) Link: https://patch.msgid.link/20260515171536.1841645-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index adf211860949..e8bd60156941 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -206,6 +206,35 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); + /* + * Flush the TLB when enabling (x2)AVIC and when transitioning between + * xAVIC and x2AVIC, as the CPU may have inserted a TLB entry for the + * "wrong" mapping. + * + * KVM uses a per-VM "scratch" page to back the APIC memslot, because + * KVM also uses per-VM page tables *and* maintains the page table (NPT + * or shadow page) mappings for said memslot even if one or more vCPUs + * have their local APIC hardware-disabled or are in x2APIC mode, i.e. + * even if one or more vCPUs' APIC MMIO BAR is effectively disabled. + * + * If xAVIC is fully enabled, hardware ignores the physical address in + * KVM's page tables, i.e. in the leaf SPTE for the APIC memslot, and + * instead redirects the access to the AVIC backing page, i.e. to the + * vCPU's virtual APIC page. If xAVIC is not enabled (APIC is either + * hardware-disabled or in x2APIC mode), then guest accesses will use + * the page table mapping verbatim, i.e. will access the per-VM scratch + * page, as normal memory. + * + * In both cases, the CPU is allowed to cache TLB entries for the APIC + * base GPA. So, KVM needs to flush the TLB when enabling xAVIC, as + * accesses need to be redirected to the virtual APIC page, but the TLB + * may contain entries pointing at the scratch page. KVM also needs to + * flush the TLB when enabling x2AVIC, as accesses need to go to the + * scratch page, but the TLB may contain entries tagged as xAVIC, i.e. + * entries pointing to the vCPU's virtual APIC page. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); + /* * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR * accesses, while interrupt injection to a running vCPU can be @@ -219,12 +248,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) /* Disabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, false); } else { - /* - * Flush the TLB, the guest may have inserted a non-APIC - * mapping into the TLB while AVIC was disabled. - */ - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); - /* Enabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, true); } From 1750ad1388e03fb27068cd1f22c9c8b4590fe936 Mon Sep 17 00:00:00 2001 From: Qiang Ma Date: Tue, 26 May 2026 15:46:40 +0800 Subject: [PATCH 08/22] KVM: arm64: PMU: Preserve AArch32 counter low bits AArch32 writes to PMU event counters cannot update the top 32 bits, even when PMUv3p5 makes the counters 64-bit. KVM therefore needs to preserve the existing high half and only update the low half written by the guest, unless the caller explicitly forces a full reset through PMCR.P. The current code masks @val down to the old high half before taking lower_32_bits(val), which means the low half is always zero. As a result, AArch32 writes to event counters discard the guest-provided low 32 bits instead of storing them. Build the new value from the old high 32 bits and the low 32 bits of the value supplied by the guest. Fixes: 26d2d0594d70 ("KVM: arm64: PMU: Do not let AArch32 change the counters' top 32 bits") Signed-off-by: Qiang Ma Signed-off-by: Marc Zyngier Link: https://patch.msgid.link/20260526074640.791991-1-maqianga@uniontech.com Cc: stable@vger.kernel.org --- arch/arm64/kvm/pmu-emul.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e1860acae641..c816db5d6761 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -174,8 +174,8 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) * action is to use PMCR.P, which will reset them to * 0 (the only use of the 'force' parameter). */ - val = __vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32); - val |= lower_32_bits(val); + val = (__vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32)) | + lower_32_bits(val); } __vcpu_assign_sys_reg(vcpu, reg, val); From 0f7abb6eaa3c3965f925e231c18409dac4f5a0c1 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 21 May 2026 13:46:11 +0100 Subject: [PATCH 09/22] KVM: arm64: Fix meta-page unsharing in pKVM hyp tracing As the hyp_trace_buffer_unshare_hyp() function name suggests we should unshare all the previously shared pages, otherwise we leak hyp-shared pages which won't be reusable for hyp memory. Fix the typo by calling __unshare_page() on the meta-page, ensuring all previously shared pages are correctly unshared. Fixes: 3aed038aac8d ("KVM: arm64: Add trace remote for the nVHE/pKVM hyp") Signed-off-by: Vincent Donnefort Link: https://patch.msgid.link/20260521124613.911067-2-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c index 8b7f2bf2fba8..06805b426101 100644 --- a/arch/arm64/kvm/hyp_trace.c +++ b/arch/arm64/kvm/hyp_trace.c @@ -189,7 +189,7 @@ static void hyp_trace_buffer_unshare_hyp(struct hyp_trace_buffer *trace_buffer, if (cpu > last_cpu) break; - __share_page(rb_desc->meta_va); + __unshare_page(rb_desc->meta_va); for (p = 0; p < rb_desc->nr_page_va; p++) __unshare_page(rb_desc->page_va[p]); } From a23780ea9db3f3cadbb52ff6151384bff89d95d2 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 21 May 2026 13:46:12 +0100 Subject: [PATCH 10/22] KVM: arm64: Fix rollback in hyp_trace_buffer_share_hyp() When sharing the trace buffer with the hypervisor, if sharing a page fails, the rollback path in hyp_trace_buffer_share_hyp() misses unsharing the metadata page (meta_va) which was successfully shared before entering the page sharing loop. Additionally, if a failure occurs, the cleanup calls hyp_trace_buffer_unshare_hyp() with an incorrect CPU index. Since that CPU's pages were already rolled back locally in the loop, this leads to duplicate unsharing attempts. Fix both issues affecting the rollback. Fixes: 3aed038aac8d ("KVM: arm64: Add trace remote for the nVHE/pKVM hyp") Reported-by: Sashiko Signed-off-by: Vincent Donnefort Link: https://patch.msgid.link/20260521124613.911067-3-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp_trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c index 06805b426101..8595f9bdb3dc 100644 --- a/arch/arm64/kvm/hyp_trace.c +++ b/arch/arm64/kvm/hyp_trace.c @@ -212,14 +212,15 @@ static int hyp_trace_buffer_share_hyp(struct hyp_trace_buffer *trace_buffer) } if (ret) { - for (p--; p >= 0; p--) + while (--p >= 0) __unshare_page(rb_desc->page_va[p]); + __unshare_page(rb_desc->meta_va); break; } } if (ret) - hyp_trace_buffer_unshare_hyp(trace_buffer, cpu--); + hyp_trace_buffer_unshare_hyp(trace_buffer, --cpu); return ret; } From adae9996c04fea3b1791099b6d79e1df76d50849 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 21 May 2026 13:46:13 +0100 Subject: [PATCH 11/22] KVM: arm64: Fix memory leak in hyp_trace_unload() During trace remote loading, hyp_trace_load() allocates the descriptor pages but fails to store the allocated size in trace_buffer->desc_size. As a result, when unloading the trace buffer, hyp_trace_unload() calls free_pages_exact() with a size of 0 which fails to free the memory. Fix this by updating the descriptor size in trace_buffer->desc_size. Fixes: 3aed038aac8d ("KVM: arm64: Add trace remote for the nVHE/pKVM hyp") Reported-by: Sashiko Signed-off-by: Vincent Donnefort Link: https://patch.msgid.link/20260521124613.911067-4-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp_trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/hyp_trace.c b/arch/arm64/kvm/hyp_trace.c index 8595f9bdb3dc..c4b3ee552131 100644 --- a/arch/arm64/kvm/hyp_trace.c +++ b/arch/arm64/kvm/hyp_trace.c @@ -249,6 +249,7 @@ static struct trace_buffer_desc *hyp_trace_load(unsigned long size, void *priv) goto err_free_desc; trace_buffer->desc = desc; + trace_buffer->desc_size = desc_size; ret = hyp_trace_buffer_alloc_bpages_backing(trace_buffer, size); if (ret) @@ -298,6 +299,7 @@ static void hyp_trace_unload(struct trace_buffer_desc *desc, void *priv) hyp_trace_buffer_free_bpages_backing(trace_buffer); free_pages_exact(trace_buffer->desc, trace_buffer->desc_size); trace_buffer->desc = NULL; + trace_buffer->desc_size = 0; } static int hyp_trace_enable_tracing(bool enable, void *priv) From 83726330748981372bde86ed5411d7b306612991 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 29 May 2026 00:01:44 +0100 Subject: [PATCH 12/22] KVM: arm64: Correctly cap ZCR_EL2 provided by a guest hypervisor ZCR_EL2 can be updated by a VHE guest hypervisor either using ZCR_EL2 (which traps) or ZCR_EL1 (which does not trap). KVM handles both in different way: - on ZCR_EL2 trap, ZCR_EL2.LEN is immediately capped at the VM's own VL limit. This has the potential to break existing SW that relies on the full LEN field to be stateful. - on ZCR_EL1 access, we do absolutely nothing. On restoring the SVE context for an L2 guest, we directly restore the guest hypervisor's view of ZCR_EL2 into the physical ZCR_EL2. If the guest's view of the register was updated using the ZCR_EL2 accessor, the value has already been sanitised (with the caveat mentioned above). But if the guest used ZCR_EL1, the raw value is written into the HW, and the L2 guest can now access VLs that it shouldn't. Fix all the above by moving the VL capping to the restore points, ensuring that: - the HW is always programmed with a capped value, irrespective of the accessor being used, - the ZCR_EL2.LEN field is always completely stateful, irrespective of the accessor being used. Additionally, move ZCR_EL2 to be a sanitised register, ensuring that only the LEN field is actually stateful. This requires some creative construction of the RES0 mask, as the sysreg generation script does not yet generate RAZ/WI fields. Fixes: b3d29a823099 ("KVM: arm64: nv: Handle ZCR_EL2 traps") Signed-off-by: Mark Brown Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260529-kvm-arm64-fix-zcr-len-nv-v2-1-86cad51992bd@kernel.org [maz: rewrote commit message, tidy up access_zcr_el2()] Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/hyp/include/hyp/switch.h | 16 ++++++++++------ arch/arm64/kvm/nested.c | 5 +++++ arch/arm64/kvm/sys_regs.c | 11 +++-------- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 65eead8362e0..a49042bfa801 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -511,7 +511,6 @@ enum vcpu_sysreg { ACTLR_EL2, /* Auxiliary Control Register (EL2) */ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ HACR_EL2, /* Hypervisor Auxiliary Control Register */ - ZCR_EL2, /* SVE Control Register (EL2) */ TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */ TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */ TCR_EL2, /* Translation Control Register (EL2) */ @@ -543,6 +542,7 @@ enum vcpu_sysreg { SCTLR2_EL2, /* System Control Register 2 (EL2) */ MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ + ZCR_EL2, /* SVE Control Register (EL2) */ /* Any VNCR-capable reg goes after this point */ MARKER(__VNCR_START__), diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index bf0eb5e43427..320cd45d49c5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -462,11 +462,13 @@ static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code) static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) { + u64 zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + /* * The vCPU's saved SVE state layout always matches the max VL of the * vCPU. Start off with the max VL so we can load the SVE state. */ - sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); + sve_cond_update_zcr_vq(zcr_el2, SYS_ZCR_EL2); __sve_restore_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); @@ -476,8 +478,10 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) * nested guest, as the guest hypervisor could select a smaller VL. Slap * that into hardware before wrapping up. */ - if (is_nested_ctxt(vcpu)) - sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2); + if (is_nested_ctxt(vcpu)) { + zcr_el2 = min(zcr_el2, __vcpu_sys_reg(vcpu, ZCR_EL2)); + sve_cond_update_zcr_vq(zcr_el2, SYS_ZCR_EL2); + } write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR); } @@ -501,11 +505,11 @@ static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu) return; if (vcpu_has_sve(vcpu)) { + zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + /* A guest hypervisor may restrict the effective max VL. */ if (is_nested_ctxt(vcpu)) - zcr_el2 = __vcpu_sys_reg(vcpu, ZCR_EL2); - else - zcr_el2 = vcpu_sve_max_vq(vcpu) - 1; + zcr_el2 = min(zcr_el2, __vcpu_sys_reg(vcpu, ZCR_EL2)); write_sysreg_el2(zcr_el2, SYS_ZCR); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 883b6c1008fb..38f672e94087 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1834,6 +1834,11 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) resx.res1 = VNCR_EL2_RES1; set_sysreg_masks(kvm, VNCR_EL2, resx); + /* ZCR_EL2 - bits 8:4 are RAZ/WI so treat them as RES0 */ + resx.res0 = ZCR_ELx_RES0 | GENMASK_ULL(8, 4); + resx.res1 = ZCR_ELx_RES1; + set_sysreg_masks(kvm, ZCR_EL2, resx); + out: for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) __vcpu_rmw_sys_reg(vcpu, sr, |=, 0); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 148fc3400ea8..fa5c93c7a135 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2862,21 +2862,16 @@ static bool access_zcr_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - unsigned int vq; - if (guest_hyp_sve_traps_enabled(vcpu)) { kvm_inject_nested_sve_trap(vcpu); return false; } - if (!p->is_write) { + if (!p->is_write) p->regval = __vcpu_sys_reg(vcpu, ZCR_EL2); - return true; - } + else + __vcpu_assign_sys_reg(vcpu, ZCR_EL2, p->regval); - vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1; - vq = min(vq, vcpu_sve_max_vq(vcpu)); - __vcpu_assign_sys_reg(vcpu, ZCR_EL2, vq - 1); return true; } From db3f2195d29344a3cf1e9dd9ab7f21ced7308cf7 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Fri, 1 May 2026 13:22:26 -0700 Subject: [PATCH 13/22] KVM: SEV: Require in-GHCB scratch area if GHCB v2+ is in use As per the GHCB spec, when using GHCB v2+ require the software scratch area to reside in the GHCB's shared buffer. Note, things like Page State Change (PSC) requests _rely_ on this behavior, as the guest can't provide a length when making the request, i.e. the size of the guest payload is bounded by the size of the shared buffer. Failure to force usage of the GHCB, and a slew of other flaws, lets a malicious SNP guest corrupt host kernel heap memory, and leak host heap layout information. setup_vmgexit_scratch() allocates a buffer via kvzalloc(exit_info_2), where exit_info_2 is guest-controlled. With exit_info_2=24, this yields a 24-byte allocation in kmalloc-cg-32 (32-byte slab objects). The buffer holds an 8-byte psc_hdr followed by 8-byte psc_entry structs, so only entries[0] and entries[1] are in-bounds. snp_begin_psc() validates end_entry against VMGEXIT_PSC_MAX_COUNT (253) but NOT against the actual buffer size: idx_end = hdr->end_entry; if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { // checks 253, not buffer snp_complete_psc(svm, ...); return 1; } for (idx = idx_start; idx <= idx_end; idx++) { entry_start = entries[idx]; // OOB when idx >= 2 The guest sets end_entry=10+, causing the host to iterate entries[2+] which are OOB into adjacent slab objects. For each OOB entry: - The host reads 8 bytes (OOB READ / info leak oracle) - If the data passes PSC validation, __snp_complete_one_psc() writes cur_page = 1 or 512 into the entry (OOB WRITE, sev.c:3806) - If validation fails, the error response reveals whether adjacent memory is zero vs non-zero (information disclosure to guest) The guest controls allocation size (exit_info_2), entry range (cur_entry/end_entry), and can fire unlimited VMGEXITs to repeatedly hit different slab positions. By exploiting the variety of bugs, a malicious SEV-SNP guest can: - OOB read adjacent kmalloc-cg-32 objects (heap layout disclosure) - OOB write cur_page bits into adjacent objects (heap corruption) - Trigger use-after-free conditions across VMGEXITs E.g. with KASAN enabled, a single insmod of the PoC guest module produces 73 KASAN reports: BUG: KASAN: slab-out-of-bounds in snp_begin_psc+0x126/0x890 Read of size 8 at addr ffff888219ffb5e0 by task qemu-system-x86/2199 BUG: KASAN: slab-out-of-bounds in snp_begin_psc+0x468/0x890 Write of size 8 at addr ffff888351566648 by task qemu-system-x86/2199 The buggy address belongs to the object at ffff888XXXXXXXXX which belongs to the cache kmalloc-cg-32 of size 32 The buggy address is located N bytes to the right of allocated 32-byte region [ffff888XXXXXXXXX, ffff888XXXXXXXXX) Breakdown: 62 slab-out-of-bounds (reads + writes past allocation) 7 slab-use-after-free 4 use-after-free All credit to Stan for the wonderful description and reproducer! Reported-by: Stan Shaw Cc: Michael Roth Cc: Tom Lendacky Cc: Peter Gonda Cc: Jacky Li Fixes: 4af663c2f64a ("KVM: SEV: Allow per-guest configuration of GHCB protocol version") Cc: stable@vger.kernel.org Signed-off-by: Michael Roth [sean: write changelog] Reviewed-by: Tom Lendacky Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c2126b3c3072..23170b64f4a3 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3703,6 +3703,10 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) scratch_va = (void *)svm->sev_es.ghcb; scratch_va += (scratch_gpa_beg - control->ghcb_gpa); } else { + /* GHCB v2 requires the scratch area to be within the GHCB. */ + if (to_kvm_sev_info(svm->vcpu.kvm)->ghcb_version >= 2) + goto e_scratch; + /* * The guest memory must be read into a kernel buffer, so * limit the size From 1aa8a6dc7dac8b83234b53518311bf78231f4fa5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2026 13:22:27 -0700 Subject: [PATCH 14/22] KVM: SEV: Ignore MMIO requests of length '0' Explicitly ignore MMIO requests of length '0', so that setting up the software scratch area (and other code) doesn't have to worry about underflowing the length, and to allow for special casing '0' in the future. Fixes: 8f423a80d299 ("KVM: SVM: Support MMIO for an SEV-ES guest") Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 23170b64f4a3..fb2174b6d1ba 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4497,13 +4497,17 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) case SVM_VMGEXIT_MMIO_READ: case SVM_VMGEXIT_MMIO_WRITE: { bool is_write = control->exit_code == SVM_VMGEXIT_MMIO_WRITE; + u64 len = control->exit_info_2; - ret = setup_vmgexit_scratch(svm, !is_write, control->exit_info_2); + if (!len) + return 1; + + ret = setup_vmgexit_scratch(svm, !is_write, len); if (ret) break; - ret = kvm_sev_es_mmio(vcpu, is_write, control->exit_info_1, - control->exit_info_2, svm->sev_es.ghcb_sa); + ret = kvm_sev_es_mmio(vcpu, is_write, control->exit_info_1, len, + svm->sev_es.ghcb_sa); break; } case SVM_VMGEXIT_NMI_COMPLETE: From dcf1b2d4b0564a27e4ca7c654871aab4f9620046 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2026 13:22:28 -0700 Subject: [PATCH 15/22] KVM: SEV: Reject MMIO requests larger than 8 bytes with GHCB v2+ When using GHCB v2+, reject MMIO requests that are larger than 8 bytes. Per the GHCB spec: SW_EXITINFO2 must be less than or equal to 0x7fffffff for version 1 and less than or equal to 0x8 for all other versions. Fixes: 4af663c2f64a ("KVM: SEV: Allow per-guest configuration of GHCB protocol version") Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index fb2174b6d1ba..e6579ca9f364 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4502,6 +4502,11 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) if (!len) return 1; + if (to_kvm_sev_info(vcpu->kvm)->ghcb_version >= 2 && len > 8) { + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); + return 1; + } + ret = setup_vmgexit_scratch(svm, !is_write, len); if (ret) break; From 3988bd2723de407ae90fa7a6f6029b4e60238c58 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2026 13:22:29 -0700 Subject: [PATCH 16/22] KVM: SEV: Ignore Port I/O requests of length '0' Explicitly ignore Port I/O requests of length '0' (or count '0'), so that setting up the software scratch area (and other code) doesn't have to worry about underflowing the length, and to allow for WARNing on trying to configure the scratch area with len==0. Fixes: 291bd20d5d88 ("KVM: SVM: Add initial support for a VMGEXIT VMEXIT") Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e6579ca9f364..52703c954856 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4585,6 +4585,11 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) control->exit_info_1, control->exit_info_2); ret = -EINVAL; break; + case SVM_EXIT_IOIO: + if (!((control->exit_info_1 & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT)) + return 1; + + fallthrough; default: ret = svm_invoke_exit_handler(vcpu, control->exit_code); } @@ -4605,6 +4610,9 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) if (unlikely(check_mul_overflow(count, size, &bytes))) return -EINVAL; + if (!bytes) + return 1; + r = setup_vmgexit_scratch(svm, in, bytes); if (r) return r; From 2be54670bdc017004c4a4b8bddb6ff02ebe7dbe2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2026 13:22:30 -0700 Subject: [PATCH 17/22] KVM: SEV: Use the size of the PSC header as the minimum size for PSC requests When handling a Page State Change (PSC) #VMGEXIT use the size of the PSC header as the minimum size for the scratch area. Per the GHCB spec, PSC requests do NOT provide the length, i.e. using control->exit_info_2 for the length is completely made up behavior. The existing code "works", e.g. even though Linux-as-a-guest always passes '0', because KVM doesn't do anything with the length when the request is in the GHCB's shared buffer. Use the header as the min length. Once the header is retrieved, KVM can use the specified indices to compute the full size of the request. Fixes: 9b54e248d264 ("KVM: SEV: Add support to handle Page State Change VMGEXIT") Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Reviewed-by: Michael Roth Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 52703c954856..cbb3040e0778 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4559,7 +4559,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) vcpu->run->system_event.data[0] = control->ghcb_gpa; break; case SVM_VMGEXIT_PSC: - ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); + ret = setup_vmgexit_scratch(svm, true, sizeof(struct psc_hdr)); if (ret) break; From 5867d7e202e09f037cefe77f7af4413c7c0fa088 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2026 13:22:31 -0700 Subject: [PATCH 18/22] KVM: SEV: Compute the correct max length of the in-GHCB scratch area When setting the length of the GHCB scratch area, and the area is in the GHCB shared buffer, set the effective length of the scratch area to the max possible size given the start of the guest-provided pointer, and the end of the shared buffer. The code was "fine" when first introduced, as KVM doesn't consult the length of the buffer when emulating MMIO, because the passed in @len always specifies the *max* size required. But for PSC requests, the incoming @len is just the minimum length (to process the header), and KVM needs to know the full size of the scratch area to avoid buffer overflows (spoiler alert). Opportunistically rename @len => @min_len to better reflect its role. Fixes: 9b54e248d264 ("KVM: SEV: Add support to handle Page State Change VMGEXIT") Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Reviewed-by: Michael Roth Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index cbb3040e0778..6072fecfe994 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3662,7 +3662,7 @@ int pre_sev_run(struct vcpu_svm *svm, int cpu) } #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) -static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) +static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 min_len) { struct vmcb_control_area *control = &svm->vmcb->control; u64 ghcb_scratch_beg, ghcb_scratch_end; @@ -3675,10 +3675,10 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) goto e_scratch; } - scratch_gpa_end = scratch_gpa_beg + len; + scratch_gpa_end = scratch_gpa_beg + min_len; if (scratch_gpa_end < scratch_gpa_beg) { pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", - len, scratch_gpa_beg); + min_len, scratch_gpa_beg); goto e_scratch; } @@ -3702,6 +3702,8 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) scratch_va = (void *)svm->sev_es.ghcb; scratch_va += (scratch_gpa_beg - control->ghcb_gpa); + + svm->sev_es.ghcb_sa_len = ghcb_scratch_end - scratch_gpa_beg; } else { /* GHCB v2 requires the scratch area to be within the GHCB. */ if (to_kvm_sev_info(svm->vcpu.kvm)->ghcb_version >= 2) @@ -3711,16 +3713,16 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) * The guest memory must be read into a kernel buffer, so * limit the size */ - if (len > GHCB_SCRATCH_AREA_LIMIT) { + if (min_len > GHCB_SCRATCH_AREA_LIMIT) { pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", - len, GHCB_SCRATCH_AREA_LIMIT); + min_len, GHCB_SCRATCH_AREA_LIMIT); goto e_scratch; } - scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); + scratch_va = kvzalloc(min_len, GFP_KERNEL_ACCOUNT); if (!scratch_va) return -ENOMEM; - if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { + if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, min_len)) { /* Unable to copy scratch area from guest */ pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); @@ -3736,11 +3738,10 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) */ svm->sev_es.ghcb_sa_sync = sync; svm->sev_es.ghcb_sa_free = true; + svm->sev_es.ghcb_sa_len = min_len; } svm->sev_es.ghcb_sa = scratch_va; - svm->sev_es.ghcb_sa_len = len; - return 0; e_scratch: From f185e05dce6f170f83c4ba602e969b1c3c7a22e6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2026 13:22:32 -0700 Subject: [PATCH 19/22] KVM: SEV: WARN if KVM attempts to setup scratch area with min_len==0 Now that all paths in KVM properly validate the length needed for the scratch area, and are guaranteed to pass in a non-zero length, WARN if KVM attempts to configured the scratch area with min_len==0 to guard against future bugs. Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Reviewed-by: Michael Roth Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 6072fecfe994..a3e85348ace9 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3669,6 +3669,9 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 min_len) u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; + if (WARN_ON_ONCE(!min_len)) + goto e_scratch; + scratch_gpa_beg = svm->sev_es.sw_scratch; if (!scratch_gpa_beg) { pr_err("vmgexit: scratch gpa not provided\n"); From ebe4b2dc9cfbfb2d8f665667c4d08f4c6c9bec05 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2026 13:22:33 -0700 Subject: [PATCH 20/22] KVM: SEV: Don't explicitly pass PSC buffer to snp_begin_psc() Stop explicitly passing the PSC buffer to snp_begin_psc(): it *must* be the scratch area. This will allow fixing a variety of bugs without further complicating the code. No functional change intended. Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Reviewed-by: Michael Roth Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index a3e85348ace9..8577451b82b2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3841,7 +3841,7 @@ struct psc_buffer { struct psc_entry entries[]; } __packed; -static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc); +static int snp_begin_psc(struct vcpu_svm *svm); static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) { @@ -3883,7 +3883,6 @@ static void __snp_complete_one_psc(struct vcpu_svm *svm) static int snp_complete_one_psc(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct psc_buffer *psc = svm->sev_es.ghcb_sa; if (vcpu->run->hypercall.ret) { snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); @@ -3893,11 +3892,13 @@ static int snp_complete_one_psc(struct kvm_vcpu *vcpu) __snp_complete_one_psc(svm); /* Handle the next range (if any). */ - return snp_begin_psc(svm, psc); + return snp_begin_psc(svm); } -static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) +static int snp_begin_psc(struct vcpu_svm *svm) { + struct vcpu_sev_es_state *sev_es = &svm->sev_es; + struct psc_buffer *psc = sev_es->ghcb_sa; struct psc_entry *entries = psc->entries; struct kvm_vcpu *vcpu = &svm->vcpu; struct psc_hdr *hdr = &psc->hdr; @@ -4567,7 +4568,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) if (ret) break; - ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); + ret = snp_begin_psc(svm); break; case SVM_VMGEXIT_AP_CREATION: ret = sev_snp_ap_creation(svm); From 121d88de56bc5c0ba0ce2f6381af67f948a7e7c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2026 13:22:34 -0700 Subject: [PATCH 21/22] KVM: SEV: Check PSC request indices against the actual size of the buffer When processing Page State Change (PSC) requests, validate the PSC buffer against the effective size of the scratch area, which could be less than the maximum size if the guest provided a pointer that isn't exactly at the start of the GHCB shared buffer. Fixes: 9b54e248d264 ("KVM: SEV: Add support to handle Page State Change VMGEXIT") Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Reviewed-by: Michael Roth Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8577451b82b2..6e8cbae2135a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3903,7 +3903,7 @@ static int snp_begin_psc(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; struct psc_hdr *hdr = &psc->hdr; struct psc_entry entry_start; - u16 idx, idx_start, idx_end; + u16 idx, idx_start, idx_end, max_nr_entries; int npages; bool huge; u64 gfn; @@ -3913,6 +3913,19 @@ static int snp_begin_psc(struct vcpu_svm *svm) return 1; } + /* + * GHCB v2 requires the scratch area to reside within the GHCB itself, + * and PSC requests are only supported for GHCB v2+. Thus it should be + * impossible to exceed the max PSC entry count (which is derived from + * the size of the shared GHCB buffer). + */ + max_nr_entries = (sev_es->ghcb_sa_len - sizeof(struct psc_hdr)) / + sizeof(struct psc_entry); + if (WARN_ON_ONCE(max_nr_entries > VMGEXIT_PSC_MAX_COUNT)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + next_range: /* There should be no other PSCs in-flight at this point. */ if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { @@ -3928,7 +3941,7 @@ static int snp_begin_psc(struct vcpu_svm *svm) idx_start = hdr->cur_entry; idx_end = hdr->end_entry; - if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { + if (idx_end >= max_nr_entries) { snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); return 1; } From c8cc238093ca6c99267032f6cfe78f59389f3157 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2026 13:22:35 -0700 Subject: [PATCH 22/22] KVM: SEV: Use READ_ONCE() when reading entries/indices from PSC buffer Use READ_ONCE() when reading entries/indices from the guest-accessible Page State Change buffer to defend against TOCTOU bugs. Don't bother with READ_ONCE()/WRITE_ONCE() for cases where KVM is writing (and not consuming the result!), as the guest isn't supposed to touch the buffer while it's being processed. I.e. using READ_ONCE() is all about protecting against misbehaving guests. Fixes: 9b54e248d264 ("KVM: SEV: Add support to handle Page State Change VMGEXIT") Cc: stable@vger.kernel.org Reviewed-by: Tom Lendacky Signed-off-by: Sean Christopherson Message-ID: <20260501202250.2115252-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 6e8cbae2135a..62b5befe0eed 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3872,9 +3872,9 @@ static void __snp_complete_one_psc(struct vcpu_svm *svm) */ for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; svm->sev_es.psc_inflight--, idx++) { - struct psc_entry *entry = &entries[idx]; + struct psc_entry entry = READ_ONCE(entries[idx]); - entry->cur_page = entry->pagesize ? 512 : 1; + entries[idx].cur_page = entry.pagesize ? 512 : 1; } hdr->cur_entry = idx; @@ -3938,8 +3938,8 @@ static int snp_begin_psc(struct vcpu_svm *svm) * validation, so take care to only use validated copies of values used * for things like array indexing. */ - idx_start = hdr->cur_entry; - idx_end = hdr->end_entry; + idx_start = READ_ONCE(hdr->cur_entry); + idx_end = READ_ONCE(hdr->end_entry); if (idx_end >= max_nr_entries) { snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); @@ -3948,7 +3948,7 @@ static int snp_begin_psc(struct vcpu_svm *svm) /* Find the start of the next range which needs processing. */ for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { - entry_start = entries[idx]; + entry_start = READ_ONCE(entries[idx]); gfn = entry_start.gfn; huge = entry_start.pagesize; @@ -3992,7 +3992,7 @@ static int snp_begin_psc(struct vcpu_svm *svm) * KVM_HC_MAP_GPA_RANGE exit. */ while (++idx <= idx_end) { - struct psc_entry entry = entries[idx]; + struct psc_entry entry = READ_ONCE(entries[idx]); if (entry.operation != entry_start.operation || entry.gfn != entry_start.gfn + npages ||