KVM x86 fixes for 7.1-rcN

- Include the kernel's linux/mman.h in KVM selftests to ensure MADV_COLLAPSE
    is defined, as older libc versions may not provide it.
 
  - Include execinfo.h if and only if KVM selftests are building against glibc,
    and provide a test_dump_stack() for non-glibc builds.
 
  - Fudge around an RCU splat in the emergency reboot code that is technically
    a legitimate flaw, but in practice is a non-issue and fixing the flaw, e.g.
    by adding locking, would incur meaningful risk, i.e. do more harm than good.
 
  - Rate-limit global clock updates once again (but without delayed work), as
    KVM was subtly relying on the old rate-limiting for NTP correction to guard
    against "update storms" when running without a master clock on systems with
    overcommitted CPUs.
 
  - Fix a brown paper bag goof where KVM checked if ERAPS is "dirty" instead of
    marking it dirty when emulating INVPCID.
 
  - Flush the TLB when transitioning from xAVIC => x2AVIC to ensure the CPU TLB
    doesn't contain AVIC-tagged entries for the APIC base GPA.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAmoZtdgACgkQOlYIJqCj
 N/26sw/+IWOA5AxyoNW/lKAhhkzDTzGWrNCQkpMv+F4tOUbHYniTxI/pv4L3eMvf
 ZLUXijYxhpJtnblLtrnPpSFl5tll4xQdMUv7+fljgpYmy6+erQHodtCgRi5wHDbM
 NlD7DWOgwmpvzYLcybq1RfjZ3n+OBRvq95haQ6Ph4FtoYuIomtJ5tF2mnMlyxlc/
 aIK5wzQ/JeYdQxwwz1ctlHkgE5bPnS+Sxr33+MRFQ5cIpuwdoS9zYRITNBM107kg
 bLeei8Cxh91sgEidgwS8JToLvaEQH8AodkROjcScllwUxYsshPKsHeH7sTMbCOVd
 DiH9VbheZo7d4kb6pvhGsY891ec00dR5E/l2gZYLWHg4v0lINTw6uBdoJuq3t2TO
 Q3KmGVaUWz+c6dY/0qntVpws35zG106S8Pp4mx/1EnUHbJKZYDsUMC1ppwhrr3Pz
 WEyQ9PFXhOyoSbrtOaEfU+wsFPeAfT9eYADu7oV1t7l75TJAKW1EEaSGfzOO/crj
 3GK3vRq2B1cMHX9c4fwhSs4h8k5JvKlI/mtGPxZN3khVorx9dv/rTqOoeQEsFS5+
 8s5XcNPPJlKfNXcu3Jq6rn8U/JA2HnbH298Nk5uXTCfTrZtDgbOnI8YVYWnoadOl
 8xJoie5ccEsysVj1npNNh61LNMF1XBUUC+eNn0I1o0NzeRauxF8=
 =QQUn
 -----END PGP SIGNATURE-----

Merge tag 'kvm-x86-fixes-7.1-rc6' of https://github.com/kvm-x86/linux into HEAD

KVM x86 fixes for 7.1-rcN

 - Include the kernel's linux/mman.h in KVM selftests to ensure MADV_COLLAPSE
   is defined, as older libc versions may not provide it.

 - Include execinfo.h if and only if KVM selftests are building against glibc,
   and provide a test_dump_stack() for non-glibc builds.

 - Fudge around an RCU splat in the emergency reboot code that is technically
   a legitimate flaw, but in practice is a non-issue and fixing the flaw, e.g.
   by adding locking, would incur meaningful risk, i.e. do more harm than good.

 - Rate-limit global clock updates once again (but without delayed work), as
   KVM was subtly relying on the old rate-limiting for NTP correction to guard
   against "update storms" when running without a master clock on systems with
   overcommitted CPUs.

 - Fix a brown paper bag goof where KVM checked if ERAPS is "dirty" instead of
   marking it dirty when emulating INVPCID.

 - Flush the TLB when transitioning from xAVIC => x2AVIC to ensure the CPU TLB
   doesn't contain AVIC-tagged entries for the APIC base GPA.
This commit is contained in:
Paolo Bonzini 2026-05-29 19:28:16 +02:00
commit b397897016
14 changed files with 79 additions and 20 deletions

View File

@ -1504,6 +1504,7 @@ struct kvm_arch {
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
struct ratelimit_state kvmclock_update_rs;
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;

View File

@ -206,6 +206,35 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
/*
* Flush the TLB when enabling (x2)AVIC and when transitioning between
* xAVIC and x2AVIC, as the CPU may have inserted a TLB entry for the
* "wrong" mapping.
*
* KVM uses a per-VM "scratch" page to back the APIC memslot, because
* KVM also uses per-VM page tables *and* maintains the page table (NPT
* or shadow page) mappings for said memslot even if one or more vCPUs
* have their local APIC hardware-disabled or are in x2APIC mode, i.e.
* even if one or more vCPUs' APIC MMIO BAR is effectively disabled.
*
* If xAVIC is fully enabled, hardware ignores the physical address in
* KVM's page tables, i.e. in the leaf SPTE for the APIC memslot, and
* instead redirects the access to the AVIC backing page, i.e. to the
* vCPU's virtual APIC page. If xAVIC is not enabled (APIC is either
* hardware-disabled or in x2APIC mode), then guest accesses will use
* the page table mapping verbatim, i.e. will access the per-VM scratch
* page, as normal memory.
*
* In both cases, the CPU is allowed to cache TLB entries for the APIC
* base GPA. So, KVM needs to flush the TLB when enabling xAVIC, as
* accesses need to be redirected to the virtual APIC page, but the TLB
* may contain entries pointing at the scratch page. KVM also needs to
* flush the TLB when enabling x2AVIC, as accesses need to go to the
* scratch page, but the TLB may contain entries tagged as xAVIC, i.e.
* entries pointing to the vCPU's virtual APIC page.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
/*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
@ -219,12 +248,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
/* Disabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, false);
} else {
/*
* Flush the TLB, the guest may have inserted a non-APIC
* mapping into the TLB while AVIC was disabled.
*/
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
/* Enabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, true);
}

View File

@ -5227,8 +5227,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
*/
if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) {
if (__ratelimit(&vcpu->kvm->arch.kvmclock_update_rs))
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
else
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
if (vcpu->cpu != cpu)
kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
vcpu->cpu = cpu;
@ -13366,6 +13371,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10);
ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, RATELIMIT_MSG_ON_RELEASE);
kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@ -14323,7 +14330,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
* the RAP (Return Address Predictor).
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS);
kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);
kvm_invalidate_pcid(vcpu, operand.pcid);
return kvm_skip_emulated_instruction(vcpu);

View File

@ -49,7 +49,20 @@ static void x86_virt_invoke_kvm_emergency_callback(void)
{
cpu_emergency_virt_cb *kvm_callback;
kvm_callback = rcu_dereference(kvm_emergency_callback);
/*
* RCU may not be watching the crashing CPU here, so rcu_dereference()
* triggers a suspicious-RCU-usage splat. In principle, a concurrent
* KVM module unload could race with this read; see commit 2baa33a8ddd6
* ("KVM: x86: Leave user-return notifier registered on reboot/shutdown")
* which notes that nothing prevents module unload during panic/reboot.
*
* However, taking a lock here would be riskier than the current race:
* the system is going down via NMI shootdown, and any lock could be
* held by an already-stopped CPU. Use rcu_dereference_raw() to silence
* the lockdep splat and accept the comically small remaining race;
* panic context inherently cannot guarantee complete correctness.
*/
kvm_callback = rcu_dereference_raw(kvm_emergency_callback);
if (kvm_callback)
kvm_callback();
}

View File

@ -41,10 +41,10 @@
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "test_util.h"
#include "memstress.h"

View File

@ -14,10 +14,10 @@
#include <linux/bitmap.h>
#include <linux/falloc.h>
#include <linux/sizes.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "numaif.h"
#include "test_util.h"

View File

@ -2,8 +2,18 @@
#ifndef SELFTEST_KVM_SYSCALLS_H
#define SELFTEST_KVM_SYSCALLS_H
/*
* Include both the kernel and libc versions of mman.h. The kernel provides
* the most up-to-date flags and definitions, while libc provides the syscall
* wrappers tests expect.
*/
#include <linux/mman.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <test_util.h>
#define MAP_ARGS0(m,...)
#define MAP_ARGS1(m,t,a,...) m(t,a)
#define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__)

View File

@ -19,9 +19,9 @@
#include <errno.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include "kselftest.h"
#include <linux/mman.h>
#include <linux/types.h>
#define msecs_to_usecs(msec) ((msec) * 1000ULL)

View File

@ -6,11 +6,14 @@
*/
#include "test_util.h"
#include <execinfo.h>
#include <sys/syscall.h>
#include "kselftest.h"
#ifdef __GLIBC__
#include <execinfo.h>
/* Dumps the current stack trace to stderr. */
static void __attribute__((noinline)) test_dump_stack(void);
static void test_dump_stack(void)
@ -57,6 +60,9 @@ static void test_dump_stack(void)
system(cmd);
#pragma GCC diagnostic pop
}
#else
static void test_dump_stack(void) {}
#endif
static pid_t _gettid(void)
{

View File

@ -5,13 +5,13 @@
* Copyright (C) 2018, Google LLC.
*/
#include "test_util.h"
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "processor.h"
#include "ucall_common.h"
#include <assert.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/stat.h>

View File

@ -15,7 +15,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>
@ -23,6 +22,7 @@
#include <linux/sizes.h>
#include <test_util.h>
#include <kvm_syscalls.h>
#include <kvm_util.h>
#include <processor.h>
#include <ucall_common.h>

View File

@ -4,11 +4,10 @@
*
* Copyright (C) 2024, Red Hat, Inc.
*/
#include <sys/mman.h>
#include <linux/fs.h>
#include "test_util.h"
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "kselftest.h"
#include "ucall_common.h"

View File

@ -4,8 +4,8 @@
*
* Copyright IBM Corp. 2021
*/
#include <sys/mman.h>
#include "test_util.h"
#include "kvm_syscalls.h"
#include "kvm_util.h"
#include "kselftest.h"
#include "ucall_common.h"

View File

@ -8,11 +8,11 @@
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/compiler.h>
#include <test_util.h>
#include <kvm_syscalls.h>
#include <kvm_util.h>
#include <processor.h>