KVM fixes for 6.19

- Fix a bug where AVIC is incorrectly inhibited when running with x2AVIC
    disabled via module param (or on a system without x2AVIC).
 
  - Fix a dangling device posted IRQs bug by explicitly checking if the irqfd is
    still active (on the list) when handling an eventfd signal, instead of
    zeroing the irqfd's routing information when the irqfd is deassigned.
    Zeroing the irqfd's routing info causes arm64 and x86 to not disable
    posting for the IRQ (kvm_arch_irq_bypass_del_producer() looks for an MSI),
    incorrectly leaving the IRQ in posted mode (and leading to use-after-free
    and memory leaks on AMD in particular).
 
  - Disable FORTIFY_SOURCE for KVM selftests to prevent the compiler from
    generating calls to the checked versions of memset() and friends, which
    leads to unexpected page faults in guest code due to e.g. __memset_chk@plt
    not being resolved.
 
  - Explicitly configure the supported XSS from within {svm,vmx}_set_cpu_caps() to
    fix a bug where VMX will compute the reference VMCS configuration with SHSTK
    and IBT enabled, but then compute each CPU's local config with SHSTK and IBT
    disabled if not all CET xfeatures are enabled, e.g. if the kernel is built
    with X86_KERNEL_IBT=n.  The mismatch in features results in differing nVMX
    settings, and ultimately causes kvm-intel.ko to refuse to load with nested=1.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAml9aGcACgkQOlYIJqCj
 N/2BCw/9E4XyIIpbMJRk1xNBt+dBslBSggzmb6utXhucn0NaO1pKg9fEoqg6bfic
 OP4Z+UUV8v4lPswVTob/Iq7L5VqvSPq0wa1//YuzxIce4pl5om7zO7RDFnDS47+w
 8367jcmKDzGBKM9KfQN+J5xBYylh82fmd3FtI0212yS6HMOtf9DDioyUz0FXri+s
 k1FWssxV2F9k9qp3qfAwqhj/fCSfVBeEaVKoJY6ep/kQPaLMemuFyrPVXxuWxw1Y
 4udk0idjzrsDuln6fNvgV281Z2o6ScDMhvouRJZOo8d3njwaoJIgRBbGobTcuRGl
 ULHZlilyg14+a8/25TN02SGbDaEZalHXced7F8Y5XLtCsea4ZxzjB4DVwPaKG6Kb
 /465mH/lepdTxmelPFvhhA2th+Ku4o1hfCesYFRckO5jhOU8+EaXT3SW46+NLIAY
 uXtx0BcVPmBqNPn9Zt82N+9WB/MEVLenTuvhkHW14ON3isVPDrUUPmH1kpoyZ/Uq
 6DWmnunTmeWgS2wk3zuEsXWpxK6ko+wK2E8sUIDQiYIGSPTdIjlZMfHNHMicYrGL
 b2vQPw/7Jb1NXy0Ek3CYcZes/s2crYDqWcRsg1jFP2INFXytOlzFrBS5bER4ZZVH
 JB3UNItHKUcFdOk96wXxVxKXyjjeyBrWWH8Gu+T0ERPNl7Tn54c=
 =O9hx
 -----END PGP SIGNATURE-----

Merge tag 'kvm-x86-fixes-6.19-rc8' of https://github.com/kvm-x86/linux into HEAD

Final KVM fixes for 6.19:

 - Fix a bug where AVIC is incorrectly inhibited when running with x2AVIC
   disabled via module param (or on a system without x2AVIC).

 - Fix a dangling device posted IRQs bug by explicitly checking if the irqfd is
   still active (on the list) when handling an eventfd signal, instead of
   zeroing the irqfd's routing information when the irqfd is deassigned.
   Zeroing the irqfd's routing info causes arm64 and x86 to not disable
   posting for the IRQ (kvm_arch_irq_bypass_del_producer() looks for an MSI),
   incorrectly leaving the IRQ in posted mode (and leading to use-after-free
   and memory leaks on AMD in particular).

   This is both the most pressing and scariest, but it's been in -next for
   a while.

 - Disable FORTIFY_SOURCE for KVM selftests to prevent the compiler from
   generating calls to the checked versions of memset() and friends, which
   leads to unexpected page faults in guest code due to e.g. __memset_chk@plt
   not being resolved.

 - Explicitly configure the supported XSS from within {svm,vmx}_set_cpu_caps() to
   fix a bug where VMX will compute the reference VMCS configuration with SHSTK
   and IBT enabled, but then compute each CPU's local config with SHSTK and IBT
   disabled if not all CET xfeatures are enabled, e.g. if the kernel is built
   with X86_KERNEL_IBT=n.  The mismatch in features results in differing nVMX
   settings, and ultimately causes kvm-intel.ko to refuse to load with nested=1.
This commit is contained in:
Paolo Bonzini 2026-02-04 18:30:32 +01:00
commit 0de4a0eec2
8 changed files with 52 additions and 36 deletions

View File

@ -514,7 +514,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
*/
spin_lock_irq(&kvm->irqfds.lock);
if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI ||
WARN_ON_ONCE(irqfd->irq_bypass_vcpu)) {
ret = kvm_pi_update_irte(irqfd, NULL);
if (ret)
pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",

View File

@ -376,6 +376,7 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
u32 max_id = x2avic_enabled ? x2avic_max_physical_id : AVIC_MAX_PHYSICAL_ID;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
struct vcpu_svm *svm = to_svm(vcpu);
u32 id = vcpu->vcpu_id;
@ -388,8 +389,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
* avic_vcpu_load() expects to be called if and only if the vCPU has
* fully initialized AVIC.
*/
if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
(id > x2avic_max_physical_id)) {
if (id > max_id) {
kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG);
vcpu->arch.apic->apicv_active = false;
return 0;

View File

@ -5284,6 +5284,8 @@ static __init void svm_set_cpu_caps(void)
*/
kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT);
kvm_cpu_cap_clear(X86_FEATURE_MSR_IMM);
kvm_setup_xss_caps();
}
static __init int svm_hardware_setup(void)

View File

@ -8051,6 +8051,8 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
kvm_cpu_cap_clear(X86_FEATURE_IBT);
}
kvm_setup_xss_caps();
}
static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,

View File

@ -9953,6 +9953,23 @@ static struct notifier_block pvclock_gtod_notifier = {
};
#endif
/*
 * Reconcile kvm_caps.supported_xss with the XSAVES, SHSTK and IBT CPU caps.
 *
 * The checks run in order and each one may narrow the state seen by the next:
 *  - without XSAVES, no XSS bits are supported at all;
 *  - with neither SHSTK nor IBT, the CET xfeature bits are dropped;
 *  - unless every CET xfeature bit is still set, SHSTK and IBT are cleared
 *    and the CET bits are dropped, i.e. CET support is all-or-nothing.
 */
void kvm_setup_xss_caps(void)
{
/* No XSAVES cap: advertise no supported XSS bits. */
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
/* Neither CET feature is exposed: strip the CET xfeature bits. */
if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
!kvm_cpu_cap_has(X86_FEATURE_IBT))
kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL;
/*
 * A partial (or empty) set of CET xfeature bits disables both CET caps
 * and removes whatever CET bits remain, keeping the caps and the
 * supported XSS mask consistent with each other.
 */
if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) {
kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
kvm_cpu_cap_clear(X86_FEATURE_IBT);
kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL;
}
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_setup_xss_caps);
static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
{
memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
@ -10125,19 +10142,6 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (!tdp_enabled)
kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
!kvm_cpu_cap_has(X86_FEATURE_IBT))
kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL;
if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) {
kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
kvm_cpu_cap_clear(X86_FEATURE_IBT);
kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL;
}
if (kvm_caps.has_tsc_control) {
/*
* Make sure the user can only configure tsc_khz values that

View File

@ -471,6 +471,8 @@ extern struct kvm_host_values kvm_host;
extern bool enable_pmu;
void kvm_setup_xss_caps(void);
/*
* Get a filtered version of KVM's supported XCR0 that strips out dynamic
* features for which the current process doesn't (yet) have permission to use.

View File

@ -251,6 +251,7 @@ LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include
LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
-Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \
-U_FORTIFY_SOURCE \
-fno-builtin-memcmp -fno-builtin-memcpy \
-fno-builtin-memset -fno-builtin-strnlen \
-fno-stack-protector -fno-PIE -fno-strict-aliasing \

View File

@ -157,21 +157,28 @@ irqfd_shutdown(struct work_struct *work)
}
/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
static bool irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
/*
* Assert that either irqfds.lock or SRCU is held, as irqfds.lock must
* be held to prevent false positives (on the irqfd being active), and
* while false negatives are impossible as irqfds are never added back
* to the list once they're deactivated, the caller must at least hold
* SRCU to guard against routing changes if the irqfd is deactivated.
*/
lockdep_assert_once(lockdep_is_held(&irqfd->kvm->irqfds.lock) ||
srcu_read_lock_held(&irqfd->kvm->irq_srcu));
return list_empty(&irqfd->list) ? false : true;
}
/*
* Mark the irqfd as inactive and schedule it for removal
*
* assumes kvm->irqfds.lock is held
*/
static void
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
static void irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
lockdep_assert_held(&irqfd->kvm->irqfds.lock);
BUG_ON(!irqfd_is_active(irqfd));
list_del_init(&irqfd->list);
@ -217,8 +224,15 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
seq = read_seqcount_begin(&irqfd->irq_entry_sc);
irq = irqfd->irq_entry;
} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
/* An event has been signaled, inject an interrupt */
if (kvm_arch_set_irq_inatomic(&irq, kvm,
/*
* An event has been signaled, inject an interrupt unless the
* irqfd is being deassigned (isn't active), in which case the
* routing information may be stale (once the irqfd is removed
* from the list, it will stop receiving routing updates).
*/
if (unlikely(!irqfd_is_active(irqfd)) ||
kvm_arch_set_irq_inatomic(&irq, kvm,
KVM_USERSPACE_IRQ_SOURCE_ID, 1,
false) == -EWOULDBLOCK)
schedule_work(&irqfd->inject);
@ -585,18 +599,8 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
spin_lock_irq(&kvm->irqfds.lock);
list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
/*
* This clearing of irq_entry.type is needed for when
* another thread calls kvm_irq_routing_update before
* we flush workqueue below (we synchronize with
* kvm_irq_routing_update using irqfds.lock).
*/
write_seqcount_begin(&irqfd->irq_entry_sc);
irqfd->irq_entry.type = 0;
write_seqcount_end(&irqfd->irq_entry_sc);
if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi)
irqfd_deactivate(irqfd);
}
}
spin_unlock_irq(&kvm->irqfds.lock);