- Fix ITS EventID sanitisation when restoring an interrupt translation
   table.
 
 - Fix PPI memory leak when failing to initialise a vcpu.
 
 - Correctly return an error when the validation of a hypervisor trace
   descriptor fails, and limit this validation to protected mode only.
 
 RISC-V:
 
 - Fix invalid HVA warning in steal-time recording
 
 - Return SBI_ERR_FAILURE to guest upon OOM in pmu_event_info()
   and pmu_snapshot_set_shmem()
 
 - Fix NULL pointer dereference in SBI v0.1 SEND_IPI handler
 
 - Fix sign extension of value for MMIO loads
 
 s390:
 
 - Fix bugs in vSIE (nested virtualization) and UCONTROL, caused by the page
   table rewrite.
 
 x86:
 
 - Apply erratum #1235 workaround (disable AVIC IPI virtualization) on Hygon
   Family 18h, just like on AMD Family 17h.
 
 - When KVM_CAP_X86_APIC_BUS_CYCLES_NS is queried on a specific VM, return
   the VM's configured APIC bus frequency instead of the default.  This
   is less confusing (read: not wrong) and makes it easier to fill in CPUID
   information that communicates the APIC bus frequency to the guest.
 
 Selftests:
 
 - Do not include glibc-internal <bits/endian.h>; it worked by chance and
   broke building KVM selftests with musl.
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCgAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmoSp6cUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroPgswgAiO5Gi7d6dIspG7e41g5fF2Wq5Rnq
 1nB7ZV+CqT0k1fvFe4hBrc2c+DLzFn+h3/fj+4scVF4oAN9YRauIq/2xlGWR23bR
 gsFncJ2w6TAKLN3MvCh1SpO+GI7kcnTs7HtJ6weDkddbGEtUIgkUZkwEYnEN4t6T
 pgO7USGFbBBXY575UO/xMeLkfyABzJlLjQbKrvG6RKtEsKAxzTxcPtjQegtHYH4Q
 6DLGif4YUB0ZWMQETccl/bKqU6L+OQgDUOSUoHWt+2ox0DLDwiy7VVf3infecXsJ
 r3PGKn709nlrd+hBn2S9gCbT/BCxp828k2DxSasZ7PQ8634O+qrpLLkODw==
 =VWgs
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "arm64:

   - Fix ITS EventID sanitisation when restoring an interrupt
     translation table.

   - Fix PPI memory leak when failing to initialise a vcpu.

   - Correctly return an error when the validation of a hypervisor trace
     descriptor fails, and limit this validation to protected mode only.

  RISC-V:

   - Fix invalid HVA warning in steal-time recording

   - Return SBI_ERR_FAILURE to guest upon OOM in pmu_event_info() and
     pmu_snapshot_set_shmem()

   - Fix NULL pointer dereference in SBI v0.1 SEND_IPI handler

   - Fix sign extension of value for MMIO loads

  s390:

   - Fix bugs in vSIE (nested virtualization) and UCONTROL, caused by
     the page table rewrite.

  x86:

   - Apply erratum #1235 workaround (disable AVIC IPI virtualization) on
     Hygon Family 18h, just like on AMD Family 17h.

   - When KVM_CAP_X86_APIC_BUS_CYCLES_NS is queried on a specific VM,
     return the VM's configured APIC bus frequency instead of the
     default. This is less confusing (read: not wrong) and makes it
     easier to fill in CPUID information that communicates the APIC bus
     frequency to the guest.

  Selftests:

   - Do not include glibc-internal <bits/endian.h>; it worked by chance
     and broke building KVM selftests with musl"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: SVM: Disable AVIC IPI virtualization on Hygon Family 18h (erratum #1235)
  KVM: selftests: Verify that KVM returns the configured APIC cycle length
  KVM: x86: Return the VM's configured APIC bus frequency when queried
  KVM: selftests: elf: Include <endian.h> instead of <bits/endian.h>
  KVM: s390: Properly reset zero bit in PGSTE
  KVM: s390: vsie: Fix redundant rmap entries
  KVM: s390: vsie: Fix unshadowing logic
  KVM: s390: Fix leaking kvm_s390_mmu_cache in case of errors
  KVM: s390: vsie: Fix memory leak when unshadowing
  KVM: arm64: Fix nVHE/pKVM hyp tracing error on invalid desc
  KVM: arm64: vgic: Free private_irqs when init fails after allocation
  KVM: arm64: vgic-its: Reject restored DTE with out-of-range num_eventid_bits
  RISC-V: KVM: Fix sign extension for MMIO loads
  RISC-V: KVM: Fix NULL pointer dereference in SBI v0.1 SEND_IPI handler
  riscv: kvm: return SBI_ERR_FAILURE for pmu_event_info() when OOM
  riscv: kvm: return SBI_ERR_FAILURE for pmu_snapshot_set_shmem() when OOM
  RISC-V: KVM: Fix invalid HVA warning in steal-time recording
This commit is contained in:
Linus Torvalds 2026-05-24 12:50:36 -07:00
commit 6a97c4d526
16 changed files with 115 additions and 31 deletions

View File

@ -555,8 +555,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_destroy_mpidr_data(vcpu->kvm);
err = kvm_vgic_vcpu_init(vcpu);
if (err)
if (err) {
kvm_vgic_vcpu_destroy(vcpu);
return err;
}
err = kvm_share_hyp(vcpu, vcpu + 1);
if (err)

View File

@ -164,13 +164,16 @@ static int hyp_trace_buffer_load(struct hyp_trace_buffer *trace_buffer,
return ret;
}
static bool hyp_trace_desc_validate(struct hyp_trace_desc *desc, size_t desc_size)
static bool hyp_trace_desc_is_valid(struct hyp_trace_desc *desc, size_t desc_size)
{
struct ring_buffer_desc *rb_desc;
unsigned int cpu;
size_t nr_bpages;
void *desc_end;
if (!is_protected_kvm_enabled())
return true;
/*
* Both desc_size and bpages_backing_size are untrusted host-provided
* values. We rely on __pkvm_host_donate_hyp() to enforce their validity.
@ -212,8 +215,10 @@ int __tracing_load(unsigned long desc_hva, size_t desc_size)
if (ret)
return ret;
if (!hyp_trace_desc_validate(desc, desc_size))
if (!hyp_trace_desc_is_valid(desc, desc_size)) {
ret = -EINVAL;
goto err_release_desc;
}
hyp_spin_lock(&trace_buffer.lock);

View File

@ -2307,6 +2307,10 @@ static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
/* dte entry is valid */
offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;
/* Mimic the MAPD behaviour and reject invalid EID bits. */
if (num_eventid_bits > VITS_TYPER_IDBITS)
return -EINVAL;
if (!vgic_its_check_id(its, baser, id, NULL))
return -EINVAL;

View File

@ -415,7 +415,6 @@ int kvm_riscv_vcpu_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run,
shift = 8 * (sizeof(ulong) - len);
} else if ((insn & INSN_MASK_LBU) == INSN_MATCH_LBU) {
len = 1;
shift = 8 * (sizeof(ulong) - len);
#ifdef CONFIG_64BIT
} else if ((insn & INSN_MASK_LD) == INSN_MATCH_LD) {
len = 8;
@ -649,22 +648,22 @@ int kvm_riscv_vcpu_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
case 1:
data8 = *((u8 *)run->mmio.data);
SET_RD(insn, &vcpu->arch.guest_context,
(ulong)data8 << shift >> shift);
(long)((ulong)data8 << shift) >> shift);
break;
case 2:
data16 = *((u16 *)run->mmio.data);
SET_RD(insn, &vcpu->arch.guest_context,
(ulong)data16 << shift >> shift);
(long)((ulong)data16 << shift) >> shift);
break;
case 4:
data32 = *((u32 *)run->mmio.data);
SET_RD(insn, &vcpu->arch.guest_context,
(ulong)data32 << shift >> shift);
(long)((ulong)data32 << shift) >> shift);
break;
case 8:
data64 = *((u64 *)run->mmio.data);
SET_RD(insn, &vcpu->arch.guest_context,
(ulong)data64 << shift >> shift);
(long)((ulong)data64 << shift) >> shift);
break;
default:
return -EOPNOTSUPP;

View File

@ -453,8 +453,10 @@ int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long s
}
kvpmu->sdata = kzalloc(snapshot_area_size, GFP_ATOMIC);
if (!kvpmu->sdata)
return -ENOMEM;
if (!kvpmu->sdata) {
sbiret = SBI_ERR_FAILURE;
goto out;
}
/* No need to check writable slot explicitly as kvm_vcpu_write_guest does it internally */
if (kvm_vcpu_write_guest(vcpu, saddr, kvpmu->sdata, snapshot_area_size)) {
@ -499,8 +501,10 @@ int kvm_riscv_vcpu_pmu_event_info(struct kvm_vcpu *vcpu, unsigned long saddr_low
}
einfo = kzalloc(shmem_size, GFP_KERNEL);
if (!einfo)
return -ENOMEM;
if (!einfo) {
ret = SBI_ERR_FAILURE;
goto out;
}
ret = kvm_vcpu_read_guest(vcpu, shmem, einfo, shmem_size);
if (ret) {

View File

@ -46,7 +46,7 @@ void kvm_riscv_vcpu_record_steal_time(struct kvm_vcpu *vcpu)
gfn = shmem >> PAGE_SHIFT;
hva = kvm_vcpu_gfn_to_hva(vcpu, gfn);
if (WARN_ON(kvm_is_error_hva(hva))) {
if (kvm_is_error_hva(hva)) {
vcpu->arch.sta.shmem = INVALID_GPA;
return;
}

View File

@ -55,6 +55,8 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
for_each_set_bit(i, &hmask, BITS_PER_LONG) {
rvcpu = kvm_get_vcpu_by_id(vcpu->kvm, i);
if (!rvcpu)
continue;
ret = kvm_riscv_vcpu_set_interrupt(rvcpu, IRQ_VS_SOFT);
if (ret < 0)
break;

View File

@ -267,6 +267,7 @@ static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t g
/* No need to take locks as the page table is not installed yet. */
pgste_init.prefix_notif = old.s.fc1.prefix_notif;
pgste_init.vsie_notif = old.s.fc1.vsie_notif;
pgste_init.vsie_gmem = old.s.fc1.vsie_notif;
pgste_init.pcl = uses_skeys && init.h.i;
dat_init_pgstes(pt, pgste_init.val);
} else {

View File

@ -145,7 +145,8 @@ union pgste {
unsigned long cmma_d : 1; /* Dirty flag for CMMA bits */
unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
unsigned long : 5;
unsigned long vsie_gmem : 1; /* Contains nested guest memory */
unsigned long : 4;
unsigned long : 8;
};
struct {

View File

@ -1445,6 +1445,7 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union
} else {
pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
pgste.vsie_notif = 1;
pgste.vsie_gmem = 1;
}
pgste_set_unlock(ptep_h, pgste);
if (rc)

View File

@ -125,7 +125,7 @@ struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit)
int gmap_set_limit(struct gmap *gmap, gfn_t limit)
{
struct kvm_s390_mmu_cache *mc;
struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
int rc, type;
type = gmap_limit_to_type(limit);
@ -142,7 +142,6 @@ int gmap_set_limit(struct gmap *gmap, gfn_t limit)
rc = dat_set_asce_limit(mc, &gmap->asce, type);
} while (rc == -ENOMEM);
kvm_s390_free_mmu_cache(mc);
return 0;
}
@ -822,8 +821,8 @@ int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t
int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count)
{
struct kvm_s390_mmu_cache *mc;
int rc;
struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
int rc = 0;
mc = kvm_s390_new_mmu_cache();
if (!mc)
@ -1026,13 +1025,15 @@ int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level)
int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
kvm_pfn_t pfn, int level, bool wr)
{
unsigned long bitmask;
union crste *crstep;
union pgste pgste;
union pte *ptep;
union pte pte;
int flags, rc;
KVM_BUG_ON(!is_shadow(sg), sg->kvm);
if (KVM_BUG_ON(!is_shadow(sg) || level <= TABLE_TYPE_PAGE_TABLE, sg->kvm))
return -EINVAL;
lockdep_assert_held(&sg->parent->children_lock);
flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
@ -1041,8 +1042,9 @@ int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gf
if (rc)
return rc;
if (level <= TABLE_TYPE_REGION1) {
bitmask = -1UL << (8 + 11 * level);
scoped_guard(spinlock, &sg->host_to_rmap_lock)
rc = gmap_insert_rmap(sg, p_gfn, r_gfn, level);
rc = gmap_insert_rmap(sg, p_gfn, r_gfn & bitmask, level);
}
if (rc)
return rc;
@ -1143,8 +1145,10 @@ void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
}
scoped_guard(spinlock, &sg->host_to_rmap_lock)
head = radix_tree_delete(&sg->host_to_rmap, gfn);
gmap_for_each_rmap_safe(rmap, rnext, head)
gmap_for_each_rmap_safe(rmap, rnext, head) {
gmap_unshadow_level(sg, rmap->r_gfn, rmap->level);
kfree(rmap);
}
}
}

View File

@ -167,6 +167,36 @@ static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end)
return _gmap_unmap_prefix(gmap, gfn, end, false);
}
/**
* pte_needs_unshadow() -- Check if the pte operations triggers unshadowing.
* @oldpte: the previous value for the guest pte.
* @newpte: the new pte being set.
* @pgste: the pgste for the pte entry.
*
* If the pgste.vsie_notif bit is not set, return false: the page is not
* involved in vsie and thus should not trigger an unshadow operation.
*
* If the pgste.vsie_gmem bit is set, this pte represents shadowed guest
* memory. The access rights on g3's memory should be synchronized with g1's
* and g2's. Therefore unshadowing is triggered if the new and old pte
* differ in protection, or if the new pte is invalid.
*
* If the pgste.vsie_gmem bit is not set, this pte maps the g2 dat tables
* for g3. If the entry becomes writable or absent, it becomes impossible to
* guarantee that the shadow mapping will match g2's mapping. In that case,
* trigger an unshadow event.
*
* Return: true if an unshadow event should be triggered, otherwise false.
*/
static inline bool pte_needs_unshadow(union pte oldpte, union pte newpte, union pgste pgste)
{
if (!pgste.vsie_notif)
return false;
if (pgste.vsie_gmem)
return (oldpte.h.p != newpte.h.p) || newpte.h.i;
return !newpte.h.p || !newpte.s.pr;
}
static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte,
union pgste pgste, gfn_t gfn, bool needs_lock)
{
@ -180,8 +210,9 @@ static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, un
pgste.prefix_notif = 0;
gmap_unmap_prefix(gmap, gfn, gfn + 1);
}
if (pgste.vsie_notif && (ptep->h.p != newpte.h.p || newpte.h.i)) {
if (pte_needs_unshadow(*ptep, newpte, pgste)) {
pgste.vsie_notif = 0;
pgste.vsie_gmem = 0;
if (needs_lock)
gmap_handle_vsie_unshadow_event(gmap, gfn);
else
@ -189,6 +220,7 @@ static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, un
}
if (!ptep->s.d && newpte.s.d && !newpte.s.s)
SetPageDirty(pfn_to_page(newpte.h.pfra));
pgste.zero = 0;
return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap));
}
@ -198,6 +230,30 @@ static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, uni
return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
}
/**
* crste_needs_unshadow() -- Check if the crste operations triggers unshadowing.
* @oldcrste: the previous value for the crste.
* @newcrste: the new value for the crste.
*
* If the old crste did not have the vsie_notif bit set, return false: the
* page is not involved in vsie and thus should not trigger an unshadow
* operation. Conversely, if the bit is set, it can only be g3 memory, since
* dat tables are never mapped using large pages.
*
* Similar to the pgste.vsie_gmem case of pte_needs_unshadow(), if the
* protection bit is changing or the new page is invalid, trigger an
* unshadow event. Also trigger an unshadow event if the new crste does not
* have the vsie_notif bit set.
*
* Return: true if an unshadow event should be triggered, otherwise false.
*/
static inline bool crste_needs_unshadow(union crste oldcrste, union crste newcrste)
{
if (!oldcrste.s.fc1.vsie_notif)
return false;
return (newcrste.h.p != oldcrste.h.p) || newcrste.h.i || !newcrste.s.fc1.vsie_notif;
}
static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
union crste oldcrste, union crste newcrste,
gfn_t gfn, bool needs_lock)
@ -216,8 +272,7 @@ static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, unio
newcrste.s.fc1.prefix_notif = 0;
gmap_unmap_prefix(gmap, gfn, gfn + align);
}
if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif &&
(newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) {
if (crste_leaf(oldcrste) && crste_needs_unshadow(oldcrste, newcrste)) {
newcrste.s.fc1.vsie_notif = 0;
if (needs_lock)
gmap_handle_vsie_unshadow_event(gmap, gfn);

View File

@ -1300,12 +1300,14 @@ bool __init avic_hardware_setup(void)
}
/*
* Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2)
* due to erratum 1235, which results in missed VM-Exits on the sender
* and thus missed wake events for blocking vCPUs due to the CPU
* failing to see a software update to clear IsRunning.
* Disable IPI virtualization for AMD Family 17h (Zen1 and Zen2) and
* Hygon Family 18h (derived from AMD Zen1) CPUs due to erratum 1235,
* which results in missed VM-Exits on the sender and thus missed wake
* events for blocking vCPUs due to the CPU failing to see a software
* update to clear IsRunning.
*/
enable_ipiv = enable_ipiv && boot_cpu_data.x86 != 0x17;
if (boot_cpu_data.x86 == 0x17 || boot_cpu_data.x86 == 0x18)
enable_ipiv = false;
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);

View File

@ -4876,7 +4876,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = tdp_enabled;
break;
case KVM_CAP_X86_APIC_BUS_CYCLES_NS:
r = APIC_BUS_CYCLE_NS_DEFAULT;
r = kvm ? kvm->arch.apic_bus_cycle_ns : APIC_BUS_CYCLE_NS_DEFAULT;
break;
case KVM_CAP_EXIT_HYPERCALL:
r = KVM_EXIT_HYPERCALL_VALID_MASK;

View File

@ -7,7 +7,7 @@
#include "test_util.h"
#include <bits/endian.h>
#include <endian.h>
#include <linux/elf.h>
#include "kvm_util.h"

View File

@ -137,6 +137,10 @@ static void run_apic_bus_clock_test(u64 apic_hz, u64 delay_ms,
vm_enable_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
NSEC_PER_SEC / apic_hz);
TEST_ASSERT_EQ(kvm_check_cap(KVM_CAP_X86_APIC_BUS_CYCLES_NS), 1);
TEST_ASSERT_EQ(vm_check_cap(vm, KVM_CAP_X86_APIC_BUS_CYCLES_NS),
NSEC_PER_SEC / apic_hz);
vcpu = vm_vcpu_add(vm, 0, apic_guest_code);
vcpu_args_set(vcpu, 2, apic_hz, delay_ms);