From 6aba0cb5bba6141158d5449f2cf53187b7f755f9 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 5 Jun 2025 11:44:46 +0530 Subject: [PATCH 01/15] RISC-V: KVM: Fix the size parameter check in SBI SFENCE calls As-per the SBI specification, an SBI remote fence operation applies to the entire address space if either: 1) start_addr and size are both 0 2) size is equal to 2^XLEN-1 >From the above, only #1 is checked by SBI SFENCE calls so fix the size parameter check in SBI SFENCE calls to cover #2 as well. Fixes: 13acfec2dbcc ("RISC-V: KVM: Add remote HFENCE functions based on VCPU requests") Reviewed-by: Atish Patra Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250605061458.196003-2-apatel@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 5fbf3f94f1e8..9752d2ffff68 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -103,7 +103,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - if (cp->a2 == 0 && cp->a3 == 0) + if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask); else kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask, @@ -111,7 +111,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - if (cp->a2 == 0 && cp->a3 == 0) + if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, hbase, hmask, cp->a4); else From 2e7be162996640bbe3b6da694cc064c511b8a5d9 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 5 Jun 2025 11:44:47 +0530 Subject: [PATCH 02/15] RISC-V: KVM: Don't treat SBI HFENCE calls as NOPs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SBI specification clearly states that SBI HFENCE calls should return SBI_ERR_NOT_SUPPORTED when one of the target hart doesn’t support hypervisor extension (aka nested virtualization in-case of KVM RISC-V). Fixes: c7fa3c48de86 ("RISC-V: KVM: Treat SBI HFENCE calls as NOPs") Reviewed-by: Atish Patra Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250605061458.196003-3-apatel@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 9752d2ffff68..b17fad091bab 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -127,9 +127,9 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: /* * Until nested virtualization is implemented, the - * SBI HFENCE calls should be treated as NOPs + * SBI HFENCE calls should return not supported + * hence fallthrough. */ - break; default: retdata->err_val = SBI_ERR_NOT_SUPPORTED; } From 8a8ff069c7ad9a359c54683329883e2432cff191 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 15 Jun 2025 16:11:38 +0100 Subject: [PATCH 03/15] KVM: arm64: nv: Fix tracking of shadow list registers Wei-Lin reports that the tracking of shadow list registers is majorly broken when resync'ing the L2 state after a run, as we confuse the guest's LR index with the host's, potentially losing the interrupt state. While this could be fixed by adding yet another side index to track it (Wei-Lin's fix), it may be better to refactor this code to avoid having a side index altogether, limiting the risk to introduce this class of bugs. A key observation is that the shadow index is always the number of bits in the lr_map bitmap. With that, the parallel indexing scheme can be completely dropped. While doing this, introduce a couple of helpers that abstract the index conversion and some of the LR repainting, making the whole exercise much simpler. Reported-by: Wei-Lin Chang Reviewed-by: Wei-Lin Chang Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250614145721.2504524-1-r09922117@csie.ntu.edu.tw Link: https://lore.kernel.org/r/86qzzkc5xa.wl-maz@kernel.org --- arch/arm64/kvm/vgic/vgic-v3-nested.c | 81 ++++++++++++++-------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index d22a8ad7bcc5..a50fb7e6841f 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -36,6 +36,11 @@ struct shadow_if { static DEFINE_PER_CPU(struct shadow_if, shadow_if); +static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) +{ + return hweight16(shadow_if->lr_map & (BIT(idx) - 1)); +} + /* * Nesting GICv3 support * @@ -209,6 +214,29 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) return reg; } +static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr) +{ + struct vgic_irq *irq; + + if (!(lr & ICH_LR_HW)) + return lr; + + /* We have the HW bit set, check for validity of pINTID */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + /* If there was no real mapping, nuke the HW bit */ + if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI) + lr &= ~ICH_LR_HW; + + /* Translate the virtual mapping to the real one, even if invalid */ + if (irq) { + lr &= ~ICH_LR_PHYS_ID_MASK; + lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + vgic_put_irq(vcpu->kvm, irq); + } + + return lr; +} + /* * For LRs which have HW bit set such as timer interrupts, we modify them to * have the host hardware interrupt number instead of the virtual one programmed @@ -217,58 +245,37 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, struct vgic_v3_cpu_if *s_cpu_if) { - unsigned long lr_map = 0; - int index = 0; + struct shadow_if *shadow_if; + + shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif); + shadow_if->lr_map = 0; for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); - struct vgic_irq *irq; if (!(lr & ICH_LR_STATE)) - lr = 0; + continue; - if (!(lr & ICH_LR_HW)) - goto next; + lr = translate_lr_pintid(vcpu, lr); - /* We have the HW bit set, check for validity of pINTID */ - irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); - if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) { - /* There was no real mapping, so nuke the HW bit */ - lr &= ~ICH_LR_HW; - if (irq) - vgic_put_irq(vcpu->kvm, irq); - goto next; - } - - /* Translate the virtual mapping to the real one */ - lr &= ~ICH_LR_PHYS_ID_MASK; - lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); - - vgic_put_irq(vcpu->kvm, irq); - -next: - s_cpu_if->vgic_lr[index] = lr; - if (lr) { - lr_map |= BIT(i); - index++; - } + s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr; + shadow_if->lr_map |= BIT(i); } - container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map; - s_cpu_if->used_lrs = index; + s_cpu_if->used_lrs = hweight16(shadow_if->lr_map); } void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); - int i, index = 0; + int i; for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); struct vgic_irq *irq; if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) - goto next; + continue; /* * If we had a HW lr programmed by the guest hypervisor, we @@ -277,15 +284,13 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) */ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ - goto next; + continue; - lr = __gic_v3_get_lr(index); + lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); if (!(lr & ICH_LR_STATE)) irq->active = false; vgic_put_irq(vcpu->kvm, irq); - next: - index++; } } @@ -368,13 +373,11 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); val &= ~ICH_LR_STATE; - val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE; + val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE; __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); - s_cpu_if->vgic_lr[i] = 0; } - shadow_if->lr_map = 0; vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; } From 1fbe6861a6d9a942fb8ab8677ddf1ecb86b1af60 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:04 -0700 Subject: [PATCH 04/15] KVM: arm64: Explicitly treat routing entry type changes as changes Explicitly treat type differences as GSI routing changes, as comparing MSI data between two entries could get a false negative, e.g. if userspace changed the type but left the type-specific data as- Note, the same bug was fixed in x86 by commit bcda70c56f3e ("KVM: x86: Explicitly treat routing entry type changes as changes"). Fixes: 4bf3693d36af ("KVM: arm64: Unmap vLPIs affected by changes to GSI routing information") Signed-off-by: Sean Christopherson Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250611224604.313496-3-seanjc@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index de2b4e9c9f9f..38a91bb5d4c7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2764,7 +2764,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return memcmp(&old->msi, &new->msi, sizeof(new->msi)); From 56a14984505b11674df5e01407748236bc4bc8f8 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sun, 8 Jun 2025 17:54:02 +0800 Subject: [PATCH 05/15] KVM: arm64: selftests: Close the GIC FD in arch_timer_edge_cases Close the GIC FD to free the reference it holds to the VM so that we can correctly clean up the VM. This also gets rid of the "KVM: debugfs: duplicate directory 395722-4" warning when running arch_timer_edge_cases. Signed-off-by: Zenghui Yu Reviewed-by: Miguel Luis Reviewed-by: Sebastian Ott Link: https://lore.kernel.org/r/20250608095402.1131-1-yuzenghui@huawei.com Signed-off-by: Marc Zyngier --- .../selftests/kvm/arm64/arch_timer_edge_cases.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c index b4d22b3ab7cc..4e71740a098b 100644 --- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c +++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c @@ -954,6 +954,8 @@ static void test_init_timer_irq(struct kvm_vm *vm, struct kvm_vcpu *vcpu) pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); } +static int gic_fd; + static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu, enum arch_timer timer) { @@ -968,12 +970,20 @@ static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu, vcpu_args_set(*vcpu, 1, timer); test_init_timer_irq(*vm, *vcpu); - vgic_v3_setup(*vm, 1, 64); + gic_fd = vgic_v3_setup(*vm, 1, 64); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3"); + sync_global_to_guest(*vm, test_args); sync_global_to_guest(*vm, CVAL_MAX); sync_global_to_guest(*vm, DEF_CNT); } +static void test_vm_cleanup(struct kvm_vm *vm) +{ + close(gic_fd); + kvm_vm_free(vm); +} + static void test_print_help(char *name) { pr_info("Usage: %s [-h] [-b] [-i iterations] [-l long_wait_ms] [-p] [-v]\n" @@ -1060,13 +1070,13 @@ int main(int argc, char *argv[]) if (test_args.test_virtual) { test_vm_create(&vm, &vcpu, VIRTUAL); test_run(vm, vcpu); - kvm_vm_free(vm); + test_vm_cleanup(vm); } if (test_args.test_physical) { test_vm_create(&vm, &vcpu, PHYSICAL); test_run(vm, vcpu); - kvm_vm_free(vm); + test_vm_cleanup(vm); } return 0; From cade3d57e456e69f67aa9894bf89dc8678796bb7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:12 +0100 Subject: [PATCH 06/15] KVM: arm64: VHE: Synchronize restore of host debug registers When KVM runs in non-protected VHE mode, there's no context synchronization event between __debug_switch_to_host() restoring the host debug registers and __kvm_vcpu_run() unmasking debug exceptions. Due to this, it's theoretically possible for the host to take an unexpected debug exception due to the stale guest configuration. This cannot happen in NVHE/HVHE mode as debug exceptions are masked in the hyp code, and the exception return to the host will provide the necessary context synchronization before debug exceptions can be taken. For now, avoid the problem by adding an ISB after VHE hyp code restores the host debug registers. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250617133718.4014181-2-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 502a5b73ee70..73881e1dc267 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -167,6 +167,9 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); + + if (has_vhe()) + isb(); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ From 257d0aa8e2502754bc758faceceb6ff59318af60 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:13 +0100 Subject: [PATCH 07/15] KVM: arm64: VHE: Synchronize CPTR trap deactivation Currently there is no ISB between __deactivate_cptr_traps() disabling traps that affect EL2 and fpsimd_lazy_switch_to_host() manipulating registers potentially affected by CPTR traps. When NV is not in use, this is safe because the relevant registers are only accessed when guest_owns_fp_regs() && vcpu_has_sve(vcpu), and this also implies that SVE traps affecting EL2 have been deactivated prior to __guest_entry(). When NV is in use, a guest hypervisor may have configured SVE traps for a nested context, and so it is necessary to have an ISB between __deactivate_cptr_traps() and fpsimd_lazy_switch_to_host(). Due to the current lack of an ISB, when a guest hypervisor enables SVE traps in CPTR, the host can take an unexpected SVE trap from within fpsimd_lazy_switch_to_host(), e.g. | Unhandled 64-bit el1h sync exception on CPU1, ESR 0x0000000066000000 -- SVE | CPU: 1 UID: 0 PID: 164 Comm: kvm-vcpu-0 Not tainted 6.15.0-rc4-00138-ga05e0f012c05 #3 PREEMPT | Hardware name: FVP Base RevC (DT) | pstate: 604023c9 (nZCv DAIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : __kvm_vcpu_run+0x6f4/0x844 | lr : __kvm_vcpu_run+0x150/0x844 | sp : ffff800083903a60 | x29: ffff800083903a90 x28: ffff000801f4a300 x27: 0000000000000000 | x26: 0000000000000000 x25: ffff000801f90000 x24: ffff000801f900f0 | x23: ffff800081ff7720 x22: 0002433c807d623f x21: ffff000801f90000 | x20: ffff00087f730730 x19: 0000000000000000 x18: 0000000000000000 | x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 | x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 | x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 | x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffff000801f90d70 | x5 : 0000000000001000 x4 : ffff8007fd739000 x3 : ffff000801f90000 | x2 : 0000000000000000 x1 : 00000000000003cc x0 : ffff800082f9d000 | Kernel panic - not syncing: Unhandled exception | CPU: 1 UID: 0 PID: 164 Comm: kvm-vcpu-0 Not tainted 6.15.0-rc4-00138-ga05e0f012c05 #3 PREEMPT | Hardware name: FVP Base RevC (DT) | Call trace: | show_stack+0x18/0x24 (C) | dump_stack_lvl+0x60/0x80 | dump_stack+0x18/0x24 | panic+0x168/0x360 | __panic_unhandled+0x68/0x74 | el1h_64_irq_handler+0x0/0x24 | el1h_64_sync+0x6c/0x70 | __kvm_vcpu_run+0x6f4/0x844 (P) | kvm_arm_vcpu_enter_exit+0x64/0xa0 | kvm_arch_vcpu_ioctl_run+0x21c/0x870 | kvm_vcpu_ioctl+0x1a8/0x9d0 | __arm64_sys_ioctl+0xb4/0xf4 | invoke_syscall+0x48/0x104 | el0_svc_common.constprop.0+0x40/0xe0 | do_el0_svc+0x1c/0x28 | el0_svc+0x30/0xcc | el0t_64_sync_handler+0x10c/0x138 | el0t_64_sync+0x198/0x19c | SMP: stopping secondary CPUs | Kernel Offset: disabled | CPU features: 0x0000,000002c0,02df4fb9,97ee773f | Memory Limit: none | ---[ end Kernel panic - not syncing: Unhandled exception ]--- Fix this by adding an ISB between __deactivate_traps() and fpsimd_lazy_switch_to_host(). Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-3-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vhe/switch.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 09df2b42bc1b..20df44b49ceb 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -667,6 +667,9 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); + /* Ensure CPTR trap deactivation has taken effect */ + isb(); + fpsimd_lazy_switch_to_host(vcpu); sysreg_restore_host_state_vhe(host_ctxt); From e62dd507844fa47f0fdc29f3be5a90a83f297820 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:14 +0100 Subject: [PATCH 08/15] KVM: arm64: Reorganise CPTR trap manipulation The NVHE/HVHE and VHE modes have separate implementations of __activate_cptr_traps() and __deactivate_cptr_traps() in their respective switch.c files. There's some duplication of logic, and it's not currently possible to reuse this logic elsewhere. Move the logic into the common switch.h header so that it can be reused, and de-duplicate the common logic. This rework changes the way SVE traps are deactivated in VHE mode, aligning it with NVHE/HVHE modes: * Before this patch, VHE's __deactivate_cptr_traps() would unconditionally enable SVE for host EL2 (but not EL0), regardless of whether the ARM64_SVE cpucap was set. * After this patch, VHE's __deactivate_cptr_traps() will take the ARM64_SVE cpucap into account. When ARM64_SVE is not set, SVE will be trapped from EL2 and below. The old and new behaviour are both benign: * When ARM64_SVE is not set, the host will not touch SVE state, and will not reconfigure SVE traps. Host EL0 access to SVE will be trapped as expected. * When ARM64_SVE is set, the host will configure EL0 SVE traps before returning to EL0 as part of reloading the EL0 FPSIMD/SVE/SME state. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-4-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 130 ++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/switch.c | 59 ----------- arch/arm64/kvm/hyp/vhe/switch.c | 81 --------------- 3 files changed, 130 insertions(+), 140 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 76dfda116e56..8a77fcccbcf6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -65,6 +65,136 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } } +static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); +} + +static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + /* + * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to + * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, + * except for some missing controls, such as TAM. + * In this case, CPTR_EL2.TAM has the same position with or without + * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM + * shift value for trapping the AMU accesses. + */ + u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA; + u64 cptr; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } + + if (!vcpu_has_nv(vcpu)) + goto write; + + /* + * The architecture is a bit crap (what a surprise): an EL2 guest + * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, + * as they are RES0 in the guest's view. To work around it, trap the + * sucker using the very same bit it can't set... + */ + if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) + val |= CPTR_EL2_TCPAC; + + /* + * Layer the guest hypervisor's trap configuration on top of our own if + * we're in a nested context. + */ + if (is_hyp_ctxt(vcpu)) + goto write; + + cptr = vcpu_sanitised_cptr_el2(vcpu); + + /* + * Pay attention, there's some interesting detail here. + * + * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two + * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): + * + * - CPTR_EL2.xEN = x0, traps are enabled + * - CPTR_EL2.xEN = x1, traps are disabled + * + * In other words, bit[0] determines if guest accesses trap or not. In + * the interest of simplicity, clear the entire field if the guest + * hypervisor has traps enabled to dispel any illusion of something more + * complicated taking place. + */ + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; + + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + val |= cptr & CPACR_EL1_E0POE; + + val |= cptr & CPTR_EL2_TCPAC; + +write: + write_sysreg(val, cpacr_el1); +} + +static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + + if (has_vhe() || has_hvhe()) + __activate_cptr_traps_vhe(vcpu); + else + __activate_cptr_traps_nvhe(vcpu); +} + +static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1; + + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); +} + +static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN; + + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); +} + +static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_vhe() || has_hvhe()) + __deactivate_cptr_traps_vhe(vcpu); + else + __deactivate_cptr_traps_nvhe(vcpu); +} + #define reg_to_fgt_masks(reg) \ ({ \ struct fgt_masks *m; \ diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 73affe1333a4..0e752b515d0f 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -47,65 +47,6 @@ struct fgt_masks hdfgwtr2_masks; extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - - if (!guest_owns_fp_regs()) - __activate_traps_fpsimd32(vcpu); - - if (has_hvhe()) { - val |= CPACR_EL1_TTA; - - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; - } - - write_sysreg(val, cpacr_el1); - } else { - val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; - - /* - * Always trap SME since it's not supported in KVM. - * TSM is RES1 if SME isn't implemented. - */ - val |= CPTR_EL2_TSM; - - if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) - val |= CPTR_EL2_TZ; - - if (!guest_owns_fp_regs()) - val |= CPTR_EL2_TFP; - - write_sysreg(val, cptr_el2); - } -} - -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - if (has_hvhe()) { - u64 val = CPACR_EL1_FPEN; - - if (cpus_have_final_cap(ARM64_SVE)) - val |= CPACR_EL1_ZEN; - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN; - - write_sysreg(val, cpacr_el1); - } else { - u64 val = CPTR_NVHE_EL2_RES1; - - if (!cpus_have_final_cap(ARM64_SVE)) - val |= CPTR_EL2_TZ; - if (!cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; - - write_sysreg(val, cptr_el2); - } -} - static void __activate_traps(struct kvm_vcpu *vcpu) { ___activate_traps(vcpu, vcpu->arch.hcr_el2); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 20df44b49ceb..32b4814eb2b7 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -90,87 +90,6 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu) return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 cptr; - - /* - * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to - * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, - * except for some missing controls, such as TAM. - * In this case, CPTR_EL2.TAM has the same position with or without - * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM - * shift value for trapping the AMU accesses. - */ - u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM; - - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; - } else { - __activate_traps_fpsimd32(vcpu); - } - - if (!vcpu_has_nv(vcpu)) - goto write; - - /* - * The architecture is a bit crap (what a surprise): an EL2 guest - * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, - * as they are RES0 in the guest's view. To work around it, trap the - * sucker using the very same bit it can't set... - */ - if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) - val |= CPTR_EL2_TCPAC; - - /* - * Layer the guest hypervisor's trap configuration on top of our own if - * we're in a nested context. - */ - if (is_hyp_ctxt(vcpu)) - goto write; - - cptr = vcpu_sanitised_cptr_el2(vcpu); - - /* - * Pay attention, there's some interesting detail here. - * - * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two - * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): - * - * - CPTR_EL2.xEN = x0, traps are enabled - * - CPTR_EL2.xEN = x1, traps are disabled - * - * In other words, bit[0] determines if guest accesses trap or not. In - * the interest of simplicity, clear the entire field if the guest - * hypervisor has traps enabled to dispel any illusion of something more - * complicated taking place. - */ - if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_FPEN; - if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_ZEN; - - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - val |= cptr & CPACR_EL1_E0POE; - - val |= cptr & CPTR_EL2_TCPAC; - -write: - write_sysreg(val, cpacr_el1); -} - -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; - - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN; - - write_sysreg(val, cpacr_el1); -} - static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; From 59e6e101a6fa542a365dd5858affd18ba3e84cb8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:15 +0100 Subject: [PATCH 09/15] KVM: arm64: Remove ad-hoc CPTR manipulation from fpsimd_sve_sync() There's no need for fpsimd_sve_sync() to write to CPTR/CPACR. All relevant traps are always disabled earlier within __kvm_vcpu_run(), when __deactivate_cptr_traps() configures CPTR/CPACR. With irrelevant details elided, the flow is: handle___kvm_vcpu_run(...) { flush_hyp_vcpu(...) { fpsimd_sve_flush(...); } __kvm_vcpu_run(...) { __activate_traps(...) { __activate_cptr_traps(...); } do { __guest_enter(...); } while (...); __deactivate_traps(....) { __deactivate_cptr_traps(...); } } sync_hyp_vcpu(...) { fpsimd_sve_sync(...); } } Remove the unnecessary write to CPTR/CPACR. An ISB is still necessary, so a comment is added to describe this requirement. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-5-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index e9198e56e784..3206b2c07f82 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -69,7 +69,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) return; - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); + /* + * Traps have been disabled by __deactivate_cptr_traps(), but there + * hasn't necessarily been a context synchronization event yet. + */ isb(); if (vcpu_has_sve(vcpu)) From 186b58bacd74d9b7892869f7c7d20cf865a3c237 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:16 +0100 Subject: [PATCH 10/15] KVM: arm64: Remove ad-hoc CPTR manipulation from kvm_hyp_handle_fpsimd() The hyp code FPSIMD/SVE/SME trap handling logic has some rather messy open-coded manipulation of CPTR/CPACR. This is benign for non-nested guests, but broken for nested guests, as the guest hypervisor's CPTR configuration is not taken into account. Consider the case where L0 provides FPSIMD+SVE to an L1 guest hypervisor, and the L1 guest hypervisor only provides FPSIMD to an L2 guest (with L1 configuring CPTR/CPACR to trap SVE usage from L2). If the L2 guest triggers an FPSIMD trap to the L0 hypervisor, kvm_hyp_handle_fpsimd() will see that the vCPU supports FPSIMD+SVE, and will configure CPTR/CPACR to NOT trap FPSIMD+SVE before returning to the L2 guest. Consequently the L2 guest would be able to manipulate SVE state even though the L1 hypervisor had configured CPTR/CPACR to forbid this. Clean this up, and fix the nested virt issue by always using __deactivate_cptr_traps() and __activate_cptr_traps() to manage the CPTR traps. This removes the need for the ad-hoc fixup in kvm_hyp_save_fpsimd_host(), and ensures that any guest hypervisor configuration of CPTR/CPACR is taken into account. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-6-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 8a77fcccbcf6..2ad57b117385 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -616,11 +616,6 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) */ if (system_supports_sve()) { __hyp_sve_save_host(); - - /* Re-enable SVE traps if not supported for the guest vcpu. */ - if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_EL1_ZEN, 0); - } else { __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); } @@ -671,10 +666,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); - else - cpacr_clear_set(0, CPACR_EL1_FPEN); + __deactivate_cptr_traps(vcpu); isb(); /* Write out the host state if it's in the registers */ @@ -696,6 +688,13 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED; + /* + * Re-enable traps necessary for the current state of the guest, e.g. + * those enabled by a guest hypervisor. The ERET to the guest will + * provide the necessary context synchronization. + */ + __activate_cptr_traps(vcpu); + return true; } From 3a300a33e4063fb44c7887bec3aecd2fd6966df8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:17 +0100 Subject: [PATCH 11/15] KVM: arm64: Remove cpacr_clear_set() We no longer use cpacr_clear_set(). Remove cpacr_clear_set() and its helper functions. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-7-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 62 ---------------------------- 1 file changed, 62 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index bd020fc28aa9..0720898f563e 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -561,68 +561,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) vcpu_set_flag((v), e); \ } while (0) -#define __build_check_all_or_none(r, bits) \ - BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits)) - -#define __cpacr_to_cptr_clr(clr, set) \ - ({ \ - u64 cptr = 0; \ - \ - if ((set) & CPACR_EL1_FPEN) \ - cptr |= CPTR_EL2_TFP; \ - if ((set) & CPACR_EL1_ZEN) \ - cptr |= CPTR_EL2_TZ; \ - if ((set) & CPACR_EL1_SMEN) \ - cptr |= CPTR_EL2_TSM; \ - if ((clr) & CPACR_EL1_TTA) \ - cptr |= CPTR_EL2_TTA; \ - if ((clr) & CPTR_EL2_TAM) \ - cptr |= CPTR_EL2_TAM; \ - if ((clr) & CPTR_EL2_TCPAC) \ - cptr |= CPTR_EL2_TCPAC; \ - \ - cptr; \ - }) - -#define __cpacr_to_cptr_set(clr, set) \ - ({ \ - u64 cptr = 0; \ - \ - if ((clr) & CPACR_EL1_FPEN) \ - cptr |= CPTR_EL2_TFP; \ - if ((clr) & CPACR_EL1_ZEN) \ - cptr |= CPTR_EL2_TZ; \ - if ((clr) & CPACR_EL1_SMEN) \ - cptr |= CPTR_EL2_TSM; \ - if ((set) & CPACR_EL1_TTA) \ - cptr |= CPTR_EL2_TTA; \ - if ((set) & CPTR_EL2_TAM) \ - cptr |= CPTR_EL2_TAM; \ - if ((set) & CPTR_EL2_TCPAC) \ - cptr |= CPTR_EL2_TCPAC; \ - \ - cptr; \ - }) - -#define cpacr_clear_set(clr, set) \ - do { \ - BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \ - BUILD_BUG_ON((clr) & CPACR_EL1_E0POE); \ - __build_check_all_or_none((clr), CPACR_EL1_FPEN); \ - __build_check_all_or_none((set), CPACR_EL1_FPEN); \ - __build_check_all_or_none((clr), CPACR_EL1_ZEN); \ - __build_check_all_or_none((set), CPACR_EL1_ZEN); \ - __build_check_all_or_none((clr), CPACR_EL1_SMEN); \ - __build_check_all_or_none((set), CPACR_EL1_SMEN); \ - \ - if (has_vhe() || has_hvhe()) \ - sysreg_clear_set(cpacr_el1, clr, set); \ - else \ - sysreg_clear_set(cptr_el2, \ - __cpacr_to_cptr_clr(clr, set), \ - __cpacr_to_cptr_set(clr, set));\ - } while (0) - /* * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE * format if E2H isn't set. From 04c5355b2a94ff3191ce63ab035fb7f04d036869 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:18 +0100 Subject: [PATCH 12/15] KVM: arm64: VHE: Centralize ISBs when returning to host The VHE hyp code has recently gained a few ISBs. Simplify this to one unconditional ISB in __kvm_vcpu_run_vhe(), and remove the unnecessary ISB from the kvm_call_hyp_ret() macro. While kvm_call_hyp_ret() is also used to invoke __vgic_v3_get_gic_config(), but no ISB is necessary in that case either. For the moment, an ISB is left in kvm_call_hyp(), as there are many more users, and removing the ISB would require a more thorough audit. Suggested-by: Marc Zyngier Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-8-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 6 ++---- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 3 --- arch/arm64/kvm/hyp/vhe/switch.c | 25 +++++++++++------------ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5ccca509dff1..d27079968341 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1289,9 +1289,8 @@ void kvm_arm_resume_guest(struct kvm *kvm); }) /* - * The couple of isb() below are there to guarantee the same behaviour - * on VHE as on !VHE, where the eret to EL1 acts as a context - * synchronization event. + * The isb() below is there to guarantee the same behaviour on VHE as on !VHE, + * where the eret to EL1 acts as a context synchronization event. */ #define kvm_call_hyp(f, ...) \ do { \ @@ -1309,7 +1308,6 @@ void kvm_arm_resume_guest(struct kvm *kvm); \ if (has_vhe()) { \ ret = f(__VA_ARGS__); \ - isb(); \ } else { \ ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 73881e1dc267..502a5b73ee70 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -167,9 +167,6 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - - if (has_vhe()) - isb(); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 32b4814eb2b7..477f1580ffea 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -558,10 +558,10 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; - sysreg_save_host_state_vhe(host_ctxt); - fpsimd_lazy_switch_to_guest(vcpu); + sysreg_save_host_state_vhe(host_ctxt); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -586,18 +586,23 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); - /* Ensure CPTR trap deactivation has taken effect */ + sysreg_restore_host_state_vhe(host_ctxt); + + __debug_switch_to_host(vcpu); + + /* + * Ensure that all system register writes above have taken effect + * before returning to the host. In VHE mode, CPTR traps for + * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be + * manipulated after the ISB. + */ isb(); fpsimd_lazy_switch_to_host(vcpu); - sysreg_restore_host_state_vhe(host_ctxt); - if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); - __debug_switch_to_host(vcpu); - return exit_code; } NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); @@ -627,12 +632,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ local_daif_restore(DAIF_PROCCTX_NOIRQ); - /* - * When we exit from the guest we change a number of CPU configuration - * parameters, such as traps. We rely on the isb() in kvm_call_hyp*() - * to make sure these changes take effect before running the host or - * additional guests. - */ return ret; } From b5aafcb4efd2bdacbc37753cf807d69faa6a7304 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:19 +0800 Subject: [PATCH 13/15] KVM: TDX: Add new TDVMCALL status code for unsupported subfuncs Add the new TDVMCALL status code TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED and return it for unimplemented TDVMCALL subfunctions. Returning TDVMCALL_STATUS_INVALID_OPERAND when a subfunction is not implemented is vague because TDX guests can't tell the error is due to the subfunction is not supported or an invalid input of the subfunction. New GHCI spec adds TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED to avoid the ambiguity. Use it instead of TDVMCALL_STATUS_INVALID_OPERAND. Before the change, for common guest implementations, when a TDX guest receives TDVMCALL_STATUS_INVALID_OPERAND, it has two cases: 1. Some operand is invalid. It could change the operand to another value retry. 2. The subfunction is not supported. For case 1, an invalid operand usually means the guest implementation bug. Since the TDX guest can't tell which case is, the best practice for handling TDVMCALL_STATUS_INVALID_OPERAND is stopping calling such leaf, treating the failure as fatal if the TDVMCALL is essential or ignoring it if the TDVMCALL is optional. With this change, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED could be sent to old TDX guest that do not know about it, but it is expected that the guest will make the same action as TDVMCALL_STATUS_INVALID_OPERAND. Currently, no known TDX guest checks TDVMCALL_STATUS_INVALID_OPERAND specifically; for example Linux just checks for success. Signed-off-by: Binbin Wu [Return it for untrapped KVM_HC_MAP_GPA_RANGE. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/shared/tdx.h | 1 + arch/x86/kvm/vmx/tdx.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 2f3820342598..d8525e6ef50a 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -80,6 +80,7 @@ #define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL #define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL #define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL +#define TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED 0x8000000000000003ULL /* * Bitmasks of exposed registers (with VMM). diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b952bc673271..5d100c240ab3 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1212,11 +1212,13 @@ static int tdx_map_gpa(struct kvm_vcpu *vcpu) /* * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE - * bit set. If not, the error code is not defined in GHCI for TDX, use - * TDVMCALL_STATUS_INVALID_OPERAND for this case. + * bit set. This is a base call so it should always be supported, but + * KVM has no way to ensure that userspace implements the GHCI correctly. + * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error + * to the guest. */ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { - ret = TDVMCALL_STATUS_INVALID_OPERAND; + ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; goto error; } @@ -1476,7 +1478,7 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) break; } - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED); return 1; } From cf207eac06f661fb692f405d5ab8230df884ee52 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:20 +0800 Subject: [PATCH 14/15] KVM: TDX: Handle TDG.VP.VMCALL Handle TDVMCALL for GetQuote to generate a TD-Quote. GetQuote is a doorbell-like interface used by TDX guests to request VMM to generate a TD-Quote signed by a service hosting TD-Quoting Enclave operating on the host. A TDX guest passes a TD Report (TDREPORT_STRUCT) in a shared-memory area as parameter. Host VMM can access it and queue the operation for a service hosting TD-Quoting enclave. When completed, the Quote is returned via the same shared-memory area. KVM only checks the GPA from the TDX guest has the shared-bit set and drops the shared-bit before exiting to userspace to avoid bleeding the shared-bit into KVM's exit ABI. KVM forwards the request to userspace VMM (e.g. QEMU) and userspace VMM queues the operation asynchronously. KVM sets the return code according to the 'ret' field set by userspace to notify the TDX guest whether the request has been queued successfully or not. When the request has been queued successfully, the TDX guest can poll the status field in the shared-memory area to check whether the Quote generation is completed or not. When completed, the generated Quote is returned via the same buffer. Add KVM_EXIT_TDX as a new exit reason to userspace. Userspace is required to handle the KVM exit reason as the initial support for TDX, by reentering KVM to ensure that the TDVMCALL is complete. While at it, add a note that KVM_EXIT_HYPERCALL also requires reentry with KVM_RUN. Signed-off-by: Binbin Wu Tested-by: Mikko Ylinen Acked-by: Kai Huang [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 49 +++++++++++++++++++++++++++++++++- arch/x86/kvm/vmx/tdx.c | 32 ++++++++++++++++++++++ include/uapi/linux/kvm.h | 17 ++++++++++++ 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1bd2d42e6424..115ec3c2b641 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6645,7 +6645,8 @@ to the byte array. .. note:: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, - KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding + KVM_EXIT_EPR, KVM_EXIT_HYPERCALL, KVM_EXIT_TDX, + KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish incomplete operations and then check for pending signals. @@ -7174,6 +7175,52 @@ The valid value for 'flags' is: - KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid in VMCS. It would run into unknown result if resume the target VM. +:: + + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + u64 ret; + u64 data[5]; + } unknown; + struct { + u64 ret; + u64 gpa; + u64 size; + } get_quote; + }; + } tdx; + +Process a TDVMCALL from the guest. KVM forwards select TDVMCALL based +on the Guest-Hypervisor Communication Interface (GHCI) specification; +KVM bridges these requests to the userspace VMM with minimal changes, +placing the inputs in the union and copying them back to the guest +on re-entry. + +Flags are currently always zero, whereas ``nr`` contains the TDVMCALL +number from register R11. The remaining field of the union provide the +inputs and outputs of the TDVMCALL. Currently the following values of +``nr`` are defined: + +* ``TDVMCALL_GET_QUOTE``: the guest has requested to generate a TD-Quote +signed by a service hosting TD-Quoting Enclave operating on the host. +Parameters and return value are in the ``get_quote`` field of the union. +The ``gpa`` field and ``size`` specify the guest physical address +(without the shared bit set) and the size of a shared-memory buffer, in +which the TDX guest passes a TD Report. The ``ret`` field represents +the return value of the GetQuote request. When the request has been +queued successfully, the TDX guest can poll the status field in the +shared-memory area to check whether the Quote generation is completed or +not. When completed, the generated Quote is returned via the same buffer. + +KVM may add support for more values in the future that may cause a userspace +exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, +it will enter with output fields already valid; in the common case, the +``unknown.ret`` field of the union will be ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``. +Userspace need not do anything if it does not wish to support a TDVMCALL. :: /* Fix the size of the union. */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 5d100c240ab3..b619a3478983 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1465,6 +1465,36 @@ static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) return 1; } +static int tdx_complete_simple(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); + return 1; +} + +static int tdx_get_quote(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + + /* The gpa of buffer must have shared bit set. */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; + vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + vcpu->run->tdx.get_quote.size = size; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + static int handle_tdvmcall(struct kvm_vcpu *vcpu) { switch (tdvmcall_leaf(vcpu)) { @@ -1474,6 +1504,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) return tdx_report_fatal_error(vcpu); case TDVMCALL_GET_TD_VM_CALL_INFO: return tdx_get_td_vm_call_info(vcpu); + case TDVMCALL_GET_QUOTE: + return tdx_get_quote(vcpu); default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d00b85cb168c..e23e7286ad1a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -178,6 +178,7 @@ struct kvm_xen_exit { #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 +#define KVM_EXIT_TDX 40 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -447,6 +448,22 @@ struct kvm_run { __u64 gpa; __u64 size; } memory_fault; + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + __u64 ret; + __u64 data[5]; + } unknown; + struct { + __u64 ret; + __u64 gpa; + __u64 size; + } get_quote; + }; + } tdx; /* Fix the size of the union. */ char padding[256]; }; From 25e8b1dd4883e6c251c3db5b347f3c8ae4ade921 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:21 +0800 Subject: [PATCH 15/15] KVM: TDX: Exit to userspace for GetTdVmCallInfo Exit to userspace for TDG.VP.VMCALL via KVM_EXIT_TDX, to allow userspace to provide information about the support of TDVMCALLs when r12 is 1 for the TDVMCALLs beyond the GHCI base API. GHCI spec defines the GHCI base TDVMCALLs: , , , , <#VE.RequestMMIO>, , , and . They must be supported by VMM to support TDX guests. For GetTdVmCallInfo - When leaf (r12) to enumerate TDVMCALL functionality is set to 0, successful execution indicates all GHCI base TDVMCALLs listed above are supported. Update the KVM TDX document with the set of the GHCI base APIs. - When leaf (r12) to enumerate TDVMCALL functionality is set to 1, it indicates the TDX guest is querying the supported TDVMCALLs beyond the GHCI base TDVMCALLs. Exit to userspace to let userspace set the TDVMCALL sub-function bit(s) accordingly to the leaf outputs. KVM could set the TDVMCALL bit(s) supported by itself when the TDVMCALLs don't need support from userspace after returning from userspace and before entering guest. Currently, no such TDVMCALLs implemented, KVM just sets the values returned from userspace. Suggested-by: Paolo Bonzini Signed-off-by: Binbin Wu [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 10 ++++++++ arch/x86/kvm/vmx/tdx.c | 43 ++++++++++++++++++++++++++++++---- include/uapi/linux/kvm.h | 5 ++++ 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 115ec3c2b641..9abf93ee5f65 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7191,6 +7191,11 @@ The valid value for 'flags' is: u64 gpa; u64 size; } get_quote; + struct { + u64 ret; + u64 leaf; + u64 r11, r12, r13, r14; + } get_tdvmcall_info; }; } tdx; @@ -7216,6 +7221,11 @@ queued successfully, the TDX guest can poll the status field in the shared-memory area to check whether the Quote generation is completed or not. When completed, the generated Quote is returned via the same buffer. +* ``TDVMCALL_GET_TD_VM_CALL_INFO``: the guest has requested the support +status of TDVMCALLs. The output values for the given leaf should be +placed in fields from ``r11`` to ``r14`` of the ``get_tdvmcall_info`` +field of the union. + KVM may add support for more values in the future that may cause a userspace exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, it will enter with output fields already valid; in the common case, the diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b619a3478983..1ad20c273f3b 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1451,18 +1451,53 @@ static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) return 1; } +static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); + + /* + * For now, there is no TDVMCALL beyond GHCI base API supported by KVM + * directly without the support from userspace, just set the value + * returned from userspace. + */ + tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; + tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; + tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; + tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; + + return 1; +} + static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) { struct vcpu_tdx *tdx = to_tdx(vcpu); - if (tdx->vp_enter_args.r12) - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); - else { + switch (tdx->vp_enter_args.r12) { + case 0: tdx->vp_enter_args.r11 = 0; + tdx->vp_enter_args.r12 = 0; tdx->vp_enter_args.r13 = 0; tdx->vp_enter_args.r14 = 0; + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); + return 1; + case 1: + vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; + vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; + vcpu->run->tdx.get_tdvmcall_info.r11 = 0; + vcpu->run->tdx.get_tdvmcall_info.r12 = 0; + vcpu->run->tdx.get_tdvmcall_info.r13 = 0; + vcpu->run->tdx.get_tdvmcall_info.r14 = 0; + vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; + return 0; + default: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; } - return 1; } static int tdx_complete_simple(struct kvm_vcpu *vcpu) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e23e7286ad1a..37891580d05d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -462,6 +462,11 @@ struct kvm_run { __u64 gpa; __u64 size; } get_quote; + struct { + __u64 ret; + __u64 leaf; + __u64 r11, r12, r13, r14; + } get_tdvmcall_info; }; } tdx; /* Fix the size of the union. */