From 7b424ffcd45821600312ed4c794c21f7805e79d4 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 13 Sep 2023 16:56:44 +0000 Subject: [PATCH 1/8] KVM: arm64: Don't use kerneldoc comment for arm64_check_features() A double-asterisk opening mark to the comment (i.e. '/**') indicates a comment block is in the kerneldoc format. There's automation in place to validate that kerneldoc blocks actually adhere to the formatting rules. The function comment for arm64_check_features() isn't kerneldoc; use a 'regular' comment to silence automation warnings. Link: https://lore.kernel.org/all/202309112251.e25LqfcK-lkp@intel.com/ Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20230913165645.2319017-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e92ec810d449..818a52e257ed 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1228,7 +1228,7 @@ static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, return arm64_ftr_safe_value(&kvm_ftr, new, cur); } -/** +/* * arm64_check_features() - Check if a feature register value constitutes * a subset of features indicated by the idreg's KVM sanitised limit. * From ec1c3b9ff16082f880b304be40992568f4eee6a7 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 20 Sep 2023 08:01:32 +0000 Subject: [PATCH 2/8] arm64: tlbflush: Rename MAX_TLBI_OPS Perhaps unsurprisingly, I-cache invalidations suffer from performance issues similar to TLB invalidations on certain systems. TLB and I-cache maintenance all result in DVM on the mesh, which is where the real bottleneck lies. Rename the heuristic to point the finger at DVM, such that it may be reused for limiting I-cache invalidations. Reviewed-by: Gavin Shan Tested-by: Gavin Shan Acked-by: Will Deacon Link: https://lore.kernel.org/r/20230920080133.944717-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/tlbflush.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index b149cf9f91bc..3431d37e5054 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -333,7 +333,7 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) * This is meant to avoid soft lock-ups on large TLB flushing ranges and not * necessarily a performance improvement. */ -#define MAX_TLBI_OPS PTRS_PER_PTE +#define MAX_DVM_OPS PTRS_PER_PTE /* * __flush_tlb_range_op - Perform TLBI operation upon a range @@ -413,12 +413,12 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, /* * When not uses TLB range ops, we can handle up to - * (MAX_TLBI_OPS - 1) pages; + * (MAX_DVM_OPS - 1) pages; * When uses TLB range ops, we can handle up to * (MAX_TLBI_RANGE_PAGES - 1) pages. */ if ((!system_supports_tlb_range() && - (end - start) >= (MAX_TLBI_OPS * stride)) || + (end - start) >= (MAX_DVM_OPS * stride)) || pages >= MAX_TLBI_RANGE_PAGES) { flush_tlb_mm(vma->vm_mm); return; @@ -451,7 +451,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end { unsigned long addr; - if ((end - start) > (MAX_TLBI_OPS * PAGE_SIZE)) { + if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) { flush_tlb_all(); return; } From 909b583f81b5bb5a398d4580543f59b908a86ccc Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 20 Sep 2023 08:01:33 +0000 Subject: [PATCH 3/8] KVM: arm64: Avoid soft lockups due to I-cache maintenance Gavin reports of soft lockups on his Ampere Altra Max machine when backing KVM guests with hugetlb pages. Upon further investigation, it was found that the system is unable to keep up with parallel I-cache invalidations done by KVM's stage-2 fault handler. This is ultimately an implementation problem. I-cache maintenance instructions are available at EL0, so nothing stops a malicious userspace from hammering a system with CMOs and cause it to fall over. "Fixing" this problem in KVM is nothing more than slapping a bandage over a much deeper problem. Anyway, the kernel already has a heuristic for limiting TLB invalidations to avoid soft lockups. Reuse that logic to limit I-cache CMOs done by KVM to map executable pages on systems without FEAT_DIC. While at it, restructure __invalidate_icache_guest_page() to improve readability and squeeze our new condition into the existing branching structure. Link: https://lore.kernel.org/kvmarm/20230904072826.1468907-1-gshan@redhat.com/ Reviewed-by: Gavin Shan Tested-by: Gavin Shan Link: https://lore.kernel.org/r/20230920080133.944717-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_mmu.h | 37 ++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 96a80e8f6226..a425ecdd7be0 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -224,16 +224,41 @@ static inline void __clean_dcache_guest_page(void *va, size_t size) kvm_flush_dcache_to_poc(va, size); } +static inline size_t __invalidate_icache_max_range(void) +{ + u8 iminline; + u64 ctr; + + asm volatile(ALTERNATIVE_CB("movz %0, #0\n" + "movk %0, #0, lsl #16\n" + "movk %0, #0, lsl #32\n" + "movk %0, #0, lsl #48\n", + ARM64_ALWAYS_SYSTEM, + kvm_compute_final_ctr_el0) + : "=r" (ctr)); + + iminline = SYS_FIELD_GET(CTR_EL0, IminLine, ctr) + 2; + return MAX_DVM_OPS << iminline; +} + static inline void __invalidate_icache_guest_page(void *va, size_t size) { - if (icache_is_aliasing()) { - /* any kind of VIPT cache */ + /* + * VPIPT I-cache maintenance must be done from EL2. See comment in the + * nVHE flavor of __kvm_tlb_flush_vmid_ipa(). + */ + if (icache_is_vpipt() && read_sysreg(CurrentEL) != CurrentEL_EL2) + return; + + /* + * Blow the whole I-cache if it is aliasing (i.e. VIPT) or the + * invalidation range exceeds our arbitrary limit on invadations by + * cache line. + */ + if (icache_is_aliasing() || size > __invalidate_icache_max_range()) icache_inval_all_pou(); - } else if (read_sysreg(CurrentEL) != CurrentEL_EL1 || - !icache_is_vpipt()) { - /* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */ + else icache_inval_pou((unsigned long)va, (unsigned long)va + size); - } } void kvm_set_way_flush(struct kvm_vcpu *vcpu); From c04bf723ccd639250fa680a0fb4e1ac15bb84c3b Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Thu, 28 Sep 2023 18:32:03 +0100 Subject: [PATCH 4/8] KVM: arm64: Do not transfer page refcount for THP adjustment GUP affects a refcount common to all pages forming the THP. There is therefore no need to move the refcount from a tail to the head page. Under the hood it decrements and increments the same counter. Signed-off-by: Vincent Donnefort Reviewed-by: Gavin Shan Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20230928173205.2826598-2-vdonnefort@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 587a104f66c3..de5e5148ef5d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1295,28 +1295,8 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, if (sz < PMD_SIZE) return PAGE_SIZE; - /* - * The address we faulted on is backed by a transparent huge - * page. However, because we map the compound huge page and - * not the individual tail page, we need to transfer the - * refcount to the head page. We have to be careful that the - * THP doesn't start to split while we are adjusting the - * refcounts. - * - * We are sure this doesn't happen, because mmu_invalidate_retry - * was successful and we are holding the mmu_lock, so if this - * THP is trying to split, it will be blocked in the mmu - * notifier before touching any of the pages, specifically - * before being able to call __split_huge_page_refcount(). - * - * We can therefore safely transfer the refcount from PG_tail - * to PG_head and switch the pfn from a tail page to the head - * page accordingly. - */ *ipap &= PMD_MASK; - kvm_release_pfn_clean(pfn); pfn &= ~(PTRS_PER_PMD - 1); - get_page(pfn_to_page(pfn)); *pfnp = pfn; return PMD_SIZE; From be097997a273259f1723baac5463cf19d8564efa Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 22 Sep 2023 22:32:29 +0000 Subject: [PATCH 5/8] KVM: arm64: Always invalidate TLB for stage-2 permission faults It is possible for multiple vCPUs to fault on the same IPA and attempt to resolve the fault. One of the page table walks will actually update the PTE and the rest will return -EAGAIN per our race detection scheme. KVM elides the TLB invalidation on the racing threads as the return value is nonzero. Before commit a12ab1378a88 ("KVM: arm64: Use local TLBI on permission relaxation") KVM always used broadcast TLB invalidations when handling permission faults, which had the convenient property of making the stage-2 updates visible to all CPUs in the system. However now we do a local invalidation, and TLBI elision leads to the vCPU thread faulting again on the stale entry. Remember that the architecture permits the TLB to cache translations that precipitate a permission fault. Invalidate the TLB entry responsible for the permission fault if the stage-2 descriptor has been relaxed, regardless of which thread actually did the job. Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20230922223229.1608155-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index f155b8c9e98c..286888751793 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1314,7 +1314,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED); - if (!ret) + if (!ret || ret == -EAGAIN) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; } From beaf35b480875d05d7c751d50951a659ce6dff94 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 7 Oct 2023 20:40:42 +0800 Subject: [PATCH 6/8] KVM: arm64: selftest: Add the missing .guest_prepare() Running page_fault_test on a Cortex A72 fails with Test: ro_memslot_no_syndrome_guest_cas Testing guest mode: PA-bits:40, VA-bits:48, 4K pages Testing memory backing src type: anonymous ==== Test Assertion Failure ==== aarch64/page_fault_test.c:117: guest_check_lse() pid=1944087 tid=1944087 errno=4 - Interrupted system call 1 0x00000000004028b3: vcpu_run_loop at page_fault_test.c:682 2 0x0000000000402d93: run_test at page_fault_test.c:731 3 0x0000000000403957: for_each_guest_mode at guest_modes.c:100 4 0x00000000004019f3: for_each_test_and_guest_mode at page_fault_test.c:1108 5 (inlined by) main at page_fault_test.c:1134 6 0x0000ffff868e503b: ?? ??:0 7 0x0000ffff868e5113: ?? ??:0 8 0x0000000000401aaf: _start at ??:? guest_check_lse() because we don't have a guest_prepare stage to check the presence of FEAT_LSE and skip the related guest_cas testing, and we end-up failing in GUEST_ASSERT(guest_check_lse()). Add the missing .guest_prepare() where it's indeed required. Signed-off-by: Zenghui Yu Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20231007124043.626-1-yuzenghui@huawei.com Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/page_fault_test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 47bb914ab2fa..eb5ead50c0b1 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -842,6 +842,7 @@ static void help(char *name) .name = SCAT2(ro_memslot_no_syndrome, _access), \ .data_memslot_flags = KVM_MEM_READONLY, \ .pt_memslot_flags = KVM_MEM_READONLY, \ + .guest_prepare = { _PREPARE(_access) }, \ .guest_test = _access, \ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ .expected_events = { .fail_vcpu_runs = 1 }, \ @@ -865,6 +866,7 @@ static void help(char *name) .name = SCAT2(ro_memslot_no_syn_and_dlog, _access), \ .data_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ .pt_memslot_flags = KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES, \ + .guest_prepare = { _PREPARE(_access) }, \ .guest_test = _access, \ .guest_test_check = { _test_check }, \ .fail_vcpu_run_handler = fail_vcpu_run_mmio_no_syndrome_handler, \ @@ -894,6 +896,7 @@ static void help(char *name) .data_memslot_flags = KVM_MEM_READONLY, \ .pt_memslot_flags = KVM_MEM_READONLY, \ .mem_mark_cmd = CMD_HOLE_DATA | CMD_HOLE_PT, \ + .guest_prepare = { _PREPARE(_access) }, \ .guest_test = _access, \ .uffd_data_handler = _uffd_data_handler, \ .uffd_pt_handler = uffd_pt_handler, \ From 06899aa5dd3d76e888b28d7e8d7304c0a7ec6262 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sat, 7 Oct 2023 20:40:43 +0800 Subject: [PATCH 7/8] KVM: arm64: selftest: Perform ISB before reading PAR_EL1 It looks like a mistake to issue ISB *after* reading PAR_EL1, we should instead perform it between the AT instruction and the reads of PAR_EL1. As according to DDI0487J.a IJTYVP, "When an address translation instruction is executed, explicit synchronization is required to guarantee the result is visible to subsequent direct reads of PAR_EL1." Otherwise all guest_at testcases fail on my box with ==== Test Assertion Failure ==== aarch64/page_fault_test.c:142: par & 1 == 0 pid=1355864 tid=1355864 errno=4 - Interrupted system call 1 0x0000000000402853: vcpu_run_loop at page_fault_test.c:681 2 0x0000000000402cdb: run_test at page_fault_test.c:730 3 0x0000000000403897: for_each_guest_mode at guest_modes.c:100 4 0x00000000004019f3: for_each_test_and_guest_mode at page_fault_test.c:1105 5 (inlined by) main at page_fault_test.c:1131 6 0x0000ffffb153c03b: ?? ??:0 7 0x0000ffffb153c113: ?? ??:0 8 0x0000000000401aaf: _start at ??:? 0x1 != 0x0 (par & 1 != 0) Signed-off-by: Zenghui Yu Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20231007124043.626-2-yuzenghui@huawei.com Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/aarch64/page_fault_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index eb5ead50c0b1..9a3be80e8bc9 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -135,8 +135,8 @@ static void guest_at(void) uint64_t par; asm volatile("at s1e1r, %0" :: "r" (guest_test_memory)); - par = read_sysreg(par_el1); isb(); + par = read_sysreg(par_el1); /* Bit 1 indicates whether the AT was successful */ GUEST_ASSERT_EQ(par & 1, 0); From d11974dc5f208ae1c0fe62a9bcb47c0cdcd7b081 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 26 Oct 2023 20:53:06 +0000 Subject: [PATCH 8/8] KVM: arm64: Add tracepoint for MMIO accesses where ISV==0 It is a pretty well known fact that KVM does not support MMIO emulation without valid instruction syndrome information (ESR_EL2.ISV == 0). The current kvm_pr_unimpl() is pretty useless, as it contains zero context to relate the event to a vCPU. Replace it with a precise tracepoint that dumps the relevant context so the user can make sense of what the guest is doing. Acked-by: Zenghui Yu Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20231026205306.3045075-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmio.c | 4 +++- arch/arm64/kvm/trace_arm.h | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 3dd38a151d2a..200c8019a82a 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -135,6 +135,9 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) * volunteered to do so, and bail out otherwise. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { + trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), + kvm_vcpu_get_hfar(vcpu), fault_ipa); + if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; @@ -143,7 +146,6 @@ int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) return 0; } - kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n"); return -ENOSYS; } diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h index 8ad53104934d..c18c1a95831e 100644 --- a/arch/arm64/kvm/trace_arm.h +++ b/arch/arm64/kvm/trace_arm.h @@ -136,6 +136,31 @@ TRACE_EVENT(kvm_mmio_emulate, __entry->vcpu_pc, __entry->instr, __entry->cpsr) ); +TRACE_EVENT(kvm_mmio_nisv, + TP_PROTO(unsigned long vcpu_pc, unsigned long esr, + unsigned long far, unsigned long ipa), + TP_ARGS(vcpu_pc, esr, far, ipa), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_pc ) + __field( unsigned long, esr ) + __field( unsigned long, far ) + __field( unsigned long, ipa ) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->esr = esr; + __entry->far = far; + __entry->ipa = ipa; + ), + + TP_printk("ipa %#016lx, esr %#016lx, far %#016lx, pc %#016lx", + __entry->ipa, __entry->esr, + __entry->far, __entry->vcpu_pc) +); + + TRACE_EVENT(kvm_set_way_flush, TP_PROTO(unsigned long vcpu_pc, bool cache), TP_ARGS(vcpu_pc, cache),