From 297877069bc2fa079fb2a60ae91ca9abb481074a Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Nov 2025 01:38:21 -0800 Subject: [PATCH 1/7] KVM: arm64: Drop useless __GFP_HIGHMEM from kvm struct allocation A recent change on the receiving end of vmalloc() started warning about unsupported GFP flags passed by the caller. Nathan reports that this warning fires in kvm_arch_alloc_vm(), owing to the fact that KVM is passing a meaningless __GFP_HIGHMEM. Do as the warning says and fix the code. Cc: Vishal Moola (Oracle) Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/kvmarm/20251118224448.GA998046@ax162/ Acked-by: Vishal Moola (Oracle) Reviewed-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://msgid.link/20251119093822.2513142-2-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..e791fa52f874 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -440,7 +440,7 @@ struct kvm *kvm_arch_alloc_vm(void) if (!has_vhe()) return kzalloc(sz, GFP_KERNEL_ACCOUNT); - return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); + return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) From cb17d79ff51d41f656bcf7928330b2e9c0003583 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Nov 2025 01:38:22 -0800 Subject: [PATCH 2/7] KVM: arm64: Use kvzalloc() for kvm struct allocation Physically-allocated KVM structs aren't necessary when in VHE mode as there's no need to share with the hyp's address space. Of course, there can still be a performance benefit from physical allocations. Use kvzalloc() for opportunistic physical allocations. Acked-by: Vishal Moola (Oracle) Reviewed-by: Marc Zyngier Reviewed-by: Joey Gouly Link: https://msgid.link/20251119093822.2513142-3-oupton@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e791fa52f874..ecbe2c8dc00c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -440,7 +440,7 @@ struct kvm *kvm_arch_alloc_vm(void) if (!has_vhe()) return kzalloc(sz, GFP_KERNEL_ACCOUNT); - return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + return kvzalloc(sz, GFP_KERNEL_ACCOUNT); } int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) From 31df012da496968d8d4368f693ad45dfcbfba40b Mon Sep 17 00:00:00 2001 From: Maximilian Dittgen Date: Wed, 19 Nov 2025 14:57:43 +0100 Subject: [PATCH 3/7] KVM: selftests: Assert GICR_TYPER.Processor_Number matches selftest CPU number The selftests GIC library and tests assume that the GICR_TYPER.Processor_number associated with a given CPU is the same as the CPU's selftest index. Since this assumption is not guaranteed by specification, add an assert in gicv3_cpu_init() that validates this is true. Signed-off-by: Maximilian Dittgen Link: https://msgid.link/20251119135744.68552-1-mdittgen@amazon.de Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/lib/arm64/gic_v3.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c index 66d05506f78b..f81025cd32e2 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c @@ -298,12 +298,17 @@ static void gicv3_cpu_init(unsigned int cpu) volatile void *sgi_base; unsigned int i; volatile void *redist_base_cpu; + u64 typer; GUEST_ASSERT(cpu < gicv3_data.nr_cpus); redist_base_cpu = gicr_base_cpu(cpu); sgi_base = sgi_base_from_redist(redist_base_cpu); + /* Verify assumption that GICR_TYPER.Processor_number == cpu */ + typer = readq_relaxed(redist_base_cpu + GICR_TYPER); + GUEST_ASSERT_EQ(GICR_TYPER_CPU_NUMBER(typer), cpu); + gicv3_enable_redist(redist_base_cpu); /* From 85f329df293119d6ba1a26453d109556631081a4 Mon Sep 17 00:00:00 2001 From: Maximilian Dittgen Date: Wed, 19 Nov 2025 14:57:44 +0100 Subject: [PATCH 4/7] KVM: selftests: SYNC after guest ITS setup in vgic_lpi_stress vgic_lpi_stress sends MAPTI and MAPC commands during guest GIC setup to map interrupt events to ITT entries and collection IDs to redistributors, respectively. We have no guarantee that the ITS will finish handling these mapping commands before the selftest calls KVM_SIGNAL_MSI to inject LPIs to the guest. If LPIs are injected before ITS mapping completes, the ITS cannot properly pass the interrupt on to the redistributor. Fix by adding a SYNC command to the selftests ITS library, then calling SYNC after ITS mapping to ensure mapping completes before signal_lpi() writes to GITS_TRANSLATER. Signed-off-by: Maximilian Dittgen Link: https://msgid.link/20251119135744.68552-2-mdittgen@amazon.de Signed-off-by: Oliver Upton --- tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c | 4 ++++ tools/testing/selftests/kvm/include/arm64/gic_v3_its.h | 1 + tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c | 10 ++++++++++ 3 files changed, 15 insertions(+) diff --git a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c index 687d04463983..e857a605f577 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c +++ b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c @@ -118,6 +118,10 @@ static void guest_setup_gic(void) guest_setup_its_mappings(); guest_invalidate_all_rdists(); + + /* SYNC to ensure ITS setup is complete */ + for (cpuid = 0; cpuid < test_data.nr_cpus; cpuid++) + its_send_sync_cmd(test_data.cmdq_base_va, cpuid); } static void guest_code(size_t nr_lpis) diff --git a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h index 3722ed9c8f96..58feef3eb386 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h +++ b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h @@ -15,5 +15,6 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, u32 collection_id, u32 intid); void its_send_invall_cmd(void *cmdq_base, u32 collection_id); +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id); #endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 09f270545646..aec1b69a4de3 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -246,3 +246,13 @@ void its_send_invall_cmd(void *cmdq_base, u32 collection_id) its_send_cmd(cmdq_base, &cmd); } + +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_SYNC); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); + + its_send_cmd(cmdq_base, &cmd); +} From 156f70afcfecfc45be5fdc2e4adebc5ea70a93b0 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Nov 2025 14:11:50 -0800 Subject: [PATCH 5/7] KVM: arm64: Only drop references on empty tables in stage2_free_walker A subsequent change to the way KVM frees stage-2s will invoke the free walker on sub-ranges of the VM's IPA space, meaning there's potential for only partially visiting a table's PTEs. Split the leaf and table visitors and only drop references on a table when the page count reaches 1, implying there are no valid PTEs that need to be visited. Invalidate the table PTE to avoid traversing the stale reference. Link: https://msgid.link/20251113052452.975081-2-rananta@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 42 +++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index c351b4abd5db..6d6a23f7dedb 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1535,20 +1535,46 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; } -static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, - enum kvm_pgtable_walk_flags visit) +static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + mm_ops->put_page(ctx->ptep); + return 0; +} + +static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); + + if (mm_ops->page_count(childp) != 1) + return 0; + + /* + * Drop references and clear the now stale PTE to avoid rewalking the + * freed page table. + */ + mm_ops->put_page(ctx->ptep); + mm_ops->put_page(childp); + kvm_clear_pte(ctx->ptep); + return 0; +} + +static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ if (!stage2_pte_is_counted(ctx->old)) return 0; - mm_ops->put_page(ctx->ptep); - - if (kvm_pte_table(ctx->old, ctx->level)) - mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); - - return 0; + switch (visit) { + case KVM_PGTABLE_WALK_LEAF: + return stage2_free_leaf(ctx); + case KVM_PGTABLE_WALK_TABLE_POST: + return stage2_free_table_post(ctx); + default: + return -EINVAL; + } } void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) From d68d66e57e2be9541fa67bafabdfe8826bb73799 Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Thu, 13 Nov 2025 05:24:51 +0000 Subject: [PATCH 6/7] KVM: arm64: Split kvm_pgtable_stage2_destroy() Split kvm_pgtable_stage2_destroy() into two: - kvm_pgtable_stage2_destroy_range(), that performs the page-table walk and free the entries over a range of addresses. - kvm_pgtable_stage2_destroy_pgd(), that frees the PGD. This refactoring enables subsequent patches to free large page-tables in chunks, calling cond_resched() between each chunk, to yield the CPU as necessary. Existing callers of kvm_pgtable_stage2_destroy(), that probably cannot take advantage of this (such as nVMHE), will continue to function as is. Signed-off-by: Raghavendra Rao Ananta Suggested-by: Oliver Upton Link: https://msgid.link/20251113052452.975081-3-rananta@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 30 ++++++++++++++++++++++++++++ arch/arm64/include/asm/kvm_pkvm.h | 4 +++- arch/arm64/kvm/hyp/pgtable.c | 25 +++++++++++++++++++---- arch/arm64/kvm/mmu.c | 12 +++++++++-- arch/arm64/kvm/pkvm.c | 11 ++++++++-- 5 files changed, 73 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 2888b5d03757..1246216616b5 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -355,6 +355,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return pteref; } +static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) +{ + return pteref; +} + static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { /* @@ -384,6 +389,11 @@ static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walke return rcu_dereference_check(pteref, !(walker->flags & KVM_PGTABLE_WALK_SHARED)); } +static inline kvm_pte_t *kvm_dereference_pteref_raw(kvm_pteref_t pteref) +{ + return rcu_dereference_raw(pteref); +} + static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker) { if (walker->flags & KVM_PGTABLE_WALK_SHARED) @@ -551,6 +561,26 @@ static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2 */ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); +/** + * kvm_pgtable_stage2_destroy_range() - Destroy the unlinked range of addresses. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * @addr: Intermediate physical address at which to place the mapping. + * @size: Size of the mapping. + * + * The page-table is assumed to be unreachable by any hardware walkers prior + * to freeing and therefore no TLB invalidation is performed. + */ +void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size); + +/** + * kvm_pgtable_stage2_destroy_pgd() - Destroy the PGD of guest stage-2 page-table. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * + * It is assumed that the rest of the page-table is freed before this operation. + */ +void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); + /** * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. * @mm_ops: Memory management callbacks. diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 08be89c95466..0aecd4ac5f45 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -180,7 +180,9 @@ struct pkvm_mapping { int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops); -void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); +void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size); +void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt); int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags); diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 6d6a23f7dedb..0882896dbf8f 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1577,21 +1577,38 @@ static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, } } -void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size) { - size_t pgd_sz; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; - WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); + WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); +} + +void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) +{ + size_t pgd_sz; + pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; - pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); + + /* + * Since the pgtable is unlinked at this point, and not shared with + * other walkers, safely deference pgd with kvm_dereference_pteref_raw() + */ + pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz); pgt->pgd = NULL; } +void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits)); + kvm_pgtable_stage2_destroy_pgd(pgt); +} + void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7cc964af8d30..c2bc1eba032c 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -904,6 +904,14 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) return 0; } +static void kvm_stage2_destroy(struct kvm_pgtable *pgt) +{ + unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); + + KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits)); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); +} + /** * kvm_init_stage2_mmu - Initialise a S2 MMU structure * @kvm: The pointer to the KVM structure @@ -980,7 +988,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return 0; out_destroy_pgtable: - KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); + kvm_stage2_destroy(pgt); out_free_pgtable: kfree(pgt); return err; @@ -1081,7 +1089,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) write_unlock(&kvm->mmu_lock); if (pgt) { - KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); + kvm_stage2_destroy(pgt); kfree(pgt); } } diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 24f0f8a8c943..d7a0f69a9982 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -344,9 +344,16 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e return 0; } -void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt, + u64 addr, u64 size) { - __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL)); + __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size); +} + +void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt) +{ + /* Expected to be called after all pKVM mappings have been released. */ + WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root)); } int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, From 4ddfab5436b6918ce4e28f610e670e040a304152 Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Thu, 13 Nov 2025 05:24:52 +0000 Subject: [PATCH 7/7] KVM: arm64: Reschedule as needed when destroying the stage-2 page-tables When a large VM, specifically one that holds a significant number of PTEs, gets abruptly destroyed, the following warning is seen during the page-table walk: sched: CPU 0 need_resched set for > 100018840 ns (100 ticks) without schedule CPU: 0 UID: 0 PID: 9617 Comm: kvm_page_table_ Tainted: G O 6.16.0-smp-DEV #3 NONE Tainted: [O]=OOT_MODULE Call trace: show_stack+0x20/0x38 (C) dump_stack_lvl+0x3c/0xb8 dump_stack+0x18/0x30 resched_latency_warn+0x7c/0x88 sched_tick+0x1c4/0x268 update_process_times+0xa8/0xd8 tick_nohz_handler+0xc8/0x168 __hrtimer_run_queues+0x11c/0x338 hrtimer_interrupt+0x104/0x308 arch_timer_handler_phys+0x40/0x58 handle_percpu_devid_irq+0x8c/0x1b0 generic_handle_domain_irq+0x48/0x78 gic_handle_irq+0x1b8/0x408 call_on_irq_stack+0x24/0x30 do_interrupt_handler+0x54/0x78 el1_interrupt+0x44/0x88 el1h_64_irq_handler+0x18/0x28 el1h_64_irq+0x84/0x88 stage2_free_walker+0x30/0xa0 (P) __kvm_pgtable_walk+0x11c/0x258 __kvm_pgtable_walk+0x180/0x258 __kvm_pgtable_walk+0x180/0x258 __kvm_pgtable_walk+0x180/0x258 kvm_pgtable_walk+0xc4/0x140 kvm_pgtable_stage2_destroy+0x5c/0xf0 kvm_free_stage2_pgd+0x6c/0xe8 kvm_uninit_stage2_mmu+0x24/0x48 kvm_arch_flush_shadow_all+0x80/0xa0 kvm_mmu_notifier_release+0x38/0x78 __mmu_notifier_release+0x15c/0x250 exit_mmap+0x68/0x400 __mmput+0x38/0x1c8 mmput+0x30/0x68 exit_mm+0xd4/0x198 do_exit+0x1a4/0xb00 do_group_exit+0x8c/0x120 get_signal+0x6d4/0x778 do_signal+0x90/0x718 do_notify_resume+0x70/0x170 el0_svc+0x74/0xd8 el0t_64_sync_handler+0x60/0xc8 el0t_64_sync+0x1b0/0x1b8 The warning is seen majorly on the host kernels that are configured not to force-preempt, such as CONFIG_PREEMPT_NONE=y. To avoid this, instead of walking the entire page-table in one go, split it into smaller ranges, by checking for cond_resched() between each range. Since the path is executed during VM destruction, after the page-table structure is unlinked from the KVM MMU, relying on cond_resched_rwlock_write() isn't necessary. Signed-off-by: Raghavendra Rao Ananta Link: https://msgid.link/20251113052452.975081-4-rananta@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c2bc1eba032c..f86d17ad50a7 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -904,11 +904,35 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type) return 0; } +/* + * Assume that @pgt is valid and unlinked from the KVM MMU to free the + * page-table without taking the kvm_mmu_lock and without performing any + * TLB invalidations. + * + * Also, the range of addresses can be large enough to cause need_resched + * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke + * cond_resched() periodically to prevent hogging the CPU for a long time + * and schedule something else, if required. + */ +static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr, + phys_addr_t end) +{ + u64 next; + + do { + next = stage2_range_addr_end(addr, end); + KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr, + next - addr); + if (next != end) + cond_resched(); + } while (addr = next, addr != end); +} + static void kvm_stage2_destroy(struct kvm_pgtable *pgt) { unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr); - KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, 0, BIT(ia_bits)); + stage2_destroy_range(pgt, 0, BIT(ia_bits)); KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt); }