From e21560b7d33c4f692cc9cd5b75ff09024f5a69d2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 29 May 2025 09:35:08 +0200 Subject: [PATCH 1/4] arm64: Disable LLD linker ASSERT()s for the time being It turns out [1] that the way LLD handles ASSERT()s in the linker script can result in spurious failures, so disable them for the newly introduced BSS symbol export checks. Since we're not aware of any issues with the existing assertions in vmlinux.lds.S, leave those alone for now so that they can continue to provide useful coverage. A linker fix [2] is due to land in version 21 of LLD. Link: https://lore.kernel.org/r/202505261019.OUlitN6m-lkp@intel.com [1] Link: https://github.com/llvm/llvm-project/commit/5859863bab7f [2] Link: https://github.com/ClangBuiltLinux/linux/issues/2094 Signed-off-by: Ard Biesheuvel Tested-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505261019.OUlitN6m-lkp@intel.com/ Link: https://lore.kernel.org/r/20250529073507.2984959-2-ardb+git@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/image-vars.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index c5266430284b..86f088a16147 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,6 +10,10 @@ #error This file should only be included in vmlinux.lds.S #endif +#if defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 210000 +#define ASSERT(...) +#endif + #define PI_EXPORT_SYM(sym) \ __PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code) #define __PI_EXPORT_SYM(sym, pisym, msg)\ @@ -142,4 +146,6 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); _kernel_codesize = ABSOLUTE(__inittext_end - _text); #endif +#undef ASSERT + #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ From dc0a083948040ff364d065da8bb50c29f77a39ad Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 31 May 2025 14:30:05 +0200 Subject: [PATCH 2/4] arm64: Work around convergence issue with LLD linker LLD will occasionally error out with a '__init_end does not converge' error if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a circular dependency. Counter this by dimensioning the initial IDMAP page tables based on a new boundary marker 'kimage_limit', and define it such that its value should not change as a result of the initdata segment being pushed over a 64k segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its value doesn't change by more than 2M between linker passes. Reported-by: Arnd Bergmann Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250531123005.3866382-2-ardb+git@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 2 +- arch/arm64/kernel/image-vars.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 9e93733523f6..74a4f738c5f5 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -58,7 +58,7 @@ #define INIT_DIR_SIZE (PAGE_SIZE * (EARLY_PAGES(SWAPPER_PGTABLE_LEVELS, KIMAGE_VADDR, _end, EXTRA_PAGE) \ + EARLY_SEGMENT_EXTRA_PAGES)) -#define INIT_IDMAP_DIR_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, _end, 1)) +#define INIT_IDMAP_DIR_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, kimage_limit, 1)) #define INIT_IDMAP_DIR_SIZE ((INIT_IDMAP_DIR_PAGES + EARLY_IDMAP_EXTRA_PAGES) * PAGE_SIZE) #define INIT_IDMAP_FDT_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, 0UL, UL(MAX_FDT_SIZE), 1) - 1) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 86f088a16147..1cee48feb309 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -146,6 +146,17 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); _kernel_codesize = ABSOLUTE(__inittext_end - _text); #endif +/* + * LLD will occasionally error out with a '__init_end does not converge' error + * if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a + * circular dependency. Counter this by dimensioning the initial IDMAP page + * tables based on kimage_limit, which is defined such that its value should + * not change as a result of the initdata segment being pushed over a 64k + * segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its + * value doesn't change by more than 2M between linker passes. + */ +kimage_limit = ALIGN(ABSOLUTE(_end + SZ_64K), SZ_2M); + #undef ASSERT #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ From 4b634918384c0f84c33aeb4dd9fd4c38e7be5ccb Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 30 May 2025 16:23:47 +0100 Subject: [PATCH 3/4] arm64/mm: Close theoretical race where stale TLB entry remains valid Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries") describes a race that, prior to the commit, could occur between reclaim and operations such as mprotect() when using reclaim's tlbbatch mechanism. See that commit for details but the summary is: """ Nadav Amit identified a theoritical race between page reclaim and mprotect due to TLB flushes being batched outside of the PTL being held. He described the race as follows: CPU0 CPU1 ---- ---- user accesses memory using RW PTE [PTE now cached in TLB] try_to_unmap_one() ==> ptep_get_and_clear() ==> set_tlb_ubc_flush_pending() mprotect(addr, PROT_READ) ==> change_pte_range() ==> [ PTE non-present - no flush ] user writes using cached RW PTE ... try_to_unmap_flush() """ The solution was to insert flush_tlb_batched_pending() in mprotect() and friends to explcitly drain any pending reclaim TLB flushes. In the modern version of this solution, arch_flush_tlb_batched_pending() is called to do that synchronisation. arm64's tlbbatch implementation simply issues TLBIs at queue-time (arch_tlbbatch_add_pending()), eliding the trailing dsb(ish). The trailing dsb(ish) is finally issued in arch_tlbbatch_flush() at the end of the batch to wait for all the issued TLBIs to complete. Now, the Arm ARM states: """ The completion of the TLB maintenance instruction is guaranteed only by the execution of a DSB by the observer that performed the TLB maintenance instruction. The execution of a DSB by a different observer does not have this effect, even if the DSB is known to be executed after the TLB maintenance instruction is observed by that different observer. """ arch_tlbbatch_add_pending() and arch_tlbbatch_flush() conform to this requirement because they are called from the same task (either kswapd or caller of madvise(MADV_PAGEOUT)), so either they are on the same CPU or if the task was migrated, __switch_to() contains an extra dsb(ish). HOWEVER, arm64's arch_flush_tlb_batched_pending() is also implemented as a dsb(ish). But this may be running on a CPU remote from the one that issued the outstanding TLBIs. So there is no architectural gurantee of synchonization. Therefore we are still vulnerable to the theoretical race described in Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries"). Fix this by flushing the entire mm in arch_flush_tlb_batched_pending(). This aligns with what the other arches that implement the tlbbatch feature do. Cc: Fixes: 43b3dfdd0455 ("arm64: support batched/deferred tlb shootdown during page reclamation/migration") Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20250530152445.2430295-1-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlbflush.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index eba1a98657f1..aa9efee17277 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -323,13 +323,14 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) } /* - * If mprotect/munmap/etc occurs during TLB batched flushing, we need to - * synchronise all the TLBI issued with a DSB to avoid the race mentioned in - * flush_tlb_batched_pending(). + * If mprotect/munmap/etc occurs during TLB batched flushing, we need to ensure + * all the previously issued TLBIs targeting mm have completed. But since we + * can be executing on a remote CPU, a DSB cannot guarantee this like it can + * for arch_tlbbatch_flush(). Our only option is to flush the entire mm. */ static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) { - dsb(ish); + flush_tlb_mm(mm); } /* From 10f885d63a0efd50b0d22bf27eb3cf727838e99e Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Mon, 2 Jun 2025 12:33:21 +0800 Subject: [PATCH 4/4] arm64: Add override for MPAM As the message of the commit 09e6b306f3ba ("arm64: cpufeature: discover CPU support for MPAM") already states, if a buggy firmware fails to either enable MPAM or emulate the trap as if it were disabled, the kernel will just fail to boot. While upgrading the firmware should be the best solution, we have some hardware of which the vendor have made no response 2 months after we requested a firmware update. Allow overriding it so our devices don't become some e-waste. Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Cc: Shameer Kolothum Cc: Mingcong Bai Cc: Shaopeng Tan Cc: Ben Horgan Signed-off-by: Xi Ruoyao Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250602043723.216338-1-xry111@xry111.site Signed-off-by: Will Deacon --- .../admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/include/asm/el2_setup.h | 24 ++++++++----------- arch/arm64/kernel/cpufeature.c | 7 ++++-- arch/arm64/kernel/cpuinfo.c | 7 ++++-- arch/arm64/kernel/pi/idreg-override.c | 3 +++ 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 76e538c77e31..c16717dc5e57 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -458,6 +458,9 @@ arm64.nomops [ARM64] Unconditionally disable Memory Copy and Memory Set instructions support + arm64.nompam [ARM64] Unconditionally disable Memory Partitioning And + Monitoring support + arm64.nomte [ARM64] Unconditionally disable Memory Tagging Extension support diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 30f57b0334a3..af7807f11df4 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -298,19 +298,6 @@ .Lskip_gcs_\@: .endm -.macro __init_el2_mpam - /* Memory Partitioning And Monitoring: disable EL2 traps */ - mrs x1, id_aa64pfr0_el1 - ubfx x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, #4 - cbz x0, .Lskip_mpam_\@ // skip if no MPAM - msr_s SYS_MPAM2_EL2, xzr // use the default partition - // and disable lower traps - mrs_s x0, SYS_MPAMIDR_EL1 - tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg - msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 -.Lskip_mpam_\@: -.endm - /** * Initialize EL2 registers to sane values. This should be called early on all * cores that were booted in EL2. Note that everything gets initialised as @@ -328,7 +315,6 @@ __init_el2_stage2 __init_el2_gicv3 __init_el2_hstr - __init_el2_mpam __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt @@ -375,6 +361,16 @@ #endif .macro finalise_el2_state + check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2 + +.Linit_mpam_\@: + msr_s SYS_MPAM2_EL2, xzr // use the default partition + // and disable lower traps + mrs_s x0, SYS_MPAMIDR_EL1 + tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg + msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 + +.Lskip_mpam_\@: check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2 .Linit_sve_\@: /* SVE register access */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 379c82d22c75..fcc20d13e938 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1198,8 +1198,10 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) cpacr_restore(cpacr); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + } if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); @@ -1452,7 +1454,8 @@ void update_cpu_features(int cpu, cpacr_restore(cpacr); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) { + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, info->reg_mpamidr, boot->reg_mpamidr); } diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 94525abd1c22..c1f2b6b04b41 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -496,8 +496,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) - info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + /* + * info->reg_mpamidr deferred to {init,update}_cpu_features because we + * don't want to read it (and trigger a trap on buggy firmware) if + * using an aa64pfr0_el1 override to unconditionally disable MPAM. + */ if (IS_ENABLED(CONFIG_ARM64_SME) && id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index c6b185b885f7..bc57b290e5e7 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -127,6 +127,7 @@ static const struct ftr_set_desc pfr0 __prel64_initconst = { .fields = { FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), + FIELD("mpam", ID_AA64PFR0_EL1_MPAM_SHIFT, NULL), {} }, }; @@ -154,6 +155,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + FIELD("mpam_frac", ID_AA64PFR1_EL1_MPAM_frac_SHIFT, NULL), {} }, }; @@ -246,6 +248,7 @@ static const struct { { "rodata=off", "arm64_sw.rodataoff=1" }, { "arm64.nolva", "id_aa64mmfr2.varange=0" }, { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, + { "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" }, }; static int __init parse_hexdigit(const char *p, u64 *v)