From cae444e0e2f1a843a34299cbb70875b05b9b6730 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 19 Dec 2025 21:59:00 +0000 Subject: [PATCH 01/15] ACPI: APEI: EINJ: make read-only array non_mmio_desc static const Don't populate the read-only array non_mmio_desc on the stack at run time, instead make it static const. Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20251219215900.494211-1-colin.i.king@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/einj-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c index 305c240a303f..f5bfdffe1e43 100644 --- a/drivers/acpi/apei/einj-core.c +++ b/drivers/acpi/apei/einj-core.c @@ -679,7 +679,7 @@ static bool is_allowed_range(u64 base_addr, u64 size) * region intersects with known resource. So do an allow list check for * IORES_DESCs that definitely or most likely not MMIO. */ - int non_mmio_desc[] = { + static const int non_mmio_desc[] = { IORES_DESC_CRASH_KERNEL, IORES_DESC_ACPI_TABLES, IORES_DESC_ACPI_NV_STORAGE, From 87880af2d24e62a84ed19943dbdd524f097172f2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 8 Jan 2026 12:35:03 +0100 Subject: [PATCH 02/15] APEI/GHES: ARM processor Error: don't go past allocated memory If the BIOS generates a very small ARM Processor Error, or an incomplete one, the current logic will fail to dereference err->section_length and ctx_info->size. Add checks to avoid that.
With such changes, such GHESv2 records won't cause OOPSes like this: [ 1.492129] Internal error: Oops: 0000000096000005 [#1] SMP [ 1.495449] Modules linked in: [ 1.495820] CPU: 0 UID: 0 PID: 9 Comm: kworker/0:0 Not tainted 6.18.0-rc1-00017-gabadcc3553dd-dirty #18 PREEMPT [ 1.496125] Hardware name: QEMU QEMU Virtual Machine, BIOS unknown 02/02/2022 [ 1.496433] Workqueue: kacpi_notify acpi_os_execute_deferred [ 1.496967] pstate: 814000c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 1.497199] pc : log_arm_hw_error+0x5c/0x200 [ 1.497380] lr : ghes_handle_arm_hw_error+0x94/0x220 0xffff8000811c5324 is in log_arm_hw_error (../drivers/ras/ras.c:75). 70 err_info = (struct cper_arm_err_info *)(err + 1); 71 ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num); 72 ctx_err = (u8 *)ctx_info; 73 74 for (n = 0; n < err->context_info_num; n++) { 75 sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size; 76 ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz); 77 ctx_len += sz; 78 } 79 and similar ones while trying to access section_length on an error dump with too small size. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Reviewed-by: Hanjun Guo [ rjw: Subject tweaks ] Link: https://patch.msgid.link/7fd9f38413be05ee2d7cfdb0dc31ea2274cf1a54.1767871950.git.mchehab+huawei@kernel.org Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 32 ++++++++++++++++++++++++++++---- drivers/ras/ras.c | 6 +++++- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 0dc767392a6c..fc3f8aed99d5 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -552,21 +552,45 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, { struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); int flags = sync ? 
MF_ACTION_REQUIRED : 0; + int length = gdata->error_data_length; char error_type[120]; bool queued = false; int sec_sev, i; char *p; sec_sev = ghes_severity(gdata->error_severity); - log_arm_hw_error(err, sec_sev); + if (length >= sizeof(*err)) { + log_arm_hw_error(err, sec_sev); + } else { + pr_warn(FW_BUG "arm error length: %d\n", length); + pr_warn(FW_BUG "length is too small\n"); + pr_warn(FW_BUG "firmware-generated error record is incorrect\n"); + return false; + } + if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) return false; p = (char *)(err + 1); + length -= sizeof(err); + for (i = 0; i < err->err_info_num; i++) { - struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p; - bool is_cache = err_info->type & CPER_ARM_CACHE_ERROR; - bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR); + struct cper_arm_err_info *err_info; + bool is_cache, has_pa; + + /* Ensure we have enough data for the error info header */ + if (length < sizeof(*err_info)) + break; + + err_info = (struct cper_arm_err_info *)p; + + /* Validate the claimed length before using it */ + length -= err_info->length; + if (length < 0) + break; + + is_cache = err_info->type & CPER_ARM_CACHE_ERROR; + has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR); /* * The field (err_info->error_info & BIT(26)) is fixed to set to diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 2a5b5a9fdcb3..03df3db62334 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -72,7 +72,11 @@ void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) ctx_err = (u8 *)ctx_info; for (n = 0; n < err->context_info_num; n++) { - sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size; + sz = sizeof(struct cper_arm_ctx_info); + + if (sz + (long)ctx_info - (long)err >= err->section_length) + sz += ctx_info->size; + ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz); ctx_len += sz; } From eae21beecb95a3b69ee5c38a659f774e171d730e Mon 
Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 8 Jan 2026 12:35:04 +0100 Subject: [PATCH 03/15] EFI/CPER: don't go past the ARM processor CPER record buffer There's logic inside GHES/CPER to detect if the section_length is too small, but it doesn't detect if it is too big. Currently, if the firmware receives an ARM processor CPER record stating that a section length is big, kernel will blindly trust section_length, producing a very long dump. For instance, a 67-byte record with ERR_INFO_NUM set to 46198 and section length set to 854918320 would dump a lot of data going way past the firmware memory-mapped area. Fix it by adding logic to prevent it from going past the buffer if ERR_INFO_NUM is too big, making it report instead: [Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 1 [Hardware Error]: event severity: recoverable [Hardware Error]: Error 0, type: recoverable [Hardware Error]: section_type: ARM processor error [Hardware Error]: MIDR: 0xff304b2f8476870a [Hardware Error]: section length: 854918320, CPER size: 67 [Hardware Error]: section length is too big [Hardware Error]: firmware-generated error record is incorrect [Hardware Error]: ERR_INFO_NUM is 46198 Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Reviewed-by: Hanjun Guo [ rjw: Subject and changelog tweaks ] Link: https://patch.msgid.link/41cd9f6b3ace3cdff7a5e864890849e4b1c58b63.1767871950.git.mchehab+huawei@kernel.org Signed-off-by: Rafael J.
Wysocki --- drivers/firmware/efi/cper-arm.c | 12 ++++++++---- drivers/firmware/efi/cper.c | 3 ++- include/linux/cper.h | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/firmware/efi/cper-arm.c b/drivers/firmware/efi/cper-arm.c index 76542a53e202..b21cb1232d82 100644 --- a/drivers/firmware/efi/cper-arm.c +++ b/drivers/firmware/efi/cper-arm.c @@ -226,7 +226,8 @@ static void cper_print_arm_err_info(const char *pfx, u32 type, } void cper_print_proc_arm(const char *pfx, - const struct cper_sec_proc_arm *proc) + const struct cper_sec_proc_arm *proc, + u32 length) { int i, len, max_ctx_type; struct cper_arm_err_info *err_info; @@ -238,9 +239,12 @@ void cper_print_proc_arm(const char *pfx, len = proc->section_length - (sizeof(*proc) + proc->err_info_num * (sizeof(*err_info))); - if (len < 0) { - printk("%ssection length: %d\n", pfx, proc->section_length); - printk("%ssection length is too small\n", pfx); + + if (len < 0 || proc->section_length > length) { + printk("%ssection length: %d, CPER size: %d\n", + pfx, proc->section_length, length); + printk("%ssection length is too %s\n", pfx, + (len < 0) ? 
"small" : "big"); printk("%sfirmware-generated error record is incorrect\n", pfx); printk("%sERR_INFO_NUM is %d\n", pfx, proc->err_info_num); return; diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 0232bd040f61..88fc0293f876 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -659,7 +659,8 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata printk("%ssection_type: ARM processor error\n", newpfx); if (gdata->error_data_length >= sizeof(*arm_err)) - cper_print_proc_arm(newpfx, arm_err); + cper_print_proc_arm(newpfx, arm_err, + gdata->error_data_length); else goto err_section_too_small; #endif diff --git a/include/linux/cper.h b/include/linux/cper.h index 5b1236d8c65b..440b35e459e5 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -595,7 +595,8 @@ void cper_mem_err_pack(const struct cper_sec_mem_err *, const char *cper_mem_err_unpack(struct trace_seq *, struct cper_mem_err_compact *); void cper_print_proc_arm(const char *pfx, - const struct cper_sec_proc_arm *proc); + const struct cper_sec_proc_arm *proc, + u32 length); void cper_print_proc_ia(const char *pfx, const struct cper_sec_proc_ia *proc); int cper_mem_err_location(struct cper_mem_err_compact *mem, char *msg); From fa2408a24f8f0db14d9cfc613ef162dc267d7ad4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 8 Jan 2026 12:35:05 +0100 Subject: [PATCH 04/15] APEI/GHES: ensure that won't go past CPER allocated record The logic at ghes_new() prevents allocating too large records, by checking if they're bigger than GHES_ESTATUS_MAX_SIZE (currently, 64KB). Yet, the allocation is done with the actual number of pages from the CPER bios table location, which can be smaller. 
Yet, a bad firmware could send data with a different size, which might be bigger than the allocated memory, causing an OOPS: Unable to handle kernel paging request at virtual address fff00000f9b40000 Mem abort info: ESR = 0x0000000096000007 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x07: level 3 translation fault Data abort info: ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 52-bit VAs, pgdp=000000008ba16000 [fff00000f9b40000] pgd=180000013ffff403, p4d=180000013fffe403, pud=180000013f85b403, pmd=180000013f68d403, pte=0000000000000000 Internal error: Oops: 0000000096000007 [#1] SMP Modules linked in: CPU: 0 UID: 0 PID: 303 Comm: kworker/0:1 Not tainted 6.19.0-rc1-00002-gda407d200220 #34 PREEMPT Hardware name: QEMU QEMU Virtual Machine, BIOS unknown 02/02/2022 Workqueue: kacpi_notify acpi_os_execute_deferred pstate: 214020c5 (nzCv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : hex_dump_to_buffer+0x30c/0x4a0 lr : hex_dump_to_buffer+0x328/0x4a0 sp : ffff800080e13880 x29: ffff800080e13880 x28: ffffac9aba86f6a8 x27: 0000000000000083 x26: fff00000f9b3fffc x25: 0000000000000004 x24: 0000000000000004 x23: ffff800080e13905 x22: 0000000000000010 x21: 0000000000000083 x20: 0000000000000001 x19: 0000000000000008 x18: 0000000000000010 x17: 0000000000000001 x16: 00000007c7f20fec x15: 0000000000000020 x14: 0000000000000008 x13: 0000000000081020 x12: 0000000000000008 x11: ffff800080e13905 x10: ffff800080e13988 x9 : 0000000000000000 x8 : 0000000000000000 x7 : 0000000000000001 x6 : 0000000000000020 x5 : 0000000000000030 x4 : 00000000fffffffe x3 : 0000000000000000 x2 : ffffac9aba78c1c8 x1 : ffffac9aba76d0a8 x0 : 0000000000000008 Call trace: hex_dump_to_buffer+0x30c/0x4a0 (P) print_hex_dump+0xac/0x170 cper_estatus_print_section+0x90c/0x968 cper_estatus_print+0xf0/0x158 __ghes_print_estatus+0xa0/0x148 ghes_proc+0x1bc/0x220 
ghes_notify_hed+0x5c/0xb8 notifier_call_chain+0x78/0x148 blocking_notifier_call_chain+0x4c/0x80 acpi_hed_notify+0x28/0x40 acpi_ev_notify_dispatch+0x50/0x80 acpi_os_execute_deferred+0x24/0x48 process_one_work+0x15c/0x3b0 worker_thread+0x2d0/0x400 kthread+0x148/0x228 ret_from_fork+0x10/0x20 Code: 6b14033f 540001ad a94707e2 f100029f (b8747b44) ---[ end trace 0000000000000000 ]--- Prevent that by taking the actual allocated area into account when checking for CPER length. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Reviewed-by: Hanjun Guo [ rjw: Subject tweaks ] Link: https://patch.msgid.link/4e70310a816577fabf37d94ed36cde4ad62b1e0a.1767871950.git.mchehab+huawei@kernel.org Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 6 +++++- include/acpi/ghes.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index fc3f8aed99d5..77ea7a5b761f 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -294,6 +295,7 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) error_block_length = GHES_ESTATUS_MAX_SIZE; } ghes->estatus = kmalloc(error_block_length, GFP_KERNEL); + ghes->estatus_length = error_block_length; if (!ghes->estatus) { rc = -ENOMEM; goto err_unmap_status_addr; @@ -365,13 +367,15 @@ static int __ghes_check_estatus(struct ghes *ghes, struct acpi_hest_generic_status *estatus) { u32 len = cper_estatus_len(estatus); + u32 max_len = min(ghes->generic->error_block_length, + ghes->estatus_length); if (len < sizeof(*estatus)) { pr_warn_ratelimited(FW_WARN GHES_PFX "Truncated error status block!\n"); return -EIO; } - if (len > ghes->generic->error_block_length) { + if (!len || len > max_len) { pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid error status block length!\n"); return -EIO; } diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index ebd21b05fe6e..93db60da5934 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -21,6 +21,7 @@ struct ghes { struct acpi_hest_generic_v2 *generic_v2; }; struct acpi_hest_generic_status *estatus; + unsigned int estatus_length; unsigned long flags; union { struct list_head list; From 55cc6fe5716f678f06bcb95140882dfa684464ec Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 8 Jan 2026 12:35:06 +0100 Subject: [PATCH 05/15] EFI/CPER: don't dump the entire memory region The current logic at cper_print_fw_err() doesn't check if the error record length is big enough to handle offset. On a bad firmware, if the offset is above the actual record, length -= offset will underflow, making it dump the entire memory. The end result can be: - the logic taking a lot of time dumping large regions of memory; - data disclosure due to the memory dumps; - an OOPS, if it tries to dump an unmapped memory region. Fix it by checking if the section length is too small before doing a hex dump. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Reviewed-by: Hanjun Guo [ rjw: Subject tweaks ] Link: https://patch.msgid.link/1752b5ba63a3e2f148ddee813b36c996cc617e86.1767871950.git.mchehab+huawei@kernel.org Signed-off-by: Rafael J.
Wysocki --- drivers/firmware/efi/cper.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 88fc0293f876..0e938fc5ccb1 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -560,6 +560,11 @@ static void cper_print_fw_err(const char *pfx, } else { offset = sizeof(*fw_err); } + if (offset > length) { + printk("%s""error section length is too small: offset=%d, length=%d\n", + pfx, offset, length); + return; + } buf += offset; length -= offset; From f2edc1fb9c81b7b57a092204455e4d159a10873e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 12 Jan 2026 11:22:37 +0800 Subject: [PATCH 06/15] ACPI: APEI: GHES: Improve ghes_notify_nmi() status check ghes_notify_nmi() is called for every NMI and must check whether the NMI was generated because an error was signalled by platform firmware. This check is very expensive as for each registered GHES NMI source it reads from the acpi generic address attached to this error source to get the physical address of the acpi_hest_generic_status block. It then checks the "block_status" to see if an error was logged. The ACPI/APEI code must create virtual mappings for each of those physical addresses, and tear them down afterwards. On an Icelake system this takes around 15,000 TSC cycles. Enough to disturb efforts to profile system performance. If that were not bad enough, there are some atomic accesses in the code path that will cause cache line bounces between CPUs. A problem that gets worse as the core count increases. But BIOS changes neither the acpi generic address nor the physical address of the acpi_hest_generic_status block. So this walk can be done once when the NMI is registered to save the virtual address (unmapping if the NMI is ever unregistered). The "block_status" can be checked directly in the NMI handler. This can be done without any atomic accesses. Resulting time to check that there is not an error record is around 900 cycles. 
Reported-by: Andi Kleen Signed-off-by: Tony Luck Tested-by: Tony Luck Signed-off-by: Shuai Xue Reviewed-by: Hanjun Guo Link: https://patch.msgid.link/20260112032239.30023-2-xueshuai@linux.alibaba.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 40 +++++++++++++++++++++++++++++++++++++--- include/acpi/ghes.h | 1 + 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 77ea7a5b761f..8796013b5166 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1484,7 +1484,21 @@ static LIST_HEAD(ghes_nmi); static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs) { static DEFINE_RAW_SPINLOCK(ghes_notify_lock_nmi); + bool active_error = false; int ret = NMI_DONE; + struct ghes *ghes; + + rcu_read_lock(); + list_for_each_entry_rcu(ghes, &ghes_nmi, list) { + if (ghes->error_status_vaddr && readl(ghes->error_status_vaddr)) { + active_error = true; + break; + } + } + rcu_read_unlock(); + + if (!active_error) + return ret; if (!atomic_add_unless(&ghes_in_nmi, 1, 1)) return ret; @@ -1498,13 +1512,27 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs) return ret; } -static void ghes_nmi_add(struct ghes *ghes) +static int ghes_nmi_add(struct ghes *ghes) { + struct acpi_hest_generic *g = ghes->generic; + u64 paddr; + int rc; + + rc = apei_read(&paddr, &g->error_status_address); + if (rc) + return rc; + + ghes->error_status_vaddr = acpi_os_ioremap(paddr, sizeof(ghes->estatus->block_status)); + if (!ghes->error_status_vaddr) + return -EINVAL; + mutex_lock(&ghes_list_mutex); if (list_empty(&ghes_nmi)) register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes"); list_add_rcu(&ghes->list, &ghes_nmi); mutex_unlock(&ghes_list_mutex); + + return 0; } static void ghes_nmi_remove(struct ghes *ghes) @@ -1514,6 +1542,10 @@ static void ghes_nmi_remove(struct ghes *ghes) if (list_empty(&ghes_nmi)) unregister_nmi_handler(NMI_LOCAL, "ghes"); mutex_unlock(&ghes_list_mutex); + 
+ if (ghes->error_status_vaddr) + iounmap(ghes->error_status_vaddr); + /* * To synchronize with NMI handler, ghes can only be * freed after NMI handler finishes. @@ -1521,7 +1553,7 @@ static void ghes_nmi_remove(struct ghes *ghes) synchronize_rcu(); } #else /* CONFIG_HAVE_ACPI_APEI_NMI */ -static inline void ghes_nmi_add(struct ghes *ghes) { } +static inline int ghes_nmi_add(struct ghes *ghes) { return -EINVAL; } static inline void ghes_nmi_remove(struct ghes *ghes) { } #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ @@ -1689,7 +1721,9 @@ static int ghes_probe(struct platform_device *ghes_dev) ghes_sea_add(ghes); break; case ACPI_HEST_NOTIFY_NMI: - ghes_nmi_add(ghes); + rc = ghes_nmi_add(ghes); + if (rc) + goto err; break; case ACPI_HEST_NOTIFY_SOFTWARE_DELEGATED: rc = apei_sdei_register_ghes(ghes); diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h index 93db60da5934..7bea522c0657 100644 --- a/include/acpi/ghes.h +++ b/include/acpi/ghes.h @@ -30,6 +30,7 @@ struct ghes { }; struct device *dev; struct list_head elist; + void __iomem *error_status_vaddr; }; struct ghes_estatus_node { From feb2d38013ddfc8ea4b53134d194582dc1e4de2c Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Mon, 12 Jan 2026 11:22:38 +0800 Subject: [PATCH 07/15] ACPI: APEI: GHES: Extract helper functions for error status handling Refactors the GHES driver by extracting common functionality into reusable helper functions: 1. ghes_has_active_errors() - Checks if any error sources in a given list have active errors 2. ghes_map_error_status() - Maps error status address to virtual address 3. ghes_unmap_error_status() - Unmaps error status virtual address 4. Use `guard(rcu)()` instead of explicit `rcu_read_lock()`/`rcu_read_unlock()`. These helpers eliminate code duplication in the NMI path and prepare for similar usage in the SEA path in a subsequent patch. No functional change intended. 
Tested-by: Tony Luck Reviewed-by: Tony Luck Signed-off-by: Shuai Xue Reviewed-by: Breno Leitao Reviewed-by: Hanjun Guo Link: https://patch.msgid.link/20260112032239.30023-3-xueshuai@linux.alibaba.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 89 ++++++++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 21 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 8796013b5166..067a58dbcc01 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1434,6 +1434,71 @@ static int ghes_in_nmi_spool_from_list(struct list_head *rcu_list, return ret; } +/** + * ghes_has_active_errors - Check if there are active errors in error sources + * @ghes_list: List of GHES entries to check for active errors + * + * This function iterates through all GHES entries in the given list and + * checks if any of them has active error status by reading the error + * status register. + * + * Return: true if at least one source has active error, false otherwise. + */ +static bool __maybe_unused ghes_has_active_errors(struct list_head *ghes_list) +{ + struct ghes *ghes; + + guard(rcu)(); + list_for_each_entry_rcu(ghes, ghes_list, list) { + if (ghes->error_status_vaddr && + readl(ghes->error_status_vaddr)) + return true; + } + + return false; +} + +/** + * ghes_map_error_status - Map error status address to virtual address + * @ghes: pointer to GHES structure + * + * Reads the error status address from ACPI HEST table and maps it to a virtual + * address that can be accessed by the kernel. + * + * Return: 0 on success, error code on failure. 
+ */ +static int __maybe_unused ghes_map_error_status(struct ghes *ghes) +{ + struct acpi_hest_generic *g = ghes->generic; + u64 paddr; + int rc; + + rc = apei_read(&paddr, &g->error_status_address); + if (rc) + return rc; + + ghes->error_status_vaddr = + acpi_os_ioremap(paddr, sizeof(ghes->estatus->block_status)); + if (!ghes->error_status_vaddr) + return -EINVAL; + + return 0; +} + +/** + * ghes_unmap_error_status - Unmap error status virtual address + * @ghes: pointer to GHES structure + * + * Unmaps the error status address if it was previously mapped. + */ +static void __maybe_unused ghes_unmap_error_status(struct ghes *ghes) +{ + if (ghes->error_status_vaddr) { + iounmap(ghes->error_status_vaddr); + ghes->error_status_vaddr = NULL; + } +} + #ifdef CONFIG_ACPI_APEI_SEA static LIST_HEAD(ghes_sea); @@ -1484,20 +1549,9 @@ static LIST_HEAD(ghes_nmi); static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs) { static DEFINE_RAW_SPINLOCK(ghes_notify_lock_nmi); - bool active_error = false; int ret = NMI_DONE; - struct ghes *ghes; - rcu_read_lock(); - list_for_each_entry_rcu(ghes, &ghes_nmi, list) { - if (ghes->error_status_vaddr && readl(ghes->error_status_vaddr)) { - active_error = true; - break; - } - } - rcu_read_unlock(); - - if (!active_error) + if (!ghes_has_active_errors(&ghes_nmi)) return ret; if (!atomic_add_unless(&ghes_in_nmi, 1, 1)) @@ -1514,18 +1568,12 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs) static int ghes_nmi_add(struct ghes *ghes) { - struct acpi_hest_generic *g = ghes->generic; - u64 paddr; int rc; - rc = apei_read(&paddr, &g->error_status_address); + rc = ghes_map_error_status(ghes); if (rc) return rc; - ghes->error_status_vaddr = acpi_os_ioremap(paddr, sizeof(ghes->estatus->block_status)); - if (!ghes->error_status_vaddr) - return -EINVAL; - mutex_lock(&ghes_list_mutex); if (list_empty(&ghes_nmi)) register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes"); @@ -1543,8 +1591,7 @@ static void 
ghes_nmi_remove(struct ghes *ghes) unregister_nmi_handler(NMI_LOCAL, "ghes"); mutex_unlock(&ghes_list_mutex); - if (ghes->error_status_vaddr) - iounmap(ghes->error_status_vaddr); + ghes_unmap_error_status(ghes); /* * To synchronize with NMI handler, ghes can only be From b73cf7eaa6ee77f030667531245e1635c1b6fc9a Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Mon, 12 Jan 2026 11:22:39 +0800 Subject: [PATCH 08/15] ACPI: APEI: GHES: Improve ghes_notify_sea() status check Performance testing on ARMv8 systems shows significant overhead in error status handling in SEA error handling. - ghes_peek_estatus(): 8,138.3 ns (21,160 cycles). - ghes_clear_estatus(): 2,038.3 ns (5,300 cycles). Apply the same optimization used in ghes_notify_nmi() to ghes_notify_sea() by checking for active errors before processing. Tested-by: Tony Luck Reviewed-by: Tony Luck Signed-off-by: Shuai Xue Reviewed-by: Hanjun Guo Link: https://patch.msgid.link/20260112032239.30023-4-xueshuai@linux.alibaba.com Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/apei/ghes.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 067a58dbcc01..a2ac282c99cb 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1511,6 +1511,9 @@ int ghes_notify_sea(void) static DEFINE_RAW_SPINLOCK(ghes_notify_lock_sea); int rv; + if (!ghes_has_active_errors(&ghes_sea)) + return -ENOENT; + raw_spin_lock(&ghes_notify_lock_sea); rv = ghes_in_nmi_spool_from_list(&ghes_sea, FIX_APEI_GHES_SEA); raw_spin_unlock(&ghes_notify_lock_sea); @@ -1518,11 +1521,19 @@ int ghes_notify_sea(void) return rv; } -static void ghes_sea_add(struct ghes *ghes) +static int ghes_sea_add(struct ghes *ghes) { + int rc; + + rc = ghes_map_error_status(ghes); + if (rc) + return rc; + mutex_lock(&ghes_list_mutex); list_add_rcu(&ghes->list, &ghes_sea); mutex_unlock(&ghes_list_mutex); + + return 0; } static void ghes_sea_remove(struct ghes *ghes) @@ -1530,10 +1541,11 @@ static void ghes_sea_remove(struct ghes *ghes) mutex_lock(&ghes_list_mutex); list_del_rcu(&ghes->list); mutex_unlock(&ghes_list_mutex); + ghes_unmap_error_status(ghes); synchronize_rcu(); } #else /* CONFIG_ACPI_APEI_SEA */ -static inline void ghes_sea_add(struct ghes *ghes) { } +static inline int ghes_sea_add(struct ghes *ghes) { return -EINVAL; } static inline void ghes_sea_remove(struct ghes *ghes) { } #endif /* CONFIG_ACPI_APEI_SEA */ @@ -1765,7 +1777,9 @@ static int ghes_probe(struct platform_device *ghes_dev) break; case ACPI_HEST_NOTIFY_SEA: - ghes_sea_add(ghes); + rc = ghes_sea_add(ghes); + if (rc) + goto err; break; case ACPI_HEST_NOTIFY_NMI: rc = ghes_nmi_add(ghes); From a2995f7dab51bc0cfabd750f9848a5ee7612099d Mon Sep 17 00:00:00 2001 From: "Fabio M. 
De Francesco" Date: Wed, 14 Jan 2026 11:14:21 +0100 Subject: [PATCH 09/15] ACPI: extlog: Trace CPER Non-standard Section Body ghes_do_proc() has a catch-all for unknown or unhandled CPER formats (UEFI v2.11 Appendix N 2.3), extlog_print() does not. This gap was noticed by a RAS test that injected CXL protocol errors which were notified to extlog_print() via the IOMCA (I/O Machine Check Architecture) mechanism. Bring parity to the extlog_print() path by including a similar log_non_standard_event(). Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Qiuxu Zhuo Reviewed-by: Shuai Xue Signed-off-by: Fabio M. De Francesco Link: https://patch.msgid.link/20260114101543.85926-2-fabio.m.de.francesco@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_extlog.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index f6b9562779de..47d11cb5c912 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -183,6 +183,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, if (gdata->error_data_length >= sizeof(*mem)) trace_extlog_mem_event(mem, err_seq, fru_id, fru_text, (u8)gdata->error_severity); + } else { + void *err = acpi_hest_get_payload(gdata); + + log_non_standard_event(sec_type, fru_id, fru_text, + gdata->error_severity, err, + gdata->error_data_length); } } From e778ffefa34ddcdc32a260452627e390941812eb Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 14 Jan 2026 11:14:22 +0100 Subject: [PATCH 10/15] ACPI: extlog: Trace CPER PCI Express Error Section I/O Machine Check Architecture events may signal failing PCIe components or links. The AER event contains details on what was happening on the wire when the error was signaled. Trace the CPER PCIe Error section (UEFI v2.11, Appendix N.2.7) reported by the I/O MCA. 
Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Fabio M. De Francesco Link: https://patch.msgid.link/20260114101543.85926-3-fabio.m.de.francesco@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_extlog.c | 34 ++++++++++++++++++++++++++++++++++ drivers/pci/pcie/aer.c | 2 +- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 47d11cb5c912..88a2237772c2 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -132,6 +132,36 @@ static int print_extlog_rcd(const char *pfx, return 1; } +static void extlog_print_pcie(struct cper_sec_pcie *pcie_err, + int severity) +{ +#ifdef ACPI_APEI_PCIEAER + struct aer_capability_regs *aer; + struct pci_dev *pdev; + unsigned int devfn; + unsigned int bus; + int aer_severity; + int domain; + + if (!(pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID && + pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO)) + return; + + aer_severity = cper_severity_to_aer(severity); + aer = (struct aer_capability_regs *)pcie_err->aer_info; + domain = pcie_err->device_id.segment; + bus = pcie_err->device_id.bus; + devfn = PCI_DEVFN(pcie_err->device_id.device, + pcie_err->device_id.function); + pdev = pci_get_domain_bus_and_slot(domain, bus, devfn); + if (!pdev) + return; + + pci_print_aer(pdev, aer_severity, aer); + pci_dev_put(pdev); +#endif +} + static int extlog_print(struct notifier_block *nb, unsigned long val, void *data) { @@ -183,6 +213,10 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, if (gdata->error_data_length >= sizeof(*mem)) trace_extlog_mem_event(mem, err_seq, fru_id, fru_text, (u8)gdata->error_severity); + } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { + struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata); + + extlog_print_pcie(pcie_err, gdata->error_severity); } else { void *err = acpi_hest_get_payload(gdata); diff --git a/drivers/pci/pcie/aer.c 
b/drivers/pci/pcie/aer.c index e0bcaa896803..71ee4f5064de 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -973,7 +973,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, pcie_print_tlp_log(dev, &aer->header_log, info.level, dev_fmt(" ")); } -EXPORT_SYMBOL_NS_GPL(pci_print_aer, "CXL"); +EXPORT_SYMBOL_GPL(pci_print_aer); /** * add_error_device - list device to be handled From 70205869686212eb8e4cddf02bf87fd5fd597bc2 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 14 Jan 2026 11:14:23 +0100 Subject: [PATCH 11/15] ACPI: APEI: GHES: Add helper for CPER CXL protocol errors checks Move the CPER CXL protocol errors validity check out of cxl_cper_post_prot_err() to new cxl_cper_sec_prot_err_valid() and limit the serial number check only to CXL agents that are CXL devices (UEFI v2.10, Appendix N.2.13). Export the new symbol for reuse by ELOG. Reviewed-by: Dave Jiang Reviewed-by: Hanjun Guo Reviewed-by: Jonathan Cameron Signed-off-by: Fabio M. De Francesco [ rjw: Subject tweak ] Link: https://patch.msgid.link/20260114101543.85926-4-fabio.m.de.francesco@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/apei/Makefile | 1 + drivers/acpi/apei/ghes.c | 18 +---------------- drivers/acpi/apei/ghes_helpers.c | 33 ++++++++++++++++++++++++++++++++ include/cxl/event.h | 10 ++++++++++ 4 files changed, 45 insertions(+), 17 deletions(-) create mode 100644 drivers/acpi/apei/ghes_helpers.c diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile index 2c474e6477e1..5db61dfb4691 100644 --- a/drivers/acpi/apei/Makefile +++ b/drivers/acpi/apei/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_ACPI_APEI_GHES) += ghes.o +obj-$(CONFIG_ACPI_APEI_PCIEAER) += ghes_helpers.o obj-$(CONFIG_ACPI_APEI_EINJ) += einj.o einj-y := einj-core.o einj-$(CONFIG_ACPI_APEI_EINJ_CXL) += einj-cxl.o diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index a2ac282c99cb..319bbc9317ae 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -741,24 +741,8 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, struct cxl_cper_prot_err_work_data wd; u8 *dvsec_start, *cap_start; - if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) { - pr_err_ratelimited("CXL CPER invalid agent type\n"); + if (cxl_cper_sec_prot_err_valid(prot_err)) return; - } - - if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) { - pr_err_ratelimited("CXL CPER invalid protocol error log\n"); - return; - } - - if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) { - pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n", - prot_err->err_len); - return; - } - - if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER)) - pr_warn(FW_WARN "CXL CPER no device serial number\n"); guard(spinlock_irqsave)(&cxl_cper_prot_err_work_lock); diff --git a/drivers/acpi/apei/ghes_helpers.c b/drivers/acpi/apei/ghes_helpers.c new file mode 100644 index 000000000000..f3d162139a97 --- /dev/null +++ b/drivers/acpi/apei/ghes_helpers.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0-only 
+// Copyright(c) 2025 Intel Corporation. All rights reserved + +#include +#include + +int cxl_cper_sec_prot_err_valid(struct cxl_cper_sec_prot_err *prot_err) +{ + if (!(prot_err->valid_bits & PROT_ERR_VALID_AGENT_ADDRESS)) { + pr_err_ratelimited("CXL CPER invalid agent type\n"); + return -EINVAL; + } + + if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) { + pr_err_ratelimited("CXL CPER invalid protocol error log\n"); + return -EINVAL; + } + + if (prot_err->err_len != sizeof(struct cxl_ras_capability_regs)) { + pr_err_ratelimited("CXL CPER invalid RAS Cap size (%u)\n", + prot_err->err_len); + return -EINVAL; + } + + if ((prot_err->agent_type == RCD || prot_err->agent_type == DEVICE || + prot_err->agent_type == LD || prot_err->agent_type == FMLD) && + !(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER)) + pr_warn_ratelimited(FW_WARN + "CXL CPER no device serial number\n"); + + return 0; +} +EXPORT_SYMBOL_GPL(cxl_cper_sec_prot_err_valid); diff --git a/include/cxl/event.h b/include/cxl/event.h index 6fd90f9cc203..4d7d1036ea9c 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -320,4 +320,14 @@ static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data } #endif +#ifdef CONFIG_ACPI_APEI_PCIEAER +int cxl_cper_sec_prot_err_valid(struct cxl_cper_sec_prot_err *prot_err); +#else +static inline int +cxl_cper_sec_prot_err_valid(struct cxl_cper_sec_prot_err *prot_err) +{ + return -EOPNOTSUPP; +} +#endif + #endif /* _LINUX_CXL_EVENT_H */ From ba8af8e1f1de32f14c98bd4a7da8b270284ffce3 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 14 Jan 2026 11:14:24 +0100 Subject: [PATCH 12/15] ACPI: APEI: GHES: Add helper to copy CPER CXL protocol error info to work struct Make a helper out of cxl_cper_post_prot_err() that checks the CXL agent type and copies the CPER CXL protocol errors information to a work data structure. Export the new symbol for reuse by ELOG.
Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Hanjun Guo Signed-off-by: Fabio M. De Francesco [ rjw: Subject tweak ] Link: https://patch.msgid.link/20260114101543.85926-5-fabio.m.de.francesco@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 22 +-------------------- drivers/acpi/apei/ghes_helpers.c | 33 ++++++++++++++++++++++++++++++++ include/cxl/event.h | 10 ++++++++++ 3 files changed, 44 insertions(+), 21 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 319bbc9317ae..b49a5da46788 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -739,7 +739,6 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, { #ifdef CONFIG_ACPI_APEI_PCIEAER struct cxl_cper_prot_err_work_data wd; - u8 *dvsec_start, *cap_start; if (cxl_cper_sec_prot_err_valid(prot_err)) return; @@ -749,27 +748,8 @@ static void cxl_cper_post_prot_err(struct cxl_cper_sec_prot_err *prot_err, if (!cxl_cper_prot_err_work) return; - switch (prot_err->agent_type) { - case RCD: - case DEVICE: - case LD: - case FMLD: - case RP: - case DSP: - case USP: - memcpy(&wd.prot_err, prot_err, sizeof(wd.prot_err)); - - dvsec_start = (u8 *)(prot_err + 1); - cap_start = dvsec_start + prot_err->dvsec_len; - - memcpy(&wd.ras_cap, cap_start, sizeof(wd.ras_cap)); - wd.severity = cper_severity_to_aer(severity); - break; - default: - pr_err_ratelimited("CXL CPER invalid agent type: %d\n", - prot_err->agent_type); + if (cxl_cper_setup_prot_err_work_data(&wd, prot_err, severity)) return; - } if (!kfifo_put(&cxl_cper_prot_err_fifo, wd)) { pr_err_ratelimited("CXL CPER kfifo overflow\n"); diff --git a/drivers/acpi/apei/ghes_helpers.c b/drivers/acpi/apei/ghes_helpers.c index f3d162139a97..bc7111b740af 100644 --- a/drivers/acpi/apei/ghes_helpers.c +++ b/drivers/acpi/apei/ghes_helpers.c @@ -2,6 +2,7 @@ // Copyright(c) 2025 Intel Corporation. 
All rights reserved #include +#include #include int cxl_cper_sec_prot_err_valid(struct cxl_cper_sec_prot_err *prot_err) @@ -31,3 +32,35 @@ int cxl_cper_sec_prot_err_valid(struct cxl_cper_sec_prot_err *prot_err) return 0; } EXPORT_SYMBOL_GPL(cxl_cper_sec_prot_err_valid); + +int cxl_cper_setup_prot_err_work_data(struct cxl_cper_prot_err_work_data *wd, + struct cxl_cper_sec_prot_err *prot_err, + int severity) +{ + u8 *dvsec_start, *cap_start; + + switch (prot_err->agent_type) { + case RCD: + case DEVICE: + case LD: + case FMLD: + case RP: + case DSP: + case USP: + memcpy(&wd->prot_err, prot_err, sizeof(wd->prot_err)); + + dvsec_start = (u8 *)(prot_err + 1); + cap_start = dvsec_start + prot_err->dvsec_len; + + memcpy(&wd->ras_cap, cap_start, sizeof(wd->ras_cap)); + wd->severity = cper_severity_to_aer(severity); + break; + default: + pr_err_ratelimited("CXL CPER invalid agent type: %d\n", + prot_err->agent_type); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(cxl_cper_setup_prot_err_work_data); diff --git a/include/cxl/event.h b/include/cxl/event.h index 4d7d1036ea9c..94081aec597a 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -322,12 +322,22 @@ static inline int cxl_cper_prot_err_kfifo_get(struct cxl_cper_prot_err_work_data #ifdef CONFIG_ACPI_APEI_PCIEAER int cxl_cper_sec_prot_err_valid(struct cxl_cper_sec_prot_err *prot_err); +int cxl_cper_setup_prot_err_work_data(struct cxl_cper_prot_err_work_data *wd, + struct cxl_cper_sec_prot_err *prot_err, + int severity); #else static inline int cxl_cper_sec_prot_err_valid(struct cxl_cper_sec_prot_err *prot_err) { return -EOPNOTSUPP; } +static inline int +cxl_cper_setup_prot_err_work_data(struct cxl_cper_prot_err_work_data *wd, + struct cxl_cper_sec_prot_err *prot_err, + int severity) +{ + return -EOPNOTSUPP; +} #endif #endif /* _LINUX_CXL_EVENT_H */ From 95350effc3ad62582411f59fd08a7621ac82f314 Mon Sep 17 00:00:00 2001 From: "Fabio M. 
De Francesco" Date: Wed, 14 Jan 2026 11:14:25 +0100 Subject: [PATCH 13/15] ACPI: extlog: Trace CPER CXL Protocol Error Section When Firmware First is enabled, BIOS handles errors first and then it makes them available to the kernel via the Common Platform Error Record (CPER) sections (UEFI 2.11 Appendix N.2.13). Linux parses the CPER sections via one of two similar paths, either ELOG or GHES. The errors managed by ELOG are signaled to the BIOS by the I/O Machine Check Architecture (I/O MCA). Currently, ELOG and GHES show some inconsistencies in how they report to userspace via trace events. Therefore, make the two mentioned paths act similarly by tracing the CPER CXL Protocol Error Section. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Fabio M. De Francesco Link: https://patch.msgid.link/20260114101543.85926-6-fabio.m.de.francesco@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/Kconfig | 2 ++ drivers/acpi/acpi_extlog.c | 24 ++++++++++++++++++++++++ drivers/cxl/core/ras.c | 3 ++- include/cxl/event.h | 2 ++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index ca00a5dbcf75..df0ff0764d0d 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -494,6 +494,8 @@ config ACPI_EXTLOG tristate "Extended Error Log support" depends on X86_MCE && X86_LOCAL_APIC && EDAC select UEFI_CPER + select ACPI_APEI + select ACPI_APEI_GHES help Certain usages such as Predictive Failure Analysis (PFA) require more information about the error than what can be described in diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index 88a2237772c2..7ad3b36013cc 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -162,6 +163,23 @@ static void extlog_print_pcie(struct cper_sec_pcie *pcie_err, #endif } +static void
+extlog_cxl_cper_handle_prot_err(struct cxl_cper_sec_prot_err *prot_err, + int severity) +{ +#if IS_ENABLED(CONFIG_ACPI_APEI_PCIEAER) && IS_REACHABLE(CONFIG_CXL_BUS) + struct cxl_cper_prot_err_work_data wd; + + if (cxl_cper_sec_prot_err_valid(prot_err)) + return; + + if (cxl_cper_setup_prot_err_work_data(&wd, prot_err, severity)) + return; + + cxl_cper_handle_prot_err(&wd); +#endif +} + static int extlog_print(struct notifier_block *nb, unsigned long val, void *data) { @@ -213,6 +231,12 @@ static int extlog_print(struct notifier_block *nb, unsigned long val, if (gdata->error_data_length >= sizeof(*mem)) trace_extlog_mem_event(mem, err_seq, fru_id, fru_text, (u8)gdata->error_severity); + } else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) { + struct cxl_cper_sec_prot_err *prot_err = + acpi_hest_get_payload(gdata); + + extlog_cxl_cper_handle_prot_err(prot_err, + gdata->error_severity); } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata); diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index 2731ba3a0799..a90480d07c87 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -63,7 +63,7 @@ static int match_memdev_by_parent(struct device *dev, const void *uport) return 0; } -static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) +void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) { unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device, data->prot_err.agent_addr.function); @@ -104,6 +104,7 @@ static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) else cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap); } +EXPORT_SYMBOL_GPL(cxl_cper_handle_prot_err); static void cxl_cper_prot_err_work_fn(struct work_struct *work) { diff --git a/include/cxl/event.h b/include/cxl/event.h index 94081aec597a..ff97fea718d2 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -340,4 +340,6 @@ cxl_cper_setup_prot_err_work_data(struct cxl_cper_prot_err_work_data *wd,
} #endif +void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *wd); + #endif /* _LINUX_CXL_EVENT_H */ From b584bfbd7ec417f257f651cc00a90c66e31dfbf1 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 14 Jan 2026 16:27:11 -0700 Subject: [PATCH 14/15] ACPI: APEI: GHES: Disable KASAN instrumentation when compile testing with clang < 18 After a recent innocuous change to drivers/acpi/apei/ghes.c, building ARCH=arm64 allmodconfig with clang-17 or older (which has both CONFIG_KASAN=y and CONFIG_WERROR=y) fails with: drivers/acpi/apei/ghes.c:902:13: error: stack frame size (2768) exceeds limit (2048) in 'ghes_do_proc' [-Werror,-Wframe-larger-than] 902 | static void ghes_do_proc(struct ghes *ghes, | ^ A KASAN pass that removes unneeded stack instrumentation, enabled by default in clang-18 [1], drastically improves stack usage in this case. To avoid the warning in the common allmodconfig case when it can break the build, disable KASAN for ghes.o when compile testing with clang-17 and older. Disabling KASAN outright may hide legitimate runtime issues, so live with the warning in that case; the user can either increase the frame warning limit or disable -Werror, which they should probably do when debugging with KASAN anyways. Closes: https://github.com/ClangBuiltLinux/linux/issues/2148 Link: https://github.com/llvm/llvm-project/commit/51fbab134560ece663517bf1e8c2a30300d08f1a [1] Signed-off-by: Nathan Chancellor Cc: All applicable Link: https://patch.msgid.link/20260114-ghes-avoid-wflt-clang-older-than-18-v1-1-9c8248bfe4f4@kernel.org Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/apei/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile index 5db61dfb4691..1a0b85923cd4 100644 --- a/drivers/acpi/apei/Makefile +++ b/drivers/acpi/apei/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_ACPI_APEI_GHES) += ghes.o +# clang versions prior to 18 may blow out the stack with KASAN +ifeq ($(CONFIG_COMPILE_TEST)_$(CONFIG_CC_IS_CLANG)_$(call clang-min-version, 180000),y_y_) +KASAN_SANITIZE_ghes.o := n +endif obj-$(CONFIG_ACPI_APEI_PCIEAER) += ghes_helpers.o obj-$(CONFIG_ACPI_APEI_EINJ) += einj.o einj-y := einj-core.o From 57d5287b7eb334e0b772be74d6ff9f2f22f0512c Mon Sep 17 00:00:00 2001 From: Tony W Wang-oc Date: Wed, 28 Jan 2026 10:52:16 +0800 Subject: [PATCH 15/15] ACPI: APEI: GHES: Add ghes_edac support for __ZX__ and _BYO_ systems Let ghes_edac be the preferred driver to load on __ZX__ and _BYO_ systems by extending the platform detection list in ghes.c Signed-off-by: Tony W Wang-oc Tested-by: Lyle Li Acked-by: Borislav Petkov (AMD) [ rjw: Subject and changelog edits ] Link: https://patch.msgid.link/20260128025216.12564-1-TonyWWang-oc@zhaoxin.com Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/ghes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index b49a5da46788..f96aede5d9a3 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1897,6 +1897,8 @@ void __init acpi_ghes_init(void) */ static struct acpi_platform_list plat_list[] = { {"HPE   ", "Server  ", 0, ACPI_SIG_FADT, all_versions}, + {"__ZX__", "EDK2    ", 3, ACPI_SIG_FADT, greater_than_or_equal}, + {"_BYO_ ", "BYOSOFT ", 3, ACPI_SIG_FADT, greater_than_or_equal}, { } /* End */ };