From f7f4a21c2a51710a06965cc9c1252821fc925544 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli"
Date: Mon, 23 Mar 2026 22:22:17 -0300
Subject: [PATCH 01/13] memblock: Print out errors on reserve_mem parser

The parsing of the kernel parameter "reserve_mem=" is subject to
multiple failures, like duplicate naming, a malformed expression or
even a lack of available memory. Right now, all of these fail silently.

Let's add some messages so the kernel log can provide useful
information in case of failures.

Reviewed-by: SeongJae Park
Signed-off-by: Guilherme G. Piccoli
Link: https://patch.msgid.link/20260324012839.1991765-1-gpiccoli@igalia.com
Signed-off-by: Mike Rapoport (Microsoft)
---
 mm/memblock.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index b3ddfdec7a80..ac08d7f8c15e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2642,23 +2642,25 @@ static int __init reserve_mem(char *p)
 	int len;
 
 	if (!p)
-		return -EINVAL;
+		goto err_param;
 
 	/* Check if there's room for more reserved memory */
-	if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES)
+	if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) {
+		pr_err("reserve_mem: no more room for reserved memory\n");
 		return -EBUSY;
+	}
 
 	oldp = p;
 	size = memparse(p, &p);
 	if (!size || p == oldp)
-		return -EINVAL;
+		goto err_param;
 
 	if (*p != ':')
-		return -EINVAL;
+		goto err_param;
 
 	align = memparse(p+1, &p);
 	if (*p != ':')
-		return -EINVAL;
+		goto err_param;
 
 	/*
 	 * memblock_phys_alloc() doesn't like a zero size align,
@@ -2672,7 +2674,7 @@ static int __init reserve_mem(char *p)
 
 	/* name needs to have length but not too big */
 	if (!len || len >= RESERVE_MEM_NAME_SIZE)
-		return -EINVAL;
+		goto err_param;
 
 	/* Make sure that name has text */
 	for (p = name; *p; p++) {
@@ -2680,11 +2682,13 @@ static int __init reserve_mem(char *p)
 			break;
 	}
 	if (!*p)
-		return -EINVAL;
+		goto err_param;
 
 	/* Make sure the name is not already used */
-	if (reserve_mem_find_by_name(name, &start, &tmp))
+	if (reserve_mem_find_by_name(name, &start, &tmp)) {
+		pr_err("reserve_mem: name \"%s\" was already used\n", name);
 		return -EBUSY;
+	}
 
 	/* Pick previous allocations up from KHO if available */
 	if (reserve_mem_kho_revive(name, size, align))
@@ -2692,12 +2696,17 @@ static int __init reserve_mem(char *p)
 
 	/* TODO: Allocation must be outside of scratch region */
 	start = memblock_phys_alloc(size, align);
-	if (!start)
+	if (!start) {
+		pr_err("reserve_mem: memblock allocation failed\n");
 		return -ENOMEM;
+	}
 
 	reserved_mem_add(start, size, name);
 
 	return 1;
+
+err_param:
+	pr_err("reserve_mem: empty or malformed parameter\n");
+	return -EINVAL;
 }
 __setup("reserve_mem=", reserve_mem);

From 0709682cdb4ac77e3f78ea9c10d7f74b41a12518 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli"
Date: Mon, 23 Mar 2026 22:22:18 -0300
Subject: [PATCH 02/13] memblock: Add reserve_mem debugfs info

When using the "reserve_mem" parameter, users aim to have an area that
(hopefully) persists across boots, so pstore infrastructure (like the
ramoops module) can use it to save oops/ftrace logs, for example.

There is no easy way to determine whether this kernel parameter is
properly set, though; the kernel doesn't show information about this
memory in memblock debugfs, in /proc/iomem, or in dmesg. This is
relevant information for tools like kdumpst [0], which need to
determine whether it is reliable to use the reserved area as ramoops
persistent storage; checking only /proc/cmdline is not sufficient, as
it doesn't tell whether the reservation actually succeeded.
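As an illustration (the concrete values here are only an example of the
size:align:name format the parser expects, not a recommendation), such
a reservation is typically paired with a ramoops consumer on the
command line:

  reserve_mem=2M:4096:oops ramoops.mem_name=oops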
Add a new file under memblock debugfs that shows the properly set
memory reservations, with the name and size as passed to "reserve_mem".
Notice that if no "reserve_mem=" is passed on the command line, or if
all reservation attempts fail, the file is not created.

[0] https://aur.archlinux.org/packages/kdumpst

Reviewed-by: SeongJae Park
Signed-off-by: Guilherme G. Piccoli
Link: https://patch.msgid.link/20260324012839.1991765-2-gpiccoli@igalia.com
Signed-off-by: Mike Rapoport (Microsoft)
---
 mm/memblock.c                                 | 49 +++++++++++++++++--
 tools/testing/memblock/linux/string_helpers.h | 10 ++++
 2 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/memblock/linux/string_helpers.h

diff --git a/mm/memblock.c b/mm/memblock.c
index ac08d7f8c15e..57d96f2484cc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <linux/string_helpers.h>
 
 #ifdef CONFIG_KEXEC_HANDOVER
 #include
@@ -2710,7 +2711,8 @@ static int __init reserve_mem(char *p)
 }
 __setup("reserve_mem=", reserve_mem);
 
-#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
+#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
 static const char * const flagname[] = {
 	[ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",
 	[ilog2(MEMBLOCK_MIRROR)] = "MIRROR",
@@ -2757,10 +2759,8 @@ static int memblock_debug_show(struct seq_file *m, void *private)
 }
 DEFINE_SHOW_ATTRIBUTE(memblock_debug);
 
-static int __init memblock_init_debugfs(void)
+static inline void memblock_debugfs_expose_arrays(struct dentry *root)
 {
-	struct dentry *root = debugfs_create_dir("memblock", NULL);
-
 	debugfs_create_file("memory", 0444, root,
 			    &memblock.memory, &memblock_debug_fops);
 	debugfs_create_file("reserved", 0444, root,
@@ -2769,7 +2769,48 @@ static int __init memblock_init_debugfs(void)
 	debugfs_create_file("physmem", 0444, root, &physmem,
 			    &memblock_debug_fops);
 #endif
+}
+#else
+
+static inline void memblock_debugfs_expose_arrays(struct dentry *root) { }
+
+#endif /* CONFIG_ARCH_KEEP_MEMBLOCK */
+
+static int memblock_reserve_mem_show(struct seq_file *m, void *private)
+{
+	struct reserve_mem_table *map;
+	char txtsz[16];
+
+	guard(mutex)(&reserve_mem_lock);
+	for (int i = 0; i < reserved_mem_count; i++) {
+		map = &reserved_mem_table[i];
+		if (!map->size)
+			continue;
+
+		memset(txtsz, 0, sizeof(txtsz));
+		string_get_size(map->size, 1, STRING_UNITS_2, txtsz, sizeof(txtsz));
+		seq_printf(m, "%s\t\t(%s)\n", map->name, txtsz);
+	}
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(memblock_reserve_mem);
+
+static int __init memblock_init_debugfs(void)
+{
+	struct dentry *root;
+
+	if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !reserved_mem_count)
+		return 0;
+
+	root = debugfs_create_dir("memblock", NULL);
+
+	if (reserved_mem_count)
+		debugfs_create_file("reserve_mem_param", 0444, root, NULL,
+				    &memblock_reserve_mem_fops);
+
+	memblock_debugfs_expose_arrays(root);
 
 	return 0;
 }
 __initcall(memblock_init_debugfs);
diff --git a/tools/testing/memblock/linux/string_helpers.h b/tools/testing/memblock/linux/string_helpers.h
new file mode 100644
index 000000000000..dbf015cfff31
--- /dev/null
+++ b/tools/testing/memblock/linux/string_helpers.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_STRING_HELPERS_H_
+#define _LINUX_STRING_HELPERS_H_
+
+/*
+ * Header stub to avoid test build breakage; we don't need to
+ * actually implement string_get_size() as it's not used in the tests.
+ */
+
+#endif

From 8b7b85384fad6e21e8a28628e7ebacb5a6329de4 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 23 Mar 2026 09:20:42 +0200
Subject: [PATCH 03/13] memblock: move reserve_bootmem_region() to memblock.c
 and make it static

reserve_bootmem_region() is only called from memmap_init_reserved_pages()
and it was in mm/mm_init.c because of its dependency on the static
init_deferred_page().

Since init_deferred_page() is not static anymore, move
reserve_bootmem_region(), rename it to memmap_init_reserved_range() and
make it static.

Update the comment describing it to better reflect what the function
does and drop the bogus comment about reserved pages in
free_bootmem_page().

Update the memblock test stubs to reflect the core changes.

Reviewed-by: Lorenzo Stoakes (Oracle)
Reviewed-by: David Hildenbrand (Arm)
Link: https://patch.msgid.link/20260323072042.3651061-1-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
---
 include/linux/bootmem_info.h      |  4 ----
 include/linux/mm.h                |  3 ---
 mm/memblock.c                     | 31 ++++++++++++++++++++++++++++---
 mm/mm_init.c                      | 25 -------------------------
 tools/include/linux/mm.h          |  2 --
 tools/testing/memblock/internal.h |  9 +++++++++
 tools/testing/memblock/mmzone.c   |  4 ----
 7 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index 4c506e76a808..492ceeb1cdf8 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -44,10 +44,6 @@ static inline void free_bootmem_page(struct page *page)
 {
 	enum bootmem_type type = bootmem_type(page);
 
-	/*
-	 * The reserve_bootmem_region sets the reserved flag on bootmem
-	 * pages.
-	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) != 2, page);
 
 	if (type == SECTION_INFO || type == MIX_SECTION_INFO)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..764d10fdfb5d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3686,9 +3686,6 @@ extern unsigned long free_reserved_area(void *start, void *end,
 
 extern void adjust_managed_page_count(struct page *page, long count);
 
-extern void reserve_bootmem_region(phys_addr_t start,
-				   phys_addr_t end, int nid);
-
 /* Free the reserved page into the buddy system, so it gets managed. */
 void free_reserved_page(struct page *page);
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 57d96f2484cc..eaaa6110bcc1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -974,7 +974,7 @@ __init void memmap_init_kho_scratch_pages(void)
 	/*
 	 * Initialize struct pages for free scratch memory.
 	 * The struct pages for reserved scratch memory will be set up in
-	 * reserve_bootmem_region()
+	 * memmap_init_reserved_pages()
 	 */
 	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
 			     MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) {
@@ -2241,6 +2241,31 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
 	return end_pfn - start_pfn;
 }
 
+/*
+ * Initialised pages do not have PageReserved set. This function is called
+ * for each reserved range and marks the pages PageReserved.
+ * When deferred initialization of struct pages is enabled it also ensures
+ * that struct pages are properly initialised.
+ */
+static void __init memmap_init_reserved_range(phys_addr_t start,
+					      phys_addr_t end, int nid)
+{
+	unsigned long pfn;
+
+	for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
+		struct page *page = pfn_to_page(pfn);
+
+		init_deferred_page(pfn, nid);
+
+		/*
+		 * no need for atomic set_bit because the struct
+		 * page is not visible yet so nobody should
+		 * access it yet.
+		 */
+		__SetPageReserved(page);
+	}
+}
+
 static void __init memmap_init_reserved_pages(void)
 {
 	struct memblock_region *region;
@@ -2260,7 +2285,7 @@ static void __init memmap_init_reserved_pages(void)
 		end = start + region->size;
 
 		if (memblock_is_nomap(region))
-			reserve_bootmem_region(start, end, nid);
+			memmap_init_reserved_range(start, end, nid);
 
 		memblock_set_node(start, region->size, &memblock.reserved, nid);
 	}
@@ -2285,7 +2310,7 @@ static void __init memmap_init_reserved_pages(void)
 			if (!numa_valid_node(nid))
 				nid = early_pfn_to_nid(PFN_DOWN(start));
 
-			reserve_bootmem_region(start, end, nid);
+			memmap_init_reserved_range(start, end, nid);
 		}
 	}
 }
diff --git a/mm/mm_init.c b/mm/mm_init.c
index df34797691bd..ea8d3de43470 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -772,31 +772,6 @@ void __meminit init_deferred_page(unsigned long pfn, int nid)
 	__init_deferred_page(pfn, nid);
 }
 
-/*
- * Initialised pages do not have PageReserved set. This function is
- * called for each range allocated by the bootmem allocator and
- * marks the pages PageReserved. The remaining valid pages are later
- * sent to the buddy page allocator.
- */
-void __meminit reserve_bootmem_region(phys_addr_t start,
-				      phys_addr_t end, int nid)
-{
-	unsigned long pfn;
-
-	for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
-		struct page *page = pfn_to_page(pfn);
-
-		__init_deferred_page(pfn, nid);
-
-		/*
-		 * no need for atomic set_bit because the struct
-		 * page is not visible yet so nobody should
-		 * access it yet.
-		 */
-		__SetPageReserved(page);
-	}
-}
-
 /* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
 static bool __meminit
 overlap_memmap_init(unsigned long zone, unsigned long *pfn)
diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h
index 028f3faf46e7..74cbd51dbea2 100644
--- a/tools/include/linux/mm.h
+++ b/tools/include/linux/mm.h
@@ -32,8 +32,6 @@ static inline phys_addr_t virt_to_phys(volatile void *address)
 	return (phys_addr_t)address;
 }
 
-void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid);
-
 static inline void totalram_pages_inc(void)
 {
 }
diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h
index 009b97bbdd22..eb02d5771f4c 100644
--- a/tools/testing/memblock/internal.h
+++ b/tools/testing/memblock/internal.h
@@ -29,4 +29,13 @@ static inline unsigned long free_reserved_area(void *start, void *end,
 	return 0;
 }
 
+#define for_each_valid_pfn(pfn, start_pfn, end_pfn) \
+	for ((pfn) = (start_pfn); (pfn) < (end_pfn); (pfn)++)
+
+static inline void init_deferred_page(unsigned long pfn, int nid)
+{
+}
+
+#define __SetPageReserved(p) ((void)(p))
+
 #endif
diff --git a/tools/testing/memblock/mmzone.c b/tools/testing/memblock/mmzone.c
index d3d58851864e..e719450f81cb 100644
--- a/tools/testing/memblock/mmzone.c
+++ b/tools/testing/memblock/mmzone.c
@@ -11,10 +11,6 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 	return NULL;
 }
 
-void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid)
-{
-}
-
 void atomic_long_set(atomic_long_t *v, long i)
 {
 }

From c12c3e1507809ad1fc0448f51c933f52e17d13cd Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 23 Mar 2026 09:48:28 +0200
Subject: [PATCH 04/13] memblock: reserve_mem: fix end calculation in
 reserve_mem_release_by_name()

free_reserved_area() expects the end parameter to point to the first
address after the area, but reserve_mem_release_by_name() passes it the
last address inside the area.

Remove the subtraction of one in the calculation of the area end.
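For illustration only (this merely restates the half-open convention
the fix below relies on, using the variables from
reserve_mem_release_by_name()):

	start = phys_to_virt(map->start);
	end = start + map->size;	/* first address past the area */
	free_reserved_area(start, end, 0, buf);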
Fixes: 74e2498ccf7b ("mm/memblock: Add reserved memory release function")
Link: https://patch.msgid.link/20260323074836.3653702-2-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
---
 mm/memblock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index eaaa6110bcc1..134724f5299e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2460,7 +2460,7 @@ int reserve_mem_release_by_name(const char *name)
 		return 0;
 
 	start = phys_to_virt(map->start);
-	end = start + map->size - 1;
+	end = start + map->size;
 	snprintf(buf, sizeof(buf), "reserve_mem:%s", name);
 	free_reserved_area(start, end, 0, buf);
 	map->size = 0;

From 25ee3aff9996f22e1b8b27fb284efb285e2fb025 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 23 Mar 2026 09:48:29 +0200
Subject: [PATCH 05/13] powerpc: fadump: pair alloc_pages_exact() with
 free_pages_exact()

fadump allocates buffers with alloc_pages_exact(), but then marks them
as reserved and frees them using free_reserved_area(). This is
completely unnecessary: pages allocated with alloc_pages_exact() can be
naturally freed with free_pages_exact().

Replace the freeing of memory in fadump_free_buffer() with
free_pages_exact() and simplify the allocation code so that it won't
mark allocated pages as reserved.

Link: https://patch.msgid.link/20260323074836.3653702-3-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
---
 arch/powerpc/kernel/fadump.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 4ebc333dd786..501d43bf18f3 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -775,24 +775,12 @@ void __init fadump_update_elfcore_header(char *bufp)
 
 static void *__init fadump_alloc_buffer(unsigned long size)
 {
-	unsigned long count, i;
-	struct page *page;
-	void *vaddr;
-
-	vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
-	if (!vaddr)
-		return NULL;
-
-	count = PAGE_ALIGN(size) / PAGE_SIZE;
-	page = virt_to_page(vaddr);
-	for (i = 0; i < count; i++)
-		mark_page_reserved(page + i);
-	return vaddr;
+	return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
 }
 
 static void fadump_free_buffer(unsigned long vaddr, unsigned long size)
 {
-	free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL);
+	free_pages_exact((void *)vaddr, size);
 }
 
 s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus)

From 8ff5d8f2008889bb6f46125d5a0638e8749e29bd Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 23 Mar 2026 09:48:30 +0200
Subject: [PATCH 06/13] powerpc: opal-core: pair alloc_pages_exact() with
 free_pages_exact()

opal-core allocates buffers with alloc_pages_exact(), but then marks
them as reserved and frees them using free_reserved_area(). This is
completely unnecessary: pages allocated with alloc_pages_exact() can be
naturally freed with free_pages_exact().

Replace the freeing of memory in opalcore_cleanup() with
free_pages_exact() and simplify the allocation code so that it won't
mark allocated pages as reserved.
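A minimal sketch of the intended pairing (illustrative only; the size
variable stands in for whatever length the caller tracks):

	void *buf = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);

	if (!buf)
		return -ENOMEM;
	/* ... use the buffer ... */
	free_pages_exact(buf, size);	/* must pass the same size */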
Link: https://patch.msgid.link/20260323074836.3653702-4-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
---
 arch/powerpc/platforms/powernv/opal-core.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
index e76e462f55f6..32662d30d70f 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -303,7 +303,6 @@ static int __init create_opalcore(void)
 	struct device_node *dn;
 	struct opalcore *new;
 	loff_t opalcore_off;
-	struct page *page;
 	Elf64_Phdr *phdr;
 	Elf64_Ehdr *elf;
 	int i, ret;
@@ -328,11 +327,6 @@ static int __init create_opalcore(void)
 		oc_conf->opalcorebuf_sz = 0;
 		return -ENOMEM;
 	}
-	count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
-	page = virt_to_page(oc_conf->opalcorebuf);
-	for (i = 0; i < count; i++)
-		mark_page_reserved(page + i);
-
 	pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
 
 	/* Read OPAL related device-tree entries */
@@ -437,10 +431,7 @@ static void opalcore_cleanup(void)
 
 	/* free the buffer used for setting up OPAL core */
 	if (oc_conf->opalcorebuf) {
-		void *end = (void *)((u64)oc_conf->opalcorebuf +
-				     oc_conf->opalcorebuf_sz);
-
-		free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
+		free_pages_exact(oc_conf->opalcorebuf, oc_conf->opalcorebuf_sz);
 		oc_conf->opalcorebuf = NULL;
 		oc_conf->opalcorebuf_sz = 0;
 	}

From 0510bdab538e2af07a67bc58a0c6c4547b83f8d5 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 23 Mar 2026 09:48:31 +0200
Subject: [PATCH 07/13] mm: move free_reserved_area() to mm/memblock.c

free_reserved_area() is related to memblock as it frees reserved memory
back to the buddy allocator, similar to what memblock_free_late() does.

Move free_reserved_area() to mm/memblock.c to prepare for further
consolidation of the functions that free reserved memory.

No functional changes.

Link: https://patch.msgid.link/20260323074836.3653702-5-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
Acked-by: Vlastimil Babka (SUSE)
---
 mm/memblock.c                     | 37 ++++++++++++++++++++++++++++++-
 mm/page_alloc.c                   | 36 ------------------------------
 tools/include/linux/mm.h          |  1 +
 tools/testing/memblock/internal.h | 34 +++++++++++++++++++++++++---
 4 files changed, 68 insertions(+), 40 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index 134724f5299e..180b8347458f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -894,6 +894,42 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
 	return memblock_remove_range(&memblock.memory, base, size);
 }
 
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
+{
+	void *pos;
+	unsigned long pages = 0;
+
+	start = (void *)PAGE_ALIGN((unsigned long)start);
+	end = (void *)((unsigned long)end & PAGE_MASK);
+	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+		struct page *page = virt_to_page(pos);
+		void *direct_map_addr;
+
+		/*
+		 * 'direct_map_addr' might be different from 'pos'
+		 * because some architectures' virt_to_page()
+		 * work with aliases. Getting the direct map
+		 * address ensures that we get a _writeable_
+		 * alias for the memset().
+		 */
+		direct_map_addr = page_address(page);
+		/*
+		 * Perform a kasan-unchecked memset() since this memory
+		 * has not been initialized.
+		 */
+		direct_map_addr = kasan_reset_tag(direct_map_addr);
+		if ((unsigned int)poison <= 0xFF)
+			memset(direct_map_addr, poison, PAGE_SIZE);
+
+		free_reserved_page(page);
+	}
+
+	if (pages && s)
+		pr_info("Freeing %s memory: %ldK\n", s, K(pages));
+
+	return pages;
+}
+
 /**
  * memblock_free - free boot memory allocation
  * @ptr: starting address of the boot memory allocation
@@ -1777,7 +1813,6 @@ void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
 		totalram_pages_inc();
 	}
 }
-
 /*
  * Remaining API functions
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2d4b6f1a554e..df3d61253001 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6234,42 +6234,6 @@ void adjust_managed_page_count(struct page *page, long count)
 }
 EXPORT_SYMBOL(adjust_managed_page_count);
 
-unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
-{
-	void *pos;
-	unsigned long pages = 0;
-
-	start = (void *)PAGE_ALIGN((unsigned long)start);
-	end = (void *)((unsigned long)end & PAGE_MASK);
-	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
-		struct page *page = virt_to_page(pos);
-		void *direct_map_addr;
-
-		/*
-		 * 'direct_map_addr' might be different from 'pos'
-		 * because some architectures' virt_to_page()
-		 * work with aliases. Getting the direct map
-		 * address ensures that we get a _writeable_
-		 * alias for the memset().
-		 */
-		direct_map_addr = page_address(page);
-		/*
-		 * Perform a kasan-unchecked memset() since this memory
-		 * has not been initialized.
-		 */
-		direct_map_addr = kasan_reset_tag(direct_map_addr);
-		if ((unsigned int)poison <= 0xFF)
-			memset(direct_map_addr, poison, PAGE_SIZE);
-
-		free_reserved_page(page);
-	}
-
-	if (pages && s)
-		pr_info("Freeing %s memory: %ldK\n", s, K(pages));
-
-	return pages;
-}
-
 void free_reserved_page(struct page *page)
 {
 	clear_page_tag_ref(page);
diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h
index 74cbd51dbea2..84b5954f66c3 100644
--- a/tools/include/linux/mm.h
+++ b/tools/include/linux/mm.h
@@ -17,6 +17,7 @@
 
 #define __va(x) ((void *)((unsigned long)(x)))
 #define __pa(x) ((unsigned long)(x))
+#define __pa_symbol(x) ((unsigned long)(x))
 
 #define pfn_to_page(pfn) ((void *)((pfn) * PAGE_SIZE))
 
diff --git a/tools/testing/memblock/internal.h b/tools/testing/memblock/internal.h
index eb02d5771f4c..b6b1d147fd75 100644
--- a/tools/testing/memblock/internal.h
+++ b/tools/testing/memblock/internal.h
@@ -11,9 +11,22 @@ static int memblock_debug = 1;
 #define pr_warn_ratelimited(fmt, ...) \
	printf(fmt, ##__VA_ARGS__)
 
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
 bool mirrored_kernelcore = false;
 
 struct page {};
+
+static inline void *page_address(struct page *page)
+{
+	BUG();
+	return page;
+}
+
+static inline struct page *virt_to_page(void *virt)
+{
+	BUG();
+	return virt;
+}
 
 void memblock_free_pages(unsigned long pfn, unsigned int order)
 {
@@ -23,10 +36,25 @@ static inline void accept_memory(phys_addr_t start, unsigned long size)
 {
 }
 
-static inline unsigned long free_reserved_area(void *start, void *end,
-					       int poison, const char *s)
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s);
+void free_reserved_page(struct page *page);
+
+static inline bool deferred_pages_enabled(void)
 {
-	return 0;
+	return false;
+}
+
+#define for_each_valid_pfn(pfn, start_pfn, end_pfn) \
+	for ((pfn) = (start_pfn); (pfn) < (end_pfn); (pfn)++)
+
+static inline void *kasan_reset_tag(const void *addr)
+{
+	return (void *)addr;
+}
+
+static inline bool __is_kernel(unsigned long addr)
+{
+	return false;
 }
 
 #define for_each_valid_pfn(pfn, start_pfn, end_pfn) \

From b8de9573e6aea8e0be666288ee4427eb07369187 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 23 Mar 2026 09:48:32 +0200
Subject: [PATCH 08/13] memblock: make free_reserved_area() more robust

There are two potential problems in free_reserved_area():

* it may free a page with a non-existent buddy page
* it may be passed a virtual address from an alias mapping that won't
  be properly translated by virt_to_page(), for example a symbol on
  arm64

While the first issue is quite theoretical and the second one does not
manifest itself because all the callers do the right thing, it is easy
to make free_reserved_area() robust enough to avoid these potential
issues.

Replace the loop over virtual addresses with a loop over pfns that uses
for_each_valid_pfn(), and use __pa() or __pa_symbol(), depending on the
virtual mapping alias, to correctly determine the loop boundaries.

Link: https://patch.msgid.link/20260323074836.3653702-6-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
---
 mm/memblock.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index 180b8347458f..a42ec6a76ea0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -896,21 +896,32 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
 
 unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
 {
-	void *pos;
-	unsigned long pages = 0;
+	phys_addr_t start_pa, end_pa;
+	unsigned long pages = 0, pfn;
 
-	start = (void *)PAGE_ALIGN((unsigned long)start);
-	end = (void *)((unsigned long)end & PAGE_MASK);
-	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
-		struct page *page = virt_to_page(pos);
+	/*
+	 * end is the first address past the region and it may be beyond what
+	 * __pa() or __pa_symbol() can handle.
+	 * Use the address included in the range for the conversion and add
+	 * back 1 afterwards.
+	 */
+	if (__is_kernel((unsigned long)start)) {
+		start_pa = __pa_symbol(start);
+		end_pa = __pa_symbol(end - 1) + 1;
+	} else {
+		start_pa = __pa(start);
+		end_pa = __pa(end - 1) + 1;
+	}
+
+	for_each_valid_pfn(pfn, PFN_UP(start_pa), PFN_DOWN(end_pa)) {
+		struct page *page = pfn_to_page(pfn);
 		void *direct_map_addr;
 
 		/*
-		 * 'direct_map_addr' might be different from 'pos'
-		 * because some architectures' virt_to_page()
-		 * work with aliases. Getting the direct map
-		 * address ensures that we get a _writeable_
-		 * alias for the memset().
+		 * 'direct_map_addr' might be different from the kernel virtual
+		 * address because some architectures use aliases.
+		 * Going via physical address, pfn_to_page() and page_address()
+		 * ensures that we get a _writeable_ alias for the memset().
 		 */
 		direct_map_addr = page_address(page);
 		/*
@@ -922,6 +933,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
 			memset(direct_map_addr, poison, PAGE_SIZE);
 
 		free_reserved_page(page);
+		pages++;
 	}
 
 	if (pages && s)

From 7fbc5e26123e5fee1f0eb59e6fabf5ce4cf4f475 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 23 Mar 2026 09:48:33 +0200
Subject: [PATCH 09/13] memblock: extract page freeing from
 free_reserved_area() into a helper

There are two functions that release pages to the buddy allocator late
in the boot: free_reserved_area() and memblock_free_late(). Currently
they use different underlying functionality: free_reserved_area() runs
each page being freed through free_reserved_page(), while
memblock_free_late() uses memblock_free_pages() -> __free_pages_core().
In the end, they both boil down to a loop that frees a range page by
page.

Extract the loop that frees pages from free_reserved_area() into a
helper and use that helper in memblock_free_late().

Link: https://patch.msgid.link/20260323074836.3653702-7-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
---
 mm/memblock.c | 55 +++++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index a42ec6a76ea0..68a72bd4c8bd 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -894,26 +894,12 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
 	return memblock_remove_range(&memblock.memory, base, size);
 }
 
-unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
+static unsigned long __free_reserved_area(phys_addr_t start, phys_addr_t end,
+					  int poison)
 {
-	phys_addr_t start_pa, end_pa;
 	unsigned long pages = 0, pfn;
 
-	/*
-	 * end is the first address past the region and it may be beyond what
-	 * __pa() or __pa_symbol() can handle.
-	 * Use the address included in the range for the conversion and add
-	 * back 1 afterwards.
-	 */
-	if (__is_kernel((unsigned long)start)) {
-		start_pa = __pa_symbol(start);
-		end_pa = __pa_symbol(end - 1) + 1;
-	} else {
-		start_pa = __pa(start);
-		end_pa = __pa(end - 1) + 1;
-	}
-
-	for_each_valid_pfn(pfn, PFN_UP(start_pa), PFN_DOWN(end_pa)) {
+	for_each_valid_pfn(pfn, PFN_UP(start), PFN_DOWN(end)) {
 		struct page *page = pfn_to_page(pfn);
 		void *direct_map_addr;
 
@@ -935,7 +921,29 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
 		free_reserved_page(page);
 		pages++;
 	}
+	return pages;
+}
 
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
+{
+	phys_addr_t start_pa, end_pa;
+	unsigned long pages;
+
+	/*
+	 * end is the first address past the region and it may be beyond what
+	 * __pa() or __pa_symbol() can handle.
+	 * Use the address included in the range for the conversion and add back
+	 * 1 afterwards.
+	 */
+	if (__is_kernel((unsigned long)start)) {
+		start_pa = __pa_symbol(start);
+		end_pa = __pa_symbol(end - 1) + 1;
+	} else {
+		start_pa = __pa(start);
+		end_pa = __pa(end - 1) + 1;
+	}
+
+	pages = __free_reserved_area(start_pa, end_pa, poison);
 	if (pages && s)
 		pr_info("Freeing %s memory: %ldK\n", s, K(pages));
 
@@ -1811,20 +1819,15 @@ void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
  */
 void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
 {
-	phys_addr_t cursor, end;
+	phys_addr_t end = base + size - 1;
 
-	end = base + size - 1;
 	memblock_dbg("%s: [%pa-%pa] %pS\n",
 		     __func__, &base, &end, (void *)_RET_IP_);
-	kmemleak_free_part_phys(base, size);
-	cursor = PFN_UP(base);
-	end = PFN_DOWN(base + size);
 
-	for (; cursor < end; cursor++) {
-		memblock_free_pages(cursor, 0);
-		totalram_pages_inc();
-	}
+	kmemleak_free_part_phys(base, size);
+	__free_reserved_area(base, base + size, -1);
 }
+
 /*
  * Remaining API functions
  */

From b2129a39511b71b5ed0ae923d6eebd9398c6184e Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 23 Mar 2026 09:48:34 +0200
Subject: [PATCH 10/13] memblock: make free_reserved_area() update memblock if
 ARCH_KEEP_MEMBLOCK=y

On architectures that keep memblock after boot, freeing of reserved
memory with free_reserved_area() is paired with an update of the
memblock arrays, usually by a call to memblock_free().

Make free_reserved_area() directly update memblock.reserved when
ARCH_KEEP_MEMBLOCK is enabled. Remove the now-redundant explicit
memblock_free() call from arm64::free_initmem() and the
#ifdef CONFIG_ARCH_KEEP_MEMBLOCK block from the generic
free_initrd_mem().

Link: https://patch.msgid.link/20260323074836.3653702-8-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
---
 arch/arm64/mm/init.c | 3 ---
 init/initramfs.c     | 7 -------
 mm/memblock.c        | 6 ++++++
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 96711b8578fd..07b17c708702 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -385,9 +385,6 @@ void free_initmem(void)
 	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE));
 	WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE));
 
-	/* Delete __init region from memblock.reserved. */
-	memblock_free(lm_init_begin, lm_init_end - lm_init_begin);
-
 	free_reserved_area(lm_init_begin, lm_init_end, POISON_FREE_INITMEM,
 			   "unused kernel");
 
 	/*
diff --git a/init/initramfs.c b/init/initramfs.c
index 139baed06589..bca0922b2850 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -652,13 +652,6 @@ void __init reserve_initrd_mem(void)
 
 void __weak __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
-	unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE);
-	unsigned long aligned_end = ALIGN(end, PAGE_SIZE);
-
-	memblock_free((void *)aligned_start, aligned_end - aligned_start);
-#endif
-
 	free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
 			   "initrd");
 }
diff --git a/mm/memblock.c b/mm/memblock.c
index 68a72bd4c8bd..dee18c40d928 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -943,6 +943,12 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
 		end_pa = __pa(end - 1) + 1;
 	}
 
+	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+		if (start_pa < end_pa)
+			memblock_remove_range(&memblock.reserved,
+					      start_pa, end_pa - start_pa);
+	}
+
 	pages = __free_reserved_area(start_pa, end_pa, poison);
 	if (pages && s)
 		pr_info("Freeing %s memory: %ldK\n", s, K(pages));

From 87ce9e83ab8be5daf64351cd481ffa6537778e6b Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 23 Mar 2026 09:48:35 +0200
Subject: [PATCH 11/13] memblock, treewide: make memblock_free() handle late
 freeing

It shouldn't be the responsibility of memblock users to detect whether
they free memory allocated from memblock late and, if so, to use
memblock_free_late().

Make memblock_free() and memblock_phys_free() take care of late memory
freeing and drop memblock_free_late().

Link: https://patch.msgid.link/20260323074836.3653702-9-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
---
 arch/sparc/kernel/mdesc.c               |  4 +-
 arch/x86/kernel/setup.c                 |  2 +-
 arch/x86/platform/efi/memmap.c          |  5 +--
 arch/x86/platform/efi/quirks.c          |  2 +-
 drivers/firmware/efi/apple-properties.c |  2 +-
 drivers/of/kexec.c                      |  2 +-
 include/linux/memblock.h                |  2 -
 kernel/dma/swiotlb.c                    |  6 +--
 lib/bootconfig.c                        |  2 +-
 mm/kfence/core.c                        |  4 +-
 mm/memblock.c                           | 49 ++++++++++---------------
 11 files changed, 31 insertions(+), 49 deletions(-)

diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 30f171b7b00c..ecd6c8ae49c7 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -183,14 +183,12 @@ static struct mdesc_handle * __init mdesc_memblock_alloc(unsigned int mdesc_size
 static void __init mdesc_memblock_free(struct mdesc_handle *hp)
 {
 	unsigned int alloc_size;
-	unsigned long start;
 
 	BUG_ON(refcount_read(&hp->refcnt) != 0);
 	BUG_ON(!list_empty(&hp->list));
 
 	alloc_size = PAGE_ALIGN(hp->handle_size);
-	start = __pa(hp);
-	memblock_free_late(start, alloc_size);
+	memblock_free(hp, alloc_size);
 }
 
 static struct mdesc_mem_ops memblock_mdesc_ops = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index eebcc9db1a1b..46882ce79c3a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -426,7 +426,7 @@ int __init ima_free_kexec_buffer(void)
 	if (!ima_kexec_buffer_size)
 		return -ENOENT;
 
-	memblock_free_late(ima_kexec_buffer_phys,
+	memblock_phys_free(ima_kexec_buffer_phys,
 			   ima_kexec_buffer_size);
 
 	ima_kexec_buffer_phys = 0;
diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c
index 023697c88910..697a9a26a005 100644
--- a/arch/x86/platform/efi/memmap.c
+++ b/arch/x86/platform/efi/memmap.c
@@ -34,10 +34,7 @@
 static void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
 {
 	if (flags & EFI_MEMMAP_MEMBLOCK) {
-		if (slab_is_available())
-			memblock_free_late(phys, size);
-		else
-			memblock_phys_free(phys, size);
+		memblock_phys_free(phys, size);
 	} else if (flags & EFI_MEMMAP_SLAB) {
 		struct page *p = pfn_to_page(PHYS_PFN(phys));
 		unsigned int order = get_order(size);
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 35caa5746115..a560bbcaa006 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -372,7 +372,7 @@ void __init efi_reserve_boot_services(void)
 		 * doesn't make sense as far as the firmware is
 		 * concerned, but it does provide us with a way to tag
 		 * those regions that must not be paired with
-		 * memblock_free_late().
+		 * memblock_phys_free().
		 */
 		md->attribute |= EFI_MEMORY_RUNTIME;
 	}
diff --git a/drivers/firmware/efi/apple-properties.c b/drivers/firmware/efi/apple-properties.c
index 13ac28754c03..2e525e17fba7 100644
--- a/drivers/firmware/efi/apple-properties.c
+++ b/drivers/firmware/efi/apple-properties.c
@@ -226,7 +226,7 @@ static int __init map_properties(void)
 	 */
 	data->len = 0;
 	memunmap(data);
-	memblock_free_late(pa_data + sizeof(*data), data_len);
+	memblock_phys_free(pa_data + sizeof(*data), data_len);
 
 	return ret;
 }
diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
index c4cf3552c018..512d9be9d513 100644
--- a/drivers/of/kexec.c
+++ b/drivers/of/kexec.c
@@ -175,7 +175,7 @@ int __init ima_free_kexec_buffer(void)
 	if (ret)
 		return ret;
 
-	memblock_free_late(addr, size);
+	memblock_phys_free(addr, size);
 	return 0;
 }
 #endif
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6ec5e9ac0699..6f6c5b5c4a4b 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -172,8 +172,6 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags,
 			  struct memblock_type *type_b, phys_addr_t *out_start,
 			  phys_addr_t *out_end, int *out_nid);
 
-void memblock_free_late(phys_addr_t base, phys_addr_t size);
-
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
 static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
 					phys_addr_t *out_start,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d8e6f1d889d5..e44e039e00d3 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -546,10 +546,10 @@ void __init swiotlb_exit(void)
 		free_pages(tbl_vaddr, get_order(tbl_size));
 		free_pages((unsigned long)mem->slots, get_order(slots_size));
 	} else {
-		memblock_free_late(__pa(mem->areas),
+		memblock_free(mem->areas,
 				   array_size(sizeof(*mem->areas), mem->nareas));
-		memblock_free_late(mem->start, tbl_size);
-		memblock_free_late(__pa(mem->slots), slots_size);
+		memblock_phys_free(mem->start, tbl_size);
+		memblock_free(mem->slots, slots_size);
 	}
 
 	memset(mem, 0, sizeof(*mem));
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 2da049216fe0..9225fa057c1e 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -64,7 +64,7 @@ static inline void __init xbc_free_mem(void *addr, size_t size, bool early)
 	if (early)
 		memblock_free(addr, size);
 	else if (addr)
-		memblock_free_late(__pa(addr), size);
+		memblock_free(addr, size);
 }
 
 #else /* !__KERNEL__ */
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7393957f9a20..5c8268af533e 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -731,10 +731,10 @@ static bool __init kfence_init_pool_early(void)
 	 * fails for the first page, and therefore expect addr==__kfence_pool in
 	 * most failure cases.
	 */
-	memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
+	memblock_free((void *)addr, KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
 	__kfence_pool = NULL;
 
-	memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE);
+	memblock_free(kfence_metadata_init, KFENCE_METADATA_SIZE);
 	kfence_metadata_init = NULL;
 
 	return false;
diff --git a/mm/memblock.c b/mm/memblock.c
index dee18c40d928..df4e3475fe39 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -385,26 +385,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
  */
 void __init memblock_discard(void)
 {
-	phys_addr_t addr, size;
+	phys_addr_t size;
+	void *addr;
 
 	if (memblock.reserved.regions != memblock_reserved_init_regions) {
-		addr = __pa(memblock.reserved.regions);
+		addr = memblock.reserved.regions;
 		size = PAGE_ALIGN(sizeof(struct memblock_region) *
 				  memblock.reserved.max);
 		if (memblock_reserved_in_slab)
-			kfree(memblock.reserved.regions);
+			kfree(addr);
 		else
-			memblock_free_late(addr, size);
+			memblock_free(addr, size);
 	}
 
 	if (memblock.memory.regions != memblock_memory_init_regions) {
-		addr = __pa(memblock.memory.regions);
+		addr = memblock.memory.regions;
 		size = PAGE_ALIGN(sizeof(struct memblock_region) *
 				  memblock.memory.max);
 		if (memblock_memory_in_slab)
-			kfree(memblock.memory.regions);
+			kfree(addr);
 		else
-			memblock_free_late(addr, size);
+			memblock_free(addr, size);
 	}
 
 	memblock_memory = NULL;
@@ -962,7 +963,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
  * @size: size of the boot memory block in bytes
 *
 * Free boot memory block previously allocated by memblock_alloc_xx() API.
- * The freeing memory will not be released to the buddy allocator.
+ * If called after the buddy allocator is available, the memory is released to
+ * the buddy allocator.
 */
 void __init_memblock memblock_free(void *ptr, size_t size)
 {
@@ -976,17 +978,24 @@ void __init_memblock memblock_free(void *ptr, size_t size)
 * @size: size of the boot memory block in bytes
 *
 * Free boot memory block previously allocated by memblock_phys_alloc_xx() API.
- * The freeing memory will not be released to the buddy allocator.
+ * If called after the buddy allocator is available, the memory is released to
+ * the buddy allocator.
 */
 int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
 {
 	phys_addr_t end = base + size - 1;
+	int ret;
 
 	memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
 		     &base, &end, (void *)_RET_IP_);
 
 	kmemleak_free_part_phys(base, size);
-	return memblock_remove_range(&memblock.reserved, base, size);
+	ret = memblock_remove_range(&memblock.reserved, base, size);
+
+	if (slab_is_available())
+		__free_reserved_area(base, base + size, -1);
+
+	return ret;
 }
 
 int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size,
@@ -1814,26 +1823,6 @@ void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
 	return addr;
 }
 
-/**
- * memblock_free_late - free pages directly to buddy allocator
- * @base: phys starting address of the boot memory block
- * @size: size of the boot memory block in bytes
- *
- * This is only useful when the memblock allocator has already been torn
- * down, but we are still initializing the system. Pages are released directly
- * to the buddy allocator.
- */ -void __init memblock_free_late(phys_addr_t base, phys_addr_t size) -{ - phys_addr_t end = base + size - 1; - - memblock_dbg("%s: [%pa-%pa] %pS\n", - __func__, &base, &end, (void *)_RET_IP_); - - kmemleak_free_part_phys(base, size); - __free_reserved_area(base, base + size, -1); -} - /* * Remaining API functions */ From 59bd1d914bb51ab99a33ce32420403ccd035ad29 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 23 Mar 2026 09:48:36 +0200 Subject: [PATCH 12/13] memblock: warn when freeing reserved memory before memory map is initialized When CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, freeing of reserved memory before the memory map is fully initialized in deferred_init_memmap() would cause access to uninitialized struct pages and may crash when accessing spurious list pointers, like was recently discovered during discussion about memory leaks in x86 EFI code [1]. The trace below is from an attempt to call free_reserved_page() before page_alloc_init_late(): [ 0.076840] BUG: unable to handle page fault for address: ffffce1a005a0788 [ 0.078226] #PF: supervisor read access in kernel mode [ 0.078226] #PF: error_code(0x0000) - not-present page [ 0.078226] PGD 0 P4D 0 [ 0.078226] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI [ 0.078226] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.12.68-92.123.amzn2023.x86_64 #1 [ 0.078226] Hardware name: Amazon EC2 t3a.nano/, BIOS 1.0 10/16/2017 [ 0.078226] RIP: 0010:__list_del_entry_valid_or_report+0x32/0xb0 ... [ 0.078226] __free_one_page+0x170/0x520 [ 0.078226] free_pcppages_bulk+0x151/0x1e0 [ 0.078226] free_unref_page_commit+0x263/0x320 [ 0.078226] free_unref_page+0x2c8/0x5b0 [ 0.078226] ? srso_return_thunk+0x5/0x5f [ 0.078226] free_reserved_page+0x1c/0x30 [ 0.078226] memblock_free_late+0x6c/0xc0 Currently there are not many callers of free_reserved_area() and they all appear to be at the right timings. Still, in order to protect against problematic code moves or additions of new callers add a warning that will inform that reserved pages cannot be freed until the memory map is fully initialized. 
[1] https://lore.kernel.org/all/e5d5a1105d90ee1e7fe7eafaed2ed03bbad0c46b.camel@kernel.crashing.org/

Link: https://patch.msgid.link/20260323074836.3653702-10-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft)
---
 mm/internal.h   | 10 ++++++++++
 mm/memblock.c   |  5 +++++
 mm/page_alloc.c | 10 ----------
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index cb0af847d7d9..f60c1edb2e02 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1233,7 +1233,17 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 DECLARE_STATIC_KEY_TRUE(deferred_pages);
 
+static inline bool deferred_pages_enabled(void)
+{
+	return static_branch_unlikely(&deferred_pages);
+}
+
 bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
+#else
+static inline bool deferred_pages_enabled(void)
+{
+	return false;
+}
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 void init_deferred_page(unsigned long pfn, int nid);
diff --git a/mm/memblock.c b/mm/memblock.c
index df4e3475fe39..6cf1de7a0dac 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -900,6 +900,11 @@ static unsigned long __free_reserved_area(phys_addr_t start, phys_addr_t end,
 {
 	unsigned long pages = 0, pfn;
 
+	if (deferred_pages_enabled()) {
+		WARN(1, "Cannot free reserved memory because of deferred initialization of the memory map");
+		return 0;
+	}
+
 	for_each_valid_pfn(pfn, PFN_UP(start), PFN_DOWN(end)) {
 		struct page *page = pfn_to_page(pfn);
 		void *direct_map_addr;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index df3d61253001..9ac47bab2ea7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -331,11 +331,6 @@ int page_group_by_mobility_disabled __read_mostly;
  */
 DEFINE_STATIC_KEY_TRUE(deferred_pages);
 
-static inline bool deferred_pages_enabled(void)
-{
-	return static_branch_unlikely(&deferred_pages);
-}
-
 /*
  * deferred_grow_zone() is __init, but it is called from
  * get_page_from_freelist() during early boot until deferred_pages permanently
@@ -348,11 +343,6 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
 	return deferred_grow_zone(zone, order);
 }
 #else
-static inline bool deferred_pages_enabled(void)
-{
-	return false;
-}
-
 static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
 {
 	return false;

From d5759519805c54786c00765ca1303e6d7a0676ca Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)"
Date: Mon, 30 Mar 2026 22:10:00 +0300
Subject: [PATCH 13/13] x86/alternative: delay freeing of smp_locks section

On SMP systems, alternative_instructions() frees the memory occupied by
the smp_locks section immediately after patching the lock instructions.
The memory is freed using free_init_pages(), which calls
free_reserved_area(), which essentially does __free_page() for every
page in the range.

Up until recently this didn't update the memblock state, so in cases
where CONFIG_ARCH_KEEP_MEMBLOCK is enabled (on x86 it is selected by
INTEL_TDX_HOST), the state of memblock and the memory map would be
inconsistent.

Additionally, with CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled, the
freeing of smp_locks happens before the memory map is fully
initialized, and freeing reserved memory may cause an access to a
not-yet-initialized struct page when __free_page() searches for a buddy
page.
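For intuition (a simplified sketch of the buddy lookup, not the actual
mm/page_alloc.c code), freeing a page examines its buddy's struct page,
which is exactly what blows up when that struct page is still
uninitialized:

	unsigned long buddy_pfn = pfn ^ (1UL << order);	/* buddy candidate */
	struct page *buddy = pfn_to_page(buddy_pfn);	/* list pointers are
							   garbage before the
							   deferred init runs */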
Following the discussion in [1], the implementations of
memblock_free_late() and free_reserved_area() were unified to ensure
that reserved memory that is freed after memblock has transferred its
pages to the buddy allocator is actually freed, and that memblock and
the memory map stay consistent. As a part of these changes,
free_reserved_area() now WARN()s when it is called before the
initialization of the memory map is complete.

The memory map is fully initialized in page_alloc_init_late(), which
completes before initcalls are executed, so it is safe to free reserved
memory in any initcall except early_initcall().

Move the freeing of the smp_locks section to an initcall to ensure it
happens after the memory map is fully initialized. Since it does not
matter exactly which initcall is used and the code lives in arch/, pick
arch_initcall.

[1] https://lore.kernel.org/all/ec2aaef14783869b3be6e3c253b2dcbf67dbc12a.camel@kernel.crashing.org

Reported-by: Bert Karwatzki
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-lkp/202603302154.b50adaf1-lkp@intel.com
Tested-by: Bert Karwatzki
Link: https://lore.kernel.org/r/20260327140109.7561-1-spasswolf@web.de
Acked-by: Borislav Petkov (AMD)
Fixes: b2129a39511b ("memblock: make free_reserved_area() update memblock if ARCH_KEEP_MEMBLOCK=y")
Signed-off-by: Mike Rapoport (Microsoft)
---
 arch/x86/kernel/alternative.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e87da25d1236..62936a3bde19 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2448,12 +2448,6 @@ void __init alternative_instructions(void)
 				    __smp_locks, __smp_locks_end,
 				    _text, _etext);
 	}
-
-	if (!uniproc_patched || num_possible_cpus() == 1) {
-		free_init_pages("SMP alternatives",
-				(unsigned long)__smp_locks,
-				(unsigned long)__smp_locks_end);
-	}
 #endif
 
 	restart_nmi();
@@ -2462,6 +2456,24 @@ void __init alternative_instructions(void)
 	alt_reloc_selftest();
 }
 
+#ifdef CONFIG_SMP
+/*
+ * With CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled we can free_init_pages() only
+ * after the deferred initialization of the memory map is complete.
+ */
+static int __init free_smp_locks(void)
+{
+	if (!uniproc_patched || num_possible_cpus() == 1) {
+		free_init_pages("SMP alternatives",
+				(unsigned long)__smp_locks,
+				(unsigned long)__smp_locks_end);
+	}
+
+	return 0;
+}
+arch_initcall(free_smp_locks);
+#endif
+
 /**
  * text_poke_early - Update instructions on a live kernel at boot time
  * @addr: address to modify