diff --git a/include/linux/mm.h b/include/linux/mm.h
index d7e53532a109..19619e5efeba 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4479,7 +4479,8 @@ int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
 			       int node, struct vmem_altmap *altmap);
 int vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap);
-int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
+int vmemmap_populate_hvo(unsigned long start, unsigned long end,
+			 unsigned int order, struct zone *zone,
 			 unsigned long headsize);
 void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
 			  unsigned long headsize);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0bef68e41f19..5c3ae0348754 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -81,13 +81,17 @@
  * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
  * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
  */
-#define MAX_FOLIO_ORDER		get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
+#ifdef CONFIG_64BIT
+#define MAX_FOLIO_ORDER		(ilog2(SZ_16G) - PAGE_SHIFT)
+#else
+#define MAX_FOLIO_ORDER		(ilog2(SZ_1G) - PAGE_SHIFT)
+#endif
 #else
 /*
  * Without hugetlb, gigantic folios that are bigger than a single PUD are
  * currently impossible.
  */
-#define MAX_FOLIO_ORDER		PUD_ORDER
+#define MAX_FOLIO_ORDER		(PUD_SHIFT - PAGE_SHIFT)
 #endif
 
 #define MAX_FOLIO_NR_PAGES	(1UL << MAX_FOLIO_ORDER)
@@ -103,6 +107,14 @@
 	 is_power_of_2(sizeof(struct page)) ? \
 	 MAX_FOLIO_NR_PAGES * sizeof(struct page) : 0)
 
+/*
+ * vmemmap optimization (like HVO) is only possible for page orders that fill
+ * two or more pages with struct pages.
+ */
+#define VMEMMAP_TAIL_MIN_ORDER	(ilog2(2 * PAGE_SIZE / sizeof(struct page)))
+#define __NR_VMEMMAP_TAILS	(MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1)
+#define NR_VMEMMAP_TAILS	(__NR_VMEMMAP_TAILS > 0 ? __NR_VMEMMAP_TAILS : 0)
+
 enum migratetype {
 	MIGRATE_UNMOVABLE,
 	MIGRATE_MOVABLE,
@@ -1113,6 +1125,9 @@ struct zone {
 	/* Zone statistics */
 	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
 	atomic_long_t		vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+	struct page *vmemmap_tails[NR_VMEMMAP_TAILS];
+#endif
 } ____cacheline_internodealigned_in_smp;
 
 enum pgdat_flags {
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 3628fb5b2a28..92330f172eb7 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include "hugetlb_vmemmap.h"
+#include "internal.h"
 
 /**
  * struct vmemmap_remap_walk - walk vmemmap page table
@@ -505,6 +506,40 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *
 	return true;
 }
 
+/*
+ * Return the page of tail struct pages that @zone keeps for folios of @order,
+ * allocating and initializing it on first use. If another caller installs a
+ * page first, the freshly allocated one is freed and the installed page is
+ * returned.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct page *vmemmap_get_tail(unsigned int order, struct zone *zone)
+{
+	const unsigned int idx = order - VMEMMAP_TAIL_MIN_ORDER;
+	struct page *tail, *p;
+	int node = zone_to_nid(zone);
+
+	tail = READ_ONCE(zone->vmemmap_tails[idx]);
+	if (likely(tail))
+		return tail;
+
+	tail = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+	if (!tail)
+		return NULL;
+
+	p = page_to_virt(tail);
+	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
+		init_compound_tail(p + i, NULL, order, zone);
+
+	if (cmpxchg(&zone->vmemmap_tails[idx], NULL, tail)) {
+		__free_page(tail);
+		tail = READ_ONCE(zone->vmemmap_tails[idx]);
+	}
+
+	return tail;
+}
+
 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 					    struct folio *folio,
 					    struct list_head *vmemmap_pages,
@@ -520,6 +555,11 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	if (!vmemmap_should_optimize_folio(h, folio))
 		return ret;
 
+	nid = folio_nid(folio);
+	vmemmap_tail = vmemmap_get_tail(h->order, folio_zone(folio));
+	if (!vmemmap_tail)
+		return -ENOMEM;
+
 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
 
 	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
@@ -537,7 +577,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	 */
 	folio_set_hugetlb_vmemmap_optimized(folio);
 
-	nid = folio_nid(folio);
 	vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
 	if (!vmemmap_head) {
 		ret = -ENOMEM;
@@ -548,7 +587,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	list_add(&vmemmap_head->lru, vmemmap_pages);
 	memmap_pages_add(1);
 
-	vmemmap_tail	= vmemmap_head;
 	vmemmap_start	= (unsigned long)&folio->page;
 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
 
@@ -776,11 +814,27 @@ void __init hugetlb_vmemmap_init_early(int nid)
 	}
 }
 
+/* Find the zone of node @nid that spans @pfn, or NULL if there is none. */
+static struct zone *pfn_to_zone(int nid, unsigned long pfn)
+{
+	struct zone *zone;
+	enum zone_type zone_type;
+
+	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+		zone = &NODE_DATA(nid)->node_zones[zone_type];
+		if (zone_spans_pfn(zone, pfn))
+			return zone;
+	}
+
+	return NULL;
+}
+
 void __init hugetlb_vmemmap_init_late(int nid)
 {
 	struct huge_bootmem_page *m, *tm;
 	unsigned long phys, nr_pages, start, end;
 	unsigned long pfn, nr_mmap;
+	struct zone *zone = NULL;
 	struct hstate *h;
 	void *map;
 
@@ -814,7 +868,12 @@ void __init hugetlb_vmemmap_init_late(int nid)
 			continue;
 		}
 
-		if (vmemmap_populate_hvo(start, end, nid,
+		if (!zone || !zone_spans_pfn(zone, pfn))
+			zone = pfn_to_zone(nid, pfn);
+
+		/* No zone means no shared tail page: populate normally. */
+		if (WARN_ON_ONCE(!zone) ||
+		    vmemmap_populate_hvo(start, end, huge_page_order(h), zone,
 					HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) {
 			/* Fallback if HVO population fails */
 			vmemmap_populate(start, end, nid, NULL);
@@ -842,10 +901,27 @@ static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
 static int __init hugetlb_vmemmap_init(void)
 {
 	const struct hstate *h;
+	struct zone *zone;
 
 	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
 	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
 
+	for_each_zone(zone) {
+		for (int i = 0; i < NR_VMEMMAP_TAILS; i++) {
+			struct page *tail, *p;
+			unsigned int order;
+
+			tail = zone->vmemmap_tails[i];
+			if (!tail)
+				continue;
+
+			order = i + VMEMMAP_TAIL_MIN_ORDER;
+			p = page_to_virt(tail);
+			for (int j = 0; j < PAGE_SIZE / sizeof(struct page); j++)
+				init_compound_tail(p + j, NULL, order, zone);
+		}
+	}
+
 	for_each_hstate(h) {
 		if (hugetlb_vmemmap_optimizable(h)) {
 			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
diff --git a/mm/internal.h b/mm/internal.h
index 9cfbd8e41914..84167b0570c9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -905,6 +905,19 @@ static inline void prep_compound_tail(struct page *tail,
 	set_page_private(tail, 0);
 }
 
+/*
+ * Initialize @tail as a tail struct page of @order living in @zone: set the
+ * mapcount to -1, record node and zone, then apply prep_compound_tail().
+ */
+static inline void init_compound_tail(struct page *tail,
+		const struct page *head, unsigned int order, struct zone *zone)
+{
+	atomic_set(&tail->_mapcount, -1);
+	set_page_node(tail, zone_to_nid(zone));
+	set_page_zone(tail, zone_idx(zone));
+	prep_compound_tail(tail, head, order);
+}
+
 void post_alloc_hook(struct page *page, unsigned int order,
 					gfp_t gfp_flags);
 extern bool free_pages_prepare(struct page *page, unsigned int order);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 032a81450838..842ed2f0bce6 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -325,16 +325,59 @@ void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
 	}
 }
 
-/*
- * Populate vmemmap pages HVO-style. The first page contains the head
- * page and needed tail pages, the other ones are mirrors of the first
- * page.
- */
-int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
-				       int node, unsigned long headsize)
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+static __meminit struct page *vmemmap_get_tail(unsigned int order, struct zone *zone)
+{
+	struct page *p, *tail;
+	unsigned int idx;
+	int node = zone_to_nid(zone);
+
+	if (WARN_ON_ONCE(order < VMEMMAP_TAIL_MIN_ORDER))
+		return NULL;
+	if (WARN_ON_ONCE(order > MAX_FOLIO_ORDER))
+		return NULL;
+
+	idx = order - VMEMMAP_TAIL_MIN_ORDER;
+	tail = zone->vmemmap_tails[idx];
+	if (tail)
+		return tail;
+
+	/*
+	 * Only allocate the page, but do not initialize it.
+	 *
+	 * Any initialization done here will be overwritten by memmap_init().
+	 *
+	 * hugetlb_vmemmap_init() will take care of initialization after
+	 * memmap_init().
+	 */
+
+	p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
+	if (!p)
+		return NULL;
+
+	tail = virt_to_page(p);
+	zone->vmemmap_tails[idx] = tail;
+
+	return tail;
+}
+
+/*
+ * Populate vmemmap pages HVO-style. The first @headsize bytes are populated
+ * page by page; the rest of the range maps the shared tail page that @zone
+ * keeps for @order.
+ */
+int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
+				       unsigned int order, struct zone *zone,
+				       unsigned long headsize)
 {
-	pte_t *pte;
 	unsigned long maddr;
+	struct page *tail;
+	pte_t *pte;
+	int node = zone_to_nid(zone);
+
+	tail = vmemmap_get_tail(order, zone);
+	if (!tail)
+		return -ENOMEM;
 
 	for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
 		pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
@@ -346,8 +389,9 @@ int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
-	 * Reuse the last page struct page mapped above for the rest.
+	 * Map the rest of the range to the zone's shared tail page.
 	 */
 	return vmemmap_populate_range(maddr, end, node, NULL,
-					pte_pfn(ptep_get(pte)), 0);
+					page_to_pfn(tail), 0);
 }
+#endif
 
 void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
 				      unsigned long addr, unsigned long next)