From 5aa2a02b985f36a9042b2c7fa63a15de096effb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 23 Feb 2026 22:37:43 +0100 Subject: [PATCH 01/11] mm/slab: create sysfs attribute through default_groups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver core can automatically create custom type attributes. This makes the code and error-handling shorter. Signed-off-by: Thomas Weißschuh Reviewed-by: Harry Yoo Link: https://patch.msgid.link/20260223-sysfs-const-slub-v1-1-ff86ffc26fff@weissschuh.net Signed-off-by: Vlastimil Babka (SUSE) --- mm/slub.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 862642c165ed..a48ea23b1728 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -9317,9 +9317,7 @@ static struct attribute *slab_attrs[] = { NULL }; -static const struct attribute_group slab_attr_group = { - .attrs = slab_attrs, -}; +ATTRIBUTE_GROUPS(slab); static ssize_t slab_attr_show(struct kobject *kobj, struct attribute *attr, @@ -9366,6 +9364,7 @@ static const struct sysfs_ops slab_sysfs_ops = { static const struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, .release = kmem_cache_release, + .default_groups = slab_groups, }; static struct kset *slab_kset; @@ -9453,10 +9452,6 @@ static int sysfs_slab_add(struct kmem_cache *s) if (err) goto out; - err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) - goto out_del_kobj; - if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); @@ -9465,9 +9460,6 @@ static int sysfs_slab_add(struct kmem_cache *s) if (!unmergeable) kfree(name); return err; -out_del_kobj: - kobject_del(&s->kobj); - goto out; } void sysfs_slab_unlink(struct kmem_cache *s) From 9042e77a5c29d42a56540b9402c8cc01b1c126e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 23 Feb 2026 22:37:44 +0100 Subject: [PATCH 02/11] mm/slab: constify sysfs attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These attributes are never modified, make them read-only. 
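For reference, a minimal sketch of the .default_groups pattern that patch 01 switches to, using generic example names rather than the slub ones: the kobject core creates the groups listed in .default_groups when the kobject is added and removes them when it is deleted, so the explicit sysfs_create_group() call and its error unwinding can be dropped.

```
/* Illustrative names only; not the slub code. */
#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sysfs_emit(buf, "example\n");
}
static struct kobj_attribute foo_attr = __ATTR_RO(foo);

static struct attribute *example_attrs[] = {
	&foo_attr.attr,
	NULL
};
ATTRIBUTE_GROUPS(example);	/* defines example_groups[] */

static void example_release(struct kobject *kobj)
{
	/* free the object embedding the kobject here */
}

static const struct kobj_type example_ktype = {
	.sysfs_ops	= &kobj_sysfs_ops,
	.release	= example_release,
	/* created on kobject_add(), removed on kobject_del()/put() */
	.default_groups	= example_groups,
};
```

Registering with kobject_init_and_add(&obj->kobj, &example_ktype, parent, "name") then creates the attribute files automatically. Patch 02 below additionally constifies the slub attribute definitions; the sketch above keeps the traditional non-const declarations, which compile on any recent kernel.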
Signed-off-by: Thomas Weißschuh Reviewed-by: Harry Yoo Link: https://patch.msgid.link/20260223-sysfs-const-slub-v1-2-ff86ffc26fff@weissschuh.net Signed-off-by: Vlastimil Babka (SUSE) --- mm/slub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a48ea23b1728..73051cf77353 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -8833,7 +8833,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, return len; } -#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) +#define to_slab_attr(n) container_of_const(n, struct slab_attribute, attr) #define to_slab(n) container_of(n, struct kmem_cache, kobj) struct slab_attribute { @@ -8843,10 +8843,10 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) + static const struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) #define SLAB_ATTR(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) + static const struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -9240,7 +9240,7 @@ static ssize_t skip_kfence_store(struct kmem_cache *s, SLAB_ATTR(skip_kfence); #endif -static struct attribute *slab_attrs[] = { +static const struct attribute *const slab_attrs[] = { &slab_size_attr.attr, &object_size_attr.attr, &objs_per_slab_attr.attr, @@ -9323,7 +9323,7 @@ static ssize_t slab_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct slab_attribute *attribute; + const struct slab_attribute *attribute; struct kmem_cache *s; attribute = to_slab_attr(attr); @@ -9339,7 +9339,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { - struct slab_attribute *attribute; + const struct slab_attribute *attribute; struct kmem_cache *s; attribute = to_slab_attr(attr); From 69d73421b76e3d952076be2e17cea42de90d126a Mon Sep 17 00:00:00 2001 From: "Vlastimil Babka (SUSE)" Date: Wed, 11 Mar 2026 19:22:33 +0100 Subject: [PATCH 03/11] slab: remove alloc_full_sheaf() The function allocates and then refills an empty sheaf. It's only called from __pcs_replace_empty_main(), which can also in some cases refill an empty sheaf. We can therefore consolidate this code. Remove alloc_full_sheaf() and refactor __pcs_replace_empty_main() so it will call alloc_empty_sheaf() when necessary, and then use the pre-existing refill_sheaf(). The result should be simpler to follow and less duplicated code. Also adjust the comment about returning sheaves to the barn; the part about where the empty sheaf we'd be returning comes from is incorrect. No functional change intended.
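For orientation before the diff, the consolidated path in __pcs_replace_empty_main() then takes roughly the following shape. This is a condensed paraphrase using the helper names from the patch; the locking, statistics and the hand-off of surplus sheaves to the barn are omitted.

```
/* Condensed sketch only; see the diff below for the real code. */
static struct slab_sheaf *
refill_or_alloc_sheaf_sketch(struct kmem_cache *s, struct slab_sheaf *empty,
			     gfp_t gfp)
{
	/* no empty sheaf obtained from the barn: allocate a bare one */
	if (!empty) {
		empty = alloc_empty_sheaf(s, gfp);
		if (!empty)
			return NULL;
	}

	/* single refill path, formerly duplicated in alloc_full_sheaf() */
	if (refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
		/* very low on memory, don't bother caching the sheaf */
		sheaf_flush_unused(s, empty);
		free_empty_sheaf(s, empty);
		return NULL;
	}

	/* the formerly empty sheaf is now the full one */
	return empty;
}
```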
Reviewed-by: Qing Wang Reviewed-by: Harry Yoo Reviewed-by: Hao Li Link: https://patch.msgid.link/20260311-b4-slab-remove-alloc_full_sheaf-v1-1-c4c5bb587ae5@kernel.org Signed-off-by: Vlastimil Babka (SUSE) --- mm/slub.c | 57 +++++++++++++++++++------------------------------------ 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 2b2d33cc735c..a8347b79e46f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2822,24 +2822,6 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, return 0; } -static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf); - -static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) -{ - struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); - - if (!sheaf) - return NULL; - - if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) { - sheaf_flush_unused(s, sheaf); - free_empty_sheaf(s, sheaf); - return NULL; - } - - return sheaf; -} - /* * Maximum number of objects freed during a single flush of main pcs sheaf. * Translates directly to an on-stack array size. @@ -4611,34 +4593,35 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, if (!allow_spin) return NULL; - if (empty) { - if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) { - full = empty; - } else { - /* - * we must be very low on memory so don't bother - * with the barn - */ - sheaf_flush_unused(s, empty); - free_empty_sheaf(s, empty); - } - } else { - full = alloc_full_sheaf(s, gfp); + if (!empty) { + empty = alloc_empty_sheaf(s, gfp); + if (!empty) + return NULL; } - if (!full) + if (refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC | __GFP_NOWARN)) { + /* + * we must be very low on memory so don't bother + * with the barn + */ + sheaf_flush_unused(s, empty); + free_empty_sheaf(s, empty); + return NULL; + } + + full = empty; + empty = NULL; if (!local_trylock(&s->cpu_sheaves->lock)) goto barn_put; pcs = this_cpu_ptr(s->cpu_sheaves); /* - * If we are returning empty sheaf, we either got it from the - * barn or had to allocate one. If we are returning a full - * sheaf, it's due to racing or being migrated to a different - * cpu. Breaching the barn's sheaf limits should be thus rare - * enough so just ignore them to simplify the recovery. + * If we put any empty or full sheaf to the barn below, it's due to + * racing or being migrated to a different cpu. Breaching the barn's + * sheaf limits should be thus rare enough so just ignore them to + * simplify the recovery. */ if (pcs->main->size == 0) { From 5ba6bc27b1f99b35aa528409a8e223136c59e0af Mon Sep 17 00:00:00 2001 From: "Vlastimil Babka (SUSE)" Date: Wed, 11 Mar 2026 09:25:55 +0100 Subject: [PATCH 04/11] slab: decouple pointer to barn from kmem_cache_node The pointer to barn currently exists in struct kmem_cache_node. That struct is instantiated for every NUMA node with memory, but we want to have a barn for every online node (including memoryless). Thus decouple the two structures. In struct kmem_cache we have an array for kmem_cache_node pointers that appears to be sized MAX_NUMNODES but the actual size calculation in kmem_cache_init() uses nr_node_ids. Therefore we can't just add another array of barn pointers. Instead change the array to newly introduced struct kmem_cache_per_node_ptrs holding both kmem_cache_node and barn pointer. Adjust barn accessor and allocation/initialization code accordingly. For now no functional change intended, barns are created 1:1 together with kmem_cache_nodes. 
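The key definitions from the diff below, pulled out for readability. The per_node[] array keeps its MAX_NUMNODES declaration, but kmem_cache_init() sizes the boot cache as offsetof(struct kmem_cache, per_node) + nr_node_ids * sizeof(struct kmem_cache_per_node_ptrs), so only the slots for possible nodes are actually allocated.

```
/* As introduced by the diff below. */
struct kmem_cache_per_node_ptrs {
	struct node_barn *barn;
	struct kmem_cache_node *node;
};

static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
	return s->per_node[node].node;
}

static inline struct node_barn *get_barn_node(struct kmem_cache *s, int node)
{
	return s->per_node[node].barn;
}
```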
Link: https://patch.msgid.link/20260311-b4-slab-memoryless-barns-v1-1-70ab850be4ce@kernel.org Signed-off-by: Vlastimil Babka (SUSE) Reviewed-by: Harry Yoo Reviewed-by: Hao Li --- mm/slab.h | 7 ++- mm/slub.c | 130 ++++++++++++++++++++++++++++++------------------------ 2 files changed, 79 insertions(+), 58 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index e9ab292acd22..c735e6b4dddb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -191,6 +191,11 @@ struct kmem_cache_order_objects { unsigned int x; }; +struct kmem_cache_per_node_ptrs { + struct node_barn *barn; + struct kmem_cache_node *node; +}; + /* * Slab cache management. */ @@ -247,7 +252,7 @@ struct kmem_cache { struct kmem_cache_stats __percpu *cpu_stats; #endif - struct kmem_cache_node *node[MAX_NUMNODES]; + struct kmem_cache_per_node_ptrs per_node[MAX_NUMNODES]; }; /* diff --git a/mm/slub.c b/mm/slub.c index a8347b79e46f..6f65cc136108 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -59,7 +59,7 @@ * 0. cpu_hotplug_lock * 1. slab_mutex (Global Mutex) * 2a. kmem_cache->cpu_sheaves->lock (Local trylock) - * 2b. node->barn->lock (Spinlock) + * 2b. barn->lock (Spinlock) * 2c. node->list_lock (Spinlock) * 3. slab_lock(slab) (Only on some arches) * 4. object_map_lock (Only for debugging) @@ -136,7 +136,7 @@ * or spare sheaf can handle the allocation or free, there is no other * overhead. * - * node->barn->lock (spinlock) + * barn->lock (spinlock) * * This lock protects the operations on per-NUMA-node barn. It can quickly * serve an empty or full sheaf if available, and avoid more expensive refill @@ -436,26 +436,24 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif - struct node_barn *barn; }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { - return s->node[node]; + return s->per_node[node].node; +} + +static inline struct node_barn *get_barn_node(struct kmem_cache *s, int node) +{ + return s->per_node[node].barn; } /* - * Get the barn of the current cpu's closest memory node. It may not exist on - * systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES + * Get the barn of the current cpu's NUMA node. It may be a memoryless node. 
*/ static inline struct node_barn *get_barn(struct kmem_cache *s) { - struct kmem_cache_node *n = get_node(s, numa_mem_id()); - - if (!n) - return NULL; - - return n->barn; + return get_barn_node(s, numa_mem_id()); } /* @@ -5771,7 +5769,6 @@ bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) static void rcu_free_sheaf(struct rcu_head *head) { - struct kmem_cache_node *n; struct slab_sheaf *sheaf; struct node_barn *barn = NULL; struct kmem_cache *s; @@ -5794,12 +5791,10 @@ static void rcu_free_sheaf(struct rcu_head *head) if (__rcu_free_sheaf_prepare(s, sheaf)) goto flush; - n = get_node(s, sheaf->node); - if (!n) + barn = get_barn_node(s, sheaf->node); + if (!barn) goto flush; - barn = n->barn; - /* due to slab_free_hook() */ if (unlikely(sheaf->size == 0)) goto empty; @@ -7410,7 +7405,7 @@ static inline int calculate_order(unsigned int size) } static void -init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) +init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -7420,9 +7415,6 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif - n->barn = barn; - if (barn) - barn_init(barn); } #ifdef CONFIG_SLUB_STATS @@ -7517,8 +7509,8 @@ static void early_kmem_cache_node_alloc(int node) n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; - kmem_cache_node->node[node] = n; - init_kmem_cache_node(n, NULL); + kmem_cache_node->per_node[node].node = n; + init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); /* @@ -7533,15 +7525,20 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) int node; struct kmem_cache_node *n; - for_each_kmem_cache_node(s, node, n) { - if (n->barn) { - WARN_ON(n->barn->nr_full); - WARN_ON(n->barn->nr_empty); - kfree(n->barn); - n->barn = NULL; - } + for_each_node(node) { + struct node_barn *barn = get_barn_node(s, node); - s->node[node] = NULL; + if (!barn) + continue; + + WARN_ON(barn->nr_full); + WARN_ON(barn->nr_empty); + kfree(barn); + s->per_node[node].barn = NULL; + } + + for_each_kmem_cache_node(s, node, n) { + s->per_node[node].node = NULL; kmem_cache_free(kmem_cache_node, n); } } @@ -7562,31 +7559,36 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; - struct node_barn *barn = NULL; if (slab_state == DOWN) { early_kmem_cache_node_alloc(node); continue; } - if (cache_has_sheaves(s)) { - barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); - - if (!barn) - return 0; - } - n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - if (!n) { - kfree(barn); + if (!n) return 0; - } - init_kmem_cache_node(n, barn); - - s->node[node] = n; + init_kmem_cache_node(n); + s->per_node[node].node = n; } + + if (slab_state == DOWN || !cache_has_sheaves(s)) + return 1; + + for_each_node_mask(node, slab_nodes) { + struct node_barn *barn; + + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) + return 0; + + barn_init(barn); + s->per_node[node].barn = barn; + } + return 1; } @@ -7875,10 +7877,15 @@ int __kmem_cache_shutdown(struct kmem_cache *s) if (cache_has_sheaves(s)) rcu_barrier(); + for_each_node(node) { + struct node_barn *barn = get_barn_node(s, node); + + if (barn) + barn_shrink(s, barn); + } + /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { - if (n->barn) - barn_shrink(s, 
n->barn); free_partial(s, n); if (n->nr_partial || node_nr_slabs(n)) return 1; @@ -8088,14 +8095,18 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) unsigned long flags; int ret = 0; + for_each_node(node) { + struct node_barn *barn = get_barn_node(s, node); + + if (barn) + barn_shrink(s, barn); + } + for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); - if (n->barn) - barn_shrink(s, n->barn); - spin_lock_irqsave(&n->list_lock, flags); /* @@ -8184,7 +8195,8 @@ static int slab_mem_going_online_callback(int nid) if (get_node(s, nid)) continue; - if (cache_has_sheaves(s)) { + if (cache_has_sheaves(s) && !get_barn_node(s, nid)) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); if (!barn) { @@ -8205,13 +8217,17 @@ static int slab_mem_going_online_callback(int nid) goto out; } - init_kmem_cache_node(n, barn); + init_kmem_cache_node(n); + s->per_node[nid].node = n; - s->node[nid] = n; + if (barn) { + barn_init(barn); + s->per_node[nid].barn = barn; + } } /* * Any cache created after this point will also have kmem_cache_node - * initialized for the new node. + * and barn initialized for the new node. */ node_set(nid, slab_nodes); out: @@ -8303,7 +8319,7 @@ static void __init bootstrap_cache_sheaves(struct kmem_cache *s) } barn_init(barn); - get_node(s, node)->barn = barn; + s->per_node[node].barn = barn; } for_each_possible_cpu(cpu) { @@ -8374,8 +8390,8 @@ void __init kmem_cache_init(void) slab_state = PARTIAL; create_boot_cache(kmem_cache, "kmem_cache", - offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *), + offsetof(struct kmem_cache, per_node) + + nr_node_ids * sizeof(struct kmem_cache_per_node_ptrs), SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); From 7f693882f00963fd7808e333b86a87e0f9b9873b Mon Sep 17 00:00:00 2001 From: "Vlastimil Babka (SUSE)" Date: Wed, 11 Mar 2026 09:25:56 +0100 Subject: [PATCH 05/11] slab: create barns for online memoryless nodes Ming Lei has reported [1] a performance regression due to replacing cpu (partial) slabs with sheaves. With slub stats enabled, a large amount of slowpath allocations were observed. The affected system has 8 online NUMA nodes but only 2 have memory. For sheaves to work effectively on given cpu, its NUMA node has to have struct node_barn allocated. Those are currently only allocated on nodes with memory (N_MEMORY) where kmem_cache_node also exist as the goal is to cache only node-local objects. But in order to have good performance on a memoryless node, we need its barn to exist and use sheaves to cache non-local objects (as no local objects can exist anyway). Therefore change the implementation to allocate barns on all online nodes, tracked in a new nodemask slab_barn_nodes. Also add a cpu hotplug callback as that's when a memoryless node can become online. Change both get_barn() and rcu_sheaf->node assignment to numa_node_id() so it's returned to the barn of the local cpu's (potentially memoryless) node, and not to the nearest node with memory anymore. On systems with CONFIG_HAVE_MEMORYLESS_NODES=y (which are not the main target of this change) barns did not exist on memoryless nodes, but get_barn() using numa_mem_id() meant a barn was returned from the nearest node with memory. This works, but the barn lock contention increases with every such memoryless node. 
With this change, barn will be allocated also on the memoryless node, reducing this contention in exchange for increased memory consumption. Reported-by: Ming Lei Link: https://lore.kernel.org/all/aZ0SbIqaIkwoW2mB@fedora/ [1] Link: https://patch.msgid.link/20260311-b4-slab-memoryless-barns-v1-2-70ab850be4ce@kernel.org Signed-off-by: Vlastimil Babka (SUSE) Reviewed-by: Harry Yoo Reviewed-by: Hao Li --- mm/slub.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 6f65cc136108..73fe34014c6e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -453,7 +453,7 @@ static inline struct node_barn *get_barn_node(struct kmem_cache *s, int node) */ static inline struct node_barn *get_barn(struct kmem_cache *s) { - return get_barn_node(s, numa_mem_id()); + return get_barn_node(s, numa_node_id()); } /* @@ -472,6 +472,12 @@ static inline struct node_barn *get_barn(struct kmem_cache *s) */ static nodemask_t slab_nodes; +/* + * Similar to slab_nodes but for where we have node_barn allocated. + * Corresponds to N_ONLINE nodes. + */ +static nodemask_t slab_barn_nodes; + /* * Workqueue used for flushing cpu and kfree_rcu sheaves. */ @@ -4062,6 +4068,51 @@ void flush_all_rcu_sheaves(void) rcu_barrier(); } +static int slub_cpu_setup(unsigned int cpu) +{ + int nid = cpu_to_node(cpu); + struct kmem_cache *s; + int ret = 0; + + /* + * we never clear a nid so it's safe to do a quick check before taking + * the mutex, and then recheck to handle parallel cpu hotplug safely + */ + if (node_isset(nid, slab_barn_nodes)) + return 0; + + mutex_lock(&slab_mutex); + + if (node_isset(nid, slab_barn_nodes)) + goto out; + + list_for_each_entry(s, &slab_caches, list) { + struct node_barn *barn; + + /* + * barn might already exist if a previous callback failed midway + */ + if (!cache_has_sheaves(s) || get_barn_node(s, nid)) + continue; + + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); + + if (!barn) { + ret = -ENOMEM; + goto out; + } + + barn_init(barn); + s->per_node[nid].barn = barn; + } + node_set(nid, slab_barn_nodes); + +out: + mutex_unlock(&slab_mutex); + + return ret; +} + /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. @@ -5916,7 +5967,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) rcu_sheaf = NULL; } else { pcs->rcu_free = NULL; - rcu_sheaf->node = numa_mem_id(); + rcu_sheaf->node = numa_node_id(); } /* @@ -7577,7 +7628,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) if (slab_state == DOWN || !cache_has_sheaves(s)) return 1; - for_each_node_mask(node, slab_nodes) { + for_each_node_mask(node, slab_barn_nodes) { struct node_barn *barn; barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); @@ -8230,6 +8281,7 @@ static int slab_mem_going_online_callback(int nid) * and barn initialized for the new node. 
*/ node_set(nid, slab_nodes); + node_set(nid, slab_barn_nodes); out: mutex_unlock(&slab_mutex); return ret; @@ -8308,7 +8360,7 @@ static void __init bootstrap_cache_sheaves(struct kmem_cache *s) if (!capacity) return; - for_each_node_mask(node, slab_nodes) { + for_each_node_mask(node, slab_barn_nodes) { struct node_barn *barn; barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); @@ -8380,6 +8432,9 @@ void __init kmem_cache_init(void) for_each_node_state(node, N_MEMORY) node_set(node, slab_nodes); + for_each_online_node(node) + node_set(node, slab_barn_nodes); + create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); @@ -8406,7 +8461,7 @@ void __init kmem_cache_init(void) /* Setup random freelists for each cache */ init_freelist_randomization(); - cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, + cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", slub_cpu_setup, slub_cpu_dead); pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", From e65d430111a5ba83598b03a4aca4799eb295eef1 Mon Sep 17 00:00:00 2001 From: "Vlastimil Babka (SUSE)" Date: Wed, 11 Mar 2026 09:25:57 +0100 Subject: [PATCH 06/11] slab: free remote objects to sheaves on memoryless nodes On memoryless nodes we can now allocate from cpu sheaves and refill them normally. But when a node is memoryless on a system without actual CONFIG_HAVE_MEMORYLESS_NODES support, freeing always uses the slowpath because all objects appear as remote. We could instead benefit from the freeing fastpath, because the allocations can't obtain local objects anyway if the node is memoryless. Thus adapt the locality check when freeing, and move them to an inline function can_free_to_pcs() for a single shared implementation. On configurations with CONFIG_HAVE_MEMORYLESS_NODES=y continue using numa_mem_id() so the percpu sheaves and barn on a memoryless node will contain mostly objects from the closest memory node (returned by numa_mem_id()). No change is thus intended for such configuration. On systems with CONFIG_HAVE_MEMORYLESS_NODES=n use numa_node_id() (the cpu's node) since numa_mem_id() just aliases it anyway. But if we are freeing on a memoryless node, allow the freeing to use percpu sheaves for objects from any node, since they are all remote anyway. This way we avoid the slowpath and get more performant freeing. The potential downside is that allocations will obtain objects with a larger average distance. If we kept bypassing the sheaves on freeing, a refill of sheaves from slabs would tend to get closer objects thanks to the ordering of the zonelist. Architectures that allow de-facto memoryless nodes without proper CONFIG_HAVE_MEMORYLESS_NODES support should perhaps consider adding such support. 
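The freeing policy described above boils down to the following logically equivalent restatement of can_free_to_pcs(). This is a simplified sketch; the in-tree version in the diff below is written with gotos and likely()/unlikely() annotations, and a later patch in this series switches the node_state() check from N_MEMORY to N_NORMAL_MEMORY.

```
/* Simplified, logically equivalent sketch of can_free_to_pcs(). */
static bool can_free_to_pcs_sketch(struct slab *slab)
{
	/* pfmemalloc slabs are never cached in percpu sheaves */
	if (slab_test_pfmemalloc(slab))
		return false;

	if (!IS_ENABLED(CONFIG_NUMA))
		return true;

	/* only objects from the closest node with memory */
	if (IS_ENABLED(CONFIG_HAVE_MEMORYLESS_NODES))
		return slab_nid(slab) == numa_mem_id();

	/*
	 * Object from this cpu's node, or this cpu's node is memoryless so
	 * every object is remote anyway: cache it in the sheaves.
	 */
	return slab_nid(slab) == numa_node_id() ||
	       !node_state(numa_node_id(), N_MEMORY);
}
```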
Link: https://patch.msgid.link/20260311-b4-slab-memoryless-barns-v1-3-70ab850be4ce@kernel.org Signed-off-by: Vlastimil Babka (SUSE) Reviewed-by: Harry Yoo Reviewed-by: Hao Li --- mm/slub.c | 67 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 73fe34014c6e..492ef5927e58 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5989,6 +5989,56 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) return false; } +static __always_inline bool can_free_to_pcs(struct slab *slab) +{ + int slab_node; + int numa_node; + + if (!IS_ENABLED(CONFIG_NUMA)) + goto check_pfmemalloc; + + slab_node = slab_nid(slab); + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES + /* + * numa_mem_id() points to the closest node with memory so only allow + * objects from that node to the percpu sheaves + */ + numa_node = numa_mem_id(); + + if (likely(slab_node == numa_node)) + goto check_pfmemalloc; +#else + + /* + * numa_mem_id() is only a wrapper to numa_node_id() which is where this + * cpu belongs to, but it might be a memoryless node anyway. We don't + * know what the closest node is. + */ + numa_node = numa_node_id(); + + /* freed object is from this cpu's node, proceed */ + if (likely(slab_node == numa_node)) + goto check_pfmemalloc; + + /* + * Freed object isn't from this cpu's node, but that node is memoryless. + * Proceed as it's better to cache remote objects than falling back to + * the slowpath for everything. The allocation side can never obtain + * a local object anyway, if none exist. We don't have numa_mem_id() to + * point to the closest node as we would on a proper memoryless node + * setup. + */ + if (unlikely(!node_state(numa_node, N_MEMORY))) + goto check_pfmemalloc; +#endif + + return false; + +check_pfmemalloc: + return likely(!slab_test_pfmemalloc(slab)); +} + /* * Bulk free objects to the percpu sheaves. 
* Unlike free_to_pcs() this includes the calls to all necessary hooks @@ -6003,7 +6053,6 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) struct node_barn *barn; void *remote_objects[PCS_BATCH_MAX]; unsigned int remote_nr = 0; - int node = numa_mem_id(); next_remote_batch: while (i < size) { @@ -6017,8 +6066,7 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) continue; } - if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) - || slab_test_pfmemalloc(slab))) { + if (unlikely(!can_free_to_pcs(slab))) { remote_objects[remote_nr] = p[i]; p[i] = p[--size]; if (++remote_nr >= PCS_BATCH_MAX) @@ -6194,11 +6242,8 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) return; - if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) - && likely(!slab_test_pfmemalloc(slab))) { - if (likely(free_to_pcs(s, object, true))) - return; - } + if (likely(can_free_to_pcs(slab)) && likely(free_to_pcs(s, object, true))) + return; __slab_free(s, slab, object, object, 1, addr); stat(s, FREE_SLOWPATH); @@ -6569,10 +6614,8 @@ void kfree_nolock(const void *object) */ kasan_slab_free(s, x, false, false, /* skip quarantine */true); - if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { - if (likely(free_to_pcs(s, x, false))) - return; - } + if (likely(can_free_to_pcs(slab)) && likely(free_to_pcs(s, x, false))) + return; /* * __slab_free() can locklessly cmpxchg16 into a slab, but then it might From 17a9399a61c9ce89771de588f6df43a8ec91f535 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 24 Mar 2026 22:35:12 +0100 Subject: [PATCH 07/11] slab,rcu: disable KVFREE_RCU_BATCHED for strict grace period Disable CONFIG_KVFREE_RCU_BATCHED in CONFIG_RCU_STRICT_GRACE_PERIOD builds so that kernel fuzzers have an easier time finding use-after-free involving kfree_rcu(). The intent behind CONFIG_RCU_STRICT_GRACE_PERIOD is that RCU should invoke callbacks and free objects as soon as possible (at a large performance cost) so that kernel fuzzers and such have an easier time detecting use-after-free bugs in objects with RCU lifetime. 
CONFIG_KVFREE_RCU_BATCHED is a performance optimization that queues RCU-freed objects in ways that CONFIG_RCU_STRICT_GRACE_PERIOD can't expedite; for example, the following testcase doesn't trigger a KASAN splat when CONFIG_KVFREE_RCU_BATCHED is enabled: ``` struct foo_struct { struct rcu_head rcu; int a; }; struct foo_struct *foo = kmalloc(sizeof(*foo), GFP_KERNEL | __GFP_NOFAIL | __GFP_ZERO); pr_info("%s: calling kfree_rcu()\n", __func__); kfree_rcu(foo, rcu); msleep(10); pr_info("%s: start UAF access\n", __func__); READ_ONCE(foo->a); pr_info("%s: end UAF access\n", __func__); ``` Signed-off-by: Jann Horn Acked-by: David Rientjes Reviewed-by: Joel Fernandes Acked-by: Harry Yoo (Oracle) Link: https://patch.msgid.link/20260324-kasan-kfree-rcu-v1-1-ac58a7a13d03@google.com Signed-off-by: Vlastimil Babka (SUSE) --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index ebd8ea353687..67a72fe89186 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -172,6 +172,7 @@ config SLUB config KVFREE_RCU_BATCHED def_bool y depends on !SLUB_TINY && !TINY_RCU + depends on !RCU_STRICT_GRACE_PERIOD config SLUB_TINY bool "Configure for minimal memory footprint" From 7f9bb84fdb5ee7621fcd6519cd14d3dc9aa75c5c Mon Sep 17 00:00:00 2001 From: Hao Li Date: Fri, 3 Apr 2026 15:37:36 +0800 Subject: [PATCH 08/11] slub: use N_NORMAL_MEMORY in can_free_to_pcs to handle remote frees Memory hotplug now keeps N_NORMAL_MEMORY up to date correctly, so make can_free_to_pcs() use it. As a result, when freeing objects on memoryless nodes, or on nodes that have memory but only in ZONE_MOVABLE, the objects can be freed to the sheaf instead of going through the slow path. Signed-off-by: Hao Li Acked-by: Harry Yoo (Oracle) Acked-by: David Rientjes Link: https://patch.msgid.link/20260403073958.8722-1-hao.li@linux.dev Signed-off-by: Vlastimil Babka (SUSE) --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 492ef5927e58..f896cdb41383 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6022,14 +6022,15 @@ static __always_inline bool can_free_to_pcs(struct slab *slab) goto check_pfmemalloc; /* - * Freed object isn't from this cpu's node, but that node is memoryless. + * Freed object isn't from this cpu's node, but that node is memoryless + * or only has ZONE_MOVABLE memory, which slab cannot allocate from. * Proceed as it's better to cache remote objects than falling back to * the slowpath for everything. The allocation side can never obtain * a local object anyway, if none exist. We don't have numa_mem_id() to * point to the closest node as we would on a proper memoryless node * setup. */ - if (unlikely(!node_state(numa_node, N_MEMORY))) + if (unlikely(!node_state(numa_node, N_NORMAL_MEMORY))) goto check_pfmemalloc; #endif From 7711207dcb9b7b74270ea0fb21daf91e4291f21d Mon Sep 17 00:00:00 2001 From: "Harry Yoo (Oracle)" Date: Mon, 6 Apr 2026 18:09:06 +0900 Subject: [PATCH 09/11] MAINTAINERS: add lib/tests/slub_kunit.c to SLAB ALLOCATOR section The slub_kunit module has been maintained by SLAB ALLOCATOR folks, but is missing in the MAINTAINERS file. Add the missing entry. 
Acked-by: David Rientjes Signed-off-by: Harry Yoo (Oracle) Acked-by: SeongJae Park Link: https://patch.msgid.link/20260406090907.11710-2-harry@kernel.org Signed-off-by: Vlastimil Babka (SUSE) --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 55af015174a5..5418ad867e34 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24363,6 +24363,7 @@ F: Documentation/admin-guide/mm/slab.rst F: Documentation/mm/slab.rst F: include/linux/mempool.h F: include/linux/slab.h +F: lib/tests/slub_kunit.c F: mm/failslab.c F: mm/mempool.c F: mm/slab.h From 92af129b4085cd561b59bfa1596653844cb82e4c Mon Sep 17 00:00:00 2001 From: "Harry Yoo (Oracle)" Date: Mon, 6 Apr 2026 18:09:07 +0900 Subject: [PATCH 10/11] lib/tests/slub_kunit: add a test case for {kmalloc,kfree}_nolock Testing invocation of {kmalloc,kfree}_nolock() during kmalloc() or kfree() is tricky, and it is even harder to ensure that slowpaths are properly tested. Lack of such testing has led to late discovery of the bug fixed by commit a1e244a9f177 ("mm/slab: use prandom if !allow_spin"). Add a slub_kunit test that allocates and frees objects in a tight loop while a perf event triggers interrupts (NMI or hardirq depending on the arch) on the same task, invoking {kmalloc,kfree}_nolock() from the overflow handler. Acked-by: David Rientjes Signed-off-by: Harry Yoo (Oracle) Link: https://patch.msgid.link/20260406090907.11710-3-harry@kernel.org Signed-off-by: Vlastimil Babka (SUSE) --- lib/tests/slub_kunit.c | 92 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/lib/tests/slub_kunit.c b/lib/tests/slub_kunit.c index 848b682a2d70..fa6d31dbca16 100644 --- a/lib/tests/slub_kunit.c +++ b/lib/tests/slub_kunit.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "../mm/slab.h" static struct kunit_resource resource; @@ -291,6 +292,94 @@ static void test_krealloc_redzone_zeroing(struct kunit *test) kmem_cache_destroy(s); } +#ifdef CONFIG_PERF_EVENTS +#define NR_ITERATIONS 1000 +#define NR_OBJECTS 1000 +static void *objects[NR_OBJECTS]; + +struct test_nolock_context { + struct kunit *test; + int callback_count; + int alloc_ok; + int alloc_fail; + struct perf_event *event; +}; + +static struct perf_event_attr hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, + .freq = 1, + .sample_freq = 100000, +}; + +static void overflow_handler_test_kmalloc_kfree_nolock(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + void *objp; + gfp_t gfp; + struct test_nolock_context *ctx = event->overflow_handler_context; + + /* __GFP_ACCOUNT to test kmalloc_nolock() in alloc_slab_obj_exts() */ + gfp = (ctx->callback_count % 2) ? 0 : __GFP_ACCOUNT; + objp = kmalloc_nolock(64, gfp, NUMA_NO_NODE); + + if (objp) + ctx->alloc_ok++; + else + ctx->alloc_fail++; + + kfree_nolock(objp); + ctx->callback_count++; +} + +static void test_kmalloc_kfree_nolock(struct kunit *test) +{ + int i, j; + struct test_nolock_context ctx = { .test = test }; + struct perf_event *event; + bool alloc_fail = false; + + event = perf_event_create_kernel_counter(&hw_attr, -1, current, + overflow_handler_test_kmalloc_kfree_nolock, + &ctx); + if (IS_ERR(event)) + kunit_skip(test, "Failed to create perf event"); + ctx.event = event; + perf_event_enable(ctx.event); + for (i = 0; i < NR_ITERATIONS; i++) { + for (j = 0; j < NR_OBJECTS; j++) { + gfp_t gfp = (i % 2) ? 
GFP_KERNEL : GFP_KERNEL_ACCOUNT; + + objects[j] = kmalloc(64, gfp); + if (!objects[j]) { + j--; + while (j >= 0) + kfree(objects[j--]); + alloc_fail = true; + goto cleanup; + } + } + for (j = 0; j < NR_OBJECTS; j++) + kfree(objects[j]); + } + +cleanup: + perf_event_disable(ctx.event); + perf_event_release_kernel(ctx.event); + + kunit_info(test, "callback_count: %d, alloc_ok: %d, alloc_fail: %d\n", + ctx.callback_count, ctx.alloc_ok, ctx.alloc_fail); + + if (alloc_fail) + kunit_skip(test, "Allocation failed"); + KUNIT_EXPECT_EQ(test, 0, slab_errors); +} +#endif + static int test_init(struct kunit *test) { slab_errors = 0; @@ -315,6 +404,9 @@ static struct kunit_case test_cases[] = { KUNIT_CASE(test_kfree_rcu_wq_destroy), KUNIT_CASE(test_leak_destroy), KUNIT_CASE(test_krealloc_redzone_zeroing), +#ifdef CONFIG_PERF_EVENTS + KUNIT_CASE_SLOW(test_kmalloc_kfree_nolock), +#endif {} }; From 51274836193a661a3f39e7f10629d5978a61bbfb Mon Sep 17 00:00:00 2001 From: Hao Li Date: Tue, 7 Apr 2026 19:59:33 +0800 Subject: [PATCH 11/11] slub: clarify kmem_cache_refill_sheaf() comments In the in-place refill case, some objects may already have been added before the function returns -ENOMEM. Clarify this behavior and polish the rest of the comment for readability. Acked-by: Harry Yoo (Oracle) Signed-off-by: Hao Li Link: https://patch.msgid.link/20260407120018.42692-1-hao.li@linux.dev Signed-off-by: Vlastimil Babka (SUSE) --- mm/slub.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f896cdb41383..98d473f5c701 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5120,12 +5120,15 @@ void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, } /* - * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least - * the given size + * Refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least + * the given size. * - * the sheaf might be replaced by a new one when requesting more than - * s->sheaf_capacity objects if such replacement is necessary, but the refill - * fails (returning -ENOMEM), the existing sheaf is left intact + * Return: 0 on success. The sheaf will contain at least @size objects. + * The sheaf might have been replaced with a new one if more than + * sheaf->capacity objects are requested. + * + * Return: -ENOMEM on failure. Some objects might have been added to the sheaf + * but the sheaf will not be replaced. * * In practice we always refill to full sheaf's capacity. */
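To close, a hypothetical caller of the prefill/refill API whose kerneldoc the last patch clarifies. The function names come from this series, but the exact signatures used here, in particular the double pointer that lets the refill replace the sheaf and the alloc-from-sheaf helper, are assumptions; check include/linux/slab.h of the tree this series applies to.

```
/* Hypothetical usage sketch under the assumptions stated above. */
static int prefill_example(struct kmem_cache *s)
{
	struct slab_sheaf *sheaf;
	void *obj;

	/* guarantee that at least 16 allocations from the sheaf will succeed */
	sheaf = kmem_cache_prefill_sheaf(s, GFP_KERNEL, 16);
	if (IS_ERR_OR_NULL(sheaf))
		return -ENOMEM;

	/* e.g. later, from a context that must not fail the allocation */
	obj = kmem_cache_alloc_from_sheaf(s, GFP_NOWAIT, sheaf);

	/*
	 * Top the sheaf back up.  Per the clarified comment: on -ENOMEM some
	 * objects may already have been added, but the sheaf itself is not
	 * replaced, so it must still be returned below either way.
	 */
	if (kmem_cache_refill_sheaf(s, GFP_KERNEL, &sheaf, 16))
		pr_warn("sheaf refill failed\n");

	if (obj)
		kmem_cache_free(s, obj);
	kmem_cache_return_sheaf(s, GFP_KERNEL, sheaf);
	return 0;
}
```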