From b14ff274e8aa5517ff86c94d682bf26bf8b5dcc8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:47 +0100 Subject: [PATCH 01/14] slab, rcu: move TINY_RCU variant of kvfree_rcu() to SLAB Following the move of TREE_RCU implementation, let's move also the TINY_RCU one for consistency and subsequent refactoring. For simplicity, remove the separate inline __kvfree_call_rcu() as TINY_RCU is not meant for high-performance hardware anyway. Declare kvfree_call_rcu() in rcupdate.h to avoid header dependency issues. Also move the kvfree_rcu_barrier() declaration to slab.h Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 5 +++++ include/linux/rcutiny.h | 36 ------------------------------------ include/linux/rcutree.h | 3 --- include/linux/slab.h | 14 ++++++++++++++ kernel/rcu/tiny.c | 11 ----------- mm/slab_common.c | 19 +++++++++++++++++++ 6 files changed, 38 insertions(+), 50 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..3f70d1c81444 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1082,6 +1082,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) +/* + * In mm/slab_common.c, no suitable header to include here. + */ +void kvfree_call_rcu(struct rcu_head *head, void *ptr); + #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index fe42315f667f..f519cd680228 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -90,41 +90,6 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -/* - * Add one more declaration of kvfree() here. 
It is - * not so straight forward to just include - * where it is defined due to getting many compile - * errors caused by that include. - */ -extern void kvfree(const void *addr); - -static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - if (head) { - call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); - return; - } - - // kvfree_rcu(one_arg) call. - might_sleep(); - synchronize_rcu(); - kvfree(ptr); -} - -static inline void kvfree_rcu_barrier(void) -{ - rcu_barrier(); -} - -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -#else -static inline void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - __kvfree_call_rcu(head, ptr); -} -#endif - void rcu_qs(void); static inline void rcu_softirq_qs(void) @@ -164,7 +129,6 @@ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_eqs(void) { } -static inline void kfree_rcu_scheduler_running(void) { } /* Avoid RCU read-side critical sections leaking across. 
*/ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 27d86d912781..dbe77b5fe06e 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,12 +34,9 @@ static inline void rcu_virt_note_context_switch(void) } void synchronize_rcu_expedited(void); -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_eqs(void); -void kfree_rcu_scheduler_running(void); struct rcu_gp_oldstate { unsigned long rgos_norm; diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..bcc62e5656c3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1082,6 +1083,19 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); +#ifdef CONFIG_TINY_RCU +static inline void kvfree_rcu_barrier(void) +{ + rcu_barrier(); +} + +static inline void kfree_rcu_scheduler_running(void) { } +#else +void kvfree_rcu_barrier(void); + +void kfree_rcu_scheduler_running(void); +#endif + /** * kmalloc_size_roundup - Report allocation bucket size for the given size * diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b3f31911465..0ec27093d0e1 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -246,17 +246,6 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - if (head) - kasan_record_aux_stack(ptr); - - __kvfree_call_rcu(head, ptr); -} -EXPORT_SYMBOL_GPL(kvfree_call_rcu); -#endif - void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/mm/slab_common.c b/mm/slab_common.c index 4030907b6b7d..81a0ce77b11c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1284,6 +1284,25 @@ 
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +#ifdef CONFIG_TINY_RCU + +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + if (head) { + kasan_record_aux_stack(ptr); + call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree(ptr); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached From 7f4b19ef3129e1f2e1856b3ee475a02c0be34891 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:48 +0100 Subject: [PATCH 02/14] rcu: remove trace_rcu_kvfree_callback Tree RCU does not handle kvfree_rcu() by queueing individual objects by call_rcu() anymore, thus the tracepoint and associated __is_kvfree_rcu_offset() check is dead code now. Remove it. Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/trace/events/rcu.h | 34 ---------------------------------- kernel/rcu/tree.c | 9 ++------- 2 files changed, 2 insertions(+), 41 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e81431deaa50..ac3b28b8939b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -560,40 +560,6 @@ TRACE_EVENT_RCU(rcu_segcb_stats, ); -/* - * Tracepoint for the registration of a single RCU callback of the special - * kvfree() form. The first argument is the RCU type, the second argument - * is a pointer to the RCU callback, the third argument is the offset - * of the callback within the enclosing RCU-protected data structure, - * the fourth argument is the number of lazy callbacks queued, and the - * fifth argument is the total number of callbacks queued. 
- */ -TRACE_EVENT_RCU(rcu_kvfree_callback, - - TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen), - - TP_ARGS(rcuname, rhp, offset, qlen), - - TP_STRUCT__entry( - __field(const char *, rcuname) - __field(void *, rhp) - __field(unsigned long, offset) - __field(long, qlen) - ), - - TP_fast_assign( - __entry->rcuname = rcuname; - __entry->rhp = rhp; - __entry->offset = offset; - __entry->qlen = qlen; - ), - - TP_printk("%s rhp=%p func=%ld %ld", - __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen) -); - /* * Tracepoint for marking the beginning rcu_do_batch, performed to start * RCU callback invocation. The first argument is the RCU flavor, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed14..5dbc4189037c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2931,13 +2931,8 @@ static int __init rcu_spawn_core_kthreads(void) static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) { rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); } From 49d5377b38aa127451cf5dc6d6ea5d9da7f465a4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:49 +0100 Subject: [PATCH 03/14] rcu, slab: use a regular callback function for kvfree_rcu RCU has been special-casing callback function pointers that are integers lower than 4096 as offsets of rcu_head for kvfree() instead. The tree RCU implementation no longer does that as the batched kvfree_rcu() is not a simple call_rcu(). The tiny RCU still does, and the plan is also to make tree RCU use call_rcu() for SLUB_TINY configurations. 
Instead of teaching tree RCU again to special case the offsets, let's remove the special casing completely. Since there's no SLOB anymore, it is possible to create a callback function that can take a pointer to a middle of slab object with unknown offset and determine the object's pointer before freeing it, so implement that as kvfree_rcu_cb(). Large kmalloc and vmalloc allocations are handled simply by aligning down to page size. For that we retain the requirement that the offset is smaller than 4096. But we can remove __is_kvfree_rcu_offset() completely and instead just opencode the condition in the BUILD_BUG_ON() check. Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 28 ++++++++++++------------ kernel/rcu/tiny.c | 14 ------------ mm/slab.h | 2 ++ mm/slab_common.c | 5 ++--- mm/slub.c | 46 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 63 insertions(+), 32 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3f70d1c81444..23bcf71ffb06 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1025,12 +1025,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) -/* - * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kvfree_rcu()? - */ -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) - /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. @@ -1041,11 +1035,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * - * The kfree_rcu() function handles this issue. 
Rather than encoding a - * function address in the embedded rcu_head structure, kfree_rcu() instead - * encodes the offset of the rcu_head structure within the base structure. - * Because the functions are not allowed in the low-order 4096 bytes of - * kernel virtual memory, offsets up to 4095 bytes can be accommodated. + * The kfree_rcu() function handles this issue. In order to have a universal + * callback function handling different offsets of rcu_head, the callback needs + * to determine the starting address of the freed object, which can be a large + * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to + * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to @@ -1087,14 +1081,18 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) */ void kvfree_call_rcu(struct rcu_head *head, void *ptr); +/* + * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the + * comment of kfree_rcu() for details. 
+ */ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ - if (___p) { \ - BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ - kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ - } \ + if (___p) { \ + BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ + kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ + } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0ec27093d0e1..7a34a99d4664 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user) static inline bool rcu_reclaim_tiny(struct rcu_head *head) { rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kvfree_rcu_offset(offset)) { - trace_rcu_invoke_kvfree_callback("", head, offset); - kvfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } trace_rcu_invoke_callback("", head); f = head->func; @@ -159,10 +152,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static void tiny_rcu_leak_callback(struct rcu_head *rhp) -{ -} - /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) pr_err("%s(): Double-freed CB %p->%pS()!!! 
", __func__, head, head->func); mem_dump_obj(head); } - - if (!__is_kvfree_rcu_offset((unsigned long)head->func)) - WRITE_ONCE(head->func, tiny_rcu_leak_callback); return; } diff --git a/mm/slab.h b/mm/slab.h index e9fd9bf0bfa6..2f01c7317988 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -604,6 +604,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif +void kvfree_rcu_cb(struct rcu_head *head); + size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index 81a0ce77b11c..6438a38aa5dc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1290,7 +1290,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) { if (head) { kasan_record_aux_stack(ptr); - call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); + call_rcu(head, kvfree_rcu_cb); return; } @@ -1551,8 +1551,7 @@ kvfree_rcu_list(struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kvfree_callback("slab", head, offset); - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); diff --git a/mm/slub.c b/mm/slub.c index 1f50129dcfb3..e8273f286569 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,6 +19,7 @@ #include #include #include "slab.h" +#include #include #include #include @@ -4728,6 +4729,51 @@ static void free_large_kmalloc(struct folio *folio, void *object) folio_put(folio); } +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. 
+ */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + folio = virt_to_folio(obj); + if (!folio_test_slab(folio)) { + /* + * rcu_head offset can be only less than page size so no need to + * consider folio order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(folio, obj); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_addr = folio_address(folio); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + /** * kfree - free previously allocated memory * @object: pointer returned by kmalloc() or kmem_cache_alloc() From c9f8f1242a4c3e48adc6c3cf6b31c1ffbaa49943 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:50 +0100 Subject: [PATCH 04/14] slab: don't batch kvfree_rcu() with SLUB_TINY kvfree_rcu() is batched for better performance except on TINY_RCU, which is a simple implementation for small UP systems. Similarly SLUB_TINY is an option intended for small systems, whether or not used together with TINY_RCU. In case SLUB_TINY is used with !TINY_RCU, it makes arguably sense to not do the batching and limit the memory footprint. It's also suboptimal to have RCU-specific #ifdefs in slab code. With that, add CONFIG_KVFREE_RCU_BATCHED to determine whether batching kvfree_rcu() implementation is used. It is not set by a user prompt, but enabled by default and disabled in case TINY_RCU or SLUB_TINY are enabled. Use the new config for #ifdef's in slab code and extend their scope to cover all code used by the batched kvfree_rcu(). 
For example there's no need to perform kvfree_rcu_init() if the batching is disabled. Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 2 +- mm/Kconfig | 4 ++++ mm/slab_common.c | 15 +++++++++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index bcc62e5656c3..7686054dd494 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1083,7 +1083,7 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); -#ifdef CONFIG_TINY_RCU +#ifndef CONFIG_KVFREE_RCU_BATCHED static inline void kvfree_rcu_barrier(void) { rcu_barrier(); diff --git a/mm/Kconfig b/mm/Kconfig index 1b501db06417..0b7f4bb5cb80 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -242,6 +242,10 @@ menu "Slab allocator options" config SLUB def_bool y +config KVFREE_RCU_BATCHED + def_bool y + depends on !SLUB_TINY && !TINY_RCU + config SLUB_TINY bool "Configure for minimal memory footprint" depends on EXPERT diff --git a/mm/slab_common.c b/mm/slab_common.c index 6438a38aa5dc..46d0a4cd33b5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1284,7 +1284,7 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); -#ifdef CONFIG_TINY_RCU +#ifndef CONFIG_KVFREE_RCU_BATCHED void kvfree_call_rcu(struct rcu_head *head, void *ptr) { @@ -1301,7 +1301,11 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) } EXPORT_SYMBOL_GPL(kvfree_call_rcu); -#endif +void __init kvfree_rcu_init(void) +{ +} + +#else /* CONFIG_KVFREE_RCU_BATCHED */ /* * This rcu parameter is runtime-read-only. 
It reflects @@ -1879,8 +1883,6 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, return true; } -#if !defined(CONFIG_TINY_RCU) - static enum hrtimer_restart schedule_page_work_fn(struct hrtimer *t) { @@ -2089,8 +2091,6 @@ void kvfree_rcu_barrier(void) } EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); -#endif /* #if !defined(CONFIG_TINY_RCU) */ - static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -2180,3 +2180,6 @@ void __init kvfree_rcu_init(void) shrinker_register(kfree_rcu_shrinker); } + +#endif /* CONFIG_KVFREE_RCU_BATCHED */ + From 12f4888c9de0415ca9fc0aefc134e2800a7f47f0 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 24 Jan 2025 16:48:58 +0000 Subject: [PATCH 05/14] mm/slab: simplify SLAB_* flag handling SLUB is the only remaining allocator. We can therefore get rid of the logic for allocator-specific flags: * Merge SLAB_CACHE_FLAGS into SLAB_CORE_FLAGS. * Remove CACHE_CREATE_MASK and instead mask out SLAB_DEBUG_FLAGS if !CONFIG_SLUB_DEBUG. SLAB_DEBUG_FLAGS is now defined unconditionally (no impact on existing code, which ignores it if !CONFIG_SLUB_DEBUG). * Define SLAB_FLAGS_PERMITTED in terms of SLAB_CORE_FLAGS and SLAB_DEBUG_FLAGS (no functional change). While at it also remove misleading comments that suggest that multiple allocators are available. 
Signed-off-by: Kevin Brodsky Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab.h | 32 +++++--------------------------- mm/slab_common.c | 11 ++--------- 2 files changed, 7 insertions(+), 36 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index e9fd9bf0bfa6..1a081f50f947 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } -/* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ - SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ + SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) -#ifdef CONFIG_SLUB_DEBUG #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) -#else -#define SLAB_DEBUG_FLAGS (0) -#endif -#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT | \ - SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) - -/* Common flags available with current configuration */ -#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) - -/* Common flags permitted for kmem_cache_create */ -#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ - SLAB_RED_ZONE | \ - SLAB_POISON | \ - SLAB_STORE_USER | \ - SLAB_TRACE | \ - SLAB_CONSISTENCY_CHECKS | \ - SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | \ - SLAB_ACCOUNT | \ - SLAB_KMALLOC | \ - SLAB_NO_MERGE | \ - SLAB_NO_USER_FLAGS) +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 
4c9f0a87f733..58bb663dab6a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -298,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, static_branch_enable(&slub_debug_enabled); if (flags & SLAB_STORE_USER) stack_depot_init(); +#else + flags &= ~SLAB_DEBUG_FLAGS; #endif mutex_lock(&slab_mutex); @@ -307,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, goto out_unlock; } - /* Refuse requests with allocator specific flags */ if (flags & ~SLAB_FLAGS_PERMITTED) { err = -EINVAL; goto out_unlock; } - /* - * Some allocators will constraint the set of valid flags to a subset - * of all flags. We expect them to define CACHE_CREATE_MASK in this - * case, and we'll just provide them with a sanitized version of the - * passed flags. - */ - flags &= CACHE_CREATE_MASK; - /* Fail closed on bad usersize of useroffset values. */ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || WARN_ON(!args->usersize && args->useroffset) || From f1157db8b539cf1a98678667255fa7efa1f5b2cb Mon Sep 17 00:00:00 2001 From: GONG Ruiqi Date: Wed, 12 Feb 2025 16:15:04 +0800 Subject: [PATCH 06/14] slab: Adjust placement of __kvmalloc_node_noprof Move __kvmalloc_node_noprof (as well as kvfree*, kvrealloc_noprof and kmalloc_gfp_adjust for consistency) into mm/slub.c so that it can directly invoke __do_kmalloc_node, which is needed for the next patch. No functional changes intended. 
Signed-off-by: GONG Ruiqi Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/util.c | 162 ------------------------------------------------------ 2 files changed, 162 insertions(+), 162 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1f50129dcfb3..abc982d68feb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4878,6 +4878,168 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) } EXPORT_SYMBOL(krealloc_noprof); +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + +/** + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. 
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. + * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + void *ret; + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), + kmalloc_gfp_adjust(flags, size), + node); + if (ret || size <= PAGE_SIZE) + return ret; + + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kvmalloc_node_noprof); + +/** + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. + * + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Either preemptible task context or not-NMI interrupt. + */ +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. 
+ * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + void *n; + + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. 
*/ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; +} +EXPORT_SYMBOL(kvrealloc_noprof); + struct detached_freelist { struct slab *slab; void *tail; diff --git a/mm/util.c b/mm/util.c index b6b9684a1438..c808668f0548 100644 --- a/mm/util.c +++ b/mm/util.c @@ -612,168 +612,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); -static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) -{ - /* - * We want to attempt a large physically contiguous block first because - * it is less likely to fragment multiple larger blocks and therefore - * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. - */ - if (size > PAGE_SIZE) { - flags |= __GFP_NOWARN; - - if (!(flags & __GFP_RETRY_MAYFAIL)) - flags |= __GFP_NORETRY; - - /* nofail semantic is implemented by the vmalloc fallback */ - flags &= ~__GFP_NOFAIL; - } - - return flags; -} - -/** - * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon - * failure, fall back to non-contiguous (vmalloc) allocation. - * @size: size of the request. - * @b: which set of kmalloc buckets to allocate from. - * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. - * @node: numa node to allocate from - * - * Uses kmalloc to get the memory but if the allocation fails then falls back - * to the vmalloc allocator. Use kvfree for freeing the memory. - * - * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. - * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is - * preferable to the vmalloc fallback, due to visible performance drawbacks. 
- * - * Return: pointer to the allocated memory of %NULL in case of failure - */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) -{ - void *ret; - - /* - * It doesn't really make sense to fallback to vmalloc for sub page - * requests - */ - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), - kmalloc_gfp_adjust(flags, size), - node); - if (ret || size <= PAGE_SIZE) - return ret; - - /* non-sleeping allocations are not supported by vmalloc */ - if (!gfpflags_allow_blocking(flags)) - return NULL; - - /* Don't even allow crazy sizes */ - if (unlikely(size > INT_MAX)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - - /* - * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, - * since the callers already cannot assume anything - * about the resulting pointer, and cannot play - * protection games. - */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - node, __builtin_return_address(0)); -} -EXPORT_SYMBOL(__kvmalloc_node_noprof); - -/** - * kvfree() - Free memory. - * @addr: Pointer to allocated memory. - * - * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). - * It is slightly more efficient to use kfree() or vfree() if you are certain - * that you know which one to use. - * - * Context: Either preemptible task context or not-NMI interrupt. - */ -void kvfree(const void *addr) -{ - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); -} -EXPORT_SYMBOL(kvfree); - -/** - * kvfree_sensitive - Free a data object containing sensitive information. - * @addr: address of the data object to be freed. - * @len: length of the data object. - * - * Use the special memzero_explicit() function to clear the content of a - * kvmalloc'ed object containing sensitive data to make sure that the - * compiler won't optimize out the data clearing. 
- */ -void kvfree_sensitive(const void *addr, size_t len) -{ - if (likely(!ZERO_OR_NULL_PTR(addr))) { - memzero_explicit((void *)addr, len); - kvfree(addr); - } -} -EXPORT_SYMBOL(kvfree_sensitive); - -/** - * kvrealloc - reallocate memory; contents remain unchanged - * @p: object to reallocate memory for - * @size: the size to reallocate - * @flags: the flags for the page level allocator - * - * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 - * and @p is not a %NULL pointer, the object pointed to is freed. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. - * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. - * - * This function must not be called concurrently with itself or kvfree() for the - * same memory allocation. - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) -{ - void *n; - - if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); - - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); - if (!n) { - /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); - if (!n) - return NULL; - - if (p) { - /* We already know that `p` is not a vmalloc address. */ - kasan_disable_current(); - memcpy(n, kasan_reset_tag(p), ksize(p)); - kasan_enable_current(); - - kfree(p); - } - } - - return n; -} -EXPORT_SYMBOL(kvrealloc_noprof); - /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. 
From 539f552892b757ca7a9eb1ba34f5be3c0a947f59 Mon Sep 17 00:00:00 2001 From: GONG Ruiqi Date: Wed, 12 Feb 2025 16:15:05 +0800 Subject: [PATCH 07/14] slab: Achieve better kmalloc caches randomization in kvmalloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As revealed by this writeup[1], due to the fact that __kmalloc_node (now renamed to __kmalloc_node_noprof) is an exported symbol and will never get inlined, using it in kvmalloc_node (now is __kvmalloc_node_noprof) would make the RET_IP inside always point to the same address: upper_caller kvmalloc kvmalloc_node kvmalloc_node_noprof __kvmalloc_node_noprof <-- all macros all the way down here __kmalloc_node_noprof __do_kmalloc_node(.., _RET_IP_) ... <-- _RET_IP_ points to That literally means all kmalloc invoked via kvmalloc would use the same seed for cache randomization (CONFIG_RANDOM_KMALLOC_CACHES), which makes this hardening non-functional. The root cause of this problem, IMHO, is that using RET_IP only cannot identify the actual allocation site in case of kmalloc being called inside non-inlined wrappers or helper functions. And I believe there could be similar cases in other functions. Nevertheless, I haven't thought of any good solution for this. So for now let's solve this specific case first. For __kvmalloc_node_noprof, replace __kmalloc_node_noprof and call __do_kmalloc_node directly instead, so that RET_IP can take the return address of kvmalloc and differentiate each kvmalloc invocation: upper_caller kvmalloc kvmalloc_node kvmalloc_node_noprof __kvmalloc_node_noprof <-- all macros all the way down here __do_kmalloc_node(.., _RET_IP_) ... <-- _RET_IP_ points to Thanks to Tamás Koczka for the report and discussion! 
Link: https://github.com/google/security-research/blob/908d59b573960dc0b90adda6f16f7017aca08609/pocs/linux/kernelctf/CVE-2024-27397_mitigation/docs/exploit.md?plain=1#L259 [1] Reported-by: Tamás Koczka Signed-off-by: GONG Ruiqi Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index abc982d68feb..1f7d1d260eeb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4925,9 +4925,9 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) * It doesn't really make sense to fallback to vmalloc for sub page * requests */ - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), - kmalloc_gfp_adjust(flags, size), - node); + ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), + kmalloc_gfp_adjust(flags, size), + node, _RET_IP_); if (ret || size <= PAGE_SIZE) return ret; From ed5ec2e952595a469eae1f6dce040737359b6da2 Mon Sep 17 00:00:00 2001 From: Hyesoo Yu Date: Wed, 26 Feb 2025 17:12:00 +0900 Subject: [PATCH 08/14] mm: slub: Print the broken data before restoring them Previously, the restore occurred after printing the object in slub. After commit 47d911b02cbe ("slab: make check_object() more consistent"), the bytes are printed after the restore. This information about the bytes before the restore is highly valuable for debugging purposes. For instance, in the event of a cache issue, it displays byte patterns by breaking them down into 64-byte units. Without this information, we can only speculate on how it was broken. Hence the corrupted regions should be printed prior to the restoration process. However, if an object breaks in multiple places, the same log may be output multiple times. Therefore the slub log is reported only once to prevent redundant printing, by sending a parameter indicating whether an error has occurred previously. 
Signed-off-by: Hyesoo Yu Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1f7d1d260eeb..faea4a1dbcca 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1181,8 +1181,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, static pad_check_attributes int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, - u8 *object, char *what, - u8 *start, unsigned int value, unsigned int bytes) + u8 *object, char *what, u8 *start, unsigned int value, + unsigned int bytes, bool slab_obj_print) { u8 *fault; u8 *end; @@ -1201,10 +1201,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab, if (slab_add_kunit_errors()) goto skip_bug_print; - slab_bug(s, "%s overwritten", what); - pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault - addr, - fault[0], value); + pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", + what, fault, end - 1, fault - addr, fault[0], value); + + if (slab_obj_print) + object_err(s, slab, object, "Object corrupt"); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -1268,7 +1269,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) return 1; return check_bytes_and_report(s, slab, p, "Object padding", - p + off, POISON_INUSE, size_from_object(s) - off); + p + off, POISON_INUSE, size_from_object(s) - off, true); } /* Check the pad bytes at the end of a slab page */ @@ -1318,11 +1319,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", - object - s->red_left_pad, val, s->red_left_pad)) + object - s->red_left_pad, val, s->red_left_pad, ret)) ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", - endobject, val, s->inuse - s->object_size)) + endobject, val, s->inuse - s->object_size, ret)) ret = 0; if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { @@ -1331,7 +1332,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->object_size > orig_size && !check_bytes_and_report(s, slab, object, "kmalloc Redzone", p + orig_size, - val, s->object_size - orig_size)) { + val, s->object_size - orig_size, ret)) { ret = 0; } } @@ -1339,7 +1340,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { if (!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size)) + s->inuse - s->object_size, ret)) ret = 0; } } @@ -1355,11 +1356,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (kasan_meta_size < s->object_size - 1 && !check_bytes_and_report(s, slab, p, "Poison", p + kasan_meta_size, POISON_FREE, - s->object_size - kasan_meta_size - 1)) + s->object_size - kasan_meta_size - 1, ret)) ret = 0; if 
(kasan_meta_size < s->object_size && !check_bytes_and_report(s, slab, p, "End Poison", - p + s->object_size - 1, POISON_END, 1)) + p + s->object_size - 1, POISON_END, 1, ret)) ret = 0; } /* @@ -1385,11 +1386,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab, ret = 0; } - if (!ret && !slab_in_kunit_test()) { - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - } - return ret; } From 3f6f32b14ab35452d2ed52f7821cf2829923c98d Mon Sep 17 00:00:00 2001 From: Hyesoo Yu Date: Wed, 26 Feb 2025 17:12:01 +0900 Subject: [PATCH 09/14] mm: slub: call WARN() when detecting a slab corruption If a slab object is corrupted or an error occurs in its internal validation, continuing after restoration may cause other side effects. At this point, it is difficult to debug because the problem occurred in the past. It is useful to use WARN() to catch errors at the point of issue because WARN() could trigger panic for system debugging when panic_on_warn is enabled. WARN() is added where the error is detected, in slab_err and object_err. It makes sense to only do the WARN() after printing the logs. Part of slab_err is split out into __slab_err, which calls the WARN() and is called after printing the logs. Signed-off-by: Hyesoo Yu Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 47 +++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index faea4a1dbcca..a9a02b4ae4d6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1026,7 +1026,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("BUG %s (%s): %pV\n", s ? 
s->name : "", print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); va_end(args); } @@ -1085,8 +1085,6 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); - - dump_stack(); } static void object_err(struct kmem_cache *s, struct slab *slab, @@ -1098,6 +1096,8 @@ static void object_err(struct kmem_cache *s, struct slab *slab, slab_bug(s, "%s", reason); print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); } static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, @@ -1114,6 +1114,17 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, return false; } +static void __slab_err(struct slab *slab) +{ + if (slab_in_kunit_test()) + return; + + print_slab_info(slab); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); +} + static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, const char *fmt, ...) { @@ -1127,9 +1138,7 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); slab_bug(s, "%s", buf); - print_slab_info(slab); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + __slab_err(slab); } static void init_object(struct kmem_cache *s, void *object, u8 val) @@ -1302,9 +1311,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", - fault, end - 1, fault - start); + slab_bug(s, "Padding overwritten. 
0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); + __slab_err(slab); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); } @@ -1620,12 +1630,12 @@ static inline int free_consistency_checks(struct kmem_cache *s, slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", object); } else if (!slab->slab_cache) { - pr_err("SLUB : no slab for object 0x%p.\n", - object); - dump_stack(); - } else + slab_err(NULL, slab, "No slab cache for object 0x%p", + object); + } else { object_err(s, slab, object, - "page slab pointer corrupt."); + "page slab pointer corrupt."); + } return 0; } return 1; @@ -5728,14 +5738,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) return !!oo_objects(s->oo); } -static void list_slab_objects(struct kmem_cache *s, struct slab *slab, - const char *text) +static void list_slab_objects(struct kmem_cache *s, struct slab *slab) { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); void *p; - slab_err(s, slab, text, s->name); + if (!slab_add_kunit_errors()) + slab_bug(s, "Objects remaining on __kmem_cache_shutdown()"); spin_lock(&object_map_lock); __fill_map(object_map, s, slab); @@ -5750,6 +5760,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, } } spin_unlock(&object_map_lock); + + __slab_err(slab); #endif } @@ -5770,8 +5782,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) remove_partial(n, slab); list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, slab, - "Objects remaining in %s on __kmem_cache_shutdown()"); + list_slab_objects(s, slab); } } spin_unlock_irq(&n->list_lock); From 4b183dd9359d5772446cb634b12a383bed98c4fc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 27 Feb 2025 16:05:46 +0100 Subject: [PATCH 10/14] mm, slab: cleanup slab_bug() parameters slab_err() has variadic printf arguments but instead of passing them to slab_bug() it does 
vsnprintf() to a buffer and passes %s, buf. To allow passing them directly, turn slab_bug() to __slab_bug() with a va_list parameter, and slab_bug() a wrapper with fmt, ... parameters. Then slab_err() can call __slab_bug() without the intermediate buffer. Also constify fmt everywhere, which also simplifies object_err()'s call to slab_bug(). Signed-off-by: Vlastimil Babka Reviewed-by: Harry Yoo --- mm/slub.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a9a02b4ae4d6..d94af020b305 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1017,12 +1017,12 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object) set_orig_size(s, (void *)object, s->object_size); } -static void slab_bug(struct kmem_cache *s, char *fmt, ...) +static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp) { struct va_format vaf; va_list args; - va_start(args, fmt); + va_copy(args, argsp); vaf.fmt = fmt; vaf.va = &args; pr_err("=============================================================================\n"); @@ -1031,8 +1031,17 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) va_end(args); } +static void slab_bug(struct kmem_cache *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __slab_bug(s, fmt, args); + va_end(args); +} + __printf(2, 3) -static void slab_fix(struct kmem_cache *s, char *fmt, ...) +static void slab_fix(struct kmem_cache *s, const char *fmt, ...) 
{ struct va_format vaf; va_list args; @@ -1088,12 +1097,12 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) } static void object_err(struct kmem_cache *s, struct slab *slab, - u8 *object, char *reason) + u8 *object, const char *reason) { if (slab_add_kunit_errors()) return; - slab_bug(s, "%s", reason); + slab_bug(s, reason); print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -1129,15 +1138,14 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, const char *fmt, ...) { va_list args; - char buf[100]; if (slab_add_kunit_errors()) return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + __slab_bug(s, fmt, args); va_end(args); - slab_bug(s, "%s", buf); + __slab_err(slab); } @@ -1175,7 +1183,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) s->inuse - poison_size); } -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, +static void restore_bytes(struct kmem_cache *s, const char *message, u8 data, void *from, void *to) { slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); @@ -1190,7 +1198,7 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, static pad_check_attributes int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, - u8 *object, char *what, u8 *start, unsigned int value, + u8 *object, const char *what, u8 *start, unsigned int value, unsigned int bytes, bool slab_obj_print) { u8 *fault; From 7e384dbb57e2c3cef7e70d4913b0cc4caedf0a1f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 28 Feb 2025 13:13:55 +0100 Subject: [PATCH 11/14] kunit, slub: Add test_kfree_rcu_wq_destroy use case Add a test_kfree_rcu_wq_destroy test to verify a kmem_cache_destroy() from a workqueue context. The problem is that, before destroying any cache the kvfree_rcu_barrier() is invoked to guarantee that in-flight freed objects are flushed. 
The _barrier() function queues and flushes its own internal workers which might conflict with a workqueue type a kmem-cache gets destroyed from. One example is when a WQ_MEM_RECLAIM workqueue is flushing !WQ_MEM_RECLAIM events which leads to a kernel splat. See the check_flush_dependency() in the workqueue.c file. If this test does not emit any kernel warning, it has passed. Reviewed-by: Keith Busch Co-developed-by: Vlastimil Babka Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Vlastimil Babka --- lib/slub_kunit.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c index f11691315c2f..d47c472b0520 100644 --- a/lib/slub_kunit.c +++ b/lib/slub_kunit.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "../mm/slab.h" static struct kunit_resource resource; @@ -181,6 +182,63 @@ static void test_kfree_rcu(struct kunit *test) KUNIT_EXPECT_EQ(test, 0, slab_errors); } +struct cache_destroy_work { + struct work_struct work; + struct kmem_cache *s; +}; + +static void cache_destroy_workfn(struct work_struct *w) +{ + struct cache_destroy_work *cdw; + + cdw = container_of(w, struct cache_destroy_work, work); + kmem_cache_destroy(cdw->s); +} + +#define KMEM_CACHE_DESTROY_NR 10 + +static void test_kfree_rcu_wq_destroy(struct kunit *test) +{ + struct test_kfree_rcu_struct *p; + struct cache_destroy_work cdw; + struct workqueue_struct *wq; + struct kmem_cache *s; + unsigned int delay; + int i; + + if (IS_BUILTIN(CONFIG_SLUB_KUNIT_TEST)) + kunit_skip(test, "can't do kfree_rcu() when test is built-in"); + + INIT_WORK_ONSTACK(&cdw.work, cache_destroy_workfn); + wq = alloc_workqueue("test_kfree_rcu_destroy_wq", + WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + + if (!wq) + kunit_skip(test, "failed to alloc wq"); + + for (i = 0; i < KMEM_CACHE_DESTROY_NR; i++) { + s = test_kmem_cache_create("TestSlub_kfree_rcu_wq_destroy", + sizeof(struct test_kfree_rcu_struct), + SLAB_NO_MERGE); + + if 
(!s) + kunit_skip(test, "failed to create cache"); + + delay = get_random_u8(); + p = kmem_cache_alloc(s, GFP_KERNEL); + kfree_rcu(p, rcu); + + cdw.s = s; + + msleep(delay); + queue_work(wq, &cdw.work); + flush_work(&cdw.work); + } + + destroy_workqueue(wq); + KUNIT_EXPECT_EQ(test, 0, slab_errors); +} + static void test_leak_destroy(struct kunit *test) { struct kmem_cache *s = test_kmem_cache_create("TestSlub_leak_destroy", @@ -254,6 +312,7 @@ static struct kunit_case test_cases[] = { KUNIT_CASE(test_clobber_redzone_free), KUNIT_CASE(test_kmalloc_redzone_access), KUNIT_CASE(test_kfree_rcu), + KUNIT_CASE(test_kfree_rcu_wq_destroy), KUNIT_CASE(test_leak_destroy), KUNIT_CASE(test_krealloc_redzone_zeroing), {} From a6687c8ff613fc13a71ce1390593ba8d27c52db9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 3 Mar 2025 17:28:05 +0000 Subject: [PATCH 12/14] slab: Mark large folios for debugging purposes If a user calls p = kmalloc(1024); kfree(p); kfree(p); and 'p' was the only object in the slab, we may free the slab after the first call to kfree(). If we do, we clear PGTY_slab and the second call to kfree() will call free_large_kmalloc(). That will leave a trace in the logs ("object pointer: 0x%p"), but otherwise proceed to free the memory, which is likely to corrupt the page allocator's metadata. Allocate a new page type for large kmalloc and mark the memory with it while it's allocated. That lets us detect this double-free and return without harming any data structures. 
Reported-by: Hannes Reinecke Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/page-flags.h | 18 ++++++++++-------- mm/slub.c | 7 +++++++ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 36d283552f80..df9234e5f478 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -925,14 +925,15 @@ FOLIO_FLAG_FALSE(has_hwpoisoned) enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. */ - PGTY_buddy = 0xf0, - PGTY_offline = 0xf1, - PGTY_table = 0xf2, - PGTY_guard = 0xf3, - PGTY_hugetlb = 0xf4, - PGTY_slab = 0xf5, - PGTY_zsmalloc = 0xf6, - PGTY_unaccepted = 0xf7, + PGTY_buddy = 0xf0, + PGTY_offline = 0xf1, + PGTY_table = 0xf2, + PGTY_guard = 0xf3, + PGTY_hugetlb = 0xf4, + PGTY_slab = 0xf5, + PGTY_zsmalloc = 0xf6, + PGTY_unaccepted = 0xf7, + PGTY_large_kmalloc = 0xf8, PGTY_mapcount_underflow = 0xff }; @@ -1075,6 +1076,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) * Serialized with zone lock. 
*/ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) +FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs diff --git a/mm/slub.c b/mm/slub.c index d94af020b305..3e6ab4986f8f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4255,6 +4255,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); + __folio_set_large_kmalloc(folio); } ptr = kasan_kmalloc_large(ptr, size, flags); @@ -4730,6 +4731,11 @@ static void free_large_kmalloc(struct folio *folio, void *object) { unsigned int order = folio_order(folio); + if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { + dump_page(&folio->page, "Not a kmalloc allocation"); + return; + } + if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); @@ -4739,6 +4745,7 @@ static void free_large_kmalloc(struct folio *folio, void *object) lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); + __folio_clear_large_kmalloc(folio); folio_put(folio); } From 939c5de3c70d145d7388db1b04d75cda79297c23 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 1 Mar 2025 16:37:20 +0800 Subject: [PATCH 13/14] mm/slab: call kmalloc_noprof() unconditionally in kmalloc_array_noprof() If 'n' or 'size' isn't a builtin constant, we used to call __kmalloc() before commit 7bd230a26648 ("mm/slab: enable slab allocation tagging for kmalloc and friends"), which inadvertently changed both paths to kmalloc_noprof(). As Harry Yoo points out we can just call kmalloc_noprof() unconditionally. If the compiler knows n and size are constants it doesn't guarantee that bytes will also be seen as constant, and that is the important test in kmalloc_noprof() anyway, so we can just defer to it always. 
[ vbabka@suse.cz: change as Harry suggested and adjust commit log ] Fixes: 7bd230a26648 ("mm/slab: enable slab allocation tagging for kmalloc and friends") Signed-off-by: Ye Bin Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..ab05a143d09a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -941,8 +941,6 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - if (__builtin_constant_p(n) && __builtin_constant_p(size)) - return kmalloc_noprof(bytes, flags); return kmalloc_noprof(bytes, flags); } #define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) From 747e2cf137f44058a093d3226bf83974d9d117e7 Mon Sep 17 00:00:00 2001 From: Lilith Gkini Date: Wed, 5 Mar 2025 17:48:39 +0200 Subject: [PATCH 14/14] slub: Handle freelist cycle in on_freelist() on_freelist() doesn't have a way to handle the edge case of having a full freelist that doesn't end in NULL and instead has another valid pointer in the slab as a result of a Use-After-Free or anything similar. This case won't get caught by check_valid_pointer() and it will result in nr incrementing to `slab->objects + 1`, corrupting the slab->inuse entry later in the code by setting it to -1. Add an if check to detect that case, report it and handle the freelist and slab appropriately, as is the standard process in these situations. Furthermore, change the return type of the function from int to bool as per coding style guidelines. Also move the `break;` line inside the `if (object) {` to make it more obvious that the code breaks the while loop in that branch. 
Signed-off-by: Lilith Persefoni Gkini Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3e6ab4986f8f..6493b26f08cf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1441,7 +1441,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. */ -static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) +static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; @@ -1451,26 +1451,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) fp = slab->freelist; while (fp && nr <= slab->objects) { if (fp == search) - return 1; + return true; if (!check_valid_pointer(s, slab, fp)) { if (object) { object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); + break; } else { slab_err(s, slab, "Freepointer corrupt"); slab->freelist = NULL; slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); - return 0; + return false; } - break; } object = fp; fp = get_freepointer(s, object); nr++; } + if (nr > slab->objects) { + slab_err(s, slab, "Freelist cycle detected"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return false; + } + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE;