From 81f88f6ab674973d361b6d176aa4d3ebd32253ab Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 3 Dec 2025 19:15:07 +0000 Subject: [PATCH 001/268] libbpf: Add debug messaging in dedup equivalence/identity matching We have seen a number of issues like [1]; failures to deduplicate key kernel data structures like task_struct. These are often hard to debug from pahole even with verbose output, especially when identity/equivalence checks fail deep in a nested struct comparison. Here we add debug messages of the form libbpf: STRUCT 'task_struct' size=2560 vlen=194 cand_id[54222] canon_id[102820] shallow-equal but not equiv for field#23 'sched_class': 0 These will be emitted during dedup from pahole when --verbose/-V is specified. This greatly helps identify exactly where dedup failures are experienced. [1] https://lore.kernel.org/bpf/b8e8b560-bce5-414b-846d-0da6d22a9983@oracle.com/ Changes since v1: - updated debug messages to refer to shallow-equal, added ids (Andrii) Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203191507.55565-1-alan.maguire@oracle.com --- tools/lib/bpf/btf.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 84a4b0abc8be..b136572e889a 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4431,11 +4431,14 @@ static bool btf_dedup_identical_types(struct btf_dedup *d, __u32 id1, __u32 id2, struct btf_type *t1, *t2; int k1, k2; recur: - if (depth <= 0) - return false; - t1 = btf_type_by_id(d->btf, id1); t2 = btf_type_by_id(d->btf, id2); + if (depth <= 0) { + pr_debug("Reached depth limit for identical type comparison for '%s'/'%s'\n", + btf__name_by_offset(d->btf, t1->name_off), + btf__name_by_offset(d->btf, t2->name_off)); + return false; + } k1 = btf_kind(t1); k2 = btf_kind(t2); @@ -4497,8 +4500,16 @@ static bool btf_dedup_identical_types(struct btf_dedup *d, __u32 id1, __u32 id2, for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) { if (m1->type == m2->type) continue; - if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1)) + if (!btf_dedup_identical_types(d, m1->type, m2->type, depth - 1)) { + if (t1->name_off) { + pr_debug("%s '%s' size=%d vlen=%d id1[%u] id2[%u] shallow-equal but not identical for field#%d '%s'\n", + k1 == BTF_KIND_STRUCT ? "STRUCT" : "UNION", + btf__name_by_offset(d->btf, t1->name_off), + t1->size, btf_vlen(t1), id1, id2, i, + btf__name_by_offset(d->btf, m1->name_off)); + } return false; + } } return true; } @@ -4739,8 +4750,16 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, canon_m = btf_members(canon_type); for (i = 0; i < vlen; i++) { eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type); - if (eq <= 0) + if (eq <= 0) { + if (cand_type->name_off) { + pr_debug("%s '%s' size=%d vlen=%d cand_id[%u] canon_id[%u] shallow-equal but not equiv for field#%d '%s': %d\n", + cand_kind == BTF_KIND_STRUCT ? "STRUCT" : "UNION", + btf__name_by_offset(d->btf, cand_type->name_off), + cand_type->size, vlen, cand_id, canon_id, i, + btf__name_by_offset(d->btf, cand_m->name_off), eq); + } return eq; + } cand_m++; canon_m++; } From 1588c81b9f215170bd596984691eda2f05a84a1f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:43 -0800 Subject: [PATCH 002/268] bpf: Allow verifier to fixup kernel module kfuncs Allow verifier to fixup kfuncs in kernel module to support kfuncs with __prog arguments. Currently, special kfuncs and kfuncs with __prog arguments are kernel kfuncs. Allowing kernel module kfuncs should not affect existing kfunc fixup as kernel module kfuncs have BTF IDs greater than kernel kfuncs' BTF IDs. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-2-ameryhung@gmail.com --- kernel/bpf/verifier.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f0ca69f888fa..bb7eca1025c3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22493,8 +22493,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (!bpf_jit_supports_far_kfunc_call()) insn->imm = BPF_CALL_IMM(desc->addr); - if (insn->off) - return 0; + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; From b5709f6d26d65f6bb9711f4b5f98469fd507cb5b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:44 -0800 Subject: [PATCH 003/268] bpf: Support associating BPF program with struct_ops Add a new BPF command BPF_PROG_ASSOC_STRUCT_OPS to allow associating a BPF program with a struct_ops map. This command takes a file descriptor of a struct_ops map and a BPF program and set prog->aux->st_ops_assoc to the kdata of the struct_ops map. The command does not accept a struct_ops program nor a non-struct_ops map. Programs of a struct_ops map is automatically associated with the map during map update. If a program is shared between two struct_ops maps, prog->aux->st_ops_assoc will be poisoned to indicate that the associated struct_ops is ambiguous. The pointer, once poisoned, cannot be reset since we have lost track of associated struct_ops. For other program types, the associated struct_ops map, once set, cannot be changed later. This restriction may be lifted in the future if there is a use case. A kernel helper bpf_prog_get_assoc_struct_ops() can be used to retrieve the associated struct_ops pointer. The returned pointer, if not NULL, is guaranteed to be valid and point to a fully updated struct_ops struct. For struct_ops program reused in multiple struct_ops map, the return will be NULL. prog->aux->st_ops_assoc is protected by bumping the refcount for non-struct_ops programs and RCU for struct_ops programs. Since it would be inefficient to track programs associated with a struct_ops map, every non-struct_ops program will bump the refcount of the map to make sure st_ops_assoc stays valid. For a struct_ops program, it is protected by RCU as map_free will wait for an RCU grace period before disassociating the program with the map. The helper must be called in BPF program context or RCU read-side critical section. struct_ops implementers should note that the struct_ops returned may not be initialized nor attached yet. The struct_ops implementer will be responsible for tracking and checking the state of the associated struct_ops map if the use case expects an initialized or attached struct_ops. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20251203233748.668365-3-ameryhung@gmail.com --- include/linux/bpf.h | 16 +++++++ include/uapi/linux/bpf.h | 17 +++++++ kernel/bpf/bpf_struct_ops.c | 88 ++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 3 ++ kernel/bpf/syscall.c | 46 ++++++++++++++++++ tools/include/uapi/linux/bpf.h | 17 +++++++ 6 files changed, 187 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6498be4c44f8..28d8d6b7bb1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1739,6 +1739,8 @@ struct bpf_prog_aux { struct rcu_head rcu; }; struct bpf_stream stream[2]; + struct mutex st_ops_assoc_mutex; + struct bpf_map __rcu *st_ops_assoc; }; struct bpf_prog { @@ -2041,6 +2043,9 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map); +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog); +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux); u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET @@ -2088,6 +2093,17 @@ static inline int bpf_struct_ops_link_create(union bpf_attr *attr) { return -EOPNOTSUPP; } +static inline int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + return -EOPNOTSUPP; +} +static inline void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ +} +static inline void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + return NULL; +} static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f8d8513eda27..84ced3ed2d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 278490683d28..c43346cb3d76 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -533,6 +533,17 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) } } +static void bpf_struct_ops_map_dissoc_progs(struct bpf_struct_ops_map *st_map) +{ + u32 i; + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + break; + bpf_prog_disassoc_struct_ops(st_map->links[i]->prog); + } +} + static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) { int i; @@ -801,6 +812,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } + /* Poison pointer on error instead of return for backward compatibility */ + bpf_prog_assoc_struct_ops(prog, &st_map->map); + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { bpf_prog_put(prog); @@ -980,6 +994,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); + bpf_struct_ops_map_dissoc_progs(st_map); + bpf_struct_ops_map_del_ksyms(st_map); /* The struct_ops's function may switch to another struct_ops. @@ -1396,6 +1412,78 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) return err; } +int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (st_ops_assoc && st_ops_assoc == map) + return 0; + + if (st_ops_assoc) { + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + return -EBUSY; + + rcu_assign_pointer(prog->aux->st_ops_assoc, BPF_PTR_POISON); + } else { + /* + * struct_ops map does not track associated non-struct_ops programs. + * Bump the refcount to make sure st_ops_assoc is always valid. + */ + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_inc(map); + + rcu_assign_pointer(prog->aux->st_ops_assoc, map); + } + + return 0; +} + +void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog) +{ + struct bpf_map *st_ops_assoc; + + guard(mutex)(&prog->aux->st_ops_assoc_mutex); + + st_ops_assoc = rcu_dereference_protected(prog->aux->st_ops_assoc, + lockdep_is_held(&prog->aux->st_ops_assoc_mutex)); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + bpf_map_put(st_ops_assoc); + + RCU_INIT_POINTER(prog->aux->st_ops_assoc, NULL); +} + +/* + * Get a reference to the struct_ops struct (i.e., kdata) associated with a + * program. Should only be called in BPF program context (e.g., in a kfunc). + * + * If the returned pointer is not NULL, it must points to a valid struct_ops. + * The struct_ops map is not guaranteed to be initialized nor attached. + * Kernel struct_ops implementers are responsible for tracking and checking + * the state of the struct_ops if the use case requires an initialized or + * attached struct_ops. + */ +void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux) +{ + struct bpf_struct_ops_map *st_map; + struct bpf_map *st_ops_assoc; + + st_ops_assoc = rcu_dereference_check(aux->st_ops_assoc, bpf_rcu_lock_held()); + if (!st_ops_assoc || st_ops_assoc == BPF_PTR_POISON) + return NULL; + + st_map = (struct bpf_struct_ops_map *)st_ops_assoc; + + return &st_map->kvalue.data; +} +EXPORT_SYMBOL_GPL(bpf_prog_get_assoc_struct_ops); + void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c8ae6ab31651..67226145a4db 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); + mutex_init(&fp->aux->st_ops_assoc_mutex); #ifdef CONFIG_BPF_SYSCALL bpf_prog_stream_init(fp); @@ -286,6 +287,7 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); + mutex_destroy(&fp->aux->st_ops_assoc_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } @@ -2896,6 +2898,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); + bpf_prog_disassoc_struct_ops(aux->prog); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6589acc89ef8..3080cc48bfc3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6122,6 +6122,49 @@ static int prog_stream_read(union bpf_attr *attr) return ret; } +#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd + +static int prog_assoc_struct_ops(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + int ret; + + if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS)) + return -EINVAL; + + if (attr->prog_assoc_struct_ops.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_prog; + } + + map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto put_prog; + } + + if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + ret = -EINVAL; + goto put_map; + } + + ret = bpf_prog_assoc_struct_ops(prog, map); + +put_map: + bpf_map_put(map); +put_prog: + bpf_prog_put(prog); + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -6261,6 +6304,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_PROG_STREAM_READ_BY_FD: err = prog_stream_read(&attr); break; + case BPF_PROG_ASSOC_STRUCT_OPS: + err = prog_assoc_struct_ops(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index be7d8e060e10..6b92b0847ec2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -918,6 +918,16 @@ union bpf_iter_link_info { * Number of bytes read from the stream on success, or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_PROG_ASSOC_STRUCT_OPS + * Description + * Associate a BPF program with a struct_ops map. The struct_ops + * map is identified by *map_fd* and the BPF program is + * identified by *prog_fd*. + * + * Return + * 0 on success or -1 if an error occurred (in which case, + * *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -974,6 +984,7 @@ enum bpf_cmd { BPF_PROG_BIND_MAP, BPF_TOKEN_CREATE, BPF_PROG_STREAM_READ_BY_FD, + BPF_PROG_ASSOC_STRUCT_OPS, __MAX_BPF_CMD, }; @@ -1894,6 +1905,12 @@ union bpf_attr { __u32 prog_fd; } prog_stream_read; + struct { + __u32 map_fd; + __u32 prog_fd; + __u32 flags; + } prog_assoc_struct_ops; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF From 87cd177b149a5d86103736994307c25999e0be4b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:45 -0800 Subject: [PATCH 004/268] libbpf: Add support for associating BPF program with struct_ops Add low-level wrapper and libbpf API for BPF_PROG_ASSOC_STRUCT_OPS command in the bpf() syscall. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-4-ameryhung@gmail.com --- tools/lib/bpf/bpf.c | 19 +++++++++++++++++++ tools/lib/bpf/bpf.h | 21 +++++++++++++++++++++ tools/lib/bpf/libbpf.c | 31 +++++++++++++++++++++++++++++++ tools/lib/bpf/libbpf.h | 16 ++++++++++++++++ tools/lib/bpf/libbpf.map | 2 ++ 5 files changed, 89 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index b66f5fbfbbb2..21b57a629916 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -1397,3 +1397,22 @@ int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len, err = sys_bpf(BPF_PROG_STREAM_READ_BY_FD, &attr, attr_sz); return libbpf_err_errno(err); } + +int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd, + struct bpf_prog_assoc_struct_ops_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_assoc_struct_ops); + union bpf_attr attr; + int err; + + if (!OPTS_VALID(opts, bpf_prog_assoc_struct_ops_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.prog_assoc_struct_ops.map_fd = map_fd; + attr.prog_assoc_struct_ops.prog_fd = prog_fd; + attr.prog_assoc_struct_ops.flags = OPTS_GET(opts, flags, 0); + + err = sys_bpf(BPF_PROG_ASSOC_STRUCT_OPS, &attr, attr_sz); + return libbpf_err_errno(err); +} diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index e983a3e40d61..1f9c28d27795 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -733,6 +733,27 @@ struct bpf_prog_stream_read_opts { LIBBPF_API int bpf_prog_stream_read(int prog_fd, __u32 stream_id, void *buf, __u32 buf_len, struct bpf_prog_stream_read_opts *opts); +struct bpf_prog_assoc_struct_ops_opts { + size_t sz; + __u32 flags; + size_t :0; +}; +#define bpf_prog_assoc_struct_ops_opts__last_field flags + +/** + * @brief **bpf_prog_assoc_struct_ops** associates a BPF program with a + * struct_ops map. + * + * @param prog_fd FD for the BPF program + * @param map_fd FD for the struct_ops map to be associated with the BPF program + * @param opts optional options, can be NULL + * + * @return 0 on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_assoc_struct_ops(int prog_fd, int map_fd, + struct bpf_prog_assoc_struct_ops_opts *opts); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 3dc8a8078815..c7c79014d46c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -14133,6 +14133,37 @@ int bpf_program__set_attach_target(struct bpf_program *prog, return 0; } +int bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map, + struct bpf_prog_assoc_struct_ops_opts *opts) +{ + int prog_fd, map_fd; + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't associate BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err(-EINVAL); + } + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + pr_warn("prog '%s': can't associate struct_ops program\n", prog->name); + return libbpf_err(-EINVAL); + } + + map_fd = bpf_map__fd(map); + if (map_fd < 0) { + pr_warn("map '%s': can't associate BPF map without FD (was it created?)\n", map->name); + return libbpf_err(-EINVAL); + } + + if (!bpf_map__is_struct_ops(map)) { + pr_warn("map '%s': can't associate non-struct_ops map\n", map->name); + return libbpf_err(-EINVAL); + } + + return bpf_prog_assoc_struct_ops(prog_fd, map_fd, opts); +} + int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz) { int err = 0, n, len, start, end = -1; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 65e68e964b89..e14d9e349f9c 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1006,6 +1006,22 @@ LIBBPF_API int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name); +struct bpf_prog_assoc_struct_ops_opts; /* defined in bpf.h */ + +/** + * @brief **bpf_program__assoc_struct_ops()** associates a BPF program with a + * struct_ops map. + * + * @param prog BPF program + * @param map struct_ops map to be associated with the BPF program + * @param opts optional options, can be NULL + * + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int +bpf_program__assoc_struct_ops(struct bpf_program *prog, struct bpf_map *map, + struct bpf_prog_assoc_struct_ops_opts *opts); + /** * @brief **bpf_object__find_map_by_name()** returns BPF map of * the given name, if it exists within the passed BPF object diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 8ed8749907d4..84fb90a016c9 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -451,4 +451,6 @@ LIBBPF_1.7.0 { global: bpf_map__set_exclusive_program; bpf_map__exclusive_program; + bpf_prog_assoc_struct_ops; + bpf_program__assoc_struct_ops; } LIBBPF_1.6.0; From 33a165f9c2c1b9ddceaaccc356ce841baf1a08a2 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:46 -0800 Subject: [PATCH 005/268] selftests/bpf: Test BPF_PROG_ASSOC_STRUCT_OPS command Test BPF_PROG_ASSOC_STRUCT_OPS command that associates a BPF program with a struct_ops. The test follows the same logic in commit ba7000f1c360 ("selftests/bpf: Test multi_st_ops and calling kfuncs from different programs"), but instead of using map id to identify a specific struct_ops, this test uses the new BPF command to associate a struct_ops with a program. The test consists of two sets of almost identical struct_ops maps and BPF programs associated with the map. Their only difference is the unique value returned by bpf_testmod_multi_st_ops::test_1(). The test first loads the programs and associates them with struct_ops maps. Then, it exercises the BPF programs. They will in turn call kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() to trigger test_1() of the associated struct_ops map, and then check if the right unique value is returned. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-5-ameryhung@gmail.com --- .../bpf/prog_tests/test_struct_ops_assoc.c | 72 ++++++++++++ .../selftests/bpf/progs/struct_ops_assoc.c | 105 ++++++++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod.c | 17 +++ .../bpf/test_kmods/bpf_testmod_kfunc.h | 1 + 4 files changed, 195 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c new file mode 100644 index 000000000000..1e24a4915524 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "struct_ops_assoc.skel.h" + +static void test_st_ops_assoc(void) +{ + struct struct_ops_assoc *skel = NULL; + int err, pid; + + skel = struct_ops_assoc__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc__open")) + goto out; + + /* cannot explicitly associate struct_ops program */ + err = bpf_program__assoc_struct_ops(skel->progs.test_1_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_ERR(err, "bpf_program__assoc_struct_ops(test_1_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)"); + + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(sys_enter_prog_b, st_ops_map_b)"); + + /* sys_enter_prog_a already associated with map_a */ + err = bpf_program__assoc_struct_ops(skel->progs.sys_enter_prog_a, + skel->maps.st_ops_map_b, NULL); + ASSERT_ERR(err, "bpf_program__assoc_struct_ops(sys_enter_prog_a, st_ops_map_b)"); + + err = struct_ops_assoc__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* run tracing prog that calls .test_1 and checks return */ + pid = getpid(); + skel->bss->test_pid = pid; + sys_gettid(); + skel->bss->test_pid = 0; + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + + /* run syscall_prog that calls .test_1 and checks return */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + +out: + struct_ops_assoc__destroy(skel); +} + +void test_struct_ops_assoc(void) +{ + if (test__start_subtest("st_ops_assoc")) + test_st_ops_assoc(); +} diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c new file mode 100644 index 000000000000..8f1097903e22 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +int test_pid; + +/* Programs associated with st_ops_map_a */ + +#define MAP_A_MAGIC 1234 +int test_err_a; + +SEC("struct_ops") +int BPF_PROG(test_1_a, struct st_ops_args *args) +{ + return MAP_A_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter_prog_a, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC("syscall") +int syscall_prog_a(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_a = { + .test_1 = (void *)test_1_a, +}; + +/* Programs associated with st_ops_map_b */ + +#define MAP_B_MAGIC 5678 +int test_err_b; + +SEC("struct_ops") +int BPF_PROG(test_1_b, struct st_ops_args *args) +{ + return MAP_B_MAGIC; +} + +SEC("tp_btf/sys_enter") +int BPF_PROG(sys_enter_prog_b, struct pt_regs *regs, long id) +{ + struct st_ops_args args = {}; + struct task_struct *task; + int ret; + + task = bpf_get_current_task_btf(); + if (!test_pid || task->pid != test_pid) + return 0; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_B_MAGIC) + test_err_b++; + + return 0; +} + +SEC("syscall") +int syscall_prog_b(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_B_MAGIC) + test_err_b++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_b = { + .test_1 = (void *)test_1_b, +}; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 1669a7eeda26..90c4b1a51de6 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1134,6 +1134,7 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) } __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog); BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) @@ -1176,6 +1177,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABL BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1637,6 +1639,7 @@ static struct bpf_testmod_multi_st_ops *multi_st_ops_find_nolock(u32 id) return NULL; } +/* Call test_1() of the struct_ops map identified by the id */ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) { struct bpf_testmod_multi_st_ops *st_ops; @@ -1652,6 +1655,20 @@ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) return ret; } +/* Call test_1() of the associated struct_ops map */ +int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) +{ + struct bpf_prog_aux *prog_aux = (struct bpf_prog_aux *)aux__prog; + struct bpf_testmod_multi_st_ops *st_ops; + int ret = -1; + + st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(prog_aux); + if (st_ops) + ret = st_ops->test_1(args); + + return ret; +} + static int multi_st_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_testmod_multi_st_ops *st_ops = diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 4df6fa6a92cb..2357a0340ffe 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -162,5 +162,6 @@ struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; +int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) __ksym; #endif /* _BPF_TESTMOD_KFUNC_H */ From 04fd12df4e05dd6fd3017b637f6fbc9da10b4e65 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:47 -0800 Subject: [PATCH 006/268] selftests/bpf: Test ambiguous associated struct_ops Add a test to make sure implicit struct_ops association does not break backward compatibility nor return incorrect struct_ops. struct_ops programs should still be allowed to be reused in different struct_ops map. The associated struct_ops map set implicitly however will be poisoned. Trying to read it through the helper bpf_prog_get_assoc_struct_ops() should result in a NULL pointer. While recursion of test_1() cannot happen due to the associated struct_ops being ambiguois, explicitly check for it to prevent stack overflow if the test regresses. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-6-ameryhung@gmail.com --- .../bpf/prog_tests/test_struct_ops_assoc.c | 38 ++++++++++ .../bpf/progs/struct_ops_assoc_reuse.c | 75 +++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c index 1e24a4915524..02173504f675 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -2,6 +2,7 @@ #include #include "struct_ops_assoc.skel.h" +#include "struct_ops_assoc_reuse.skel.h" static void test_st_ops_assoc(void) { @@ -65,8 +66,45 @@ static void test_st_ops_assoc(void) struct_ops_assoc__destroy(skel); } +static void test_st_ops_assoc_reuse(void) +{ + struct struct_ops_assoc_reuse *skel = NULL; + int err; + + skel = struct_ops_assoc_reuse__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_reuse__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_a, + skel->maps.st_ops_map_a, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_a, st_ops_map_a)"); + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog_b, + skel->maps.st_ops_map_b, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops(syscall_prog_b, st_ops_map_b)"); + + err = struct_ops_assoc_reuse__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* run syscall_prog that calls .test_1 and checks return */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_a), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog_b), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + ASSERT_EQ(skel->bss->test_err_a, 0, "skel->bss->test_err_a"); + ASSERT_EQ(skel->bss->test_err_b, 0, "skel->bss->test_err_b"); + +out: + struct_ops_assoc_reuse__destroy(skel); +} + void test_struct_ops_assoc(void) { if (test__start_subtest("st_ops_assoc")) test_st_ops_assoc(); + if (test__start_subtest("st_ops_assoc_reuse")) + test_st_ops_assoc_reuse(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c new file mode 100644 index 000000000000..5bb6ebf5eed4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +#define MAP_A_MAGIC 1234 +int test_err_a; +int recur; + +/* + * test_1_a is reused. The kfunc should not be able to get the associated + * struct_ops and call test_1 recursively as it is ambiguous. + */ +SEC("struct_ops") +int BPF_PROG(test_1_a, struct st_ops_args *args) +{ + int ret; + + if (!recur) { + recur++; + ret = bpf_kfunc_multi_st_ops_test_1_impl(args, NULL); + if (ret != -1) + test_err_a++; + recur--; + } + + return MAP_A_MAGIC; +} + +/* Programs associated with st_ops_map_a */ + +SEC("syscall") +int syscall_prog_a(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_a++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_a = { + .test_1 = (void *)test_1_a, +}; + +/* Programs associated with st_ops_map_b */ + +int test_err_b; + +SEC("syscall") +int syscall_prog_b(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_A_MAGIC) + test_err_b++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map_b = { + .test_1 = (void *)test_1_a, +}; From 0e841d19263ab6e1ca2b280109832f57624e48d1 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 3 Dec 2025 15:37:48 -0800 Subject: [PATCH 007/268] selftests/bpf: Test getting associated struct_ops in timer callback Make sure 1) a timer callback can also reference the associated struct_ops, and then make sure 2) the timer callback cannot get a dangled pointer to the struct_ops when the map is freed. The test schedules a timer callback from a struct_ops program since struct_ops programs do not pin the map. It is possible for the timer callback to run after the map is freed. The timer callback calls a kfunc that runs .test_1() of the associated struct_ops, which should return MAP_MAGIC when the map is still alive or -1 when the map is gone. The first subtest added in this patch schedules the timer callback to run immediately, while the map is still alive. The second subtest added schedules the callback to run 500ms after syscall_prog runs and then frees the map right after syscall_prog runs. Both subtests then wait until the callback runs to check the return of the kfunc. Signed-off-by: Amery Hung Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251203233748.668365-7-ameryhung@gmail.com --- .../bpf/prog_tests/test_struct_ops_assoc.c | 81 +++++++++++++++++++ .../bpf/progs/struct_ops_assoc_in_timer.c | 77 ++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c index 02173504f675..461ded722351 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c +++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_assoc.c @@ -3,6 +3,7 @@ #include #include "struct_ops_assoc.skel.h" #include "struct_ops_assoc_reuse.skel.h" +#include "struct_ops_assoc_in_timer.skel.h" static void test_st_ops_assoc(void) { @@ -101,10 +102,90 @@ static void test_st_ops_assoc_reuse(void) struct_ops_assoc_reuse__destroy(skel); } +static void test_st_ops_assoc_in_timer(void) +{ + struct struct_ops_assoc_in_timer *skel = NULL; + int err; + + skel = struct_ops_assoc_in_timer__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog, + skel->maps.st_ops_map, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops"); + + err = struct_ops_assoc_in_timer__attach(skel); + if (!ASSERT_OK(err, "struct_ops_assoc__attach")) + goto out; + + /* + * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks + * the return value. .test_1 will also schedule timer_cb that runs .test_1 again + * immediately. + */ + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + /* Check the return of the kfunc after timer_cb runs */ + while (!READ_ONCE(skel->bss->timer_cb_run)) + sched_yield(); + ASSERT_EQ(skel->bss->timer_test_1_ret, 1234, "skel->bss->timer_test_1_ret"); + ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a"); +out: + struct_ops_assoc_in_timer__destroy(skel); +} + +static void test_st_ops_assoc_in_timer_no_uref(void) +{ + struct struct_ops_assoc_in_timer *skel = NULL; + struct bpf_link *link; + int err; + + skel = struct_ops_assoc_in_timer__open_and_load(); + if (!ASSERT_OK_PTR(skel, "struct_ops_assoc_in_timer__open")) + goto out; + + err = bpf_program__assoc_struct_ops(skel->progs.syscall_prog, + skel->maps.st_ops_map, NULL); + ASSERT_OK(err, "bpf_program__assoc_struct_ops"); + + link = bpf_map__attach_struct_ops(skel->maps.st_ops_map); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) + goto out; + + /* + * Run .test_1 by calling kfunc bpf_kfunc_multi_st_ops_test_1_prog_arg() and checks + * the return value. .test_1 will also schedule timer_cb that runs .test_1 again. + * timer_cb will run 500ms after syscall_prog runs, when the user space no longer + * holds a reference to st_ops_map. + */ + skel->bss->timer_ns = 500000000; + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.syscall_prog), NULL); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + + /* Detach and close struct_ops map to cause it to be freed */ + bpf_link__destroy(link); + close(bpf_program__fd(skel->progs.syscall_prog)); + close(bpf_map__fd(skel->maps.st_ops_map)); + + /* Check the return of the kfunc after timer_cb runs */ + while (!READ_ONCE(skel->bss->timer_cb_run)) + sched_yield(); + ASSERT_EQ(skel->bss->timer_test_1_ret, -1, "skel->bss->timer_test_1_ret"); + ASSERT_EQ(skel->bss->test_err, 0, "skel->bss->test_err_a"); +out: + struct_ops_assoc_in_timer__destroy(skel); +} + void test_struct_ops_assoc(void) { if (test__start_subtest("st_ops_assoc")) test_st_ops_assoc(); if (test__start_subtest("st_ops_assoc_reuse")) test_st_ops_assoc_reuse(); + if (test__start_subtest("st_ops_assoc_in_timer")) + test_st_ops_assoc_in_timer(); + if (test__start_subtest("st_ops_assoc_in_timer_no_uref")) + test_st_ops_assoc_in_timer_no_uref(); } diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c new file mode 100644 index 000000000000..d5a2ea934284 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} array_map SEC(".maps"); + +#define MAP_MAGIC 1234 +int recur; +int test_err; +int timer_ns; +int timer_test_1_ret; +int timer_cb_run; + +__noinline static int timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + struct st_ops_args args = {}; + + recur++; + timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + recur--; + + timer_cb_run++; + + return 0; +} + +SEC("struct_ops") +int BPF_PROG(test_1, struct st_ops_args *args) +{ + struct bpf_timer *timer; + int key = 0; + + if (!recur) { + timer = bpf_map_lookup_elem(&array_map, &key); + if (!timer) + return 0; + + bpf_timer_init(timer, &array_map, 1); + bpf_timer_set_callback(timer, timer_cb); + bpf_timer_start(timer, timer_ns, 0); + } + + return MAP_MAGIC; +} + +SEC("syscall") +int syscall_prog(void *ctx) +{ + struct st_ops_args args = {}; + int ret; + + ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + if (ret != MAP_MAGIC) + test_err++; + + return 0; +} + +SEC(".struct_ops.link") +struct bpf_testmod_multi_st_ops st_ops_map = { + .test_1 = (void *)test_1, +}; From 48e11bad9a1fd803a22d25c9a7d3aced1ba87817 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Mon, 8 Dec 2025 22:14:31 +0900 Subject: [PATCH 008/268] bpf: cpumap: propagate underlying error in cpu_map_update_elem() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 9216477449f3 ("bpf: cpumap: Add the possibility to attach an eBPF program to cpumap"), __cpu_map_entry_alloc() may fail with errors other than -ENOMEM, such as -EBADF or -EINVAL. However, __cpu_map_entry_alloc() returns NULL on all failures, and cpu_map_update_elem() unconditionally converts this NULL into -ENOMEM. As a result, user space always receives -ENOMEM regardless of the actual underlying error. Examples of unexpected behavior: - Nonexistent fd : -ENOMEM (should be -EBADF) - Non-BPF fd : -ENOMEM (should be -EINVAL) - Bad attach type : -ENOMEM (should be -EINVAL) Change __cpu_map_entry_alloc() to return ERR_PTR(err) instead of NULL and have cpu_map_update_elem() propagate this error. Signed-off-by: Kohei Enju Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20251208131449.73036-2-enjuk@amazon.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumap.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 703e5df1f4ef..04171fbc39cb 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -430,7 +430,7 @@ static struct bpf_cpu_map_entry * __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, u32 cpu) { - int numa, err, i, fd = value->bpf_prog.fd; + int numa, err = -ENOMEM, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; struct xdp_bulk_queue *bq; @@ -440,7 +440,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa); if (!rcpu) - return NULL; + return ERR_PTR(err); /* Alloc percpu bulkq */ rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq), @@ -468,16 +468,21 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->value.qsize = value->qsize; gro_init(&rcpu->gro); - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) - goto free_ptr_ring; + if (fd > 0) { + err = __cpu_map_load_bpf_program(rcpu, map, fd); + if (err) + goto free_ptr_ring; + } /* Setup kthread */ init_completion(&rcpu->kthread_running); rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map->id); - if (IS_ERR(rcpu->kthread)) + if (IS_ERR(rcpu->kthread)) { + err = PTR_ERR(rcpu->kthread); goto free_prog; + } /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); @@ -503,7 +508,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, free_percpu(rcpu->bulkq); free_rcu: kfree(rcpu); - return NULL; + return ERR_PTR(err); } static void __cpu_map_entry_free(struct work_struct *work) @@ -596,8 +601,8 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); - if (!rcpu) - return -ENOMEM; + if (IS_ERR(rcpu)) + return PTR_ERR(rcpu); } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); From 18352f8fae91d23bbd7165a7b2a1f15c4f5beff8 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Mon, 8 Dec 2025 22:14:32 +0900 Subject: [PATCH 009/268] selftests/bpf: add tests for attaching invalid fd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add test cases for situations where adding the following types of file descriptors to a cpumap entry should fail: - Non-BPF file descriptor (expect -EINVAL) - Nonexistent file descriptor (expect -EBADF) Also tighten the assertion for the expected error when adding a non-BPF_XDP_CPUMAP program to a cpumap entry. Signed-off-by: Kohei Enju Reviewed-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20251208131449.73036-3-enjuk@amazon.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/xdp_cpumap_attach.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c index df27535995af..ad56e4370ce3 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c @@ -18,7 +18,7 @@ static void test_xdp_with_cpumap_helpers(void) struct bpf_cpumap_val val = { .qsize = 192, }; - int err, prog_fd, prog_redir_fd, map_fd; + int err, prog_fd, prog_redir_fd, map_fd, bad_fd; struct nstoken *nstoken = NULL; __u32 idx = 0; @@ -79,7 +79,22 @@ static void test_xdp_with_cpumap_helpers(void) val.qsize = 192; val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog); err = bpf_map_update_elem(map_fd, &idx, &val, 0); - ASSERT_NEQ(err, 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); + ASSERT_EQ(err, -EINVAL, "Add non-BPF_XDP_CPUMAP program to cpumap entry"); + + /* Try to attach non-BPF file descriptor */ + bad_fd = open("/dev/null", O_RDONLY); + ASSERT_GE(bad_fd, 0, "Open /dev/null for non-BPF fd"); + + val.bpf_prog.fd = bad_fd; + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + ASSERT_EQ(err, -EINVAL, "Add non-BPF fd to cpumap entry"); + + /* Try to attach nonexistent file descriptor */ + err = close(bad_fd); + ASSERT_EQ(err, 0, "Close non-BPF fd for nonexistent fd"); + + err = bpf_map_update_elem(map_fd, &idx, &val, 0); + ASSERT_EQ(err, -EBADF, "Add nonexistent fd to cpumap entry"); /* Try to attach BPF_XDP program with frags to cpumap when we have * already loaded a BPF_XDP program on the map From d18dec4b8990048ce75f0ece32bb96b3fbd3f422 Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 2 Dec 2025 18:02:19 +0000 Subject: [PATCH 010/268] bpf: verifier improvement in 32bit shift sign extension pattern This patch improves the verifier to correctly compute bounds for sign extension compiler pattern composed of left shift by 32bits followed by a sign right shift by 32bits. Pattern in the verifier was limitted to positive value bounds and would reset bound computation for negative values. New code allows both positive and negative values for sign extension without compromising bound computation and verifier to pass. This change is required by GCC which generate such pattern, and was detected in the context of systemd, as described in the following GCC bugzilla: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119731 Three new tests were added in verifier_subreg.c. Signed-off-by: Cupertino Miranda Signed-off-by: Andrew Pinski Acked-by: Eduard Zingerman Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20251202180220.11128-2-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb7eca1025c3..a31c032b2dd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15300,21 +15300,17 @@ static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { /* Special case <<32 because it is a common compiler pattern to sign - * extend subreg by doing <<32 s>>32. In this case if 32bit bounds are - * positive we know this shift will also be positive so we can track - * bounds correctly. Otherwise we lose all sign bit information except - * what we can pick up from var_off. Perhaps we can generalize this - * later to shifts of any length. + * extend subreg by doing <<32 s>>32. smin/smax assignments are correct + * because s32 bounds don't flip sign when shifting to the left by + * 32bits. */ - if (umin_val == 32 && umax_val == 32 && dst_reg->s32_max_value >= 0) + if (umin_val == 32 && umax_val == 32) { dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; - else - dst_reg->smax_value = S64_MAX; - - if (umin_val == 32 && umax_val == 32 && dst_reg->s32_min_value >= 0) dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; - else + } else { + dst_reg->smax_value = S64_MAX; dst_reg->smin_value = S64_MIN; + } /* If we might shift our top bit out, then we know nothing */ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { From a5b4867fad18e72fd5fc442c16be83723776283b Mon Sep 17 00:00:00 2001 From: Cupertino Miranda Date: Tue, 2 Dec 2025 18:02:20 +0000 Subject: [PATCH 011/268] selftests/bpf: add verifier sign extension bound computation tests. This commit adds 3 tests to verify a common compiler generated pattern for sign extension (r1 <<= 32; r1 s>>= 32). The tests make sure the register bounds are correctly computed both for positive and negative register values. Signed-off-by: Cupertino Miranda Signed-off-by: Andrew Pinski Acked-by: Eduard Zingerman Cc: David Faust Cc: Jose Marchesi Cc: Elena Zannoni Link: https://lore.kernel.org/r/20251202180220.11128-3-cupertino.miranda@oracle.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_subreg.c | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index 8613ea160dcd..b3e1c3eef9ae 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -531,6 +531,74 @@ __naked void arsh32_imm_zero_extend_check(void) : __clobber_all); } +SEC("socket") +__description("arsh32 imm sign positive extend check") +__success __retval(0) +__log_level(2) +__msg("2: (57) r6 &= 4095 ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))") +__msg("3: (67) r6 <<= 32 ; R6=scalar(smin=smin32=0,smax=umax=0xfff00000000,smax32=umax32=0,var_off=(0x0; 0xfff00000000))") +__msg("4: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=4095,var_off=(0x0; 0xfff))") +__naked void arsh32_imm_sign_extend_positive_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("arsh32 imm sign negative extend check") +__success __retval(0) +__log_level(2) +__msg("3: (17) r6 -= 4095 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff00100000000,smax=smax32=umax32=0,umax=0xffffffff00000000,smin32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-4095,smax=smax32=0)") +__naked void arsh32_imm_sign_extend_negative_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 -= 4095; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("arsh32 imm sign extend check") +__success __retval(0) +__log_level(2) +__msg("3: (17) r6 -= 2047 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") +__msg("4: (67) r6 <<= 32 ; R6=scalar(smin=0xfffff80100000000,smax=0x80000000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000))") +__msg("5: (c7) r6 s>>= 32 ; R6=scalar(smin=smin32=-2047,smax=smax32=2048)") +__naked void arsh32_imm_sign_extend_check(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r6 = r0; \ + r6 &= 4095; \ + r6 -= 2047; \ + r6 <<= 32; \ + r6 s>>= 32; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + SEC("socket") __description("end16 (to_le) reg zero extend check") __success __success_unpriv __retval(0) From 6f0b824a61f212e9707ff68abcabfdfa4724b811 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Sun, 7 Dec 2025 01:10:04 -0800 Subject: [PATCH 012/268] bpf: Fix bpf_seq_read docs for increased buffer size Commit af65320948b8 ("bpf: Bump iter seq size to support BTF representation of large data structures") increased the fixed buffer size from PAGE_SIZE to PAGE_SIZE << 3, but the docs for the function didn't get updated at the same time. Update them. Signed-off-by: T.J. Mercier Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251207091005.2829703-1-tjmercier@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index eec60b57bd3d..4b58d56ecab1 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -86,7 +86,7 @@ static bool bpf_iter_support_resched(struct seq_file *seq) /* bpf_seq_read, a customized and simpler version for bpf iterator. * The following are differences from seq_read(): - * . fixed buffer size (PAGE_SIZE) + * . fixed buffer size (PAGE_SIZE << 3) * . assuming NULL ->llseek() * . stop() may call bpf program, handling potential overflow there */ From 0355911ac021a424b18d2d746536d70b879cdeab Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:21 -0500 Subject: [PATCH 013/268] selftests/bpf: Explicitly account for globals in verifier_arena_large The big_alloc1 test in verifier_arena_large assumes that the arena base and the first page allocated by bpf_arena_alloc_pages are identical. This is not the case, because the first page in the arena is populated by global arena data. The test still passes because the code makes the tacit assumption that the first page is on offset PAGE_SIZE instead of 0. Make this distinction explicit in the code, and adjust the page offsets requested during the test to count from the beginning of the arena instead of using the address of the first allocated page. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251216173325.98465-2-emil@etsalapatis.com --- .../selftests/bpf/progs/verifier_arena_large.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index f19e15400b3e..bd430a34c3ab 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -23,18 +23,25 @@ int big_alloc1(void *ctx) { #if defined(__BPF_FEATURE_ADDR_SPACE_CAST) volatile char __arena *page1, *page2, *no_page, *page3; - void __arena *base; + u64 base; - page1 = base = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + base = (u64)arena_base(&arena); + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); if (!page1) return 1; + + /* Account for global arena data. */ + if ((u64)page1 != base + PAGE_SIZE) + return 15; + *page1 = 1; - page2 = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE * 2, + page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - PAGE_SIZE), 1, NUMA_NO_NODE, 0); if (!page2) return 2; *page2 = 2; - no_page = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE, + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE, 1, NUMA_NO_NODE, 0); if (no_page) return 3; From 12a1fe6e12dbad39f2f0dad1a385625f0298eff4 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:22 -0500 Subject: [PATCH 014/268] bpf/verifier: Do not limit maximum direct offset into arena map The verifier currently limits direct offsets into a map to 512MiB to avoid overflow during pointer arithmetic. However, this prevents arena maps from using direct addressing instructions to access data at the end of > 512MiB arena maps. This is necessary when moving arena globals to the end of the arena instead of the front. Refactor the verifier code to remove the offset calculation during direct value access calculations. This is possible because the only two map types that implement .map_direct_value_addr() are arrays and arenas, and they both do their own internal checks to ensure the offset is within bounds. Adjust selftests that expect the old error. These tests still fail because the verifier identifies the access as out of bounds for the map, so change them to expect an "invalid access to map value pointer" error instead. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251216173325.98465-3-emil@etsalapatis.com --- kernel/bpf/verifier.c | 5 ----- tools/testing/selftests/bpf/verifier/direct_value_access.c | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a31c032b2dd6..d6b8a77fbe3b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21132,11 +21132,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) } else { u32 off = insn[1].imm; - if (off >= BPF_MAX_VAR_OFF) { - verbose(env, "direct value offset of %u is not allowed\n", off); - return -EINVAL; - } - if (!map->ops->map_direct_value_addr) { verbose(env, "no direct value access support for this map type\n"); return -EINVAL; diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c index c0648dc009b5..e569d119fb60 100644 --- a/tools/testing/selftests/bpf/verifier/direct_value_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c @@ -81,7 +81,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 4294967295 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=4294967295", }, { "direct map access, write test 8", @@ -141,7 +141,7 @@ }, .fixup_map_array_48b = { 1 }, .result = REJECT, - .errstr = "direct value offset of 536870912 is not allowed", + .errstr = "invalid access to map value pointer, value_size=48 off=536870912", }, { "direct map access, write test 13", From 0aa721437e4b74d737f58582f1bbf2eea3e038c7 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:23 -0500 Subject: [PATCH 015/268] libbpf: Turn relo_core->sym_off unsigned The symbols' relocation offsets in BPF are stored in an int field, but cannot actually be negative. When in the next patch libbpf relocates globals to the end of the arena, it is also possible to have valid offsets > 2GiB that are used to calculate the final relo offsets. Avoid accidentally interpreting large offsets as negative by turning the sym_off field unsigned. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251216173325.98465-4-emil@etsalapatis.com --- tools/lib/bpf/libbpf.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index c7c79014d46c..4d4badb64824 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -380,7 +380,7 @@ struct reloc_desc { const struct bpf_core_relo *core_relo; /* used when type == RELO_CORE */ struct { int map_idx; - int sym_off; + unsigned int sym_off; /* * The following two fields can be unionized, as the * ext_idx field is used for extern symbols, and the @@ -763,7 +763,7 @@ struct bpf_object { struct { struct bpf_program *prog; - int sym_off; + unsigned int sym_off; int fd; } *jumptable_maps; size_t jumptable_map_cnt; @@ -6192,7 +6192,7 @@ static void poison_kfunc_call(struct bpf_program *prog, int relo_idx, insn->imm = POISON_CALL_KFUNC_BASE + ext_idx; } -static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off) +static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off) { size_t i; @@ -6210,7 +6210,7 @@ static int find_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym return -ENOENT; } -static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, int sym_off, int map_fd) +static int add_jt_map(struct bpf_object *obj, struct bpf_program *prog, unsigned int sym_off, int map_fd) { size_t cnt = obj->jumptable_map_cnt; size_t size = sizeof(obj->jumptable_maps[0]); @@ -6244,7 +6244,7 @@ static int find_subprog_idx(struct bpf_program *prog, int insn_idx) static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struct reloc_desc *relo) { const __u32 jt_entry_size = 8; - int sym_off = relo->sym_off; + unsigned int sym_off = relo->sym_off; int jt_size = relo->sym_size; __u32 max_entries = jt_size / jt_entry_size; __u32 value_size = sizeof(struct bpf_insn_array_value); @@ -6260,7 +6260,7 @@ static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struc return map_fd; if (sym_off % jt_entry_size) { - pr_warn("map '.jumptables': jumptable start %d should be multiple of %u\n", + pr_warn("map '.jumptables': jumptable start %u should be multiple of %u\n", sym_off, jt_entry_size); return -EINVAL; } @@ -6316,7 +6316,7 @@ static int create_jt_map(struct bpf_object *obj, struct bpf_program *prog, struc * should contain values that fit in u32. */ if (insn_off > UINT32_MAX) { - pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %d\n", + pr_warn("map '.jumptables': invalid jump table value 0x%llx at offset %u\n", (long long)jt[i], sym_off + i * jt_entry_size); err = -EINVAL; goto err_close; From c1f61171d44b19834cf24def2cf832f2688e83df Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:24 -0500 Subject: [PATCH 016/268] libbpf: Move arena globals to the end of the arena Arena globals are currently placed at the beginning of the arena by libbpf. This is convenient, but prevents users from reserving guard pages in the beginning of the arena to identify NULL pointer dereferences. Adjust the load logic to place the globals at the end of the arena instead. Also modify bpftool to set the arena pointer in the program's BPF skeleton to point to the globals. Users now call bpf_map__initial_value() to find the beginning of the arena mapping and use the arena pointer in the skeleton to determine which part of the mapping holds the arena globals and which part is free. Suggested-by: Andrii Nakryiko Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251216173325.98465-5-emil@etsalapatis.com --- tools/lib/bpf/libbpf.c | 17 +++++++++++++---- .../selftests/bpf/progs/verifier_arena_large.c | 12 +++++++++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4d4badb64824..6fba879492a8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -757,6 +757,7 @@ struct bpf_object { int arena_map_idx; void *arena_data; size_t arena_data_sz; + size_t arena_data_off; void *jumptables_data; size_t jumptables_data_sz; @@ -2991,10 +2992,11 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, void *data, size_t data_sz) { const long page_sz = sysconf(_SC_PAGE_SIZE); + const size_t data_alloc_sz = roundup(data_sz, page_sz); size_t mmap_sz; mmap_sz = bpf_map_mmap_sz(map); - if (roundup(data_sz, page_sz) > mmap_sz) { + if (data_alloc_sz > mmap_sz) { pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n", sec_name, mmap_sz, data_sz); return -E2BIG; @@ -3006,6 +3008,9 @@ static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, memcpy(obj->arena_data, data, data_sz); obj->arena_data_sz = data_sz; + /* place globals at the end of the arena */ + obj->arena_data_off = mmap_sz - data_alloc_sz; + /* make bpf_map__init_value() work for ARENA maps */ map->mmaped = obj->arena_data; @@ -4663,7 +4668,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->type = RELO_DATA; reloc_desc->insn_idx = insn_idx; reloc_desc->map_idx = obj->arena_map_idx; - reloc_desc->sym_off = sym->st_value; + reloc_desc->sym_off = sym->st_value + obj->arena_data_off; map = &obj->maps[obj->arena_map_idx]; pr_debug("prog '%s': found arena map %d (%s, sec %d, off %zu) for insn %u\n", @@ -5624,7 +5629,8 @@ bpf_object__create_maps(struct bpf_object *obj) return err; } if (obj->arena_data) { - memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz); + memcpy(map->mmaped + obj->arena_data_off, obj->arena_data, + obj->arena_data_sz); zfree(&obj->arena_data); } } @@ -14429,7 +14435,10 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) if (!map_skel->mmaped) continue; - *map_skel->mmaped = map->mmaped; + if (map->def.type == BPF_MAP_TYPE_ARENA) + *map_skel->mmaped = map->mmaped + map->obj->arena_data_off; + else + *map_skel->mmaped = map->mmaped; } return 0; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index bd430a34c3ab..2b8cf2a4d880 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -31,16 +31,22 @@ int big_alloc1(void *ctx) if (!page1) return 1; - /* Account for global arena data. */ - if ((u64)page1 != base + PAGE_SIZE) + if ((u64)page1 != base) return 15; *page1 = 1; - page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - PAGE_SIZE), + page2 = bpf_arena_alloc_pages(&arena, (void __arena *)(ARENA_SIZE - 2 * PAGE_SIZE), 1, NUMA_NO_NODE, 0); if (!page2) return 2; *page2 = 2; + + /* Test for the guard region at the end of the arena. */ + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE - PAGE_SIZE, + 1, NUMA_NO_NODE, 0); + if (no_page) + return 16; + no_page = bpf_arena_alloc_pages(&arena, (void __arena *)ARENA_SIZE, 1, NUMA_NO_NODE, 0); if (no_page) From 19f12431b6c339416e656c794a26ff0ebb2dba56 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 16 Dec 2025 12:33:25 -0500 Subject: [PATCH 017/268] selftests/bpf: Add tests for the arena offset of globals Add tests for the new libbpf globals arena offset logic. The tests cover the case of globals being as large as the arena itself, and being smaller than the arena. In that case, the data is placed at the end of the arena, and the beginning of the arena is free. Signed-off-by: Emil Tsalapatis Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251216173325.98465-6-emil@etsalapatis.com --- .../selftests/bpf/prog_tests/verifier.c | 4 + .../bpf/progs/verifier_arena_globals1.c | 87 +++++++++++++++++++ .../bpf/progs/verifier_arena_globals2.c | 49 +++++++++++ 3 files changed, 140 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena_globals1.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena_globals2.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 4b4b081b46cc..5829ffd70f8f 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -6,6 +6,8 @@ #include "verifier_and.skel.h" #include "verifier_arena.skel.h" #include "verifier_arena_large.skel.h" +#include "verifier_arena_globals1.skel.h" +#include "verifier_arena_globals2.skel.h" #include "verifier_array_access.skel.h" #include "verifier_async_cb_context.skel.h" #include "verifier_basic_stack.skel.h" @@ -147,6 +149,8 @@ static void run_tests_aux(const char *skel_name, void test_verifier_and(void) { RUN(verifier_and); } void test_verifier_arena(void) { RUN(verifier_arena); } void test_verifier_arena_large(void) { RUN(verifier_arena_large); } +void test_verifier_arena_globals1(void) { RUN(verifier_arena_globals1); } +void test_verifier_arena_globals2(void) { RUN(verifier_arena_globals2); } void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); } void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); } void test_verifier_bounds(void) { RUN(verifier_bounds); } diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c new file mode 100644 index 000000000000..14afef3d6442 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#define BPF_NO_KFUNC_PROTOTYPES +#include +#include +#include +#include "bpf_experimental.h" +#include "bpf_arena_common.h" +#include "bpf_misc.h" + +#define ARENA_PAGES (1UL<< (32 - 12)) +#define GLOBAL_PAGES (16) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#else + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#endif +} arena SEC(".maps"); + +/* + * Global data, to be placed at the end of the arena. + */ +volatile char __arena global_data[GLOBAL_PAGES][PAGE_SIZE]; + +SEC("syscall") +__success __retval(0) +int check_reserve1(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + const u8 magic = 0x5a; + __u8 __arena *guard, *globals; + volatile char __arena *ptr; + int i; + int ret; + + guard = (void __arena *)arena_base(&arena); + globals = (void __arena *)(arena_base(&arena) + (ARENA_PAGES - GLOBAL_PAGES) * PAGE_SIZE); + + /* Reserve the region we've offset the globals by. */ + ret = bpf_arena_reserve_pages(&arena, guard, ARENA_PAGES - GLOBAL_PAGES); + if (ret) + return 1; + + /* Make sure the globals are in the expected offset. */ + ret = bpf_arena_reserve_pages(&arena, globals, 1); + if (!ret) + return 2; + + /* Verify globals are properly mapped in by libbpf. */ + for (i = 0; i < GLOBAL_PAGES; i++) { + ptr = &global_data[i][PAGE_SIZE / 2]; + + *ptr = magic; + if (*ptr != magic) + return i + 3; + } +#endif + return 0; +} + +/* + * Relocation check by reading directly into the global data w/o using symbols. + */ +SEC("syscall") +__success __retval(0) +int check_relocation(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + const u8 magic = 0xfa; + u8 __arena *ptr; + + global_data[GLOBAL_PAGES - 1][PAGE_SIZE / 2] = magic; + ptr = (u8 __arena *)((u64)(ARENA_PAGES * PAGE_SIZE - PAGE_SIZE / 2)); + if (*ptr != magic) + return 1; + +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c new file mode 100644 index 000000000000..e6bd7b61f9f1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals2.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#define BPF_NO_KFUNC_PROTOTYPES +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_experimental.h" +#include "bpf_arena_common.h" + +#define ARENA_PAGES (32) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_PAGES); +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#else + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * ARENA_PAGES + 1)); +#endif +} arena SEC(".maps"); + +/* + * Fill the entire arena with global data. + * The offset into the arena should be 0. + */ +char __arena global_data[ARENA_PAGES][PAGE_SIZE]; + +SEC("syscall") +__success __retval(0) +int check_reserve2(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + void __arena *guard; + int ret; + + guard = (void __arena *)arena_base(&arena); + + /* Make sure the data at offset 0 case is properly handled. */ + ret = bpf_arena_reserve_pages(&arena, guard, 1); + if (!ret) + return 1; +#endif + return 0; +} + +char _license[] SEC("license") = "GPL"; From c1c7d61746f42154bad30bc4adb88f5374969408 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:13:14 -0800 Subject: [PATCH 018/268] resolve_btfids: Rename object btf field to btf_path Rename the member of `struct object` holding the path to BTF data if provided via --btf arg. `btf_path` is less ambiguous. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181321.1283664-2-ihor.solodrai@linux.dev --- tools/bpf/resolve_btfids/main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index d47191c6e55e..164f0c941f04 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -113,7 +113,7 @@ struct btf_id { struct object { const char *path; - const char *btf; + const char *btf_path; const char *base_btf_path; struct { @@ -550,11 +550,11 @@ static int symbols_resolve(struct object *obj) } } - btf = btf__parse_split(obj->btf ?: obj->path, base_btf); + btf = btf__parse_split(obj->btf_path ?: obj->path, base_btf); err = libbpf_get_error(btf); if (err) { pr_err("FAILED: load BTF from %s: %s\n", - obj->btf ?: obj->path, strerror(-err)); + obj->btf_path ?: obj->path, strerror(-err)); goto out; } @@ -790,8 +790,8 @@ int main(int argc, const char **argv) struct option btfid_options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show errors, etc)"), - OPT_STRING(0, "btf", &obj.btf, "BTF data", - "BTF data"), + OPT_STRING(0, "btf", &obj.btf_path, "file", + "path to a file with input BTF data"), OPT_STRING('b', "btf_base", &obj.base_btf_path, "file", "path of file providing base BTF"), OPT_BOOLEAN(0, "fatal_warnings", &fatal_warnings, From 5f347a0f781a36927a547904dde4c4dac343010b Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:13:15 -0800 Subject: [PATCH 019/268] resolve_btfids: Factor out load_btf() Increase the lifetime of parsed BTF in resolve_btfids by factoring load_btf() routine out of symbols_resolve() and storing the base_btf and btf pointers in the struct object. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181321.1283664-3-ihor.solodrai@linux.dev --- tools/bpf/resolve_btfids/main.c | 47 ++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 164f0c941f04..b4caae1170dd 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -116,6 +116,9 @@ struct object { const char *btf_path; const char *base_btf_path; + struct btf *btf; + struct btf *base_btf; + struct { int fd; Elf *elf; @@ -529,16 +532,10 @@ static int symbols_collect(struct object *obj) return 0; } -static int symbols_resolve(struct object *obj) +static int load_btf(struct object *obj) { - int nr_typedefs = obj->nr_typedefs; - int nr_structs = obj->nr_structs; - int nr_unions = obj->nr_unions; - int nr_funcs = obj->nr_funcs; - struct btf *base_btf = NULL; - int err, type_id; - struct btf *btf; - __u32 nr_types; + struct btf *base_btf = NULL, *btf = NULL; + int err; if (obj->base_btf_path) { base_btf = btf__parse(obj->base_btf_path, NULL); @@ -546,7 +543,7 @@ static int symbols_resolve(struct object *obj) if (err) { pr_err("FAILED: load base BTF from %s: %s\n", obj->base_btf_path, strerror(-err)); - return -1; + goto out_err; } } @@ -555,9 +552,30 @@ static int symbols_resolve(struct object *obj) if (err) { pr_err("FAILED: load BTF from %s: %s\n", obj->btf_path ?: obj->path, strerror(-err)); - goto out; + goto out_err; } + obj->base_btf = base_btf; + obj->btf = btf; + + return 0; + +out_err: + btf__free(base_btf); + btf__free(btf); + return err; +} + +static int symbols_resolve(struct object *obj) +{ + int nr_typedefs = obj->nr_typedefs; + int nr_structs = obj->nr_structs; + int nr_unions = obj->nr_unions; + int nr_funcs = obj->nr_funcs; + struct btf *btf = obj->btf; + int err, type_id; + __u32 nr_types; + err = -1; nr_types = btf__type_cnt(btf); @@ -615,8 +633,6 @@ static int symbols_resolve(struct object *obj) err = 0; out: - btf__free(base_btf); - btf__free(btf); return err; } @@ -824,6 +840,9 @@ int main(int argc, const char **argv) if (symbols_collect(&obj)) goto out; + if (load_btf(&obj)) + goto out; + if (symbols_resolve(&obj)) goto out; @@ -833,6 +852,8 @@ int main(int argc, const char **argv) if (!(fatal_warnings && warnings)) err = 0; out: + btf__free(obj.base_btf); + btf__free(obj.btf); if (obj.efile.elf) { elf_end(obj.efile.elf); close(obj.efile.fd); From a4fa885bd52d1711994ad1ef99e989977cd15698 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:13:16 -0800 Subject: [PATCH 020/268] resolve_btfids: Introduce enum btf_id_kind Instead of using multiple flags, make struct btf_id tagged with an enum value indicating its kind in the context of resolve_btfids. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181321.1283664-4-ihor.solodrai@linux.dev --- tools/bpf/resolve_btfids/main.c | 83 ++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 23 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index b4caae1170dd..e721e20a2bbd 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -98,6 +98,13 @@ # error "Unknown machine endianness!" #endif +enum btf_id_kind { + BTF_ID_KIND_NONE, + BTF_ID_KIND_SYM, + BTF_ID_KIND_SET, + BTF_ID_KIND_SET8 +}; + struct btf_id { struct rb_node rb_node; char *name; @@ -105,9 +112,8 @@ struct btf_id { int id; int cnt; }; + enum btf_id_kind kind; int addr_cnt; - bool is_set; - bool is_set8; Elf64_Addr addr[ADDR_CNT]; }; @@ -197,8 +203,10 @@ static struct btf_id *btf_id__find(struct rb_root *root, const char *name) return NULL; } -static struct btf_id * -btf_id__add(struct rb_root *root, char *name, bool unique) +static struct btf_id *__btf_id__add(struct rb_root *root, + char *name, + enum btf_id_kind kind, + bool unique) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -221,12 +229,23 @@ btf_id__add(struct rb_root *root, char *name, bool unique) if (id) { pr_debug("adding symbol %s\n", name); id->name = name; + id->kind = kind; rb_link_node(&id->rb_node, parent, p); rb_insert_color(&id->rb_node, root); } return id; } +static inline struct btf_id *btf_id__add(struct rb_root *root, char *name, enum btf_id_kind kind) +{ + return __btf_id__add(root, name, kind, false); +} + +static inline struct btf_id *btf_id__add_unique(struct rb_root *root, char *name, enum btf_id_kind kind) +{ + return __btf_id__add(root, name, kind, true); +} + static char *get_id(const char *prefix_end) { /* @@ -260,22 +279,36 @@ static char *get_id(const char *prefix_end) return id; } -static struct btf_id *add_set(struct object *obj, char *name, bool is_set8) +static struct btf_id *add_set(struct object *obj, char *name, enum btf_id_kind kind) { + int len = strlen(name); + int prefixlen; + char *id; + /* * __BTF_ID__set__name * name = ^ * id = ^ */ - char *id = name + (is_set8 ? sizeof(BTF_SET8 "__") : sizeof(BTF_SET "__")) - 1; - int len = strlen(name); + switch (kind) { + case BTF_ID_KIND_SET: + prefixlen = sizeof(BTF_SET "__") - 1; + break; + case BTF_ID_KIND_SET8: + prefixlen = sizeof(BTF_SET8 "__") - 1; + break; + default: + pr_err("Unexpected kind %d passed to %s() for symbol %s\n", kind, __func__, name); + return NULL; + } + id = name + prefixlen; if (id >= name + len) { pr_err("FAILED to parse set name: %s\n", name); return NULL; } - return btf_id__add(&obj->sets, id, true); + return btf_id__add_unique(&obj->sets, id, kind); } static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) @@ -288,7 +321,7 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return NULL; } - return btf_id__add(root, id, false); + return btf_id__add(root, id, BTF_ID_KIND_SYM); } /* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */ @@ -491,35 +524,31 @@ static int symbols_collect(struct object *obj) id = add_symbol(&obj->funcs, prefix, sizeof(BTF_FUNC) - 1); /* set8 */ } else if (!strncmp(prefix, BTF_SET8, sizeof(BTF_SET8) - 1)) { - id = add_set(obj, prefix, true); + id = add_set(obj, prefix, BTF_ID_KIND_SET8); /* * SET8 objects store list's count, which is encoded * in symbol's size, together with 'cnt' field hence * that - 1. */ - if (id) { + if (id) id->cnt = sym.st_size / sizeof(uint64_t) - 1; - id->is_set8 = true; - } /* set */ } else if (!strncmp(prefix, BTF_SET, sizeof(BTF_SET) - 1)) { - id = add_set(obj, prefix, false); + id = add_set(obj, prefix, BTF_ID_KIND_SET); /* * SET objects store list's count, which is encoded * in symbol's size, together with 'cnt' field hence * that - 1. */ - if (id) { + if (id) id->cnt = sym.st_size / sizeof(int) - 1; - id->is_set = true; - } } else { pr_err("FAILED unsupported prefix %s\n", prefix); return -1; } if (!id) - return -ENOMEM; + return -EINVAL; if (id->addr_cnt >= ADDR_CNT) { pr_err("FAILED symbol %s crossed the number of allowed lists\n", @@ -643,7 +672,7 @@ static int id_patch(struct object *obj, struct btf_id *id) int i; /* For set, set8, id->id may be 0 */ - if (!id->id && !id->is_set && !id->is_set8) { + if (!id->id && id->kind != BTF_ID_KIND_SET && id->kind != BTF_ID_KIND_SET8) { pr_err("WARN: resolve_btfids: unresolved symbol %s\n", id->name); warnings++; } @@ -696,6 +725,7 @@ static int sets_patch(struct object *obj) { Elf_Data *data = obj->efile.idlist; struct rb_node *next; + int cnt; next = rb_first(&obj->sets); while (next) { @@ -715,11 +745,15 @@ static int sets_patch(struct object *obj) return -1; } - if (id->is_set) { + switch (id->kind) { + case BTF_ID_KIND_SET: set = data->d_buf + off; + cnt = set->cnt; qsort(set->ids, set->cnt, sizeof(set->ids[0]), cmp_id); - } else { + break; + case BTF_ID_KIND_SET8: set8 = data->d_buf + off; + cnt = set8->cnt; /* * Make sure id is at the beginning of the pairs * struct, otherwise the below qsort would not work. @@ -744,10 +778,13 @@ static int sets_patch(struct object *obj) bswap_32(set8->pairs[i].flags); } } + break; + default: + pr_err("Unexpected btf_id_kind %d for set '%s'\n", id->kind, id->name); + return -1; } - pr_debug("sorting addr %5lu: cnt %6d [%s]\n", - off, id->is_set ? set->cnt : set8->cnt, id->name); + pr_debug("sorting addr %5lu: cnt %6d [%s]\n", off, cnt, id->name); next = rb_next(next); } From fb348d4fdf5ec08aea8b1f686136584f758e4364 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:13:17 -0800 Subject: [PATCH 021/268] resolve_btfids: Always build with -Wall -Werror resolve_btfids builds without compiler warnings currently, so let's enforce this for future changes with '-Wall -Werror' flags [1]. [1] https://lore.kernel.org/bpf/1957a60b-6c45-42a7-b525-a6e335a735ff@linux.dev/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20251219181321.1283664-5-ihor.solodrai@linux.dev --- tools/bpf/resolve_btfids/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/bpf/resolve_btfids/Makefile b/tools/bpf/resolve_btfids/Makefile index ce1b556dfa90..1733a6e93a07 100644 --- a/tools/bpf/resolve_btfids/Makefile +++ b/tools/bpf/resolve_btfids/Makefile @@ -70,7 +70,8 @@ HOSTCFLAGS_resolve_btfids += -g \ -I$(srctree)/tools/include/uapi \ -I$(LIBBPF_INCLUDE) \ -I$(SUBCMD_INCLUDE) \ - $(LIBELF_FLAGS) + $(LIBELF_FLAGS) \ + -Wall -Werror LIBS = $(LIBELF_LIBS) -lz From 90e5b38a26529391da2d851a9cc5eb9de2ec35ca Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:13:18 -0800 Subject: [PATCH 022/268] kbuild: Sync kconfig when PAHOLE_VERSION changes This patch implements kconfig re-sync when the pahole version changes between builds, similar to how it happens for compiler version change via CC_VERSION_TEXT. Define PAHOLE_VERSION in the top-level Makefile and export it for config builds. Set CONFIG_PAHOLE_VERSION default to the exported variable. Kconfig records the PAHOLE_VERSION value in include/config/auto.conf.cmd [1]. The Makefile includes auto.conf.cmd, so if PAHOLE_VERSION changes between builds, make detects a dependency change and triggers syncconfig to update the kconfig [2]. For external module builds, add a warning message in the prepare target, similar to the existing compiler version mismatch warning. Note that if pahole is not installed or available, PAHOLE_VERSION is set to 0 by pahole-version.sh, so the (un)installation of pahole is treated as a version change. See previous discussions for context [3]. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/kconfig/preprocess.c?h=v6.18#n91 [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Makefile?h=v6.18#n815 [3] https://lore.kernel.org/bpf/8f946abf-dd88-4fac-8bb4-84fcd8d81cf0@oracle.com/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Nicolas Schier Tested-by: Alan Maguire Reviewed-by: Nicolas Schier Link: https://lore.kernel.org/bpf/20251219181321.1283664-6-ihor.solodrai@linux.dev --- Makefile | 15 +++++++++++---- init/Kconfig | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index e404e4767944..18adf5502244 100644 --- a/Makefile +++ b/Makefile @@ -708,11 +708,12 @@ endif # The expansion should be delayed until arch/$(SRCARCH)/Makefile is included. # Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile. -# CC_VERSION_TEXT and RUSTC_VERSION_TEXT are referenced from Kconfig (so they -# need export), and from include/config/auto.conf.cmd to detect the compiler -# upgrade. +# CC_VERSION_TEXT, RUSTC_VERSION_TEXT and PAHOLE_VERSION are referenced from +# Kconfig (so they need export), and from include/config/auto.conf.cmd to +# detect the version changes between builds. CC_VERSION_TEXT = $(subst $(pound),,$(shell LC_ALL=C $(CC) --version 2>/dev/null | head -n 1)) RUSTC_VERSION_TEXT = $(subst $(pound),,$(shell $(RUSTC) --version 2>/dev/null)) +PAHOLE_VERSION = $(shell $(srctree)/scripts/pahole-version.sh $(PAHOLE)) ifneq ($(findstring clang,$(CC_VERSION_TEXT)),) include $(srctree)/scripts/Makefile.clang @@ -733,7 +734,7 @@ ifdef config-build # KBUILD_DEFCONFIG may point out an alternative default configuration # used for 'make defconfig' include $(srctree)/arch/$(SRCARCH)/Makefile -export KBUILD_DEFCONFIG KBUILD_KCONFIG CC_VERSION_TEXT RUSTC_VERSION_TEXT +export KBUILD_DEFCONFIG KBUILD_KCONFIG CC_VERSION_TEXT RUSTC_VERSION_TEXT PAHOLE_VERSION config: outputmakefile scripts_basic FORCE $(Q)$(MAKE) $(build)=scripts/kconfig $@ @@ -1921,12 +1922,18 @@ clean: private rm-files := Module.symvers modules.nsdeps compile_commands.json PHONY += prepare # now expand this into a simple variable to reduce the cost of shell evaluations prepare: CC_VERSION_TEXT := $(CC_VERSION_TEXT) +prepare: PAHOLE_VERSION := $(PAHOLE_VERSION) prepare: @if [ "$(CC_VERSION_TEXT)" != "$(CONFIG_CC_VERSION_TEXT)" ]; then \ echo >&2 "warning: the compiler differs from the one used to build the kernel"; \ echo >&2 " The kernel was built by: $(CONFIG_CC_VERSION_TEXT)"; \ echo >&2 " You are using: $(CC_VERSION_TEXT)"; \ fi + @if [ "$(PAHOLE_VERSION)" != "$(CONFIG_PAHOLE_VERSION)" ]; then \ + echo >&2 "warning: pahole version differs from the one used to build the kernel"; \ + echo >&2 " The kernel was built with: $(CONFIG_PAHOLE_VERSION)"; \ + echo >&2 " You are using: $(PAHOLE_VERSION)"; \ + fi PHONY += help help: diff --git a/init/Kconfig b/init/Kconfig index fa79feb8fe57..317f3c0b13ad 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -171,7 +171,7 @@ config RUSTC_HAS_FILE_AS_C_STR config PAHOLE_VERSION int - default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE)) + default "$(PAHOLE_VERSION)" config CONSTRUCTORS bool From 903922cfa0e60573234ff895974c23a000035258 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:18:23 -0800 Subject: [PATCH 023/268] lib/Kconfig.debug: Set the minimum required pahole version to v1.22 Subsequent patches in the series change vmlinux linking scripts to unconditionally pass --btf_encode_detached to pahole, which was introduced in v1.22 [1][2]. This change allows to remove PAHOLE_HAS_SPLIT_BTF Kconfig option and other checks of older pahole versions. [1] https://github.com/acmel/dwarves/releases/tag/v1.22 [2] https://lore.kernel.org/bpf/cbafbf4e-9073-4383-8ee6-1353f9e5869c@oracle.com/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Acked-by: Nicolas Schier Link: https://lore.kernel.org/bpf/20251219181825.1289460-1-ihor.solodrai@linux.dev --- Documentation/process/changes.rst | 4 ++-- Documentation/scheduler/sched-ext.rst | 1 - lib/Kconfig.debug | 13 ++++--------- scripts/Makefile.btf | 9 +-------- tools/sched_ext/README.md | 1 - 5 files changed, 7 insertions(+), 21 deletions(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 62951cdb13ad..b7e329159d00 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -38,7 +38,7 @@ bash 4.2 bash --version binutils 2.30 ld -v flex 2.5.35 flex --version bison 2.0 bison --version -pahole 1.16 pahole --version +pahole 1.22 pahole --version util-linux 2.10o mount --version kmod 13 depmod -V e2fsprogs 1.41.4 e2fsck -V @@ -143,7 +143,7 @@ pahole Since Linux 5.2, if CONFIG_DEBUG_INFO_BTF is selected, the build system generates BTF (BPF Type Format) from DWARF in vmlinux, a bit later from kernel -modules as well. This requires pahole v1.16 or later. +modules as well. This requires pahole v1.22 or later. It is found in the 'dwarves' or 'pahole' distro packages or from https://fedorapeople.org/~acme/dwarves/. diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 404fe6126a76..9e2882d937b4 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -43,7 +43,6 @@ options should be enabled to use sched_ext: CONFIG_DEBUG_INFO_BTF=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y - CONFIG_PAHOLE_HAS_SPLIT_BTF=y CONFIG_PAHOLE_HAS_BTF_TAG=y sched_ext is used only when the BPF scheduler is loaded and running. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ba36939fda79..60281c4f9e99 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -388,18 +388,13 @@ config DEBUG_INFO_BTF depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST depends on BPF_SYSCALL - depends on PAHOLE_VERSION >= 116 - depends on DEBUG_INFO_DWARF4 || PAHOLE_VERSION >= 121 + depends on PAHOLE_VERSION >= 122 # pahole uses elfutils, which does not have support for Hexagon relocations depends on !HEXAGON help Generate deduplicated BTF type information from DWARF debug info. - Turning this on requires pahole v1.16 or later (v1.21 or later to - support DWARF 5), which will convert DWARF type info into equivalent - deduplicated BTF type info. - -config PAHOLE_HAS_SPLIT_BTF - def_bool PAHOLE_VERSION >= 119 + Turning this on requires pahole v1.22 or later, which will convert + DWARF type info into equivalent deduplicated BTF type info. config PAHOLE_HAS_BTF_TAG def_bool PAHOLE_VERSION >= 123 @@ -421,7 +416,7 @@ config PAHOLE_HAS_LANG_EXCLUDE config DEBUG_INFO_BTF_MODULES bool "Generate BTF type information for kernel modules" default y - depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF + depends on DEBUG_INFO_BTF && MODULES help Generate compact split BTF type information for kernel modules. diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf index db76335dd917..840a55de42da 100644 --- a/scripts/Makefile.btf +++ b/scripts/Makefile.btf @@ -7,14 +7,7 @@ JOBS := $(patsubst -j%,%,$(filter -j%,$(MAKEFLAGS))) ifeq ($(call test-le, $(pahole-ver), 125),y) -# pahole 1.18 through 1.21 can't handle zero-sized per-CPU vars -ifeq ($(call test-le, $(pahole-ver), 121),y) -pahole-flags-$(call test-ge, $(pahole-ver), 118) += --skip_encoding_btf_vars -endif - -pahole-flags-$(call test-ge, $(pahole-ver), 121) += --btf_gen_floats - -pahole-flags-$(call test-ge, $(pahole-ver), 122) += -j$(JOBS) +pahole-flags-y += --btf_gen_floats -j$(JOBS) pahole-flags-$(call test-ge, $(pahole-ver), 125) += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md index 16a42e4060f6..56a9d1557ac4 100644 --- a/tools/sched_ext/README.md +++ b/tools/sched_ext/README.md @@ -65,7 +65,6 @@ It's also recommended that you also include the following Kconfig options: ``` CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y -CONFIG_PAHOLE_HAS_SPLIT_BTF=y CONFIG_PAHOLE_HAS_BTF_TAG=y ``` From 014e1cdb5fad8c6034feb3a97468a91edf23d3d0 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:18:24 -0800 Subject: [PATCH 024/268] selftests/bpf: Run resolve_btfids only for relevant .test.o objects A selftest targeting resolve_btfids functionality relies on a resolved .BTF_ids section to be available in the TRUNNER_BINARY. The underlying BTF data is taken from a special BPF program (btf_data.c), and so resolve_btfids is executed as a part of a TRUNNER_BINARY build recipe on the final binary. Subsequent patches in this series allow resolve_btfids to modify BTF before resolving the symbols, which means that the test needs access to that modified BTF [1]. Currently the test simply reads in btf_data.bpf.o on the assumption that BTF hasn't changed. Implement resolve_btfids call only for particular test objects (just resolve_btfids.test.o for now). The test objects are linked into the TRUNNER_BINARY, and so .BTF_ids section will be available there. This will make it trivial for the resolve_btfids test to access BTF modified by resolve_btfids. [1] https://lore.kernel.org/bpf/CAErzpmvsgSDe-QcWH8SFFErL6y3p3zrqNri5-UHJ9iK2ChyiBw@mail.gmail.com/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181825.1289460-2-ihor.solodrai@linux.dev --- tools/testing/selftests/bpf/Makefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 4aa60e83ff19..ffd0a4c354c7 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -643,6 +643,9 @@ $(TRUNNER_TESTS_HDR): $(TRUNNER_TESTS_DIR)/*.c ) > $$@) endif +$(TRUNNER_OUTPUT)/resolve_btfids.test.o: $(RESOLVE_BTFIDS) $(TRUNNER_OUTPUT)/btf_data.bpf.o +$(TRUNNER_OUTPUT)/resolve_btfids.test.o: private TEST_NEEDS_BTFIDS = 1 + # compile individual test files # Note: we cd into output directory to ensure embedded BPF object is found $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ @@ -650,6 +653,9 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) + $$(if $$(TEST_NEEDS_BTFIDS), \ + $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ + $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ @@ -695,13 +701,11 @@ $(OUTPUT)/$(TRUNNER_BINARY): | $(TRUNNER_BPF_OBJS) $(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \ $(TRUNNER_LIB_OBJS) \ - $(RESOLVE_BTFIDS) \ $(TRUNNER_BPFTOOL) \ $(OUTPUT)/veristat \ | $(TRUNNER_BINARY)-extras $$(call msg,BINARY,,$$@) $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) $$(LLVM_LDLIBS) $$(LDFLAGS) $$(LLVM_LDFLAGS) -o $$@ - $(Q)$(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@ $(Q)ln -sf $(if $2,..,.)/tools/build/bpftool/$(USE_BOOTSTRAP)bpftool \ $(OUTPUT)/$(if $2,$2/)bpftool From 522397d05e7d4a7c30b91841492360336b24f833 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Fri, 19 Dec 2025 10:18:25 -0800 Subject: [PATCH 025/268] resolve_btfids: Change in-place update with raw binary output Currently resolve_btfids updates .BTF_ids section of an ELF file in-place, based on the contents of provided BTF, usually within the same input file, and optionally a BTF base. Change resolve_btfids behavior to enable BTF transformations as part of its main operation. To achieve this, in-place ELF write in resolve_btfids is replaced with generation of the following binaries: * ${1}.BTF with .BTF section data * ${1}.BTF_ids with .BTF_ids section data if it existed in ${1} * ${1}.BTF.base with .BTF.base section data for out-of-tree modules The execution of resolve_btfids and consumption of its output is orchestrated by scripts/gen-btf.sh introduced in this patch. The motivation for emitting binary data is that it allows simplifying resolve_btfids implementation by delegating ELF update to the $OBJCOPY tool [1], which is already widely used across the codebase. There are two distinct paths for BTF generation and resolve_btfids application in the kernel build: for vmlinux and for kernel modules. For the vmlinux binary a .BTF section is added in a roundabout way to ensure correct linking. The patch doesn't change this approach, only the implementation is a little different. Before this patch it worked as follows: * pahole consumed .tmp_vmlinux1 [2] and added .BTF section with llvm-objcopy [3] to it * then everything except the .BTF section was stripped from .tmp_vmlinux1 into a .tmp_vmlinux1.bpf.o object [2], later linked into vmlinux * resolve_btfids was executed later on vmlinux.unstripped [4], updating it in-place After this patch gen-btf.sh implements the following: * pahole consumes .tmp_vmlinux1 and produces a *detached* file with raw BTF data * resolve_btfids consumes .tmp_vmlinux1 and detached BTF to produce (potentially modified) .BTF, and .BTF_ids sections data * a .tmp_vmlinux1.bpf.o object is then produced with objcopy copying BTF output of resolve_btfids * .BTF_ids data gets embedded into vmlinux.unstripped in link-vmlinux.sh by objcopy --update-section For kernel modules, creating a special .bpf.o file is not necessary, and so embedding of sections data produced by resolve_btfids is straightforward with objcopy. With this patch an ELF file becomes effectively read-only within resolve_btfids, which allows deleting elf_update() call and satellite code (like compressed_section_fix [5]). Endianness handling of .BTF_ids data is also changed. Previously the "flags" part of the section was bswapped in sets_patch() [6], and then Elf_Type was modified before elf_update() to signal to libelf that bswap may be necessary. With this patch we explicitly bswap entire data buffer on load and on dump. [1] https://lore.kernel.org/bpf/131b4190-9c49-4f79-a99d-c00fac97fa44@linux.dev/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/link-vmlinux.sh?h=v6.18#n110 [3] https://git.kernel.org/pub/scm/devel/pahole/pahole.git/tree/btf_encoder.c?h=v1.31#n1803 [4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/scripts/link-vmlinux.sh?h=v6.18#n284 [5] https://lore.kernel.org/bpf/20200819092342.259004-1-jolsa@kernel.org/ [6] https://lore.kernel.org/bpf/cover.1707223196.git.vmalik@redhat.com/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20251219181825.1289460-3-ihor.solodrai@linux.dev --- MAINTAINERS | 1 + scripts/Makefile.btf | 12 +- scripts/Makefile.modfinal | 5 +- scripts/Makefile.vmlinux | 2 +- scripts/gen-btf.sh | 157 ++++++++++++ scripts/link-vmlinux.sh | 42 +--- tools/bpf/resolve_btfids/main.c | 224 +++++++++++------- tools/testing/selftests/bpf/.gitignore | 3 + tools/testing/selftests/bpf/Makefile | 9 +- .../selftests/bpf/prog_tests/resolve_btfids.c | 4 +- 10 files changed, 328 insertions(+), 131 deletions(-) create mode 100755 scripts/gen-btf.sh diff --git a/MAINTAINERS b/MAINTAINERS index 5b11839cba9d..cb1898a85b05 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4766,6 +4766,7 @@ F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ F: scripts/bpf_doc.py +F: scripts/gen-btf.sh F: scripts/Makefile.btf F: scripts/pahole-version.sh F: tools/bpf/ diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf index 840a55de42da..562a04b40e06 100644 --- a/scripts/Makefile.btf +++ b/scripts/Makefile.btf @@ -18,13 +18,15 @@ pahole-flags-$(call test-ge, $(pahole-ver), 126) = -j$(JOBS) --btf_features=enc pahole-flags-$(call test-ge, $(pahole-ver), 130) += --btf_features=attributes -ifneq ($(KBUILD_EXTMOD),) -module-pahole-flags-$(call test-ge, $(pahole-ver), 128) += --btf_features=distilled_base -endif - endif pahole-flags-$(CONFIG_PAHOLE_HAS_LANG_EXCLUDE) += --lang_exclude=rust export PAHOLE_FLAGS := $(pahole-flags-y) -export MODULE_PAHOLE_FLAGS := $(module-pahole-flags-y) + +resolve-btfids-flags-y := +resolve-btfids-flags-$(CONFIG_WERROR) += --fatal_warnings +resolve-btfids-flags-$(if $(KBUILD_EXTMOD),y) += --distill_base +resolve-btfids-flags-$(if $(KBUILD_VERBOSE),y) += --verbose + +export RESOLVE_BTFIDS_FLAGS := $(resolve-btfids-flags-y) diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index 149e12ff5700..422c56dc878e 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -42,9 +42,8 @@ quiet_cmd_btf_ko = BTF [M] $@ cmd_btf_ko = \ if [ ! -f $(objtree)/vmlinux ]; then \ printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \ - else \ - LLVM_OBJCOPY="$(OBJCOPY)" $(PAHOLE) -J $(PAHOLE_FLAGS) $(MODULE_PAHOLE_FLAGS) --btf_base $(objtree)/vmlinux $@; \ - $(RESOLVE_BTFIDS) -b $(objtree)/vmlinux $@; \ + else \ + $(srctree)/scripts/gen-btf.sh --btf_base $(objtree)/vmlinux $@; \ fi; # Same as newer-prereqs, but allows to exclude specified extra dependencies diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index cd788cac9d91..20a988f4fe0c 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -71,7 +71,7 @@ targets += vmlinux.unstripped .vmlinux.export.o vmlinux.unstripped: scripts/link-vmlinux.sh vmlinux.o .vmlinux.export.o $(KBUILD_LDS) FORCE +$(call if_changed_dep,link_vmlinux) ifdef CONFIG_DEBUG_INFO_BTF -vmlinux.unstripped: $(RESOLVE_BTFIDS) +vmlinux.unstripped: $(RESOLVE_BTFIDS) $(srctree)/scripts/gen-btf.sh endif ifdef CONFIG_BUILDTIME_TABLE_SORT diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh new file mode 100755 index 000000000000..06c6d8becaa2 --- /dev/null +++ b/scripts/gen-btf.sh @@ -0,0 +1,157 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2025 Meta Platforms, Inc. and affiliates. +# +# This script generates BTF data for the provided ELF file. +# +# Kernel BTF generation involves these conceptual steps: +# 1. pahole generates BTF from DWARF data +# 2. resolve_btfids applies kernel-specific btf2btf +# transformations and computes data for .BTF_ids section +# 3. the result gets linked/objcopied into the target binary +# +# How step (3) should be done differs between vmlinux, and +# kernel modules, which is the primary reason for the existence +# of this script. +# +# For modules the script expects vmlinux passed in as --btf_base. +# Generated .BTF, .BTF.base and .BTF_ids sections become embedded +# into the input ELF file with objcopy. +# +# For vmlinux the input file remains unchanged and two files are produced: +# - ${1}.btf.o ready for linking into vmlinux +# - ${1}.BTF_ids with .BTF_ids data blob +# This output is consumed by scripts/link-vmlinux.sh + +set -e + +usage() +{ + echo "Usage: $0 [--btf_base ] " + exit 1 +} + +BTF_BASE="" + +while [ $# -gt 0 ]; do + case "$1" in + --btf_base) + BTF_BASE="$2" + shift 2 + ;; + -*) + echo "Unknown option: $1" >&2 + usage + ;; + *) + break + ;; + esac +done + +if [ $# -ne 1 ]; then + usage +fi + +ELF_FILE="$1" +shift + +is_enabled() { + grep -q "^$1=y" ${objtree}/include/config/auto.conf +} + +info() +{ + printf " %-7s %s\n" "${1}" "${2}" +} + +case "${KBUILD_VERBOSE}" in +*1*) + set -x + ;; +esac + + +gen_btf_data() +{ + info BTF "${ELF_FILE}" + btf1="${ELF_FILE}.BTF.1" + ${PAHOLE} -J ${PAHOLE_FLAGS} \ + ${BTF_BASE:+--btf_base ${BTF_BASE}} \ + --btf_encode_detached=${btf1} \ + "${ELF_FILE}" + + info BTFIDS "${ELF_FILE}" + ${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_FLAGS} \ + ${BTF_BASE:+--btf_base ${BTF_BASE}} \ + --btf ${btf1} "${ELF_FILE}" +} + +gen_btf_o() +{ + local btf_data=${ELF_FILE}.btf.o + + # Create ${btf_data} which contains just .BTF section but no symbols. Add + # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all + # deletes all symbols including __start_BTF and __stop_BTF, which will + # be redefined in the linker script. + info OBJCOPY "${btf_data}" + echo "" | ${CC} ${CLANG_FLAGS} -c -x c -o ${btf_data} - + ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF \ + --set-section-flags .BTF=alloc,readonly ${btf_data} + ${OBJCOPY} --only-section=.BTF --strip-all ${btf_data} + + # Change e_type to ET_REL so that it can be used to link final vmlinux. + # GNU ld 2.35+ and lld do not allow an ET_EXEC input. + if is_enabled CONFIG_CPU_BIG_ENDIAN; then + et_rel='\0\1' + else + et_rel='\1\0' + fi + printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none +} + +embed_btf_data() +{ + info OBJCOPY "${ELF_FILE}.BTF" + ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF ${ELF_FILE} + + # a module might not have a .BTF_ids or .BTF.base section + local btf_base="${ELF_FILE}.BTF.base" + if [ -f "${btf_base}" ]; then + ${OBJCOPY} --add-section .BTF.base=${btf_base} ${ELF_FILE} + fi + local btf_ids="${ELF_FILE}.BTF_ids" + if [ -f "${btf_ids}" ]; then + ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${ELF_FILE} + fi +} + +cleanup() +{ + rm -f "${ELF_FILE}.BTF.1" + rm -f "${ELF_FILE}.BTF" + if [ "${BTFGEN_MODE}" = "module" ]; then + rm -f "${ELF_FILE}.BTF.base" + rm -f "${ELF_FILE}.BTF_ids" + fi +} +trap cleanup EXIT + +BTFGEN_MODE="vmlinux" +if [ -n "${BTF_BASE}" ]; then + BTFGEN_MODE="module" +fi + +gen_btf_data + +case "${BTFGEN_MODE}" in +vmlinux) + gen_btf_o + ;; +module) + embed_btf_data + ;; +esac + +exit 0 diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 4ab44c73da4d..e2207e612ac3 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -106,34 +106,6 @@ vmlinux_link() ${kallsymso} ${btf_vmlinux_bin_o} ${arch_vmlinux_o} ${ldlibs} } -# generate .BTF typeinfo from DWARF debuginfo -# ${1} - vmlinux image -gen_btf() -{ - local btf_data=${1}.btf.o - - info BTF "${btf_data}" - LLVM_OBJCOPY="${OBJCOPY}" ${PAHOLE} -J ${PAHOLE_FLAGS} ${1} - - # Create ${btf_data} which contains just .BTF section but no symbols. Add - # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all - # deletes all symbols including __start_BTF and __stop_BTF, which will - # be redefined in the linker script. Add 2>/dev/null to suppress GNU - # objcopy warnings: "empty loadable segment detected at ..." - ${OBJCOPY} --only-section=.BTF --set-section-flags .BTF=alloc,readonly \ - --strip-all ${1} "${btf_data}" 2>/dev/null - # Change e_type to ET_REL so that it can be used to link final vmlinux. - # GNU ld 2.35+ and lld do not allow an ET_EXEC input. - if is_enabled CONFIG_CPU_BIG_ENDIAN; then - et_rel='\0\1' - else - et_rel='\1\0' - fi - printf "${et_rel}" | dd of="${btf_data}" conv=notrunc bs=1 seek=16 status=none - - btf_vmlinux_bin_o=${btf_data} -} - # Create ${2}.o file with all symbols from the ${1} object file kallsyms() { @@ -205,6 +177,7 @@ if is_enabled CONFIG_ARCH_WANTS_PRE_LINK_VMLINUX; then fi btf_vmlinux_bin_o= +btfids_vmlinux= kallsymso= strip_debug= generate_map= @@ -232,11 +205,13 @@ if is_enabled CONFIG_KALLSYMS || is_enabled CONFIG_DEBUG_INFO_BTF; then fi if is_enabled CONFIG_DEBUG_INFO_BTF; then - if ! gen_btf .tmp_vmlinux1; then + if ! ${srctree}/scripts/gen-btf.sh .tmp_vmlinux1; then echo >&2 "Failed to generate BTF for vmlinux" echo >&2 "Try to disable CONFIG_DEBUG_INFO_BTF" exit 1 fi + btf_vmlinux_bin_o=.tmp_vmlinux1.btf.o + btfids_vmlinux=.tmp_vmlinux1.BTF_ids fi if is_enabled CONFIG_KALLSYMS; then @@ -289,14 +264,9 @@ fi vmlinux_link "${VMLINUX}" -# fill in BTF IDs if is_enabled CONFIG_DEBUG_INFO_BTF; then - info BTFIDS "${VMLINUX}" - RESOLVE_BTFIDS_ARGS="" - if is_enabled CONFIG_WERROR; then - RESOLVE_BTFIDS_ARGS=" --fatal_warnings " - fi - ${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_ARGS} "${VMLINUX}" + info OBJCOPY ${btfids_vmlinux} + ${OBJCOPY} --update-section .BTF_ids=${btfids_vmlinux} ${VMLINUX} fi mksysmap "${VMLINUX}" System.map diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index e721e20a2bbd..2cbc252259be 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -71,9 +71,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -124,6 +126,7 @@ struct object { struct btf *btf; struct btf *base_btf; + bool distill_base; struct { int fd; @@ -324,42 +327,16 @@ static struct btf_id *add_symbol(struct rb_root *root, char *name, size_t size) return btf_id__add(root, id, BTF_ID_KIND_SYM); } -/* Older libelf.h and glibc elf.h might not yet define the ELF compression types. */ -#ifndef SHF_COMPRESSED -#define SHF_COMPRESSED (1 << 11) /* Section with compressed data. */ -#endif - -/* - * The data of compressed section should be aligned to 4 - * (for 32bit) or 8 (for 64 bit) bytes. The binutils ld - * sets sh_addralign to 1, which makes libelf fail with - * misaligned section error during the update: - * FAILED elf_update(WRITE): invalid section alignment - * - * While waiting for ld fix, we fix the compressed sections - * sh_addralign value manualy. - */ -static int compressed_section_fix(Elf *elf, Elf_Scn *scn, GElf_Shdr *sh) +static void bswap_32_data(void *data, u32 nr_bytes) { - int expected = gelf_getclass(elf) == ELFCLASS32 ? 4 : 8; + u32 cnt, i; + u32 *ptr; - if (!(sh->sh_flags & SHF_COMPRESSED)) - return 0; + cnt = nr_bytes / sizeof(u32); + ptr = data; - if (sh->sh_addralign == expected) - return 0; - - pr_debug2(" - fixing wrong alignment sh_addralign %u, expected %u\n", - sh->sh_addralign, expected); - - sh->sh_addralign = expected; - - if (gelf_update_shdr(scn, sh) == 0) { - pr_err("FAILED cannot update section header: %s\n", - elf_errmsg(-1)); - return -1; - } - return 0; + for (i = 0; i < cnt; i++) + ptr[i] = bswap_32(ptr[i]); } static int elf_collect(struct object *obj) @@ -380,7 +357,7 @@ static int elf_collect(struct object *obj) elf_version(EV_CURRENT); - elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); + elf = elf_begin(fd, ELF_C_READ_MMAP_PRIVATE, NULL); if (!elf) { close(fd); pr_err("FAILED cannot create ELF descriptor: %s\n", @@ -443,21 +420,20 @@ static int elf_collect(struct object *obj) obj->efile.symbols_shndx = idx; obj->efile.strtabidx = sh.sh_link; } else if (!strcmp(name, BTF_IDS_SECTION)) { + /* + * If target endianness differs from host, we need to bswap32 + * the .BTF_ids section data on load, because .BTF_ids has + * Elf_Type = ELF_T_BYTE, and so libelf returns data buffer in + * the target endianness. We repeat this on dump. + */ + if (obj->efile.encoding != ELFDATANATIVE) { + pr_debug("bswap_32 .BTF_ids data from target to host endianness\n"); + bswap_32_data(data->d_buf, data->d_size); + } obj->efile.idlist = data; obj->efile.idlist_shndx = idx; obj->efile.idlist_addr = sh.sh_addr; - } else if (!strcmp(name, BTF_BASE_ELF_SEC)) { - /* If a .BTF.base section is found, do not resolve - * BTF ids relative to vmlinux; resolve relative - * to the .BTF.base section instead. btf__parse_split() - * will take care of this once the base BTF it is - * passed is NULL. - */ - obj->base_btf_path = NULL; } - - if (compressed_section_fix(elf, scn, &sh)) - return -1; } return 0; @@ -587,11 +563,26 @@ static int load_btf(struct object *obj) obj->base_btf = base_btf; obj->btf = btf; + if (obj->base_btf && obj->distill_base) { + err = btf__distill_base(obj->btf, &base_btf, &btf); + if (err) { + pr_err("FAILED to distill base BTF: %s\n", strerror(errno)); + goto out_err; + } + + btf__free(obj->base_btf); + btf__free(obj->btf); + obj->base_btf = base_btf; + obj->btf = btf; + } + return 0; out_err: btf__free(base_btf); btf__free(btf); + obj->base_btf = NULL; + obj->btf = NULL; return err; } @@ -760,24 +751,6 @@ static int sets_patch(struct object *obj) */ BUILD_BUG_ON((u32 *)set8->pairs != &set8->pairs[0].id); qsort(set8->pairs, set8->cnt, sizeof(set8->pairs[0]), cmp_id); - - /* - * When ELF endianness does not match endianness of the - * host, libelf will do the translation when updating - * the ELF. This, however, corrupts SET8 flags which are - * already in the target endianness. So, let's bswap - * them to the host endianness and libelf will then - * correctly translate everything. - */ - if (obj->efile.encoding != ELFDATANATIVE) { - int i; - - set8->flags = bswap_32(set8->flags); - for (i = 0; i < set8->cnt; i++) { - set8->pairs[i].flags = - bswap_32(set8->pairs[i].flags); - } - } break; default: pr_err("Unexpected btf_id_kind %d for set '%s'\n", id->kind, id->name); @@ -793,8 +766,6 @@ static int sets_patch(struct object *obj) static int symbols_patch(struct object *obj) { - off_t err; - if (__symbols_patch(obj, &obj->structs) || __symbols_patch(obj, &obj->unions) || __symbols_patch(obj, &obj->typedefs) || @@ -805,20 +776,90 @@ static int symbols_patch(struct object *obj) if (sets_patch(obj)) return -1; - /* Set type to ensure endian translation occurs. */ - obj->efile.idlist->d_type = ELF_T_WORD; + return 0; +} - elf_flagdata(obj->efile.idlist, ELF_C_SET, ELF_F_DIRTY); +static int dump_raw_data(const char *out_path, const void *data, u32 size) +{ + size_t written; + FILE *file; - err = elf_update(obj->efile.elf, ELF_C_WRITE); - if (err < 0) { - pr_err("FAILED elf_update(WRITE): %s\n", - elf_errmsg(-1)); + file = fopen(out_path, "wb"); + if (!file) { + pr_err("Couldn't open %s for writing\n", out_path); + return -1; } - pr_debug("update %s for %s\n", - err >= 0 ? "ok" : "failed", obj->path); - return err < 0 ? -1 : 0; + written = fwrite(data, 1, size, file); + if (written != size) { + pr_err("Failed to write data to %s\n", out_path); + fclose(file); + unlink(out_path); + return -1; + } + + fclose(file); + pr_debug("Dumped %lu bytes of data to %s\n", size, out_path); + + return 0; +} + +static int dump_raw_btf_ids(struct object *obj, const char *out_path) +{ + Elf_Data *data = obj->efile.idlist; + int err; + + if (!data || !data->d_buf) { + pr_debug("%s has no BTF_ids data to dump\n", obj->path); + return 0; + } + + /* + * If target endianness differs from host, we need to bswap32 the + * .BTF_ids section data before dumping so that the output is in + * target endianness. + */ + if (obj->efile.encoding != ELFDATANATIVE) { + pr_debug("bswap_32 .BTF_ids data from host to target endianness\n"); + bswap_32_data(data->d_buf, data->d_size); + } + + err = dump_raw_data(out_path, data->d_buf, data->d_size); + if (err) + return -1; + + return 0; +} + +static int dump_raw_btf(struct btf *btf, const char *out_path) +{ + const void *raw_btf_data; + u32 raw_btf_size; + int err; + + raw_btf_data = btf__raw_data(btf, &raw_btf_size); + if (!raw_btf_data) { + pr_err("btf__raw_data() failed\n"); + return -1; + } + + err = dump_raw_data(out_path, raw_btf_data, raw_btf_size); + if (err) + return -1; + + return 0; +} + +static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix) +{ + int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix); + + if (len < 0 || len >= buf_sz) { + pr_err("Output path is too long: %s%s\n", in_path, suffix); + return -E2BIG; + } + + return 0; } static const char * const resolve_btfids_usage[] = { @@ -840,6 +881,8 @@ int main(int argc, const char **argv) .sets = RB_ROOT, }; bool fatal_warnings = false; + char out_path[PATH_MAX]; + struct option btfid_options[] = { OPT_INCR('v', "verbose", &verbose, "be more verbose (show errors, etc)"), @@ -849,6 +892,8 @@ int main(int argc, const char **argv) "path of file providing base BTF"), OPT_BOOLEAN(0, "fatal_warnings", &fatal_warnings, "turn warnings into errors"), + OPT_BOOLEAN(0, "distill_base", &obj.distill_base, + "distill --btf_base and emit .BTF.base section data"), OPT_END() }; int err = -1; @@ -860,6 +905,9 @@ int main(int argc, const char **argv) obj.path = argv[0]; + if (load_btf(&obj)) + goto out; + if (elf_collect(&obj)) goto out; @@ -869,23 +917,37 @@ int main(int argc, const char **argv) */ if (obj.efile.idlist_shndx == -1 || obj.efile.symbols_shndx == -1) { - pr_debug("Cannot find .BTF_ids or symbols sections, nothing to do\n"); - err = 0; - goto out; + pr_debug("Cannot find .BTF_ids or symbols sections, skip symbols resolution\n"); + goto dump_btf; } if (symbols_collect(&obj)) goto out; - if (load_btf(&obj)) - goto out; - if (symbols_resolve(&obj)) goto out; if (symbols_patch(&obj)) goto out; + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_IDS_SECTION); + err = err ?: dump_raw_btf_ids(&obj, out_path); + if (err) + goto out; + +dump_btf: + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_ELF_SEC); + err = err ?: dump_raw_btf(obj.btf, out_path); + if (err) + goto out; + + if (obj.base_btf && obj.distill_base) { + err = make_out_path(out_path, sizeof(out_path), obj.path, BTF_BASE_ELF_SEC); + err = err ?: dump_raw_btf(obj.base_btf, out_path); + if (err) + goto out; + } + if (!(fatal_warnings && warnings)) err = 0; out: diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 19c1638e312a..b8bf51b7a0b0 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -45,3 +45,6 @@ xdp_synproxy xdp_hw_metadata xdp_features verification_cert.h +*.BTF +*.BTF_ids +*.BTF.base diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index ffd0a4c354c7..f28a32b16ff0 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -4,6 +4,7 @@ include ../../../scripts/Makefile.arch include ../../../scripts/Makefile.include CXX ?= $(CROSS_COMPILE)g++ +OBJCOPY ?= $(CROSS_COMPILE)objcopy CURDIR := $(abspath .) TOOLSDIR := $(abspath ../../..) @@ -653,9 +654,10 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ | $(TRUNNER_OUTPUT)/%.test.d $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -MMD -MT $$@ -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) - $$(if $$(TEST_NEEDS_BTFIDS), \ - $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ - $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@) + $$(if $$(TEST_NEEDS_BTFIDS), \ + $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ + $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@; \ + $(OBJCOPY) --update-section .BTF_ids=$$@.BTF_ids $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ @@ -894,6 +896,7 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ feature bpftool $(TEST_KMOD_TARGETS) \ $(addprefix $(OUTPUT)/,*.o *.d *.skel.h *.lskel.h *.subskel.h \ + *.BTF *.BTF_ids *.BTF.base \ no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ $(OUTPUT)/FEATURE-DUMP.selftests \ diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c index 51544372f52e..41dfaaabb73f 100644 --- a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c +++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c @@ -101,9 +101,9 @@ static int resolve_symbols(void) int type_id; __u32 nr; - btf = btf__parse_elf("btf_data.bpf.o", NULL); + btf = btf__parse_raw("resolve_btfids.test.o.BTF"); if (CHECK(libbpf_get_error(btf), "resolve", - "Failed to load BTF from btf_data.bpf.o\n")) + "Failed to load BTF from resolve_btfids.test.o.BTF\n")) return -1; nr = btf__type_cnt(btf); From 93f0d09697613beba922a387d21a09a41eeefef5 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 19 Dec 2025 10:44:17 -0800 Subject: [PATCH 026/268] bpf: move recursion detection logic to helpers BPF programs detect recursion by doing atomic inc/dec on a per-cpu active counter from the trampoline. Create two helpers for operations on this active counter, this makes it easy to changes the recursion detection logic in future. This commit makes no functional changes. Acked-by: Yonghong Song Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251219184422.2899902-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++++++ kernel/bpf/trampoline.c | 8 ++++---- kernel/trace/bpf_trace.c | 4 ++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bb3847caeae1..2da986136d26 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2004,6 +2004,16 @@ struct bpf_struct_ops_common_value { enum bpf_struct_ops_state state; }; +static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog) +{ + return this_cpu_inc_return(*(prog->active)) == 1; +} + +static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) +{ + this_cpu_dec(*(prog->active)); +} + #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) /* This macro helps developer to register a struct_ops type and generate * type information correctly. Developers should use this macro to register diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 976d89011b15..2a125d063e62 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -949,7 +949,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); @@ -993,7 +993,7 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); rcu_read_unlock_migrate(); } @@ -1029,7 +1029,7 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); if (prog->aux->recursion_detected) prog->aux->recursion_detected(prog); @@ -1044,7 +1044,7 @@ void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); migrate_enable(); rcu_read_unlock_trace(); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fe28d86f7c35..6e076485bf70 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2063,7 +2063,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) struct bpf_trace_run_ctx run_ctx; cant_sleep(); - if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; } @@ -2077,7 +2077,7 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) bpf_reset_run_ctx(old_run_ctx); out: - this_cpu_dec(*(prog->active)); + bpf_prog_put_recursion_context(prog); } #define UNPACK(...) __VA_ARGS__ From c3e34f88f9992866a1fb510850921a8fe299a97b Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 19 Dec 2025 10:44:18 -0800 Subject: [PATCH 027/268] bpf: arm64: Optimize recursion detection by not using atomics BPF programs detect recursion using a per-CPU 'active' flag in struct bpf_prog. The trampoline currently sets/clears this flag with atomic operations. On some arm64 platforms (e.g., Neoverse V2 with LSE), per-CPU atomic operations are relatively slow. Unlike x86_64 - where per-CPU updates can avoid cross-core atomicity, arm64 LSE atomics are always atomic across all cores, which is unnecessary overhead for strictly per-CPU state. This patch removes atomics from the recursion detection path on arm64 by changing 'active' to a per-CPU array of four u8 counters, one per context: {NMI, hard-irq, soft-irq, normal}. The running context uses a non-atomic increment/decrement on its element. After increment, recursion is detected by reading the array as a u32 and verifying that only the expected element changed; any change in another element indicates inter-context recursion, and a value > 1 in the same element indicates same-context recursion. For example, starting from {0,0,0,0}, a normal-context trigger changes the array to {0,0,0,1}. If an NMI arrives on the same CPU and triggers the program, the array becomes {1,0,0,1}. When the NMI context checks the u32 against the expected mask for normal (0x00000001), it observes 0x01000001 and correctly reports recursion. Same-context recursion is detected analogously. Acked-by: Yonghong Song Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251219184422.2899902-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 32 +++++++++++++++++++++++++++++--- kernel/bpf/core.c | 3 ++- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2da986136d26..da6a00dd313f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1746,6 +1746,8 @@ struct bpf_prog_aux { struct bpf_map __rcu *st_ops_assoc; }; +#define BPF_NR_CONTEXTS 4 /* normal, softirq, hardirq, NMI */ + struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ @@ -1772,7 +1774,7 @@ struct bpf_prog { u8 tag[BPF_TAG_SIZE]; }; struct bpf_prog_stats __percpu *stats; - int __percpu *active; + u8 __percpu *active; /* u8[BPF_NR_CONTEXTS] for recursion protection */ unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ @@ -2006,12 +2008,36 @@ struct bpf_struct_ops_common_value { static inline bool bpf_prog_get_recursion_context(struct bpf_prog *prog) { - return this_cpu_inc_return(*(prog->active)) == 1; +#ifdef CONFIG_ARM64 + u8 rctx = interrupt_context_level(); + u8 *active = this_cpu_ptr(prog->active); + u32 val; + + preempt_disable(); + active[rctx]++; + val = le32_to_cpu(*(__le32 *)active); + preempt_enable(); + if (val != BIT(rctx * 8)) + return false; + + return true; +#else + return this_cpu_inc_return(*(int __percpu *)(prog->active)) == 1; +#endif } static inline void bpf_prog_put_recursion_context(struct bpf_prog *prog) { - this_cpu_dec(*(prog->active)); +#ifdef CONFIG_ARM64 + u8 rctx = interrupt_context_level(); + u8 *active = this_cpu_ptr(prog->active); + + preempt_disable(); + active[rctx]--; + preempt_enable(); +#else + this_cpu_dec(*(int __percpu *)(prog->active)); +#endif } #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c66316e32563..e0b8a8a5aaa9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -112,7 +112,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } - fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); + fp->active = __alloc_percpu_gfp(sizeof(u8[BPF_NR_CONTEXTS]), 4, + bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); From e09f6be4a3558e01afb4d16705ce57006a6f9712 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 8 Dec 2025 17:33:34 +0100 Subject: [PATCH 028/268] x86/bpf: Avoid emitting LOCK prefix for XCHG atomic ops The x86 XCHG instruction is implicitly locked when one of the operands is a memory location, making an explicit LOCK prefix unnecessary. Stop emitting the LOCK prefix for BPF_XCHG in the JIT atomic read-modify-write helpers. This avoids redundant instruction prefixes while preserving correct atomic semantics. No functional change for other atomic operations. Signed-off-by: Uros Bizjak Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20251208163420.7643-1-ubizjak@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b0bac2a66eff..e3b1c4b1d550 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1305,7 +1305,8 @@ static int emit_atomic_rmw(u8 **pprog, u32 atomic_op, { u8 *prog = *pprog; - EMIT1(0xF0); /* lock prefix */ + if (atomic_op != BPF_XCHG) + EMIT1(0xF0); /* lock prefix */ maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW); @@ -1347,7 +1348,9 @@ static int emit_atomic_rmw_index(u8 **pprog, u32 atomic_op, u32 size, { u8 *prog = *pprog; - EMIT1(0xF0); /* lock prefix */ + if (atomic_op != BPF_XCHG) + EMIT1(0xF0); /* lock prefix */ + switch (size) { case BPF_W: EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg)); From 94e948b7e684c0465bb3faca8fafee5caf421b84 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 16 Dec 2025 13:29:59 +0000 Subject: [PATCH 029/268] bpf: annotate file argument as __nullable in bpf_lsm_mmap_file As reported in [0], anonymous memory mappings are not backed by a struct file instance. Consequently, the struct file pointer passed to the security_mmap_file() LSM hook is NULL in such cases. The BPF verifier is currently unaware of this, allowing BPF LSM programs to dereference this struct file pointer without needing to perform an explicit NULL check. This leads to potential NULL pointer dereference and a kernel crash. Add a strong override for bpf_lsm_mmap_file() which annotates the struct file pointer parameter with the __nullable suffix. This explicitly informs the BPF verifier that this pointer (PTR_MAYBE_NULL) can be NULL, forcing BPF LSM programs to perform a check on it before dereferencing it. [0] https://lore.kernel.org/bpf/5e460d3c.4c3e9.19adde547d8.Coremail.kaiyanm@hust.edu.cn/ Reported-by: Kaiyan Mei Reported-by: Yinhao Hu Reviewed-by: Dongliang Mu Closes: https://lore.kernel.org/bpf/5e460d3c.4c3e9.19adde547d8.Coremail.kaiyanm@hust.edu.cn/ Signed-off-by: Matt Bobrowski Acked-by: Song Liu Link: https://lore.kernel.org/r/20251216133000.3690723-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 1 + kernel/bpf/Makefile | 12 +++++++++++- kernel/bpf/bpf_lsm.c | 5 +++-- kernel/bpf/bpf_lsm_proto.c | 19 +++++++++++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 kernel/bpf/bpf_lsm_proto.c diff --git a/MAINTAINERS b/MAINTAINERS index cb1898a85b05..e7027fba97db 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4848,6 +4848,7 @@ S: Maintained F: Documentation/bpf/prog_lsm.rst F: include/linux/bpf_lsm.h F: kernel/bpf/bpf_lsm.c +F: kernel/bpf/bpf_lsm_proto.c F: kernel/trace/bpf_trace.c F: security/bpf/ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 232cbc97434d..79cf22860a99 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -42,7 +42,17 @@ endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-$(CONFIG_BPF_SYSCALL) += cpumask.o -obj-${CONFIG_BPF_LSM} += bpf_lsm.o +# bpf_lsm_proto.o must precede bpf_lsm.o. The current pahole logic +# deduplicates function prototypes within +# btf_encoder__add_saved_func() by keeping the first instance seen. We +# need the function prototype(s) in bpf_lsm_proto.o to take precedence +# over those within bpf_lsm.o. Having bpf_lsm_proto.o precede +# bpf_lsm.o ensures its DWARF CU is processed early, forcing the +# generated BTF to contain the overrides. +# +# Notably, this is a temporary workaround whilst the deduplication +# semantics within pahole are revisited accordingly. +obj-${CONFIG_BPF_LSM} += bpf_lsm_proto.o bpf_lsm.o endif ifneq ($(CONFIG_CRYPTO),) obj-$(CONFIG_BPF_SYSCALL) += crypto.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 7cb6e8d4282c..0c4a0c8e6f70 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -18,10 +18,11 @@ #include /* For every LSM hook that allows attachment of BPF programs, declare a nop - * function where a BPF program can be attached. + * function where a BPF program can be attached. Notably, we qualify each with + * weak linkage such that strong overrides can be implemented if need be. */ #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ -noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ +__weak noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ { \ return DEFAULT; \ } diff --git a/kernel/bpf/bpf_lsm_proto.c b/kernel/bpf/bpf_lsm_proto.c new file mode 100644 index 000000000000..44a54fd8045e --- /dev/null +++ b/kernel/bpf/bpf_lsm_proto.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2025 Google LLC. + */ + +#include +#include + +/* + * Strong definition of the mmap_file() BPF LSM hook. The __nullable suffix on + * the struct file pointer parameter name marks it as PTR_MAYBE_NULL. This + * explicitly enforces that BPF LSM programs check for NULL before attempting to + * dereference it. + */ +int bpf_lsm_mmap_file(struct file *file__nullable, unsigned long reqprot, + unsigned long prot, unsigned long flags) +{ + return 0; +} From d2749ae85aec685e52e0474f445f6a8552363eb0 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 16 Dec 2025 13:30:00 +0000 Subject: [PATCH 030/268] selftests/bpf: add test case for BPF LSM hook bpf_lsm_mmap_file Add a trivial test case asserting that the BPF verifier enforces PTR_MAYBE_NULL semantics on the struct file pointer argument of BPF LSM hook bpf_lsm_mmap_file(). Dereferencing the struct file pointer passed into bpf_lsm_mmap_file() without explicitly performing a NULL check first should not be permitted by the BPF verifier as it can lead to NULL pointer dereferences and a kernel crash. Signed-off-by: Matt Bobrowski Acked-by: Song Liu Link: https://lore.kernel.org/r/20251216133000.3690723-2-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_lsm.c | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_lsm.c b/tools/testing/selftests/bpf/progs/verifier_lsm.c index 6af9100a37ff..38e8e9176862 100644 --- a/tools/testing/selftests/bpf/progs/verifier_lsm.c +++ b/tools/testing/selftests/bpf/progs/verifier_lsm.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include +#include #include "bpf_misc.h" SEC("lsm/file_permission") @@ -159,4 +160,32 @@ __naked int disabled_hook_test3(void *ctx) ::: __clobber_all); } +SEC("lsm/mmap_file") +__description("not null checking nullable pointer in bpf_lsm_mmap_file") +__failure __msg("R1 invalid mem access 'trusted_ptr_or_null_'") +int BPF_PROG(no_null_check, struct file *file) +{ + struct inode *inode; + + inode = file->f_inode; + __sink(inode); + + return 0; +} + +SEC("lsm/mmap_file") +__description("null checking nullable pointer in bpf_lsm_mmap_file") +__success +int BPF_PROG(null_check, struct file *file) +{ + struct inode *inode; + + if (file) { + inode = file->f_inode; + __sink(inode); + } + + return 0; +} + char _license[] SEC("license") = "GPL"; From f785a31395d9cafb8b2c42c7358fad72a6463142 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 19 Dec 2025 11:13:08 -0800 Subject: [PATCH 031/268] bpf: arm64: Fix sparse warnings ctx->image is declared as __le32 because arm64 instructions are LE regardless of CPU's runtime endianness. emit_u32_data() emits raw data and not instructions so cast the value to __le32 to fix the sparse warning. Cast function pointer to void * before doing arithmetic. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251219191310.3204425-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index b6eb7a465ad2..0c4d44bcfbf4 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -118,7 +118,7 @@ static inline void emit(const u32 insn, struct jit_ctx *ctx) static inline void emit_u32_data(const u32 data, struct jit_ctx *ctx) { if (ctx->image != NULL && ctx->write) - ctx->image[ctx->idx] = data; + ctx->image[ctx->idx] = (__force __le32)data; ctx->idx++; } @@ -3139,7 +3139,7 @@ void bpf_jit_free(struct bpf_prog *prog) bpf_jit_binary_pack_finalize(jit_data->ro_header, jit_data->header); kfree(jit_data); } - prog->bpf_func -= cfi_get_offset(); + prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset(); hdr = bpf_jit_binary_pack_hdr(prog); bpf_jit_binary_pack_free(hdr, NULL); priv_stack_ptr = prog->aux->priv_stack_ptr; From 4221de8c410e7adb1114a4374c78fa4269175343 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 22 Dec 2025 20:41:51 -0800 Subject: [PATCH 032/268] mm: declare memcg_page_state_output() in memcontrol.h To use memcg_page_state_output() in bpf_memcontrol.c move the declaration from v1-specific memcontrol-v1.h to memcontrol.h. Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Shakeel Butt Link: https://lore.kernel.org/r/20251223044156.208250-2-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol-v1.h | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0651865a4564..7bef427d5a82 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -950,6 +950,7 @@ static inline void mod_memcg_page_state(struct page *page, } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); +unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); @@ -1373,6 +1374,11 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) return 0; } +static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item) +{ + return 0; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 6358464bb416..a304ad418cdf 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -27,7 +27,6 @@ unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); -unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); int memory_stat_show(struct seq_file *m, void *v); void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n); From 5904db9891f80f84283648121e2d8c8a506296a8 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 22 Dec 2025 20:41:52 -0800 Subject: [PATCH 033/268] mm: introduce BPF kfuncs to deal with memcg pointers To effectively operate with memory cgroups in BPF there is a need to convert css pointers to memcg pointers. A simple container_of cast which is used in the kernel code can't be used in BPF because from the verifier's point of view that's a out-of-bounds memory access. Introduce helper get/put kfuncs which can be used to get a refcounted memcg pointer from the css pointer: - bpf_get_mem_cgroup, - bpf_put_mem_cgroup. bpf_get_mem_cgroup() can take both memcg's css and the corresponding cgroup's "self" css. It allows it to be used with the existing cgroup iterator which iterates over cgroup tree, not memcg tree. Signed-off-by: Roman Gushchin Link: https://lore.kernel.org/r/20251223044156.208250-3-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- mm/Makefile | 3 ++ mm/bpf_memcontrol.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 mm/bpf_memcontrol.c diff --git a/mm/Makefile b/mm/Makefile index 2d0570a16e5b..bf46fe31dc14 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -106,6 +106,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o ifdef CONFIG_SWAP obj-$(CONFIG_MEMCG) += swap_cgroup.o endif +ifdef CONFIG_BPF_SYSCALL +obj-$(CONFIG_MEMCG) += bpf_memcontrol.o +endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c new file mode 100644 index 000000000000..82eb95de77b7 --- /dev/null +++ b/mm/bpf_memcontrol.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Memory Controller-related BPF kfuncs and auxiliary code + * + * Author: Roman Gushchin + */ + +#include +#include + +__bpf_kfunc_start_defs(); + +/** + * bpf_get_mem_cgroup - Get a reference to a memory cgroup + * @css: pointer to the css structure + * + * It's fine to pass a css which belongs to any cgroup controller, + * e.g. unified hierarchy's main css. + * + * Implements KF_ACQUIRE semantics. + * + * Return: A pointer to a mem_cgroup structure after bumping + * the corresponding css's reference counter. + */ +__bpf_kfunc struct mem_cgroup * +bpf_get_mem_cgroup(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = NULL; + bool rcu_unlock = false; + + if (mem_cgroup_disabled() || !root_mem_cgroup) + return NULL; + + if (root_mem_cgroup->css.ss != css->ss) { + struct cgroup *cgroup = css->cgroup; + int ssid = root_mem_cgroup->css.ss->id; + + rcu_read_lock(); + rcu_unlock = true; + css = rcu_dereference_raw(cgroup->subsys[ssid]); + } + + if (css && css_tryget(css)) + memcg = container_of(css, struct mem_cgroup, css); + + if (rcu_unlock) + rcu_read_unlock(); + + return memcg; +} + +/** + * bpf_put_mem_cgroup - Put a reference to a memory cgroup + * @memcg: memory cgroup to release + * + * Releases a previously acquired memcg reference. + * Implements KF_RELEASE semantics. + */ +__bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg) +{ + css_put(&memcg->css); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_memcontrol_kfuncs) +BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) +BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) + +BTF_KFUNCS_END(bpf_memcontrol_kfuncs) + +static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_memcontrol_kfuncs, +}; + +static int __init bpf_memcontrol_init(void) +{ + int err; + + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, + &bpf_memcontrol_kfunc_set); + if (err) + pr_warn("error while registering bpf memcontrol kfuncs: %d", err); + + return err; +} +late_initcall(bpf_memcontrol_init); From 5c7db3239c9fbe3c62cb0d89b64959ea23af2de9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 22 Dec 2025 20:41:53 -0800 Subject: [PATCH 034/268] mm: introduce bpf_get_root_mem_cgroup() BPF kfunc Introduce a BPF kfunc to get a trusted pointer to the root memory cgroup. It's very handy to traverse the full memcg tree, e.g. for handling a system-wide OOM. It's possible to obtain this pointer by traversing the memcg tree up from any known memcg, but it's sub-optimal and makes BPF programs more complex and less efficient. bpf_get_root_mem_cgroup() has a KF_ACQUIRE | KF_RET_NULL semantics, however in reality it's not necessary to bump the corresponding reference counter - root memory cgroup is immortal, reference counting is skipped, see css_get(). Once set, root_mem_cgroup is always a valid memcg pointer. It's safe to call bpf_put_mem_cgroup() for the pointer obtained with bpf_get_root_mem_cgroup(), it's effectively a no-op. Signed-off-by: Roman Gushchin Link: https://lore.kernel.org/r/20251223044156.208250-4-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- mm/bpf_memcontrol.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 82eb95de77b7..187919eb2fe2 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -10,6 +10,25 @@ __bpf_kfunc_start_defs(); +/** + * bpf_get_root_mem_cgroup - Returns a pointer to the root memory cgroup + * + * The function has KF_ACQUIRE semantics, even though the root memory + * cgroup is never destroyed after being created and doesn't require + * reference counting. And it's perfectly safe to pass it to + * bpf_put_mem_cgroup() + * + * Return: A pointer to the root memory cgroup. + */ +__bpf_kfunc struct mem_cgroup *bpf_get_root_mem_cgroup(void) +{ + if (mem_cgroup_disabled()) + return NULL; + + /* css_get() is not needed */ + return root_mem_cgroup; +} + /** * bpf_get_mem_cgroup - Get a reference to a memory cgroup * @css: pointer to the css structure @@ -64,6 +83,7 @@ __bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_memcontrol_kfuncs) +BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) From 99430ab8b804c26b8a0dec93fcbfe75469f3edc7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 22 Dec 2025 20:41:54 -0800 Subject: [PATCH 035/268] mm: introduce BPF kfuncs to access memcg statistics and events Introduce BPF kfuncs to conveniently access memcg data: - bpf_mem_cgroup_vm_events(), - bpf_mem_cgroup_memory_events(), - bpf_mem_cgroup_usage(), - bpf_mem_cgroup_page_state(), - bpf_mem_cgroup_flush_stats(). These functions are useful for implementing BPF OOM policies, but also can be used to accelerate access to the memcg data. Reading it through cgroupfs is much more expensive, roughly 5x, mostly because of the need to convert the data into the text and back. JP Kobryn: An experiment was setup to compare the performance of a program that uses the traditional method of reading memory.stat vs a program using the new kfuncs. The control program opens up the root memory.stat file and for 1M iterations reads, converts the string values to numeric data, then seeks back to the beginning. The experimental program sets up the requisite libbpf objects and for 1M iterations invokes a bpf program which uses the kfuncs to fetch all available stats for node_stat_item, memcg_stat_item, and vm_event_item types. The results showed a significant perf benefit on the experimental side, outperforming the control side by a margin of 93%. In kernel mode, elapsed time was reduced by 80%, while in user mode, over 99% of time was saved. control: elapsed time real 0m38.318s user 0m25.131s sys 0m13.070s experiment: elapsed time real 0m2.789s user 0m0.187s sys 0m2.512s control: perf data 33.43% a.out libc.so.6 [.] __vfscanf_internal 6.88% a.out [kernel.kallsyms] [k] vsnprintf 6.33% a.out libc.so.6 [.] _IO_fgets 5.51% a.out [kernel.kallsyms] [k] format_decode 4.31% a.out libc.so.6 [.] __GI_____strtoull_l_internal 3.78% a.out [kernel.kallsyms] [k] string 3.53% a.out [kernel.kallsyms] [k] number 2.71% a.out libc.so.6 [.] _IO_sputbackc 2.41% a.out [kernel.kallsyms] [k] strlen 1.98% a.out a.out [.] main 1.70% a.out libc.so.6 [.] _IO_getline_info 1.51% a.out libc.so.6 [.] __isoc99_sscanf 1.47% a.out [kernel.kallsyms] [k] memory_stat_format 1.47% a.out [kernel.kallsyms] [k] memcpy_orig 1.41% a.out [kernel.kallsyms] [k] seq_buf_printf experiment: perf data 10.55% memcgstat bpf_prog_..._query [k] bpf_prog_16aab2f19fa982a7_query 6.90% memcgstat [kernel.kallsyms] [k] memcg_page_state_output 3.55% memcgstat [kernel.kallsyms] [k] _raw_spin_lock 3.12% memcgstat [kernel.kallsyms] [k] memcg_events 2.87% memcgstat [kernel.kallsyms] [k] __memcg_slab_post_alloc_hook 2.73% memcgstat [kernel.kallsyms] [k] kmem_cache_free 2.70% memcgstat [kernel.kallsyms] [k] entry_SYSRETQ_unsafe_stack 2.25% memcgstat [kernel.kallsyms] [k] __memcg_slab_free_hook 2.06% memcgstat [kernel.kallsyms] [k] get_page_from_freelist Signed-off-by: Roman Gushchin Co-developed-by: JP Kobryn Signed-off-by: JP Kobryn Link: https://lore.kernel.org/r/20251223044156.208250-5-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 14 +++++++ mm/bpf_memcontrol.c | 85 ++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 16 +++++++ 3 files changed, 115 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7bef427d5a82..6a5d65487b70 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -949,8 +949,12 @@ static inline void mod_memcg_page_state(struct page *page, rcu_read_unlock(); } +unsigned long memcg_events(struct mem_cgroup *memcg, int event); +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); +bool memcg_stat_item_valid(int idx); +bool memcg_vm_event_item_valid(enum vm_event_item idx); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); @@ -1379,6 +1383,16 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, in return 0; } +static inline bool memcg_stat_item_valid(int idx) +{ + return false; +} + +static inline bool memcg_vm_event_item_valid(enum vm_event_item idx) +{ + return false; +} + static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 187919eb2fe2..e8fa7f5855f9 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -80,6 +80,85 @@ __bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg) css_put(&memcg->css); } +/** + * bpf_mem_cgroup_vm_events - Read memory cgroup's vm event counter + * @memcg: memory cgroup + * @event: event id + * + * Allows to read memory cgroup event counters. + * + * Return: The current value of the corresponding events counter. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_vm_events(struct mem_cgroup *memcg, + enum vm_event_item event) +{ + if (unlikely(!memcg_vm_event_item_valid(event))) + return (unsigned long)-1; + + return memcg_events(memcg, event); +} + +/** + * bpf_mem_cgroup_usage - Read memory cgroup's usage + * @memcg: memory cgroup + * + * Please, note that the root memory cgroup it special and is exempt + * from the memory accounting. The returned value is a sum of sub-cgroup's + * usages and it not reflecting the size of the root memory cgroup itself. + * If you need to get an approximation, you can use root level statistics: + * e.g. NR_FILE_PAGES + NR_ANON_MAPPED. + * + * Return: The current memory cgroup size in bytes. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_usage(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory) * PAGE_SIZE; +} + +/** + * bpf_mem_cgroup_memory_events - Read memory cgroup's memory event value + * @memcg: memory cgroup + * @event: memory event id + * + * Return: The current value of the memory event counter. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_memory_events(struct mem_cgroup *memcg, + enum memcg_memory_event event) +{ + if (unlikely(event >= MEMCG_NR_MEMORY_EVENTS)) + return (unsigned long)-1; + + return atomic_long_read(&memcg->memory_events[event]); +} + +/** + * bpf_mem_cgroup_page_state - Read memory cgroup's page state counter + * @memcg: memory cgroup + * @idx: counter idx + * + * Allows to read memory cgroup statistics. The output is in bytes. + * + * Return: The value of the page state counter in bytes. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_page_state(struct mem_cgroup *memcg, int idx) +{ + if (unlikely(!memcg_stat_item_valid(idx))) + return (unsigned long)-1; + + return memcg_page_state_output(memcg, idx); +} + +/** + * bpf_mem_cgroup_flush_stats - Flush memory cgroup's statistics + * @memcg: memory cgroup + * + * Propagate memory cgroup's statistics up the cgroup tree. + */ +__bpf_kfunc void bpf_mem_cgroup_flush_stats(struct mem_cgroup *memcg) +{ + mem_cgroup_flush_stats(memcg); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_memcontrol_kfuncs) @@ -87,6 +166,12 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) + BTF_KFUNCS_END(bpf_memcontrol_kfuncs) static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index be810c1fbfc3..bae4eb72da61 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -663,6 +663,14 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) return x; } +bool memcg_stat_item_valid(int idx) +{ + if ((u32)idx >= MEMCG_NR_STAT) + return false; + + return !BAD_STAT_IDX(memcg_stats_index(idx)); +} + static int memcg_page_state_unit(int item); /* @@ -860,6 +868,14 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event) return READ_ONCE(memcg->vmstats->events[i]); } +bool memcg_vm_event_item_valid(enum vm_event_item idx) +{ + if (idx >= NR_VM_EVENT_ITEMS) + return false; + + return !BAD_STAT_IDX(memcg_events_index(idx)); +} + #ifdef CONFIG_MEMCG_V1 unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { From 6bce6ddbe634bbc6d21672b5bfdbb5ad0409bd8d Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Mon, 22 Dec 2025 20:41:55 -0800 Subject: [PATCH 036/268] bpf: selftests: selftests for memcg stat kfuncs Add test coverage for the kfuncs that fetch memcg stats. Using some common stats, test scenarios ensuring that the given stat increases by some arbitrary amount. The stats selected cover the three categories represented by the enums: node_stat_item, memcg_stat_item, vm_event_item. Since only a subset of all stats are queried, use a static struct made up of fields for each stat. Write to the struct with the fetched values when the bpf program is invoked and read the fields in the user mode program for verification. Signed-off-by: JP Kobryn Signed-off-by: Roman Gushchin Link: https://lore.kernel.org/r/20251223044156.208250-6-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/cgroup_iter_memcg.h | 18 ++ .../bpf/prog_tests/cgroup_iter_memcg.c | 223 ++++++++++++++++++ .../selftests/bpf/progs/cgroup_iter_memcg.c | 39 +++ 3 files changed, 280 insertions(+) create mode 100644 tools/testing/selftests/bpf/cgroup_iter_memcg.h create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c diff --git a/tools/testing/selftests/bpf/cgroup_iter_memcg.h b/tools/testing/selftests/bpf/cgroup_iter_memcg.h new file mode 100644 index 000000000000..3f59b127943b --- /dev/null +++ b/tools/testing/selftests/bpf/cgroup_iter_memcg.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef __CGROUP_ITER_MEMCG_H +#define __CGROUP_ITER_MEMCG_H + +struct memcg_query { + /* some node_stat_item's */ + unsigned long nr_anon_mapped; + unsigned long nr_shmem; + unsigned long nr_file_pages; + unsigned long nr_file_mapped; + /* some memcg_stat_item */ + unsigned long memcg_kmem; + /* some vm_event_item */ + unsigned long pgfault; +}; + +#endif /* __CGROUP_ITER_MEMCG_H */ diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c new file mode 100644 index 000000000000..a5afd16705f0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include "cgroup_helpers.h" +#include "cgroup_iter_memcg.h" +#include "cgroup_iter_memcg.skel.h" + +static int read_stats(struct bpf_link *link) +{ + int fd, ret = 0; + ssize_t bytes; + + fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_OK_FD(fd, "bpf_iter_create")) + return 1; + + /* + * Invoke iter program by reading from its fd. We're not expecting any + * data to be written by the bpf program so the result should be zero. + * Results will be read directly through the custom data section + * accessible through skel->data_query.memcg_query. + */ + bytes = read(fd, NULL, 0); + if (!ASSERT_EQ(bytes, 0, "read fd")) + ret = 1; + + close(fd); + return ret; +} + +static void test_anon(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg anon usage by mapping and writing + * to a new anon region. + */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_anon_mapped, 0, "final anon mapped val"); + +cleanup: + munmap(map, len); +} + +static void test_file(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + char *path; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + path = "/tmp/test_cgroup_iter_memcg"; + + /* + * Increase memcg file usage by creating and writing + * to a mapped file. + */ + fd = open(path, O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "open fd")) + return; + if (!ASSERT_OK(ftruncate(fd, len), "ftruncate")) + goto cleanup_fd; + + map = mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap file")) + goto cleanup_fd; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup_map; + + ASSERT_GT(memcg_query->nr_file_pages, 0, "final file value"); + ASSERT_GT(memcg_query->nr_file_mapped, 0, "final file mapped value"); + +cleanup_map: + munmap(map, len); +cleanup_fd: + close(fd); + unlink(path); +} + +static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + size_t len; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg shmem usage by creating and writing + * to a shmem object. + */ + fd = shm_open("/tmp_shmem", O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "shm_open")) + return; + + if (!ASSERT_OK(fallocate(fd, 0, 0, len), "fallocate")) + goto cleanup; + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_shmem, 0, "final shmem value"); + +cleanup: + close(fd); + shm_unlink("/tmp_shmem"); +} + +#define NR_PIPES 64 +static void test_kmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + int fds[NR_PIPES][2], i; + + /* + * Increase kmem value by creating pipes which will allocate some + * kernel buffers. + */ + for (i = 0; i < NR_PIPES; i++) { + if (!ASSERT_OK(pipe(fds[i]), "pipe")) + goto cleanup; + } + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->memcg_kmem, 0, "kmem value"); + +cleanup: + for (i = i - 1; i >= 0; i--) { + close(fds[i][0]); + close(fds[i][1]); + } +} + +static void test_pgfault(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* Create region to use for triggering a page fault. */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + /* Trigger page fault. */ + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->pgfault, 0, "final pgfault val"); + +cleanup: + munmap(map, len); +} + +void test_cgroup_iter_memcg(void) +{ + char *cgroup_rel_path = "/cgroup_iter_memcg_test"; + struct cgroup_iter_memcg *skel; + struct bpf_link *link; + int cgroup_fd; + + cgroup_fd = cgroup_setup_and_join(cgroup_rel_path); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup_setup_and_join")) + return; + + skel = cgroup_iter_memcg__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_iter_memcg__open_and_load")) + goto cleanup_cgroup_fd; + + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo = { + .cgroup.cgroup_fd = cgroup_fd, + .cgroup.order = BPF_CGROUP_ITER_SELF_ONLY, + }; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.cgroup_memcg_query, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto cleanup_skel; + + if (test__start_subtest("cgroup_iter_memcg__anon")) + test_anon(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__shmem")) + test_shmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__file")) + test_file(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__kmem")) + test_kmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__pgfault")) + test_pgfault(link, &skel->data_query->memcg_query); + + bpf_link__destroy(link); +cleanup_skel: + cgroup_iter_memcg__destroy(skel); +cleanup_cgroup_fd: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c new file mode 100644 index 000000000000..59fb70a3cc50 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "cgroup_iter_memcg.h" + +char _license[] SEC("license") = "GPL"; + +/* The latest values read are stored here. */ +struct memcg_query memcg_query SEC(".data.query"); + +SEC("iter.s/cgroup") +int cgroup_memcg_query(struct bpf_iter__cgroup *ctx) +{ + struct cgroup *cgrp = ctx->cgroup; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + + if (!cgrp) + return 1; + + css = &cgrp->self; + memcg = bpf_get_mem_cgroup(css); + if (!memcg) + return 1; + + bpf_mem_cgroup_flush_stats(memcg); + + memcg_query.nr_anon_mapped = bpf_mem_cgroup_page_state(memcg, NR_ANON_MAPPED); + memcg_query.nr_shmem = bpf_mem_cgroup_page_state(memcg, NR_SHMEM); + memcg_query.nr_file_pages = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES); + memcg_query.nr_file_mapped = bpf_mem_cgroup_page_state(memcg, NR_FILE_MAPPED); + memcg_query.memcg_kmem = bpf_mem_cgroup_page_state(memcg, MEMCG_KMEM); + memcg_query.pgfault = bpf_mem_cgroup_vm_events(memcg, PGFAULT); + + bpf_put_mem_cgroup(memcg); + + return 0; +} From 6e57cdde70c10f4654356cc45467ebce0a5c3f88 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 22 Dec 2025 20:41:56 -0800 Subject: [PATCH 037/268] MAINTAINERS: add an entry for MM BPF extensions Let's create a separate entry for MM BPF extensions: these patches often require an attention from both bpf and mm communities. Signed-off-by: Roman Gushchin Link: https://lore.kernel.org/r/20251223044156.208250-7-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index e7027fba97db..70c2b73b3941 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4799,6 +4799,15 @@ L: bpf@vger.kernel.org S: Maintained F: tools/lib/bpf/ +BPF [MEMORY MANAGEMENT EXTENSIONS] +M: Roman Gushchin +M: JP Kobryn +M: Shakeel Butt +L: bpf@vger.kernel.org +L: linux-mm@kvack.org +S: Maintained +F: mm/bpf_memcontrol.c + BPF [MISC] L: bpf@vger.kernel.org S: Odd Fixes From 342297d51146b1a3184e53925901a0cc282b3f76 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 05:32:45 -0800 Subject: [PATCH 038/268] bpf: allow calling kfuncs from raw_tp programs Associate raw tracepoint program type with the kfunc tracing hook. This allows calling kfuncs from raw_tp programs. Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251222133250.1890587-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0de8fc8a0e0b..539c9fdea41d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8681,6 +8681,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return BTF_KFUNC_HOOK_STRUCT_OPS; case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_LSM: return BTF_KFUNC_HOOK_TRACING; From 83dd46ecb68ecc03cff23e68490ded5d40d79f66 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 05:32:46 -0800 Subject: [PATCH 039/268] selftests: bpf: fix tests with raw_tp calling kfuncs As the previous commit allowed raw_tp programs to call kfuncs, so of the selftests that were expected to fail will now succeed. Signed-off-by: Puranjay Mohan Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251222133250.1890587-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/dynptr_fail.c | 2 +- .../testing/selftests/bpf/progs/verifier_kfunc_prog_types.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c index dda6a8dada82..8f2ae9640886 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c @@ -1465,7 +1465,7 @@ int xdp_invalid_data_slice2(struct xdp_md *xdp) } /* Only supported prog type can create skb-type dynptrs */ -SEC("?raw_tp") +SEC("?xdp") __failure __msg("calling kernel function bpf_dynptr_from_skb is not allowed") int skb_invalid_ctx(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c index a509cad97e69..1fce7a7e8d03 100644 --- a/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c +++ b/tools/testing/selftests/bpf/progs/verifier_kfunc_prog_types.c @@ -32,7 +32,7 @@ static void task_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(task_kfunc_raw_tp) { task_kfunc_load_test(); @@ -86,7 +86,7 @@ static void cgrp_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(cgrp_kfunc_raw_tp) { cgrp_kfunc_load_test(); @@ -138,7 +138,7 @@ static void cpumask_kfunc_load_test(void) } SEC("raw_tp") -__failure __msg("calling kernel function") +__success int BPF_PROG(cpumask_kfunc_raw_tp) { cpumask_kfunc_load_test(); From ac1c5bc7c4c7e20e2070e6eaa673fc3e11619dbb Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Sat, 20 Dec 2025 04:48:09 +0100 Subject: [PATCH 040/268] bpf: crypto: replace -EEXIST with -EBUSY The -EEXIST error code is reserved by the module loading infrastructure to indicate that a module is already loaded. When a module's init function returns -EEXIST, userspace tools like kmod interpret this as "module already loaded" and treat the operation as successful, returning 0 to the user even though the module initialization actually failed. This follows the precedent set by commit 54416fd76770 ("netfilter: conntrack: helper: Replace -EEXIST by -EBUSY") which fixed the same issue in nf_conntrack_helper_register(). This affects bpf_crypto_skcipher module. While the configuration required to build it as a module is unlikely in practice, it is technically possible, so fix it for correctness. Signed-off-by: Daniel Gomez Acked-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20251220-dev-module-init-eexists-bpf-v1-1-7f186663dbe7@samsung.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 83c4d9943084..1ab79a6dec84 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -60,7 +60,7 @@ struct bpf_crypto_ctx { int bpf_crypto_register_type(const struct bpf_crypto_type *type) { struct bpf_crypto_type_list *node; - int err = -EEXIST; + int err = -EBUSY; down_write(&bpf_crypto_types_sem); list_for_each_entry(node, &bpf_crypto_types, list) { From c336b0b327120052c331f6839ee60069065a7c74 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:16 -0800 Subject: [PATCH 041/268] bpf: arena: populate vm_area without allocating memory vm_area_map_pages() may allocate memory while inserting pages into bpf arena's vm_area. In order to make bpf_arena_alloc_pages() kfunc non-sleepable change bpf arena to populate pages without allocating memory: - at arena creation time populate all page table levels except the last level - when new pages need to be inserted call apply_to_page_range() again with apply_range_set_cb() which will only set_pte_at() those pages and will not allocate memory. - when freeing pages call apply_to_existing_page_range with apply_range_clear_cb() to clear the pte for the page to be removed. This doesn't free intermediate page table levels. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 100 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 90 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 872dc0e41c65..55b198b9f1a3 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -2,11 +2,13 @@ /* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ #include #include +#include #include #include "linux/filter.h" #include #include #include +#include #include "range_tree.h" /* @@ -92,6 +94,68 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr) return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT; } +struct apply_range_data { + struct page **pages; + int i; +}; + +static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) +{ + struct apply_range_data *d = data; + struct page *page; + + if (!data) + return 0; + /* sanity check */ + if (unlikely(!pte_none(ptep_get(pte)))) + return -EBUSY; + + page = d->pages[d->i]; + /* paranoia, similar to vmap_pages_pte_range() */ + if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) + return -EINVAL; + + set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + d->i++; + return 0; +} + +static void flush_vmap_cache(unsigned long start, unsigned long size) +{ + flush_cache_vmap(start, start + size); +} + +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) +{ + pte_t old_pte; + struct page *page; + + /* sanity check */ + old_pte = ptep_get(pte); + if (pte_none(old_pte) || !pte_present(old_pte)) + return 0; /* nothing to do */ + + /* get page and free it */ + page = pte_page(old_pte); + if (WARN_ON_ONCE(!page)) + return -EINVAL; + + pte_clear(&init_mm, addr, pte); + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + + __free_page(page); + + return 0; +} + +static int populate_pgtable_except_pte(struct bpf_arena *arena) +{ + return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), + KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL); +} + static struct bpf_map *arena_map_alloc(union bpf_attr *attr) { struct vm_struct *kern_vm; @@ -144,6 +208,12 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) goto err; } mutex_init(&arena->lock); + err = populate_pgtable_except_pte(arena); + if (err) { + range_tree_destroy(&arena->rt); + bpf_map_area_free(arena); + goto err; + } return &arena->map; err: @@ -286,6 +356,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) if (ret) return VM_FAULT_SIGSEGV; + struct apply_range_data data = { .pages = &page, .i = 0 }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { @@ -293,12 +364,13 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_SIGSEGV; } - ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page); + ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); __free_page(page); return VM_FAULT_SIGSEGV; } + flush_vmap_cache(kaddr, PAGE_SIZE); out: page_ref_add(page, 1); vmf->page = page; @@ -428,7 +500,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); - struct page **pages; + struct page **pages = NULL; + long mapped = 0; long pgoff = 0; u32 uaddr32; int ret, i; @@ -450,7 +523,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (!pages) return 0; - guard(mutex)(&arena->lock); + mutex_lock(&arena->lock); if (uaddr) { ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); @@ -465,6 +538,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (ret) goto out_free_pages; + struct apply_range_data data = { .pages = pages, .i = 0 }; ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); if (ret) goto out; @@ -477,18 +551,24 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow * lower 32-bit and it's ok. */ - ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, - kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); - if (ret) { - for (i = 0; i < page_cnt; i++) + apply_to_page_range(&init_mm, kern_vm_start + uaddr32, + page_cnt << PAGE_SHIFT, apply_range_set_cb, &data); + mapped = data.i; + flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); + if (mapped < page_cnt) { + for (i = mapped; i < page_cnt; i++) __free_page(pages[i]); goto out; } + mutex_unlock(&arena->lock); kvfree(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: - range_tree_set(&arena->rt, pgoff, page_cnt); + range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); out_free_pages: + mutex_unlock(&arena->lock); + if (mapped) + arena_free_pages(arena, uaddr32, mapped); kvfree(pages); return 0; } @@ -545,8 +625,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) * page_cnt is big it's faster to do the batched zap. */ zap_pages(arena, full_uaddr, 1); - vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); - __free_page(page); + apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_clear_cb, + NULL); } } From 360c35f8ffae0f184805d9eb7d126474345bac9b Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:17 -0800 Subject: [PATCH 042/268] bpf: arena: use kmalloc_nolock() in place of kvcalloc() To make arena_alloc_pages() safe to be called from any context, replace kvcalloc() with kmalloc_nolock() so as it doesn't sleep or take any locks. kmalloc_nolock() returns NULL for allocations larger than KMALLOC_MAX_CACHE_SIZE, which is (PAGE_SIZE * 2) = 8KB on systems with 4KB pages. So, round down the allocation done by kmalloc_nolock to 1024 * 8 and reuse the array in a loop. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 84 ++++++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 29 deletions(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 55b198b9f1a3..128efb68d47b 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -44,6 +44,8 @@ #define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt); + struct bpf_arena { struct bpf_map map; u64 user_vm_start; @@ -500,8 +502,10 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); + struct apply_range_data data; struct page **pages = NULL; - long mapped = 0; + long remaining, mapped = 0; + long alloc_pages; long pgoff = 0; u32 uaddr32; int ret, i; @@ -518,17 +522,19 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } - /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ - pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); + /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */ + alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *)); + pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE); if (!pages) return 0; + data.pages = pages; mutex_lock(&arena->lock); if (uaddr) { ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); if (ret) - goto out_free_pages; + goto out_unlock_free_pages; ret = range_tree_clear(&arena->rt, pgoff, page_cnt); } else { ret = pgoff = range_tree_find(&arena->rt, page_cnt); @@ -536,40 +542,60 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt ret = range_tree_clear(&arena->rt, pgoff, page_cnt); } if (ret) - goto out_free_pages; - - struct apply_range_data data = { .pages = pages, .i = 0 }; - ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); - if (ret) - goto out; + goto out_unlock_free_pages; + remaining = page_cnt; uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); - /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 - * will not overflow 32-bit. Lower 32-bit need to represent - * contiguous user address range. - * Map these pages at kern_vm_start base. - * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow - * lower 32-bit and it's ok. - */ - apply_to_page_range(&init_mm, kern_vm_start + uaddr32, - page_cnt << PAGE_SHIFT, apply_range_set_cb, &data); - mapped = data.i; - flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); - if (mapped < page_cnt) { - for (i = mapped; i < page_cnt; i++) - __free_page(pages[i]); - goto out; + + while (remaining) { + long this_batch = min(remaining, alloc_pages); + + /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ + memset(pages, 0, this_batch * sizeof(struct page *)); + + ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages); + if (ret) + goto out; + + /* + * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 + * will not overflow 32-bit. Lower 32-bit need to represent + * contiguous user address range. + * Map these pages at kern_vm_start base. + * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow + * lower 32-bit and it's ok. + */ + data.i = 0; + ret = apply_to_page_range(&init_mm, + kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT), + this_batch << PAGE_SHIFT, apply_range_set_cb, &data); + if (ret) { + /* data.i pages were mapped, account them and free the remaining */ + mapped += data.i; + for (i = data.i; i < this_batch; i++) + __free_page(pages[i]); + goto out; + } + + mapped += this_batch; + remaining -= this_batch; } + flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); mutex_unlock(&arena->lock); - kvfree(pages); + kfree_nolock(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); -out_free_pages: mutex_unlock(&arena->lock); - if (mapped) + if (mapped) { + flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); arena_free_pages(arena, uaddr32, mapped); - kvfree(pages); + } + goto out_free_pages; +out_unlock_free_pages: + mutex_unlock(&arena->lock); +out_free_pages: + kfree_nolock(pages); return 0; } From b8467290edab4bafae352bf3f317055669a1a458 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:18 -0800 Subject: [PATCH 043/268] bpf: arena: make arena kfuncs any context safe Make arena related kfuncs any context safe by the following changes: bpf_arena_alloc_pages() and bpf_arena_reserve_pages(): Replace the usage of the mutex with a rqspinlock for range tree and use kmalloc_nolock() wherever needed. Use free_pages_nolock() to free pages from any context. apply_range_set/clear_cb() with apply_to_page_range() has already made populating the vm_area in bpf_arena_alloc_pages() any context safe. bpf_arena_free_pages(): defer the main logic to a workqueue if it is called from a non-sleepable context. specialize_kfunc() is used to replace the sleepable arena_free_pages() with bpf_arena_free_pages_non_sleepable() when the verifier detects the call is from a non-sleepable context. In the non-sleepable case, arena_free_pages() queues the address and the page count to be freed to a lock-less list of struct arena_free_spans and raises an irq_work. The irq_work handler calls schedules_work() as it is safe to be called from irq context. arena_free_worker() (the work queue handler) iterates these spans and clears ptes, flushes tlb, zaps pages, and calls __free_page(). Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++ kernel/bpf/arena.c | 250 +++++++++++++++++++++++++++++++++++------- kernel/bpf/verifier.c | 10 ++ 3 files changed, 234 insertions(+), 42 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da6a00dd313f..4e7d72dfbcd4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -673,6 +673,22 @@ void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, struct bpf_dynptr *ptr__uninit); +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) +void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id, + u64 flags); +void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt); +#else +static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + return NULL; +} + +static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) +{ +} +#endif + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 128efb68d47b..456ac989269d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -4,7 +4,9 @@ #include #include #include +#include #include "linux/filter.h" +#include #include #include #include @@ -44,7 +46,7 @@ #define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt); +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable); struct bpf_arena { struct bpf_map map; @@ -52,8 +54,23 @@ struct bpf_arena { u64 user_vm_end; struct vm_struct *kern_vm; struct range_tree rt; + /* protects rt */ + rqspinlock_t spinlock; struct list_head vma_list; + /* protects vma_list */ struct mutex lock; + struct irq_work free_irq; + struct work_struct free_work; + struct llist_head free_spans; +}; + +static void arena_free_worker(struct work_struct *work); +static void arena_free_irq(struct irq_work *iw); + +struct arena_free_span { + struct llist_node node; + unsigned long uaddr; + u32 page_cnt; }; u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) @@ -127,7 +144,7 @@ static void flush_vmap_cache(unsigned long start, unsigned long size) flush_cache_vmap(start, start + size); } -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages) { pte_t old_pte; struct page *page; @@ -137,17 +154,15 @@ static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) if (pte_none(old_pte) || !pte_present(old_pte)) return 0; /* nothing to do */ - /* get page and free it */ page = pte_page(old_pte); if (WARN_ON_ONCE(!page)) return -EINVAL; pte_clear(&init_mm, addr, pte); - /* ensure no stale TLB entries */ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - - __free_page(page); + /* Add page to the list so it is freed later */ + if (free_pages) + __llist_add(&page->pcp_llist, free_pages); return 0; } @@ -202,6 +217,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) arena->user_vm_end = arena->user_vm_start + vm_range; INIT_LIST_HEAD(&arena->vma_list); + init_llist_head(&arena->free_spans); + init_irq_work(&arena->free_irq, arena_free_irq); + INIT_WORK(&arena->free_work, arena_free_worker); bpf_map_init_from_attr(&arena->map, attr); range_tree_init(&arena->rt); err = range_tree_set(&arena->rt, 0, attr->max_entries); @@ -210,6 +228,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) goto err; } mutex_init(&arena->lock); + raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); if (err) { range_tree_destroy(&arena->rt); @@ -256,6 +275,10 @@ static void arena_map_free(struct bpf_map *map) if (WARN_ON_ONCE(!list_empty(&arena->vma_list))) return; + /* Ensure no pending deferred frees */ + irq_work_sync(&arena->free_irq); + flush_work(&arena->free_work); + /* * free_vm_area() calls remove_vm_area() that calls free_unmap_vmap_area(). * It unmaps everything from vmalloc area and clears pgtables. @@ -339,12 +362,16 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) struct bpf_arena *arena = container_of(map, struct bpf_arena, map); struct page *page; long kbase, kaddr; + unsigned long flags; int ret; kbase = bpf_arena_get_kern_vm_start(arena); kaddr = kbase + (u32)(vmf->address); - guard(mutex)(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + /* Make a reasonable effort to address impossible case */ + return VM_FAULT_RETRY; + page = vmalloc_to_page((void *)kaddr); if (page) /* already have a page vmap-ed */ @@ -352,31 +379,35 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; struct apply_range_data data = { .pages = &page, .i = 0 }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - return VM_FAULT_SIGSEGV; + goto out_unlock_sigsegv; } ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - __free_page(page); - return VM_FAULT_SIGSEGV; + free_pages_nolock(page, 0); + goto out_unlock_sigsegv; } flush_vmap_cache(kaddr, PAGE_SIZE); out: page_ref_add(page, 1); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; +out_unlock_sigsegv: + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + return VM_FAULT_SIGSEGV; } static const struct vm_operations_struct arena_vm_ops = { @@ -497,7 +528,8 @@ static u64 clear_lo32(u64 val) * Allocate pages and vmap them into kernel vmalloc area. * Later the pages will be mmaped into user space vma. */ -static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id) +static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt, int node_id, + bool sleepable) { /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; @@ -506,6 +538,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt struct page **pages = NULL; long remaining, mapped = 0; long alloc_pages; + unsigned long flags; long pgoff = 0; u32 uaddr32; int ret, i; @@ -529,7 +562,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; data.pages = pages; - mutex_lock(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + goto out_free_pages; if (uaddr) { ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); @@ -573,7 +607,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt /* data.i pages were mapped, account them and free the remaining */ mapped += data.i; for (i = data.i; i < this_batch; i++) - __free_page(pages[i]); + free_pages_nolock(pages[i], 0); goto out; } @@ -581,19 +615,19 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt remaining -= this_batch; } flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); - mutex_unlock(&arena->lock); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); kfree_nolock(pages); return clear_lo32(arena->user_vm_start) + uaddr32; out: range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); - mutex_unlock(&arena->lock); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); if (mapped) { flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); - arena_free_pages(arena, uaddr32, mapped); + arena_free_pages(arena, uaddr32, mapped, sleepable); } goto out_free_pages; out_unlock_free_pages: - mutex_unlock(&arena->lock); + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); out_free_pages: kfree_nolock(pages); return 0; @@ -608,42 +642,64 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { struct vma_list *vml; + guard(mutex)(&arena->lock); + /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) zap_page_range_single(vml->vma, uaddr, PAGE_SIZE * page_cnt, NULL); } -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) { u64 full_uaddr, uaddr_end; - long kaddr, pgoff, i; + long kaddr, pgoff; struct page *page; + struct llist_head free_pages; + struct llist_node *pos, *t; + struct arena_free_span *s; + unsigned long flags; + int ret = 0; /* only aligned lower 32-bit are relevant */ uaddr = (u32)uaddr; uaddr &= PAGE_MASK; + kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; full_uaddr = clear_lo32(arena->user_vm_start) + uaddr; uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT)); if (full_uaddr >= uaddr_end) return; page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; - - guard(mutex)(&arena->lock); - pgoff = compute_pgoff(arena, uaddr); - /* clear range */ + + if (!sleepable) + goto defer; + + ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags); + + /* Can't proceed without holding the spinlock so defer the free */ + if (ret) + goto defer; + range_tree_set(&arena->rt, pgoff, page_cnt); + init_llist_head(&free_pages); + /* clear ptes and collect struct pages */ + apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, + apply_range_clear_cb, &free_pages); + + /* drop the lock to do the tlb flush and zap pages */ + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); + if (page_cnt > 1) /* bulk zap if multiple pages being freed */ zap_pages(arena, full_uaddr, page_cnt); - kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr; - for (i = 0; i < page_cnt; i++, kaddr += PAGE_SIZE, full_uaddr += PAGE_SIZE) { - page = vmalloc_to_page((void *)kaddr); - if (!page) - continue; + llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { + page = llist_entry(pos, struct page, pcp_llist); if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ /* Optimization for the common case of page_cnt==1: * If page wasn't mapped into some user vma there @@ -651,9 +707,25 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) * page_cnt is big it's faster to do the batched zap. */ zap_pages(arena, full_uaddr, 1); - apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_clear_cb, - NULL); + __free_page(page); } + + return; + +defer: + s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1); + if (!s) + /* + * If allocation fails in non-sleepable context, pages are intentionally left + * inaccessible (leaked) until the arena is destroyed. Cleanup or retries are not + * possible here, so we intentionally omit them for safety. + */ + return; + + s->page_cnt = page_cnt; + s->uaddr = uaddr; + llist_add(&s->node, &arena->free_spans); + irq_work_queue(&arena->free_irq); } /* @@ -663,6 +735,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) { long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + unsigned long flags; long pgoff; int ret; @@ -673,15 +746,87 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt if (pgoff + page_cnt > page_cnt_max) return -EINVAL; - guard(mutex)(&arena->lock); + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) + return -EBUSY; /* Cannot guard already allocated pages. */ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); - if (ret) - return -EBUSY; + if (ret) { + ret = -EBUSY; + goto out; + } /* "Allocate" the region to prevent it from being allocated. */ - return range_tree_clear(&arena->rt, pgoff, page_cnt); + ret = range_tree_clear(&arena->rt, pgoff, page_cnt); +out: + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + return ret; +} + +static void arena_free_worker(struct work_struct *work) +{ + struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work); + struct llist_node *list, *pos, *t; + struct arena_free_span *s; + u64 arena_vm_start, user_vm_start; + struct llist_head free_pages; + struct page *page; + unsigned long full_uaddr; + long kaddr, page_cnt, pgoff; + unsigned long flags; + + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) { + schedule_work(work); + return; + } + + init_llist_head(&free_pages); + arena_vm_start = bpf_arena_get_kern_vm_start(arena); + user_vm_start = bpf_arena_get_user_vm_start(arena); + + list = llist_del_all(&arena->free_spans); + llist_for_each(pos, list) { + s = llist_entry(pos, struct arena_free_span, node); + page_cnt = s->page_cnt; + kaddr = arena_vm_start + s->uaddr; + pgoff = compute_pgoff(arena, s->uaddr); + + /* clear ptes and collect pages in free_pages llist */ + apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, + apply_range_clear_cb, &free_pages); + + range_tree_set(&arena->rt, pgoff, page_cnt); + } + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); + + /* Iterate the list again without holding spinlock to do the tlb flush and zap_pages */ + llist_for_each_safe(pos, t, list) { + s = llist_entry(pos, struct arena_free_span, node); + page_cnt = s->page_cnt; + full_uaddr = clear_lo32(user_vm_start) + s->uaddr; + kaddr = arena_vm_start + s->uaddr; + + /* ensure no stale TLB entries */ + flush_tlb_kernel_range(kaddr, kaddr + (page_cnt * PAGE_SIZE)); + + /* remove pages from user vmas */ + zap_pages(arena, full_uaddr, page_cnt); + + kfree_nolock(s); + } + + /* free all pages collected by apply_to_existing_page_range() in the first loop */ + llist_for_each_safe(pos, t, __llist_del_all(&free_pages)) { + page = llist_entry(pos, struct page, pcp_llist); + __free_page(page); + } +} + +static void arena_free_irq(struct irq_work *iw) +{ + struct bpf_arena *arena = container_of(iw, struct bpf_arena, free_irq); + + schedule_work(&arena->free_work); } __bpf_kfunc_start_defs(); @@ -695,9 +840,20 @@ __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_ if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) return NULL; - return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id); + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); } +void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); +} __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; @@ -705,7 +861,17 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) return; - arena_free_pages(arena, (long)ptr__ign, page_cnt); + arena_free_pages(arena, (long)ptr__ign, page_cnt, true); +} + +void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign) + return; + arena_free_pages(arena, (long)ptr__ign, page_cnt, false); } __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt) @@ -724,9 +890,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6b8a77fbe3b..2de1a736ef69 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12380,6 +12380,8 @@ enum special_kfunc_type { KF___bpf_trap, KF_bpf_task_work_schedule_signal_impl, KF_bpf_task_work_schedule_resume_impl, + KF_bpf_arena_alloc_pages, + KF_bpf_arena_free_pages, }; BTF_ID_LIST(special_kfunc_list) @@ -12454,6 +12456,8 @@ BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) BTF_ID(func, bpf_task_work_schedule_signal_impl) BTF_ID(func, bpf_task_work_schedule_resume_impl) +BTF_ID(func, bpf_arena_alloc_pages) +BTF_ID(func, bpf_arena_free_pages) static bool is_task_work_add_kfunc(u32 func_id) { @@ -22432,6 +22436,12 @@ static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { if (!env->insn_aux_data[insn_idx].non_sleepable) addr = (unsigned long)bpf_dynptr_from_file_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_free_pages_non_sleepable; } desc->addr = addr; return 0; From efecc9e825f4aa3fe616236152604a066a3e776d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 22 Dec 2025 11:50:19 -0800 Subject: [PATCH 044/268] selftests: bpf: test non-sleepable arena allocations As arena kfuncs can now be called from non-sleepable contexts, test this by adding non-sleepable copies of tests in verifier_arena, this is done by using a socket program instead of syscall. Add a new test case in verifier_arena_large to check that the bpf_arena_alloc_pages() works for more than 1024 pages. 1024 * sizeof(struct page *) is the upper limit of kmalloc_nolock() but bpf_arena_alloc_pages() should still succeed because it re-uses this array in a loop. Augment the arena_list selftest to also run in non-sleepable context by taking rcu_read_lock. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251222195022.431211-5-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/arena_list.c | 20 +- .../testing/selftests/bpf/progs/arena_list.c | 11 ++ .../selftests/bpf/progs/verifier_arena.c | 185 ++++++++++++++++++ .../bpf/progs/verifier_arena_large.c | 29 +++ 4 files changed, 240 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c index d15867cddde0..4f2866a615ce 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_list.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_list.c @@ -27,17 +27,23 @@ static int list_sum(struct arena_list_head *head) return sum; } -static void test_arena_list_add_del(int cnt) +static void test_arena_list_add_del(int cnt, bool nonsleepable) { LIBBPF_OPTS(bpf_test_run_opts, opts); struct arena_list *skel; int expected_sum = (u64)cnt * (cnt - 1) / 2; int ret, sum; - skel = arena_list__open_and_load(); - if (!ASSERT_OK_PTR(skel, "arena_list__open_and_load")) + skel = arena_list__open(); + if (!ASSERT_OK_PTR(skel, "arena_list__open")) return; + skel->rodata->nonsleepable = nonsleepable; + + ret = arena_list__load(skel); + if (!ASSERT_OK(ret, "arena_list__load")) + goto out; + skel->bss->cnt = cnt; ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.arena_list_add), &opts); ASSERT_OK(ret, "ret_add"); @@ -65,7 +71,11 @@ static void test_arena_list_add_del(int cnt) void test_arena_list(void) { if (test__start_subtest("arena_list_1")) - test_arena_list_add_del(1); + test_arena_list_add_del(1, false); if (test__start_subtest("arena_list_1000")) - test_arena_list_add_del(1000); + test_arena_list_add_del(1000, false); + if (test__start_subtest("arena_list_1_nonsleepable")) + test_arena_list_add_del(1, true); + if (test__start_subtest("arena_list_1000_nonsleepable")) + test_arena_list_add_del(1000, true); } diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c index 3a2ddcacbea6..235d8cc95bdd 100644 --- a/tools/testing/selftests/bpf/progs/arena_list.c +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -30,6 +30,7 @@ struct arena_list_head __arena *list_head; int list_sum; int cnt; bool skip = false; +const volatile bool nonsleepable = false; #ifdef __BPF_FEATURE_ADDR_SPACE_CAST long __arena arena_sum; @@ -42,6 +43,9 @@ int test_val SEC(".addr_space.1"); int zero; +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + SEC("syscall") int arena_list_add(void *ctx) { @@ -71,6 +75,10 @@ int arena_list_del(void *ctx) struct elem __arena *n; int sum = 0; + /* Take rcu_read_lock to test non-sleepable context */ + if (nonsleepable) + bpf_rcu_read_lock(); + arena_sum = 0; list_for_each_entry(n, list_head, node) { sum += n->value; @@ -79,6 +87,9 @@ int arena_list_del(void *ctx) bpf_free(n); } list_sum = sum; + + if (nonsleepable) + bpf_rcu_read_unlock(); #else skip = true; #endif diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 7f4827eede3c..4a9d96344813 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -21,6 +21,37 @@ struct { #endif } arena SEC(".maps"); +SEC("socket") +__success __retval(0) +int basic_alloc1_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + volatile int __arena *page1, *page2, *no_page; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page1) + return 1; + *page1 = 1; + page2 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page2) + return 2; + *page2 = 2; + no_page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (no_page) + return 3; + if (*page1 != 1) + return 4; + if (*page2 != 2) + return 5; + bpf_arena_free_pages(&arena, (void __arena *)page2, 1); + if (*page1 != 1) + return 6; + if (*page2 != 0 && *page2 != 2) /* use-after-free should return 0 or the stored value */ + return 7; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_alloc1(void *ctx) @@ -60,6 +91,44 @@ int basic_alloc1(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_alloc2_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + volatile char __arena *page1, *page2, *page3, *page4; + + page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); + if (!page1) + return 1; + page2 = page1 + __PAGE_SIZE; + page3 = page1 + __PAGE_SIZE * 2; + page4 = page1 - __PAGE_SIZE; + *page1 = 1; + *page2 = 2; + *page3 = 3; + *page4 = 4; + if (*page1 != 1) + return 1; + if (*page2 != 2) + return 2; + if (*page3 != 0) + return 3; + if (*page4 != 0) + return 4; + bpf_arena_free_pages(&arena, (void __arena *)page1, 2); + if (*page1 != 0 && *page1 != 1) + return 5; + if (*page2 != 0 && *page2 != 2) + return 6; + if (*page3 != 0) + return 7; + if (*page4 != 0) + return 8; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_alloc2(void *ctx) @@ -102,6 +171,19 @@ struct bpf_arena___l { struct bpf_map map; } __attribute__((preserve_access_index)); +SEC("socket") +__success __retval(0) __log_level(2) +int basic_alloc3_nosleep(void *ctx) +{ + struct bpf_arena___l *ar = (struct bpf_arena___l *)&arena; + volatile char __arena *pages; + + pages = bpf_arena_alloc_pages(&ar->map, NULL, ar->map.max_entries, NUMA_NO_NODE, 0); + if (!pages) + return 1; + return 0; +} + SEC("syscall") __success __retval(0) __log_level(2) int basic_alloc3(void *ctx) @@ -115,6 +197,38 @@ int basic_alloc3(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_reserve1_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) + return 1; + + page += __PAGE_SIZE; + + /* Reserve the second page */ + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 2; + + /* Try to explicitly allocate the reserved page. */ + page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0); + if (page) + return 3; + + /* Try to implicitly allocate the page (since there's only 2 of them). */ + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (page) + return 4; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_reserve1(void *ctx) @@ -147,6 +261,26 @@ int basic_reserve1(void *ctx) return 0; } +SEC("socket") +__success __retval(0) +int basic_reserve2_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = arena_base(&arena); + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 1; + + page = bpf_arena_alloc_pages(&arena, page, 1, NUMA_NO_NODE, 0); + if ((u64)page) + return 2; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int basic_reserve2(void *ctx) @@ -168,6 +302,27 @@ int basic_reserve2(void *ctx) } /* Reserve the same page twice, should return -EBUSY. */ +SEC("socket") +__success __retval(0) +int reserve_twice_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + page = arena_base(&arena); + + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) + return 1; + + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret != -EBUSY) + return 2; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int reserve_twice(void *ctx) @@ -190,6 +345,36 @@ int reserve_twice(void *ctx) } /* Try to reserve past the end of the arena. */ +SEC("socket") +__success __retval(0) +int reserve_invalid_region_nosleep(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + /* Try a NULL pointer. */ + ret = bpf_arena_reserve_pages(&arena, NULL, 3); + if (ret != -EINVAL) + return 1; + + page = arena_base(&arena); + + ret = bpf_arena_reserve_pages(&arena, page, 3); + if (ret != -EINVAL) + return 2; + + ret = bpf_arena_reserve_pages(&arena, page, 4096); + if (ret != -EINVAL) + return 3; + + ret = bpf_arena_reserve_pages(&arena, page, (1ULL << 32) - 1); + if (ret != -EINVAL) + return 4; +#endif + return 0; +} + SEC("syscall") __success __retval(0) int reserve_invalid_region(void *ctx) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 2b8cf2a4d880..4ca491cbe8d1 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -283,5 +283,34 @@ int big_alloc2(void *ctx) return 9; return 0; } + +SEC("socket") +__success __retval(0) +int big_alloc3(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *pages; + u64 i; + + /* + * Allocate 2051 pages in one go to check how kmalloc_nolock() handles large requests. + * Since kmalloc_nolock() can allocate up to 1024 struct page * at a time, this call should + * result in three batches: two batches of 1024 pages each, followed by a final batch of 3 + * pages. + */ + pages = bpf_arena_alloc_pages(&arena, NULL, 2051, NUMA_NO_NODE, 0); + if (!pages) + return -1; + + bpf_for(i, 0, 2051) + pages[i * PAGE_SIZE] = 123; + bpf_for(i, 0, 2051) + if (pages[i * PAGE_SIZE] != 123) + return i; + + bpf_arena_free_pages(&arena, pages, 2051); +#endif + return 0; +} #endif char _license[] SEC("license") = "GPL"; From 600605853f87a4b1c3530a63f78a3541633402b0 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 29 Dec 2025 12:28:23 -0800 Subject: [PATCH 045/268] scripts/gen-btf.sh: Fix .btf.o generation when compiling for RISCV gen-btf.sh emits a .btf.o file with BTF sections to be linked into vmlinux in link-vmlinux.sh This .btf.o file is created by compiling an emptystring with ${CC}, and then adding BTF sections into it with ${OBJCOPY}. To ensure the .btf.o is linkable when cross-compiling with LLVM, we have to also pass ${KBUILD_FLAGS}, which in particular control the target word size. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512240559.2M06DSX7-lkp@intel.com/ Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20251229202823.569619-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- scripts/gen-btf.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh index 06c6d8becaa2..12244dbe097c 100755 --- a/scripts/gen-btf.sh +++ b/scripts/gen-btf.sh @@ -96,7 +96,7 @@ gen_btf_o() # deletes all symbols including __start_BTF and __stop_BTF, which will # be redefined in the linker script. info OBJCOPY "${btf_data}" - echo "" | ${CC} ${CLANG_FLAGS} -c -x c -o ${btf_data} - + echo "" | ${CC} ${CLANG_FLAGS} ${KBUILD_CFLAGS} -c -x c -o ${btf_data} - ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF \ --set-section-flags .BTF=alloc,readonly ${btf_data} ${OBJCOPY} --only-section=.BTF --strip-all ${btf_data} From 317a5df78f24bd77fb770a26eb85bf39620592e0 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 30 Dec 2025 11:51:32 -0800 Subject: [PATCH 046/268] selftests/bpf: Fix verifier_arena_large/big_alloc3 test The big_alloc3() test tries to allocate 2051 pages at once in non-sleepable context and this can fail sporadically on resource contrained systems, so skip this test in case of such failures. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251230195134.599463-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena_large.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c index 4ca491cbe8d1..5f7e7afee169 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_large.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -300,7 +300,7 @@ int big_alloc3(void *ctx) */ pages = bpf_arena_alloc_pages(&arena, NULL, 2051, NUMA_NO_NODE, 0); if (!pages) - return -1; + return 0; bpf_for(i, 0, 2051) pages[i * PAGE_SIZE] = 123; From f597664454bde5ac45ceaf24da55b590ccfa60e3 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 29 Dec 2025 23:13:07 -0800 Subject: [PATCH 047/268] bpf: bpf_scc_visit instance and backedges accumulation for bpf_loop() Calls like bpf_loop() or bpf_for_each_map_elem() introduce loops that are not explicitly present in the control-flow graph. The verifier processes such calls by repeatedly interpreting the callback function body within the same verification path (until the current state converges with a previous state). Such loops require a bpf_scc_visit instance in order to allow the accumulation of the state graph backedges. Otherwise, certain checkpoint states created within the bodies of such loops will have incomplete precision marks. See the next patch for an example of a program that leads to the verifier accepting an unsafe program. Fixes: 96c6aa4c63af ("bpf: compute SCCs in program control flow graph") Fixes: c9e31900b54c ("bpf: propagate read/precision marks over state graph backedges") Reported-by: Breno Leitao Signed-off-by: Eduard Zingerman Tested-by: Breno Leitao Link: https://lore.kernel.org/r/20251229-scc-for-callbacks-v1-1-ceadfe679900@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2de1a736ef69..0baae7828af2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19830,8 +19830,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) } } if (bpf_calls_callback(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + loop = true; goto hit; + } goto skip_inf_loop_check; } /* attempt to detect infinite loop to avoid unnecessary doomed work */ @@ -25071,15 +25073,18 @@ static int compute_scc(struct bpf_verifier_env *env) } /* * Assign SCC number only if component has two or more elements, - * or if component has a self reference. + * or if component has a self reference, or if instruction is a + * callback calling function (implicit loop). */ - assign_scc = stack[stack_sz - 1] != w; - for (j = 0; j < succ->cnt; ++j) { + assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */ + for (j = 0; j < succ->cnt; ++j) { /* self reference? */ if (succ->items[j] == w) { assign_scc = true; break; } } + if (bpf_calls_callback(env, w)) /* implicit loop? */ + assign_scc = true; /* Pop component elements from stack */ do { t = stack[--stack_sz]; From e6f2612f0e7c23ce991d3094b5387caf1a52a4fe Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 29 Dec 2025 23:13:08 -0800 Subject: [PATCH 048/268] selftests/bpf: test cases for bpf_loop SCC and state graph backedges Test for state graph backedges accumulation for SCCs formed by bpf_loop(). Equivalent to the following C program: int main(void) { 1: fp[-8] = bpf_get_prandom_u32(); 2: fp[-16] = -32; // used in a memory access below 3: bpf_loop(7, loop_cb4, fp, 0); 4: return 0; } int loop_cb4(int i, void *ctx) { 5: if (unlikely(ctx[-8] > bpf_get_prandom_u32())) 6: *(u64 *)(fp + ctx[-16]) = 42; // aligned access expected 7: if (unlikely(fp[-8] > bpf_get_prandom_u32())) 8: ctx[-16] = -31; // makes said access unaligned 9: return 0; } If state graph backedges are not accumulated properly at the SCC formed by loop_cb4() call from bpf_loop(), the state {ctx[-16]=-32} injected at instruction 9 on verification path 1,2,3,5,7,9,4 would be considered fully verified and would lack precision mark for ctx[-16]. This would lead to early pruning of verification path 1,2,3,5,7,8,9 in state {ctx[-16]=-31}, which in turn leads to the incorrect assumption that the above program is safe. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251229-scc-for-callbacks-v1-2-ceadfe679900@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/iters.c | 75 +++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 7dd92a303bf6..69061f030957 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1926,4 +1926,79 @@ static int loop1_wrapper(void) ); } +/* + * This is similar to a test case absent_mark_in_the_middle_state(), + * but adapted for use with bpf_loop(). + */ +SEC("raw_tp") +__flag(BPF_F_TEST_STATE_FREQ) +__failure __msg("math between fp pointer and register with unbounded min value is not allowed") +__naked void absent_mark_in_the_middle_state4(void) +{ + /* + * Equivalent to a C program below: + * + * int main(void) { + * fp[-8] = bpf_get_prandom_u32(); + * fp[-16] = -32; // used in a memory access below + * bpf_loop(7, loop_cb4, fp, 0); + * return 0; + * } + * + * int loop_cb4(int i, void *ctx) { + * if (unlikely(ctx[-8] > bpf_get_prandom_u32())) + * *(u64 *)(fp + ctx[-16]) = 42; // aligned access expected + * if (unlikely(fp[-8] > bpf_get_prandom_u32())) + * ctx[-16] = -31; // makes said access unaligned + * return 0; + * } + */ + asm volatile ( + "call %[bpf_get_prandom_u32];" + "r8 = r0;" + "*(u64 *)(r10 - 8) = r0;" + "*(u64 *)(r10 - 16) = -32;" + "r1 = 7;" + "r2 = loop_cb4 ll;" + "r3 = r10;" + "r4 = 0;" + "call %[bpf_loop];" + "r0 = 0;" + "exit;" + : + : __imm(bpf_loop), + __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + +__used __naked +static void loop_cb4(void) +{ + asm volatile ( + "r9 = r2;" + "r8 = *(u64 *)(r9 - 8);" + "r6 = *(u64 *)(r9 - 16);" + "call %[bpf_get_prandom_u32];" + "if r0 > r8 goto use_fp16_%=;" + "1:" + "call %[bpf_get_prandom_u32];" + "if r0 > r8 goto update_fp16_%=;" + "2:" + "r0 = 0;" + "exit;" + "use_fp16_%=:" + "r1 = r10;" + "r1 += r6;" + "*(u64 *)(r1 + 0) = 42;" + "goto 1b;" + "update_fp16_%=:" + "*(u64 *)(r9 - 16) = -31;" + "goto 2b;" + : + : __imm(bpf_get_prandom_u32) + : __clobber_all + ); +} + char _license[] SEC("license") = "GPL"; From 840692326e92b5deb76c224931e8ca145ce7cfb8 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 30 Dec 2025 21:36:03 -0800 Subject: [PATCH 049/268] bpf: allow states pruning for misc/invalid slots in iterator loops Within an iterator or callback based loop, it should be safe to prune the current state if the old state stack slot is marked as STACK_INVALID or STACK_MISC: - either all branches of the old state lead to a program exit; - or some branch of the old state leads the current state. This is the same logic as applied in non-loop cases when states_equal() is called in NOT_EXACT mode. The test case that exercises stacksafe() and demonstrates the difference in verification performance is included in the next patch. I'm not sure if it is possible to prepare a test case that exercises regsafe(); it appears that the compute_live_registers() pass makes this impossible. Nevertheless, for code readability reasons, I think that stacksafe() and regsafe() should handle STACK_INVALID / NOT_INIT symmetrically. Hence, this commit changes both functions. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251230-loop-stack-misc-pruning-v1-1-585cfd6cec51@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0baae7828af2..3d44c5d06623 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19086,11 +19086,9 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, if (exact == EXACT) return regs_exact(rold, rcur, idmap); - if (rold->type == NOT_INIT) { - if (exact == NOT_EXACT || rcur->type == NOT_INIT) - /* explored state can't have used this */ - return true; - } + if (rold->type == NOT_INIT) + /* explored state can't have used this */ + return true; /* Enforce that register types have to match exactly, including their * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general @@ -19259,7 +19257,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; - if (exact != NOT_EXACT && + if (exact == EXACT && (i >= cur->allocated_stack || old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE])) From 4fd99103eef347174b3c9b6071428324a3cf9a60 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 30 Dec 2025 21:36:04 -0800 Subject: [PATCH 050/268] selftests/bpf: iterator based loop and STACK_MISC states pruning The test case first initializes 9 stack slots as STACK_MISC, then conditionally updates each of them to SCALAR spill inside an iterator based loop. This leads to 2**9 combinations of MISC/SPILL marks for these slots at the iterator next call. The loop converges only if the verifier treats such states as equivalent, otherwise visited states are evicted from the states cache too quickly. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251230-loop-stack-misc-pruning-v1-2-585cfd6cec51@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/iters.c | 65 +++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 69061f030957..7f27b517d5d5 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1997,6 +1997,71 @@ static void loop_cb4(void) "goto 2b;" : : __imm(bpf_get_prandom_u32) + ); +} + +SEC("raw_tp") +__success +__naked int stack_misc_vs_scalar_in_a_loop(void) +{ + asm volatile( + "*(u8 *)(r10 - 15) = 1;" /* This marks stack slot fp[-16] as STACK_MISC. */ + "*(u8 *)(r10 - 23) = 1;" + "*(u8 *)(r10 - 31) = 1;" + "*(u8 *)(r10 - 39) = 1;" + "*(u8 *)(r10 - 47) = 1;" + "*(u8 *)(r10 - 55) = 1;" + "*(u8 *)(r10 - 63) = 1;" + "*(u8 *)(r10 - 71) = 1;" + "*(u8 *)(r10 - 79) = 1;" + "r1 = r10;" + "r1 += -8;" + "r2 = 0;" + "r3 = 10;" + "call %[bpf_iter_num_new];" + "loop_%=:" + "r1 = r10;" + "r1 += -8;" + "call %[bpf_iter_num_next];" + "if r0 == 0 goto loop_end_%=;" + +#define maybe_change_stack_slot(off) \ + "call %[bpf_get_prandom_u32];" \ + "if r0 == 42 goto +1;" \ + "goto +1;" \ + "*(u64 *)(r10 " #off ") = r0;" + + /* + * When comparing verifier states fp[-16] will be + * either STACK_MISC or SCALAR. Pruning logic should + * consider old STACK_MISC equivalent to current SCALAR + * to avoid states explosion. + */ + maybe_change_stack_slot(-16) + maybe_change_stack_slot(-24) + maybe_change_stack_slot(-32) + maybe_change_stack_slot(-40) + maybe_change_stack_slot(-48) + maybe_change_stack_slot(-56) + maybe_change_stack_slot(-64) + maybe_change_stack_slot(-72) + maybe_change_stack_slot(-80) + +#undef maybe_change_stack_slot + + "goto loop_%=;" + "loop_end_%=:" + "r1 = r10;" + "r1 += -8;" + "call %[bpf_iter_num_destroy];" + "r0 = 0;" + "exit;" + : + : __imm(bpf_get_prandom_u32), + __imm(bpf_iter_num_new), + __imm(bpf_iter_num_next), + __imm(bpf_iter_num_destroy), + __imm_addr(amap) : __clobber_all ); } From 1a8fa7faf4890d201aad4f5d4943f74d840cd0ba Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 30 Dec 2025 17:25:57 -0800 Subject: [PATCH 051/268] resolve_btfids: Implement --patch_btfids Recent changes in BTF generation [1] rely on ${OBJCOPY} command to update .BTF_ids section data in target ELF files. This exposed a bug in llvm-objcopy --update-section code path, that may lead to corruption of a target ELF file. Specifically, because of the bug st_shndx of some symbols may be (incorrectly) set to 0xffff (SHN_XINDEX) [2][3]. While there is a pending fix for LLVM, it'll take some time before it lands (likely in 22.x). And the kernel build must keep working with older LLVM toolchains in the foreseeable future. Using GNU objcopy for .BTF_ids update would work, but it would require changes to LLVM-based build process, likely breaking existing build environments as discussed in [2]. To work around llvm-objcopy bug, implement --patch_btfids code path in resolve_btfids as a drop-in replacement for: ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${elf} Which works specifically for .BTF_ids section: ${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${elf} This feature in resolve_btfids can be removed at some point in the future, when llvm-objcopy with a relevant bugfix becomes common. [1] https://lore.kernel.org/bpf/20251219181321.1283664-1-ihor.solodrai@linux.dev/ [2] https://lore.kernel.org/bpf/20251224005752.201911-1-ihor.solodrai@linux.dev/ [3] https://github.com/llvm/llvm-project/issues/168060#issuecomment-3533552952 Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20251231012558.1699758-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- scripts/gen-btf.sh | 2 +- scripts/link-vmlinux.sh | 2 +- tools/bpf/resolve_btfids/main.c | 117 +++++++++++++++++++++++++++ tools/testing/selftests/bpf/Makefile | 2 +- 4 files changed, 120 insertions(+), 3 deletions(-) diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh index 12244dbe097c..0aec86615416 100755 --- a/scripts/gen-btf.sh +++ b/scripts/gen-btf.sh @@ -123,7 +123,7 @@ embed_btf_data() fi local btf_ids="${ELF_FILE}.BTF_ids" if [ -f "${btf_ids}" ]; then - ${OBJCOPY} --update-section .BTF_ids=${btf_ids} ${ELF_FILE} + ${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${ELF_FILE} fi } diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index e2207e612ac3..1915adf3249b 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -266,7 +266,7 @@ vmlinux_link "${VMLINUX}" if is_enabled CONFIG_DEBUG_INFO_BTF; then info OBJCOPY ${btfids_vmlinux} - ${OBJCOPY} --update-section .BTF_ids=${btfids_vmlinux} ${VMLINUX} + ${RESOLVE_BTFIDS} --patch_btfids ${btfids_vmlinux} ${VMLINUX} fi mksysmap "${VMLINUX}" System.map diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 2cbc252259be..df39982f51df 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -862,8 +862,119 @@ static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, cons return 0; } +/* + * Patch the .BTF_ids section of an ELF file with data from provided file. + * Equivalent to: objcopy --update-section .BTF_ids= + * + * 1. Find .BTF_ids section in the ELF + * 2. Verify that blob file size matches section size + * 3. Update section data buffer with blob data + * 4. Write the ELF file + */ +static int patch_btfids(const char *btfids_path, const char *elf_path) +{ + Elf_Scn *scn = NULL; + FILE *btfids_file; + size_t shdrstrndx; + int fd, err = -1; + Elf_Data *data; + struct stat st; + GElf_Shdr sh; + char *name; + Elf *elf; + + elf_version(EV_CURRENT); + + fd = open(elf_path, O_RDWR, 0666); + if (fd < 0) { + pr_err("FAILED to open %s: %s\n", elf_path, strerror(errno)); + return -1; + } + + elf = elf_begin(fd, ELF_C_RDWR_MMAP, NULL); + if (!elf) { + close(fd); + pr_err("FAILED cannot create ELF descriptor: %s\n", elf_errmsg(-1)); + return -1; + } + + elf_flagelf(elf, ELF_C_SET, ELF_F_LAYOUT); + + if (elf_getshdrstrndx(elf, &shdrstrndx) != 0) { + pr_err("FAILED cannot get shdr str ndx\n"); + goto out; + } + + while ((scn = elf_nextscn(elf, scn)) != NULL) { + + if (gelf_getshdr(scn, &sh) != &sh) { + pr_err("FAILED to get section header\n"); + goto out; + } + + name = elf_strptr(elf, shdrstrndx, sh.sh_name); + if (!name) + continue; + + if (strcmp(name, BTF_IDS_SECTION) == 0) + break; + } + + if (!scn) { + pr_err("FAILED: section %s not found in %s\n", BTF_IDS_SECTION, elf_path); + goto out; + } + + data = elf_getdata(scn, NULL); + if (!data) { + pr_err("FAILED to get %s section data from %s\n", BTF_IDS_SECTION, elf_path); + goto out; + } + + if (stat(btfids_path, &st) < 0) { + pr_err("FAILED to stat %s: %s\n", btfids_path, strerror(errno)); + goto out; + } + + if ((size_t)st.st_size != data->d_size) { + pr_err("FAILED: size mismatch - %s section in %s is %zu bytes, %s is %zu bytes\n", + BTF_IDS_SECTION, elf_path, data->d_size, btfids_path, (size_t)st.st_size); + goto out; + } + + btfids_file = fopen(btfids_path, "rb"); + if (!btfids_file) { + pr_err("FAILED to open %s: %s\n", btfids_path, strerror(errno)); + goto out; + } + + pr_debug("Copying data from %s to %s section of %s (%zu bytes)\n", + btfids_path, BTF_IDS_SECTION, elf_path, data->d_size); + + if (fread(data->d_buf, data->d_size, 1, btfids_file) != 1) { + pr_err("FAILED to read %s\n", btfids_path); + fclose(btfids_file); + goto out; + } + fclose(btfids_file); + + elf_flagdata(data, ELF_C_SET, ELF_F_DIRTY); + if (elf_update(elf, ELF_C_WRITE) < 0) { + pr_err("FAILED to update ELF file %s\n", elf_path); + goto out; + } + + err = 0; +out: + elf_end(elf); + close(fd); + + return err; +} + static const char * const resolve_btfids_usage[] = { "resolve_btfids [] ", + "resolve_btfids --patch_btfids <.BTF_ids file> ", NULL }; @@ -880,6 +991,7 @@ int main(int argc, const char **argv) .funcs = RB_ROOT, .sets = RB_ROOT, }; + const char *btfids_path = NULL; bool fatal_warnings = false; char out_path[PATH_MAX]; @@ -894,6 +1006,8 @@ int main(int argc, const char **argv) "turn warnings into errors"), OPT_BOOLEAN(0, "distill_base", &obj.distill_base, "distill --btf_base and emit .BTF.base section data"), + OPT_STRING(0, "patch_btfids", &btfids_path, "file", + "path to .BTF_ids section data blob to patch into ELF file"), OPT_END() }; int err = -1; @@ -905,6 +1019,9 @@ int main(int argc, const char **argv) obj.path = argv[0]; + if (btfids_path) + return patch_btfids(btfids_path, obj.path); + if (load_btf(&obj)) goto out; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f28a32b16ff0..9488d076c740 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -657,7 +657,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $$(if $$(TEST_NEEDS_BTFIDS), \ $$(call msg,BTFIDS,$(TRUNNER_BINARY),$$@) \ $(RESOLVE_BTFIDS) --btf $(TRUNNER_OUTPUT)/btf_data.bpf.o $$@; \ - $(OBJCOPY) --update-section .BTF_ids=$$@.BTF_ids $$@) + $(RESOLVE_BTFIDS) --patch_btfids $$@.BTF_ids $$@) $(TRUNNER_TEST_OBJS:.o=.d): $(TRUNNER_OUTPUT)/%.test.d: \ $(TRUNNER_TESTS_DIR)/%.c \ From 453dece55bb1cc6812314497a1c5ecc0513457ed Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 31 Dec 2025 10:39:29 -0800 Subject: [PATCH 052/268] scripts/gen-btf.sh: Reduce log verbosity Remove info messages from gen-btf.sh, as they are unnecessarily detailed and sometimes inaccurate [1]. Verbose log can be produced by passing V=1 to make, which will set -x for the shell. [1] https://lore.kernel.org/bpf/CAADnVQ+biTSDaNtoL=ct9XtBJiXYMUqGYLqu604C3D8N+8YH9A@mail.gmail.com/ Suggested-by: Alexei Starovoitov Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20251231183929.65668-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- scripts/gen-btf.sh | 10 ---------- scripts/link-vmlinux.sh | 3 ++- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh index 0aec86615416..d6457661b9b6 100755 --- a/scripts/gen-btf.sh +++ b/scripts/gen-btf.sh @@ -60,28 +60,20 @@ is_enabled() { grep -q "^$1=y" ${objtree}/include/config/auto.conf } -info() -{ - printf " %-7s %s\n" "${1}" "${2}" -} - case "${KBUILD_VERBOSE}" in *1*) set -x ;; esac - gen_btf_data() { - info BTF "${ELF_FILE}" btf1="${ELF_FILE}.BTF.1" ${PAHOLE} -J ${PAHOLE_FLAGS} \ ${BTF_BASE:+--btf_base ${BTF_BASE}} \ --btf_encode_detached=${btf1} \ "${ELF_FILE}" - info BTFIDS "${ELF_FILE}" ${RESOLVE_BTFIDS} ${RESOLVE_BTFIDS_FLAGS} \ ${BTF_BASE:+--btf_base ${BTF_BASE}} \ --btf ${btf1} "${ELF_FILE}" @@ -95,7 +87,6 @@ gen_btf_o() # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all # deletes all symbols including __start_BTF and __stop_BTF, which will # be redefined in the linker script. - info OBJCOPY "${btf_data}" echo "" | ${CC} ${CLANG_FLAGS} ${KBUILD_CFLAGS} -c -x c -o ${btf_data} - ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF \ --set-section-flags .BTF=alloc,readonly ${btf_data} @@ -113,7 +104,6 @@ gen_btf_o() embed_btf_data() { - info OBJCOPY "${ELF_FILE}.BTF" ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF ${ELF_FILE} # a module might not have a .BTF_ids or .BTF.base section diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 1915adf3249b..08cd8e25c65c 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -205,6 +205,7 @@ if is_enabled CONFIG_KALLSYMS || is_enabled CONFIG_DEBUG_INFO_BTF; then fi if is_enabled CONFIG_DEBUG_INFO_BTF; then + info BTF .tmp_vmlinux1 if ! ${srctree}/scripts/gen-btf.sh .tmp_vmlinux1; then echo >&2 "Failed to generate BTF for vmlinux" echo >&2 "Try to disable CONFIG_DEBUG_INFO_BTF" @@ -265,7 +266,7 @@ fi vmlinux_link "${VMLINUX}" if is_enabled CONFIG_DEBUG_INFO_BTF; then - info OBJCOPY ${btfids_vmlinux} + info BTFIDS ${VMLINUX} ${RESOLVE_BTFIDS} --patch_btfids ${btfids_vmlinux} ${VMLINUX} fi From 17c736a7b58a18e3683df2583b60f0edeaf65070 Mon Sep 17 00:00:00 2001 From: SungRock Jung Date: Sun, 21 Dec 2025 07:00:41 +0000 Subject: [PATCH 053/268] bpf: Update BPF_PROG_RUN documentation LWT_SEG6LOCAL no longer supports test_run starting from v6.11 so remove it from the list of program types supported by BPF_PROG_RUN. Add TRACING and NETFILTER program types to reflect the current set of types that implement test_run. Signed-off-by: SungRock Jung Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20251221070041.26592-1-tjdfkr2421@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/bpf_prog_run.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/bpf/bpf_prog_run.rst b/Documentation/bpf/bpf_prog_run.rst index 4868c909df5c..81ef768c75a3 100644 --- a/Documentation/bpf/bpf_prog_run.rst +++ b/Documentation/bpf/bpf_prog_run.rst @@ -34,11 +34,12 @@ following types: - ``BPF_PROG_TYPE_LWT_IN`` - ``BPF_PROG_TYPE_LWT_OUT`` - ``BPF_PROG_TYPE_LWT_XMIT`` -- ``BPF_PROG_TYPE_LWT_SEG6LOCAL`` - ``BPF_PROG_TYPE_FLOW_DISSECTOR`` - ``BPF_PROG_TYPE_STRUCT_OPS`` - ``BPF_PROG_TYPE_RAW_TRACEPOINT`` - ``BPF_PROG_TYPE_SYSCALL`` +- ``BPF_PROG_TYPE_TRACING`` +- ``BPF_PROG_TYPE_NETFILTER`` When using the ``BPF_PROG_RUN`` command, userspace supplies an input context object and (for program types operating on network packets) a buffer containing From c286e7e9d1f1f3d90ad11c37e896f582b02d19c4 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 31 Dec 2025 14:10:50 -0800 Subject: [PATCH 054/268] selftests/bpf: veristat: fix printing order in output_stats() The order of the variables in the printf() doesn't match the text and therefore veristat prints something like this: Done. Processed 24 files, 0 programs. Skipped 62 files, 0 programs. When it should print: Done. Processed 24 files, 62 programs. Skipped 0 files, 0 programs. Fix the order of variables in the printf() call. Fixes: 518fee8bfaf2 ("selftests/bpf: make veristat skip non-BPF and failing-to-open BPF objects") Tested-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20251231221052.759396-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/veristat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index e962f133250c..1be1e353d40a 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -2580,7 +2580,7 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last if (last && fmt == RESFMT_TABLE) { output_header_underlines(); printf("Done. Processed %d files, %d programs. Skipped %d files, %d programs.\n", - env.files_processed, env.files_skipped, env.progs_processed, env.progs_skipped); + env.files_processed, env.progs_processed, env.files_skipped, env.progs_skipped); } } From 1a5c01d2508a845825eece360c6145d7f436dbf8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:27 -0800 Subject: [PATCH 055/268] bpf: Make KF_TRUSTED_ARGS the default for all kfuncs Change the verifier to make trusted args the default requirement for all kfuncs by removing is_kfunc_trusted_args() assuming it be to always return true. This works because: 1. Context pointers (xdp_md, __sk_buff, etc.) are handled through their own KF_ARG_PTR_TO_CTX case label and bypass the trusted check 2. Struct_ops callback arguments are already marked as PTR_TRUSTED during initialization and pass is_trusted_reg() 3. KF_RCU kfuncs are handled separately via is_kfunc_rcu() checks at call sites (always checked with || alongside is_kfunc_trusted_args) This simple change makes all kfuncs require trusted args by default while maintaining correct behavior for all existing special cases. Note: This change means kfuncs that previously accepted NULL pointers without KF_TRUSTED_ARGS will now reject NULL at verification time. Several netfilter kfuncs are affected: bpf_xdp_ct_lookup(), bpf_skb_ct_lookup(), bpf_xdp_ct_alloc(), and bpf_skb_ct_alloc() all accept NULL for their bpf_tuple and opts parameters internally (checked in __bpf_nf_ct_lookup), but after this change the verifier rejects NULL before the kfunc is even called. This is acceptable because these kfuncs don't work with NULL parameters in their proper usage. Now they will be rejected rather than returning an error, which shouldn't make a difference to BPF programs that were using these kfuncs properly. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 430 +++++++++++++++++------------------ fs/bpf_fs_kfuncs.c | 10 +- kernel/bpf/verifier.c | 14 +- 3 files changed, 220 insertions(+), 234 deletions(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index e38941370b90..6cb6857bfa6f 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -50,216 +50,21 @@ A wrapper kfunc is often needed when we need to annotate parameters of the kfunc. Otherwise one may directly make the kfunc visible to the BPF program by registering it with the BPF subsystem. See :ref:`BPF_kfunc_nodef`. -2.2 Annotating kfunc parameters -------------------------------- - -Similar to BPF helpers, there is sometime need for additional context required -by the verifier to make the usage of kernel functions safer and more useful. -Hence, we can annotate a parameter by suffixing the name of the argument of the -kfunc with a __tag, where tag may be one of the supported annotations. - -2.2.1 __sz Annotation ---------------------- - -This annotation is used to indicate a memory and size pair in the argument list. -An example is given below:: - - __bpf_kfunc void bpf_memzero(void *mem, int mem__sz) - { - ... - } - -Here, the verifier will treat first argument as a PTR_TO_MEM, and second -argument as its size. By default, without __sz annotation, the size of the type -of the pointer is used. Without __sz annotation, a kfunc cannot accept a void -pointer. - -2.2.2 __k Annotation +2.2 kfunc Parameters -------------------- -This annotation is only understood for scalar arguments, where it indicates that -the verifier must check the scalar argument to be a known constant, which does -not indicate a size parameter, and the value of the constant is relevant to the -safety of the program. +All kfuncs now require trusted arguments by default. This means that all +pointer arguments must be valid, and all pointers to BTF objects must be +passed in their unmodified form (at a zero offset, and without having been +obtained from walking another pointer, with exceptions described below). -An example is given below:: - - __bpf_kfunc void *bpf_obj_new(u32 local_type_id__k, ...) - { - ... - } - -Here, bpf_obj_new uses local_type_id argument to find out the size of that type -ID in program's BTF and return a sized pointer to it. Each type ID will have a -distinct size, hence it is crucial to treat each such call as distinct when -values don't match during verifier state pruning checks. - -Hence, whenever a constant scalar argument is accepted by a kfunc which is not a -size parameter, and the value of the constant matters for program safety, __k -suffix should be used. - -2.2.3 __uninit Annotation -------------------------- - -This annotation is used to indicate that the argument will be treated as -uninitialized. - -An example is given below:: - - __bpf_kfunc int bpf_dynptr_from_skb(..., struct bpf_dynptr_kern *ptr__uninit) - { - ... - } - -Here, the dynptr will be treated as an uninitialized dynptr. Without this -annotation, the verifier will reject the program if the dynptr passed in is -not initialized. - -2.2.4 __opt Annotation -------------------------- - -This annotation is used to indicate that the buffer associated with an __sz or __szk -argument may be null. If the function is passed a nullptr in place of the buffer, -the verifier will not check that length is appropriate for the buffer. The kfunc is -responsible for checking if this buffer is null before using it. - -An example is given below:: - - __bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__opt, u32 buffer__szk) - { - ... - } - -Here, the buffer may be null. If buffer is not null, it at least of size buffer_szk. -Either way, the returned buffer is either NULL, or of size buffer_szk. Without this -annotation, the verifier will reject the program if a null pointer is passed in with -a nonzero size. - -2.2.5 __str Annotation ----------------------------- -This annotation is used to indicate that the argument is a constant string. - -An example is given below:: - - __bpf_kfunc bpf_get_file_xattr(..., const char *name__str, ...) - { - ... - } - -In this case, ``bpf_get_file_xattr()`` can be called as:: - - bpf_get_file_xattr(..., "xattr_name", ...); - -Or:: - - const char name[] = "xattr_name"; /* This need to be global */ - int BPF_PROG(...) - { - ... - bpf_get_file_xattr(..., name, ...); - ... - } - -2.2.6 __prog Annotation ---------------------------- -This annotation is used to indicate that the argument needs to be fixed up to -the bpf_prog_aux of the caller BPF program. Any value passed into this argument -is ignored, and rewritten by the verifier. - -An example is given below:: - - __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags, - void *aux__prog) - { - struct bpf_prog_aux *aux = aux__prog; - ... - } - -.. _BPF_kfunc_nodef: - -2.3 Using an existing kernel function -------------------------------------- - -When an existing function in the kernel is fit for consumption by BPF programs, -it can be directly registered with the BPF subsystem. However, care must still -be taken to review the context in which it will be invoked by the BPF program -and whether it is safe to do so. - -2.4 Annotating kfuncs ---------------------- - -In addition to kfuncs' arguments, verifier may need more information about the -type of kfunc(s) being registered with the BPF subsystem. To do so, we define -flags on a set of kfuncs as follows:: - - BTF_KFUNCS_START(bpf_task_set) - BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL) - BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE) - BTF_KFUNCS_END(bpf_task_set) - -This set encodes the BTF ID of each kfunc listed above, and encodes the flags -along with it. Ofcourse, it is also allowed to specify no flags. - -kfunc definitions should also always be annotated with the ``__bpf_kfunc`` -macro. This prevents issues such as the compiler inlining the kfunc if it's a -static kernel function, or the function being elided in an LTO build as it's -not used in the rest of the kernel. Developers should not manually add -annotations to their kfunc to prevent these issues. If an annotation is -required to prevent such an issue with your kfunc, it is a bug and should be -added to the definition of the macro so that other kfuncs are similarly -protected. An example is given below:: - - __bpf_kfunc struct task_struct *bpf_get_task_pid(s32 pid) - { - ... - } - -2.4.1 KF_ACQUIRE flag ---------------------- - -The KF_ACQUIRE flag is used to indicate that the kfunc returns a pointer to a -refcounted object. The verifier will then ensure that the pointer to the object -is eventually released using a release kfunc, or transferred to a map using a -referenced kptr (by invoking bpf_kptr_xchg). If not, the verifier fails the -loading of the BPF program until no lingering references remain in all possible -explored states of the program. - -2.4.2 KF_RET_NULL flag ----------------------- - -The KF_RET_NULL flag is used to indicate that the pointer returned by the kfunc -may be NULL. Hence, it forces the user to do a NULL check on the pointer -returned from the kfunc before making use of it (dereferencing or passing to -another helper). This flag is often used in pairing with KF_ACQUIRE flag, but -both are orthogonal to each other. - -2.4.3 KF_RELEASE flag ---------------------- - -The KF_RELEASE flag is used to indicate that the kfunc releases the pointer -passed in to it. There can be only one referenced pointer that can be passed -in. All copies of the pointer being released are invalidated as a result of -invoking kfunc with this flag. KF_RELEASE kfuncs automatically receive the -protection afforded by the KF_TRUSTED_ARGS flag described below. - -2.4.4 KF_TRUSTED_ARGS flag --------------------------- - -The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It -indicates that the all pointer arguments are valid, and that all pointers to -BTF objects have been passed in their unmodified form (that is, at a zero -offset, and without having been obtained from walking another pointer, with one -exception described below). - -There are two types of pointers to kernel objects which are considered "valid": +There are two types of pointers to kernel objects which are considered "trusted": 1. Pointers which are passed as tracepoint or struct_ops callback arguments. 2. Pointers which were returned from a KF_ACQUIRE kfunc. Pointers to non-BTF objects (e.g. scalar pointers) may also be passed to -KF_TRUSTED_ARGS kfuncs, and may have a non-zero offset. +kfuncs, and may have a non-zero offset. The definition of "valid" pointers is subject to change at any time, and has absolutely no ABI stability guarantees. @@ -308,14 +113,206 @@ is emitted in the ``type_is_trusted()`` function as follows: BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); +2.3 Annotating kfunc parameters +------------------------------- -2.4.5 KF_SLEEPABLE flag +Similar to BPF helpers, there is sometime need for additional context required +by the verifier to make the usage of kernel functions safer and more useful. +Hence, we can annotate a parameter by suffixing the name of the argument of the +kfunc with a __tag, where tag may be one of the supported annotations. + +2.3.1 __sz Annotation +--------------------- + +This annotation is used to indicate a memory and size pair in the argument list. +An example is given below:: + + __bpf_kfunc void bpf_memzero(void *mem, int mem__sz) + { + ... + } + +Here, the verifier will treat first argument as a PTR_TO_MEM, and second +argument as its size. By default, without __sz annotation, the size of the type +of the pointer is used. Without __sz annotation, a kfunc cannot accept a void +pointer. + +2.3.2 __k Annotation +-------------------- + +This annotation is only understood for scalar arguments, where it indicates that +the verifier must check the scalar argument to be a known constant, which does +not indicate a size parameter, and the value of the constant is relevant to the +safety of the program. + +An example is given below:: + + __bpf_kfunc void *bpf_obj_new(u32 local_type_id__k, ...) + { + ... + } + +Here, bpf_obj_new uses local_type_id argument to find out the size of that type +ID in program's BTF and return a sized pointer to it. Each type ID will have a +distinct size, hence it is crucial to treat each such call as distinct when +values don't match during verifier state pruning checks. + +Hence, whenever a constant scalar argument is accepted by a kfunc which is not a +size parameter, and the value of the constant matters for program safety, __k +suffix should be used. + +2.3.3 __uninit Annotation +------------------------- + +This annotation is used to indicate that the argument will be treated as +uninitialized. + +An example is given below:: + + __bpf_kfunc int bpf_dynptr_from_skb(..., struct bpf_dynptr_kern *ptr__uninit) + { + ... + } + +Here, the dynptr will be treated as an uninitialized dynptr. Without this +annotation, the verifier will reject the program if the dynptr passed in is +not initialized. + +2.3.4 __opt Annotation +------------------------- + +This annotation is used to indicate that the buffer associated with an __sz or __szk +argument may be null. If the function is passed a nullptr in place of the buffer, +the verifier will not check that length is appropriate for the buffer. The kfunc is +responsible for checking if this buffer is null before using it. + +An example is given below:: + + __bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__opt, u32 buffer__szk) + { + ... + } + +Here, the buffer may be null. If buffer is not null, it at least of size buffer_szk. +Either way, the returned buffer is either NULL, or of size buffer_szk. Without this +annotation, the verifier will reject the program if a null pointer is passed in with +a nonzero size. + +2.3.5 __str Annotation +---------------------------- +This annotation is used to indicate that the argument is a constant string. + +An example is given below:: + + __bpf_kfunc bpf_get_file_xattr(..., const char *name__str, ...) + { + ... + } + +In this case, ``bpf_get_file_xattr()`` can be called as:: + + bpf_get_file_xattr(..., "xattr_name", ...); + +Or:: + + const char name[] = "xattr_name"; /* This need to be global */ + int BPF_PROG(...) + { + ... + bpf_get_file_xattr(..., name, ...); + ... + } + +2.3.6 __prog Annotation +--------------------------- +This annotation is used to indicate that the argument needs to be fixed up to +the bpf_prog_aux of the caller BPF program. Any value passed into this argument +is ignored, and rewritten by the verifier. + +An example is given below:: + + __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, void *value), + unsigned int flags, + void *aux__prog) + { + struct bpf_prog_aux *aux = aux__prog; + ... + } + +.. _BPF_kfunc_nodef: + +2.4 Using an existing kernel function +------------------------------------- + +When an existing function in the kernel is fit for consumption by BPF programs, +it can be directly registered with the BPF subsystem. However, care must still +be taken to review the context in which it will be invoked by the BPF program +and whether it is safe to do so. + +2.5 Annotating kfuncs +--------------------- + +In addition to kfuncs' arguments, verifier may need more information about the +type of kfunc(s) being registered with the BPF subsystem. To do so, we define +flags on a set of kfuncs as follows:: + + BTF_KFUNCS_START(bpf_task_set) + BTF_ID_FLAGS(func, bpf_get_task_pid, KF_ACQUIRE | KF_RET_NULL) + BTF_ID_FLAGS(func, bpf_put_pid, KF_RELEASE) + BTF_KFUNCS_END(bpf_task_set) + +This set encodes the BTF ID of each kfunc listed above, and encodes the flags +along with it. Ofcourse, it is also allowed to specify no flags. + +kfunc definitions should also always be annotated with the ``__bpf_kfunc`` +macro. This prevents issues such as the compiler inlining the kfunc if it's a +static kernel function, or the function being elided in an LTO build as it's +not used in the rest of the kernel. Developers should not manually add +annotations to their kfunc to prevent these issues. If an annotation is +required to prevent such an issue with your kfunc, it is a bug and should be +added to the definition of the macro so that other kfuncs are similarly +protected. An example is given below:: + + __bpf_kfunc struct task_struct *bpf_get_task_pid(s32 pid) + { + ... + } + +2.5.1 KF_ACQUIRE flag +--------------------- + +The KF_ACQUIRE flag is used to indicate that the kfunc returns a pointer to a +refcounted object. The verifier will then ensure that the pointer to the object +is eventually released using a release kfunc, or transferred to a map using a +referenced kptr (by invoking bpf_kptr_xchg). If not, the verifier fails the +loading of the BPF program until no lingering references remain in all possible +explored states of the program. + +2.5.2 KF_RET_NULL flag +---------------------- + +The KF_RET_NULL flag is used to indicate that the pointer returned by the kfunc +may be NULL. Hence, it forces the user to do a NULL check on the pointer +returned from the kfunc before making use of it (dereferencing or passing to +another helper). This flag is often used in pairing with KF_ACQUIRE flag, but +both are orthogonal to each other. + +2.5.3 KF_RELEASE flag +--------------------- + +The KF_RELEASE flag is used to indicate that the kfunc releases the pointer +passed in to it. There can be only one referenced pointer that can be passed +in. All copies of the pointer being released are invalidated as a result of +invoking kfunc with this flag. + +2.5.4 KF_SLEEPABLE flag ----------------------- The KF_SLEEPABLE flag is used for kfuncs that may sleep. Such kfuncs can only be called by sleepable BPF programs (BPF_F_SLEEPABLE). -2.4.6 KF_DESTRUCTIVE flag +2.5.5 KF_DESTRUCTIVE flag -------------------------- The KF_DESTRUCTIVE flag is used to indicate functions calling which is @@ -324,18 +321,19 @@ rebooting or panicking. Due to this additional restrictions apply to these calls. At the moment they only require CAP_SYS_BOOT capability, but more can be added later. -2.4.7 KF_RCU flag +2.5.6 KF_RCU flag ----------------- -The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with -KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees -that the objects are valid and there is no use-after-free. The pointers are not -NULL, but the object's refcount could have reached zero. The kfuncs need to -consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE -pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely -also be KF_RET_NULL. +The KF_RCU flag allows kfuncs to opt out of the default trusted args +requirement and accept RCU pointers with weaker guarantees. The kfuncs marked +with KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier +guarantees that the objects are valid and there is no use-after-free. The +pointers are not NULL, but the object's refcount could have reached zero. The +kfuncs need to consider doing refcnt != 0 check, especially when returning a +KF_ACQUIRE pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should +very likely also be KF_RET_NULL. -2.4.8 KF_RCU_PROTECTED flag +2.5.7 KF_RCU_PROTECTED flag --------------------------- The KF_RCU_PROTECTED flag is used to indicate that the kfunc must be invoked in @@ -354,7 +352,7 @@ RCU protection but do not take RCU protected arguments. .. _KF_deprecated_flag: -2.4.9 KF_DEPRECATED flag +2.5.8 KF_DEPRECATED flag ------------------------ The KF_DEPRECATED flag is used for kfuncs which are scheduled to be @@ -374,7 +372,7 @@ encouraged to make their use-cases known as early as possible, and participate in upstream discussions regarding whether to keep, change, deprecate, or remove those kfuncs if and when such discussions occur. -2.5 Registering the kfuncs +2.6 Registering the kfuncs -------------------------- Once the kfunc is prepared for use, the final step to making it visible is @@ -397,7 +395,7 @@ type. An example is shown below:: } late_initcall(init_subsystem); -2.6 Specifying no-cast aliases with ___init +2.7 Specifying no-cast aliases with ___init -------------------------------------------- The verifier will always enforce that the BTF type of a pointer passed to a diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 5ace2511fec5..abd5eaa4892e 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -68,10 +68,7 @@ __bpf_kfunc void bpf_put_file(struct file *file) * * Resolve the pathname for the supplied *path* and store it in *buf*. This BPF * kfunc is the safer variant of the legacy bpf_d_path() helper and should be - * used in place of bpf_d_path() whenever possible. It enforces KF_TRUSTED_ARGS - * semantics, meaning that the supplied *path* must itself hold a valid - * reference, or else the BPF program will be outright rejected by the BPF - * verifier. + * used in place of bpf_d_path() whenever possible. * * This BPF kfunc may only be called from BPF LSM programs. * @@ -377,9 +374,8 @@ static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) return -EACCES; } -/* bpf_[set|remove]_dentry_xattr.* hooks have KF_TRUSTED_ARGS and - * KF_SLEEPABLE, so they are only available to sleepable hooks with - * dentry arguments. +/* bpf_[set|remove]_dentry_xattr.* hooks have KF_SLEEPABLE, so they are only + * available to sleepable hooks with dentry arguments. * * Setting and removing xattr requires exclusive lock on dentry->d_inode. * Some hooks already locked d_inode, while some hooks have not locked diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3d44c5d06623..359a962d69a1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12040,11 +12040,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } -static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) -{ - return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta); -} - static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_SLEEPABLE; @@ -13253,9 +13248,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && - (register_is_null(reg) || type_may_be_null(reg->type)) && - !is_kfunc_arg_nullable(meta->btf, &args[i])) { + if ((register_is_null(reg) || type_may_be_null(reg->type)) && + !is_kfunc_arg_nullable(meta->btf, &args[i]) && + !is_kfunc_arg_optional(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } @@ -13320,9 +13315,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) - break; - if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); From 7646c7afd9a95db0b0cb4ad066ed90f6024da67d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:28 -0800 Subject: [PATCH 056/268] bpf: Remove redundant KF_TRUSTED_ARGS flag from all kfuncs Now that KF_TRUSTED_ARGS is the default for all kfuncs, remove the explicit KF_TRUSTED_ARGS flag from all kfunc definitions and remove the flag itself. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- fs/bpf_fs_kfuncs.c | 13 ++++++------ fs/verity/measure.c | 2 +- include/linux/bpf.h | 2 +- include/linux/btf.h | 3 +-- kernel/bpf/arena.c | 6 +++--- kernel/bpf/cpumask.c | 2 +- kernel/bpf/helpers.c | 20 +++++++++---------- kernel/bpf/map_iter.c | 2 +- kernel/bpf/verifier.c | 2 +- kernel/sched/ext.c | 8 ++++---- mm/bpf_memcontrol.c | 10 +++++----- net/core/filter.c | 10 +++++----- net/core/xdp.c | 2 +- net/netfilter/nf_conntrack_bpf.c | 8 ++++---- net/netfilter/nf_flow_table_bpf.c | 2 +- net/netfilter/nf_nat_bpf.c | 2 +- net/sched/bpf_qdisc.c | 12 +++++------ .../selftests/bpf/progs/cpumask_failure.c | 2 +- .../selftests/bpf/test_kmods/bpf_testmod.c | 20 +++++++++---------- 19 files changed, 63 insertions(+), 65 deletions(-) diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index abd5eaa4892e..e4e51a1d0de2 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -356,14 +356,13 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) -BTF_ID_FLAGS(func, bpf_get_task_exe_file, - KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_path_d_path) +BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 388734132f01..6a35623ebdf0 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -162,7 +162,7 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr *di __bpf_kfunc_end_defs(); BTF_KFUNCS_START(fsverity_set_ids) -BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_get_fsverity_digest) BTF_KFUNCS_END(fsverity_set_ids) static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e7d72dfbcd4..9efb2ddf331c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -753,7 +753,7 @@ enum bpf_type_flag { MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), /* PTR was passed from the kernel in a trusted context, and may be - * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * passed to kfuncs or BPF helper functions. * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. * PTR_UNTRUSTED refers to a kptr that was read directly from a map * without invoking bpf_kptr_xchg(). What we really need to know is diff --git a/include/linux/btf.h b/include/linux/btf.h index f06976ffb63f..691f09784933 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -34,7 +34,7 @@ * * And the following kfunc: * - * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE) * * All invocations to the kfunc must pass the unmodified, unwalked task: * @@ -66,7 +66,6 @@ * return 0; * } */ -#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 456ac989269d..2274319a95e6 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -890,9 +890,9 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_ARENA_RET | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) -BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 9876c5fe6c2a..b8c805b4b06a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -477,7 +477,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_and, KF_RCU) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index db72b96f9c8c..2c15f77c74db 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4427,7 +4427,7 @@ BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS -BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_send_signal_task) #endif #ifdef CONFIG_KEYS BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) @@ -4467,14 +4467,14 @@ BTF_ID_FLAGS(func, bpf_iter_task_vma_new, KF_ITER_NEW | KF_RCU) BTF_ID_FLAGS(func, bpf_iter_task_vma_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_vma_destroy, KF_ITER_DESTROY) #ifdef CONFIG_CGROUPS -BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_iter_css_task_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_css_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_task_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_css_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_css_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_css_destroy, KF_ITER_DESTROY) #endif -BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_TRUSTED_ARGS | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, bpf_iter_task_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_task_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_task_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_dynptr_adjust) @@ -4510,8 +4510,8 @@ BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr) BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr) BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE) #endif #ifdef CONFIG_DMA_SHARED_BUFFER BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE) @@ -4536,10 +4536,10 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_dynptr_from_file, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) +BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 9575314f40a6..261a03ea73d3 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -214,7 +214,7 @@ __bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_map_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_map_sum_elem_count) BTF_KFUNCS_END(bpf_map_iter_kfunc_ids) static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 359a962d69a1..c9da70dd3e72 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12619,7 +12619,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, /* Enforce strict type matching for calls to kfuncs that are acquiring * or releasing a reference, or are no-cast aliases. We do _not_ - * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, + * enforce strict matching for kfuncs by default, * as we want to enable BPF programs to pass types that are bitwise * equivalent without forcing them to explicitly cast with something * like bpf_cast_to_kern_ctx(). diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 94164f2dec6d..fd5423428dde 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7229,9 +7229,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr) +BTF_ID_FLAGS(func, scx_bpf_error_bstr) +BTF_ID_FLAGS(func, scx_bpf_dump_bstr) BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) @@ -7250,7 +7250,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) -BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, scx_bpf_events) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index e8fa7f5855f9..716df49d7647 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -166,11 +166,11 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events) +BTF_ID_FLAGS(func, bpf_mem_cgroup_usage) +BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state) +BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_memcontrol_kfuncs) diff --git a/net/core/filter.c b/net/core/filter.c index 616e0520a0bb..d43df98e1ded 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12438,11 +12438,11 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta) BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) @@ -12455,11 +12455,11 @@ BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) -BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) -BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp) BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { @@ -12554,7 +12554,7 @@ __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) -BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_sock_destroy) BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) diff --git a/net/core/xdp.c b/net/core/xdp.c index 9100e160113a..fee6d080ee85 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -964,7 +964,7 @@ __bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(xdp_metadata_kfunc_ids) -#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS) +#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name) XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC BTF_KFUNCS_END(xdp_metadata_kfunc_ids) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 4a136fc3a9c0..a630139bd0c3 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -516,10 +516,10 @@ BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE) BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_timeout) +BTF_ID_FLAGS(func, bpf_ct_change_timeout) +BTF_ID_FLAGS(func, bpf_ct_set_status) +BTF_ID_FLAGS(func, bpf_ct_change_status) BTF_KFUNCS_END(nf_ct_kfunc_set) static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = { diff --git a/net/netfilter/nf_flow_table_bpf.c b/net/netfilter/nf_flow_table_bpf.c index 4a5f5195f2d2..cbd5b97a6329 100644 --- a/net/netfilter/nf_flow_table_bpf.c +++ b/net/netfilter/nf_flow_table_bpf.c @@ -105,7 +105,7 @@ __diag_pop() __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_ft_kfunc_set) -BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_TRUSTED_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_xdp_flow_lookup, KF_RET_NULL) BTF_KFUNCS_END(nf_ft_kfunc_set) static const struct btf_kfunc_id_set nf_flow_kfunc_set = { diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c index 481be15609b1..f9dd85ccea01 100644 --- a/net/netfilter/nf_nat_bpf.c +++ b/net/netfilter/nf_nat_bpf.c @@ -55,7 +55,7 @@ __bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(nf_nat_kfunc_set) -BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info) BTF_KFUNCS_END(nf_nat_kfunc_set) static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index adcb618a2bfc..b9771788b9b3 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -271,14 +271,14 @@ __bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff __bpf_kfunc_end_defs(); BTF_KFUNCS_START(qdisc_kfunc_ids) -BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_skb_get_hash) BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update) BTF_KFUNCS_END(qdisc_kfunc_ids) BTF_SET_START(qdisc_common_kfunc_set) diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index 8a2fd596c8a3..61c32e91e8c3 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -110,7 +110,7 @@ SEC("tp_btf/task_newtask") __failure __msg("NULL pointer passed to trusted arg0") int BPF_PROG(test_cpumask_null, struct task_struct *task, u64 clone_flags) { - /* NULL passed to KF_TRUSTED_ARGS kfunc. */ + /* NULL passed to kfunc. */ bpf_cpumask_empty(NULL); return 0; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 90c4b1a51de6..1c41d03bd5a1 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -693,9 +693,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_dynptr_test) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_nonzero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_acquire_zero_offset_test, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_kfunc_nested_release_test, KF_RELEASE) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_vma_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_task_test) +BTF_ID_FLAGS(func, bpf_kfunc_trusted_num_test) BTF_ID_FLAGS(func, bpf_kfunc_rcu_task_test, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_kfunc_ret_rcu_test_nostruct, KF_RET_NULL | KF_RCU_PROTECTED) @@ -1158,7 +1158,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) -BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_RCU) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_offset) @@ -1172,12 +1172,12 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_sock_sendmsg, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getsockname, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_call_kernel_getpeername, KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_prologue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) From bddaf9adda72447061a52b328c2d4c64b327fa30 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:29 -0800 Subject: [PATCH 057/268] bpf: net: netfilter: drop dead NULL checks bpf_xdp_ct_lookup() and bpf_skb_ct_lookup() receive bpf_tuple and opts parameter that are expected to be not NULL for real usages (see doc string above functions). They return an error if NULL is passed for opts or tuple. The verifier will now reject programs that pass NULL to these parameters, the kfuns can assume that these are always valid pointer, so drop the NULL checks for these parameters. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- net/netfilter/nf_conntrack_bpf.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index a630139bd0c3..be654363f53f 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -114,8 +114,6 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, struct nf_conn *ct; int err; - if (!opts || !bpf_tuple) - return ERR_PTR(-EINVAL); if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) return ERR_PTR(-EINVAL); if (opts_len == NF_BPF_CT_OPTS_SZ) { @@ -299,8 +297,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz, opts, opts__sz, 10); if (IS_ERR(nfct)) { - if (opts) - opts->error = PTR_ERR(nfct); + opts->error = PTR_ERR(nfct); return NULL; } @@ -334,8 +331,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, caller_net = dev_net(ctx->rxq->dev); nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz); if (IS_ERR(nfct)) { - if (opts) - opts->error = PTR_ERR(nfct); + opts->error = PTR_ERR(nfct); return NULL; } return nfct; @@ -367,8 +363,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10); if (IS_ERR(nfct)) { - if (opts) - opts->error = PTR_ERR(nfct); + opts->error = PTR_ERR(nfct); return NULL; } @@ -402,8 +397,7 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz); if (IS_ERR(nfct)) { - if (opts) - opts->error = PTR_ERR(nfct); + opts->error = PTR_ERR(nfct); return NULL; } return nfct; From cd1d60949143b10d8eb7cea111e2b8bfb18fe461 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:30 -0800 Subject: [PATCH 058/268] bpf: xfrm: drop dead NULL check in bpf_xdp_get_xfrm_state() As KF_TRUSTED_ARGS is now considered the default for all kfuncs, the opts parameter in bpf_xdp_get_xfrm_state() can never be NULL. Verifier will detect this at load time and will not allow passing NULL to this function. This matches the documentation above the kfunc that says this parameter (opts) Cannot be NULL. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-5-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- net/xfrm/xfrm_state_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c index 2248eda741f8..4180c317f9bc 100644 --- a/net/xfrm/xfrm_state_bpf.c +++ b/net/xfrm/xfrm_state_bpf.c @@ -68,7 +68,7 @@ bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 struct net *net = dev_net(xdp->rxq->dev); struct xfrm_state *x; - if (!opts || opts__sz < sizeof(opts->error)) + if (opts__sz < sizeof(opts->error)) return NULL; if (opts__sz != BPF_XFRM_STATE_OPTS_SZ) { From 8fe172fa305f14db815bd88133d2030e4a9e107e Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:31 -0800 Subject: [PATCH 059/268] HID: bpf: drop dead NULL checks in kfuncs As KF_TRUSTED_ARGS is now considered default for all kfuns, the verifier will not allow passing NULL pointers to these kfuns. These checks for NULL pointers can therefore be removed. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-6-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- drivers/hid/bpf/hid_bpf_dispatch.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c index 9a06f9b0e4ef..892aca026ffa 100644 --- a/drivers/hid/bpf/hid_bpf_dispatch.c +++ b/drivers/hid/bpf/hid_bpf_dispatch.c @@ -295,9 +295,6 @@ hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const size_t rdwr { struct hid_bpf_ctx_kern *ctx_kern; - if (!ctx) - return NULL; - ctx_kern = container_of(ctx, struct hid_bpf_ctx_kern, ctx); if (rdwr_buf_size + offset > ctx->allocated_size) @@ -364,7 +361,7 @@ __hid_bpf_hw_check_params(struct hid_bpf_ctx *ctx, __u8 *buf, size_t *buf__sz, u32 report_len; /* check arguments */ - if (!ctx || !hid_ops || !buf) + if (!hid_ops) return -EINVAL; switch (rtype) { From df5004579bbdfe4c734f2cbbf3f44fe3fac440a3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:32 -0800 Subject: [PATCH 060/268] selftests: bpf: Update kfunc_param_nullable test for new error message With trusted args now being the default, the NULL pointer check runs before type-specific validation. Update test3 to expect the new error message "Possibly NULL pointer passed to trusted arg0" instead of the old dynptr-specific error message. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-7-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c index 0ad1bf1ede8d..967081bbcfe1 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_param_nullable.c @@ -29,7 +29,7 @@ int kfunc_dynptr_nullable_test2(struct __sk_buff *skb) } SEC("tc") -__failure __msg("expected pointer to stack or const struct bpf_dynptr") +__failure __msg("Possibly NULL pointer passed to trusted arg0") int kfunc_dynptr_nullable_test3(struct __sk_buff *skb) { struct bpf_dynptr data; From 03cc77b10e009ce87f1a8e93454aadf2912a4c15 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:33 -0800 Subject: [PATCH 061/268] selftests: bpf: Update failure message for rbtree_fail The rbtree_api_use_unchecked_remove_retval() selftest passes a pointer received from bpf_rbtree_remove() to bpf_rbtree_add() without checking for NULL, this was earlier caught by __check_ptr_off_reg() in the verifier. Now the verifier assumes every kfunc only takes trusted pointer arguments, so it catches this NULL pointer earlier in the path and provides a more accurate failure message. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-8-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/rbtree_fail.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/rbtree_fail.c b/tools/testing/selftests/bpf/progs/rbtree_fail.c index 4acb6af2dfe3..70b7baf9304b 100644 --- a/tools/testing/selftests/bpf/progs/rbtree_fail.c +++ b/tools/testing/selftests/bpf/progs/rbtree_fail.c @@ -153,7 +153,7 @@ long rbtree_api_add_to_multiple_trees(void *ctx) } SEC("?tc") -__failure __msg("dereference of modified ptr_or_null_ ptr R2 off=16 disallowed") +__failure __msg("Possibly NULL pointer passed to trusted arg1") long rbtree_api_use_unchecked_remove_retval(void *ctx) { struct bpf_rb_node *res; From 230b0118e416583a53fc0ad5d1fecb37f496fe34 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:34 -0800 Subject: [PATCH 062/268] selftests: bpf: fix test_kfunc_dynptr_param As verifier now assumes that all kfuncs only takes trusted pointer arguments, passing 0 (NULL) to a kfunc that doesn't mark the argument as __nullable or __opt will be rejected with a failure message of: Possibly NULL pointer passed to trusted arg Pass a non-null value to the kfunc to test the expected failure mode. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-9-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index 061befb004c2..d249113ed657 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -48,10 +48,9 @@ SEC("?lsm.s/bpf") __failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { - unsigned long val = 0; + static struct bpf_dynptr val; - return bpf_verify_pkcs7_signature((struct bpf_dynptr *)val, - (struct bpf_dynptr *)val, NULL); + return bpf_verify_pkcs7_signature(&val, &val, NULL); } SEC("lsm.s/bpf") From cf82580c86a91de2aa979260985cadcb39ed28d2 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:35 -0800 Subject: [PATCH 063/268] selftests: bpf: fix cgroup_hierarchical_stats The cgroup_hierarchical_stats selftests uses an fentry program attached to cgroup_attach_task and then passes the received &dst_cgrp->self to the css_rstat_updated() kfunc. The verifier now assumes that all kfuncs only takes trusted pointer arguments, and pointers received by fentry are not marked trustes by default. Use a tp_btf program in place for fentry for this test, pointers received by tp_btf programs are marked trusted by the verifier. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-10-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/cgroup_hierarchical_stats.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c index ff189a736ad8..8fc38592a87b 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -62,9 +62,9 @@ static int create_attach_counter(__u64 cg_id, __u64 state, __u64 pending) &init, BPF_NOEXIST); } -SEC("fentry/cgroup_attach_task") -int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader, - bool threadgroup) +SEC("tp_btf/cgroup_attach_task") +int BPF_PROG(counter, struct cgroup *dst_cgrp, const char *path, + struct task_struct *task, bool threadgroup) { __u64 cg_id = cgroup_id(dst_cgrp); struct percpu_attach_counter *pcpu_counter = bpf_map_lookup_elem( From cf503eb2c6c38bf449063f33790a96218a067718 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 10:00:36 -0800 Subject: [PATCH 064/268] selftests: bpf: Fix test_bpf_nf for trusted args becoming default With trusted args now being the default, passing NULL to kfunc parameters that are pointers causes verifier rejection rather than a runtime error. The test_bpf_nf test was failing because it attempted to pass NULL to bpf_xdp_ct_lookup() to verify runtime error handling. Since the NULL check now happens at verification time, remove the runtime test case that passed NULL to the bpf_tuple parameter and instead add verification-time tests to ensure the verifier correctly rejects programs that pass NULL to trusted arguments. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102180038.2708325-11-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/bpf_nf.c | 5 +- .../testing/selftests/bpf/progs/test_bpf_nf.c | 7 --- .../selftests/bpf/progs/test_bpf_nf_fail.c | 57 +++++++++++++++++++ 3 files changed, 61 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index dd6512fa652b..215878ea04de 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -19,6 +19,10 @@ struct { { "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" }, { "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" }, { "write_not_allowlisted_field", "no write support to nf_conn at off" }, + { "lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, + { "lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, + { "xdp_lookup_null_bpf_tuple", "Possibly NULL pointer passed to trusted arg1" }, + { "xdp_lookup_null_bpf_opts", "Possibly NULL pointer passed to trusted arg3" }, }; enum { @@ -111,7 +115,6 @@ static void test_bpf_nf_ct(int mode) if (!ASSERT_OK(err, "bpf_prog_test_run")) goto end; - ASSERT_EQ(skel->bss->test_einval_bpf_tuple, -EINVAL, "Test EINVAL for NULL bpf_tuple"); ASSERT_EQ(skel->bss->test_einval_reserved, -EINVAL, "Test EINVAL for reserved not set to 0"); ASSERT_EQ(skel->bss->test_einval_reserved_new, -EINVAL, "Test EINVAL for reserved in new struct not set to 0"); ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1"); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index f7b330ddd007..076fbf03a126 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -15,7 +15,6 @@ extern unsigned long CONFIG_HZ __kconfig; -int test_einval_bpf_tuple = 0; int test_einval_reserved = 0; int test_einval_reserved_new = 0; int test_einval_netns_id = 0; @@ -99,12 +98,6 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, __builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4)); - ct = lookup_fn(ctx, NULL, 0, &opts_def, sizeof(opts_def)); - if (ct) - bpf_ct_release(ct); - else - test_einval_bpf_tuple = opts_def.error; - opts_def.reserved[0] = 1; ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, sizeof(opts_def)); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c index a586f087ffeb..2c156cd166af 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c @@ -4,6 +4,7 @@ #include #include #include +#include "bpf_misc.h" struct nf_conn; @@ -18,6 +19,10 @@ struct nf_conn *bpf_skb_ct_alloc(struct __sk_buff *, struct bpf_sock_tuple *, u3 struct bpf_ct_opts___local *, u32) __ksym; struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *, struct bpf_sock_tuple *, u32, struct bpf_ct_opts___local *, u32) __ksym; +struct nf_conn *bpf_xdp_ct_alloc(struct xdp_md *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___local *, u32) __ksym; +struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___local *, u32) __ksym; struct nf_conn *bpf_ct_insert_entry(struct nf_conn *) __ksym; void bpf_ct_release(struct nf_conn *) __ksym; void bpf_ct_set_timeout(struct nf_conn *, u32) __ksym; @@ -146,4 +151,56 @@ int change_status_after_alloc(struct __sk_buff *ctx) return 0; } +SEC("?tc") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int lookup_null_bpf_tuple(struct __sk_buff *ctx) +{ + struct bpf_ct_opts___local opts = {}; + struct nf_conn *ct; + + ct = bpf_skb_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?tc") +__failure __msg("Possibly NULL pointer passed to trusted arg3") +int lookup_null_bpf_opts(struct __sk_buff *ctx) +{ + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + + ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?xdp") +__failure __msg("Possibly NULL pointer passed to trusted arg1") +int xdp_lookup_null_bpf_tuple(struct xdp_md *ctx) +{ + struct bpf_ct_opts___local opts = {}; + struct nf_conn *ct; + + ct = bpf_xdp_ct_lookup(ctx, NULL, 0, &opts, sizeof(opts)); + if (ct) + bpf_ct_release(ct); + return 0; +} + +SEC("?xdp") +__failure __msg("Possibly NULL pointer passed to trusted arg3") +int xdp_lookup_null_bpf_opts(struct xdp_md *ctx) +{ + struct bpf_sock_tuple tup = {}; + struct nf_conn *ct; + + ct = bpf_xdp_ct_lookup(ctx, &tup, sizeof(tup.ipv4), NULL, sizeof(struct bpf_ct_opts___local)); + if (ct) + bpf_ct_release(ct); + return 0; +} + char _license[] SEC("license") = "GPL"; From 817593af7b9ba56c23d9dd1858232c5493a14f55 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 12:02:27 -0800 Subject: [PATCH 065/268] bpf: syscall: Introduce memcg enter/exit helpers Introduce bpf_map_memcg_enter() and bpf_map_memcg_exit() helpers to reduce code duplication in memcg context management. bpf_map_memcg_enter() gets the memcg from the map, sets it as active, and returns both the previous and the now active memcg. bpf_map_memcg_exit() restores the previous active memcg and releases the reference obtained during enter. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102200230.25168-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 ++++++++++++ kernel/bpf/syscall.c | 54 +++++++++++++++++++++----------------------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9efb2ddf331c..4e9667ed6630 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2608,6 +2608,10 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG +void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg); +void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *memcg); void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags, @@ -2632,6 +2636,17 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, kvcalloc(_n, _size, _flags) #define bpf_map_alloc_percpu(_map, _size, _align, _flags) \ __alloc_percpu_gfp(_size, _align, _flags) +static inline void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg) +{ + *new_memcg = NULL; + *old_memcg = NULL; +} + +static inline void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *memcg) +{ +} #endif static inline int diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a4d38272d8bc..c77ab2e32659 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -505,17 +505,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) return root_mem_cgroup; } +void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg, + struct mem_cgroup **new_memcg) +{ + *new_memcg = bpf_map_get_memcg(map); + *old_memcg = set_active_memcg(*new_memcg); +} + +void bpf_map_memcg_exit(struct mem_cgroup *old_memcg, + struct mem_cgroup *new_memcg) +{ + set_active_memcg(old_memcg); + mem_cgroup_put(new_memcg); +} + void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -526,11 +538,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -540,11 +550,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -555,11 +563,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, struct mem_cgroup *memcg, *old_memcg; void *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -570,11 +576,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); + bpf_map_memcg_enter(map, &old_memcg, &memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); + bpf_map_memcg_exit(old_memcg, memcg); return ptr; } @@ -612,12 +616,9 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long i, j; struct page *pg; int ret = 0; -#ifdef CONFIG_MEMCG struct mem_cgroup *memcg, *old_memcg; - memcg = bpf_map_get_memcg(map); - old_memcg = set_active_memcg(memcg); -#endif + bpf_map_memcg_enter(map, &old_memcg, &memcg); for (i = 0; i < nr_pages; i++) { pg = __bpf_alloc_page(nid); @@ -631,10 +632,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, break; } -#ifdef CONFIG_MEMCG - set_active_memcg(old_memcg); - mem_cgroup_put(memcg); -#endif + bpf_map_memcg_exit(old_memcg, memcg); return ret; } From e66fe1bc6d25d6fbced99de6c377f1b3d961a80e Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 12:02:28 -0800 Subject: [PATCH 066/268] bpf: arena: Reintroduce memcg accounting When arena allocations were converted from bpf_map_alloc_pages() to kmalloc_nolock() to support non-sleepable contexts, memcg accounting was inadvertently lost. This commit restores proper memory accounting for all arena-related allocations. All arena related allocations are accounted into memcg of the process that created bpf_arena. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102200230.25168-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 29 ++++++++++++++++++++++++++--- kernel/bpf/range_tree.c | 5 +++-- kernel/bpf/syscall.c | 3 --- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 2274319a95e6..42fae0a9f314 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -360,6 +360,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) { struct bpf_map *map = vmf->vma->vm_file->private_data; struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + struct mem_cgroup *new_memcg, *old_memcg; struct page *page; long kbase, kaddr; unsigned long flags; @@ -377,6 +378,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) /* already have a page vmap-ed */ goto out; + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); + if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ goto out_unlock_sigsegv; @@ -400,12 +403,14 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) goto out_unlock_sigsegv; } flush_vmap_cache(kaddr, PAGE_SIZE); + bpf_map_memcg_exit(old_memcg, new_memcg); out: page_ref_add(page, 1); raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; out_unlock_sigsegv: + bpf_map_memcg_exit(old_memcg, new_memcg); raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); return VM_FAULT_SIGSEGV; } @@ -534,6 +539,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt /* user_vm_end/start are fixed before bpf prog runs */ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena); + struct mem_cgroup *new_memcg, *old_memcg; struct apply_range_data data; struct page **pages = NULL; long remaining, mapped = 0; @@ -555,11 +561,14 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */ alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *)); - pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE); - if (!pages) + pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE); + if (!pages) { + bpf_map_memcg_exit(old_memcg, new_memcg); return 0; + } data.pages = pages; if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) @@ -617,6 +626,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT); raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); kfree_nolock(pages); + bpf_map_memcg_exit(old_memcg, new_memcg); return clear_lo32(arena->user_vm_start) + uaddr32; out: range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped); @@ -630,6 +640,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); out_free_pages: kfree_nolock(pages); + bpf_map_memcg_exit(old_memcg, new_memcg); return 0; } @@ -651,6 +662,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) { + struct mem_cgroup *new_memcg, *old_memcg; u64 full_uaddr, uaddr_end; long kaddr, pgoff; struct page *page; @@ -671,6 +683,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT; pgoff = compute_pgoff(arena, uaddr); + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); if (!sleepable) goto defer; @@ -709,11 +722,13 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, zap_pages(arena, full_uaddr, 1); __free_page(page); } + bpf_map_memcg_exit(old_memcg, new_memcg); return; defer: - s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1); + s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1); + bpf_map_memcg_exit(old_memcg, new_memcg); if (!s) /* * If allocation fails in non-sleepable context, pages are intentionally left @@ -735,6 +750,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) { long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + struct mem_cgroup *new_memcg, *old_memcg; unsigned long flags; long pgoff; int ret; @@ -757,7 +773,9 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt } /* "Allocate" the region to prevent it from being allocated. */ + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); ret = range_tree_clear(&arena->rt, pgoff, page_cnt); + bpf_map_memcg_exit(old_memcg, new_memcg); out: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); return ret; @@ -766,6 +784,7 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt static void arena_free_worker(struct work_struct *work) { struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work); + struct mem_cgroup *new_memcg, *old_memcg; struct llist_node *list, *pos, *t; struct arena_free_span *s; u64 arena_vm_start, user_vm_start; @@ -780,6 +799,8 @@ static void arena_free_worker(struct work_struct *work) return; } + bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); + init_llist_head(&free_pages); arena_vm_start = bpf_arena_get_kern_vm_start(arena); user_vm_start = bpf_arena_get_user_vm_start(arena); @@ -820,6 +841,8 @@ static void arena_free_worker(struct work_struct *work) page = llist_entry(pos, struct page, pcp_llist); __free_page(page); } + + bpf_map_memcg_exit(old_memcg, new_memcg); } static void arena_free_irq(struct irq_work *iw) diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c index 99c63d982c5d..2f28886f3ff7 100644 --- a/kernel/bpf/range_tree.c +++ b/kernel/bpf/range_tree.c @@ -149,7 +149,8 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len) range_it_insert(rn, rt); /* Add a range */ - new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); + new_rn = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, + NUMA_NO_NODE); if (!new_rn) return -ENOMEM; new_rn->rn_start = last + 1; @@ -234,7 +235,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len) right->rn_start = start; range_it_insert(right, rt); } else { - left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE); + left = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, NUMA_NO_NODE); if (!left) return -ENOMEM; left->rn_start = start; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c77ab2e32659..6dd2ad2f9e81 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -616,9 +616,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long i, j; struct page *pg; int ret = 0; - struct mem_cgroup *memcg, *old_memcg; - bpf_map_memcg_enter(map, &old_memcg, &memcg); for (i = 0; i < nr_pages; i++) { pg = __bpf_alloc_page(nid); @@ -632,7 +630,6 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid, break; } - bpf_map_memcg_exit(old_memcg, memcg); return ret; } From a069190b590e108223cd841a1c2d0bfb92230ecc Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 2 Jan 2026 14:15:12 -0800 Subject: [PATCH 067/268] bpf: Replace __opt annotation with __nullable for kfuncs The __opt annotation was originally introduced specifically for buffer/size argument pairs in bpf_dynptr_slice() and bpf_dynptr_slice_rdwr(), allowing the buffer pointer to be NULL while still validating the size as a constant. The __nullable annotation serves the same purpose but is more general and is already used throughout the BPF subsystem for raw tracepoints, struct_ops, and other kfuncs. This patch unifies the two annotations by replacing __opt with __nullable. The key change is in the verifier's get_kfunc_ptr_arg_type() function, where mem/size pair detection is now performed before the nullable check. This ensures that buffer/size pairs are correctly classified as KF_ARG_PTR_TO_MEM_SIZE even when the buffer is nullable, while adding an !arg_mem_size condition to the nullable check prevents interference with mem/size pair handling. When processing KF_ARG_PTR_TO_MEM_SIZE arguments, the verifier now uses is_kfunc_arg_nullable() instead of the removed is_kfunc_arg_optional() to determine whether to skip size validation for NULL buffers. This is the first documentation added for the __nullable annotation, which has been in use since it was introduced but was previously undocumented. No functional changes to verifier behavior - nullable buffer/size pairs continue to work exactly as before. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260102221513.1961781-1-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 31 ++++++++++++++++++++----------- include/linux/bpf.h | 2 +- kernel/bpf/helpers.c | 28 ++++++++++++++-------------- kernel/bpf/verifier.c | 23 +++++++++-------------- 4 files changed, 44 insertions(+), 40 deletions(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 6cb6857bfa6f..3eb59a8f9f34 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -178,25 +178,34 @@ Here, the dynptr will be treated as an uninitialized dynptr. Without this annotation, the verifier will reject the program if the dynptr passed in is not initialized. -2.3.4 __opt Annotation -------------------------- +2.3.4 __nullable Annotation +--------------------------- -This annotation is used to indicate that the buffer associated with an __sz or __szk -argument may be null. If the function is passed a nullptr in place of the buffer, -the verifier will not check that length is appropriate for the buffer. The kfunc is -responsible for checking if this buffer is null before using it. +This annotation is used to indicate that the pointer argument may be NULL. +The verifier will allow passing NULL for such arguments. An example is given below:: - __bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__opt, u32 buffer__szk) + __bpf_kfunc void bpf_task_release(struct task_struct *task__nullable) { ... } -Here, the buffer may be null. If buffer is not null, it at least of size buffer_szk. -Either way, the returned buffer is either NULL, or of size buffer_szk. Without this -annotation, the verifier will reject the program if a null pointer is passed in with -a nonzero size. +Here, the task pointer may be NULL. The kfunc is responsible for checking if +the pointer is NULL before dereferencing it. + +The __nullable annotation can be combined with other annotations. For example, +when used with __sz or __szk annotations for memory and size pairs, the +verifier will skip size validation when a NULL pointer is passed, but will +still process the size argument to extract constant size information when +needed:: + + __bpf_kfunc void *bpf_dynptr_slice(..., void *buffer__nullable, + u32 buffer__szk) + +Here, the buffer may be NULL. If the buffer is not NULL, it must be at least +buffer__szk bytes in size. The kfunc is responsible for checking if the buffer +is NULL before using it. 2.3.5 __str Annotation ---------------------------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4e9667ed6630..a63e47d2109c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1434,7 +1434,7 @@ bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, void *src, u64 len, u64 flags); void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk); + void *buffer__nullable, u64 buffer__szk); static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2c15f77c74db..9eaa4185e0a7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2709,14 +2709,14 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * - * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * If buffer__nullable is NULL, the call will fail if buffer_opt was needed. * * If the intention is to write to the data slice, please use * bpf_dynptr_slice_rdwr. @@ -2734,7 +2734,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk) + void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; enum bpf_dynptr_type type; @@ -2755,8 +2755,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: - if (buffer__opt) - return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); + if (buffer__nullable) + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__nullable); else return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: @@ -2765,16 +2765,16 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; - if (!buffer__opt) + if (!buffer__nullable) return NULL; - bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); - return buffer__opt; + bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__nullable, len, false); + return buffer__nullable; } case BPF_DYNPTR_TYPE_SKB_META: return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); case BPF_DYNPTR_TYPE_FILE: - err = bpf_file_fetch_bytes(ptr->data, offset, buffer__opt, buffer__szk); - return err ? NULL : buffer__opt; + err = bpf_file_fetch_bytes(ptr->data, offset, buffer__nullable, buffer__szk); + return err ? NULL : buffer__nullable; default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; @@ -2785,14 +2785,14 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data. * @p: The dynptr whose data slice to retrieve * @offset: Offset into the dynptr - * @buffer__opt: User-provided buffer to copy contents into. May be NULL + * @buffer__nullable: User-provided buffer to copy contents into. May be NULL * @buffer__szk: Size (in bytes) of the buffer if present. This is the * length of the requested slice. This must be a constant. * * For non-skb and non-xdp type dynptrs, there is no difference between * bpf_dynptr_slice and bpf_dynptr_data. * - * If buffer__opt is NULL, the call will fail if buffer_opt was needed. + * If buffer__nullable is NULL, the call will fail if buffer_opt was needed. * * The returned pointer is writable and may point to either directly the dynptr * data at the requested offset or to the buffer if unable to obtain a direct @@ -2824,7 +2824,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u64 offset, * direct pointer) */ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, - void *buffer__opt, u64 buffer__szk) + void *buffer__nullable, u64 buffer__szk) { const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; @@ -2853,7 +2853,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, * will be copied out into the buffer and the user will need to call * bpf_dynptr_write() to commit changes. */ - return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk); + return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c9da70dd3e72..9394b0de2ef0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12086,11 +12086,6 @@ static bool is_kfunc_arg_const_mem_size(const struct btf *btf, return btf_param_match_suffix(btf, arg, "__szk"); } -static bool is_kfunc_arg_optional(const struct btf *btf, const struct btf_param *arg) -{ - return btf_param_match_suffix(btf, arg, "__opt"); -} - static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__k"); @@ -12510,6 +12505,11 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) return KF_ARG_PTR_TO_CTX; + if (argno + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || + is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) + arg_mem_size = true; + /* In this function, we verify the kfunc's BTF as per the argument type, * leaving the rest of the verification with respect to the register * type to our caller. When a set of conditions hold in the BTF type of @@ -12518,7 +12518,8 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg)) + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg) && + !arg_mem_size) return KF_ARG_PTR_TO_NULL; if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) @@ -12575,11 +12576,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) return KF_ARG_PTR_TO_CALLBACK; - if (argno + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) - arg_mem_size = true; - /* This is the catch all argument type of register types supported by * check_helper_mem_access. However, we only allow when argument type is * pointer to scalar, or struct composed (recursively) of scalars. When @@ -13249,8 +13245,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if ((register_is_null(reg) || type_may_be_null(reg->type)) && - !is_kfunc_arg_nullable(meta->btf, &args[i]) && - !is_kfunc_arg_optional(meta->btf, &args[i])) { + !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; } @@ -13566,7 +13561,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ struct bpf_reg_state *size_reg = ®s[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; - if (!register_is_null(buff_reg) || !is_kfunc_arg_optional(meta->btf, buff_arg)) { + if (!register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); From ea180ffbd27ce5abf2a06329fe1fc8d20dc9becf Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 5 Jan 2026 20:23:13 -0800 Subject: [PATCH 068/268] mm: drop mem_cgroup_usage() declaration from memcontrol.h mem_cgroup_usage() is not used outside of memcg-v1 code, the declaration was added by a mistake. Signed-off-by: Roman Gushchin Link: https://lore.kernel.org/r/20260106042313.140256-1-roman.gushchin@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6a5d65487b70..229ac9835adb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -950,7 +950,6 @@ static inline void mod_memcg_page_state(struct page *page, } unsigned long memcg_events(struct mem_cgroup *memcg, int event); -unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); bool memcg_stat_item_valid(int idx); From b25b48c7d37617601ebc8cf2633bee95aa82c697 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 6 Jan 2026 18:36:43 -0500 Subject: [PATCH 069/268] bpf: Check active lock count in in_sleepable_context() The in_sleepable_context() function is used to specialize the BPF code in do_misc_fixups(). With the addition of nonsleepable arena kfuncs, there are kfuncs whose specialization depends on whether we are holding a lock. We should use the nonsleepable version while holding a lock and the sleepable one when not. Add a check for active_locks to account for locking when specializing arena kfuncs. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260106-arena-under-lock-v2-1-378e9eab3066@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9394b0de2ef0..7f82e27dd7e7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11466,6 +11466,7 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env) { return !env->cur_state->active_rcu_locks && !env->cur_state->active_preempt_locks && + !env->cur_state->active_locks && !env->cur_state->active_irq_id && in_sleepable(env); } From 39f77533b6c16e7fbd72e2560e13c9435d2602f5 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 6 Jan 2026 18:36:44 -0500 Subject: [PATCH 070/268] bpf: Allow calls to arena functions while holding spinlocks The bpf_arena_*_pages() kfuncs can be called from sleepable contexts, but the verifier still prevents BPF programs from calling them while holding a spinlock. Amend the verifier to allow for BPF programs calling arena page management functions while holding a lock. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260106-arena-under-lock-v2-2-378e9eab3066@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7f82e27dd7e7..53635ea2e41b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12373,6 +12373,7 @@ enum special_kfunc_type { KF_bpf_task_work_schedule_resume_impl, KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, + KF_bpf_arena_reserve_pages, }; BTF_ID_LIST(special_kfunc_list) @@ -12449,6 +12450,7 @@ BTF_ID(func, bpf_task_work_schedule_signal_impl) BTF_ID(func, bpf_task_work_schedule_resume_impl) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) +BTF_ID(func, bpf_arena_reserve_pages) static bool is_task_work_add_kfunc(u32 func_id) { @@ -12884,10 +12886,17 @@ static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; } +static bool is_bpf_arena_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_arena_alloc_pages] || + btf_id == special_kfunc_list[KF_bpf_arena_free_pages] || + btf_id == special_kfunc_list[KF_bpf_arena_reserve_pages]; +} + static bool kfunc_spin_allowed(u32 btf_id) { return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || - is_bpf_res_spin_lock_kfunc(btf_id); + is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) From b81d5e9d965e0af2c1f21fc392a23e598171f9d6 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 6 Jan 2026 18:36:45 -0500 Subject: [PATCH 071/268] selftests/bpf: add tests for arena kfuncs under lock Add selftests to ensure the verifier permits calling the arena kfunc API while holding a lock. Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260106-arena-under-lock-v2-3-378e9eab3066@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_arena.c | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 4a9d96344813..c4b8daac4388 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -10,6 +10,8 @@ #include "bpf_experimental.h" #include "bpf_arena_common.h" +#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) + struct { __uint(type, BPF_MAP_TYPE_ARENA); __uint(map_flags, BPF_F_MMAPABLE); @@ -439,4 +441,40 @@ int iter_maps3(struct bpf_iter__bpf_map *ctx) return 0; } +private(ARENA_TESTS) struct bpf_spin_lock arena_bpf_test_lock; + +/* Use the arena kfunc API while under a BPF lock. */ +SEC("syscall") +__success __retval(0) +int arena_kfuncs_under_bpf_lock(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + char __arena *page; + int ret; + + bpf_spin_lock(&arena_bpf_test_lock); + + /* Get a separate region of the arena. */ + page = arena_base(&arena); + ret = bpf_arena_reserve_pages(&arena, page, 1); + if (ret) { + bpf_spin_unlock(&arena_bpf_test_lock); + return 1; + } + + bpf_arena_free_pages(&arena, page, 1); + + page = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page) { + bpf_spin_unlock(&arena_bpf_test_lock); + return 2; + } + + bpf_arena_free_pages(&arena, page, 1); + + bpf_spin_unlock(&arena_bpf_test_lock); +#endif + + return 0; +} char _license[] SEC("license") = "GPL"; From 2b421662c7887a0649fe409155a1f101562d0fa9 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:16 +0800 Subject: [PATCH 072/268] bpf: Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags Introduce BPF_F_CPU and BPF_F_ALL_CPUS flags and check them for following APIs: * 'map_lookup_elem()' * 'map_update_elem()' * 'generic_map_lookup_batch()' * 'generic_map_update_batch()' And, get the correct value size for these APIs. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 23 ++++++++++++++++++++++- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 29 ++++++++++++++++------------- tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a63e47d2109c..108bab1bda9d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3915,14 +3915,35 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) } #endif +static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) +{ + return false; +} + static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) { - if (flags & ~allowed_flags) + u32 cpu; + + if ((u32)flags & ~allowed_flags) return -EINVAL; if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; + if (!(flags & BPF_F_CPU) && flags >> 32) + return -EINVAL; + + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if (!bpf_map_supports_cpu_flags(map->map_type)) + return -EINVAL; + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) + return -EINVAL; + + cpu = flags >> 32; + if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus()) + return -ERANGE; + } + return 0; } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 84ced3ed2d21..2a2ade4be60f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6dd2ad2f9e81..e8cfe9d67e64 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -133,12 +133,14 @@ bool bpf_map_write_active(const struct bpf_map *map) return atomic64_read(&map->writecnt) != 0; } -static u32 bpf_map_value_size(const struct bpf_map *map) +static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags) { - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) + return map->value_size; + else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); @@ -1729,7 +1731,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; - err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; @@ -1737,7 +1739,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(key)) return PTR_ERR(key); - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); @@ -1804,7 +1806,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->flags); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); @@ -2000,11 +2002,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, void *key, *value; int err = 0; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, + BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2059,11 +2062,11 @@ int generic_map_lookup_batch(struct bpf_map *map, u32 value_size, cp, max_count; int err; - err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK); + err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU); if (err) return err; - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, attr->batch.elem_flags); max_count = attr->batch.count; if (!max_count) @@ -2185,7 +2188,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) goto err_put; } - value_size = bpf_map_value_size(map); + value_size = bpf_map_value_size(map, 0); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b92b0847ec2..b816bc53d2e1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1384,6 +1384,8 @@ enum { BPF_NOEXIST = 1, /* create new element if it didn't exist */ BPF_EXIST = 2, /* update existing element */ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ + BPF_F_CPU = 8, /* cpu flag for percpu maps, upper 32-bit of flags is a cpu number */ + BPF_F_ALL_CPUS = 16, /* update value across all CPUs for percpu maps */ }; /* flags for BPF_MAP_CREATE command */ From 8eb76cb03f0f6c2bd7b15cf45dcffcd6bd07a360 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:17 +0800 Subject: [PATCH 073/268] bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_array maps Introduce support for the BPF_F_ALL_CPUS flag in percpu_array maps to allow updating values for all CPUs with a single value for both update_elem and update_batch APIs. Introduce support for the BPF_F_CPU flag in percpu_array maps to allow: * update value for specified CPU for both update_elem and update_batch APIs. * lookup value for specified CPU for both lookup_elem and lookup_batch APIs. The BPF_F_CPU flag is passed via: * map_flags of lookup_elem and update_elem APIs along with embedded cpu info. * elem_flags of lookup_batch and update_batch APIs along with embedded cpu info. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++-- kernel/bpf/arraymap.c | 31 ++++++++++++++++++++++++------- kernel/bpf/syscall.c | 2 +- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 108bab1bda9d..dc92d001d978 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2848,7 +2848,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *callee); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, @@ -3917,7 +3917,12 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { - return false; + switch (map_type) { + case BPF_MAP_TYPE_PERCPU_ARRAY: + return true; + default: + return false; + } } static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1eeb31c5b317..67e9e811de3a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -307,7 +307,7 @@ static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, return per_cpu_ptr(array->pptrs[index & array->index_mask], cpu); } -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; @@ -325,11 +325,18 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(map, value, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value); + goto unlock; + } for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } +unlock: rcu_read_unlock(); return 0; } @@ -398,10 +405,11 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; void __percpu *pptr; - int cpu, off = 0; + void *ptr, *val; u32 size; + int cpu; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS)) /* unknown flags */ return -EINVAL; @@ -422,11 +430,20 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, size = array->elem_size; rcu_read_lock(); pptr = array->pptrs[index & array->index_mask]; - for_each_possible_cpu(cpu) { - copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); - bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); - off += size; + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value(map, ptr, value); + bpf_obj_free_fields(array->map.record, ptr); + goto unlock; } + for_each_possible_cpu(cpu) { + ptr = per_cpu_ptr(pptr, cpu); + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(map, ptr, val); + bpf_obj_free_fields(array->map.record, ptr); + } +unlock: rcu_read_unlock(); return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e8cfe9d67e64..b5341ab1eb84 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -318,7 +318,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { - err = bpf_percpu_array_copy(map, key, value); + err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { From c6936161fd55d0c6ffde01da726e66ff5f2934e8 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:18 +0800 Subject: [PATCH 074/268] bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_hash and lru_percpu_hash maps Introduce BPF_F_ALL_CPUS flag support for percpu_hash and lru_percpu_hash maps to allow updating values for all CPUs with a single value for both update_elem and update_batch APIs. Introduce BPF_F_CPU flag support for percpu_hash and lru_percpu_hash maps to allow: * update value for specified CPU for both update_elem and update_batch APIs. * lookup value for specified CPU for both lookup_elem and lookup_batch APIs. The BPF_F_CPU flag is passed via: * map_flags along with embedded cpu info. * elem_flags along with embedded cpu info. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 +- kernel/bpf/hashtab.c | 94 ++++++++++++++++++++++++++++++-------------- kernel/bpf/syscall.c | 2 +- 3 files changed, 68 insertions(+), 32 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc92d001d978..f5c259dcc425 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2847,7 +2847,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -3919,6 +3919,8 @@ static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { switch (map_type) { case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: return true; default: return false; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c8a9b27f8663..441ff5bc54ac 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -932,7 +932,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, - void *value, bool onallcpus) + void *value, bool onallcpus, u64 map_flags) { void *ptr; @@ -943,19 +943,28 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, bpf_obj_free_fields(htab->map.record, ptr); } else { u32 size = round_up(htab->map.value_size, 8); - int off = 0, cpu; + void *val; + int cpu; + + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + ptr = per_cpu_ptr(pptr, cpu); + copy_map_value(&htab->map, ptr, value); + bpf_obj_free_fields(htab->map.record, ptr); + return; + } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); - copy_map_value_long(&htab->map, ptr, value + off); + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(&htab->map, ptr, val); bpf_obj_free_fields(htab->map.record, ptr); - off += size; } } } static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, - void *value, bool onallcpus) + void *value, bool onallcpus, u64 map_flags) { /* When not setting the initial value on all cpus, zero-fill element * values for other cpus. Otherwise, bpf program has no way to ensure @@ -973,7 +982,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } } else { - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_copy_value(htab, pptr, value, onallcpus, map_flags); } } @@ -985,7 +994,7 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, - struct htab_elem *old_elem) + struct htab_elem *old_elem, u64 map_flags) { u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); @@ -1043,7 +1052,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = *(void __percpu **)ptr; } - pcpu_init_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus, map_flags); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1147,7 +1156,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, } l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, - l_old); + l_old, map_flags); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new); @@ -1249,6 +1258,15 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value return ret; } +static int htab_map_check_update_flags(bool onallcpus, u64 map_flags) +{ + if (unlikely(!onallcpus && map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(onallcpus && ((map_flags & BPF_F_LOCK) || (u32)map_flags > BPF_F_ALL_CPUS))) + return -EINVAL; + return 0; +} + static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, bool percpu, bool onallcpus) @@ -1262,9 +1280,9 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) - /* unknown flags */ - return -EINVAL; + ret = htab_map_check_update_flags(onallcpus, map_flags); + if (unlikely(ret)) + return ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); @@ -1289,7 +1307,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, /* Update value in-place */ if (percpu) { pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + value, onallcpus, map_flags); } else { void **inner_map_pptr = htab_elem_value(l_old, key_size); @@ -1298,7 +1316,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, percpu, onallcpus, NULL); + hash, percpu, onallcpus, NULL, map_flags); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err; @@ -1324,9 +1342,9 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) - /* unknown flags */ - return -EINVAL; + ret = htab_map_check_update_flags(onallcpus, map_flags); + if (unlikely(ret)) + return ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); @@ -1363,10 +1381,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* per-cpu hash map can update value in-place */ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), - value, onallcpus); + value, onallcpus, map_flags); } else { pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), - value, onallcpus); + value, onallcpus, map_flags); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; } @@ -1678,9 +1696,9 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size, map_id; + u64 elem_map_flags, map_flags, allowed_flags; u32 bucket_cnt, total, key_size, value_size; struct htab_elem *node_to_free = NULL; - u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; unsigned long flags = 0; @@ -1690,9 +1708,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, int ret = 0; elem_map_flags = attr->batch.elem_flags; - if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) - return -EINVAL; + allowed_flags = BPF_F_LOCK; + if (!do_delete && is_percpu) + allowed_flags |= BPF_F_CPU; + ret = bpf_map_check_op_flags(map, elem_map_flags, allowed_flags); + if (ret) + return ret; map_flags = attr->batch.flags; if (map_flags) @@ -1715,7 +1736,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, key_size = htab->map.key_size; value_size = htab->map.value_size; size = round_up(value_size, 8); - if (is_percpu) + if (is_percpu && !(elem_map_flags & BPF_F_CPU)) value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to @@ -1798,10 +1819,17 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __percpu *pptr; pptr = htab_elem_get_ptr(l, map->key_size); - for_each_possible_cpu(cpu) { - copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu)); - check_and_init_map_value(&htab->map, dst_val + off); - off += size; + if (elem_map_flags & BPF_F_CPU) { + cpu = elem_map_flags >> 32; + copy_map_value(&htab->map, dst_val, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val); + } else { + for_each_possible_cpu(cpu) { + copy_map_value_long(&htab->map, dst_val + off, + per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, dst_val + off); + off += size; + } } } else { value = htab_elem_value(l, key_size); @@ -2357,7 +2385,7 @@ static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *k return NULL; } -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct htab_elem *l; void __percpu *pptr; @@ -2374,16 +2402,22 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + ret = 0; /* We do not mark LRU map element here in order to not mess up * eviction heuristics when user space does a map walk. */ pptr = htab_elem_get_ptr(l, map->key_size); + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(map, value, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(map, value); + goto out; + } for_each_possible_cpu(cpu) { copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu)); check_and_init_map_value(map, value + off); off += size; } - ret = 0; out: rcu_read_unlock(); return ret; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b5341ab1eb84..5e3b5d828856 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -316,7 +316,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { - err = bpf_percpu_hash_copy(map, key, value); + err = bpf_percpu_hash_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { From 8526397c3caf8d0289bab69bfb2c889e2b39b30a Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:19 +0800 Subject: [PATCH 075/268] bpf: Copy map value using copy_map_value_long for percpu_cgroup_storage maps Copy map value using 'copy_map_value_long()'. It's to keep consistent style with the way of other percpu maps. No functional change intended. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-5-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/local_storage.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index c93a756e035c..2ab4b60ffe61 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -200,8 +200,7 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, */ size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { - bpf_long_memcpy(value + off, - per_cpu_ptr(storage->percpu_buf, cpu), size); + copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu)); off += size; } rcu_read_unlock(); @@ -234,8 +233,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, */ size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), - value + off, size); + copy_map_value_long(_map, per_cpu_ptr(storage->percpu_buf, cpu), value + off); off += size; } rcu_read_unlock(); From 47c79f05aa0d289f760caf5ad6521fd8dbae37a1 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:20 +0800 Subject: [PATCH 076/268] bpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu_cgroup_storage maps Introduce BPF_F_ALL_CPUS flag support for percpu_cgroup_storage maps to allow updating values for all CPUs with a single value for update_elem API. Introduce BPF_F_CPU flag support for percpu_cgroup_storage maps to allow: * update value for specified CPU for update_elem API. * lookup value for specified CPU for lookup_elem API. The BPF_F_CPU flag is passed via map_flags along with embedded cpu info. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-6-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 4 ++-- include/linux/bpf.h | 1 + kernel/bpf/local_storage.c | 23 ++++++++++++++++++----- kernel/bpf/syscall.c | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index d1eb5c7729cb..2f535331f926 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -172,7 +172,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); -int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -470,7 +470,7 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, - void *value) { + void *value, u64 flags) { return 0; } static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f5c259dcc425..5936f8e2996f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3921,6 +3921,7 @@ static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: return true; default: return false; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 2ab4b60ffe61..1ccbf28b2ad9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -180,7 +180,7 @@ static long cgroup_storage_update_elem(struct bpf_map *map, void *key, } int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, - void *value) + void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; @@ -198,11 +198,17 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, * access 'value_size' of them, so copying rounded areas * will not leak any kernel data */ + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(_map, value, per_cpu_ptr(storage->percpu_buf, cpu)); + goto unlock; + } size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { copy_map_value_long(_map, value + off, per_cpu_ptr(storage->percpu_buf, cpu)); off += size; } +unlock: rcu_read_unlock(); return 0; } @@ -212,10 +218,11 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, { struct bpf_cgroup_storage_map *map = map_to_storage(_map); struct bpf_cgroup_storage *storage; - int cpu, off = 0; + void *val; u32 size; + int cpu; - if (map_flags != BPF_ANY && map_flags != BPF_EXIST) + if ((u32)map_flags & ~(BPF_ANY | BPF_EXIST | BPF_F_CPU | BPF_F_ALL_CPUS)) return -EINVAL; rcu_read_lock(); @@ -231,11 +238,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, * returned or zeros which were zero-filled by percpu_alloc, * so no kernel data leaks possible */ + if (map_flags & BPF_F_CPU) { + cpu = map_flags >> 32; + copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), value); + goto unlock; + } size = round_up(_map->value_size, 8); for_each_possible_cpu(cpu) { - copy_map_value_long(_map, per_cpu_ptr(storage->percpu_buf, cpu), value + off); - off += size; + val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; + copy_map_value(_map, per_cpu_ptr(storage->percpu_buf, cpu), val); } +unlock: rcu_read_unlock(); return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5e3b5d828856..ecc0929ce462 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -320,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { - err = bpf_percpu_cgroup_storage_copy(map, key, value); + err = bpf_percpu_cgroup_storage_copy(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { From 2546863b4a723c96f55af7127827d62632cfbc9c Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:21 +0800 Subject: [PATCH 077/268] libbpf: Add BPF_F_CPU and BPF_F_ALL_CPUS flags support for percpu maps Add libbpf support for the BPF_F_CPU flag for percpu maps by embedding the cpu info into the high 32 bits of: 1. **flags**: bpf_map_lookup_elem_flags(), bpf_map__lookup_elem(), bpf_map_update_elem() and bpf_map__update_elem() 2. **opts->elem_flags**: bpf_map_lookup_batch() and bpf_map_update_batch() And the flag can be BPF_F_ALL_CPUS, but cannot be 'BPF_F_CPU | BPF_F_ALL_CPUS'. Behavior: * If the flag is BPF_F_ALL_CPUS, the update is applied across all CPUs. * If the flag is BPF_F_CPU, it updates value only to the specified CPU. * If the flag is BPF_F_CPU, lookup value only from the specified CPU. * lookup does not support BPF_F_ALL_CPUS. Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-7-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.h | 8 ++++++++ tools/lib/bpf/libbpf.c | 26 ++++++++++++++++++++------ tools/lib/bpf/libbpf.h | 21 ++++++++------------- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 1f9c28d27795..2c8e88ddb674 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -289,6 +289,14 @@ LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, void *in_batch, * Update spin_lock-ed map elements. This must be * specified if the map value contains a spinlock. * + * **BPF_F_CPU** + * As for percpu maps, update value on the specified CPU. And the cpu + * info is embedded into the high 32 bits of **opts->elem_flags**. + * + * **BPF_F_ALL_CPUS** + * As for percpu maps, update value across all CPUs. This flag cannot + * be used with BPF_F_CPU at the same time. + * * @param fd BPF map file descriptor * @param keys pointer to an array of *count* keys * @param values pointer to an array of *count* values diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1a52d818a76c..6ea81701e274 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -10919,7 +10919,7 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) } static int validate_map_op(const struct bpf_map *map, size_t key_sz, - size_t value_sz, bool check_value_sz) + size_t value_sz, bool check_value_sz, __u64 flags) { if (!map_is_created(map)) /* map is not yet created */ return -ENOENT; @@ -10946,6 +10946,20 @@ static int validate_map_op(const struct bpf_map *map, size_t key_sz, int num_cpu = libbpf_num_possible_cpus(); size_t elem_sz = roundup(map->def.value_size, 8); + if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) { + if ((flags & BPF_F_CPU) && (flags & BPF_F_ALL_CPUS)) { + pr_warn("map '%s': BPF_F_CPU and BPF_F_ALL_CPUS are mutually exclusive\n", + map->name); + return -EINVAL; + } + if (map->def.value_size != value_sz) { + pr_warn("map '%s': unexpected value size %zu provided for either BPF_F_CPU or BPF_F_ALL_CPUS, expected %u\n", + map->name, value_sz, map->def.value_size); + return -EINVAL; + } + break; + } + if (value_sz != num_cpu * elem_sz) { pr_warn("map '%s': unexpected value size %zu provided for per-CPU map, expected %d * %zu = %zd\n", map->name, value_sz, num_cpu, elem_sz, num_cpu * elem_sz); @@ -10970,7 +10984,7 @@ int bpf_map__lookup_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, value_sz, true); + err = validate_map_op(map, key_sz, value_sz, true, flags); if (err) return libbpf_err(err); @@ -10983,7 +10997,7 @@ int bpf_map__update_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, value_sz, true); + err = validate_map_op(map, key_sz, value_sz, true, flags); if (err) return libbpf_err(err); @@ -10995,7 +11009,7 @@ int bpf_map__delete_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, flags); if (err) return libbpf_err(err); @@ -11008,7 +11022,7 @@ int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, value_sz, true); + err = validate_map_op(map, key_sz, value_sz, true, flags); if (err) return libbpf_err(err); @@ -11020,7 +11034,7 @@ int bpf_map__get_next_key(const struct bpf_map *map, { int err; - err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */, 0); if (err) return libbpf_err(err); diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e14d9e349f9c..dfc37a615578 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1216,12 +1216,13 @@ LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** * @param value pointer to memory in which looked up value will be stored * @param value_sz size in byte of value data memory; it has to match BPF map - * definition's **value_size**. For per-CPU BPF maps value size has to be - * a product of BPF map value size and number of possible CPUs in the system - * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for - * per-CPU values value size has to be aligned up to closest 8 bytes for - * alignment reasons, so expected size is: `round_up(value_size, 8) - * * libbpf_num_possible_cpus()`. + * definition's **value_size**. For per-CPU BPF maps, value size can be + * `value_size` if either **BPF_F_CPU** or **BPF_F_ALL_CPUS** is specified + * in **flags**, otherwise a product of BPF map value size and number of + * possible CPUs in the system (could be fetched with + * **libbpf_num_possible_cpus()**). Note also that for per-CPU values value + * size has to be aligned up to closest 8 bytes, so expected size is: + * `round_up(value_size, 8) * libbpf_num_possible_cpus()`. * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * @@ -1239,13 +1240,7 @@ LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map, * @param key pointer to memory containing bytes of the key * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** * @param value pointer to memory containing bytes of the value - * @param value_sz size in byte of value data memory; it has to match BPF map - * definition's **value_size**. For per-CPU BPF maps value size has to be - * a product of BPF map value size and number of possible CPUs in the system - * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for - * per-CPU values value size has to be aligned up to closest 8 bytes for - * alignment reasons, so expected size is: `round_up(value_size, 8) - * * libbpf_num_possible_cpus()`. + * @param value_sz refer to **bpf_map__lookup_elem**'s description.' * @param flags extra flags passed to kernel for this operation * @return 0, on success; negative error, otherwise * From 07bf7aa58e5e7fb27b8addcc33052400a7d9ce32 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Wed, 7 Jan 2026 10:20:22 +0800 Subject: [PATCH 078/268] selftests/bpf: Add cases to test BPF_F_CPU and BPF_F_ALL_CPUS flags Add test coverage for the new BPF_F_CPU and BPF_F_ALL_CPUS flags support in percpu maps. The following APIs are exercised: * bpf_map_update_batch() * bpf_map_lookup_batch() * bpf_map_update_elem() * bpf_map__update_elem() * bpf_map_lookup_elem_flags() * bpf_map__lookup_elem() For lru_percpu_hash map, set max_entries to 'libbpf_num_possible_cpus() + 1' and only use the first 'libbpf_num_possible_cpus()' entries. This ensures a spare entry is always available in the LRU free list, avoiding eviction. When updating an existing key in lru_percpu_hash map: 1. l_new = prealloc_lru_pop(); /* Borrow from free list */ 2. l_old = lookup_elem_raw(); /* Found, key exists */ 3. pcpu_copy_value(); /* In-place update */ 4. bpf_lru_push_free(); /* Return l_new to free list */ Also add negative tests to verify that non-percpu array and hash maps reject the BPF_F_CPU and BPF_F_ALL_CPUS flags. Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260107022022.12843-8-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/percpu_alloc.c | 328 ++++++++++++++++++ .../selftests/bpf/progs/percpu_alloc_array.c | 32 ++ 2 files changed, 360 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c index 343da65864d6..c1d0949f093f 100644 --- a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c +++ b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "cgroup_helpers.h" #include "percpu_alloc_array.skel.h" #include "percpu_alloc_cgrp_local_storage.skel.h" #include "percpu_alloc_fail.skel.h" @@ -115,6 +116,321 @@ static void test_failure(void) { RUN_TESTS(percpu_alloc_fail); } +static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t key_sz, u32 entries, + int nr_cpus, bool test_batch) +{ + size_t value_sz = sizeof(u32), value_sz_cpus, value_sz_total; + u32 *values = NULL, *values_percpu = NULL; + const u32 value = 0xDEADC0DE; + int i, j, cpu, map_fd, err; + u64 batch = 0, flags; + void *values_row; + u32 count, v; + LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); + + value_sz_cpus = value_sz * nr_cpus; + values = calloc(entries, value_sz_cpus); + if (!ASSERT_OK_PTR(values, "calloc values")) + return; + + values_percpu = calloc(entries, roundup(value_sz, 8) * nr_cpus); + if (!ASSERT_OK_PTR(values_percpu, "calloc values_percpu")) { + free(values); + return; + } + + value_sz_total = value_sz_cpus * entries; + memset(values, 0, value_sz_total); + + map_fd = bpf_map__fd(map); + flags = BPF_F_CPU | BPF_F_ALL_CPUS; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu|all_cpus")) + goto out; + + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_update_elem cpu|all_cpus")) + goto out; + + flags = BPF_F_ALL_CPUS; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags all_cpus")) + goto out; + + flags = BPF_F_LOCK | BPF_F_CPU; + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_lookup_elem_flags BPF_F_LOCK")) + goto out; + + flags = BPF_F_LOCK | BPF_F_ALL_CPUS; + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_ERR(err, "bpf_map_update_elem BPF_F_LOCK")) + goto out; + + flags = (u64)nr_cpus << 32 | BPF_F_CPU; + err = bpf_map_update_elem(map_fd, keys, values, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_elem -ERANGE")) + goto out; + + err = bpf_map__update_elem(map, keys, key_sz, values, value_sz, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map__update_elem -ERANGE")) + goto out; + + err = bpf_map_lookup_elem_flags(map_fd, keys, values, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_lookup_elem_flags -ERANGE")) + goto out; + + err = bpf_map__lookup_elem(map, keys, key_sz, values, value_sz, flags); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map__lookup_elem -ERANGE")) + goto out; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + /* clear value on all cpus */ + values[0] = 0; + flags = BPF_F_ALL_CPUS; + for (i = 0; i < entries; i++) { + err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__update_elem all_cpus")) + goto out; + } + + /* update value on specified cpu */ + for (i = 0; i < entries; i++) { + values[0] = value; + flags = (u64)cpu << 32 | BPF_F_CPU; + err = bpf_map__update_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__update_elem specified cpu")) + goto out; + + /* lookup then check value on CPUs */ + for (j = 0; j < nr_cpus; j++) { + flags = (u64)j << 32 | BPF_F_CPU; + err = bpf_map__lookup_elem(map, keys + i * key_sz, key_sz, values, + value_sz, flags); + if (!ASSERT_OK(err, "bpf_map__lookup_elem specified cpu")) + goto out; + if (!ASSERT_EQ(values[0], j != cpu ? 0 : value, + "bpf_map__lookup_elem value on specified cpu")) + goto out; + } + } + } + + if (!test_batch) + goto out; + + count = entries; + batch_opts.elem_flags = (u64)nr_cpus << 32 | BPF_F_CPU; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_EQ(err, -ERANGE, "bpf_map_update_batch -ERANGE")) + goto out; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + memset(values, 0, value_sz_total); + + /* clear values across all CPUs */ + count = entries; + batch_opts.elem_flags = BPF_F_ALL_CPUS; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_OK(err, "bpf_map_update_batch all_cpus")) + goto out; + + /* update values on specified CPU */ + for (i = 0; i < entries; i++) + values[i] = value; + + count = entries; + batch_opts.elem_flags = (u64)cpu << 32 | BPF_F_CPU; + err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); + if (!ASSERT_OK(err, "bpf_map_update_batch specified cpu")) + goto out; + + /* lookup values on specified CPU */ + batch = 0; + count = entries; + memset(values, 0, entries * value_sz); + err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values, &count, &batch_opts); + if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch specified cpu")) + goto out; + + for (i = 0; i < entries; i++) + if (!ASSERT_EQ(values[i], value, + "bpf_map_lookup_batch value on specified cpu")) + goto out; + + /* lookup values from all CPUs */ + batch = 0; + count = entries; + batch_opts.elem_flags = 0; + memset(values_percpu, 0, roundup(value_sz, 8) * nr_cpus * entries); + err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values_percpu, &count, + &batch_opts); + if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch all_cpus")) + goto out; + + for (i = 0; i < entries; i++) { + values_row = (void *) values_percpu + + roundup(value_sz, 8) * i * nr_cpus; + for (j = 0; j < nr_cpus; j++) { + v = *(u32 *) (values_row + roundup(value_sz, 8) * j); + if (!ASSERT_EQ(v, j != cpu ? 0 : value, + "bpf_map_lookup_batch value all_cpus")) + goto out; + } + } + } + +out: + free(values_percpu); + free(values); +} + + +static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) +{ + struct percpu_alloc_array *skel; + size_t key_sz = sizeof(int); + int *keys, nr_cpus, i, err; + struct bpf_map *map; + u32 max_entries; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) + return; + + max_entries = nr_cpus + 1; + keys = calloc(max_entries, key_sz); + if (!ASSERT_OK_PTR(keys, "calloc keys")) + return; + + for (i = 0; i < max_entries; i++) + keys[i] = i; + + skel = percpu_alloc_array__open(); + if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open")) { + free(keys); + return; + } + + map = skel->maps.percpu; + bpf_map__set_type(map, map_type); + bpf_map__set_max_entries(map, max_entries); + + err = percpu_alloc_array__load(skel); + if (!ASSERT_OK(err, "test_percpu_alloc__load")) + goto out; + + test_percpu_map_op_cpu_flag(map, keys, key_sz, max_entries - 1, nr_cpus, true); +out: + percpu_alloc_array__destroy(skel); + free(keys); +} + +static void test_percpu_array_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_ARRAY); +} + +static void test_percpu_hash_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_PERCPU_HASH); +} + +static void test_lru_percpu_hash_cpu_flag(void) +{ + test_percpu_map_cpu_flag(BPF_MAP_TYPE_LRU_PERCPU_HASH); +} + +static void test_percpu_cgroup_storage_cpu_flag(void) +{ + struct percpu_alloc_array *skel = NULL; + struct bpf_cgroup_storage_key key; + int cgroup, prog_fd, nr_cpus, err; + struct bpf_map *map; + + nr_cpus = libbpf_num_possible_cpus(); + if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) + return; + + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + return; + + cgroup = create_and_get_cgroup("/cg_percpu"); + if (!ASSERT_GE(cgroup, 0, "create_and_get_cgroup")) { + cleanup_cgroup_environment(); + return; + } + + err = join_cgroup("/cg_percpu"); + if (!ASSERT_OK(err, "join_cgroup")) + goto out; + + skel = percpu_alloc_array__open_and_load(); + if (!ASSERT_OK_PTR(skel, "percpu_alloc_array__open_and_load")) + goto out; + + prog_fd = bpf_program__fd(skel->progs.cgroup_egress); + err = bpf_prog_attach(prog_fd, cgroup, BPF_CGROUP_INET_EGRESS, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto out; + + map = skel->maps.percpu_cgroup_storage; + err = bpf_map_get_next_key(bpf_map__fd(map), NULL, &key); + if (!ASSERT_OK(err, "bpf_map_get_next_key")) + goto out; + + test_percpu_map_op_cpu_flag(map, &key, sizeof(key), 1, nr_cpus, false); +out: + bpf_prog_detach2(-1, cgroup, BPF_CGROUP_INET_EGRESS); + close(cgroup); + cleanup_cgroup_environment(); + percpu_alloc_array__destroy(skel); +} + +static void test_map_op_cpu_flag(enum bpf_map_type map_type) +{ + u32 max_entries = 1, count = max_entries; + u64 flags, batch = 0, val = 0; + int err, map_fd, key = 0; + LIBBPF_OPTS(bpf_map_batch_opts, batch_opts); + + map_fd = bpf_map_create(map_type, "test_cpu_flag", sizeof(int), sizeof(u64), max_entries, + NULL); + if (!ASSERT_GE(map_fd, 0, "bpf_map_create")) + return; + + flags = BPF_F_ALL_CPUS; + err = bpf_map_update_elem(map_fd, &key, &val, flags); + ASSERT_ERR(err, "bpf_map_update_elem all_cpus"); + + batch_opts.elem_flags = BPF_F_ALL_CPUS; + err = bpf_map_update_batch(map_fd, &key, &val, &count, &batch_opts); + ASSERT_ERR(err, "bpf_map_update_batch all_cpus"); + + flags = BPF_F_CPU; + err = bpf_map_lookup_elem_flags(map_fd, &key, &val, flags); + ASSERT_ERR(err, "bpf_map_lookup_elem_flags cpu"); + + batch_opts.elem_flags = BPF_F_CPU; + err = bpf_map_lookup_batch(map_fd, NULL, &batch, &key, &val, &count, &batch_opts); + ASSERT_ERR(err, "bpf_map_lookup_batch cpu"); + + close(map_fd); +} + +static void test_array_cpu_flag(void) +{ + test_map_op_cpu_flag(BPF_MAP_TYPE_ARRAY); +} + +static void test_hash_cpu_flag(void) +{ + test_map_op_cpu_flag(BPF_MAP_TYPE_HASH); +} + void test_percpu_alloc(void) { if (test__start_subtest("array")) @@ -125,4 +441,16 @@ void test_percpu_alloc(void) test_cgrp_local_storage(); if (test__start_subtest("failure_tests")) test_failure(); + if (test__start_subtest("cpu_flag_percpu_array")) + test_percpu_array_cpu_flag(); + if (test__start_subtest("cpu_flag_percpu_hash")) + test_percpu_hash_cpu_flag(); + if (test__start_subtest("cpu_flag_lru_percpu_hash")) + test_lru_percpu_hash_cpu_flag(); + if (test__start_subtest("cpu_flag_percpu_cgroup_storage")) + test_percpu_cgroup_storage_cpu_flag(); + if (test__start_subtest("cpu_flag_array")) + test_array_cpu_flag(); + if (test__start_subtest("cpu_flag_hash")) + test_hash_cpu_flag(); } diff --git a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c index 37c2d2608ec0..ed6a2a93d5a5 100644 --- a/tools/testing/selftests/bpf/progs/percpu_alloc_array.c +++ b/tools/testing/selftests/bpf/progs/percpu_alloc_array.c @@ -187,4 +187,36 @@ int BPF_PROG(test_array_map_10) return 0; } +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 2); + __type(key, int); + __type(value, u32); +} percpu SEC(".maps"); + +SEC("?fentry/bpf_fentry_test1") +int BPF_PROG(test_percpu_array, int x) +{ + u64 value = 0xDEADC0DE; + int key = 0; + + bpf_map_update_elem(&percpu, &key, &value, BPF_ANY); + return 0; +} + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, u32); +} percpu_cgroup_storage SEC(".maps"); + +SEC("cgroup_skb/egress") +int cgroup_egress(struct __sk_buff *skb) +{ + u32 *val = bpf_get_local_storage(&percpu_cgroup_storage, 0); + + *val = 1; + return 1; +} + char _license[] SEC("license") = "GPL"; From 2421649778dca8fe6e7b166905e97278aa0fdf58 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 6 Jan 2026 15:44:20 -0700 Subject: [PATCH 079/268] scripts/gen-btf.sh: Ensure initial object in gen_btf_o is ELF with correct endianness After commit 600605853f87 ("scripts/gen-btf.sh: Fix .btf.o generation when compiling for RISCV"), there is an error from llvm-objcopy when CONFIG_LTO_CLANG is enabled: llvm-objcopy: error: '.tmp_vmlinux1.btf.o': The file was not recognized as a valid object file Failed to generate BTF for vmlinux KBUILD_CFLAGS includes CC_FLAGS_LTO, which makes clang emit an LLVM IR object, rather than an ELF one as expected by llvm-objcopy. Most areas of the kernel deal with this by filtering out CC_FLAGS_LTO from KBUILD_CFLAGS for the particular object or directory but this is not so easy to do in bash. Just include '-fno-lto' after KBUILD_CFLAGS to ensure an ELF object is consistently created as the initial .o file. Additionally, while there is no reported or discovered bug yet, the absence of KBUILD_CPPFLAGS from this command could result in incorrect endianness because KBUILD_CPPFLAGS typically contains '-mbig-endian' and '-mlittle-endian' so that biendian toolchains can be used. Include it in this ${CC} command to hopefully limit necessary changes to this command for the foreseeable future. Fixes: 600605853f87 ("scripts/gen-btf.sh: Fix .btf.o generation when compiling for RISCV") Signed-off-by: Nathan Chancellor Acked-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260106-fix-gen-btf-sh-lto-v2-1-01d3e1c241c4@kernel.org Signed-off-by: Alexei Starovoitov --- scripts/gen-btf.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh index d6457661b9b6..be21ccee3487 100755 --- a/scripts/gen-btf.sh +++ b/scripts/gen-btf.sh @@ -87,7 +87,7 @@ gen_btf_o() # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all # deletes all symbols including __start_BTF and __stop_BTF, which will # be redefined in the linker script. - echo "" | ${CC} ${CLANG_FLAGS} ${KBUILD_CFLAGS} -c -x c -o ${btf_data} - + echo "" | ${CC} ${CLANG_FLAGS} ${KBUILD_CPPFLAGS} ${KBUILD_CFLAGS} -fno-lto -c -x c -o ${btf_data} - ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF \ --set-section-flags .BTF=alloc,readonly ${btf_data} ${OBJCOPY} --only-section=.BTF --strip-all ${btf_data} From 97fb54d86d2194ea8a4cbe6cf074e6ba47b054ea Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 6 Jan 2026 18:36:49 +0100 Subject: [PATCH 080/268] bpf: adapt selftests to GCC 16 -Wunused-but-set-variable GCC 16 has changed the semantics of -Wunused-but-set-variable, as well as introducing new options -Wunused-but-set-variable={0,1,2,3} to adjust the level of support. One of the changes is that GCC now treats 'sum += 1' and 'sum++' as non-usage, whereas clang (and GCC < 16) considers the first as usage and the second as non-usage, which is sort of inconsistent. The GCC 16 -Wunused-but-set-variable=2 option implements the previous semantics of -Wunused-but-set-variable, but since it is a new option, it cannot be used unconditionally for forward-compatibility, just for backwards-compatibility. So this patch adds pragmas to the two self-tests impacted by this, progs/free_timer.c and progs/rcu_read_lock.c, to make gcc to ignore -Wunused-but-set-variable warnings when compiling them with GCC > 15. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=44677#c25 for details on why this regression got introduced in GCC upstream. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Link: https://lore.kernel.org/r/20260106173650.18191-2-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/free_timer.c | 10 ++++++++++ tools/testing/selftests/bpf/progs/rcu_read_lock.c | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/free_timer.c b/tools/testing/selftests/bpf/progs/free_timer.c index 4501ae8fc414..eccb2d47db43 100644 --- a/tools/testing/selftests/bpf/progs/free_timer.c +++ b/tools/testing/selftests/bpf/progs/free_timer.c @@ -7,6 +7,16 @@ #define MAX_ENTRIES 8 +/* clang considers 'sum += 1' as usage but 'sum++' as non-usage. GCC + * is more consistent and considers both 'sum += 1' and 'sum++' as + * non-usage. This triggers warnings in the functions below. + * + * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to + * mimic clang's behavior. */ +#if !defined(__clang__) && __GNUC__ > 15 +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + struct map_value { struct bpf_timer timer; }; diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index d70c28824bbe..b4e073168fb1 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -7,6 +7,16 @@ #include "bpf_tracing_net.h" #include "bpf_misc.h" +/* clang considers 'sum += 1' as usage but 'sum++' as non-usage. GCC + * is more consistent and considers both 'sum += 1' and 'sum++' as + * non-usage. This triggers warnings in the functions below. + * + * Starting with GCC 16 -Wunused-but-set-variable=2 can be used to + * mimic clang's behavior. */ +#if !defined(__clang__) && __GNUC__ > 15 +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif + char _license[] SEC("license") = "GPL"; struct { From 681600647c59050546939da5e490c736e567fe91 Mon Sep 17 00:00:00 2001 From: "Jose E. Marchesi" Date: Tue, 6 Jan 2026 18:36:50 +0100 Subject: [PATCH 081/268] bpf: GCC requires function attributes before the declarator GCC insists in placing attributes before the declarators in function declarations. Now that GCC supports btf_decl_tag and therefore __tag1 and __tag2 expand to actual attributes, the compiler is complaining about it for static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2 progs/test_btf_decl_tag.c:36:1: error: attributes should be specified \ before the declarator in a function definition This patch simply places the tags before the declarator. Signed-off-by: Jose E. Marchesi Cc: david.faust@oracle.com Cc: cupertino.miranda@oracle.com Cc: Eduard Zingerman Cc: Yonghong Song Link: https://lore.kernel.org/r/20260106173650.18191-3-jose.marchesi@oracle.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/test_btf_decl_tag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c index c88ccc53529a..0c3df19626cb 100644 --- a/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c +++ b/tools/testing/selftests/bpf/progs/test_btf_decl_tag.c @@ -33,7 +33,7 @@ struct { } hashmap1 SEC(".maps"); -static __noinline int foo(int x __tag1 __tag2) __tag1 __tag2 +static __noinline __tag1 __tag2 int foo(int x __tag1 __tag2) { struct key_t key; value_t val = {}; From 4effccde0a0521b220c3585c9a0d8e677d345209 Mon Sep 17 00:00:00 2001 From: WanLi Niu Date: Tue, 6 Jan 2026 10:31:23 +0800 Subject: [PATCH 082/268] bpftool: Make skeleton C++ compatible with explicit casts Fix C++ compilation errors in generated skeleton by adding explicit pointer casts and use char * subtraction for offset calculation error: invalid conversion from 'void*' to '*' [-fpermissive] | skel = skel_alloc(sizeof(*skel)); | ~~~~~~~~~~^~~~~~~~~~~~~~~ | | | void* error: arithmetic on pointers to void | skel->ctx.sz = (void *)&skel->links - (void *)skel; | ~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~ error: assigning to 'struct __ *' from incompatible type 'void *' | skel-> = skel_prep_map_data((void *)data, 4096, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | sizeof(data) - 1); | ~~~~~~~~~~~~~~~~~ error: assigning to 'struct __ *' from incompatible type 'void *' | skel-> = skel_finalize_map_data(&skel->maps..initial_value, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 4096, PROT_READ | PROT_WRITE, skel->maps..map_fd); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Minimum reproducer: $ cat test.bpf.c int val; // placed in .bss section #include "vmlinux.h" #include SEC("raw_tracepoint/sched_wakeup_new") int handle(void *ctx) { return 0; } $ cat test.cpp #include extern "C" { #include "test.bpf.skel.h" } $ bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h $ clang -g -O2 -target bpf -c test.bpf.c -o test.bpf.o $ bpftool gen skeleton test.bpf.o -L > test.bpf.skel.h $ g++ -c test.cpp -I. Co-developed-by: Menglong Dong Signed-off-by: WanLi Niu Signed-off-by: Menglong Dong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260106023123.2928-1-kiraskyler@163.com --- tools/bpf/bpftool/gen.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 993c7d9484a4..2f9e10752e28 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -731,10 +731,10 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h { \n\ struct %1$s *skel; \n\ \n\ - skel = skel_alloc(sizeof(*skel)); \n\ + skel = (struct %1$s *)skel_alloc(sizeof(*skel)); \n\ if (!skel) \n\ goto cleanup; \n\ - skel->ctx.sz = (void *)&skel->links - (void *)skel; \n\ + skel->ctx.sz = (char *)&skel->links - (char *)skel; \n\ ", obj_name, opts.data_sz); bpf_object__for_each_map(map, obj) { @@ -755,7 +755,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h \n\ \"; \n\ \n\ - skel->%1$s = skel_prep_map_data((void *)data, %2$zd,\n\ + skel->%1$s = (__typeof__(skel->%1$s))skel_prep_map_data((void *)data, %2$zd,\n\ sizeof(data) - 1);\n\ if (!skel->%1$s) \n\ goto cleanup; \n\ @@ -857,7 +857,7 @@ static int gen_trace(struct bpf_object *obj, const char *obj_name, const char *h codegen("\ \n\ - skel->%1$s = skel_finalize_map_data(&skel->maps.%1$s.initial_value, \n\ + skel->%1$s = (__typeof__(skel->%1$s))skel_finalize_map_data(&skel->maps.%1$s.initial_value,\n\ %2$zd, %3$s, skel->maps.%1$s.map_fd);\n\ if (!skel->%1$s) \n\ return -ENOMEM; \n\ From 5714ca8cba5ed736f3733663c446cbee63a10a64 Mon Sep 17 00:00:00 2001 From: Varun R Mallya Date: Wed, 7 Jan 2026 05:05:27 +0530 Subject: [PATCH 083/268] libbpf: Fix OOB read in btf_dump_get_bitfield_value When dumping bitfield data, btf_dump_get_bitfield_value() reads data based on the underlying type's size (t->size). However, it does not verify that the provided data buffer (data_sz) is large enough to contain these bytes. If btf_dump__dump_type_data() is called with a buffer smaller than the type's size, this leads to an out-of-bounds read. This was confirmed by AddressSanitizer in the linked issue. Fix this by ensuring we do not read past the provided data_sz limit. Fixes: a1d3cc3c5eca ("libbpf: Avoid use of __int128 in typed dump display") Reported-by: Harrison Green Suggested-by: Alan Maguire Signed-off-by: Varun R Mallya Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260106233527.163487-1-varunrmallya@gmail.com Closes: https://github.com/libbpf/libbpf/issues/928 --- tools/lib/bpf/btf_dump.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 6388392f49a0..53c6624161d7 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1762,9 +1762,18 @@ static int btf_dump_get_bitfield_value(struct btf_dump *d, __u16 left_shift_bits, right_shift_bits; const __u8 *bytes = data; __u8 nr_copy_bits; + __u8 start_bit, nr_bytes; __u64 num = 0; int i; + /* Calculate how many bytes cover the bitfield */ + start_bit = bits_offset % 8; + nr_bytes = (start_bit + bit_sz + 7) / 8; + + /* Bound check */ + if (data + nr_bytes > d->typed_dump->data_end) + return -E2BIG; + /* Maximum supported bitfield size is 64 bits */ if (t->size > 8) { pr_warn("unexpected bitfield size %d\n", t->size); From b40a5d724f29fc2eed23ff353808a9aae616b48a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Sat, 10 Jan 2026 08:25:50 +0000 Subject: [PATCH 084/268] bpf: crypto: Use the correct destructor kfunc type With CONFIG_CFI enabled, the kernel strictly enforces that indirect function calls use a function pointer type that matches the target function. I ran into the following type mismatch when running BPF self-tests: CFI failure at bpf_obj_free_fields+0x190/0x238 (target: bpf_crypto_ctx_release+0x0/0x94; expected type: 0xa488ebfc) Internal error: Oops - CFI: 00000000f2008228 [#1] SMP ... As bpf_crypto_ctx_release() is also used in BPF programs and using a void pointer as the argument would make the verifier unhappy, add a simple stub function with the correct type and register it as the destructor kfunc instead. Signed-off-by: Sami Tolvanen Acked-by: Yonghong Song Tested-by: Viktor Malik Link: https://lore.kernel.org/r/20260110082548.113748-7-samitolvanen@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/crypto.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 1ab79a6dec84..7e75a1936256 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -261,6 +261,12 @@ __bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx) call_rcu(&ctx->rcu, crypto_free_cb); } +__bpf_kfunc void bpf_crypto_ctx_release_dtor(void *ctx) +{ + bpf_crypto_ctx_release(ctx); +} +CFI_NOSEAL(bpf_crypto_ctx_release_dtor); + static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, const struct bpf_dynptr_kern *src, const struct bpf_dynptr_kern *dst, @@ -368,7 +374,7 @@ static const struct btf_kfunc_id_set crypt_kfunc_set = { BTF_ID_LIST(bpf_crypto_dtor_ids) BTF_ID(struct, bpf_crypto_ctx) -BTF_ID(func, bpf_crypto_ctx_release) +BTF_ID(func, bpf_crypto_ctx_release_dtor) static int __init crypto_kfunc_init(void) { From c99d97b46631c4bea0c14b7581b7a59214601e63 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Sat, 10 Jan 2026 08:25:51 +0000 Subject: [PATCH 085/268] bpf: net_sched: Use the correct destructor kfunc type With CONFIG_CFI enabled, the kernel strictly enforces that indirect function calls use a function pointer type that matches the target function. As bpf_kfree_skb() signature differs from the btf_dtor_kfunc_t pointer type used for the destructor calls in bpf_obj_free_fields(), add a stub function with the correct type to fix the type mismatch. Signed-off-by: Sami Tolvanen Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260110082548.113748-8-samitolvanen@google.com Signed-off-by: Alexei Starovoitov --- net/sched/bpf_qdisc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index b9771788b9b3..098ca02aed89 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -202,6 +202,12 @@ __bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb) kfree_skb(skb); } +__bpf_kfunc void bpf_kfree_skb_dtor(void *skb) +{ + bpf_kfree_skb(skb); +} +CFI_NOSEAL(bpf_kfree_skb_dtor); + /* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list. * @skb: The skb whose reference to be released and dropped. * @to_free_list: The list of skbs to be dropped. @@ -449,7 +455,7 @@ static struct bpf_struct_ops bpf_Qdisc_ops = { .owner = THIS_MODULE, }; -BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb) +BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb_dtor) static int __init bpf_qdisc_kfunc_init(void) { From ba7f1024a1024f219a18c40b4ab2d7d900fd2d15 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Sat, 10 Jan 2026 08:25:52 +0000 Subject: [PATCH 086/268] selftests/bpf: Use the correct destructor kfunc type With CONFIG_CFI enabled, the kernel strictly enforces that indirect function calls use a function pointer type that matches the target function. As bpf_testmod_ctx_release() signature differs from the btf_dtor_kfunc_t pointer type used for the destructor calls in bpf_obj_free_fields(), add a stub function with the correct type to fix the type mismatch. Signed-off-by: Sami Tolvanen Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260110082548.113748-9-samitolvanen@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 1c41d03bd5a1..bc07ce9d5477 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -285,6 +285,12 @@ __bpf_kfunc void bpf_testmod_ctx_release(struct bpf_testmod_ctx *ctx) call_rcu(&ctx->rcu, testmod_free_cb); } +__bpf_kfunc void bpf_testmod_ctx_release_dtor(void *ctx) +{ + bpf_testmod_ctx_release(ctx); +} +CFI_NOSEAL(bpf_testmod_ctx_release_dtor); + static struct bpf_testmod_ops3 *st_ops3; static int bpf_testmod_test_3(void) @@ -707,7 +713,7 @@ BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) BTF_ID_LIST(bpf_testmod_dtor_ids) BTF_ID(struct, bpf_testmod_ctx) -BTF_ID(func, bpf_testmod_ctx_release) +BTF_ID(func, bpf_testmod_ctx_release_dtor) static const struct btf_kfunc_id_set bpf_testmod_common_kfunc_set = { .owner = THIS_MODULE, From 99fde4d0626176d03cea35c64a063df73816e64d Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Sat, 10 Jan 2026 08:25:53 +0000 Subject: [PATCH 087/268] bpf, btf: Enforce destructor kfunc type with CFI Ensure that registered destructor kfuncs have the same type as btf_dtor_kfunc_t to avoid a kernel panic on systems with CONFIG_CFI enabled. Signed-off-by: Sami Tolvanen Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260110082548.113748-10-samitolvanen@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 539c9fdea41d..2c6076fc29b9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8846,6 +8846,13 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc */ if (!t || !btf_type_is_ptr(t)) return -EINVAL; + + if (IS_ENABLED(CONFIG_CFI_CLANG)) { + /* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */ + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_void(t)) + return -EINVAL; + } } return 0; } From 7af3339948601e188d93d7e03326aeb8fcbd6bea Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 13 Jan 2026 13:48:26 +0000 Subject: [PATCH 088/268] bpf: Consistently use reg_state() for register access in the verifier Replace the pattern of declaring a local regs array from cur_regs() and then indexing into it with the more concise reg_state() helper. This simplifies the code by eliminating intermediate variables and makes register access more consistent throughout the verifier. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20260113134826.2214860-1-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53635ea2e41b..02a43cafbb25 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5654,8 +5654,8 @@ static int check_stack_write(struct bpf_verifier_env *env, static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, int off, int size, enum bpf_access_type type) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_map *map = regs[regno].map_ptr; + struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_map *map = reg->map_ptr; u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { @@ -6168,8 +6168,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err; /* We may have added a variable offset to the packet pointer; but any @@ -6256,8 +6255,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int size, enum bpf_access_type t) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_insn_access_aux info = {}; bool valid; @@ -7453,8 +7451,7 @@ static int check_stack_access_within_bounds( int regno, int off, int access_size, enum bpf_access_type type) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); s64 min_off, max_off; int err; @@ -8408,7 +8405,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); bool is_irq = flags & PROCESS_LOCK_IRQ; @@ -8524,7 +8521,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, enum btf_field_type field_type) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; @@ -8571,7 +8568,7 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, static int process_timer_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; int err; @@ -8595,7 +8592,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, static int process_wq_func(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; int err; @@ -8616,7 +8613,7 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, static int process_task_work_func(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; int err; @@ -8636,7 +8633,7 @@ static int process_task_work_func(struct bpf_verifier_env *env, int regno, static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); struct btf_field *kptr_field; struct bpf_map *map_ptr; struct btf_record *rec; @@ -8709,7 +8706,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, enum bpf_arg_type arg_type, int clone_ref_obj_id) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { @@ -8829,7 +8826,7 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); const struct btf_type *t; int spi, err, i, nr_slots, btf_id; @@ -9301,7 +9298,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; int i, j; @@ -9714,7 +9711,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, int insn_idx) { u32 regno = BPF_REG_1 + arg; - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; @@ -11247,7 +11244,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; - struct bpf_reg_state *regs = cur_regs(env), *reg; + struct bpf_reg_state *reg; struct bpf_map *map = meta->map_ptr; u64 val, max; int err; @@ -11259,7 +11256,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return -EINVAL; } - reg = ®s[BPF_REG_3]; + reg = reg_state(env, BPF_REG_3); val = reg->var_off.value; max = map->max_entries; @@ -11405,8 +11402,7 @@ static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) static bool loop_flag_is_zero(struct bpf_verifier_env *env) { - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[BPF_REG_4]; + struct bpf_reg_state *reg = reg_state(env, BPF_REG_4); bool reg_is_null = register_is_null(reg); if (reg_is_null) @@ -12668,7 +12664,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int process_irq_flag(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_reg_state *reg = reg_state(env, regno); int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; From 2465a08d433dd4ae0c4eecdb8e79c54b7c5e5a55 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Jan 2026 22:10:23 -0800 Subject: [PATCH 089/268] selftests/bpf: Fix dmabuf_iter/lots_of_buffers failure with 64K page On arm64 with 64K page , I observed the following test failure: ... subtest_dmabuf_iter_check_lots_of_buffers:FAIL:total_bytes_read unexpected total_bytes_read: actual 4696 <= expected 65536 #97/3 dmabuf_iter/lots_of_buffers:FAIL With 4K page on x86, the total_bytes_read is 4593. With 64K page on arm64, the total_byte_read is 4696. In progs/dmabuf_iter.c, for each iteration, the output is BPF_SEQ_PRINTF(seq, "%lu\n%llu\n%s\n%s\n", inode, size, name, exporter); The only difference between 4K and 64K page is 'size' in the above BPF_SEQ_PRINTF. The 4K page will output '4096' and the 64K page will output '65536'. So the total_bytes_read with 64K page is slighter greater than 4K page. Adjusting the total_bytes_read from 65536 to 4096 fixed the issue. Cc: T.J. Mercier Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260113061023.3798085-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c index e442be9dde7e..fb2cea710db3 100644 --- a/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/dmabuf_iter.c @@ -233,7 +233,7 @@ static void subtest_dmabuf_iter_check_lots_of_buffers(struct dmabuf_iter *skel) while ((bytes_read = read(iter_fd, buf, sizeof(buf))) > 0) total_bytes_read += bytes_read; - ASSERT_GT(total_bytes_read, getpagesize(), "total_bytes_read"); + ASSERT_GT(total_bytes_read, 4096, "total_bytes_read"); close(iter_fd); } From d2f7cd20a7c7742b83d2c899044d4f2a851a4a7d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Jan 2026 22:10:28 -0800 Subject: [PATCH 090/268] selftests/bpf: Fix sk_bypass_prot_mem failure with 64K page The current selftest sk_bypass_prot_mem only supports 4K page. When running with 64K page on arm64, the following failure happens: ... check_bypass:FAIL:no bypass unexpected no bypass: actual 3 <= expected 32 ... #385/1 sk_bypass_prot_mem/TCP :FAIL ... check_bypass:FAIL:no bypass unexpected no bypass: actual 4 <= expected 32 ... #385/2 sk_bypass_prot_mem/UDP :FAIL ... Adding support to 64K page as well fixed the failure. Cc: Kuniyuki Iwashima Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20260113061028.3798326-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c index e4940583924b..e2c867fd5244 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_bypass_prot_mem.c @@ -5,9 +5,14 @@ #include "sk_bypass_prot_mem.skel.h" #include "network_helpers.h" +#ifndef PAGE_SIZE +#include +#define PAGE_SIZE getpagesize() +#endif + #define NR_PAGES 32 #define NR_SOCKETS 2 -#define BUF_TOTAL (NR_PAGES * 4096 / NR_SOCKETS) +#define BUF_TOTAL (NR_PAGES * PAGE_SIZE / NR_SOCKETS) #define BUF_SINGLE 1024 #define NR_SEND (BUF_TOTAL / BUF_SINGLE) From 951d79017e8a0af6db4a167a89746b4a0924626b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 12 Jan 2026 22:10:33 -0800 Subject: [PATCH 091/268] selftests/bpf: Fix verifier_arena_globals1 failure with 64K page With 64K page on arm64, verifier_arena_globals1 failed like below: ... libbpf: map 'arena': failed to create: -E2BIG ... #509/1 verifier_arena_globals1/check_reserve1:FAIL ... For 64K page, if the number of arena pages is (1UL << 20), the total memory will exceed 4G and this will cause map creation failure. Adjusting ARENA_PAGES based on the actual page size fixed the problem. Cc: Emil Tsalapatis Signed-off-by: Yonghong Song Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260113061033.3798549-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena_globals1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c index 14afef3d6442..83182ddbfb95 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena_globals1.c @@ -9,7 +9,7 @@ #include "bpf_arena_common.h" #include "bpf_misc.h" -#define ARENA_PAGES (1UL<< (32 - 12)) +#define ARENA_PAGES (1UL<< (32 - __builtin_ffs(__PAGE_SIZE) + 1)) #define GLOBAL_PAGES (16) struct { From bffacdb80b93b7b5e96b26fad64cc490a6c7d6c7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 12 Jan 2026 12:13:57 -0800 Subject: [PATCH 092/268] bpf: Recognize special arithmetic shift in the verifier cilium bpf_wiregard.bpf.c when compiled with -O1 fails to load with the following verifier log: 192: (79) r2 = *(u64 *)(r10 -304) ; R2=pkt(r=40) R10=fp0 fp-304=pkt(r=40) ... 227: (85) call bpf_skb_store_bytes#9 ; R0=scalar() 228: (bc) w2 = w0 ; R0=scalar() R2=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) 229: (c4) w2 s>>= 31 ; R2=scalar(smin=0,smax=umax=0xffffffff,smin32=-1,smax32=0,var_off=(0x0; 0xffffffff)) 230: (54) w2 &= -134 ; R2=scalar(smin=0,smax=umax=umax32=0xffffff7a,smax32=0x7fffff7a,var_off=(0x0; 0xffffff7a)) ... 232: (66) if w2 s> 0xffffffff goto pc+125 ; R2=scalar(smin=umin=umin32=0x80000000,smax=umax=umax32=0xffffff7a,smax32=-134,var_off=(0x80000000; 0x7fffff7a)) ... 238: (79) r4 = *(u64 *)(r10 -304) ; R4=scalar() R10=fp0 fp-304=scalar() 239: (56) if w2 != 0xffffff78 goto pc+210 ; R2=0xffffff78 // -136 ... 258: (71) r1 = *(u8 *)(r4 +0) R4 invalid mem access 'scalar' The error might confuse most bpf authors, since fp-304 slot had 'pkt' pointer at insn 192 and became 'scalar' at 238. That happened because bpf_skb_store_bytes() clears all packet pointers including those in the stack. On the first glance it might look like a bug in the source code, since ctx->data pointer should have been reloaded after the call to bpf_skb_store_bytes(). The relevant part of cilium source code looks like this: // bpf/lib/nodeport.h int dsr_set_ipip6() { if (ctx_adjust_hroom(...)) return DROP_INVALID; // -134 if (ctx_store_bytes(...)) return DROP_WRITE_ERROR; // -141 return 0; } bool dsr_fail_needs_reply(int code) { if (code == DROP_FRAG_NEEDED) // -136 return true; return false; } tail_nodeport_ipv6_dsr() { ret = dsr_set_ipip6(...); if (!IS_ERR(ret)) { ... } else { if (dsr_fail_needs_reply(ret)) return dsr_reply_icmp6(...); } } The code doesn't have arithmetic shift by 31 and it reloads ctx->data every time it needs to access it. So it's not a bug in the source code. The reason is DAGCombiner::foldSelectCCToShiftAnd() LLVM transformation: // If this is a select where the false operand is zero and the compare is a // check of the sign bit, see if we can perform the "gzip trick": // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A The conditional branch in dsr_set_ipip6() and its return values are optimized into BPF_ARSH plus BPF_AND: 227: (85) call bpf_skb_store_bytes#9 228: (bc) w2 = w0 229: (c4) w2 s>>= 31 ; R2=scalar(smin=0,smax=umax=0xffffffff,smin32=-1,smax32=0,var_off=(0x0; 0xffffffff)) 230: (54) w2 &= -134 ; R2=scalar(smin=0,smax=umax=umax32=0xffffff7a,smax32=0x7fffff7a,var_off=(0x0; 0xffffff7a)) after insn 230 the register w2 can only be 0 or -134, but the verifier approximates it, since there is no way to represent two scalars in bpf_reg_state. After fallthough at insn 232 the w2 can only be -134, hence the branch at insn 239: (56) if w2 != -136 goto pc+210 should be always taken, and trapping insn 258 should never execute. LLVM generated correct code, but the verifier follows impossible path and rejects valid program. To fix this issue recognize this special LLVM optimization and fork the verifier state. So after insn 229: (c4) w2 s>>= 31 the verifier has two states to explore: one with w2 = 0 and another with w2 = 0xffffffff which makes the verifier accept bpf_wiregard.c A similar pattern exists were OR operation is used in place of the AND operation, the verifier detects that pattern as well by forking the state before the OR operation with a scalar in range [-1,0]. Note there are 20+ such patterns in bpf_wiregard.o compiled with -O1 and -O2, but they're rarely seen in other production bpf programs, so push_stack() approach is not a concern. Reported-by: Hao Sun Signed-off-by: Alexei Starovoitov Co-developed-by: Puranjay Mohan Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260112201424.816836-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02a43cafbb25..9d3ad2876d8f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15491,6 +15491,35 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, } } +static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_reg_state *dst_reg) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + bool alu32; + + if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0) + alu32 = false; + else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0) + alu32 = true; + else + return 0; + + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + + regs = branch->frame[branch->curframe]->regs; + if (alu32) { + __mark_reg32_known(®s[insn->dst_reg], 0); + __mark_reg32_known(dst_reg, -1ull); + } else { + __mark_reg_known(®s[insn->dst_reg], 0); + __mark_reg_known(dst_reg, -1ull); + } + return 0; +} + /* WARNING: This function does calculations on 64-bit values, but the actual * execution may occur on 32-bit values. Therefore, things like bitshifts * need extra checks in the 32-bit case. @@ -15553,11 +15582,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_mul(dst_reg, &src_reg); break; case BPF_AND: + if (tnum_is_const(src_reg.var_off)) { + ret = maybe_fork_scalars(env, insn, dst_reg); + if (ret) + return ret; + } dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); scalar32_min_max_and(dst_reg, &src_reg); scalar_min_max_and(dst_reg, &src_reg); break; case BPF_OR: + if (tnum_is_const(src_reg.var_off)) { + ret = maybe_fork_scalars(env, insn, dst_reg); + if (ret) + return ret; + } dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); scalar32_min_max_or(dst_reg, &src_reg); scalar_min_max_or(dst_reg, &src_reg); From 9160335317cb404f54ad2f509546c666ddd4d0eb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 12 Jan 2026 12:13:58 -0800 Subject: [PATCH 093/268] selftests/bpf: Add tests for s>>=31 and s>>=63 Add tests for special arithmetic shift right. Signed-off-by: Alexei Starovoitov Co-developed-by: Puranjay Mohan Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260112201424.816836-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_subreg.c | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_subreg.c b/tools/testing/selftests/bpf/progs/verifier_subreg.c index b3e1c3eef9ae..be328100ba53 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subreg.c +++ b/tools/testing/selftests/bpf/progs/verifier_subreg.c @@ -738,4 +738,89 @@ __naked void ldx_w_zero_extend_check(void) : __clobber_all); } +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_31_and(void) +{ + /* Below is what LLVM generates in cilium's bpf_wiregard.o */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w2 = w0; \ + w2 s>>= 31; \ + w2 &= -134; /* w2 becomes 0 or -134 */ \ + if w2 s> -1 goto +2; \ + /* Branch always taken because w2 = -134 */ \ + if w2 != -136 goto +1; \ + w0 /= 0; \ + w0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_63_and(void) +{ + /* Copy of arsh_31 with s/w/r/ */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r2 = r0; \ + r2 <<= 32; \ + r2 s>>= 63; \ + r2 &= -134; \ + if r2 s> -1 goto +2; \ + /* Branch always taken because w2 = -134 */ \ + if r2 != -136 goto +1; \ + r0 /= 0; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_31_or(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w2 = w0; \ + w2 s>>= 31; \ + w2 |= 134; /* w2 becomes -1 or 134 */ \ + if w2 s> -1 goto +2; \ + /* Branch always taken because w2 = -1 */ \ + if w2 == -1 goto +1; \ + w0 /= 0; \ + w0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success __success_unpriv __retval(0) +__naked void arsh_63_or(void) +{ + /* Copy of arsh_31 with s/w/r/ */ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r2 = r0; \ + r2 <<= 32; \ + r2 s>>= 63; \ + r2 |= 134; /* r2 becomes -1 or 134 */ \ + if r2 s> -1 goto +2; \ + /* Branch always taken because w2 = -1 */ \ + if r2 == -1 goto +1; \ + r0 /= 0; \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From c9c9f6bf7fbcec4bc1cba845633e48035b883630 Mon Sep 17 00:00:00 2001 From: Song Chen Date: Mon, 5 Jan 2026 23:50:09 +0800 Subject: [PATCH 094/268] bpf: Remove an unused parameter in check_func_proto The func_id parameter is not needed in check_func_proto. This patch removes it. Signed-off-by: Song Chen Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260105155009.4581-1-chensong_2000@189.cn --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9d3ad2876d8f..092003cc7841 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10346,7 +10346,7 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn, int func_id) +static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && @@ -11521,7 +11521,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn, func_id); + err = check_func_proto(fn); if (err) { verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; From 6fbf129c49905e9e34801b362c9c30ae383d7a90 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:53 +0800 Subject: [PATCH 095/268] libbpf: Add BTF permutation support for type reordering Introduce btf__permute() API to allow in-place rearrangement of BTF types. This function reorganizes BTF type order according to a provided array of type IDs, updating all type references to maintain consistency. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-2-dolinux.peng@gmail.com --- tools/lib/bpf/btf.c | 133 +++++++++++++++++++++++++++++++++++++++ tools/lib/bpf/btf.h | 42 +++++++++++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 176 insertions(+) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index b136572e889a..bf75f770d29a 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -5887,3 +5887,136 @@ int btf__relocate(struct btf *btf, const struct btf *base_btf) btf->owns_base = false; return libbpf_err(err); } + +struct btf_permute { + struct btf *btf; + __u32 *id_map; + __u32 start_offs; +}; + +/* Callback function to remap individual type ID references */ +static int btf_permute_remap_type_id(__u32 *type_id, void *ctx) +{ + struct btf_permute *p = ctx; + __u32 new_id = *type_id; + + /* refer to the base BTF or VOID type */ + if (new_id < p->btf->start_id) + return 0; + + if (new_id >= btf__type_cnt(p->btf)) + return -EINVAL; + + *type_id = p->id_map[new_id - p->btf->start_id + p->start_offs]; + return 0; +} + +int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt, + const struct btf_permute_opts *opts) +{ + struct btf_permute p; + struct btf_ext *btf_ext; + void *nt, *new_types = NULL; + __u32 *order_map = NULL; + int err = 0, i; + __u32 n, id, start_offs = 0; + + if (!OPTS_VALID(opts, btf_permute_opts)) + return libbpf_err(-EINVAL); + + if (btf__base_btf(btf)) { + n = btf->nr_types; + } else { + if (id_map[0] != 0) + return libbpf_err(-EINVAL); + n = btf__type_cnt(btf); + start_offs = 1; + } + + if (id_map_cnt != n) + return libbpf_err(-EINVAL); + + /* record the sequence of types */ + order_map = calloc(id_map_cnt, sizeof(*id_map)); + if (!order_map) { + err = -ENOMEM; + goto done; + } + + new_types = calloc(btf->hdr->type_len, 1); + if (!new_types) { + err = -ENOMEM; + goto done; + } + + if (btf_ensure_modifiable(btf)) { + err = -ENOMEM; + goto done; + } + + for (i = start_offs; i < id_map_cnt; i++) { + id = id_map[i]; + if (id < btf->start_id || id >= btf__type_cnt(btf)) { + err = -EINVAL; + goto done; + } + id -= btf->start_id - start_offs; + /* cannot be mapped to the same ID */ + if (order_map[id]) { + err = -EINVAL; + goto done; + } + order_map[id] = i + btf->start_id - start_offs; + } + + p.btf = btf; + p.id_map = id_map; + p.start_offs = start_offs; + nt = new_types; + for (i = start_offs; i < id_map_cnt; i++) { + struct btf_field_iter it; + const struct btf_type *t; + __u32 *type_id; + int type_size; + + id = order_map[i]; + t = btf__type_by_id(btf, id); + type_size = btf_type_size(t); + memcpy(nt, t, type_size); + + /* fix up referenced IDs for BTF */ + err = btf_field_iter_init(&it, nt, BTF_FIELD_ITER_IDS); + if (err) + goto done; + while ((type_id = btf_field_iter_next(&it))) { + err = btf_permute_remap_type_id(type_id, &p); + if (err) + goto done; + } + + nt += type_size; + } + + /* fix up referenced IDs for btf_ext */ + btf_ext = OPTS_GET(opts, btf_ext, NULL); + if (btf_ext) { + err = btf_ext_visit_type_ids(btf_ext, btf_permute_remap_type_id, &p); + if (err) + goto done; + } + + for (nt = new_types, i = 0; i < id_map_cnt - start_offs; i++) { + btf->type_offs[i] = nt - new_types; + nt += btf_type_size(nt); + } + + free(order_map); + free(btf->types_data); + btf->types_data = new_types; + return 0; + +done: + free(order_map); + free(new_types); + return libbpf_err(err); +} diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index cc01494d6210..b30008c267c0 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -281,6 +281,48 @@ LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); */ LIBBPF_API int btf__relocate(struct btf *btf, const struct btf *base_btf); +struct btf_permute_opts { + size_t sz; + /* optional .BTF.ext info along the main BTF info */ + struct btf_ext *btf_ext; + size_t :0; +}; +#define btf_permute_opts__last_field btf_ext + +/** + * @brief **btf__permute()** rearranges BTF types in-place according to a specified ID mapping + * @param btf BTF object to permute + * @param id_map Array mapping original type IDs to new IDs + * @param id_map_cnt Number of elements in @id_map + * @param opts Optional parameters, including BTF extension data for reference updates + * @return 0 on success, negative error code on failure + * + * **btf__permute()** reorders BTF types based on the provided @id_map array, + * updating all internal type references to maintain consistency. The function + * operates in-place, modifying the BTF object directly. + * + * For **base BTF**: + * - @id_map must include all types from ID 0 to `btf__type_cnt(btf) - 1` + * - @id_map_cnt must be `btf__type_cnt(btf)` + * - Mapping is defined as `id_map[original_id] = new_id` + * - `id_map[0]` must be 0 (void type cannot be moved) + * + * For **split BTF**: + * - @id_map must include only split types (types added on top of the base BTF) + * - @id_map_cnt must be `btf__type_cnt(btf) - btf__type_cnt(btf__base_btf(btf))` + * - Mapping is defined as `id_map[original_id - start_id] = new_id` + * - `start_id` equals `btf__type_cnt(btf__base_btf(btf))` + * + * After permutation, all type references within the BTF data and optional + * BTF extension (if provided via @opts) are updated automatically. + * + * On error, returns a negative error code and sets errno: + * - `-EINVAL`: Invalid parameters or invalid ID mapping + * - `-ENOMEM`: Memory allocation failure + */ +LIBBPF_API int btf__permute(struct btf *btf, __u32 *id_map, __u32 id_map_cnt, + const struct btf_permute_opts *opts); + struct btf_dump; struct btf_dump_opts { diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 84fb90a016c9..d18fbcea7578 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -453,4 +453,5 @@ LIBBPF_1.7.0 { bpf_map__exclusive_program; bpf_prog_assoc_struct_ops; bpf_program__assoc_struct_ops; + btf__permute; } LIBBPF_1.6.0; From a3acd7d43462a7f7429301afad3c0059276f427e Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:54 +0800 Subject: [PATCH 096/268] selftests/bpf: Add test cases for btf__permute functionality This patch introduces test cases for the btf__permute function to ensure it works correctly with both base BTF and split BTF scenarios. The test suite includes: - test_permute_base: Validates permutation on base BTF - test_permute_split: Tests permutation on split BTF Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-3-dolinux.peng@gmail.com --- .../selftests/bpf/prog_tests/btf_permute.c | 244 ++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/btf_permute.c diff --git a/tools/testing/selftests/bpf/prog_tests/btf_permute.c b/tools/testing/selftests/bpf/prog_tests/btf_permute.c new file mode 100644 index 000000000000..04ade5ad77ac --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_permute.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Xiaomi */ + +#include +#include +#include "btf_helpers.h" + +static void permute_base_check(struct btf *btf) +{ + VALIDATE_RAW_BTF( + btf, + "[1] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=4 bits_offset=0", + "[2] FUNC 'f' type_id=6 linkage=static", + "[3] PTR '(anon)' type_id=4", + "[4] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[5] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=4 bits_offset=0", + "[6] FUNC_PROTO '(anon)' ret_type_id=4 vlen=1\n" + "\t'p' type_id=3"); +} + +/* Ensure btf__permute works as expected in the base-BTF scenario */ +static void test_permute_base(void) +{ + struct btf *btf; + __u32 permute_ids[7]; + int err; + + btf = btf__new_empty(); + if (!ASSERT_OK_PTR(btf, "empty_main_btf")) + return; + + btf__add_int(btf, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(btf, 1); /* [2] ptr to int */ + btf__add_struct(btf, "s1", 4); /* [3] struct s1 { */ + btf__add_field(btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_struct(btf, "s2", 4); /* [4] struct s2 { */ + btf__add_field(btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_func_proto(btf, 1); /* [5] int (*)(int *p); */ + btf__add_func_param(btf, "p", 2); + btf__add_func(btf, "f", BTF_FUNC_STATIC, 5); /* [6] int f(int *p); */ + + VALIDATE_RAW_BTF( + btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] FUNC 'f' type_id=5 linkage=static"); + + permute_ids[0] = 0; /* [0] -> [0] */ + permute_ids[1] = 4; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_OK(err, "btf__permute_base")) + goto done; + permute_base_check(btf); + + /* ids[0] must be 0 for base BTF */ + permute_ids[0] = 4; /* [0] -> [0] */ + permute_ids[1] = 0; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* id_map_cnt is invalid */ + permute_ids[0] = 0; /* [0] -> [0] */ + permute_ids[1] = 4; /* [1] -> [4] */ + permute_ids[2] = 3; /* [2] -> [3] */ + permute_ids[3] = 5; /* [3] -> [5] */ + permute_ids[4] = 1; /* [4] -> [1] */ + permute_ids[5] = 6; /* [5] -> [6] */ + permute_ids[6] = 2; /* [6] -> [2] */ + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* Multiple types can not be mapped to the same ID */ + permute_ids[0] = 0; + permute_ids[1] = 4; + permute_ids[2] = 4; + permute_ids[3] = 5; + permute_ids[4] = 1; + permute_ids[5] = 6; + permute_ids[6] = 2; + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + + /* Type ID must be valid */ + permute_ids[0] = 0; + permute_ids[1] = 4; + permute_ids[2] = 3; + permute_ids[3] = 5; + permute_ids[4] = 1; + permute_ids[5] = 7; + permute_ids[6] = 2; + err = btf__permute(btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_base")) + goto done; + /* BTF is not modified */ + permute_base_check(btf); + +done: + btf__free(btf); +} + +static void permute_split_check(struct btf *btf) +{ + VALIDATE_RAW_BTF( + btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] FUNC 'f' type_id=5 linkage=static", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0"); +} + +/* Ensure btf__permute works as expected in the split-BTF scenario */ +static void test_permute_split(void) +{ + struct btf *split_btf = NULL, *base_btf = NULL; + __u32 permute_ids[4]; + int err, start_id; + + base_btf = btf__new_empty(); + if (!ASSERT_OK_PTR(base_btf, "empty_main_btf")) + return; + + btf__add_int(base_btf, "int", 4, BTF_INT_SIGNED); /* [1] int */ + btf__add_ptr(base_btf, 1); /* [2] ptr to int */ + VALIDATE_RAW_BTF( + base_btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1"); + split_btf = btf__new_empty_split(base_btf); + if (!ASSERT_OK_PTR(split_btf, "empty_split_btf")) + goto cleanup; + btf__add_struct(split_btf, "s1", 4); /* [3] struct s1 { */ + btf__add_field(split_btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_struct(split_btf, "s2", 4); /* [4] struct s2 { */ + btf__add_field(split_btf, "m", 1, 0, 0); /* int m; */ + /* } */ + btf__add_func_proto(split_btf, 1); /* [5] int (*)(int p); */ + btf__add_func_param(split_btf, "p", 2); + btf__add_func(split_btf, "f", BTF_FUNC_STATIC, 5); /* [6] int f(int *p); */ + + VALIDATE_RAW_BTF( + split_btf, + "[1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED", + "[2] PTR '(anon)' type_id=1", + "[3] STRUCT 's1' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[4] STRUCT 's2' size=4 vlen=1\n" + "\t'm' type_id=1 bits_offset=0", + "[5] FUNC_PROTO '(anon)' ret_type_id=1 vlen=1\n" + "\t'p' type_id=2", + "[6] FUNC 'f' type_id=5 linkage=static"); + + start_id = btf__type_cnt(base_btf); + permute_ids[3 - start_id] = 6; /* [3] -> [6] */ + permute_ids[4 - start_id] = 3; /* [4] -> [3] */ + permute_ids[5 - start_id] = 5; /* [5] -> [5] */ + permute_ids[6 - start_id] = 4; /* [6] -> [4] */ + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_OK(err, "btf__permute_split")) + goto cleanup; + permute_split_check(split_btf); + + /* + * For split BTF, id_map_cnt must equal to the number of types + * added on top of base BTF + */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 3; + permute_ids[5 - start_id] = 5; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids) - 1, NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + + /* Multiple types can not be mapped to the same ID */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 3; + permute_ids[5 - start_id] = 3; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + + /* Can not map to base ID */ + permute_ids[3 - start_id] = 4; + permute_ids[4 - start_id] = 2; + permute_ids[5 - start_id] = 5; + permute_ids[6 - start_id] = 6; + err = btf__permute(split_btf, permute_ids, ARRAY_SIZE(permute_ids), NULL); + if (!ASSERT_ERR(err, "btf__permute_split")) + goto cleanup; + /* BTF is not modified */ + permute_split_check(split_btf); + +cleanup: + btf__free(split_btf); + btf__free(base_btf); +} + +void test_btf_permute(void) +{ + if (test__start_subtest("permute_base")) + test_permute_base(); + if (test__start_subtest("permute_split")) + test_permute_split(); +} From 230e7d7de5a8d2bcda2a5cdcc8c176c09a63e331 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:55 +0800 Subject: [PATCH 097/268] tools/resolve_btfids: Support BTF sorting feature This introduces a new BTF sorting phase that specifically sorts BTF types by name in ascending order, so that the binary search can be used to look up types. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-4-dolinux.peng@gmail.com --- tools/bpf/resolve_btfids/main.c | 64 +++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index df39982f51df..343d08050116 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -850,6 +850,67 @@ static int dump_raw_btf(struct btf *btf, const char *out_path) return 0; } +/* + * Sort types by name in ascending order resulting in all + * anonymous types being placed before named types. + */ +static int cmp_type_names(const void *a, const void *b, void *priv) +{ + struct btf *btf = (struct btf *)priv; + const struct btf_type *ta = btf__type_by_id(btf, *(__u32 *)a); + const struct btf_type *tb = btf__type_by_id(btf, *(__u32 *)b); + const char *na, *nb; + + na = btf__str_by_offset(btf, ta->name_off); + nb = btf__str_by_offset(btf, tb->name_off); + return strcmp(na, nb); +} + +static int sort_btf_by_name(struct btf *btf) +{ + __u32 *permute_ids = NULL, *id_map = NULL; + int nr_types, i, err = 0; + __u32 start_id = 0, start_offs = 1, id; + + if (btf__base_btf(btf)) { + start_id = btf__type_cnt(btf__base_btf(btf)); + start_offs = 0; + } + nr_types = btf__type_cnt(btf) - start_id; + + permute_ids = calloc(nr_types, sizeof(*permute_ids)); + if (!permute_ids) { + err = -ENOMEM; + goto out; + } + + id_map = calloc(nr_types, sizeof(*id_map)); + if (!id_map) { + err = -ENOMEM; + goto out; + } + + for (i = 0, id = start_id; i < nr_types; i++, id++) + permute_ids[i] = id; + + qsort_r(permute_ids + start_offs, nr_types - start_offs, + sizeof(*permute_ids), cmp_type_names, btf); + + for (i = 0; i < nr_types; i++) { + id = permute_ids[i] - start_id; + id_map[id] = i + start_id; + } + + err = btf__permute(btf, id_map, nr_types, NULL); + if (err) + pr_err("FAILED: btf permute: %s\n", strerror(-err)); + +out: + free(permute_ids); + free(id_map); + return err; +} + static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix) { int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix); @@ -1025,6 +1086,9 @@ int main(int argc, const char **argv) if (load_btf(&obj)) goto out; + if (sort_btf_by_name(obj.btf)) + goto out; + if (elf_collect(&obj)) goto out; From d836e5e64992363b5fa9b121f1ab4a1a1b89162d Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:56 +0800 Subject: [PATCH 098/268] libbpf: Optimize type lookup with binary search for sorted BTF This patch introduces binary search optimization for BTF type lookups when the BTF instance contains sorted types. The optimization significantly improves performance when searching for types in large BTF instances with sorted types. For unsorted BTF, the implementation falls back to the original linear search. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260109130003.3313716-5-dolinux.peng@gmail.com --- tools/lib/bpf/btf.c | 96 ++++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index bf75f770d29a..5a6ac40439e4 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -92,6 +92,8 @@ struct btf { * - for split BTF counts number of types added on top of base BTF. */ __u32 nr_types; + /* the start IDs of named types in sorted BTF */ + int named_start_id; /* if not NULL, points to the base BTF on top of which the current * split BTF is based */ @@ -897,44 +899,79 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id) return type_id; } -__s32 btf__find_by_name(const struct btf *btf, const char *type_name) +static __s32 btf_find_type_by_name_bsearch(const struct btf *btf, const char *name, + __s32 start_id) { - __u32 i, nr_types = btf__type_cnt(btf); + const struct btf_type *t; + const char *tname; + __s32 l, r, m; - if (!strcmp(type_name, "void")) + l = start_id; + r = btf__type_cnt(btf) - 1; + while (l <= r) { + m = l + (r - l) / 2; + t = btf_type_by_id(btf, m); + tname = btf__str_by_offset(btf, t->name_off); + if (strcmp(tname, name) >= 0) { + if (l == r) + return r; + r = m; + } else { + l = m + 1; + } + } + + return btf__type_cnt(btf); +} + +static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id, + const char *type_name, __s32 kind) +{ + __u32 nr_types = btf__type_cnt(btf); + const struct btf_type *t; + const char *tname; + __s32 id; + + if (start_id < btf->start_id) { + id = btf_find_by_name_kind(btf->base_btf, start_id, + type_name, kind); + if (id >= 0) + return id; + start_id = btf->start_id; + } + + if (kind == BTF_KIND_UNKN || strcmp(type_name, "void") == 0) return 0; - for (i = 1; i < nr_types; i++) { - const struct btf_type *t = btf__type_by_id(btf, i); - const char *name = btf__name_by_offset(btf, t->name_off); - - if (name && !strcmp(type_name, name)) - return i; + if (btf->named_start_id > 0 && type_name[0]) { + start_id = max(start_id, btf->named_start_id); + id = btf_find_type_by_name_bsearch(btf, type_name, start_id); + for (; id < nr_types; id++) { + t = btf__type_by_id(btf, id); + tname = btf__str_by_offset(btf, t->name_off); + if (strcmp(tname, type_name) != 0) + return libbpf_err(-ENOENT); + if (kind < 0 || btf_kind(t) == kind) + return id; + } + } else { + for (id = start_id; id < nr_types; id++) { + t = btf_type_by_id(btf, id); + if (kind > 0 && btf_kind(t) != kind) + continue; + tname = btf__str_by_offset(btf, t->name_off); + if (strcmp(tname, type_name) == 0) + return id; + } } return libbpf_err(-ENOENT); } -static __s32 btf_find_by_name_kind(const struct btf *btf, int start_id, - const char *type_name, __u32 kind) +/* the kind value of -1 indicates that kind matching should be skipped */ +__s32 btf__find_by_name(const struct btf *btf, const char *type_name) { - __u32 i, nr_types = btf__type_cnt(btf); - - if (kind == BTF_KIND_UNKN || !strcmp(type_name, "void")) - return 0; - - for (i = start_id; i < nr_types; i++) { - const struct btf_type *t = btf__type_by_id(btf, i); - const char *name; - - if (btf_kind(t) != kind) - continue; - name = btf__name_by_offset(btf, t->name_off); - if (name && !strcmp(type_name, name)) - return i; - } - - return libbpf_err(-ENOENT); + return btf_find_by_name_kind(btf, 1, type_name, -1); } __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, @@ -1006,6 +1043,7 @@ static struct btf *btf_new_empty(struct btf *base_btf) btf->fd = -1; btf->ptr_sz = sizeof(void *); btf->swapped_endian = false; + btf->named_start_id = 0; if (base_btf) { btf->base_btf = base_btf; @@ -1057,6 +1095,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b btf->start_id = 1; btf->start_str_off = 0; btf->fd = -1; + btf->named_start_id = 0; if (base_btf) { btf->base_btf = base_btf; @@ -1715,6 +1754,7 @@ static void btf_invalidate_raw_data(struct btf *btf) free(btf->raw_data_swapped); btf->raw_data_swapped = NULL; } + btf->named_start_id = 0; } /* Ensure BTF is ready to be modified (by splitting into a three memory From 33ecca574f1c27cbf560aee9c1b3045dcb9f8de5 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:57 +0800 Subject: [PATCH 099/268] libbpf: Verify BTF sorting This patch checks whether the BTF is sorted by name in ascending order. If sorted, binary search will be used when looking up types. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-6-dolinux.peng@gmail.com --- tools/lib/bpf/btf.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 5a6ac40439e4..808e53961ed6 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -899,6 +899,30 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id) return type_id; } +static void btf_check_sorted(struct btf *btf) +{ + __u32 i, n, named_start_id = 0; + + n = btf__type_cnt(btf); + for (i = btf->start_id + 1; i < n; i++) { + struct btf_type *ta = btf_type_by_id(btf, i - 1); + struct btf_type *tb = btf_type_by_id(btf, i); + const char *na = btf__str_by_offset(btf, ta->name_off); + const char *nb = btf__str_by_offset(btf, tb->name_off); + + if (strcmp(na, nb) > 0) + return; + + if (named_start_id == 0 && na[0] != '\0') + named_start_id = i - 1; + if (named_start_id == 0 && nb[0] != '\0') + named_start_id = i; + } + + if (named_start_id) + btf->named_start_id = named_start_id; +} + static __s32 btf_find_type_by_name_bsearch(const struct btf *btf, const char *name, __s32 start_id) { @@ -1130,6 +1154,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, b err = err ?: btf_sanity_check(btf); if (err) goto done; + btf_check_sorted(btf); done: if (err) { From 8c3070e159ba00424f0389ead694cacd85af260e Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:58 +0800 Subject: [PATCH 100/268] btf: Optimize type lookup with binary search Improve btf_find_by_name_kind() performance by adding binary search support for sorted types. Falls back to linear search for compatibility. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260109130003.3313716-7-dolinux.peng@gmail.com --- include/linux/btf.h | 1 + kernel/bpf/btf.c | 92 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 83 insertions(+), 10 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 691f09784933..78dc79810c7d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -219,6 +219,7 @@ bool btf_is_module(const struct btf *btf); bool btf_is_vmlinux(const struct btf *btf); struct module *btf_try_get_module(const struct btf *btf); u32 btf_nr_types(const struct btf *btf); +u32 btf_named_start_id(const struct btf *btf, bool own); struct btf *btf_base_btf(const struct btf *btf); bool btf_type_is_i32(const struct btf_type *t); bool btf_type_is_i64(const struct btf_type *t); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2c6076fc29b9..be8aa31e9e94 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -259,6 +259,7 @@ struct btf { void *nohdr_data; struct btf_header hdr; u32 nr_types; /* includes VOID for base BTF */ + u32 named_start_id; u32 types_size; u32 data_size; refcount_t refcnt; @@ -494,6 +495,11 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } +static int btf_start_id(const struct btf *btf) +{ + return btf->start_id + (btf->base_btf ? 0 : 1); +} + bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; @@ -544,21 +550,84 @@ u32 btf_nr_types(const struct btf *btf) return total; } -s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) +/* + * btf_named_start_id - Get the named starting ID for the BTF + * @btf: Pointer to the target BTF object + * @own: Flag indicating whether to query only the current BTF (true = current BTF only, + * false = recursively traverse the base BTF chain) + * + * Return value rules: + * 1. For a sorted btf, return its named_start_id + * 2. Else for a split BTF, return its start_id + * 3. Else for a base BTF, return 1 + */ +u32 btf_named_start_id(const struct btf *btf, bool own) +{ + const struct btf *base_btf = btf; + + while (!own && base_btf->base_btf) + base_btf = base_btf->base_btf; + + return base_btf->named_start_id ?: (base_btf->start_id ?: 1); +} + +static s32 btf_find_by_name_kind_bsearch(const struct btf *btf, const char *name) { const struct btf_type *t; const char *tname; - u32 i, total; + s32 l, r, m; + + l = btf_named_start_id(btf, true); + r = btf_nr_types(btf) - 1; + while (l <= r) { + m = l + (r - l) / 2; + t = btf_type_by_id(btf, m); + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) >= 0) { + if (l == r) + return r; + r = m; + } else { + l = m + 1; + } + } + + return btf_nr_types(btf); +} + +s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) +{ + const struct btf *base_btf = btf_base_btf(btf); + const struct btf_type *t; + const char *tname; + s32 id, total; + + if (base_btf) { + id = btf_find_by_name_kind(base_btf, name, kind); + if (id > 0) + return id; + } total = btf_nr_types(btf); - for (i = 1; i < total; i++) { - t = btf_type_by_id(btf, i); - if (BTF_INFO_KIND(t->info) != kind) - continue; - - tname = btf_name_by_offset(btf, t->name_off); - if (!strcmp(tname, name)) - return i; + if (btf->named_start_id > 0 && name[0]) { + id = btf_find_by_name_kind_bsearch(btf, name); + for (; id < total; id++) { + t = btf_type_by_id(btf, id); + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) != 0) + return -ENOENT; + if (BTF_INFO_KIND(t->info) == kind) + return id; + } + } else { + for (id = btf_start_id(btf); id < total; id++) { + t = btf_type_by_id(btf, id); + if (BTF_INFO_KIND(t->info) != kind) + continue; + tname = btf_name_by_offset(btf, t->name_off); + if (strcmp(tname, name) == 0) + return id; + } } return -ENOENT; @@ -5791,6 +5860,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat goto errout; } env->btf = btf; + btf->named_start_id = 0; data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { @@ -6210,6 +6280,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name btf->data = data; btf->data_size = data_size; btf->kernel_btf = true; + btf->named_start_id = 0; snprintf(btf->name, sizeof(btf->name), "%s", name); err = btf_parse_hdr(env); @@ -6327,6 +6398,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, btf->start_id = base_btf->nr_types; btf->start_str_off = base_btf->hdr.str_len; btf->kernel_btf = true; + btf->named_start_id = 0; snprintf(btf->name, sizeof(btf->name), "%s", module_name); btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); From 342bf525ba0d83374f318e19186d50b1e7160d0e Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 20:59:59 +0800 Subject: [PATCH 101/268] btf: Verify BTF sorting This patch checks whether the BTF is sorted by name in ascending order. If sorted, binary search will be used when looking up types. Specifically, vmlinux and kernel module BTFs are always sorted during the build phase with anonymous types placed before named types, so we only need to identify the starting ID of named types. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260109130003.3313716-8-dolinux.peng@gmail.com --- kernel/bpf/btf.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index be8aa31e9e94..d5b3a37c8560 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -550,6 +550,47 @@ u32 btf_nr_types(const struct btf *btf) return total; } +/* + * Note that vmlinux and kernel module BTFs are always sorted + * during the building phase. + */ +static void btf_check_sorted(struct btf *btf) +{ + u32 i, n, named_start_id = 0; + + n = btf_nr_types(btf); + if (btf_is_vmlinux(btf)) { + for (i = btf_start_id(btf); i < n; i++) { + const struct btf_type *t = btf_type_by_id(btf, i); + const char *n = btf_name_by_offset(btf, t->name_off); + + if (n[0] != '\0') { + btf->named_start_id = i; + return; + } + } + return; + } + + for (i = btf_start_id(btf) + 1; i < n; i++) { + const struct btf_type *ta = btf_type_by_id(btf, i - 1); + const struct btf_type *tb = btf_type_by_id(btf, i); + const char *na = btf_name_by_offset(btf, ta->name_off); + const char *nb = btf_name_by_offset(btf, tb->name_off); + + if (strcmp(na, nb) > 0) + return; + + if (named_start_id == 0 && na[0] != '\0') + named_start_id = i - 1; + if (named_start_id == 0 && nb[0] != '\0') + named_start_id = i; + } + + if (named_start_id) + btf->named_start_id = named_start_id; +} + /* * btf_named_start_id - Get the named starting ID for the BTF * @btf: Pointer to the target BTF object @@ -6301,6 +6342,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name if (err) goto errout; + btf_check_sorted(btf); refcount_set(&btf->refcnt, 1); return btf; @@ -6435,6 +6477,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, } btf_verifier_env_free(env); + btf_check_sorted(btf); refcount_set(&btf->refcnt, 1); return btf; From dc893cfa390aa9d5fc83c908ea5e37a36e531892 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 21:00:00 +0800 Subject: [PATCH 102/268] bpf: Skip anonymous types in type lookup for performance Currently, vmlinux and kernel module BTFs are unconditionally sorted during the build phase, with named types placed at the end. Thus, anonymous types should be skipped when starting the search. In my vmlinux BTF, the number of anonymous types is 61,747, which means the loop count can be reduced by 61,747. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-9-dolinux.peng@gmail.com --- kernel/bpf/btf.c | 10 ++++++---- kernel/bpf/verifier.c | 7 +------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d5b3a37c8560..364dd84bfc5a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3534,7 +3534,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type const struct btf_type *t; int len, id; - id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, 0); + id = btf_find_next_decl_tag(btf, pt, comp_idx, tag_key, + btf_named_start_id(btf, false) - 1); if (id < 0) return ERR_PTR(id); @@ -7844,12 +7845,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname); return -EINVAL; } + /* Convert BTF function arguments into verifier types. * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { u32 tags = 0; - int id = 0; + int id = btf_named_start_id(btf, false) - 1; /* 'arg:' decl_tag takes precedence over derivation of * register type from BTF type itself @@ -9338,7 +9340,7 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) } /* Attempt to find target candidates in vmlinux BTF first */ - cands = bpf_core_add_cands(cands, main_btf, 1); + cands = bpf_core_add_cands(cands, main_btf, btf_named_start_id(main_btf, true)); if (IS_ERR(cands)) return ERR_CAST(cands); @@ -9370,7 +9372,7 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) */ btf_get(mod_btf); spin_unlock_bh(&btf_idr_lock); - cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); + cands = bpf_core_add_cands(cands, mod_btf, btf_named_start_id(mod_btf, true)); btf_put(mod_btf); if (IS_ERR(cands)) return ERR_CAST(cands); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 092003cc7841..45733bae271d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20687,12 +20687,7 @@ static int find_btf_percpu_datasec(struct btf *btf) * types to look at only module's own BTF types. */ n = btf_nr_types(btf); - if (btf_is_module(btf)) - i = btf_nr_types(btf_vmlinux); - else - i = 1; - - for(; i < n; i++) { + for (i = btf_named_start_id(btf, true); i < n; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) continue; From 434bcbc837a69baa2720b2ae5baba8b6e36898c0 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 21:00:01 +0800 Subject: [PATCH 103/268] bpf: Optimize the performance of find_bpffs_btf_enums Currently, vmlinux BTF is unconditionally sorted during the build phase. The function btf_find_by_name_kind executes the binary search branch, so find_bpffs_btf_enums can be optimized by using btf_find_by_name_kind. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-10-dolinux.peng@gmail.com --- kernel/bpf/inode.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9f866a010dad..005ea3a2cda7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -600,10 +600,17 @@ struct bpffs_btf_enums { static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) { + struct { + const struct btf_type **type; + const char *name; + } btf_enums[] = { + {&info->cmd_t, "bpf_cmd"}, + {&info->map_t, "bpf_map_type"}, + {&info->prog_t, "bpf_prog_type"}, + {&info->attach_t, "bpf_attach_type"}, + }; const struct btf *btf; - const struct btf_type *t; - const char *name; - int i, n; + int i, id; memset(info, 0, sizeof(*info)); @@ -615,31 +622,16 @@ static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) info->btf = btf; - for (i = 1, n = btf_nr_types(btf); i < n; i++) { - t = btf_type_by_id(btf, i); - if (!btf_type_is_enum(t)) - continue; + for (i = 0; i < ARRAY_SIZE(btf_enums); i++) { + id = btf_find_by_name_kind(btf, btf_enums[i].name, + BTF_KIND_ENUM); + if (id < 0) + return -ESRCH; - name = btf_name_by_offset(btf, t->name_off); - if (!name) - continue; - - if (strcmp(name, "bpf_cmd") == 0) - info->cmd_t = t; - else if (strcmp(name, "bpf_map_type") == 0) - info->map_t = t; - else if (strcmp(name, "bpf_prog_type") == 0) - info->prog_t = t; - else if (strcmp(name, "bpf_attach_type") == 0) - info->attach_t = t; - else - continue; - - if (info->cmd_t && info->map_t && info->prog_t && info->attach_t) - return 0; + *btf_enums[i].type = btf_type_by_id(btf, id); } - return -ESRCH; + return 0; } static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t, From 9282a42a1fe16c61a253293af439d6fecd8b5b6c Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Fri, 9 Jan 2026 21:00:03 +0800 Subject: [PATCH 104/268] btf: Refactor the code by calling str_is_empty Calling the str_is_empty function to clarify the code and no functional changes are introduced. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260109130003.3313716-12-dolinux.peng@gmail.com --- tools/lib/bpf/btf.c | 34 +++++++++++++++++----------------- tools/lib/bpf/libbpf.c | 4 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 808e53961ed6..83fe79ffcb8f 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -2134,7 +2134,7 @@ int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); /* byte_sz must be power of 2 */ if (!byte_sz || (byte_sz & (byte_sz - 1)) || byte_sz > 16) @@ -2182,7 +2182,7 @@ int btf__add_float(struct btf *btf, const char *name, size_t byte_sz) int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); /* byte_sz must be one of the explicitly allowed values */ @@ -2237,7 +2237,7 @@ static int btf_add_ref_kind(struct btf *btf, int kind, const char *name, int ref if (!t) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2314,7 +2314,7 @@ static int btf_add_composite(struct btf *btf, int kind, const char *name, __u32 if (!t) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2415,7 +2415,7 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, if (!m) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2453,7 +2453,7 @@ static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz, if (!t) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2511,7 +2511,7 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) return libbpf_err(-EINVAL); /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (value < INT_MIN || value > UINT_MAX) return libbpf_err(-E2BIG); @@ -2588,7 +2588,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) return libbpf_err(-EINVAL); /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); /* decompose and invalidate raw data */ @@ -2628,7 +2628,7 @@ int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) */ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind) { - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); switch (fwd_kind) { @@ -2664,7 +2664,7 @@ int btf__add_fwd(struct btf *btf, const char *name, enum btf_fwd_kind fwd_kind) */ int btf__add_typedef(struct btf *btf, const char *name, int ref_type_id) { - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPEDEF, name, ref_type_id, 0); @@ -2716,7 +2716,7 @@ int btf__add_restrict(struct btf *btf, int ref_type_id) */ int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) { - if (!value || !value[0]) + if (str_is_empty(value)) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 0); @@ -2733,7 +2733,7 @@ int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) */ int btf__add_type_attr(struct btf *btf, const char *value, int ref_type_id) { - if (!value || !value[0]) + if (str_is_empty(value)) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id, 1); @@ -2752,7 +2752,7 @@ int btf__add_func(struct btf *btf, const char *name, { int id; - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (linkage != BTF_FUNC_STATIC && linkage != BTF_FUNC_GLOBAL && linkage != BTF_FUNC_EXTERN) @@ -2838,7 +2838,7 @@ int btf__add_func_param(struct btf *btf, const char *name, int type_id) if (!p) return libbpf_err(-ENOMEM); - if (name && name[0]) { + if (!str_is_empty(name)) { name_off = btf__add_str(btf, name); if (name_off < 0) return name_off; @@ -2873,7 +2873,7 @@ int btf__add_var(struct btf *btf, const char *name, int linkage, int type_id) int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (linkage != BTF_VAR_STATIC && linkage != BTF_VAR_GLOBAL_ALLOCATED && linkage != BTF_VAR_GLOBAL_EXTERN) @@ -2922,7 +2922,7 @@ int btf__add_datasec(struct btf *btf, const char *name, __u32 byte_sz) int sz, name_off; /* non-empty name */ - if (!name || !name[0]) + if (str_is_empty(name)) return libbpf_err(-EINVAL); if (btf_ensure_modifiable(btf)) @@ -2999,7 +2999,7 @@ static int btf_add_decl_tag(struct btf *btf, const char *value, int ref_type_id, struct btf_type *t; int sz, value_off; - if (!value || !value[0] || component_idx < -1) + if (str_is_empty(value) || component_idx < -1) return libbpf_err(-EINVAL); if (validate_type_id(ref_type_id)) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6ea81701e274..bbcfd72b07d5 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2904,7 +2904,7 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, var_extra = btf_var(var); map_name = btf__name_by_offset(obj->btf, var->name_off); - if (map_name == NULL || map_name[0] == '\0') { + if (str_is_empty(map_name)) { pr_warn("map #%d: empty name.\n", var_idx); return -EINVAL; } @@ -4281,7 +4281,7 @@ static int bpf_object__collect_externs(struct bpf_object *obj) if (!sym_is_extern(sym)) continue; ext_name = elf_sym_str(obj, sym->st_name); - if (!ext_name || !ext_name[0]) + if (str_is_empty(ext_name)) continue; ext = obj->externs; From f8ade2342e22e7dbc71af496f07c900f8c69dd54 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 13 Jan 2026 08:39:47 +0000 Subject: [PATCH 105/268] bpf: return PTR_TO_BTF_ID | PTR_TRUSTED from BPF kfuncs by default Teach the BPF verifier to treat pointers to struct types returned from BPF kfuncs as implicitly trusted (PTR_TO_BTF_ID | PTR_TRUSTED) by default. Returning untrusted pointers to struct types from BPF kfuncs should be considered an exception only, and certainly not the norm. Update existing selftests to reflect the change in register type printing (e.g. `ptr_` becoming `trusted_ptr_` in verifier error messages). Link: https://lore.kernel.org/bpf/aV4nbCaMfIoM0awM@google.com/ Signed-off-by: Matt Bobrowski Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260113083949.2502978-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 46 ++++++++++++------- .../selftests/bpf/progs/map_kptr_fail.c | 4 +- .../struct_ops_kptr_return_fail__wrong_type.c | 2 +- .../bpf/progs/verifier_global_ptr_args.c | 2 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- 5 files changed, 34 insertions(+), 22 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 45733bae271d..faa1ecc1fe9d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14212,26 +14212,38 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; } else { - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].type = PTR_TO_BTF_ID; - regs[BPF_REG_0].btf_id = ptr_type_id; + enum bpf_reg_type type = PTR_TO_BTF_ID; if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) - regs[BPF_REG_0].type |= PTR_UNTRUSTED; - else if (is_kfunc_rcu_protected(&meta)) - regs[BPF_REG_0].type |= MEM_RCU; - - if (is_iter_next_kfunc(&meta)) { - struct bpf_reg_state *cur_iter; - - cur_iter = get_iter_from_state(env->cur_state, &meta); - - if (cur_iter->type & MEM_RCU) /* KF_RCU_PROTECTED */ - regs[BPF_REG_0].type |= MEM_RCU; - else - regs[BPF_REG_0].type |= PTR_TRUSTED; + type |= PTR_UNTRUSTED; + else if (is_kfunc_rcu_protected(&meta) || + (is_iter_next_kfunc(&meta) && + (get_iter_from_state(env->cur_state, &meta) + ->type & MEM_RCU))) { + /* + * If the iterator's constructor (the _new + * function e.g., bpf_iter_task_new) has been + * annotated with BPF kfunc flag + * KF_RCU_PROTECTED and was called within a RCU + * read-side critical section, also propagate + * the MEM_RCU flag to the pointer returned from + * the iterator's next function (e.g., + * bpf_iter_task_next). + */ + type |= MEM_RCU; + } else { + /* + * Any PTR_TO_BTF_ID that is returned from a BPF + * kfunc should by default be treated as + * implicitly trusted. + */ + type |= PTR_TRUSTED; } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].type = type; + regs[BPF_REG_0].btf_id = ptr_type_id; } if (is_kfunc_ret_null(&meta)) { diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c index 4c0ff01f1a96..6443b320c732 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c +++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c @@ -272,7 +272,7 @@ int reject_untrusted_xchg(struct __sk_buff *ctx) SEC("?tc") __failure -__msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") +__msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc expected=ptr_prog_test_member") int reject_bad_type_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; @@ -291,7 +291,7 @@ int reject_bad_type_xchg(struct __sk_buff *ctx) } SEC("?tc") -__failure __msg("invalid kptr access, R2 type=ptr_prog_test_ref_kfunc") +__failure __msg("invalid kptr access, R2 type=trusted_ptr_prog_test_ref_kfunc") int reject_member_of_ref_xchg(struct __sk_buff *ctx) { struct prog_test_ref_kfunc *ref_ptr; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c index 6a2dd5367802..c8d217e89eea 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_kptr_return_fail__wrong_type.c @@ -12,7 +12,7 @@ void bpf_task_release(struct task_struct *p) __ksym; * reject programs returning a referenced kptr of the wrong type. */ SEC("struct_ops/test_return_ref_kptr") -__failure __msg("At program exit the register R0 is not a known value (ptr_or_null_)") +__failure __msg("At program exit the register R0 is not a known value (trusted_ptr_or_null_)") struct task_struct *BPF_PROG(kptr_return_fail__wrong_type, int dummy, struct task_struct *task, struct cgroup *cgrp) { diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 1204fbc58178..e7dae0cf9c17 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -72,7 +72,7 @@ int trusted_task_arg_nonnull_fail1(void *ctx) SEC("?tp_btf/task_newtask") __failure __log_level(2) -__msg("R1 type=ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") +__msg("R1 type=trusted_ptr_or_null_ expected=ptr_, trusted_ptr_, rcu_ptr_") __msg("Caller passes invalid args into func#1 ('subprog_trusted_task_nonnull')") int trusted_task_arg_nonnull_fail2(void *ctx) { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index c8d640802cce..9ca83dce100d 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -220,7 +220,7 @@ }, .result_unpriv = REJECT, .result = REJECT, - .errstr = "variable ptr_ access var_off=(0x0; 0x7) disallowed", + .errstr = "variable trusted_ptr_ access var_off=(0x0; 0x7) disallowed", }, { "calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID", From e463b6de9da17995a2ddabf199cc00c65a8a5392 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 13 Jan 2026 08:39:48 +0000 Subject: [PATCH 106/268] bpf: drop KF_ACQUIRE flag on BPF kfunc bpf_get_root_mem_cgroup() With the BPF verifier now treating pointers to struct types returned from BPF kfuncs as implicitly trusted by default, there is no need for bpf_get_root_mem_cgroup() to be annotated with the KF_ACQUIRE flag. bpf_get_root_mem_cgroup() does not acquire any references, but rather simply returns a NULL pointer or a pointer to a struct mem_cgroup object that is valid for the entire lifetime of the kernel. This simplifies BPF programs using this kfunc by removing the requirement to pair the call with bpf_put_mem_cgroup(). Signed-off-by: Matt Bobrowski Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260113083949.2502978-2-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- mm/bpf_memcontrol.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 716df49d7647..f95cd5d16f4c 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -13,12 +13,12 @@ __bpf_kfunc_start_defs(); /** * bpf_get_root_mem_cgroup - Returns a pointer to the root memory cgroup * - * The function has KF_ACQUIRE semantics, even though the root memory - * cgroup is never destroyed after being created and doesn't require - * reference counting. And it's perfectly safe to pass it to - * bpf_put_mem_cgroup() - * - * Return: A pointer to the root memory cgroup. + * Return: A pointer to the root memory cgroup. Notably, the pointer to the + * returned struct mem_cgroup is trusted by default, so it's perfectably + * acceptable to also pass this pointer into other BPF kfuncs (e.g., + * bpf_mem_cgroup_usage()). Additionally, this BPF kfunc does not make use of + * KF_ACQUIRE semantics, so there's no requirement for the BPF program to call + * bpf_put_mem_cgroup() on the pointer returned by this BPF kfunc. */ __bpf_kfunc struct mem_cgroup *bpf_get_root_mem_cgroup(void) { @@ -162,7 +162,7 @@ __bpf_kfunc void bpf_mem_cgroup_flush_stats(struct mem_cgroup *memcg) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_memcontrol_kfuncs) -BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) From bbdbed193bcf57f1e9c0d9d58c3ad3350bfd0bd1 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 13 Jan 2026 08:39:49 +0000 Subject: [PATCH 107/268] selftests/bpf: assert BPF kfunc default trusted pointer semantics The BPF verifier was recently updated to treat pointers to struct types returned from BPF kfuncs as implicitly trusted by default. Add a new test case to exercise this new implicit trust semantic. The KF_ACQUIRE flag was dropped from the bpf_get_root_mem_cgroup() kfunc because it returns a global pointer to root_mem_cgroup without performing any explicit reference counting. This makes it an ideal candidate to verify the new implicit trusted pointer semantics. Signed-off-by: Matt Bobrowski Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260113083949.2502978-3-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 ++ .../selftests/bpf/progs/verifier_memcontrol.c | 32 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_memcontrol.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 5829ffd70f8f..38c5ba70100c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -61,6 +61,7 @@ #include "verifier_masking.skel.h" #include "verifier_may_goto_1.skel.h" #include "verifier_may_goto_2.skel.h" +#include "verifier_memcontrol.skel.h" #include "verifier_meta_access.skel.h" #include "verifier_movsx.skel.h" #include "verifier_mtu.skel.h" @@ -202,6 +203,7 @@ void test_verifier_map_ret_val(void) { RUN(verifier_map_ret_val); } void test_verifier_masking(void) { RUN(verifier_masking); } void test_verifier_may_goto_1(void) { RUN(verifier_may_goto_1); } void test_verifier_may_goto_2(void) { RUN(verifier_may_goto_2); } +void test_verifier_memcontrol(void) { RUN(verifier_memcontrol); } void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_mul(void) { RUN(verifier_mul); } diff --git a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c new file mode 100644 index 000000000000..13564956f621 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2026 Google LLC. + */ + +#include +#include +#include +#include "bpf_misc.h" + +SEC("syscall") +__success __retval(0) +int root_mem_cgroup_default_trusted(void *ctx) +{ + unsigned long usage; + struct mem_cgroup *root_mem_cgroup; + + root_mem_cgroup = bpf_get_root_mem_cgroup(); + if (!root_mem_cgroup) + return 1; + + /* + * BPF kfunc bpf_get_root_mem_cgroup() returns a PTR_TO_BTF_ID | + * PTR_TRUSTED | PTR_MAYBE_NULL, therefore it should be accepted when + * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. + */ + usage = bpf_mem_cgroup_usage(root_mem_cgroup); + __sink(usage); + return 0; +} + +char _license[] SEC("license") = "GPL"; From e3bd7bdf5ffe49d8381e42843f6e98cd0c78a1e8 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 11 Jan 2026 15:30:45 +0000 Subject: [PATCH 108/268] bpf: Return proper address for non-zero offsets in insn array The map_direct_value_addr() function of the instruction array map incorrectly adds offset to the resulting address. This is a bug, because later the resolve_pseudo_ldimm64() function adds the offset. Fix it. Corresponding selftests are added in a consequent commit. Fixes: 493d9e0d6083 ("bpf, x86: add support for indirect jumps") Signed-off-by: Anton Protopopov Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260111153047.8388-2-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_insn_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index c96630cb75bf..37b43102953e 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -126,7 +126,7 @@ static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, return -EINVAL; /* from BPF's point of view, this map is a jump table */ - *imm = (unsigned long)insn_array->ips + off; + *imm = (unsigned long)insn_array->ips; return 0; } From 7e525860e7250355bcb01ae9779154512b5e5e88 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 11 Jan 2026 15:30:46 +0000 Subject: [PATCH 109/268] bpf: Return EACCES for incorrect access to insn array The insn_array_map_direct_value_addr() function currently returns -EINVAL when the offset within the map is invalid. Change this to return -EACCES, so that it is consistent with similar boundary access checks in the verifier. Signed-off-by: Anton Protopopov Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260111153047.8388-3-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_insn_array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index 37b43102953e..c0286f25ca3c 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -123,7 +123,7 @@ static int insn_array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, if ((off % sizeof(long)) != 0 || (off / sizeof(long)) >= map->max_entries) - return -EINVAL; + return -EACCES; /* from BPF's point of view, this map is a jump table */ *imm = (unsigned long)insn_array->ips; From c656807675e09604af09a4b9f3ea466af91b7b7a Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 11 Jan 2026 15:30:47 +0000 Subject: [PATCH 110/268] selftests/bpf: Add tests for loading insn array values with offsets The ldimm64 instruction for map value supports an offset. For insn array maps it wasn't tested before, as normally such instructions aren't generated. However, this is still possible to pass such instructions, so add a few tests to check that correct offsets work properly and incorrect offsets are rejected. Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20260111153047.8388-4-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/bpf_gotox.c | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c index d138cc7b1bda..75b0cf2467ab 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_gotox.c @@ -240,6 +240,208 @@ static void check_nonstatic_global_other_sec(struct bpf_gotox *skel) bpf_link__destroy(link); } +/* + * The following subtests do not use skeleton rather than to check + * if the test should be skipped. + */ + +static int create_jt_map(__u32 max_entries) +{ + const char *map_name = "jt"; + __u32 key_size = 4; + __u32 value_size = sizeof(struct bpf_insn_array_value); + + return bpf_map_create(BPF_MAP_TYPE_INSN_ARRAY, map_name, + key_size, value_size, max_entries, NULL); +} + +static int prog_load(struct bpf_insn *insns, __u32 insn_cnt) +{ + return bpf_prog_load(BPF_PROG_TYPE_RAW_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); +} + +static int __check_ldimm64_off_prog_load(__u32 max_entries, __u32 off) +{ + struct bpf_insn insns[] = { + BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int map_fd, ret; + + map_fd = create_jt_map(max_entries); + if (!ASSERT_GE(map_fd, 0, "create_jt_map")) + return -1; + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) { + close(map_fd); + return -1; + } + + insns[0].imm = map_fd; + insns[1].imm = off; + + ret = prog_load(insns, ARRAY_SIZE(insns)); + close(map_fd); + return ret; +} + +/* + * Check that loads from an instruction array map are only allowed with offsets + * which are multiples of 8 and do not point to outside of the map. + */ +static void check_ldimm64_off_load(struct bpf_gotox *skel __always_unused) +{ + const __u32 max_entries = 10; + int prog_fd; + __u32 off; + + for (off = 0; off < max_entries; off++) { + prog_fd = __check_ldimm64_off_prog_load(max_entries, off * 8); + if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_off_prog_load")) + return; + close(prog_fd); + } + + prog_fd = __check_ldimm64_off_prog_load(max_entries, 7 /* not a multiple of 8 */); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) { + close(prog_fd); + return; + } + + prog_fd = __check_ldimm64_off_prog_load(max_entries, max_entries * 8 /* too large */); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_off_prog_load: should be -EACCES")) { + close(prog_fd); + return; + } +} + +static int __check_ldimm64_gotox_prog_load(struct bpf_insn *insns, + __u32 insn_cnt, + __u32 off1, __u32 off2) +{ + const __u32 values[] = {5, 7, 9, 11, 13, 15}; + const __u32 max_entries = ARRAY_SIZE(values); + struct bpf_insn_array_value val = {}; + int map_fd, ret, i; + + map_fd = create_jt_map(max_entries); + if (!ASSERT_GE(map_fd, 0, "create_jt_map")) + return -1; + + for (i = 0; i < max_entries; i++) { + val.orig_off = values[i]; + if (!ASSERT_EQ(bpf_map_update_elem(map_fd, &i, &val, 0), 0, + "bpf_map_update_elem")) { + close(map_fd); + return -1; + } + } + + if (!ASSERT_EQ(bpf_map_freeze(map_fd), 0, "bpf_map_freeze")) { + close(map_fd); + return -1; + } + + /* r1 = &map + offset1 */ + insns[0].imm = map_fd; + insns[1].imm = off1; + + /* r1 += off2 */ + insns[2].imm = off2; + + ret = prog_load(insns, insn_cnt); + close(map_fd); + return ret; +} + +static void reject_offsets(struct bpf_insn *insns, __u32 insn_cnt, __u32 off1, __u32 off2) +{ + int prog_fd; + + prog_fd = __check_ldimm64_gotox_prog_load(insns, insn_cnt, off1, off2); + if (!ASSERT_EQ(prog_fd, -EACCES, "__check_ldimm64_gotox_prog_load")) + close(prog_fd); +} + +/* + * Verify a bit more complex programs which include indirect jumps + * and with jump tables loaded with a non-zero offset + */ +static void check_ldimm64_off_gotox(struct bpf_gotox *skel __always_unused) +{ + struct bpf_insn insns[] = { + /* + * The following instructions perform an indirect jump to + * labels below. Thus valid offsets in the map are {0,...,5}. + * The program rewrites the offsets in the instructions below: + * r1 = &map + offset1 + * r1 += offset2 + * r1 = *r1 + * gotox r1 + */ + BPF_LD_IMM64_RAW(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_1, 0, 0, 0), + + /* case 0: */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + /* case 1: */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + /* case 2: */ + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + /* case 3: */ + BPF_MOV64_IMM(BPF_REG_0, 3), + BPF_EXIT_INSN(), + /* case 4: */ + BPF_MOV64_IMM(BPF_REG_0, 4), + BPF_EXIT_INSN(), + /* default: */ + BPF_MOV64_IMM(BPF_REG_0, 5), + BPF_EXIT_INSN(), + }; + int prog_fd, err; + __u32 off1, off2; + + /* allow all combinations off1 + off2 < 6 */ + for (off1 = 0; off1 < 6; off1++) { + for (off2 = 0; off1 + off2 < 6; off2++) { + LIBBPF_OPTS(bpf_test_run_opts, topts); + + prog_fd = __check_ldimm64_gotox_prog_load(insns, ARRAY_SIZE(insns), + off1 * 8, off2 * 8); + if (!ASSERT_GE(prog_fd, 0, "__check_ldimm64_gotox_prog_load")) + return; + + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) { + close(prog_fd); + return; + } + + if (!ASSERT_EQ(topts.retval, off1 + off2, "test_run_opts retval")) { + close(prog_fd); + return; + } + + close(prog_fd); + } + } + + /* reject off1 + off2 >= 6 */ + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 3, 8 * 3); + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 7, 8 * 0); + reject_offsets(insns, ARRAY_SIZE(insns), 8 * 0, 8 * 7); + + /* reject (off1 + off2) % 8 != 0 */ + reject_offsets(insns, ARRAY_SIZE(insns), 3, 3); + reject_offsets(insns, ARRAY_SIZE(insns), 7, 0); + reject_offsets(insns, ARRAY_SIZE(insns), 0, 7); +} + void test_bpf_gotox(void) { struct bpf_gotox *skel; @@ -288,5 +490,11 @@ void test_bpf_gotox(void) if (test__start_subtest("one-map-two-jumps")) __subtest(skel, check_one_map_two_jumps); + if (test__start_subtest("check-ldimm64-off")) + __subtest(skel, check_ldimm64_off_load); + + if (test__start_subtest("check-ldimm64-off-gotox")) + __subtest(skel, check_ldimm64_off_gotox); + bpf_gotox__destroy(skel); } From d1aab1ca576c90192ba961094d51b0be6355a4d6 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 14 Jan 2026 16:25:43 +0000 Subject: [PATCH 111/268] bpf: Properly mark live registers for indirect jumps For a `gotox rX` instruction the rX register should be marked as used in the compute_insn_live_regs() function. Fix this. Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20260114162544.83253-2-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 62ad7c79ce2d..7a375f608263 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24848,6 +24848,12 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env, case BPF_JMP32: switch (code) { case BPF_JA: + def = 0; + if (BPF_SRC(insn->code) == BPF_X) + use = dst; + else + use = 0; + break; case BPF_JCOND: def = 0; use = 0; From 7c8e817e443c118aa303f1bbcec33df8d9e3487a Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 14 Jan 2026 16:25:44 +0000 Subject: [PATCH 112/268] selftests/bpf: Extend live regs tests with a test for gotox Add a test which checks that the destination register of a gotox instruction is marked as used and that the union of jump targets is considered as live. Signed-off-by: Anton Protopopov Link: https://lore.kernel.org/r/20260114162544.83253-3-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/progs/compute_live_registers.c | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c index 6884ab99a421..f05e120f3450 100644 --- a/tools/testing/selftests/bpf/progs/compute_live_registers.c +++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c @@ -431,6 +431,47 @@ __naked void subprog1(void) ::: __clobber_all); } +#if defined(__TARGET_ARCH_x86) || defined(__TARGET_ARCH_arm64) + +SEC("socket") +__log_level(2) +__msg("2: .1........ (07) r1 += 8") +__msg("3: .1........ (79) r2 = *(u64 *)(r1 +0)") +__msg("4: ..2....... (b7) r3 = 1") +__msg("5: ..23...... (b7) r4 = 2") +__msg("6: ..234..... (0d) gotox r2") +__msg("7: ...3...... (bf) r0 = r3") +__msg("8: 0......... (95) exit") +__msg("9: ....4..... (bf) r0 = r4") +__msg("10: 0......... (95) exit") +__naked +void gotox(void) +{ + asm volatile ( + ".pushsection .jumptables,\"\",@progbits;" +"jt0_%=: .quad l0_%= - socket;" + ".quad l1_%= - socket;" + ".size jt0_%=, 16;" + ".global jt0_%=;" + ".popsection;" + + "r1 = jt0_%= ll;" + "r1 += 8;" + "r2 = *(u64 *)(r1 + 0);" + "r3 = 1;" + "r4 = 2;" + ".8byte %[gotox_r2];" +"l0_%=: r0 = r3;" + "exit;" +"l1_%=: r0 = r4;" + "exit;" + : + : __imm_insn(gotox_r2, BPF_RAW_INSN(BPF_JMP | BPF_JA | BPF_X, BPF_REG_2, BPF_REG_0, 0, 0)) + : __clobber_all); +} + +#endif /* __TARGET_ARCH_x86 || __TARGET_ARCH_arm64 */ + /* to retain debug info for BTF generation */ void kfunc_root(void) { From 276f3b6daf6024ae2742afd161e7418a5584a660 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Jan 2026 13:11:56 +0100 Subject: [PATCH 113/268] arm64/ftrace,bpf: Fix partial regs after bpf_prog_run Mahe reported issue with bpf_override_return helper not working when executed from kprobe.multi bpf program on arm. The problem is that on arm we use alternate storage for pt_regs object that is passed to bpf_prog_run and if any register is changed (which is the case of bpf_override_return) it's not propagated back to actual pt_regs object. Fixing this by introducing and calling ftrace_partial_regs_update function to propagate the values of changed registers (ip and stack). Reported-by: Mahe Tardy Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Acked-by: Will Deacon Link: https://lore.kernel.org/bpf/20260112121157.854473-1-jolsa@kernel.org --- include/linux/ftrace_regs.h | 25 +++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 1 + 2 files changed, 26 insertions(+) diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h index 15627ceea9bc..386fa48c4a95 100644 --- a/include/linux/ftrace_regs.h +++ b/include/linux/ftrace_regs.h @@ -33,6 +33,31 @@ struct ftrace_regs; #define ftrace_regs_get_frame_pointer(fregs) \ frame_pointer(&arch_ftrace_regs(fregs)->regs) +static __always_inline void +ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs) { } + +#else + +/* + * ftrace_partial_regs_update - update the original ftrace_regs from regs + * @fregs: The ftrace_regs to update from @regs + * @regs: The partial regs from ftrace_partial_regs() that was updated + * + * Some architectures have the partial regs living in the ftrace_regs + * structure, whereas other architectures need to make a different copy + * of the @regs. If a partial @regs is retrieved by ftrace_partial_regs() and + * if the code using @regs updates a field (like the instruction pointer or + * stack pointer) it may need to propagate that change to the original @fregs + * it retrieved the partial @regs from. Use this function to guarantee that + * update happens. + */ +static __always_inline void +ftrace_partial_regs_update(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + ftrace_regs_set_instruction_pointer(fregs, instruction_pointer(regs)); + ftrace_regs_set_return_value(fregs, regs_return_value(regs)); +} + #endif /* HAVE_ARCH_FTRACE_REGS */ /* This can be overridden by the architectures */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6e076485bf70..3a17f79b20c2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2564,6 +2564,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); + ftrace_partial_regs_update(fregs, bpf_kprobe_multi_pt_regs_ptr()); rcu_read_unlock(); out: From 934d9746ed0206e93506a68c838fe82ef748576a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 12 Jan 2026 13:11:57 +0100 Subject: [PATCH 114/268] selftests/bpf: Add test for bpf_override_return helper We do not actually test the bpf_override_return helper functionality itself at the moment, only the bpf program being able to attach it. Adding test that override prctl syscall return value on top of kprobe and kprobe.multi. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20260112121157.854473-2-jolsa@kernel.org --- .../bpf/prog_tests/kprobe_multi_test.c | 44 +++++++++++++++++++ .../bpf/progs/kprobe_multi_override.c | 15 +++++++ tools/testing/selftests/bpf/trace_helpers.h | 12 +++++ 3 files changed, 71 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c index 6cfaa978bc9a..9caef222e528 100644 --- a/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c +++ b/tools/testing/selftests/bpf/prog_tests/kprobe_multi_test.c @@ -1,4 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include +#include #include #include "kprobe_multi.skel.h" #include "trace_helpers.h" @@ -540,6 +542,46 @@ static void test_attach_override(void) kprobe_multi_override__destroy(skel); } +static void test_override(void) +{ + struct kprobe_multi_override *skel = NULL; + int err; + + skel = kprobe_multi_override__open_and_load(); + if (!ASSERT_OK_PTR(skel, "kprobe_multi_empty__open_and_load")) + goto cleanup; + + skel->bss->pid = getpid(); + + /* no override */ + err = prctl(0xffff, 0); + ASSERT_EQ(err, -1, "err"); + + /* kprobe.multi override */ + skel->links.test_override = bpf_program__attach_kprobe_multi_opts(skel->progs.test_override, + SYS_PREFIX "sys_prctl", NULL); + if (!ASSERT_OK_PTR(skel->links.test_override, "bpf_program__attach_kprobe_multi_opts")) + goto cleanup; + + err = prctl(0xffff, 0); + ASSERT_EQ(err, 123, "err"); + + bpf_link__destroy(skel->links.test_override); + skel->links.test_override = NULL; + + /* kprobe override */ + skel->links.test_kprobe_override = bpf_program__attach_kprobe(skel->progs.test_kprobe_override, + false, SYS_PREFIX "sys_prctl"); + if (!ASSERT_OK_PTR(skel->links.test_kprobe_override, "bpf_program__attach_kprobe")) + goto cleanup; + + err = prctl(0xffff, 0); + ASSERT_EQ(err, 123, "err"); + +cleanup: + kprobe_multi_override__destroy(skel); +} + #ifdef __x86_64__ static void test_attach_write_ctx(void) { @@ -597,6 +639,8 @@ void test_kprobe_multi_test(void) test_attach_api_fails(); if (test__start_subtest("attach_override")) test_attach_override(); + if (test__start_subtest("override")) + test_override(); if (test__start_subtest("session")) test_session_skel_api(); if (test__start_subtest("session_cookie")) diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c index 28f8487c9059..14f39fa6d515 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi_override.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_override.c @@ -5,9 +5,24 @@ char _license[] SEC("license") = "GPL"; +int pid = 0; + SEC("kprobe.multi") int test_override(struct pt_regs *ctx) { + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + + bpf_override_return(ctx, 123); + return 0; +} + +SEC("kprobe") +int test_kprobe_override(struct pt_regs *ctx) +{ + if (bpf_get_current_pid_tgid() >> 32 != pid) + return 0; + bpf_override_return(ctx, 123); return 0; } diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h index 9437bdd4afa5..a5576b2dfc26 100644 --- a/tools/testing/selftests/bpf/trace_helpers.h +++ b/tools/testing/selftests/bpf/trace_helpers.h @@ -4,6 +4,18 @@ #include +#ifdef __x86_64__ +#define SYS_PREFIX "__x64_" +#elif defined(__s390x__) +#define SYS_PREFIX "__s390x_" +#elif defined(__aarch64__) +#define SYS_PREFIX "__arm64_" +#elif defined(__riscv) +#define SYS_PREFIX "__riscv_" +#else +#define SYS_PREFIX "" +#endif + #define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) From af9e89d8dd39530c8bd14c33ddf6b502df1071b6 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 15 Jan 2026 07:11:40 -0800 Subject: [PATCH 115/268] bpf: Preserve id of register in sync_linked_regs() sync_linked_regs() copies the id of known_reg to reg when propagating bounds of known_reg to reg using the off of known_reg, but when known_reg was linked to reg like: known_reg = reg ; both known_reg and reg get same id known_reg += 4 ; known_reg gets off = 4, and its id gets BPF_ADD_CONST now when a call to sync_linked_regs() happens, let's say with the following: if known_reg >= 10 goto pc+2 known_reg's new bounds are propagated to reg but now reg gets BPF_ADD_CONST from the copy. This means if another link to reg is created like: another_reg = reg ; another_reg should get the id of reg but assign_scalar_id_before_mov() sees BPF_ADD_CONST on reg and assigns a new id to it. As reg has a new id now, known_reg's link to reg is broken. If we find new bounds for known_reg, they will not be propagated to reg. This can be seen in the selftest added in the next commit: 0: (85) call bpf_get_prandom_u32#7 ; R0=scalar() 1: (57) r0 &= 255 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 2: (bf) r1 = r0 ; R0=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R1=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 3: (07) r1 += 4 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=4,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 4: (a5) if r1 < 0xa goto pc+4 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=10,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 5: (bf) r2 = r0 ; R0=scalar(id=2,smin=umin=smin32=umin32=6,smax=umax=smax32=umax32=255) R2=scalar(id=2,smin=umin=smin32=umin32=6,smax=umax=smax32=umax32=255) 6: (a5) if r1 < 0xe goto pc+2 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=14,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 7: (35) if r0 >= 0xa goto pc+1 ; R0=scalar(id=2,smin=umin=smin32=umin32=6,smax=umax=smax32=umax32=9,var_off=(0x0; 0xf)) 8: (37) r0 /= 0 div by zero When 4 is verified, r1's bounds are propagated to r0 but r0 also gets BPF_ADD_CONST (bug). When 5 is verified, r0 gets a new id (2) and its link with r1 is broken. After 6 we know r1 has bounds [14, 259] and therefore r0 should have bounds [10, 255], therefore the branch at 7 is always taken. But because r0's id was changed to 2, r1's new bounds are not propagated to r0. The verifier still thinks r0 has bounds [6, 255] before 7 and execution can reach div by zero. Fix this by preserving id in sync_linked_regs() like off and subreg_def. Fixes: 98d7ca374ba4 ("bpf: Track delta between "linked" registers.") Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260115151143.1344724-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7a375f608263..9de0ec0c3ed9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16871,6 +16871,7 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s } else { s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->off; + u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off); @@ -16878,10 +16879,11 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); /* - * Must preserve off, id and add_const flag, + * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; + reg->id = saved_id; reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); From 086c99fbe45070d02851427eab5ae26fe7d0f3c0 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 15 Jan 2026 07:11:41 -0800 Subject: [PATCH 116/268] selftests: bpf: Add test for multiple syncs from linked register Before the last commit, sync_linked_regs() corrupted the register whose bounds are being updated by copying known_reg's id to it. The ids are the same in value but known_reg has the BPF_ADD_CONST flag which is wrongly copied to reg. This later causes issues when creating new links to this reg. assign_scalar_id_before_mov() sees this BPF_ADD_CONST and gives a new id to this register and breaks the old links. This is exposed by the added selftest. Signed-off-by: Puranjay Mohan Tested-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260115151143.1344724-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_linked_scalars.c | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c index 8f755d2464cf..5f41bbb730a7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c +++ b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c @@ -31,4 +31,37 @@ l1: \ " ::: __clobber_all); } +/* + * Test that sync_linked_regs() preserves register IDs. + * + * The sync_linked_regs() function copies bounds from known_reg to linked + * registers. When doing so, it must preserve each register's original id + * to allow subsequent syncs from the same source to work correctly. + * + */ +SEC("socket") +__success +__naked void sync_linked_regs_preserves_id(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; /* r0 in [0, 255] */ \ + r1 = r0; /* r0, r1 linked with id 1 */ \ + r1 += 4; /* r1 has id=1 and off=4 in [4, 259] */ \ + if r1 < 10 goto l0_%=; \ + /* r1 in [10, 259], r0 synced to [6, 255] */ \ + r2 = r0; /* r2 has id=1 and in [6, 255] */ \ + if r1 < 14 goto l0_%=; \ + /* r1 in [14, 259], r0 synced to [10, 255] */ \ + if r0 >= 10 goto l0_%=; \ + /* Never executed */ \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From 1700147697618a57ed0a2a97d9a477d131b8fc54 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Thu, 15 Jan 2026 18:45:09 +0000 Subject: [PATCH 117/268] bpf: Add __force annotations to silence sparse warnings Add __force annotations to casts that convert between __user and kernel address spaces. These casts are intentional: - In bpf_send_signal_common(), the value is stored in si_value.sival_ptr which is typed as void __user *, but the value comes from a BPF program parameter. - In the bpf_*_dynptr() kfuncs, user pointers are cast to const void * before being passed to copy helper functions that correctly handle the user address space through copy_from_user variants. Without __force, sparse reports: warning: cast removes address space '__user' of expression Reported-by: kernel test robot Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260115184509.3585759-1-mykyta.yatsenko5@gmail.com Closes: https://lore.kernel.org/oe-kbuild-all/202601131740.6C3BdBaB-lkp@intel.com/ --- kernel/trace/bpf_trace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3a17f79b20c2..f73e08c223b5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -830,7 +830,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; - info.si_value.sival_ptr = (void *)(unsigned long)value; + info.si_value.sival_ptr = (void __user __force *)(unsigned long)value; siginfo = &info; } @@ -3518,7 +3518,7 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid __bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } @@ -3532,7 +3532,7 @@ __bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, __bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } @@ -3546,14 +3546,14 @@ __bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 of __bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } __bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { - return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } @@ -3561,7 +3561,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { - return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, tsk); } @@ -3569,7 +3569,7 @@ __bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { - return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign, + return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, tsk); } From 4787eaf7c17131bf0cce93336f8f411f832ad05a Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Wed, 14 Jan 2026 18:31:29 -0700 Subject: [PATCH 118/268] bpf: Add SPDX license identifiers to a few files Add GPL-2.0 SPDX-License-Identifier lines to some files, and remove a reference to COPYING, and boilerplate warranty text, from offload.c. Signed-off-by: Tim Bird Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260115013129.598705-1-tim.bird@sony.com --- kernel/bpf/offload.c | 12 +----------- kernel/bpf/ringbuf.c | 1 + kernel/bpf/token.c | 1 + 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 42ae8d595c2c..227f9b5f388b 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,16 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017-2018 Netronome Systems, Inc. - * - * This software is licensed under the GNU General License Version 2, - * June 1991 as shown in the file COPYING in the top-level directory of this - * source tree. - * - * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" - * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, - * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE - * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME - * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f6a075ffac63..35ae64ade36b 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index feecd8f4dbf9..7e4aa1e44b50 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include #include #include From 999b2395e3c32273dec98f811f0ab5c8a7441850 Mon Sep 17 00:00:00 2001 From: Gyutae Bae Date: Mon, 12 Jan 2026 12:45:16 +0900 Subject: [PATCH 119/268] bpftool: Add 'prepend' option for tcx attach to insert at chain start Add support for the 'prepend' option when attaching tcx_ingress and tcx_egress programs. This option allows inserting a BPF program at the beginning of the TCX chain instead of appending it at the end. The implementation uses BPF_F_BEFORE flag which automatically inserts the program at the beginning of the chain when no relative reference is specified. This change includes: - Modify do_attach_tcx() to support prepend insertion using BPF_F_BEFORE - Update documentation to describe the new 'prepend' option - Add bash completion support for the 'prepend' option on tcx attach types - Add example usage in the documentation - Add validation to reject 'overwrite' for non-XDP attach types The 'prepend' option is only valid for tcx_ingress and tcx_egress attach types. For XDP attach types, the existing 'overwrite' option remains available. Example usage: # bpftool net attach tcx_ingress name tc_prog dev lo prepend This feature is useful when the order of program execution in the TCX chain matters and users need to ensure certain programs run first. Co-developed-by: Siwan Kim Signed-off-by: Siwan Kim Signed-off-by: Gyutae Bae Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20260112034516.22723-1-gyutae.opensource@navercorp.com --- .../bpf/bpftool/Documentation/bpftool-net.rst | 30 +++++++++++++----- tools/bpf/bpftool/bash-completion/bpftool | 9 +++++- tools/bpf/bpftool/net.c | 31 ++++++++++++++++--- 3 files changed, 58 insertions(+), 12 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index a9ed8992800f..22da07087e42 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -24,7 +24,7 @@ NET COMMANDS ============ | **bpftool** **net** { **show** | **list** } [ **dev** *NAME* ] -| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ] +| **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** | **prepend** ] | **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME* | **bpftool** **net help** | @@ -58,11 +58,9 @@ bpftool net { show | list } [ dev *NAME* ] then all bpf programs attached to non clsact qdiscs, and finally all bpf programs attached to root and clsact qdisc. -bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] +bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite | prepend ] Attach bpf program *PROG* to network interface *NAME* with type specified - by *ATTACH_TYPE*. Previously attached bpf program can be replaced by the - command used with **overwrite** option. Currently, only XDP-related modes - are supported for *ATTACH_TYPE*. + by *ATTACH_TYPE*. *ATTACH_TYPE* can be of: **xdp** - try native XDP and fallback to generic XDP if NIC driver does not support it; @@ -72,11 +70,18 @@ bpftool net attach *ATTACH_TYPE* *PROG* dev *NAME* [ overwrite ] **tcx_ingress** - Ingress TCX. runs on ingress net traffic; **tcx_egress** - Egress TCX. runs on egress net traffic; + For XDP-related attach types (**xdp**, **xdpgeneric**, **xdpdrv**, + **xdpoffload**), the **overwrite** option can be used to replace a + previously attached bpf program. + + For **tcx_ingress** and **tcx_egress** attach types, the **prepend** option + can be used to attach the program at the beginning of the chain instead of + at the end. + bpftool net detach *ATTACH_TYPE* dev *NAME* Detach bpf program attached to network interface *NAME* with type specified by *ATTACH_TYPE*. To detach bpf program, same *ATTACH_TYPE* previously used - for attach must be specified. Currently, only XDP-related modes are - supported for *ATTACH_TYPE*. + for attach must be specified. bpftool net help Print short help message. @@ -191,6 +196,17 @@ EXAMPLES tc: lo(1) tcx/ingress tc_prog prog_id 29 +| +| **# bpftool net attach tcx_ingress name tc_prog2 dev lo prepend** +| **# bpftool net** +| + +:: + + tc: + lo(1) tcx/ingress tc_prog2 prog_id 30 + lo(1) tcx/ingress tc_prog prog_id 29 + | | **# bpftool net attach tcx_ingress name tc_prog dev lo** | **# bpftool net detach tcx_ingress dev lo** diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 53bcfeb1a76e..a28f0cc522e4 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -1142,7 +1142,14 @@ _bpftool() return 0 ;; 8) - _bpftool_once_attr 'overwrite' + case ${words[3]} in + tcx_ingress|tcx_egress) + _bpftool_once_attr 'prepend' + ;; + *) + _bpftool_once_attr 'overwrite' + ;; + esac return 0 ;; esac diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index cfc6f944f7c3..f25d66c8395e 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -666,10 +666,16 @@ static int get_tcx_type(enum net_attach_type attach_type) } } -static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex) +static int do_attach_tcx(int progfd, enum net_attach_type attach_type, int ifindex, bool prepend) { int type = get_tcx_type(attach_type); + if (prepend) { + LIBBPF_OPTS(bpf_prog_attach_opts, opts, + .flags = BPF_F_BEFORE + ); + return bpf_prog_attach_opts(progfd, ifindex, type, &opts); + } return bpf_prog_attach(progfd, ifindex, type, 0); } @@ -685,6 +691,7 @@ static int do_attach(int argc, char **argv) enum net_attach_type attach_type; int progfd, ifindex, err = 0; bool overwrite = false; + bool prepend = false; /* parse attach args */ if (!REQ_ARGS(5)) @@ -709,9 +716,25 @@ static int do_attach(int argc, char **argv) if (argc) { if (is_prefix(*argv, "overwrite")) { + if (attach_type != NET_ATTACH_TYPE_XDP && + attach_type != NET_ATTACH_TYPE_XDP_GENERIC && + attach_type != NET_ATTACH_TYPE_XDP_DRIVER && + attach_type != NET_ATTACH_TYPE_XDP_OFFLOAD) { + p_err("'overwrite' is only supported for xdp types"); + err = -EINVAL; + goto cleanup; + } overwrite = true; + } else if (is_prefix(*argv, "prepend")) { + if (attach_type != NET_ATTACH_TYPE_TCX_INGRESS && + attach_type != NET_ATTACH_TYPE_TCX_EGRESS) { + p_err("'prepend' is only supported for tcx_ingress/tcx_egress"); + err = -EINVAL; + goto cleanup; + } + prepend = true; } else { - p_err("expected 'overwrite', got: '%s'?", *argv); + p_err("expected 'overwrite' or 'prepend', got: '%s'?", *argv); err = -EINVAL; goto cleanup; } @@ -728,7 +751,7 @@ static int do_attach(int argc, char **argv) /* attach tcx prog */ case NET_ATTACH_TYPE_TCX_INGRESS: case NET_ATTACH_TYPE_TCX_EGRESS: - err = do_attach_tcx(progfd, attach_type, ifindex); + err = do_attach_tcx(progfd, attach_type, ifindex, prepend); break; default: break; @@ -985,7 +1008,7 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s { show | list } [dev ]\n" - " %1$s %2$s attach ATTACH_TYPE PROG dev [ overwrite ]\n" + " %1$s %2$s attach ATTACH_TYPE PROG dev [ overwrite | prepend ]\n" " %1$s %2$s detach ATTACH_TYPE dev \n" " %1$s %2$s help\n" "\n" From 47d440d0a5bb822f3f4e4b2479246da5efb765e6 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Thu, 15 Jan 2026 16:34:57 +0000 Subject: [PATCH 120/268] selftests/bpf: Support when CONFIG_VXLAN=m If CONFIG_VXLAN is 'm', struct vxlanhdr will not be in vmlinux.h. Add a ___local variant to support cases where vxlan is a module. Fixes: 8517b1abe5ea ("selftests/bpf: Integrate test_tc_tunnel.sh tests into test_progs") Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260115163457.146267-1-alan.maguire@oracle.com --- .../selftests/bpf/progs/test_tc_tunnel.c | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c index 7330c61b5730..7376df405a6b 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c @@ -23,7 +23,12 @@ static const int cfg_udp_src = 20000; (((__u64)len & BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) -#define L2_PAD_SZ (sizeof(struct vxlanhdr) + ETH_HLEN) +struct vxlanhdr___local { + __be32 vx_flags; + __be32 vx_vni; +}; + +#define L2_PAD_SZ (sizeof(struct vxlanhdr___local) + ETH_HLEN) #define UDP_PORT 5555 #define MPLS_OVER_UDP_PORT 6635 @@ -154,7 +159,7 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, l2_len = ETH_HLEN; if (ext_proto & EXTPROTO_VXLAN) { udp_dst = VXLAN_UDP_PORT; - l2_len += sizeof(struct vxlanhdr); + l2_len += sizeof(struct vxlanhdr___local); } else udp_dst = ETH_OVER_UDP_PORT; break; @@ -195,12 +200,12 @@ static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; if (ext_proto & EXTPROTO_VXLAN) { - struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; vxlan_hdr->vx_vni = VXLAN_VNI; - l2_hdr += sizeof(struct vxlanhdr); + l2_hdr += sizeof(struct vxlanhdr___local); } if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) @@ -285,7 +290,7 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, l2_len = ETH_HLEN; if (ext_proto & EXTPROTO_VXLAN) { udp_dst = VXLAN_UDP_PORT; - l2_len += sizeof(struct vxlanhdr); + l2_len += sizeof(struct vxlanhdr___local); } else udp_dst = ETH_OVER_UDP_PORT; break; @@ -325,12 +330,12 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; if (ext_proto & EXTPROTO_VXLAN) { - struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; + struct vxlanhdr___local *vxlan_hdr = (struct vxlanhdr___local *)l2_hdr; vxlan_hdr->vx_flags = VXLAN_FLAGS; vxlan_hdr->vx_vni = VXLAN_VNI; - l2_hdr += sizeof(struct vxlanhdr); + l2_hdr += sizeof(struct vxlanhdr___local); } if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) @@ -639,7 +644,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) olen += ETH_HLEN; break; case VXLAN_UDP_PORT: - olen += ETH_HLEN + sizeof(struct vxlanhdr); + olen += ETH_HLEN + sizeof(struct vxlanhdr___local); break; } break; From efad162f5a840ae178e7761c176c49f433c7bb68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 15 Jan 2026 21:22:45 -0800 Subject: [PATCH 121/268] selftests/bpf: Fix map_kptr test failure On my arm64 machine, I get the following failure: ... tester_init:PASS:tester_log_buf 0 nsec process_subtest:PASS:obj_open_mem 0 nsec process_subtest:PASS:specs_alloc 0 nsec serial_test_map_kptr:PASS:rcu_tasks_trace_gp__open_and_load 0 nsec ... test_map_kptr_success:PASS:map_kptr__open_and_load 0 nsec test_map_kptr_success:PASS:test_map_kptr_ref1 refcount 0 nsec test_map_kptr_success:FAIL:test_map_kptr_ref1 retval unexpected error: 2 (errno 2) test_map_kptr_success:PASS:test_map_kptr_ref2 refcount 0 nsec test_map_kptr_success:FAIL:test_map_kptr_ref2 retval unexpected error: 1 (errno 2) ... #201/21 map_kptr/success-map:FAIL In serial_test_map_kptr(), before test_map_kptr_success(), one kern_sync_rcu() is used to have some delay for freeing the map. But in my environment, one kern_sync_rcu() seems not enough and caused the test failure. In bpf_map_free_in_work() in syscall.c, the queue time for queue_work(system_dfl_wq, &map->work) may be longer than expected. This may cause the test failure since test_map_kptr_success() expects all previous maps having been freed. Since it is not clear how long queue_work() time takes, a bpf prog is added to count the reference after bpf_kfunc_call_test_acquire(). If the number of references is 2 (for initial ref and the one just acquired), all previous maps should have been released. This will resolve the above 'retval unexpected error' issue. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260116052245.3692405-1-yonghong.song@linux.dev --- .../selftests/bpf/prog_tests/map_kptr.c | 23 +++++++++++++++++++ tools/testing/selftests/bpf/progs/map_kptr.c | 18 +++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr.c b/tools/testing/selftests/bpf/prog_tests/map_kptr.c index 8743df599567..f372162c0280 100644 --- a/tools/testing/selftests/bpf/prog_tests/map_kptr.c +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr.c @@ -131,6 +131,25 @@ static int kern_sync_rcu_tasks_trace(struct rcu_tasks_trace_gp *rcu) return 0; } +static void wait_for_map_release(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, lopts); + struct map_kptr *skel; + int ret; + + skel = map_kptr__open_and_load(); + if (!ASSERT_OK_PTR(skel, "map_kptr__open_and_load")) + return; + + do { + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &lopts); + ASSERT_OK(ret, "count_ref ret"); + ASSERT_OK(lopts.retval, "count_ref retval"); + } while (skel->bss->num_of_refs != 2); + + map_kptr__destroy(skel); +} + void serial_test_map_kptr(void) { struct rcu_tasks_trace_gp *skel; @@ -148,11 +167,15 @@ void serial_test_map_kptr(void) ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace"); ASSERT_OK(kern_sync_rcu(), "sync rcu"); + wait_for_map_release(); + /* Observe refcount dropping to 1 on bpf_map_free_deferred */ test_map_kptr_success(false); ASSERT_OK(kern_sync_rcu_tasks_trace(skel), "sync rcu_tasks_trace"); ASSERT_OK(kern_sync_rcu(), "sync rcu"); + wait_for_map_release(); + /* Observe refcount dropping to 1 on synchronous delete elem */ test_map_kptr_success(true); } diff --git a/tools/testing/selftests/bpf/progs/map_kptr.c b/tools/testing/selftests/bpf/progs/map_kptr.c index edaba481db9d..e708ffbe1f61 100644 --- a/tools/testing/selftests/bpf/progs/map_kptr.c +++ b/tools/testing/selftests/bpf/progs/map_kptr.c @@ -487,6 +487,24 @@ int test_map_kptr_ref3(struct __sk_buff *ctx) return 0; } +int num_of_refs; + +SEC("syscall") +int count_ref(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long arg = 0; + + p = bpf_kfunc_call_test_acquire(&arg); + if (!p) + return 1; + + num_of_refs = p->cnt.refs.counter; + + bpf_kfunc_call_test_release(p); + return 0; +} + SEC("syscall") int test_ls_map_kptr_ref1(void *ctx) { From ef7d4e42d16f74b123c86c9195ba5136046cee57 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 16 Jan 2026 06:14:35 -0800 Subject: [PATCH 122/268] bpf: verifier: Make sync_linked_regs() scratch registers sync_linked_regs() is called after a conditional jump to propagate new bounds of a register to all its liked registers. But the verifier log only prints the state of the register that is part of the conditional jump. Make sync_linked_regs() scratch the registers whose bounds have been updated by propagation from a known register. Before: 0: (85) call bpf_get_prandom_u32#7 ; R0=scalar() 1: (57) r0 &= 255 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 2: (bf) r1 = r0 ; R0=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R1=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 3: (07) r1 += 4 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=4,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 4: (a5) if r1 < 0xa goto pc+2 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=10,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 5: (35) if r0 >= 0x6 goto pc+1 After: 0: (85) call bpf_get_prandom_u32#7 ; R0=scalar() 1: (57) r0 &= 255 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 2: (bf) r1 = r0 ; R0=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) R1=scalar(id=1,smin=smin32=0,smax=umax=smax32=umax32=255,var_off=(0x0; 0xff)) 3: (07) r1 += 4 ; R1=scalar(id=1+4,smin=umin=smin32=umin32=4,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 4: (a5) if r1 < 0xa goto pc+2 ; R0=scalar(id=1+0,smin=umin=smin32=umin32=6,smax=umax=smax32=umax32=255) R1=scalar(id=1+4,smin=umin=smin32=umin32=10,smax=umax=smax32=umax32=259,var_off=(0x0; 0x1ff)) 5: (35) if r0 >= 0x6 goto pc+1 The conditional jump in 4 updates the bound of R1 and the new bounds are propogated to R0 as it is linked with the same id, before this change, verifier only printed the state for R1 but after it prints for both R0 and R1. Suggested-by: Andrii Nakryiko Signed-off-by: Puranjay Mohan Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20260116141436.3715322-1-puranjay@kernel.org --- kernel/bpf/verifier.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9de0ec0c3ed9..76e02e13890f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16846,8 +16846,8 @@ static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, /* For all R in linked_regs, copy known_reg range into R * if R->id == known_reg->id. */ -static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_state *known_reg, - struct linked_regs *linked_regs) +static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate, + struct bpf_reg_state *known_reg, struct linked_regs *linked_regs) { struct bpf_reg_state fake_reg; struct bpf_reg_state *reg; @@ -16890,6 +16890,10 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); } + if (e->is_reg) + mark_reg_scratched(env, e->regno); + else + mark_stack_slot_scratched(env, e->spi); } } @@ -17076,13 +17080,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { - sync_linked_regs(this_branch, src_reg, &linked_regs); - sync_linked_regs(other_branch, &other_branch_regs[insn->src_reg], &linked_regs); + sync_linked_regs(env, this_branch, src_reg, &linked_regs); + sync_linked_regs(env, other_branch, &other_branch_regs[insn->src_reg], + &linked_regs); } if (dst_reg->type == SCALAR_VALUE && dst_reg->id && !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { - sync_linked_regs(this_branch, dst_reg, &linked_regs); - sync_linked_regs(other_branch, &other_branch_regs[insn->dst_reg], &linked_regs); + sync_linked_regs(env, this_branch, dst_reg, &linked_regs); + sync_linked_regs(env, other_branch, &other_branch_regs[insn->dst_reg], + &linked_regs); } /* if one pointer register is compared to another pointer From 713edc71449f122491f8860be49b40f27d5f46b5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Jan 2026 13:55:01 +0100 Subject: [PATCH 123/268] bpf: Remove leftover accounting in htab_map_mem_usage after rqspinlock After commit 4fa8d68aa53e ("bpf: Convert hashtab.c to rqspinlock") we no longer use HASHTAB_MAP_LOCK_{COUNT,MASK} as the per-CPU map_locked[HASHTAB_MAP_LOCK_COUNT] array got removed from struct bpf_htab. Right now it is still accounted for in htab_map_mem_usage. Signed-off-by: Daniel Borkmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/09703eb6bb249f12b1d5253b5a50a0c4fa239d27.1768913513.git.daniel@iogearbox.net --- kernel/bpf/hashtab.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 441ff5bc54ac..3b9d297a53be 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -82,9 +82,6 @@ struct bucket { rqspinlock_t raw_lock; }; -#define HASHTAB_MAP_LOCK_COUNT 8 -#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) - struct bpf_htab { struct bpf_map map; struct bpf_mem_alloc ma; @@ -2237,11 +2234,11 @@ static u64 htab_map_mem_usage(const struct bpf_map *map) bool prealloc = htab_is_prealloc(htab); bool percpu = htab_is_percpu(htab); bool lru = htab_is_lru(htab); - u64 num_entries; - u64 usage = sizeof(struct bpf_htab); + u64 num_entries, usage; + + usage = sizeof(struct bpf_htab) + + sizeof(struct bucket) * htab->n_buckets; - usage += sizeof(struct bucket) * htab->n_buckets; - usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; if (prealloc) { num_entries = map->max_entries; if (htab_has_extra_elems(htab)) From f81c07a6e98e3171d0c4c5ab79f5aeff71b42c44 Mon Sep 17 00:00:00 2001 From: Qiliang Yuan Date: Tue, 20 Jan 2026 10:32:34 +0800 Subject: [PATCH 124/268] bpf/verifier: Optimize ID mapping reset in states_equal Currently, reset_idmap_scratch() performs a 4.7KB memset() in every states_equal() call. Optimize this by using a counter to track used ID mappings, replacing the O(N) memset() with an O(1) reset and bounding the search loop in check_ids(). Signed-off-by: Qiliang Yuan Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20260120023234.77673-1-realwujing@gmail.com --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 130bcbd66f60..8355b585cd18 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -692,6 +692,7 @@ struct bpf_id_pair { struct bpf_idmap { u32 tmp_id_gen; + u32 cnt; struct bpf_id_pair map[BPF_ID_MAP_SIZE]; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 76e02e13890f..b67d8981b058 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19000,18 +19000,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) if (old_id == 0) /* cur_id == 0 as well */ return true; - for (i = 0; i < BPF_ID_MAP_SIZE; i++) { - if (!map[i].old) { - /* Reached an empty slot; haven't seen this id before */ - map[i].old = old_id; - map[i].cur = cur_id; - return true; - } + for (i = 0; i < idmap->cnt; i++) { if (map[i].old == old_id) return map[i].cur == cur_id; if (map[i].cur == cur_id) return false; } + + /* Reached the end of known mappings; haven't seen this id before */ + if (idmap->cnt < BPF_ID_MAP_SIZE) { + map[idmap->cnt].old = old_id; + map[idmap->cnt].cur = cur_id; + idmap->cnt++; + return true; + } + /* We ran out of idmap slots, which should be impossible */ WARN_ON_ONCE(1); return false; @@ -19520,8 +19523,10 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat static void reset_idmap_scratch(struct bpf_verifier_env *env) { - env->idmap_scratch.tmp_id_gen = env->id_gen; - memset(&env->idmap_scratch.map, 0, sizeof(env->idmap_scratch.map)); + struct bpf_idmap *idmap = &env->idmap_scratch; + + idmap->tmp_id_gen = env->id_gen; + idmap->cnt = 0; } static bool states_equal(struct bpf_verifier_env *env, From 2e6690d4f7fc41c4fae7d0a4c0bf11f1973e5650 Mon Sep 17 00:00:00 2001 From: Gyutae Bae Date: Tue, 20 Jan 2026 18:07:16 +0900 Subject: [PATCH 125/268] selftests/bpf: Add perfbuf multi-producer benchmark Add a multi-producer benchmark for perfbuf to complement the existing ringbuf multi-producer test. Unlike ringbuf which uses a shared buffer and experiences contention, perfbuf uses per-CPU buffers so the test measures scaling behavior rather than contention. This allows developers to compare perfbuf vs ringbuf performance under multi-producer workloads when choosing between the two for their systems. Signed-off-by: Gyutae Bae Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260120090716.82927-1-gyutae.opensource@navercorp.com --- tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh index 83e05e837871..123b7feb6935 100755 --- a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh +++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh @@ -49,6 +49,11 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)" done +header "Perfbuf, multi-producer" +for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do + summarize "pb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 50 --rb-sample-rate 50 pb-libbpf)" +done + header "Ringbuf, multi-producer contention in overwrite mode, no consumer" for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 --rb-overwrite --rb-bench-producer rb-libbpf)" From ea073d1818e228440275cc90047b4ef0fddd6eb5 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:26 -0800 Subject: [PATCH 126/268] bpf: Refactor btf_kfunc_id_set_contains btf_kfunc_id_set_contains() is called by fetch_kfunc_meta() in the BPF verifier to get the kfunc flags stored in the .BTF_ids ELF section. If it returns NULL instead of a valid pointer, it's interpreted as an illegal kfunc usage failing the verification. There are two potential reasons for btf_kfunc_id_set_contains() to return NULL: 1. Provided kfunc BTF id is not present in relevant kfunc id sets. 2. The kfunc is not allowed, as determined by the program type specific filter [1]. The filter functions accept a pointer to `struct bpf_prog`, so they might implicitly depend on earlier stages of verification, when bpf_prog members are set. For example, bpf_qdisc_kfunc_filter() in linux/net/sched/bpf_qdisc.c inspects prog->aux->st_ops [2], which is initialized in: check_attach_btf_id() -> check_struct_ops_btf_id() So far this hasn't been an issue, because fetch_kfunc_meta() is the only caller of btf_kfunc_id_set_contains(). However in subsequent patches of this series it is necessary to inspect kfunc flags earlier in BPF verifier, in the add_kfunc_call(). To resolve this, refactor btf_kfunc_id_set_contains() into two interface functions: * btf_kfunc_flags() that simply returns pointer to kfunc_flags without applying the filters * btf_kfunc_is_allowed() that both checks for kfunc_flags existence (which is a requirement for a kfunc to be allowed) and applies the prog filters See [3] for the previous version of this patch. [1] https://lore.kernel.org/all/20230519225157.760788-7-aditi.ghag@isovalent.com/ [2] https://lore.kernel.org/all/20250409214606.2000194-4-ameryhung@gmail.com/ [3] https://lore.kernel.org/bpf/20251029190113.3323406-3-ihor.solodrai@linux.dev/ Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-2-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 4 +-- kernel/bpf/btf.c | 70 ++++++++++++++++++++++++++++++++----------- kernel/bpf/verifier.c | 6 ++-- 3 files changed, 58 insertions(+), 22 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index 78dc79810c7d..a2f4f383f5b6 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -575,8 +575,8 @@ const char *btf_name_by_offset(const struct btf *btf, u32 offset); const char *btf_str_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); -u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id, - const struct bpf_prog *prog); +u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); +bool btf_kfunc_is_allowed(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 364dd84bfc5a..d10b3404260f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8757,24 +8757,17 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, return ret; } -static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, - enum btf_kfunc_hook hook, - u32 kfunc_btf_id, - const struct bpf_prog *prog) +static u32 *btf_kfunc_id_set_contains(const struct btf *btf, + enum btf_kfunc_hook hook, + u32 kfunc_btf_id) { - struct btf_kfunc_hook_filter *hook_filter; struct btf_id_set8 *set; - u32 *id, i; + u32 *id; if (hook >= BTF_KFUNC_HOOK_MAX) return NULL; if (!btf->kfunc_set_tab) return NULL; - hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; - for (i = 0; i < hook_filter->nr_filters; i++) { - if (hook_filter->filters[i](prog, kfunc_btf_id)) - return NULL; - } set = btf->kfunc_set_tab->sets[hook]; if (!set) return NULL; @@ -8785,6 +8778,28 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, return id + 1; } +static bool __btf_kfunc_is_allowed(const struct btf *btf, + enum btf_kfunc_hook hook, + u32 kfunc_btf_id, + const struct bpf_prog *prog) +{ + struct btf_kfunc_hook_filter *hook_filter; + int i; + + if (hook >= BTF_KFUNC_HOOK_MAX) + return false; + if (!btf->kfunc_set_tab) + return false; + + hook_filter = &btf->kfunc_set_tab->hook_filters[hook]; + for (i = 0; i < hook_filter->nr_filters; i++) { + if (hook_filter->filters[i](prog, kfunc_btf_id)) + return false; + } + + return true; +} + static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { switch (prog_type) { @@ -8832,6 +8847,26 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) } } +bool btf_kfunc_is_allowed(const struct btf *btf, + u32 kfunc_btf_id, + const struct bpf_prog *prog) +{ + enum bpf_prog_type prog_type = resolve_prog_type(prog); + enum btf_kfunc_hook hook; + u32 *kfunc_flags; + + kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); + if (kfunc_flags && __btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog)) + return true; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + kfunc_flags = btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); + if (kfunc_flags && __btf_kfunc_is_allowed(btf, hook, kfunc_btf_id, prog)) + return true; + + return false; +} + /* Caution: * Reference to the module (obtained using btf_try_get_module) corresponding to * the struct btf *MUST* be held when calling this function from verifier @@ -8839,26 +8874,27 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) * keeping the reference for the duration of the call provides the necessary * protection for looking up a well-formed btf->kfunc_set_tab. */ -u32 *btf_kfunc_id_set_contains(const struct btf *btf, - u32 kfunc_btf_id, - const struct bpf_prog *prog) +u32 *btf_kfunc_flags(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { enum bpf_prog_type prog_type = resolve_prog_type(prog); enum btf_kfunc_hook hook; u32 *kfunc_flags; - kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id, prog); + kfunc_flags = btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); if (kfunc_flags) return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); - return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id, prog); + return btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id, const struct bpf_prog *prog) { - return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog); + if (!__btf_kfunc_is_allowed(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id, prog)) + return NULL; + + return btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); } static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b67d8981b058..c344371feb33 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13723,10 +13723,10 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, *kfunc_name = func_name; func_proto = btf_type_by_id(desc_btf, func->type); - kfunc_flags = btf_kfunc_id_set_contains(desc_btf, func_id, env->prog); - if (!kfunc_flags) { + if (!btf_kfunc_is_allowed(desc_btf, func_id, env->prog)) return -EACCES; - } + + kfunc_flags = btf_kfunc_flags(desc_btf, func_id, env->prog); memset(meta, 0, sizeof(*meta)); meta->btf = desc_btf; From 08ca87d6324350a7abf5f05db5b63df9420dd29d Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:27 -0800 Subject: [PATCH 127/268] bpf: Introduce struct bpf_kfunc_meta There is code duplication between add_kfunc_call() and fetch_kfunc_meta() collecting information about a kfunc from BTF. Introduce struct bpf_kfunc_meta to hold common kfunc BTF data and implement fetch_kfunc_meta() to fill it in, instead of struct bpf_kfunc_call_arg_meta directly. Then use these in add_kfunc_call() and (new) fetch_kfunc_arg_meta() functions, and fixup previous usages of fetch_kfunc_meta() to fetch_kfunc_arg_meta(). Besides the code dedup, this change enables add_kfunc_call() to access kfunc->flags. Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-3-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 160 ++++++++++++++++++++++++------------------ 1 file changed, 93 insertions(+), 67 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c344371feb33..87f0e113b356 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -294,6 +294,14 @@ struct bpf_call_arg_meta { s64 const_map_key; }; +struct bpf_kfunc_meta { + struct btf *btf; + const struct btf_type *proto; + const char *name; + const u32 *flags; + s32 id; +}; + struct bpf_kfunc_call_arg_meta { /* In parameters */ struct btf *btf; @@ -3263,16 +3271,68 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) return btf_vmlinux ?: ERR_PTR(-ENOENT); } -static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) +static int fetch_kfunc_meta(struct bpf_verifier_env *env, + s32 func_id, + s16 offset, + struct bpf_kfunc_meta *kfunc) { const struct btf_type *func, *func_proto; + const char *func_name; + u32 *kfunc_flags; + struct btf *btf; + + if (func_id <= 0) { + verbose(env, "invalid kernel function btf_id %d\n", func_id); + return -EINVAL; + } + + btf = find_kfunc_desc_btf(env, offset); + if (IS_ERR(btf)) { + verbose(env, "failed to find BTF for kernel function\n"); + return PTR_ERR(btf); + } + + /* + * Note that kfunc_flags may be NULL at this point, which + * means that we couldn't find func_id in any relevant + * kfunc_id_set. This most likely indicates an invalid kfunc + * call. However we don't fail with an error here, + * and let the caller decide what to do with NULL kfunc->flags. + */ + kfunc_flags = btf_kfunc_flags(btf, func_id, env->prog); + + func = btf_type_by_id(btf, func_id); + if (!func || !btf_type_is_func(func)) { + verbose(env, "kernel btf_id %d is not a function\n", func_id); + return -EINVAL; + } + + func_name = btf_name_by_offset(btf, func->name_off); + func_proto = btf_type_by_id(btf, func->type); + if (!func_proto || !btf_type_is_func_proto(func_proto)) { + verbose(env, "kernel function btf_id %d does not have a valid func_proto\n", + func_id); + return -EINVAL; + } + + memset(kfunc, 0, sizeof(*kfunc)); + kfunc->btf = btf; + kfunc->id = func_id; + kfunc->name = func_name; + kfunc->proto = func_proto; + kfunc->flags = kfunc_flags; + + return 0; +} + +static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) +{ struct bpf_kfunc_btf_tab *btf_tab; struct btf_func_model func_model; struct bpf_kfunc_desc_tab *tab; struct bpf_prog_aux *prog_aux; + struct bpf_kfunc_meta kfunc; struct bpf_kfunc_desc *desc; - const char *func_name; - struct btf *desc_btf; unsigned long addr; int err; @@ -3322,12 +3382,6 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, offset); - if (IS_ERR(desc_btf)) { - verbose(env, "failed to find BTF for kernel function\n"); - return PTR_ERR(desc_btf); - } - if (find_kfunc_desc(env->prog, func_id, offset)) return 0; @@ -3336,24 +3390,13 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -E2BIG; } - func = btf_type_by_id(desc_btf, func_id); - if (!func || !btf_type_is_func(func)) { - verbose(env, "kernel btf_id %u is not a function\n", - func_id); - return -EINVAL; - } - func_proto = btf_type_by_id(desc_btf, func->type); - if (!func_proto || !btf_type_is_func_proto(func_proto)) { - verbose(env, "kernel function btf_id %u does not have a valid func_proto\n", - func_id); - return -EINVAL; - } + err = fetch_kfunc_meta(env, func_id, offset, &kfunc); + if (err) + return err; - func_name = btf_name_by_offset(desc_btf, func->name_off); - addr = kallsyms_lookup_name(func_name); + addr = kallsyms_lookup_name(kfunc.name); if (!addr) { - verbose(env, "cannot find address for kernel function %s\n", - func_name); + verbose(env, "cannot find address for kernel function %s\n", kfunc.name); return -EINVAL; } @@ -3363,9 +3406,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return err; } - err = btf_distill_func_proto(&env->log, desc_btf, - func_proto, func_name, - &func_model); + err = btf_distill_func_proto(&env->log, kfunc.btf, kfunc.proto, kfunc.name, &func_model); if (err) return err; @@ -13696,44 +13737,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return 0; } -static int fetch_kfunc_meta(struct bpf_verifier_env *env, - struct bpf_insn *insn, - struct bpf_kfunc_call_arg_meta *meta, - const char **kfunc_name) +static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env, + s32 func_id, + s16 offset, + struct bpf_kfunc_call_arg_meta *meta) { - const struct btf_type *func, *func_proto; - u32 func_id, *kfunc_flags; - const char *func_name; - struct btf *desc_btf; + struct bpf_kfunc_meta kfunc; + int err; - if (kfunc_name) - *kfunc_name = NULL; - - if (!insn->imm) - return -EINVAL; - - desc_btf = find_kfunc_desc_btf(env, insn->off); - if (IS_ERR(desc_btf)) - return PTR_ERR(desc_btf); - - func_id = insn->imm; - func = btf_type_by_id(desc_btf, func_id); - func_name = btf_name_by_offset(desc_btf, func->name_off); - if (kfunc_name) - *kfunc_name = func_name; - func_proto = btf_type_by_id(desc_btf, func->type); - - if (!btf_kfunc_is_allowed(desc_btf, func_id, env->prog)) - return -EACCES; - - kfunc_flags = btf_kfunc_flags(desc_btf, func_id, env->prog); + err = fetch_kfunc_meta(env, func_id, offset, &kfunc); + if (err) + return err; memset(meta, 0, sizeof(*meta)); - meta->btf = desc_btf; - meta->func_id = func_id; - meta->kfunc_flags = *kfunc_flags; - meta->func_proto = func_proto; - meta->func_name = func_name; + meta->btf = kfunc.btf; + meta->func_id = kfunc.id; + meta->func_proto = kfunc.proto; + meta->func_name = kfunc.name; + + if (!kfunc.flags || !btf_kfunc_is_allowed(kfunc.btf, kfunc.id, env->prog)) + return -EACCES; + + meta->kfunc_flags = *kfunc.flags; return 0; } @@ -13938,12 +13963,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (!insn->imm) return 0; - err = fetch_kfunc_meta(env, insn, &meta, &func_name); - if (err == -EACCES && func_name) - verbose(env, "calling kernel function %s is not allowed\n", func_name); + err = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); + if (err == -EACCES && meta.func_name) + verbose(env, "calling kernel function %s is not allowed\n", meta.func_name); if (err) return err; desc_btf = meta.btf; + func_name = meta.func_name; insn_aux = &env->insn_aux_data[insn_idx]; insn_aux->is_iter_next = is_iter_next_kfunc(&meta); @@ -17789,7 +17815,7 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call if (bpf_pseudo_kfunc_call(call)) { int err; - err = fetch_kfunc_meta(env, call, &meta, NULL); + err = fetch_kfunc_arg_meta(env, call->imm, call->off, &meta); if (err < 0) /* error would be reported later */ return false; @@ -18297,7 +18323,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; - ret = fetch_kfunc_meta(env, insn, &meta, NULL); + ret = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); if (ret == 0 && is_iter_next_kfunc(&meta)) { mark_prune_point(env, t); /* Checking and saving state checkpoints at iter_next() call From 64e1360524b9ef5835714669b5876e122a23e6fc Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:28 -0800 Subject: [PATCH 128/268] bpf: Verifier support for KF_IMPLICIT_ARGS A kernel function bpf_foo marked with KF_IMPLICIT_ARGS flag is expected to have two associated types in BTF: * `bpf_foo` with a function prototype that omits implicit arguments * `bpf_foo_impl` with a function prototype that matches the kernel declaration of `bpf_foo`, but doesn't have a ksym associated with its name In order to support kfuncs with implicit arguments, the verifier has to know how to resolve a call of `bpf_foo` to the correct BTF function prototype and address. To implement this, in add_kfunc_call() kfunc flags are checked for KF_IMPLICIT_ARGS. For such kfuncs a BTF func prototype is adjusted to the one found for `bpf_foo_impl` (func_name + "_impl" suffix, by convention) function in BTF. This effectively changes the signature of the `bpf_foo` kfunc in the context of verification: from one without implicit args to the one with full argument list. The values of implicit arguments by design are provided by the verifier, and so they can only be of particular types. In this patch the only allowed implicit arg type is a pointer to struct bpf_prog_aux. In order for the verifier to correctly set an implicit bpf_prog_aux arg value at runtime, is_kfunc_arg_prog() is extended to check for the arg type. At a point when prog arg is determined in check_kfunc_args() the kfunc with implicit args already has a prototype with full argument list, so the existing value patch mechanism just works. If a new kfunc with KF_IMPLICIT_ARG is declared for an existing kfunc that uses a __prog argument (a legacy case), the prototype substitution works in exactly the same way, assuming the kfunc follows the _impl naming convention. The difference is only in how _impl prototype is added to the BTF, which is not the verifier's concern. See a subsequent resolve_btfids patch for details. __prog suffix is still supported at this point, but will be removed in a subsequent patch, after current users are moved to KF_IMPLICIT_ARGS. Introduction of KF_IMPLICIT_ARGS revealed an issue with zero-extension tracking, because an explicit rX = 0 in place of the verifier-supplied argument is now absent if the arg is implicit (the BPF prog doesn't pass a dummy NULL anymore). To mitigate this, reset the subreg_def of all caller saved registers in check_kfunc_call() [1]. [1] https://lore.kernel.org/bpf/b4a760ef828d40dac7ea6074d39452bb0dc82caa.camel@gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-4-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + kernel/bpf/verifier.c | 58 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index a2f4f383f5b6..48108471c5b1 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -78,6 +78,7 @@ #define KF_ARENA_RET (1 << 13) /* kfunc returns an arena pointer */ #define KF_ARENA_ARG1 (1 << 14) /* kfunc takes an arena pointer as its first argument */ #define KF_ARENA_ARG2 (1 << 15) /* kfunc takes an arena pointer as its second argument */ +#define KF_IMPLICIT_ARGS (1 << 16) /* kfunc has implicit arguments supplied by the verifier */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87f0e113b356..adc24a2ce5b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3271,6 +3271,34 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, s16 offset) return btf_vmlinux ?: ERR_PTR(-ENOENT); } +#define KF_IMPL_SUFFIX "_impl" + +static const struct btf_type *find_kfunc_impl_proto(struct bpf_verifier_env *env, + struct btf *btf, + const char *func_name) +{ + char *buf = env->tmp_str_buf; + const struct btf_type *func; + s32 impl_id; + int len; + + len = snprintf(buf, TMP_STR_BUF_LEN, "%s%s", func_name, KF_IMPL_SUFFIX); + if (len < 0 || len >= TMP_STR_BUF_LEN) { + verbose(env, "function name %s%s is too long\n", func_name, KF_IMPL_SUFFIX); + return NULL; + } + + impl_id = btf_find_by_name_kind(btf, buf, BTF_KIND_FUNC); + if (impl_id <= 0) { + verbose(env, "cannot find function %s in BTF\n", buf); + return NULL; + } + + func = btf_type_by_id(btf, impl_id); + + return btf_type_by_id(btf, func->type); +} + static int fetch_kfunc_meta(struct bpf_verifier_env *env, s32 func_id, s16 offset, @@ -3308,7 +3336,16 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, } func_name = btf_name_by_offset(btf, func->name_off); - func_proto = btf_type_by_id(btf, func->type); + + /* + * An actual prototype of a kfunc with KF_IMPLICIT_ARGS flag + * can be found through the counterpart _impl kfunc. + */ + if (kfunc_flags && (*kfunc_flags & KF_IMPLICIT_ARGS)) + func_proto = find_kfunc_impl_proto(env, btf, func_name); + else + func_proto = btf_type_by_id(btf, func->type); + if (!func_proto || !btf_type_is_func_proto(func_proto)) { verbose(env, "kernel function btf_id %d does not have a valid func_proto\n", func_id); @@ -12174,9 +12211,11 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } +static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg); + static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) { - return btf_param_match_suffix(btf, arg, "__prog"); + return btf_param_match_suffix(btf, arg, "__prog") || is_kfunc_arg_prog_aux(btf, arg); } static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, @@ -12207,6 +12246,7 @@ enum { KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, KF_ARG_TASK_WORK_ID, + KF_ARG_PROG_AUX_ID }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12218,6 +12258,7 @@ BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) BTF_ID(struct, bpf_task_work) +BTF_ID(struct, bpf_prog_aux) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12298,6 +12339,11 @@ static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf return true; } +static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID); +} + /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, @@ -14177,8 +14223,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - for (i = 0; i < CALLER_SAVED_REGS; i++) - mark_reg_not_init(env, regs, caller_saved[i]); + for (i = 0; i < CALLER_SAVED_REGS; i++) { + u32 regno = caller_saved[i]; + + mark_reg_not_init(env, regs, regno); + regs[regno].subreg_def = DEF_NOT_SUBREG; + } /* Check return type */ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); From 2583e81fd8852e6cf6730c4463b6580deb7a8aad Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:29 -0800 Subject: [PATCH 129/268] resolve_btfids: Introduce finalize_btf() step Since recently [1][2] resolve_btfids executes final adjustments to the kernel/module BTF before it's embedded into the target binary. To keep the implementation simple, a clear and stable "pipeline" of how BTF data flows through resolve_btfids would be helpful. Some BTF modifications may change the ids of the types, so it is important to maintain correct order of operations with respect to .BTF_ids resolution too. This patch refactors the BTF handling to establish the following sequence: - load target ELF sections - load .BTF_ids symbols - this will be a dependency of btf2btf transformations in subsequent patches - load BTF and its base as is - (*) btf2btf transformations will happen here - finalize_btf(), introduced in this patch - does distill base and sort BTF - resolve and patch .BTF_ids This approach helps to avoid fixups in .BTF_ids data in case the ids change at any point of BTF processing, because symbol resolution happens on the finalized, ready to dump, BTF data. This also gives flexibility in BTF transformations, because they will happen on BTF that is not distilled and/or sorted yet, allowing to freely add, remove and modify BTF types. [1] https://lore.kernel.org/bpf/20251219181321.1283664-1-ihor.solodrai@linux.dev/ [2] https://lore.kernel.org/bpf/20260109130003.3313716-1-dolinux.peng@gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-5-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/resolve_btfids/main.c | 69 +++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 343d08050116..1fcf37af6764 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -563,19 +563,6 @@ static int load_btf(struct object *obj) obj->base_btf = base_btf; obj->btf = btf; - if (obj->base_btf && obj->distill_base) { - err = btf__distill_base(obj->btf, &base_btf, &btf); - if (err) { - pr_err("FAILED to distill base BTF: %s\n", strerror(errno)); - goto out_err; - } - - btf__free(obj->base_btf); - btf__free(obj->btf); - obj->base_btf = base_btf; - obj->btf = btf; - } - return 0; out_err: @@ -911,6 +898,41 @@ static int sort_btf_by_name(struct btf *btf) return err; } +static int finalize_btf(struct object *obj) +{ + struct btf *base_btf = obj->base_btf, *btf = obj->btf; + int err; + + if (obj->base_btf && obj->distill_base) { + err = btf__distill_base(obj->btf, &base_btf, &btf); + if (err) { + pr_err("FAILED to distill base BTF: %s\n", strerror(errno)); + goto out_err; + } + + btf__free(obj->base_btf); + btf__free(obj->btf); + obj->base_btf = base_btf; + obj->btf = btf; + } + + err = sort_btf_by_name(obj->btf); + if (err) { + pr_err("FAILED to sort BTF: %s\n", strerror(errno)); + goto out_err; + } + + return 0; + +out_err: + btf__free(base_btf); + btf__free(btf); + obj->base_btf = NULL; + obj->btf = NULL; + + return err; +} + static inline int make_out_path(char *buf, u32 buf_sz, const char *in_path, const char *suffix) { int len = snprintf(buf, buf_sz, "%s%s", in_path, suffix); @@ -1054,6 +1076,7 @@ int main(int argc, const char **argv) }; const char *btfids_path = NULL; bool fatal_warnings = false; + bool resolve_btfids = true; char out_path[PATH_MAX]; struct option btfid_options[] = { @@ -1083,12 +1106,6 @@ int main(int argc, const char **argv) if (btfids_path) return patch_btfids(btfids_path, obj.path); - if (load_btf(&obj)) - goto out; - - if (sort_btf_by_name(obj.btf)) - goto out; - if (elf_collect(&obj)) goto out; @@ -1099,12 +1116,22 @@ int main(int argc, const char **argv) if (obj.efile.idlist_shndx == -1 || obj.efile.symbols_shndx == -1) { pr_debug("Cannot find .BTF_ids or symbols sections, skip symbols resolution\n"); - goto dump_btf; + resolve_btfids = false; } - if (symbols_collect(&obj)) + if (resolve_btfids) + if (symbols_collect(&obj)) + goto out; + + if (load_btf(&obj)) goto out; + if (finalize_btf(&obj)) + goto out; + + if (!resolve_btfids) + goto dump_btf; + if (symbols_resolve(&obj)) goto out; From 9d199965990c0b21160c565076b93725d6638c28 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:30 -0800 Subject: [PATCH 130/268] resolve_btfids: Support for KF_IMPLICIT_ARGS Implement BTF modifications in resolve_btfids to support BPF kernel functions with implicit arguments. For a kfunc marked with KF_IMPLICIT_ARGS flag, a new function prototype is added to BTF that does not have implicit arguments. The kfunc's prototype is then updated to a new one in BTF. This prototype is the intended interface for the BPF programs. A _impl function is added to BTF to make the original kfunc prototype searchable for the BPF verifier. If a _impl function already exists in BTF, its interpreted as a legacy case, and this step is skipped. Whether an argument is implicit is determined by its type: currently only `struct bpf_prog_aux *` is supported. As a result, the BTF associated with kfunc is changed from __bpf_kfunc bpf_foo(int arg1, struct bpf_prog_aux *aux); into bpf_foo_impl(int arg1, struct bpf_prog_aux *aux); __bpf_kfunc bpf_foo(int arg1); For more context see previous discussions and patches [1][2]. [1] https://lore.kernel.org/dwarves/ba1650aa-fafd-49a8-bea4-bdddee7c38c9@linux.dev/ [2] https://lore.kernel.org/bpf/20251029190113.3323406-1-ihor.solodrai@linux.dev/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-6-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/bpf/resolve_btfids/main.c | 382 ++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 1fcf37af6764..db8d1554bdcc 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -152,6 +152,25 @@ struct object { int nr_typedefs; }; +#define KF_IMPLICIT_ARGS (1 << 16) +#define KF_IMPL_SUFFIX "_impl" + +struct kfunc { + const char *name; + u32 btf_id; + u32 flags; +}; + +struct btf2btf_context { + struct btf *btf; + u32 *decl_tags; + u32 nr_decl_tags; + u32 max_decl_tags; + struct kfunc *kfuncs; + u32 nr_kfuncs; + u32 max_kfuncs; +}; + static int verbose; static int warnings; @@ -837,6 +856,366 @@ static int dump_raw_btf(struct btf *btf, const char *out_path) return 0; } +static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, s32 type_id) +{ + const struct btf_type *t = btf__type_by_id(btf, type_id); + + while (btf_is_mod(t)) + t = btf__type_by_id(btf, t->type); + + return t; +} + +static int push_decl_tag_id(struct btf2btf_context *ctx, u32 decl_tag_id) +{ + u32 *arr = ctx->decl_tags; + u32 cap = ctx->max_decl_tags; + + if (ctx->nr_decl_tags + 1 > cap) { + cap = max(cap + 256, cap * 2); + arr = realloc(arr, sizeof(u32) * cap); + if (!arr) + return -ENOMEM; + ctx->max_decl_tags = cap; + ctx->decl_tags = arr; + } + + ctx->decl_tags[ctx->nr_decl_tags++] = decl_tag_id; + + return 0; +} + +static int push_kfunc(struct btf2btf_context *ctx, struct kfunc *kfunc) +{ + struct kfunc *arr = ctx->kfuncs; + u32 cap = ctx->max_kfuncs; + + if (ctx->nr_kfuncs + 1 > cap) { + cap = max(cap + 256, cap * 2); + arr = realloc(arr, sizeof(struct kfunc) * cap); + if (!arr) + return -ENOMEM; + ctx->max_kfuncs = cap; + ctx->kfuncs = arr; + } + + ctx->kfuncs[ctx->nr_kfuncs++] = *kfunc; + + return 0; +} + +static int collect_decl_tags(struct btf2btf_context *ctx) +{ + const u32 type_cnt = btf__type_cnt(ctx->btf); + struct btf *btf = ctx->btf; + const struct btf_type *t; + int err; + + for (u32 id = 1; id < type_cnt; id++) { + t = btf__type_by_id(btf, id); + if (!btf_is_decl_tag(t)) + continue; + err = push_decl_tag_id(ctx, id); + if (err) + return err; + } + + return 0; +} + +/* + * To find the kfunc flags having its struct btf_id (with ELF addresses) + * we need to find the address that is in range of a set8. + * If a set8 is found, then the flags are located at addr + 4 bytes. + * Return 0 (no flags!) if not found. + */ +static u32 find_kfunc_flags(struct object *obj, struct btf_id *kfunc_id) +{ + const u32 *elf_data_ptr = obj->efile.idlist->d_buf; + u64 set_lower_addr, set_upper_addr, addr; + struct btf_id *set_id; + struct rb_node *next; + u32 flags; + u64 idx; + + for (next = rb_first(&obj->sets); next; next = rb_next(next)) { + set_id = rb_entry(next, struct btf_id, rb_node); + if (set_id->kind != BTF_ID_KIND_SET8 || set_id->addr_cnt != 1) + continue; + + set_lower_addr = set_id->addr[0]; + set_upper_addr = set_lower_addr + set_id->cnt * sizeof(u64); + + for (u32 i = 0; i < kfunc_id->addr_cnt; i++) { + addr = kfunc_id->addr[i]; + /* + * Lower bound is exclusive to skip the 8-byte header of the set. + * Upper bound is inclusive to capture the last entry at offset 8*cnt. + */ + if (set_lower_addr < addr && addr <= set_upper_addr) { + pr_debug("found kfunc %s in BTF_ID_FLAGS %s\n", + kfunc_id->name, set_id->name); + idx = addr - obj->efile.idlist_addr; + idx = idx / sizeof(u32) + 1; + flags = elf_data_ptr[idx]; + + return flags; + } + } + } + + return 0; +} + +static int collect_kfuncs(struct object *obj, struct btf2btf_context *ctx) +{ + const char *tag_name, *func_name; + struct btf *btf = ctx->btf; + const struct btf_type *t; + u32 flags, func_id; + struct kfunc kfunc; + struct btf_id *id; + int err; + + if (ctx->nr_decl_tags == 0) + return 0; + + for (u32 i = 0; i < ctx->nr_decl_tags; i++) { + t = btf__type_by_id(btf, ctx->decl_tags[i]); + if (btf_kflag(t) || btf_decl_tag(t)->component_idx != -1) + continue; + + tag_name = btf__name_by_offset(btf, t->name_off); + if (strcmp(tag_name, "bpf_kfunc") != 0) + continue; + + func_id = t->type; + t = btf__type_by_id(btf, func_id); + if (!btf_is_func(t)) + continue; + + func_name = btf__name_by_offset(btf, t->name_off); + if (!func_name) + continue; + + id = btf_id__find(&obj->funcs, func_name); + if (!id || id->kind != BTF_ID_KIND_SYM) + continue; + + flags = find_kfunc_flags(obj, id); + + kfunc.name = id->name; + kfunc.btf_id = func_id; + kfunc.flags = flags; + + err = push_kfunc(ctx, &kfunc); + if (err) + return err; + } + + return 0; +} + +static int build_btf2btf_context(struct object *obj, struct btf2btf_context *ctx) +{ + int err; + + ctx->btf = obj->btf; + + err = collect_decl_tags(ctx); + if (err) { + pr_err("ERROR: resolve_btfids: failed to collect decl tags from BTF\n"); + return err; + } + + err = collect_kfuncs(obj, ctx); + if (err) { + pr_err("ERROR: resolve_btfids: failed to collect kfuncs from BTF\n"); + return err; + } + + return 0; +} + + +/* Implicit BPF kfunc arguments can only be of particular types */ +static bool is_kf_implicit_arg(const struct btf *btf, const struct btf_param *p) +{ + static const char *const kf_implicit_arg_types[] = { + "bpf_prog_aux", + }; + const struct btf_type *t; + const char *name; + + t = btf_type_skip_qualifiers(btf, p->type); + if (!btf_is_ptr(t)) + return false; + + t = btf_type_skip_qualifiers(btf, t->type); + if (!btf_is_struct(t)) + return false; + + name = btf__name_by_offset(btf, t->name_off); + if (!name) + return false; + + for (int i = 0; i < ARRAY_SIZE(kf_implicit_arg_types); i++) + if (strcmp(name, kf_implicit_arg_types[i]) == 0) + return true; + + return false; +} + +/* + * For a kfunc with KF_IMPLICIT_ARGS we do the following: + * 1. Add a new function with _impl suffix in the name, with the prototype + * of the original kfunc. + * 2. Add all decl tags except "bpf_kfunc" for the _impl func. + * 3. Add a new function prototype with modified list of arguments: + * omitting implicit args. + * 4. Change the prototype of the original kfunc to the new one. + * + * This way we transform the BTF associated with the kfunc from + * __bpf_kfunc bpf_foo(int arg1, void *implicit_arg); + * into + * bpf_foo_impl(int arg1, void *implicit_arg); + * __bpf_kfunc bpf_foo(int arg1); + * + * If a kfunc with KF_IMPLICIT_ARGS already has an _impl counterpart + * in BTF, then it's a legacy case: an _impl function is declared in the + * source code. In this case, we can skip adding an _impl function, but we + * still have to add a func prototype that omits implicit args. + */ +static int process_kfunc_with_implicit_args(struct btf2btf_context *ctx, struct kfunc *kfunc) +{ + s32 idx, new_proto_id, new_func_id, proto_id; + const char *param_name, *tag_name; + const struct btf_param *params; + enum btf_func_linkage linkage; + char tmp_name[KSYM_NAME_LEN]; + struct btf *btf = ctx->btf; + int err, len, nr_params; + struct btf_type *t; + + t = (struct btf_type *)btf__type_by_id(btf, kfunc->btf_id); + if (!t || !btf_is_func(t)) { + pr_err("ERROR: resolve_btfids: btf id %d is not a function\n", kfunc->btf_id); + return -EINVAL; + } + + linkage = btf_vlen(t); + + proto_id = t->type; + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + if (!t || !btf_is_func_proto(t)) { + pr_err("ERROR: resolve_btfids: btf id %d is not a function prototype\n", proto_id); + return -EINVAL; + } + + len = snprintf(tmp_name, sizeof(tmp_name), "%s%s", kfunc->name, KF_IMPL_SUFFIX); + if (len < 0 || len >= sizeof(tmp_name)) { + pr_err("ERROR: function name is too long: %s%s\n", kfunc->name, KF_IMPL_SUFFIX); + return -E2BIG; + } + + if (btf__find_by_name_kind(btf, tmp_name, BTF_KIND_FUNC) > 0) { + pr_debug("resolve_btfids: function %s already exists in BTF\n", tmp_name); + goto add_new_proto; + } + + /* Add a new function with _impl suffix and original prototype */ + new_func_id = btf__add_func(btf, tmp_name, linkage, proto_id); + if (new_func_id < 0) { + pr_err("ERROR: resolve_btfids: failed to add func %s to BTF\n", tmp_name); + return new_func_id; + } + + /* Copy all decl tags except "bpf_kfunc" from the original kfunc to the new one */ + for (int i = 0; i < ctx->nr_decl_tags; i++) { + t = (struct btf_type *)btf__type_by_id(btf, ctx->decl_tags[i]); + if (t->type != kfunc->btf_id) + continue; + + tag_name = btf__name_by_offset(btf, t->name_off); + if (strcmp(tag_name, "bpf_kfunc") == 0) + continue; + + idx = btf_decl_tag(t)->component_idx; + + if (btf_kflag(t)) + err = btf__add_decl_attr(btf, tag_name, new_func_id, idx); + else + err = btf__add_decl_tag(btf, tag_name, new_func_id, idx); + + if (err < 0) { + pr_err("ERROR: resolve_btfids: failed to add decl tag %s for %s\n", + tag_name, tmp_name); + return -EINVAL; + } + } + +add_new_proto: + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + new_proto_id = btf__add_func_proto(btf, t->type); + if (new_proto_id < 0) { + pr_err("ERROR: resolve_btfids: failed to add func proto for %s\n", kfunc->name); + return new_proto_id; + } + + /* Add non-implicit args to the new prototype */ + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + nr_params = btf_vlen(t); + for (int i = 0; i < nr_params; i++) { + params = btf_params(t); + if (is_kf_implicit_arg(btf, ¶ms[i])) + break; + param_name = btf__name_by_offset(btf, params[i].name_off); + err = btf__add_func_param(btf, param_name, params[i].type); + if (err < 0) { + pr_err("ERROR: resolve_btfids: failed to add param %s for %s\n", + param_name, kfunc->name); + return err; + } + t = (struct btf_type *)btf__type_by_id(btf, proto_id); + } + + /* Finally change the prototype of the original kfunc to the new one */ + t = (struct btf_type *)btf__type_by_id(btf, kfunc->btf_id); + t->type = new_proto_id; + + pr_debug("resolve_btfids: updated BTF for kfunc with implicit args %s\n", kfunc->name); + + return 0; +} + +static int btf2btf(struct object *obj) +{ + struct btf2btf_context ctx = {}; + int err; + + err = build_btf2btf_context(obj, &ctx); + if (err) + goto out; + + for (u32 i = 0; i < ctx.nr_kfuncs; i++) { + struct kfunc *kfunc = &ctx.kfuncs[i]; + + if (!(kfunc->flags & KF_IMPLICIT_ARGS)) + continue; + + err = process_kfunc_with_implicit_args(&ctx, kfunc); + if (err) + goto out; + } + + err = 0; +out: + free(ctx.decl_tags); + free(ctx.kfuncs); + + return err; +} + /* * Sort types by name in ascending order resulting in all * anonymous types being placed before named types. @@ -1126,6 +1505,9 @@ int main(int argc, const char **argv) if (load_btf(&obj)) goto out; + if (btf2btf(&obj)) + goto out; + if (finalize_btf(&obj)) goto out; From e939f3d16d77a88e5f363394ef73db4c898c4107 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:31 -0800 Subject: [PATCH 131/268] selftests/bpf: Add tests for KF_IMPLICIT_ARGS Add trivial end-to-end tests to validate that KF_IMPLICIT_ARGS flag is properly handled by both resolve_btfids and the verifier. Declare kfuncs in bpf_testmod. Check that bpf_prog_aux pointer is set in the kfunc implementation. Verify that calls with implicit args and a legacy case all work. Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-7-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/kfunc_implicit_args.c | 10 +++++ .../selftests/bpf/progs/kfunc_implicit_args.c | 41 +++++++++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod.c | 26 ++++++++++++ 3 files changed, 77 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c create mode 100644 tools/testing/selftests/bpf/progs/kfunc_implicit_args.c diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c new file mode 100644 index 000000000000..5e4793c9c29a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_implicit_args.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include "kfunc_implicit_args.skel.h" + +void test_kfunc_implicit_args(void) +{ + RUN_TESTS(kfunc_implicit_args); +} diff --git a/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c new file mode 100644 index 000000000000..89b6a47e22dd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_implicit_args.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include "bpf_misc.h" + +extern int bpf_kfunc_implicit_arg(int a) __weak __ksym; +extern int bpf_kfunc_implicit_arg_impl(int a, struct bpf_prog_aux *aux) __weak __ksym; /* illegal */ +extern int bpf_kfunc_implicit_arg_legacy(int a, int b) __weak __ksym; +extern int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux) __weak __ksym; + +char _license[] SEC("license") = "GPL"; + +SEC("syscall") +__retval(5) +int test_kfunc_implicit_arg(void *ctx) +{ + return bpf_kfunc_implicit_arg(5); +} + +SEC("syscall") +__failure __msg("cannot find address for kernel function bpf_kfunc_implicit_arg_impl") +int test_kfunc_implicit_arg_impl_illegal(void *ctx) +{ + return bpf_kfunc_implicit_arg_impl(5, NULL); +} + +SEC("syscall") +__retval(7) +int test_kfunc_implicit_arg_legacy(void *ctx) +{ + return bpf_kfunc_implicit_arg_legacy(3, 4); +} + +SEC("syscall") +__retval(11) +int test_kfunc_implicit_arg_legacy_impl(void *ctx) +{ + return bpf_kfunc_implicit_arg_legacy_impl(5, 6, NULL); +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index bc07ce9d5477..a996b816ecc4 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1142,6 +1142,10 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog); +__bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); +__bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); +__bpf_kfunc int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux); + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1184,6 +1188,9 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1675,6 +1682,25 @@ int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog return ret; } +int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux) +{ + if (aux && a > 0) + return a; + return -EINVAL; +} + +int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux) +{ + if (aux) + return a + b; + return -EINVAL; +} + +int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux) +{ + return bpf_kfunc_implicit_arg_legacy(a, b, aux); +} + static int multi_st_ops_reg(void *kdata, struct bpf_link *link) { struct bpf_testmod_multi_st_ops *st_ops = From b97931a25a4bc74076ffb5c3d1a534c71ade4d55 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:32 -0800 Subject: [PATCH 132/268] bpf: Migrate bpf_wq_set_callback_impl() to KF_IMPLICIT_ARGS Implement bpf_wq_set_callback() with an implicit bpf_prog_aux argument, and remove bpf_wq_set_callback_impl(). Update special kfunc checks in the verifier accordingly. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-8-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 11 +++++------ kernel/bpf/verifier.c | 16 ++++++++-------- tools/testing/selftests/bpf/bpf_experimental.h | 5 ----- .../bpf/progs/verifier_async_cb_context.c | 4 ++-- tools/testing/selftests/bpf/progs/wq_failures.c | 4 ++-- 5 files changed, 17 insertions(+), 23 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9eaa4185e0a7..c76a9003b221 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3120,12 +3120,11 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) return 0; } -__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags, - void *aux__prog) +__bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, + int (callback_fn)(void *map, int *key, void *value), + unsigned int flags, + struct bpf_prog_aux *aux) { - struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog; struct bpf_async_kern *async = (struct bpf_async_kern *)wq; if (flags) @@ -4488,7 +4487,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_memset) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif BTF_ID_FLAGS(func, bpf_wq_init) -BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) +BTF_ID_FLAGS(func, bpf_wq_set_callback, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_wq_start) BTF_ID_FLAGS(func, bpf_preempt_disable) BTF_ID_FLAGS(func, bpf_preempt_enable) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index adc24a2ce5b6..51e8c9f70868 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -520,7 +520,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 btf_id); static bool is_bpf_throw_kfunc(struct bpf_insn *insn); -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id); +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); static bool is_task_work_add_kfunc(u32 func_id); static bool is_sync_callback_calling_function(enum bpf_func_id func_id) @@ -562,7 +562,7 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn /* bpf_wq and bpf_task_work callbacks are always sleepable. */ if (bpf_pseudo_kfunc_call(insn) && insn->off == 0 && - (is_bpf_wq_set_callback_impl_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) + (is_bpf_wq_set_callback_kfunc(insn->imm) || is_task_work_add_kfunc(insn->imm))) return true; verifier_bug(env, "unhandled async callback in is_async_cb_sleepable"); @@ -12437,7 +12437,7 @@ enum special_kfunc_type { KF_bpf_percpu_obj_new_impl, KF_bpf_percpu_obj_drop_impl, KF_bpf_throw, - KF_bpf_wq_set_callback_impl, + KF_bpf_wq_set_callback, KF_bpf_preempt_disable, KF_bpf_preempt_enable, KF_bpf_iter_css_task_new, @@ -12501,7 +12501,7 @@ BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) BTF_ID(func, bpf_percpu_obj_drop_impl) BTF_ID(func, bpf_throw) -BTF_ID(func, bpf_wq_set_callback_impl) +BTF_ID(func, bpf_wq_set_callback) BTF_ID(func, bpf_preempt_disable) BTF_ID(func, bpf_preempt_enable) #ifdef CONFIG_CGROUPS @@ -12994,7 +12994,7 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + return is_bpf_wq_set_callback_kfunc(btf_id) || is_task_work_add_kfunc(btf_id); } @@ -13004,9 +13004,9 @@ static bool is_bpf_throw_kfunc(struct bpf_insn *insn) insn->imm == special_kfunc_list[KF_bpf_throw]; } -static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id) +static bool is_bpf_wq_set_callback_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback]; } static bool is_callback_calling_kfunc(u32 btf_id) @@ -14085,7 +14085,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.r0_rdonly = false; } - if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) { + if (is_bpf_wq_set_callback_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_timer_callback_state); if (err) { diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 2cd9165c7348..68a49b1f77ae 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -580,11 +580,6 @@ extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags__k, void *aux__ign) __ksym; -#define bpf_wq_set_callback(timer, cb, flags) \ - bpf_wq_set_callback_impl(timer, cb, flags, NULL) struct bpf_iter_kmem_cache; extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym; diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c index 7efa9521105e..5d5e1cd4d51d 100644 --- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -96,7 +96,7 @@ int wq_non_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } @@ -114,7 +114,7 @@ int wq_sleepable_prog(void *ctx) if (bpf_wq_init(&val->w, &wq_map, 0) != 0) return 0; - if (bpf_wq_set_callback_impl(&val->w, wq_cb, 0, NULL) != 0) + if (bpf_wq_set_callback(&val->w, wq_cb, 0) != 0) return 0; return 0; } diff --git a/tools/testing/selftests/bpf/progs/wq_failures.c b/tools/testing/selftests/bpf/progs/wq_failures.c index d06f6d40594a..3767f5595bbc 100644 --- a/tools/testing/selftests/bpf/progs/wq_failures.c +++ b/tools/testing/selftests/bpf/progs/wq_failures.c @@ -97,7 +97,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("arg#0 doesn't point to a map value") long test_wrong_wq_pointer(void *ctx) { @@ -123,7 +123,7 @@ __failure /* check that the first argument of bpf_wq_set_callback() * is a correct bpf_wq pointer. */ -__msg(": (85) call bpf_wq_set_callback_impl#") /* anchor message */ +__msg(": (85) call bpf_wq_set_callback#") /* anchor message */ __msg("off 1 doesn't point to 'struct bpf_wq' that is at 0") long test_wrong_wq_pointer_offset(void *ctx) { From 8157cc739ad301b7fb6dfc4cfc5497cedd33df4e Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:33 -0800 Subject: [PATCH 133/268] HID: Use bpf_wq_set_callback kernel function Remove extern declaration of bpf_wq_set_callback_impl() from hid_bpf_helpers.h and replace bpf_wq_set_callback macro with a corresponding new declaration. Tested with: # append tools/testing/selftests/hid/config and build the kernel $ make -C tools/testing/selftests/hid # in built kernel $ ./tools/testing/selftests/hid/hid_bpf -t test_multiply_events_wq TAP version 13 1..1 # Starting 1 tests from 1 test cases. # RUN hid_bpf.test_multiply_events_wq ... [ 2.575520] hid-generic 0003:0001:0A36.0001: hidraw0: USB HID v0.00 Device [test-uhid-device-138] on 138 # OK hid_bpf.test_multiply_events_wq ok 1 hid_bpf.test_multiply_events_wq # PASSED: 1 / 1 tests passed. # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 PASS Acked-by: Benjamin Tissoires Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-9-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- drivers/hid/bpf/progs/hid_bpf_helpers.h | 8 +++----- tools/testing/selftests/hid/progs/hid_bpf_helpers.h | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/hid/bpf/progs/hid_bpf_helpers.h b/drivers/hid/bpf/progs/hid_bpf_helpers.h index bf19785a6b06..228f8d787567 100644 --- a/drivers/hid/bpf/progs/hid_bpf_helpers.h +++ b/drivers/hid/bpf/progs/hid_bpf_helpers.h @@ -33,11 +33,9 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, /* bpf_wq implementation */ extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags__k, void *aux__ign) __ksym; -#define bpf_wq_set_callback(wq, cb, flags) \ - bpf_wq_set_callback_impl(wq, cb, flags, NULL) +extern int bpf_wq_set_callback(struct bpf_wq *wq, + int (*callback_fn)(void *, int *, void *), + unsigned int flags) __weak __ksym; #define HID_MAX_DESCRIPTOR_SIZE 4096 #define HID_IGNORE_EVENT -1 diff --git a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h index 531228b849da..80ab60905865 100644 --- a/tools/testing/selftests/hid/progs/hid_bpf_helpers.h +++ b/tools/testing/selftests/hid/progs/hid_bpf_helpers.h @@ -116,10 +116,8 @@ extern int hid_bpf_try_input_report(struct hid_bpf_ctx *ctx, /* bpf_wq implementation */ extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym; extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym; -extern int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *wq), - unsigned int flags__k, void *aux__ign) __weak __ksym; -#define bpf_wq_set_callback(timer, cb, flags) \ - bpf_wq_set_callback_impl(timer, cb, flags, NULL) +extern int bpf_wq_set_callback(struct bpf_wq *wq, + int (*callback_fn)(void *, int *, void *), + unsigned int flags) __weak __ksym; #endif /* __HID_BPF_HELPERS_H */ From 6e663ffdf7600168338fdfa2fd1eed83395d58a3 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:34 -0800 Subject: [PATCH 134/268] bpf: Migrate bpf_task_work_schedule_* kfuncs to KF_IMPLICIT_ARGS Implement bpf_task_work_schedule_* with an implicit bpf_prog_aux argument, and remove corresponding _impl funcs from the kernel. Update special kfunc checks in the verifier accordingly. Update the selftests to use the new API with implicit argument. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-10-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 +++++++++---------- kernel/bpf/verifier.c | 12 ++++---- .../testing/selftests/bpf/progs/file_reader.c | 2 +- tools/testing/selftests/bpf/progs/task_work.c | 7 ++--- .../selftests/bpf/progs/task_work_fail.c | 8 ++--- .../selftests/bpf/progs/task_work_stress.c | 4 +-- .../bpf/progs/verifier_async_cb_context.c | 4 +-- 7 files changed, 32 insertions(+), 35 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c76a9003b221..f2f974b5fb3b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4274,41 +4274,39 @@ static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work } /** - * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); + return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values * @callback: pointer to BPF subprogram to call - * @aux__prog: user should pass NULL + * @aux: pointer to bpf_prog_aux of the caller BPF program, implicitly set by the verifier * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, - struct bpf_task_work *tw, void *map__map, - bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { - return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); + return bpf_task_work_schedule(task, tw, map__map, callback, aux, TWA_RESUME); } static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, @@ -4536,8 +4534,8 @@ BTF_ID_FLAGS(func, bpf_strncasestr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 51e8c9f70868..8e8570e9d167 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12457,8 +12457,8 @@ enum special_kfunc_type { KF_bpf_dynptr_from_file, KF_bpf_dynptr_file_discard, KF___bpf_trap, - KF_bpf_task_work_schedule_signal_impl, - KF_bpf_task_work_schedule_resume_impl, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, @@ -12534,16 +12534,16 @@ BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, bpf_dynptr_from_file) BTF_ID(func, bpf_dynptr_file_discard) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal_impl) -BTF_ID(func, bpf_task_work_schedule_resume_impl) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) diff --git a/tools/testing/selftests/bpf/progs/file_reader.c b/tools/testing/selftests/bpf/progs/file_reader.c index 4d756b623557..462712ff3b8a 100644 --- a/tools/testing/selftests/bpf/progs/file_reader.c +++ b/tools/testing/selftests/bpf/progs/file_reader.c @@ -77,7 +77,7 @@ int on_open_validate_file_read(void *c) err = 1; return 0; } - bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, task_work_callback, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, task_work_callback); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work.c b/tools/testing/selftests/bpf/progs/task_work.c index 663a80990f8f..a6009d105158 100644 --- a/tools/testing/selftests/bpf/progs/task_work.c +++ b/tools/testing/selftests/bpf/progs/task_work.c @@ -65,8 +65,7 @@ int oncpu_hash_map(struct pt_regs *args) work = bpf_map_lookup_elem(&hmap, &key); if (!work) return 0; - - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -80,7 +79,7 @@ int oncpu_array_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_signal_impl(task, &work->tw, &arrmap, process_work, NULL); + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, process_work); return 0; } @@ -102,6 +101,6 @@ int oncpu_lru_map(struct pt_regs *args) work = bpf_map_lookup_elem(&lrumap, &key); if (!work || work->data[0]) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &lrumap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &lrumap, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_fail.c b/tools/testing/selftests/bpf/progs/task_work_fail.c index 1270953fd092..82e4b8913333 100644 --- a/tools/testing/selftests/bpf/progs/task_work_fail.c +++ b/tools/testing/selftests/bpf/progs/task_work_fail.c @@ -53,7 +53,7 @@ int mismatch_map(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, &hmap, process_work); return 0; } @@ -65,7 +65,7 @@ int no_map_task_work(struct pt_regs *args) struct bpf_task_work tw; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, &tw, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, &tw, &hmap, process_work); return 0; } @@ -76,7 +76,7 @@ int task_work_null(struct pt_regs *args) struct task_struct *task; task = bpf_get_current_task_btf(); - bpf_task_work_schedule_resume_impl(task, NULL, &hmap, process_work, NULL); + bpf_task_work_schedule_resume(task, NULL, &hmap, process_work); return 0; } @@ -91,6 +91,6 @@ int map_null(struct pt_regs *args) work = bpf_map_lookup_elem(&arrmap, &key); if (!work) return 0; - bpf_task_work_schedule_resume_impl(task, &work->tw, NULL, process_work, NULL); + bpf_task_work_schedule_resume(task, &work->tw, NULL, process_work); return 0; } diff --git a/tools/testing/selftests/bpf/progs/task_work_stress.c b/tools/testing/selftests/bpf/progs/task_work_stress.c index 55e555f7f41b..1d4378f351ef 100644 --- a/tools/testing/selftests/bpf/progs/task_work_stress.c +++ b/tools/testing/selftests/bpf/progs/task_work_stress.c @@ -51,8 +51,8 @@ int schedule_task_work(void *ctx) if (!work) return 0; } - err = bpf_task_work_schedule_signal_impl(bpf_get_current_task_btf(), &work->tw, &hmap, - process_work, NULL); + err = bpf_task_work_schedule_signal(bpf_get_current_task_btf(), &work->tw, &hmap, + process_work); if (err) __sync_fetch_and_add(&schedule_error, 1); else diff --git a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c index 5d5e1cd4d51d..39aff82549c9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c +++ b/tools/testing/selftests/bpf/progs/verifier_async_cb_context.c @@ -156,7 +156,7 @@ int task_work_non_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } @@ -176,6 +176,6 @@ int task_work_sleepable_prog(void *ctx) if (!task) return 0; - bpf_task_work_schedule_resume_impl(task, &val->tw, &task_work_map, task_work_cb, NULL); + bpf_task_work_schedule_resume(task, &val->tw, &task_work_map, task_work_cb); return 0; } From d806f3101276a1ed18d963944580e1ee1c7a3d26 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:35 -0800 Subject: [PATCH 135/268] bpf: Migrate bpf_stream_vprintk() to KF_IMPLICIT_ARGS Implement bpf_stream_vprintk with an implicit bpf_prog_aux argument, and remote bpf_stream_vprintk_impl from the kernel. Update the selftests to use the new API with implicit argument. bpf_stream_vprintk macro is changed to use the new bpf_stream_vprintk kfunc, and the extern definition of bpf_stream_vprintk_impl is replaced accordingly. Reviewed-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-11-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- kernel/bpf/stream.c | 5 ++--- tools/lib/bpf/bpf_helpers.h | 6 +++--- tools/testing/selftests/bpf/progs/stream_fail.c | 6 +++--- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f2f974b5fb3b..f8aa1320e2f7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4533,7 +4533,7 @@ BTF_ID_FLAGS(func, bpf_strncasestr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk_impl) +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index 0b6bc3f30335..24730df55e69 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -212,14 +212,13 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, struct bpf_prog_aux *aux) { struct bpf_bprintf_data data = { .get_bin_args = true, .get_buf = true, }; - struct bpf_prog_aux *aux = aux__prog; u32 fmt_size = strlen(fmt__str) + 1; struct bpf_stream *stream; u32 data_len = len__sz; diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index d4e4e388e625..c145da05a67c 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -315,8 +315,8 @@ enum libbpf_tristate { ___param, sizeof(___param)); \ }) -extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, - __u32 len__sz, void *aux__prog) __weak __ksym; +extern int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, + __u32 len__sz) __weak __ksym; #define bpf_stream_printk(stream_id, fmt, args...) \ ({ \ @@ -328,7 +328,7 @@ extern int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const vo ___bpf_fill(___param, args); \ _Pragma("GCC diagnostic pop") \ \ - bpf_stream_vprintk_impl(stream_id, ___fmt, ___param, sizeof(___param), NULL); \ + bpf_stream_vprintk(stream_id, ___fmt, ___param, sizeof(___param)); \ }) /* Use __bpf_printk when bpf_printk call has 3 or fewer fmt args diff --git a/tools/testing/selftests/bpf/progs/stream_fail.c b/tools/testing/selftests/bpf/progs/stream_fail.c index 3662515f0107..8e8249f3521c 100644 --- a/tools/testing/selftests/bpf/progs/stream_fail.c +++ b/tools/testing/selftests/bpf/progs/stream_fail.c @@ -10,7 +10,7 @@ SEC("syscall") __failure __msg("Possibly NULL pointer passed") int stream_vprintk_null_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", NULL, 0); return 0; } @@ -18,7 +18,7 @@ SEC("syscall") __failure __msg("R3 type=scalar expected=") int stream_vprintk_scalar_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, "", (void *)46, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, "", (void *)46, 0); return 0; } @@ -26,7 +26,7 @@ SEC("syscall") __failure __msg("arg#1 doesn't point to a const string") int stream_vprintk_string_arg(void *ctx) { - bpf_stream_vprintk_impl(BPF_STDOUT, ctx, NULL, 0, NULL); + bpf_stream_vprintk(BPF_STDOUT, ctx, NULL, 0); return 0; } From bd06b977e02d80fe0dec303cc219d007121a4526 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:36 -0800 Subject: [PATCH 136/268] selftests/bpf: Migrate struct_ops_assoc test to KF_IMPLICIT_ARGS A test kfunc named bpf_kfunc_multi_st_ops_test_1_impl() is a user of __prog suffix. Subsequent patch removes __prog support in favor of KF_IMPLICIT_ARGS, so migrate this kfunc to use implicit argument. Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-12-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/struct_ops_assoc.c | 8 ++++---- .../selftests/bpf/progs/struct_ops_assoc_in_timer.c | 4 ++-- .../testing/selftests/bpf/progs/struct_ops_assoc_reuse.c | 6 +++--- tools/testing/selftests/bpf/test_kmods/bpf_testmod.c | 9 ++++----- .../testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h | 6 ++++-- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c index 8f1097903e22..68842e3f936b 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_assoc.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc.c @@ -32,7 +32,7 @@ int BPF_PROG(sys_enter_prog_a, struct pt_regs *regs, long id) if (!test_pid || task->pid != test_pid) return 0; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_a++; @@ -45,7 +45,7 @@ int syscall_prog_a(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_a++; @@ -79,7 +79,7 @@ int BPF_PROG(sys_enter_prog_b, struct pt_regs *regs, long id) if (!test_pid || task->pid != test_pid) return 0; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_B_MAGIC) test_err_b++; @@ -92,7 +92,7 @@ int syscall_prog_b(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_B_MAGIC) test_err_b++; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c index d5a2ea934284..0bed49e9f217 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_in_timer.c @@ -31,7 +31,7 @@ __noinline static int timer_cb(void *map, int *key, struct bpf_timer *timer) struct st_ops_args args = {}; recur++; - timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + timer_test_1_ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); recur--; timer_cb_run++; @@ -64,7 +64,7 @@ int syscall_prog(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_MAGIC) test_err++; diff --git a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c index 5bb6ebf5eed4..396b3e58c729 100644 --- a/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c +++ b/tools/testing/selftests/bpf/progs/struct_ops_assoc_reuse.c @@ -23,7 +23,7 @@ int BPF_PROG(test_1_a, struct st_ops_args *args) if (!recur) { recur++; - ret = bpf_kfunc_multi_st_ops_test_1_impl(args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(args); if (ret != -1) test_err_a++; recur--; @@ -40,7 +40,7 @@ int syscall_prog_a(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_a++; @@ -62,7 +62,7 @@ int syscall_prog_b(void *ctx) struct st_ops_args args = {}; int ret; - ret = bpf_kfunc_multi_st_ops_test_1_impl(&args, NULL); + ret = bpf_kfunc_multi_st_ops_test_1_assoc(&args); if (ret != MAP_A_MAGIC) test_err_b++; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index a996b816ecc4..0d542ba64365 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1140,7 +1140,7 @@ __bpf_kfunc int bpf_kfunc_st_ops_inc10(struct st_ops_args *args) } __bpf_kfunc int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id); -__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux_prog); +__bpf_kfunc int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); @@ -1187,7 +1187,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_epilogue, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_test_pro_epilogue, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_kfunc_st_ops_inc10) BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1) -BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_impl) +BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_assoc, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) @@ -1669,13 +1669,12 @@ int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) } /* Call test_1() of the associated struct_ops map */ -int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) +int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args, struct bpf_prog_aux *aux) { - struct bpf_prog_aux *prog_aux = (struct bpf_prog_aux *)aux__prog; struct bpf_testmod_multi_st_ops *st_ops; int ret = -1; - st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(prog_aux); + st_ops = (struct bpf_testmod_multi_st_ops *)bpf_prog_get_assoc_struct_ops(aux); if (st_ops) ret = st_ops->test_1(args); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 2357a0340ffe..225ea30c4e3d 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -161,7 +161,9 @@ void bpf_kfunc_rcu_task_test(struct task_struct *ptr) __ksym; struct task_struct *bpf_kfunc_ret_rcu_test(void) __ksym; int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) __ksym; -int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __ksym; -int bpf_kfunc_multi_st_ops_test_1_impl(struct st_ops_args *args, void *aux__prog) __ksym; +#ifndef __KERNEL__ +extern int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __weak __ksym; +extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak __ksym; +#endif #endif /* _BPF_TESTMOD_KFUNC_H */ From aed57a36387135bcb73f01bac3d0a286a657b006 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:26:37 -0800 Subject: [PATCH 137/268] bpf: Remove __prog kfunc arg annotation Now that all the __prog suffix users in the kernel tree migrated to KF_IMPLICIT_ARGS, remove it from the verifier. See prior discussion for context [1]. [1] https://lore.kernel.org/bpf/CAEf4BzbgPfRm9BX=TsZm-TsHFAHcwhPY4vTt=9OT-uhWqf8tqw@mail.gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120222638.3976562-13-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e8570e9d167..919556614505 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12211,13 +12211,6 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__irq_flag"); } -static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param *arg); - -static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg) -{ - return btf_param_match_suffix(btf, arg, "__prog") || is_kfunc_arg_prog_aux(btf, arg); -} - static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -13280,8 +13273,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_ignore(btf, &args[i])) continue; - if (is_kfunc_arg_prog(btf, &args[i])) { - /* Used to reject repeated use of __prog. */ + if (is_kfunc_arg_prog_aux(btf, &args[i])) { + /* Reject repeated use bpf_prog_aux */ if (meta->arg_prog) { verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; From 74bc4f6127207624ec06f0d0984b280a390992aa Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Tue, 20 Jan 2026 14:30:26 -0800 Subject: [PATCH 138/268] bpf,docs: Document KF_IMPLICIT_ARGS flag Add a section explaining KF_IMPLICIT_ARGS kfunc flag. Remove __prog arg annotation, as it is no longer supported. Signed-off-by: Ihor Solodrai Link: https://lore.kernel.org/r/20260120223027.3981805-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 49 +++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 3eb59a8f9f34..75e6c078e0e7 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -232,23 +232,6 @@ Or:: ... } -2.3.6 __prog Annotation ---------------------------- -This annotation is used to indicate that the argument needs to be fixed up to -the bpf_prog_aux of the caller BPF program. Any value passed into this argument -is ignored, and rewritten by the verifier. - -An example is given below:: - - __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq, - int (callback_fn)(void *map, int *key, void *value), - unsigned int flags, - void *aux__prog) - { - struct bpf_prog_aux *aux = aux__prog; - ... - } - .. _BPF_kfunc_nodef: 2.4 Using an existing kernel function @@ -381,6 +364,38 @@ encouraged to make their use-cases known as early as possible, and participate in upstream discussions regarding whether to keep, change, deprecate, or remove those kfuncs if and when such discussions occur. +2.5.9 KF_IMPLICIT_ARGS flag +------------------------------------ + +The KF_IMPLICIT_ARGS flag is used to indicate that the BPF signature +of the kfunc is different from it's kernel signature, and the values +for implicit arguments are provided at load time by the verifier. + +Only arguments of specific types are implicit. +Currently only ``struct bpf_prog_aux *`` type is supported. + +A kfunc with KF_IMPLICIT_ARGS flag therefore has two types in BTF: one +function matching the kernel declaration (with _impl suffix in the +name by convention), and another matching the intended BPF API. + +Verifier only allows calls to the non-_impl version of a kfunc, that +uses a signature without the implicit arguments. + +Example declaration: + +.. code-block:: c + + __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + struct bpf_prog_aux *aux) { ... } + +Example usage in BPF program: + +.. code-block:: c + + /* note that the last argument is omitted */ + bpf_task_work_schedule_signal(task, &work->tw, &arrmap, task_work_callback); + 2.6 Registering the kfuncs -------------------------- From 44fdd581d27366092e162b42f025d75d5a16c851 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Mon, 19 Jan 2026 16:54:57 +0800 Subject: [PATCH 139/268] bpf: Add range tracking for BPF_DIV and BPF_MOD This patch implements range tracking (interval analysis) for BPF_DIV and BPF_MOD operations when the divisor is a constant, covering both signed and unsigned variants. While LLVM typically optimizes integer division and modulo by constants into multiplication and shift sequences, this optimization is less effective for the BPF target when dealing with 64-bit arithmetic. Currently, the verifier does not track bounds for scalar division or modulo, treating the result as "unbounded". This leads to false positive rejections for safe code patterns. For example, the following code (compiled with -O2): ```c int test(struct pt_regs *ctx) { char buffer[6] = {1}; __u64 x = bpf_ktime_get_ns(); __u64 res = x % sizeof(buffer); char value = buffer[res]; bpf_printk("res = %llu, val = %d", res, value); return 0; } ``` Generates a raw `BPF_MOD64` instruction: ```asm ; __u64 res = x % sizeof(buffer); 1: 97 00 00 00 06 00 00 00 r0 %= 0x6 ; char value = buffer[res]; 2: 18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll 4: 0f 01 00 00 00 00 00 00 r1 += r0 5: 91 14 00 00 00 00 00 00 r4 = *(s8 *)(r1 + 0x0) ``` Without this patch, the verifier fails with "math between map_value pointer and register with unbounded min value is not allowed" because it cannot deduce that `r0` is within [0, 5]. According to the BPF instruction set[1], the instruction's offset field (`insn->off`) is used to distinguish between signed (`off == 1`) and unsigned division (`off == 0`). Moreover, we also follow the BPF division and modulo runtime behavior (semantics) to handle special cases, such as division by zero and signed division overflow. - UDIV: dst = (src != 0) ? (dst / src) : 0 - SDIV: dst = (src == 0) ? 0 : ((src == -1 && dst == LLONG_MIN) ? LLONG_MIN : (dst / src)) - UMOD: dst = (src != 0) ? (dst % src) : dst - SMOD: dst = (src == 0) ? dst : ((src == -1 && dst == LLONG_MIN) ? 0: (dst s% src)) Here is the overview of the changes made in this patch (See the code comments for more details and examples): 1. For BPF_DIV: Firstly check whether the divisor is zero. If so, set the destination register to zero (matching runtime behavior). For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)div` functions. - General cases: compute the new range by dividing max_dividend and min_dividend by the constant divisor. - Overflow case (SIGNED_MIN / -1) in signed division: mark the result as unbounded if the dividend is not a single number. 2. For BPF_MOD: Firstly check whether the divisor is zero. If so, leave the destination register unchanged (matching runtime behavior). For non-zero constant divisors: goto `scalar(32)?_min_max_(u|s)mod` functions. - General case: For signed modulo, the result's sign matches the dividend's sign. And the result's absolute value is strictly bounded by `min(abs(dividend), abs(divisor) - 1)`. - Special care is taken when the divisor is SIGNED_MIN. By casting to unsigned before negation and subtracting 1, we avoid signed overflow and correctly calculate the maximum possible magnitude (`res_max_abs` in the code). - "Small dividend" case: If the dividend is already within the possible result range (e.g., [-2, 5] % 10), the operation is an identity function, and the destination register remains unchanged. 3. In `scalar(32)?_min_max_(u|s)(div|mod)` functions: After updating current range, reset other ranges and tnum to unbounded/unknown. e.g., in `scalar_min_max_sdiv`, signed 64-bit range is updated. Then reset unsigned 64-bit range and 32-bit range to unbounded, and tnum to unknown. Exception: in BPF_MOD's "small dividend" case, since the result remains unchanged, we do not reset other ranges/tnum. 4. Also updated existing selftests based on the expected BPF_DIV and BPF_MOD behavior. [1] https://www.kernel.org/doc/Documentation/bpf/standardization/instruction-set.rst Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Signed-off-by: Yazhou Tang Tested-by: syzbot@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20260119085458.182221-2-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 299 ++++++++++++++++++ .../bpf/progs/verifier_value_illegal_alu.c | 7 +- .../testing/selftests/bpf/verifier/precise.c | 4 +- 3 files changed, 305 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 919556614505..f11dc5366e5b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2349,6 +2349,18 @@ static void __mark_reg32_unbounded(struct bpf_reg_state *reg) reg->u32_max_value = U32_MAX; } +static void reset_reg64_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg64_unbounded(reg); + reg->var_off = tnum_unknown; +} + +static void reset_reg32_and_tnum(struct bpf_reg_state *reg) +{ + __mark_reg32_unbounded(reg); + reg->var_off = tnum_unknown; +} + static void __update_reg32_bounds(struct bpf_reg_state *reg) { struct tnum var32_off = tnum_subreg(reg->var_off); @@ -15159,6 +15171,252 @@ static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, } } +static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + + *dst_umin = *dst_umin / src_val; + *dst_umax = *dst_umax / src_val; + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + + *dst_umin = div64_u64(*dst_umin, src_val); + *dst_umax = div64_u64(*dst_umax, src_val); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 res1, res2; + + /* BPF div specification: S32_MIN / -1 = S32_MIN */ + if (*dst_smin == S32_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S32_MIN, + * we cannot precisely track the result, so it becomes unbounded. + * e.g., [S32_MIN, S32_MIN+10]/(-1), + * = {S32_MIN} U [-(S32_MIN+10), -(S32_MIN+1)] + * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] + * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. + */ + if (*dst_smax != S32_MIN) { + *dst_smin = S32_MIN; + *dst_smax = S32_MAX; + } + goto reset; + } + + res1 = *dst_smin / src_val; + res2 = *dst_smax / src_val; + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 res1, res2; + + /* BPF div specification: S64_MIN / -1 = S64_MIN */ + if (*dst_smin == S64_MIN && src_val == -1) { + /* + * If the dividend range contains more than just S64_MIN, + * we cannot precisely track the result, so it becomes unbounded. + * e.g., [S64_MIN, S64_MIN+10]/(-1), + * = {S64_MIN} U [-(S64_MIN+10), -(S64_MIN+1)] + * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] + * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. + */ + if (*dst_smax != S64_MIN) { + *dst_smin = S64_MIN; + *dst_smax = S64_MAX; + } + goto reset; + } + + res1 = div64_s64(*dst_smin, src_val); + res2 = div64_s64(*dst_smax, src_val); + *dst_smin = min(res1, res2); + *dst_smax = max(res1, res2); + +reset: + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 res_max = src_val - 1; + + /* + * If dst_umax <= res_max, the result remains unchanged. + * e.g., [2, 5] % 10 = [2, 5]. + */ + if (*dst_umax <= res_max) + return; + + *dst_umin = 0; + *dst_umax = min(*dst_umax, res_max); + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + reset_reg32_and_tnum(dst_reg); +} + +static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S32_MIN (-2147483648), src_abs becomes 2147483648. + * Here use unsigned integer to avoid overflow. + */ + u32 src_abs = (src_val > 0) ? (u32)src_val : -(u32)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2147483648 (S32_MIN), subtracting 1 gives + * 2147483647 (S32_MAX), which fits perfectly in s32. + */ + s32 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->u32_min_value = 0; + dst_reg->u32_max_value = U32_MAX; + reset_reg64_and_tnum(dst_reg); +} + +static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + + /* + * Safe absolute value calculation: + * If src_val == S64_MIN (-2^63), src_abs becomes 2^63. + * Here use unsigned integer to avoid overflow. + */ + u64 src_abs = (src_val > 0) ? (u64)src_val : -(u64)src_val; + + /* + * Calculate the maximum possible absolute value of the result. + * Even if src_abs is 2^63 (S64_MIN), subtracting 1 gives + * 2^63 - 1 (S64_MAX), which fits perfectly in s64. + */ + s64 res_max_abs = src_abs - 1; + + /* + * If the dividend is already within the result range, + * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. + */ + if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + return; + + /* General case: result has the same sign as the dividend. */ + if (*dst_smin >= 0) { + *dst_smin = 0; + *dst_smax = min(*dst_smax, res_max_abs); + } else if (*dst_smax <= 0) { + *dst_smax = 0; + *dst_smin = max(*dst_smin, -res_max_abs); + } else { + *dst_smin = -res_max_abs; + *dst_smax = res_max_abs; + } + + /* Reset other ranges/tnum to unbounded/unknown. */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + reset_reg32_and_tnum(dst_reg); +} + static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { @@ -15564,6 +15822,14 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_MUL: return true; + /* + * Division and modulo operators range is only safe to compute when the + * divisor is a constant. + */ + case BPF_DIV: + case BPF_MOD: + return src_is_const; + /* Shift operators range is only computable if shift dimension operand * is a constant. Shifts greater than 31 or 63 are undefined. This * includes shifts by a negative number. @@ -15616,6 +15882,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state src_reg) { u8 opcode = BPF_OP(insn->code); + s16 off = insn->off; bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64); int ret; @@ -15667,6 +15934,38 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar32_min_max_mul(dst_reg, &src_reg); scalar_min_max_mul(dst_reg, &src_reg); break; + case BPF_DIV: + /* BPF div specification: x / 0 = 0 */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + ___mark_reg_known(dst_reg, 0); + break; + } + if (alu32) + if (off == 1) + scalar32_min_max_sdiv(dst_reg, &src_reg); + else + scalar32_min_max_udiv(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_sdiv(dst_reg, &src_reg); + else + scalar_min_max_udiv(dst_reg, &src_reg); + break; + case BPF_MOD: + /* BPF mod specification: x % 0 = x */ + if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + break; + if (alu32) + if (off == 1) + scalar32_min_max_smod(dst_reg, &src_reg); + else + scalar32_min_max_umod(dst_reg, &src_reg); + else + if (off == 1) + scalar_min_max_smod(dst_reg, &src_reg); + else + scalar_min_max_umod(dst_reg, &src_reg); + break; case BPF_AND: if (tnum_is_const(src_reg.var_off)) { ret = maybe_fork_scalars(env, insn, dst_reg); diff --git a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c index 2129e4353fd9..4d8273c258d5 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_illegal_alu.c @@ -173,14 +173,15 @@ __naked void flow_keys_illegal_variable_offset_alu(void) asm volatile(" \ r6 = r1; \ r7 = *(u64*)(r6 + %[flow_keys_off]); \ - r8 = 8; \ - r8 /= 1; \ + call %[bpf_get_prandom_u32]; \ + r8 = r0; \ r8 &= 8; \ r7 += r8; \ r0 = *(u64*)(r7 + 0); \ exit; \ " : - : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)) + : __imm_const(flow_keys_off, offsetof(struct __sk_buff, flow_keys)), + __imm(bpf_get_prandom_u32) : __clobber_all); } diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c index 59a020c35647..061d98f6e9bb 100644 --- a/tools/testing/selftests/bpf/verifier/precise.c +++ b/tools/testing/selftests/bpf/verifier/precise.c @@ -229,11 +229,11 @@ { "precise: program doesn't prematurely prune branches", .insns = { - BPF_ALU64_IMM(BPF_MOV, BPF_REG_6, 0x400), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_8, 0), BPF_ALU64_IMM(BPF_MOV, BPF_REG_9, 0x80000000), - BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 0x401), BPF_JMP_IMM(BPF_JA, 0, 0, 0), BPF_JMP_REG(BPF_JLE, BPF_REG_6, BPF_REG_9, 2), BPF_ALU64_IMM(BPF_MOD, BPF_REG_6, 1), From c9e440bf25a712d906c40ba3ef831f3f0ccc6a1b Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Mon, 19 Jan 2026 16:54:58 +0800 Subject: [PATCH 140/268] selftests/bpf: Add tests for BPF_DIV and BPF_MOD range tracking Now BPF_DIV has range tracking support via interval analysis. This patch adds selftests to cover various cases of BPF_DIV and BPF_MOD operations when the divisor is a constant, also covering both signed and unsigned variants. This patch includes several types of tests in 32-bit and 64-bit variants: 1. For UDIV - positive divisor - zero divisor 2. For SDIV - positive divisor, positive dividend - positive divisor, negative dividend - positive divisor, mixed sign dividend - negative divisor, positive dividend - negative divisor, negative dividend - negative divisor, mixed sign dividend - zero divisor - overflow (SIGNED_MIN/-1), normal dividend - overflow (SIGNED_MIN/-1), constant dividend 3. For UMOD - positive divisor - positive divisor, small dividend - zero divisor 4. For SMOD - positive divisor, positive dividend - positive divisor, negative dividend - positive divisor, mixed sign dividend - positive divisor, mixed sign dividend, small dividend - negative divisor, positive dividend - negative divisor, negative dividend - negative divisor, mixed sign dividend - negative divisor, mixed sign dividend, small dividend - zero divisor - overflow (SIGNED_MIN/-1), normal dividend - overflow (SIGNED_MIN/-1), constant dividend Specifically, these selftests are based on dead code elimination: If the BPF verifier can precisely analyze the result of BPF_DIV/BPF_MOD instruction, it can prune the path that leads to an error (here we use invalid memory access as the error case), allowing the program to pass verification. Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Signed-off-by: Yazhou Tang Link: https://lore.kernel.org/r/20260119085458.182221-3-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../bpf/progs/verifier_div_mod_bounds.c | 1149 +++++++++++++++++ 2 files changed, 1151 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 38c5ba70100c..fa9e506cc36f 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -33,6 +33,7 @@ #include "verifier_direct_packet_access.skel.h" #include "verifier_direct_stack_access_wraparound.skel.h" #include "verifier_div0.skel.h" +#include "verifier_div_mod_bounds.skel.h" #include "verifier_div_overflow.skel.h" #include "verifier_global_subprogs.skel.h" #include "verifier_global_ptr_args.skel.h" @@ -175,6 +176,7 @@ void test_verifier_d_path(void) { RUN(verifier_d_path); } void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_access); } void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); } void test_verifier_div0(void) { RUN(verifier_div0); } +void test_verifier_div_mod_bounds(void) { RUN(verifier_div_mod_bounds); } void test_verifier_div_overflow(void) { RUN(verifier_div_overflow); } void test_verifier_global_subprogs(void) { RUN(verifier_global_subprogs); } void test_verifier_global_ptr_args(void) { RUN(verifier_global_ptr_args); } diff --git a/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c new file mode 100644 index 000000000000..4672af0b3268 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_div_mod_bounds.c @@ -0,0 +1,1149 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include "bpf_misc.h" + +/* This file contains unit tests for signed/unsigned division and modulo + * operations (with divisor as a constant), focusing on verifying whether + * BPF verifier's range tracking module soundly and precisely computes + * the results. + */ + +SEC("socket") +__description("UDIV32, positive divisor") +__success __retval(0) __log_level(2) +__msg("w1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))") +__naked void udiv32_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 /= 3; \ + if w1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 /= w2 {{.*}}; R1=0 R2=0") +__naked void udiv32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 /= w2; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV64, positive divisor") +__success __retval(0) __log_level(2) +__msg("r1 /= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=3,var_off=(0x0; 0x3))") +__naked void udiv64_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 /= 3; \ + if r1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UDIV64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 /= r2 {{.*}}; R1=0 R2=0") +__naked void udiv64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 /= r2; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv32_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< 2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))") +__naked void sdiv32_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=3,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=umin32=0xfffffffd,smax=umax=umax32=0xfffffffe,smin32=-3,smax32=-2,var_off=(0xfffffffc; 0x3))") +__naked void sdiv32_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv32_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< 2 goto l1_%=; \ + if w1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-3,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s/= -3; \ + if w1 s< -3 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 s/= w2 {{.*}}; R1=0 R2=0") +__naked void sdiv32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 s/= w2; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, overflow (S32_MIN/-1)") +__success __retval(0) __log_level(2) +__msg("w1 s/= -1 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__naked void sdiv32_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w2 = %[int_min]; \ + w2 += 10; \ + if w1 s> w2 goto l0_%=; \ + w1 s/= -1; \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm_const(int_min, INT_MIN), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV32, overflow (S32_MIN/-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("w1 s/= -1 {{.*}}; R1=0x80000000") +__naked void sdiv32_overflow_2(void) +{ + asm volatile (" \ + w1 = %[int_min]; \ + w1 s/= -1; \ + if w1 != %[int_min] goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv64_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< 2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))") +__naked void sdiv64_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=3)") +__naked void sdiv64_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=-2,umin=0xfffffffffffffffd,umax=0xfffffffffffffffe,umin32=0xfffffffd,umax32=0xfffffffe,var_off=(0xfffffffffffffffc; 0x3))") +__naked void sdiv64_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> -2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=3,var_off=(0x2; 0x1))") +__naked void sdiv64_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< 2 goto l1_%=; \ + if r1 s> 3 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -3 {{.*}}; R1=scalar(smin=smin32=-3,smax=smax32=2)") +__naked void sdiv64_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s/= -3; \ + if r1 s< -3 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 s/= r2 {{.*}}; R1=0 R2=0") +__naked void sdiv64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 s/= r2; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, overflow (S64_MIN/-1)") +__success __retval(0) __log_level(2) +__msg("r1 s/= -1 {{.*}}; R1=scalar()") +__naked void sdiv64_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_ktime_get_ns]; \ + r1 = r0; \ + r2 = %[llong_min] ll; \ + r2 += 10; \ + if r1 s> r2 goto l0_%=; \ + r1 s/= -1; \ +l0_%=: r0 = 0; \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN), + __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__description("SDIV64, overflow (S64_MIN/-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("r1 s/= -1 {{.*}}; R1=0x8000000000000000") +__naked void sdiv64_overflow_2(void) +{ + asm volatile (" \ + r1 = %[llong_min] ll; \ + r1 s/= -1; \ + r2 = %[llong_min] ll; \ + if r1 != r2 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, positive divisor") +__success __retval(0) __log_level(2) +__msg("w1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void umod32_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 %%= 3; \ + if w1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))") +__naked void umod32_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w1 %%= 10; \ + if w1 < 1 goto l0_%=; \ + if w1 > 9 goto l0_%=; \ + if w1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 %= w2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0") +__naked void umod32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w1 &= 8; \ + w1 |= 1; \ + w2 = 0; \ + w1 %%= w2; \ + if w1 < 1 goto l0_%=; \ + if w1 > 9 goto l0_%=; \ + if w1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, positive divisor") +__success __retval(0) __log_level(2) +__msg("r1 %= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void umod64_pos_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 %%= 3; \ + if r1 > 3 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 %= 10 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8))") +__naked void umod64_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r1 %%= 10; \ + if r1 < 1 goto l0_%=; \ + if r1 > 9 goto l0_%=; \ + if r1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("UMOD64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 %= r2 {{.*}}; R1=scalar(smin=umin=smin32=umin32=1,smax=umax=smax32=umax32=9,var_off=(0x1; 0x8)) R2=0") +__naked void umod64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + r1 &= 8; \ + r1 |= 1; \ + r2 = 0; \ + r1 %%= r2; \ + if r1 < 1 goto l0_%=; \ + if r1 > 9 goto l0_%=; \ + if r1 & 1 != 1 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod32_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< 0 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= 11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))") +__naked void smod32_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= 11; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod32_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< 8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< 0 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=0,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s> -8 goto l0_%=; \ + if w1 s< -10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -3 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-2,smax32=2,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -3; \ + if w1 s< -2 goto l1_%=; \ + if w1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, negative divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -11 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff))") +__naked void smod32_neg_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w1 s%%= -11; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, zero divisor") +__success __retval(0) __log_level(2) +__msg("w1 s%= w2 {{.*}}; R1=scalar(smin=0,smax=umax=0xffffffff,smin32=-8,smax32=10,var_off=(0x0; 0xffffffff)) R2=0") +__naked void smod32_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + if w1 s< -8 goto l0_%=; \ + if w1 s> 10 goto l0_%=; \ + w2 = 0; \ + w1 s%%= w2; \ + if w1 s< -8 goto l1_%=; \ + if w1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, overflow (S32_MIN%-1)") +__success __retval(0) __log_level(2) +__msg("w1 s%= -1 {{.*}}; R1=0") +__naked void smod32_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w1 = w0; \ + w2 = %[int_min]; \ + w2 += 10; \ + if w1 s> w2 goto l0_%=; \ + w1 s%%= -1; \ + if w1 != 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN), + __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD32, overflow (S32_MIN%-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("w1 s%= -1 {{.*}}; R1=0") +__naked void smod32_overflow_2(void) +{ + asm volatile (" \ + w1 = %[int_min]; \ + w1 s%%= -1; \ + if w1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(int_min, INT_MIN) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod64_pos_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< 0 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)") +__naked void smod64_pos_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)") +__naked void smod64_pos_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, positive divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= 11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)") +__naked void smod64_pos_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= 11; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, positive dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=0,smax=umax=smax32=umax32=2,var_off=(0x0; 0x3))") +__naked void smod64_neg_divisor_1(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< 8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< 0 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, negative dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=0)") +__naked void smod64_neg_divisor_2(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s> -8 goto l0_%=; \ + if r1 s< -10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, mixed sign dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -3 {{.*}}; R1=scalar(smin=smin32=-2,smax=smax32=2)") +__naked void smod64_neg_divisor_3(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -3; \ + if r1 s< -2 goto l1_%=; \ + if r1 s> 2 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, negative divisor, small dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -11 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10)") +__naked void smod64_neg_divisor_unchanged(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r1 s%%= -11; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, zero divisor") +__success __retval(0) __log_level(2) +__msg("r1 s%= r2 {{.*}}; R1=scalar(smin=smin32=-8,smax=smax32=10) R2=0") +__naked void smod64_zero_divisor(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + if r1 s< -8 goto l0_%=; \ + if r1 s> 10 goto l0_%=; \ + r2 = 0; \ + r1 s%%= r2; \ + if r1 s< -8 goto l1_%=; \ + if r1 s> 10 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, overflow (S64_MIN%-1)") +__success __retval(0) __log_level(2) +__msg("r1 s%= -1 {{.*}}; R1=0") +__naked void smod64_overflow_1(void) +{ + asm volatile (" \ + call %[bpf_ktime_get_ns]; \ + r1 = r0; \ + r2 = %[llong_min] ll; \ + r2 += 10; \ + if r1 s> r2 goto l0_%=; \ + r1 s%%= -1; \ + if r1 != 0 goto l1_%=; \ +l0_%=: r0 = 0; \ + exit; \ +l1_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN), + __imm(bpf_ktime_get_ns) + : __clobber_all); +} + +SEC("socket") +__description("SMOD64, overflow (S64_MIN%-1), constant dividend") +__success __retval(0) __log_level(2) +__msg("r1 s%= -1 {{.*}}; R1=0") +__naked void smod64_overflow_2(void) +{ + asm volatile (" \ + r1 = %[llong_min] ll; \ + r1 s%%= -1; \ + if r1 != 0 goto l0_%=; \ + r0 = 0; \ + exit; \ +l0_%=: r0 = *(u64 *)(r1 + 0); \ + exit; \ +" : + : __imm_const(llong_min, LLONG_MIN) + : __clobber_all); +} From 802eef5afb1865bc5536a5302c068ba2215a1f72 Mon Sep 17 00:00:00 2001 From: Zesen Liu Date: Tue, 20 Jan 2026 16:28:46 +0800 Subject: [PATCH 141/268] bpf: Fix memory access flags in helper prototypes After commit 37cce22dbd51 ("bpf: verifier: Refactor helper access type tracking"), the verifier started relying on the access type flags in helper function prototypes to perform memory access optimizations. Currently, several helper functions utilizing ARG_PTR_TO_MEM lack the corresponding MEM_RDONLY or MEM_WRITE flags. This omission causes the verifier to incorrectly assume that the buffer contents are unchanged across the helper call. Consequently, the verifier may optimize away subsequent reads based on this wrong assumption, leading to correctness issues. For bpf_get_stack_proto_raw_tp, the original MEM_RDONLY was incorrect since the helper writes to the buffer. Change it to ARG_PTR_TO_UNINIT_MEM which correctly indicates write access to potentially uninitialized memory. Similar issues were recently addressed for specific helpers in commit ac44dcc788b9 ("bpf: Fix verifier assumptions of bpf_d_path's output buffer") and commit 2eb7648558a7 ("bpf: Specify access type of bpf_sysctl_get_name args"). Fix these prototypes by adding the correct memory access flags. Fixes: 37cce22dbd51 ("bpf: verifier: Refactor helper access type tracking") Co-developed-by: Shuran Liu Signed-off-by: Shuran Liu Co-developed-by: Peili Gao Signed-off-by: Peili Gao Co-developed-by: Haoran Ni Signed-off-by: Haoran Ni Signed-off-by: Zesen Liu Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260120-helper_proto-v3-1-27b0180b4e77@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- kernel/bpf/syscall.c | 2 +- kernel/trace/bpf_trace.c | 6 +++--- net/core/filter.c | 20 ++++++++++---------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f8aa1320e2f7..4d1af703cfcb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1077,7 +1077,7 @@ const struct bpf_func_proto bpf_snprintf_proto = { .func = bpf_snprintf, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ecc0929ce462..3c5c03d43f5f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -6451,7 +6451,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f73e08c223b5..bd15ff62490b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1022,7 +1022,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .func = bpf_snprintf_btf, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, @@ -1526,7 +1526,7 @@ static const struct bpf_func_proto bpf_read_branch_records_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL | MEM_WRITE, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -1661,7 +1661,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/net/core/filter.c b/net/core/filter.c index d43df98e1ded..d14401193b01 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6399,7 +6399,7 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -6454,7 +6454,7 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -8008,9 +8008,9 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct iphdr), - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -8040,9 +8040,9 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct ipv6hdr), - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -8060,9 +8060,9 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { .gpl_only = true, /* __cookie_v4_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct iphdr), - .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg2_size = sizeof(struct tcphdr), }; @@ -8084,9 +8084,9 @@ static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { .gpl_only = true, /* __cookie_v6_check is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg1_size = sizeof(struct ipv6hdr), - .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM, + .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, .arg2_size = sizeof(struct tcphdr), }; #endif /* CONFIG_SYN_COOKIES */ From ed4724212f6f472fcb529947730ee0ec21c2eb6c Mon Sep 17 00:00:00 2001 From: Zesen Liu Date: Tue, 20 Jan 2026 16:28:47 +0800 Subject: [PATCH 142/268] bpf: Require ARG_PTR_TO_MEM with memory flag Add check to ensure that ARG_PTR_TO_MEM is used with either MEM_WRITE or MEM_RDONLY. Using ARG_PTR_TO_MEM alone without flags does not make sense because: - If the helper does not change the argument, missing MEM_RDONLY causes the verifier to incorrectly reject a read-only buffer. - If the helper does change the argument, missing MEM_WRITE causes the verifier to incorrectly assume the memory is unchanged, leading to errors in code optimization. Co-developed-by: Shuran Liu Signed-off-by: Shuran Liu Co-developed-by: Peili Gao Signed-off-by: Peili Gao Co-developed-by: Haoran Ni Signed-off-by: Haoran Ni Signed-off-by: Zesen Liu Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260120-helper_proto-v3-2-27b0180b4e77@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f11dc5366e5b..bca0ca82d164 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10441,10 +10441,27 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) return true; } +static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + enum bpf_arg_type arg_type = fn->arg_type[i]; + + if (base_type(arg_type) != ARG_PTR_TO_MEM) + continue; + if (!(arg_type & (MEM_WRITE | MEM_RDONLY))) + return false; + } + + return true; +} + static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && + check_mem_arg_rw_flag_ok(fn) && check_btf_id_ok(fn) ? 0 : -EINVAL; } From dd341eacdba360d035c9d4de66d3c80a89d77c84 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 20 Jan 2026 09:16:30 +0000 Subject: [PATCH 143/268] selftests/bpf: update verifier test for default trusted pointer semantics Replace the verifier test for default trusted pointer semantics, which previously relied on BPF kfunc bpf_get_root_mem_cgroup(), with a new test utilizing dedicated BPF kfuncs defined within the bpf_testmod. bpf_get_root_mem_cgroup() was modified such that it again relies on KF_ACQUIRE semantics, therefore no longer making it a suitable candidate to test BPF verifier default trusted pointer semantics against. Link: https://lore.kernel.org/bpf/20260113083949.2502978-2-mattbobrowski@google.com Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260120091630.3420452-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 4 +-- .../bpf/progs/verifier_default_trusted_ptr.c | 29 +++++++++++++++++ .../selftests/bpf/progs/verifier_memcontrol.c | 32 ------------------- .../selftests/bpf/test_kmods/bpf_testmod.c | 18 +++++++++++ .../bpf/test_kmods/bpf_testmod_kfunc.h | 3 ++ 5 files changed, 52 insertions(+), 34 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c delete mode 100644 tools/testing/selftests/bpf/progs/verifier_memcontrol.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index fa9e506cc36f..b6a1e79709be 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -30,6 +30,7 @@ #include "verifier_ctx.skel.h" #include "verifier_ctx_sk_msg.skel.h" #include "verifier_d_path.skel.h" +#include "verifier_default_trusted_ptr.skel.h" #include "verifier_direct_packet_access.skel.h" #include "verifier_direct_stack_access_wraparound.skel.h" #include "verifier_div0.skel.h" @@ -62,7 +63,6 @@ #include "verifier_masking.skel.h" #include "verifier_may_goto_1.skel.h" #include "verifier_may_goto_2.skel.h" -#include "verifier_memcontrol.skel.h" #include "verifier_meta_access.skel.h" #include "verifier_movsx.skel.h" #include "verifier_mtu.skel.h" @@ -173,6 +173,7 @@ void test_verifier_const_or(void) { RUN(verifier_const_or); } void test_verifier_ctx(void) { RUN(verifier_ctx); } void test_verifier_ctx_sk_msg(void) { RUN(verifier_ctx_sk_msg); } void test_verifier_d_path(void) { RUN(verifier_d_path); } +void test_verifier_default_trusted_ptr(void) { RUN_TESTS(verifier_default_trusted_ptr); } void test_verifier_direct_packet_access(void) { RUN(verifier_direct_packet_access); } void test_verifier_direct_stack_access_wraparound(void) { RUN(verifier_direct_stack_access_wraparound); } void test_verifier_div0(void) { RUN(verifier_div0); } @@ -205,7 +206,6 @@ void test_verifier_map_ret_val(void) { RUN(verifier_map_ret_val); } void test_verifier_masking(void) { RUN(verifier_masking); } void test_verifier_may_goto_1(void) { RUN(verifier_may_goto_1); } void test_verifier_may_goto_2(void) { RUN(verifier_may_goto_2); } -void test_verifier_memcontrol(void) { RUN(verifier_memcontrol); } void test_verifier_meta_access(void) { RUN(verifier_meta_access); } void test_verifier_movsx(void) { RUN(verifier_movsx); } void test_verifier_mul(void) { RUN(verifier_mul); } diff --git a/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c new file mode 100644 index 000000000000..fa3b656ad4fb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_default_trusted_ptr.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2026 Google LLC. + */ + +#include +#include +#include + +#include "bpf_misc.h" +#include "../test_kmods/bpf_testmod_kfunc.h" + +SEC("syscall") +__success __retval(0) +int test_default_trusted_ptr(void *ctx) +{ + struct prog_test_member *trusted_ptr; + + trusted_ptr = bpf_kfunc_get_default_trusted_ptr_test(); + /* + * Test BPF kfunc bpf_get_default_trusted_ptr_test() returns a + * PTR_TO_BTF_ID | PTR_TRUSTED, therefore it should be accepted when + * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. + */ + bpf_kfunc_put_default_trusted_ptr_test(trusted_ptr); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c b/tools/testing/selftests/bpf/progs/verifier_memcontrol.c deleted file mode 100644 index 13564956f621..000000000000 --- a/tools/testing/selftests/bpf/progs/verifier_memcontrol.c +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2026 Google LLC. - */ - -#include -#include -#include -#include "bpf_misc.h" - -SEC("syscall") -__success __retval(0) -int root_mem_cgroup_default_trusted(void *ctx) -{ - unsigned long usage; - struct mem_cgroup *root_mem_cgroup; - - root_mem_cgroup = bpf_get_root_mem_cgroup(); - if (!root_mem_cgroup) - return 1; - - /* - * BPF kfunc bpf_get_root_mem_cgroup() returns a PTR_TO_BTF_ID | - * PTR_TRUSTED | PTR_MAYBE_NULL, therefore it should be accepted when - * passed to a BPF kfunc only accepting KF_TRUSTED_ARGS. - */ - usage = bpf_mem_cgroup_usage(root_mem_cgroup); - __sink(usage); - return 0; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 0d542ba64365..d425034b72d3 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -254,6 +254,22 @@ __bpf_kfunc int *bpf_kfunc_ret_rcu_test_nostruct(int rdonly_buf_size) return NULL; } +static struct prog_test_member trusted_ptr; + +__bpf_kfunc struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) +{ + return &trusted_ptr; +} + +__bpf_kfunc void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) +{ + /* + * This BPF kfunc doesn't actually have any put/KF_ACQUIRE + * semantics. We're simply wanting to simulate a BPF kfunc that takes a + * struct prog_test_member pointer as an argument. + */ +} + __bpf_kfunc struct bpf_testmod_ctx * bpf_testmod_ctx_create(int *err) { @@ -709,6 +725,8 @@ BTF_ID_FLAGS(func, bpf_testmod_ctx_create, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_testmod_ctx_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_1) BTF_ID_FLAGS(func, bpf_testmod_ops3_call_test_2) +BTF_ID_FLAGS(func, bpf_kfunc_get_default_trusted_ptr_test); +BTF_ID_FLAGS(func, bpf_kfunc_put_default_trusted_ptr_test); BTF_KFUNCS_END(bpf_testmod_common_kfunc_ids) BTF_ID_LIST(bpf_testmod_dtor_ids) diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 225ea30c4e3d..10f89f06245f 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -166,4 +166,7 @@ extern int bpf_kfunc_multi_st_ops_test_1(struct st_ops_args *args, u32 id) __wea extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak __ksym; #endif +struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) __ksym; +void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ From c1f2c449de279967e04254f09104f657ba60ab3f Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 20 Jan 2026 15:59:10 +0000 Subject: [PATCH 144/268] bpf: Factor out timer deletion helper Move the timer deletion logic into a dedicated bpf_timer_delete() helper so it can be reused by later patches. Acked-by: Eduard Zingerman Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260120-timer_nolock-v6-1-670ffdd787b4@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4d1af703cfcb..fb6f9dbca084 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1558,18 +1558,10 @@ static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *a return cb; } -/* This function is called by map_delete/update_elem for individual element and - * by ops->map_release_uref when the user space reference to a map reaches zero. - */ -void bpf_timer_cancel_and_free(void *val) +static void bpf_timer_delete(struct bpf_hrtimer *t) { - struct bpf_hrtimer *t; - - t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); - - if (!t) - return; - /* We check that bpf_map_delete/update_elem() was called from timer + /* + * We check that bpf_map_delete/update_elem() was called from timer * callback_fn. In such case we don't call hrtimer_cancel() (since it * will deadlock) and don't call hrtimer_try_to_cancel() (since it will * just return -1). Though callback_fn is still running on this cpu it's @@ -1618,6 +1610,21 @@ void bpf_timer_cancel_and_free(void *val) } } +/* + * This function is called by map_delete/update_elem for individual element and + * by ops->map_release_uref when the user space reference to a map reaches zero. + */ +void bpf_timer_cancel_and_free(void *val) +{ + struct bpf_hrtimer *t; + + t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); + if (!t) + return; + + bpf_timer_delete(t); +} + /* This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ From 57d31e72dbdd1f71455aa62a2505a8cf088f46c6 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 20 Jan 2026 15:59:11 +0000 Subject: [PATCH 145/268] bpf: Remove unnecessary arguments from bpf_async_set_callback() Remove unused arguments from __bpf_async_set_callback(). Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260120-timer_nolock-v6-2-670ffdd787b4@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fb6f9dbca084..6eadb66b8c67 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1355,10 +1355,9 @@ static const struct bpf_func_proto bpf_timer_init_proto = { }; static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, - struct bpf_prog_aux *aux, unsigned int flags, - enum bpf_async_type type) + struct bpf_prog *prog) { - struct bpf_prog *prev, *prog = aux->prog; + struct bpf_prog *prev; struct bpf_async_cb *cb; int ret = 0; @@ -1403,7 +1402,7 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, struct bpf_prog_aux *, aux) { - return __bpf_async_set_callback(timer, callback_fn, aux, 0, BPF_ASYNC_TYPE_TIMER); + return __bpf_async_set_callback(timer, callback_fn, aux->prog); } static const struct bpf_func_proto bpf_timer_set_callback_proto = { @@ -3137,7 +3136,7 @@ __bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, if (flags) return -EINVAL; - return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ); + return __bpf_async_set_callback(async, callback_fn, aux->prog); } __bpf_kfunc void bpf_preempt_disable(void) From 8bb1e32b3fac1becb4c1c8079d720784b8e33e34 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 20 Jan 2026 15:59:12 +0000 Subject: [PATCH 146/268] bpf: Introduce lock-free bpf_async_update_prog_callback() Introduce bpf_async_update_prog_callback(): lock-free update of cb->prog and cb->callback_fn. This function allows updating prog and callback_fn fields of the struct bpf_async_cb without holding lock. For now use it under the lock from __bpf_async_set_callback(), in the next patches that lock will be removed. Lock-free algorithm: * Acquire a guard reference on prog to prevent it from being freed during the retry loop. * Retry loop: 1. Each iteration acquires a new prog reference and stores it in cb->prog via xchg. The previous prog is released. 2. The loop condition checks if both cb->prog and cb->callback_fn match what we just wrote. If either differs, a concurrent writer overwrote our value, and we must retry. 3. When we retry, our previously-stored prog was already released by the concurrent writer or will be released by us after overwriting. * Release guard reference. Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260120-timer_nolock-v6-3-670ffdd787b4@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 67 ++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6eadb66b8c67..2a2df867bfe7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1354,10 +1354,43 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; +static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callback_fn, + struct bpf_prog *prog) +{ + struct bpf_prog *prev; + + /* Acquire a guard reference on prog to prevent it from being freed during the loop */ + if (prog) { + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + + do { + if (prog) + prog = bpf_prog_inc_not_zero(prog); + prev = xchg(&cb->prog, prog); + rcu_assign_pointer(cb->callback_fn, callback_fn); + + /* + * Release previous prog, make sure that if other CPU is contending, + * to set bpf_prog, references are not leaked as each iteration acquires and + * releases one reference. + */ + if (prev) + bpf_prog_put(prev); + + } while (READ_ONCE(cb->prog) != prog || READ_ONCE(cb->callback_fn) != callback_fn); + + if (prog) + bpf_prog_put(prog); + + return 0; +} + static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog *prog) { - struct bpf_prog *prev; struct bpf_async_cb *cb; int ret = 0; @@ -1378,22 +1411,7 @@ static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback ret = -EPERM; goto out; } - prev = cb->prog; - if (prev != prog) { - /* Bump prog refcnt once. Every bpf_timer_set_callback() - * can pick different callback_fn-s within the same prog. - */ - prog = bpf_prog_inc_not_zero(prog); - if (IS_ERR(prog)) { - ret = PTR_ERR(prog); - goto out; - } - if (prev) - /* Drop prev prog refcnt when swapping with new prog */ - bpf_prog_put(prev); - cb->prog = prog; - } - rcu_assign_pointer(cb->callback_fn, callback_fn); + ret = bpf_async_update_prog_callback(cb, callback_fn, prog); out: __bpf_spin_unlock_irqrestore(&async->lock); return ret; @@ -1453,17 +1471,6 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -static void drop_prog_refcnt(struct bpf_async_cb *async) -{ - struct bpf_prog *prog = async->prog; - - if (prog) { - bpf_prog_put(prog); - async->prog = NULL; - rcu_assign_pointer(async->callback_fn, NULL); - } -} - BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) { struct bpf_hrtimer *t, *cur_t; @@ -1514,7 +1521,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) goto out; } drop: - drop_prog_refcnt(&t->cb); + bpf_async_update_prog_callback(&t->cb, NULL, NULL); out: __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish @@ -1547,7 +1554,7 @@ static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *a cb = async->cb; if (!cb) goto out; - drop_prog_refcnt(cb); + bpf_async_update_prog_callback(cb, NULL, NULL); /* The subsequent bpf_timer_start/cancel() helpers won't be able to use * this timer, since it won't be initialized. */ From 83c9030cdc45e0518d71065c25201a24eafc9818 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 20 Jan 2026 15:59:13 +0000 Subject: [PATCH 147/268] bpf: Simplify bpf_timer_cancel() Remove lock from the bpf_timer_cancel() helper. The lock does not protect from concurrent modification of the bpf_async_cb data fields as those are modified in the callback without locking. Use guard(rcu)() instead of pair of explicit lock()/unlock(). Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260120-timer_nolock-v6-4-670ffdd787b4@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2a2df867bfe7..637677815365 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1471,7 +1471,7 @@ static const struct bpf_func_proto bpf_timer_start_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) +BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) { struct bpf_hrtimer *t, *cur_t; bool inc = false; @@ -1479,13 +1479,12 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) if (in_nmi()) return -EOPNOTSUPP; - rcu_read_lock(); - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t) { - ret = -EINVAL; - goto out; - } + + guard(rcu)(); + + t = READ_ONCE(async->timer); + if (!t) + return -EINVAL; cur_t = this_cpu_read(hrtimer_running); if (cur_t == t) { @@ -1493,8 +1492,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) * its own timer the hrtimer_cancel() will deadlock * since it waits for callback_fn to finish. */ - ret = -EDEADLK; - goto out; + return -EDEADLK; } /* Only account in-flight cancellations when invoked from a timer @@ -1517,20 +1515,17 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) * cancelling and waiting for it synchronously, since it might * do the same. Bail! */ - ret = -EDEADLK; - goto out; + atomic_dec(&t->cancelling); + return -EDEADLK; } drop: bpf_async_update_prog_callback(&t->cb, NULL, NULL); -out: - __bpf_spin_unlock_irqrestore(&timer->lock); /* Cancel the timer and wait for associated callback to finish * if it was running. */ - ret = ret ?: hrtimer_cancel(&t->timer); + ret = hrtimer_cancel(&t->timer); if (inc) atomic_dec(&t->cancelling); - rcu_read_unlock(); return ret; } From eaedea154eb96b2f4ba8ec8e4397dab10758a705 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 20 Jan 2026 15:05:54 +0800 Subject: [PATCH 148/268] bpf, x86: inline bpf_get_current_task() for x86_64 Inline bpf_get_current_task() and bpf_get_current_task_btf() for x86_64 to obtain better performance. Signed-off-by: Menglong Dong Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260120070555.233486-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bca0ca82d164..9905ad40f4d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18130,6 +18130,10 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) switch (imm) { #ifdef CONFIG_X86_64 case BPF_FUNC_get_smp_processor_id: +#ifdef CONFIG_SMP + case BPF_FUNC_get_current_task_btf: + case BPF_FUNC_get_current_task: +#endif return env->prog->jit_requested && bpf_jit_supports_percpu_insn(); #endif default: @@ -23715,6 +23719,24 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; goto next_insn; } + + /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */ + if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) && + verifier_inlines_helper_call(env, insn->imm)) { + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)¤t_task); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } #endif /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && From 4fca95095cdcd81bd4a8c8c7008fb3c175a3a5d5 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 20 Jan 2026 15:05:55 +0800 Subject: [PATCH 149/268] selftests/bpf: test the jited inline of bpf_get_current_task Add the testcase for the jited inline of bpf_get_current_task(). Signed-off-by: Menglong Dong Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260120070555.233486-3-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/verifier.c | 2 ++ .../selftests/bpf/progs/verifier_jit_inline.c | 20 +++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_jit_inline.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index b6a1e79709be..302286a80154 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -112,6 +112,7 @@ #include "verifier_xdp_direct_packet_access.skel.h" #include "verifier_bits_iter.skel.h" #include "verifier_lsm.skel.h" +#include "verifier_jit_inline.skel.h" #include "irq.skel.h" #define MAX_ENTRIES 11 @@ -255,6 +256,7 @@ void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } void test_verifier_lsm(void) { RUN(verifier_lsm); } void test_irq(void) { RUN(irq); } void test_verifier_mtu(void) { RUN(verifier_mtu); } +void test_verifier_jit_inline(void) { RUN(verifier_jit_inline); } static int init_test_val_map(struct bpf_object *obj, char *map_name) { diff --git a/tools/testing/selftests/bpf/progs/verifier_jit_inline.c b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c new file mode 100644 index 000000000000..4ea254063646 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_jit_inline.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +SEC("fentry/bpf_fentry_test1") +__success __retval(0) +__arch_x86_64 +__jited(" addq %gs:{{.*}}, %rax") +__arch_arm64 +__jited(" mrs x7, SP_EL0") +int inline_bpf_get_current_task(void) +{ + bpf_get_current_task(); + + return 0; +} + +char _license[] SEC("license") = "GPL"; From 85c7f914714741de992fc19c2ba673f6c400a584 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 21 Jan 2026 12:43:47 +0800 Subject: [PATCH 150/268] bpf: support bpf_get_func_arg() for BPF_TRACE_RAW_TP For now, bpf_get_func_arg() and bpf_get_func_arg_cnt() is not supported by the BPF_TRACE_RAW_TP, which is not convenient to get the argument of the tracepoint, especially for the case that the position of the arguments in a tracepoint can change. The target tracepoint BTF type id is specified during loading time, therefore we can get the function argument count from the function prototype instead of the stack. Signed-off-by: Menglong Dong Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20260121044348.113201-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 22 ++++++++++++++++++---- kernel/trace/bpf_trace.c | 10 ++++++++-- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9905ad40f4d3..c7f5234d5fd2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23741,8 +23741,15 @@ static int do_misc_fixups(struct bpf_verifier_env *env) /* Implement bpf_get_func_arg inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + if (eatype == BPF_TRACE_RAW_TP) { + int nr_args = btf_type_vlen(prog->aux->attach_func_proto); + + /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + } else { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + } insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); @@ -23794,8 +23801,15 @@ static int do_misc_fixups(struct bpf_verifier_env *env) /* Implement get_func_arg_cnt inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_arg_cnt) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + if (eatype == BPF_TRACE_RAW_TP) { + int nr_args = btf_type_vlen(prog->aux->attach_func_proto); + + /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + } else { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); if (!new_prog) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index bd15ff62490b..0e9635bcd783 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1734,11 +1734,17 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_d_path: return &bpf_d_path_proto; case BPF_FUNC_get_func_arg: - return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL; + if (bpf_prog_has_trampoline(prog) || + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_func_arg_proto; + return NULL; case BPF_FUNC_get_func_ret: return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; case BPF_FUNC_get_func_arg_cnt: - return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + if (bpf_prog_has_trampoline(prog) || + prog->expected_attach_type == BPF_TRACE_RAW_TP) + return &bpf_get_func_arg_cnt_proto; + return NULL; case BPF_FUNC_get_attach_cookie: if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) From 1ed797764315496a115cd0568450a7f72da80df6 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 21 Jan 2026 12:43:48 +0800 Subject: [PATCH 151/268] selftests/bpf: test bpf_get_func_arg() for tp_btf Test bpf_get_func_arg() and bpf_get_func_arg_cnt() for tp_btf. The code is most copied from test1 and test2. Signed-off-by: Menglong Dong Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20260121044348.113201-3-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/get_func_args_test.c | 3 ++ .../selftests/bpf/progs/get_func_args_test.c | 44 +++++++++++++++++++ .../bpf/test_kmods/bpf_testmod-events.h | 10 +++++ .../selftests/bpf/test_kmods/bpf_testmod.c | 4 ++ 4 files changed, 61 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c index 64a9c95d4acf..fadee95d3ae8 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c @@ -33,11 +33,14 @@ void test_get_func_args_test(void) ASSERT_EQ(topts.retval >> 16, 1, "test_run"); ASSERT_EQ(topts.retval & 0xffff, 1234 + 29, "test_run"); + ASSERT_OK(trigger_module_test_read(1), "trigger_read"); ASSERT_EQ(skel->bss->test1_result, 1, "test1_result"); ASSERT_EQ(skel->bss->test2_result, 1, "test2_result"); ASSERT_EQ(skel->bss->test3_result, 1, "test3_result"); ASSERT_EQ(skel->bss->test4_result, 1, "test4_result"); + ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); + ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); cleanup: get_func_args_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index e0f34a55e697..5b7233afef05 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -121,3 +121,47 @@ int BPF_PROG(fexit_test, int _a, int *_b, int _ret) test4_result &= err == 0 && ret == 1234; return 0; } + +__u64 test5_result = 0; +SEC("tp_btf/bpf_testmod_fentry_test1_tp") +int BPF_PROG(tp_test1) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, z = 0; + __s64 err; + + test5_result = cnt == 1; + + err = bpf_get_func_arg(ctx, 0, &a); + test5_result &= err == 0 && ((int) a == 1); + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 1, &z); + test5_result &= err == -EINVAL; + + return 0; +} + +__u64 test6_result = 0; +SEC("tp_btf/bpf_testmod_fentry_test2_tp") +int BPF_PROG(tp_test2) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, b = 0, z = 0; + __s64 err; + + test6_result = cnt == 2; + + /* valid arguments */ + err = bpf_get_func_arg(ctx, 0, &a); + test6_result &= err == 0 && (int) a == 2; + + err = bpf_get_func_arg(ctx, 1, &b); + test6_result &= err == 0 && b == 3; + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 2, &z); + test6_result &= err == -EINVAL; + + return 0; +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h index aeef86b3da74..45a5e41f3a92 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod-events.h @@ -63,6 +63,16 @@ BPF_TESTMOD_DECLARE_TRACE(bpf_testmod_test_writable_bare, sizeof(struct bpf_testmod_test_writable_ctx) ); +DECLARE_TRACE(bpf_testmod_fentry_test1, + TP_PROTO(int a), + TP_ARGS(a) +); + +DECLARE_TRACE(bpf_testmod_fentry_test2, + TP_PROTO(int a, u64 b), + TP_ARGS(a, b) +); + #endif /* _BPF_TESTMOD_EVENTS_H */ #undef TRACE_INCLUDE_PATH diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index d425034b72d3..77a81fa8ec6a 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -412,11 +412,15 @@ __weak noinline struct file *bpf_testmod_return_ptr(int arg) noinline int bpf_testmod_fentry_test1(int a) { + trace_bpf_testmod_fentry_test1_tp(a); + return a + 1; } noinline int bpf_testmod_fentry_test2(int a, u64 b) { + trace_bpf_testmod_fentry_test2_tp(a, b); + return a + b; } From d0f5d4f8f3285349bfb94fa84555ab267d2c041d Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Wed, 21 Jan 2026 09:00:01 +0000 Subject: [PATCH 152/268] bpf: Revert "bpf: drop KF_ACQUIRE flag on BPF kfunc bpf_get_root_mem_cgroup()" This reverts commit e463b6de9da1 ("bpf: drop KF_ACQUIRE flag on BPF kfunc bpf_get_root_mem_cgroup()"). The original commit removed the KF_ACQUIRE flag from bpf_get_root_mem_cgroup() under the assumption that it resulted in simplified usage. This stemmed from the fact that bpf_get_root_mem_cgroup() inherently returns a reference to an object which technically isn't reference counted, therefore there is no strong requirement to call a matching bpf_put_mem_cgroup() on the returned reference. Although technically correct, as per the arguments in the thread [0], dropping the KF_ACQUIRE flag and losing reference tracking semantics negatively impacted the usability of bpf_get_root_mem_cgroup() in practice. [0] https://lore.kernel.org/bpf/878qdx6yut.fsf@linux.dev/ Link: https://lore.kernel.org/bpf/CAADnVQ+6d1Lj4dteAv8u62d7kj3Ze5io6bqM0xeQd-UPk9ZgJQ@mail.gmail.com/ Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260121090001.240166-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- mm/bpf_memcontrol.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index f95cd5d16f4c..716df49d7647 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -13,12 +13,12 @@ __bpf_kfunc_start_defs(); /** * bpf_get_root_mem_cgroup - Returns a pointer to the root memory cgroup * - * Return: A pointer to the root memory cgroup. Notably, the pointer to the - * returned struct mem_cgroup is trusted by default, so it's perfectably - * acceptable to also pass this pointer into other BPF kfuncs (e.g., - * bpf_mem_cgroup_usage()). Additionally, this BPF kfunc does not make use of - * KF_ACQUIRE semantics, so there's no requirement for the BPF program to call - * bpf_put_mem_cgroup() on the pointer returned by this BPF kfunc. + * The function has KF_ACQUIRE semantics, even though the root memory + * cgroup is never destroyed after being created and doesn't require + * reference counting. And it's perfectly safe to pass it to + * bpf_put_mem_cgroup() + * + * Return: A pointer to the root memory cgroup. */ __bpf_kfunc struct mem_cgroup *bpf_get_root_mem_cgroup(void) { @@ -162,7 +162,7 @@ __bpf_kfunc void bpf_mem_cgroup_flush_stats(struct mem_cgroup *memcg) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_memcontrol_kfuncs) -BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) From 1dc669646762726d59be15e2de354b06e3e0cbcf Mon Sep 17 00:00:00 2001 From: Yuzuki Ishiyama Date: Wed, 21 Jan 2026 12:33:27 +0900 Subject: [PATCH 153/268] bpf: add bpf_strncasecmp kfunc bpf_strncasecmp() function performs same like bpf_strcasecmp() except limiting the comparison to a specific length. Signed-off-by: Yuzuki Ishiyama Acked-by: Viktor Malik Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260121033328.1850010-2-ishiyama@hpc.is.uec.ac.jp Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 637677815365..b54ec0e945aa 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3413,7 +3413,7 @@ __bpf_kfunc void __bpf_trap(void) * __get_kernel_nofault instead of plain dereference to make them safe. */ -static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) +static int __bpf_strncasecmp(const char *s1, const char *s2, bool ignore_case, size_t len) { char c1, c2; int i; @@ -3424,7 +3424,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) } guard(pagefault)(); - for (i = 0; i < XATTR_SIZE_MAX; i++) { + for (i = 0; i < len && i < XATTR_SIZE_MAX; i++) { __get_kernel_nofault(&c1, s1, char, err_out); __get_kernel_nofault(&c2, s2, char, err_out); if (ignore_case) { @@ -3438,7 +3438,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) s1++; s2++; } - return -E2BIG; + return i == XATTR_SIZE_MAX ? -E2BIG : 0; err_out: return -EFAULT; } @@ -3458,7 +3458,7 @@ static int __bpf_strcasecmp(const char *s1, const char *s2, bool ignore_case) */ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) { - return __bpf_strcasecmp(s1__ign, s2__ign, false); + return __bpf_strncasecmp(s1__ign, s2__ign, false, XATTR_SIZE_MAX); } /** @@ -3476,7 +3476,26 @@ __bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) */ __bpf_kfunc int bpf_strcasecmp(const char *s1__ign, const char *s2__ign) { - return __bpf_strcasecmp(s1__ign, s2__ign, true); + return __bpf_strncasecmp(s1__ign, s2__ign, true, XATTR_SIZE_MAX); +} + +/* + * bpf_strncasecmp - Compare two length-limited strings, ignoring case + * @s1__ign: One string + * @s2__ign: Another string + * @len: The maximum number of characters to compare + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strncasecmp(const char *s1__ign, const char *s2__ign, size_t len) +{ + return __bpf_strncasecmp(s1__ign, s2__ign, true, len); } /** @@ -4526,6 +4545,7 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) BTF_ID_FLAGS(func, __bpf_trap) BTF_ID_FLAGS(func, bpf_strcmp); BTF_ID_FLAGS(func, bpf_strcasecmp); +BTF_ID_FLAGS(func, bpf_strncasecmp); BTF_ID_FLAGS(func, bpf_strchr); BTF_ID_FLAGS(func, bpf_strchrnul); BTF_ID_FLAGS(func, bpf_strnchr); From f4924ad0b13fd4ca4f0c7117dc143bf372224aec Mon Sep 17 00:00:00 2001 From: Yuzuki Ishiyama Date: Wed, 21 Jan 2026 12:33:28 +0900 Subject: [PATCH 154/268] selftests/bpf: Test kfunc bpf_strncasecmp Add testsuites for kfunc bpf_strncasecmp. Signed-off-by: Yuzuki Ishiyama Acked-by: Viktor Malik Link: https://lore.kernel.org/r/20260121033328.1850010-3-ishiyama@hpc.is.uec.ac.jp Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/string_kfuncs.c | 1 + tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c | 6 ++++++ tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c | 1 + tools/testing/selftests/bpf/progs/string_kfuncs_success.c | 7 +++++++ 4 files changed, 15 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c index 0f3bf594e7a5..300032a19445 100644 --- a/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c +++ b/tools/testing/selftests/bpf/prog_tests/string_kfuncs.c @@ -9,6 +9,7 @@ static const char * const test_cases[] = { "strcmp", "strcasecmp", + "strncasecmp", "strchr", "strchrnul", "strnchr", diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c index 826e6b6aff7e..bddc4e8579d2 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure1.c @@ -33,6 +33,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_null1(void *ctx) { return SEC("syscall") __retval(USER_PTR_ERR)int test_strcmp_null2(void *ctx) { return bpf_strcmp("hello", NULL); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_null1(void *ctx) { return bpf_strcasecmp(NULL, "HELLO"); } SEC("syscall") __retval(USER_PTR_ERR)int test_strcasecmp_null2(void *ctx) { return bpf_strcasecmp("HELLO", NULL); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasecmp_null1(void *ctx) { return bpf_strncasecmp(NULL, "HELLO", 5); } +SEC("syscall") __retval(USER_PTR_ERR)int test_strncasecmp_null2(void *ctx) { return bpf_strncasecmp("HELLO", NULL, 5); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchr_null(void *ctx) { return bpf_strchr(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strchrnul_null(void *ctx) { return bpf_strchrnul(NULL, 'a'); } SEC("syscall") __retval(USER_PTR_ERR)int test_strnchr_null(void *ctx) { return bpf_strnchr(NULL, 1, 'a'); } @@ -57,6 +59,8 @@ SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr1(void *ctx) { ret SEC("syscall") __retval(USER_PTR_ERR) int test_strcmp_user_ptr2(void *ctx) { return bpf_strcmp("hello", user_ptr); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr1(void *ctx) { return bpf_strcasecmp(user_ptr, "HELLO"); } SEC("syscall") __retval(USER_PTR_ERR) int test_strcasecmp_user_ptr2(void *ctx) { return bpf_strcasecmp("HELLO", user_ptr); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr1(void *ctx) { return bpf_strncasecmp(user_ptr, "HELLO", 5); } +SEC("syscall") __retval(USER_PTR_ERR) int test_strncasecmp_user_ptr2(void *ctx) { return bpf_strncasecmp("HELLO", user_ptr, 5); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchr_user_ptr(void *ctx) { return bpf_strchr(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strchrnul_user_ptr(void *ctx) { return bpf_strchrnul(user_ptr, 'a'); } SEC("syscall") __retval(USER_PTR_ERR) int test_strnchr_user_ptr(void *ctx) { return bpf_strnchr(user_ptr, 1, 'a'); } @@ -83,6 +87,8 @@ SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault1(void *ctx) { return SEC("syscall") __retval(-EFAULT) int test_strcmp_pagefault2(void *ctx) { return bpf_strcmp("hello", invalid_kern_ptr); } SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault1(void *ctx) { return bpf_strcasecmp(invalid_kern_ptr, "HELLO"); } SEC("syscall") __retval(-EFAULT) int test_strcasecmp_pagefault2(void *ctx) { return bpf_strcasecmp("HELLO", invalid_kern_ptr); } +SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault1(void *ctx) { return bpf_strncasecmp(invalid_kern_ptr, "HELLO", 5); } +SEC("syscall") __retval(-EFAULT) int test_strncasecmp_pagefault2(void *ctx) { return bpf_strncasecmp("HELLO", invalid_kern_ptr, 5); } SEC("syscall") __retval(-EFAULT) int test_strchr_pagefault(void *ctx) { return bpf_strchr(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strchrnul_pagefault(void *ctx) { return bpf_strchrnul(invalid_kern_ptr, 'a'); } SEC("syscall") __retval(-EFAULT) int test_strnchr_pagefault(void *ctx) { return bpf_strnchr(invalid_kern_ptr, 1, 'a'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c index 05e1da1f250f..412c53b87b18 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_failure2.c @@ -8,6 +8,7 @@ char long_str[XATTR_SIZE_MAX + 1]; SEC("syscall") int test_strcmp_too_long(void *ctx) { return bpf_strcmp(long_str, long_str); } SEC("syscall") int test_strcasecmp_too_long(void *ctx) { return bpf_strcasecmp(long_str, long_str); } +SEC("syscall") int test_strncasecmp_too_long(void *ctx) { return bpf_strncasecmp(long_str, long_str, sizeof(long_str)); } SEC("syscall") int test_strchr_too_long(void *ctx) { return bpf_strchr(long_str, 'b'); } SEC("syscall") int test_strchrnul_too_long(void *ctx) { return bpf_strchrnul(long_str, 'b'); } SEC("syscall") int test_strnchr_too_long(void *ctx) { return bpf_strnchr(long_str, sizeof(long_str), 'b'); } diff --git a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c index a8513964516b..f65b1226a81a 100644 --- a/tools/testing/selftests/bpf/progs/string_kfuncs_success.c +++ b/tools/testing/selftests/bpf/progs/string_kfuncs_success.c @@ -17,6 +17,13 @@ __test(0) int test_strcasecmp_eq2(void *ctx) { return bpf_strcasecmp(str, "HELLO __test(0) int test_strcasecmp_eq3(void *ctx) { return bpf_strcasecmp(str, "HELLO world"); } __test(1) int test_strcasecmp_neq1(void *ctx) { return bpf_strcasecmp(str, "hello"); } __test(1) int test_strcasecmp_neq2(void *ctx) { return bpf_strcasecmp(str, "HELLO"); } +__test(0) int test_strncasecmp_eq1(void *ctx) { return bpf_strncasecmp(str, "hello world", 11); } +__test(0) int test_strncasecmp_eq2(void *ctx) { return bpf_strncasecmp(str, "HELLO WORLD", 11); } +__test(0) int test_strncasecmp_eq3(void *ctx) { return bpf_strncasecmp(str, "HELLO world", 11); } +__test(0) int test_strncasecmp_eq4(void *ctx) { return bpf_strncasecmp(str, "hello", 5); } +__test(0) int test_strncasecmp_eq5(void *ctx) { return bpf_strncasecmp(str, "hello world!", 11); } +__test(-1) int test_strncasecmp_neq1(void *ctx) { return bpf_strncasecmp(str, "hello!", 6); } +__test(1) int test_strncasecmp_neq2(void *ctx) { return bpf_strncasecmp(str, "abc", 3); } __test(1) int test_strchr_found(void *ctx) { return bpf_strchr(str, 'e'); } __test(11) int test_strchr_null(void *ctx) { return bpf_strchr(str, '\0'); } __test(-ENOENT) int test_strchr_notfound(void *ctx) { return bpf_strchr(str, 'x'); } From 26ad5d6e763070aa146d86b941884b11eb1ac0aa Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 21 Jan 2026 10:16:17 -0800 Subject: [PATCH 155/268] scripts/gen-btf.sh: Use CONFIG_SHELL for execution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the docs [1], kernel build scripts should be executed via CONFIG_SHELL, which is sh by default. Fixup gen-btf.sh to be runnable with sh, and use CONFIG_SHELL at every invocation site. See relevant discussion for context [2]. [1] https://docs.kernel.org/kbuild/makefiles.html#script-invocation [2] https://lore.kernel.org/bpf/CAADnVQ+dxmSNoJAGb6xV89ffUCKXe5CJXovXZt22nv5iYFV5mw@mail.gmail.com/ Signed-off-by: Ihor Solodrai Tested-by: Gary Guo Reported-by: Gary Guo Suggested-by: Thomas Weißschuh Fixes: 522397d05e7d ("resolve_btfids: Change in-place update with raw binary output") Link: https://lore.kernel.org/r/20260121181617.820300-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- scripts/Makefile.modfinal | 2 +- scripts/gen-btf.sh | 8 ++++---- scripts/link-vmlinux.sh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index 422c56dc878e..adcbcde16a07 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -43,7 +43,7 @@ quiet_cmd_btf_ko = BTF [M] $@ if [ ! -f $(objtree)/vmlinux ]; then \ printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \ else \ - $(srctree)/scripts/gen-btf.sh --btf_base $(objtree)/vmlinux $@; \ + $(CONFIG_SHELL) $(srctree)/scripts/gen-btf.sh --btf_base $(objtree)/vmlinux $@; \ fi; # Same as newer-prereqs, but allows to exclude specified extra dependencies diff --git a/scripts/gen-btf.sh b/scripts/gen-btf.sh index be21ccee3487..8ca96eb10a69 100755 --- a/scripts/gen-btf.sh +++ b/scripts/gen-btf.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # SPDX-License-Identifier: GPL-2.0 # Copyright (c) 2025 Meta Platforms, Inc. and affiliates. # @@ -81,7 +81,7 @@ gen_btf_data() gen_btf_o() { - local btf_data=${ELF_FILE}.btf.o + btf_data=${ELF_FILE}.btf.o # Create ${btf_data} which contains just .BTF section but no symbols. Add # SHF_ALLOC because .BTF will be part of the vmlinux image. --strip-all @@ -107,11 +107,11 @@ embed_btf_data() ${OBJCOPY} --add-section .BTF=${ELF_FILE}.BTF ${ELF_FILE} # a module might not have a .BTF_ids or .BTF.base section - local btf_base="${ELF_FILE}.BTF.base" + btf_base="${ELF_FILE}.BTF.base" if [ -f "${btf_base}" ]; then ${OBJCOPY} --add-section .BTF.base=${btf_base} ${ELF_FILE} fi - local btf_ids="${ELF_FILE}.BTF_ids" + btf_ids="${ELF_FILE}.BTF_ids" if [ -f "${btf_ids}" ]; then ${RESOLVE_BTFIDS} --patch_btfids ${btf_ids} ${ELF_FILE} fi diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 08cd8e25c65c..16d6a048e07c 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -206,7 +206,7 @@ fi if is_enabled CONFIG_DEBUG_INFO_BTF; then info BTF .tmp_vmlinux1 - if ! ${srctree}/scripts/gen-btf.sh .tmp_vmlinux1; then + if ! ${CONFIG_SHELL} ${srctree}/scripts/gen-btf.sh .tmp_vmlinux1; then echo >&2 "Failed to generate BTF for vmlinux" echo >&2 "Try to disable CONFIG_DEBUG_INFO_BTF" exit 1 From a32ae2658471dd87a2f7a438388ed7d9a5767212 Mon Sep 17 00:00:00 2001 From: Kery Qi Date: Wed, 21 Jan 2026 17:41:16 +0800 Subject: [PATCH 156/268] selftests/bpf: Fix resource leak in serial_test_wq on attach failure When wq__attach() fails, serial_test_wq() returns early without calling wq__destroy(), leaking the skeleton resources allocated by wq__open_and_load(). This causes ASAN leak reports in selftests runs. Fix this by jumping to a common clean_up label that calls wq__destroy() on all exit paths after successful open_and_load. Note that the early return after wq__open_and_load() failure is correct and doesn't need fixing, since that function returns NULL on failure (after internally cleaning up any partial allocations). Fixes: 8290dba51910 ("selftests/bpf: wq: add bpf_wq_start() checks") Signed-off-by: Kery Qi Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20260121094114.1801-3-qikeyu2017@gmail.com --- tools/testing/selftests/bpf/prog_tests/wq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/wq.c b/tools/testing/selftests/bpf/prog_tests/wq.c index 15c67d23128b..84831eecc935 100644 --- a/tools/testing/selftests/bpf/prog_tests/wq.c +++ b/tools/testing/selftests/bpf/prog_tests/wq.c @@ -16,12 +16,12 @@ void serial_test_wq(void) /* re-run the success test to check if the timer was actually executed */ wq_skel = wq__open_and_load(); - if (!ASSERT_OK_PTR(wq_skel, "wq_skel_load")) + if (!ASSERT_OK_PTR(wq_skel, "wq__open_and_load")) return; err = wq__attach(wq_skel); if (!ASSERT_OK(err, "wq_attach")) - return; + goto clean_up; prog_fd = bpf_program__fd(wq_skel->progs.test_syscall_array_sleepable); err = bpf_prog_test_run_opts(prog_fd, &topts); @@ -31,6 +31,7 @@ void serial_test_wq(void) usleep(50); /* 10 usecs should be enough, but give it extra */ ASSERT_EQ(wq_skel->bss->ok_sleepable, (1 << 1), "ok_sleepable"); +clean_up: wq__destroy(wq_skel); } From 82f3b142c99cf44c7b1e70b7720169c646b9760f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 22 Jan 2026 03:59:11 -0800 Subject: [PATCH 157/268] rqspinlock: Fix TAS fallback lock entry creation The TAS fallback can be invoked directly when queued spin locks are disabled, and through the slow path when paravirt is enabled for queued spin locks. In the latter case, the res_spin_lock macro will attempt the fast path and already hold the entry when entering the slow path. This will lead to creation of extraneous entries that are not released, which may cause false positives for deadlock detection. Fix this by always preceding invocation of the TAS fallback in every case with the grabbing of the held lock entry, and add a comment to make note of this. Fixes: c9102a68c070 ("rqspinlock: Add a test-and-set fallback") Reported-by: Amery Hung Signed-off-by: Kumar Kartikeya Dwivedi Tested-by: Amery Hung Link: https://lore.kernel.org/r/20260122115911.3668985-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/asm-generic/rqspinlock.h | 2 +- kernel/bpf/rqspinlock.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h index 0f2dcbbfee2f..5c5cf2f7fc39 100644 --- a/include/asm-generic/rqspinlock.h +++ b/include/asm-generic/rqspinlock.h @@ -191,7 +191,7 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock) #else -#define res_spin_lock(lock) resilient_tas_spin_lock(lock) +#define res_spin_lock(lock) ({ grab_held_lock_entry(lock); resilient_tas_spin_lock(lock); }) #endif /* CONFIG_QUEUED_SPINLOCKS */ diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index f7d0c8d4644e..2fdfa828e3d3 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -265,10 +265,11 @@ int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) RES_INIT_TIMEOUT(ts); /* - * The fast path is not invoked for the TAS fallback, so we must grab - * the deadlock detection entry here. + * We are either called directly from res_spin_lock after grabbing the + * deadlock detection entry when queued spinlocks are disabled, or from + * resilient_queued_spin_lock_slowpath after grabbing the deadlock + * detection entry. No need to obtain it here. */ - grab_held_lock_entry(lock); /* * Since the waiting loop's time is dependent on the amount of From d8df878140506e7938928195f540c10b1089fdaf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Jan 2026 21:51:22 -0800 Subject: [PATCH 158/268] selftests/bpf: Fix task_local_data failure with 64K page On arm64 systems with 64K pages, the selftest task_local_data has the following failures: ... test_task_local_data_basic:PASS:tld_create_key 0 nsec test_task_local_data_basic:FAIL:tld_create_key unexpected tld_create_key: actual 0 != expected -28 ... test_task_local_data_basic_thread:PASS:run task_main 0 nsec test_task_local_data_basic_thread:FAIL:task_main retval unexpected error: 2 (errno 0) test_task_local_data_basic_thread:FAIL:tld_get_data value0 unexpected tld_get_data value0: actual 0 != expected 6268 ... #447/1 task_local_data/task_local_data_basic:FAIL ... #447/2 task_local_data/task_local_data_race:FAIL #447 task_local_data:FAIL When TLD_DYN_DATA_SIZE is 64K page size, for struct tld_meta_u { _Atomic __u8 cnt; __u16 size; struct tld_metadata metadata[]; }; field 'cnt' would overflow. For example, for 4K page, 'cnt' will be 4096/64 = 64. But for 64K page, 'cnt' will be 65536/64 = 1024 and 'cnt' is not enough for 1024. To accommodate 64K page, '_Atomic __u8 cnt' becomes '_Atomic __u16 cnt'. A few other places are adjusted accordingly. In test_task_local_data.c, the value for TLD_DYN_DATA_SIZE is changed from 4096 to (getpagesize() - 8) since the maximum buffer size for TLD_DYN_DATA_SIZE is (getpagesize() - 8). Reviewed-by: Alan Maguire Tested-by: Alan Maguire Cc: Amery Hung Signed-off-by: Yonghong Song Acked-by: Amery Hung Link: https://lore.kernel.org/r/20260123055122.494352-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/task_local_data.h | 4 ++-- tools/testing/selftests/bpf/prog_tests/test_task_local_data.c | 2 +- tools/testing/selftests/bpf/progs/task_local_data.bpf.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_data.h b/tools/testing/selftests/bpf/prog_tests/task_local_data.h index 2de38776a2d4..0f86b9275cf9 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_data.h +++ b/tools/testing/selftests/bpf/prog_tests/task_local_data.h @@ -94,7 +94,7 @@ struct tld_metadata { }; struct tld_meta_u { - _Atomic __u8 cnt; + _Atomic __u16 cnt; __u16 size; struct tld_metadata metadata[]; }; @@ -217,7 +217,7 @@ static int __tld_init_data_p(int map_fd) static tld_key_t __tld_create_key(const char *name, size_t size, bool dyn_data) { int err, i, sz, off = 0; - __u8 cnt; + __u16 cnt; if (!TLD_READ_ONCE(tld_meta_p)) { err = __tld_init_meta_p(); diff --git a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c index 9fd6306b455c..9556ad3d986f 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c +++ b/tools/testing/selftests/bpf/prog_tests/test_task_local_data.c @@ -4,7 +4,7 @@ #include #define TLD_FREE_DATA_ON_THREAD_EXIT -#define TLD_DYN_DATA_SIZE 4096 +#define TLD_DYN_DATA_SIZE (getpagesize() - 8) #include "task_local_data.h" struct test_tld_struct { diff --git a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h index 432fff2af844..fed53d63a7e5 100644 --- a/tools/testing/selftests/bpf/progs/task_local_data.bpf.h +++ b/tools/testing/selftests/bpf/progs/task_local_data.bpf.h @@ -80,7 +80,7 @@ struct tld_metadata { }; struct tld_meta_u { - __u8 cnt; + __u16 cnt; __u16 size; struct tld_metadata metadata[TLD_MAX_DATA_CNT]; }; From c7900f225a102219f5fe2c1c93a7dec5467315ee Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Jan 2026 21:51:28 -0800 Subject: [PATCH 159/268] selftests/bpf: Fix xdp_pull_data failure with 64K page If the argument 'pull_len' of run_test() is 'PULL_MAX' or 'PULL_MAX | PULL_PLUS_ONE', the eventual pull_len size will close to the page size. On arm64 systems with 64K pages, the pull_len size will be close to 64K. But the existing buffer will be close to 9000 which is not enough to pull. For those failed run_tests(), make buff size to pg_sz + (pg_sz / 2) This way, there will be enough buffer space to pull regardless of page size. Tested-by: Alan Maguire Cc: Amery Hung Signed-off-by: Yonghong Song Acked-by: Amery Hung Link: https://lore.kernel.org/r/20260123055128.495265-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/xdp_pull_data.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c index efa350d04ec5..910dabe95afd 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_pull_data.c @@ -114,12 +114,14 @@ static void test_xdp_pull_data_basic(void) { u32 pg_sz, max_meta_len, max_data_len; struct test_xdp_pull_data *skel; + int buff_len; skel = test_xdp_pull_data__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_xdp_pull_data__open_and_load")) return; pg_sz = sysconf(_SC_PAGE_SIZE); + buff_len = pg_sz + pg_sz / 2; if (find_xdp_sizes(skel, pg_sz)) goto out; @@ -140,13 +142,13 @@ static void test_xdp_pull_data_basic(void) run_test(skel, XDP_PASS, pg_sz, 9000, 0, 1025, 1025); /* multi-buf pkt, empty linear data area, pull requires memmove */ - run_test(skel, XDP_PASS, pg_sz, 9000, 0, 0, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, 0, 0, PULL_MAX); /* multi-buf pkt, no headroom */ - run_test(skel, XDP_PASS, pg_sz, 9000, max_meta_len, 1024, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, max_meta_len, 1024, PULL_MAX); /* multi-buf pkt, no tailroom, pull requires memmove */ - run_test(skel, XDP_PASS, pg_sz, 9000, 0, max_data_len, PULL_MAX); + run_test(skel, XDP_PASS, pg_sz, buff_len, 0, max_data_len, PULL_MAX); /* Test cases with invalid pull length */ @@ -154,18 +156,18 @@ static void test_xdp_pull_data_basic(void) run_test(skel, XDP_DROP, pg_sz, 2048, 0, 2048, 2049); /* multi-buf pkt with no space left in linear data area */ - run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, max_data_len, + run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, max_data_len, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, empty linear data area */ - run_test(skel, XDP_DROP, pg_sz, 9000, 0, 0, PULL_MAX | PULL_PLUS_ONE); + run_test(skel, XDP_DROP, pg_sz, buff_len, 0, 0, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, no headroom */ - run_test(skel, XDP_DROP, pg_sz, 9000, max_meta_len, 1024, + run_test(skel, XDP_DROP, pg_sz, buff_len, max_meta_len, 1024, PULL_MAX | PULL_PLUS_ONE); /* multi-buf pkt, no tailroom */ - run_test(skel, XDP_DROP, pg_sz, 9000, 0, max_data_len, + run_test(skel, XDP_DROP, pg_sz, buff_len, 0, max_data_len, PULL_MAX | PULL_PLUS_ONE); out: From 2d419c44658f75e7655794341a95c0687830f3df Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:56 +0800 Subject: [PATCH 160/268] bpf: add fsession support The fsession is something that similar to kprobe session. It allow to attach a single BPF program to both the entry and the exit of the target functions. Introduce the struct bpf_fsession_link, which allows to add the link to both the fentry and fexit progs_hlist of the trampoline. Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 +++++++ include/uapi/linux/bpf.h | 1 + kernel/bpf/btf.c | 2 + kernel/bpf/syscall.c | 18 ++++++- kernel/bpf/trampoline.c | 53 ++++++++++++++++--- kernel/bpf/verifier.c | 12 +++-- net/bpf/test_run.c | 1 + net/core/bpf_sk_storage.c | 1 + tools/include/uapi/linux/bpf.h | 1 + .../bpf/prog_tests/tracing_failure.c | 2 +- 10 files changed, 97 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5936f8e2996f..41228b0add52 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1309,6 +1309,7 @@ enum bpf_tramp_prog_type { BPF_TRAMP_MODIFY_RETURN, BPF_TRAMP_MAX, BPF_TRAMP_REPLACE, /* more than MAX */ + BPF_TRAMP_FSESSION, }; struct bpf_tramp_image { @@ -1875,6 +1876,11 @@ struct bpf_tracing_link { struct bpf_prog *tgt_prog; }; +struct bpf_fsession_link { + struct bpf_tracing_link link; + struct bpf_tramp_link fexit; +}; + struct bpf_raw_tp_link { struct bpf_link link; struct bpf_raw_event_map *btp; @@ -2169,6 +2175,19 @@ static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_op #endif +static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->expected_attach_type == BPF_TRACE_FSESSION) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2a2ade4be60f..44e7dbc278e3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d10b3404260f..8959f3bc1e92 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6219,6 +6219,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: /* allow u64* as ctx */ if (btf_is_int(t) && t->size == 8) return 0; @@ -6820,6 +6821,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, fallthrough; case BPF_LSM_CGROUP: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: /* When LSM programs are attached to void LSM hooks * they use FEXIT trampolines and when attached to * int LSM hooks, they use MODIFY_RETURN trampolines. diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c5c03d43f5f..b9184545c3fd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3577,6 +3577,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && + prog->expected_attach_type != BPF_TRACE_FSESSION && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; @@ -3626,7 +3627,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - link = kzalloc(sizeof(*link), GFP_USER); + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + struct bpf_fsession_link *fslink; + + fslink = kzalloc(sizeof(*fslink), GFP_USER); + if (fslink) { + bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type); + fslink->fexit.cookie = bpf_cookie; + link = &fslink->link; + } else { + link = NULL; + } + } else { + link = kzalloc(sizeof(*link), GFP_USER); + } if (!link) { err = -ENOMEM; goto out_put_prog; @@ -4350,6 +4365,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 2a125d063e62..edf9da43762d 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -109,10 +109,17 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; - return (ptype == BPF_PROG_TYPE_TRACING && - (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN)) || - (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); + switch (ptype) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + return true; + return false; + case BPF_PROG_TYPE_LSM: + return eatype == BPF_LSM_MAC; + default: + return false; + } } void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym) @@ -559,6 +566,8 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; + case BPF_TRACE_FSESSION: + return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its @@ -594,8 +603,10 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { + struct bpf_fsession_link *fslink = NULL; enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; + struct hlist_head *prog_list; int err = 0; int cnt = 0, i; @@ -621,24 +632,43 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } + if (kind == BPF_TRAMP_FSESSION) { + prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; + cnt++; + } else { + prog_list = &tr->progs_hlist[kind]; + } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { + hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); - tr->progs_cnt[kind]++; + hlist_add_head(&link->tramp_hlist, prog_list); + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]++; + fslink = container_of(link, struct bpf_fsession_link, link.link); + hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + tr->progs_cnt[BPF_TRAMP_FEXIT]++; + } else { + tr->progs_cnt[kind]++; + } err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; + if (kind == BPF_TRAMP_FSESSION) { + tr->progs_cnt[BPF_TRAMP_FENTRY]--; + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + } else { + tr->progs_cnt[kind]--; + } } return err; } @@ -672,6 +702,13 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; + } else if (kind == BPF_TRAMP_FSESSION) { + struct bpf_fsession_link *fslink = + container_of(link, struct bpf_fsession_link, link.link); + + hlist_del_init(&fslink->fexit.tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c7f5234d5fd2..41bbed6418b5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17848,6 +17848,7 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -23774,6 +23775,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_TRACE_FSESSION || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); @@ -24725,7 +24727,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog->type == BPF_PROG_TYPE_TRACING && prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || - tgt_prog->expected_attach_type == BPF_TRACE_FEXIT)) { + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. * The fentry/fexit programs are used for performance @@ -24740,7 +24743,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, * beyond reasonable stack size. Hence extending fentry * is not allowed. */ - bpf_log(log, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit/fsession\n"); return -EINVAL; } } else { @@ -24824,6 +24827,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_LSM_CGROUP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); @@ -24990,6 +24994,7 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: + case BPF_TRACE_FSESSION: return true; default: return false; @@ -25071,9 +25076,10 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) tgt_info.tgt_name); return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_TRACE_FSESSION || prog->expected_attach_type == BPF_MODIFY_RETURN) && btf_id_set_contains(&noreturn_deny, btf_id)) { - verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n", + verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", tgt_info.tgt_name); return -EINVAL; } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 26cfcfdc45eb..178c4738e63b 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -685,6 +685,7 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 850dd736ccd1..de111818f3a0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -365,6 +365,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b816bc53d2e1..3ca7d76e05f0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1145,6 +1145,7 @@ enum bpf_attach_type { BPF_NETKIT_PEER, BPF_TRACE_KPROBE_SESSION, BPF_TRACE_UPROBE_SESSION, + BPF_TRACE_FSESSION, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c index 10e231965589..f9f9e1cb87bf 100644 --- a/tools/testing/selftests/bpf/prog_tests/tracing_failure.c +++ b/tools/testing/selftests/bpf/prog_tests/tracing_failure.c @@ -73,7 +73,7 @@ static void test_tracing_deny(void) static void test_fexit_noreturns(void) { test_tracing_fail_prog("fexit_noreturns", - "Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected."); + "Attaching fexit/fsession/fmod_ret to __noreturn function 'do_exit' is rejected."); } void test_tracing_failure(void) From f1b56b3cbdb2d2a51d8ea91008eddf4d1d9f277b Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:57 +0800 Subject: [PATCH 161/268] bpf: use the least significant byte for the nr_args in trampoline For now, ((u64 *)ctx)[-1] is used to store the nr_args in the trampoline. However, 1 byte is enough to store such information. Therefore, we use only the least significant byte of ((u64 *)ctx)[-1] to store the nr_args, and reserve the rest for other usages. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-3-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 39 +++++++++++++++++++++++---------------- kernel/trace/bpf_trace.c | 6 +++--- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 41bbed6418b5..2081343a848d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23747,19 +23747,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env) /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + cnt = 1; } else { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + cnt = 2; } - insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); - insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); - insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); - insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); - insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); - insn_buf[7] = BPF_JMP_A(1); - insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); - cnt = 9; + insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); + insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0); + insn_buf[cnt++] = BPF_JMP_A(1); + insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) @@ -23779,12 +23781,13 @@ static int do_misc_fixups(struct bpf_verifier_env *env) eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); - insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); - insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); - insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0); - cnt = 6; + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); + insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); + cnt = 7; } else { insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); cnt = 1; @@ -23808,15 +23811,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env) /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + cnt = 1; } else { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + cnt = 2; } - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; + delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; goto next_insn; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0e9635bcd783..d466a1503da3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1194,7 +1194,7 @@ const struct bpf_func_proto bpf_get_branch_snapshot_proto = { BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) { /* This helper call is inlined by verifier. */ - u64 nr_args = ((u64 *)ctx)[-1]; + u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; if ((u64) n >= nr_args) return -EINVAL; @@ -1214,7 +1214,7 @@ static const struct bpf_func_proto bpf_get_func_arg_proto = { BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) { /* This helper call is inlined by verifier. */ - u64 nr_args = ((u64 *)ctx)[-1]; + u64 nr_args = ((u64 *)ctx)[-1] & 0xFF; *value = ((u64 *)ctx)[nr_args]; return 0; @@ -1231,7 +1231,7 @@ static const struct bpf_func_proto bpf_get_func_ret_proto = { BPF_CALL_1(get_func_arg_cnt, void *, ctx) { /* This helper call is inlined by verifier. */ - return ((u64 *)ctx)[-1]; + return ((u64 *)ctx)[-1] & 0xFF; } static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { From 8fe4dc4f6456b3d2c9e6f8aeb1f978b7bff0f6c8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:58 +0800 Subject: [PATCH 162/268] bpf: change prototype of bpf_session_{cookie,is_return} Add the function argument of "void *ctx" to bpf_session_cookie() and bpf_session_is_return(), which is a preparation of the next patch. The two kfunc is seldom used now, so it will not introduce much effect to change their function prototype. Signed-off-by: Menglong Dong Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20260124062008.8657-4-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- kernel/trace/bpf_trace.c | 4 ++-- tools/testing/selftests/bpf/bpf_kfuncs.h | 3 --- .../bpf/progs/kprobe_multi_session_cookie.c | 15 +++++++-------- .../selftests/bpf/progs/uprobe_multi_session.c | 7 +++---- .../bpf/progs/uprobe_multi_session_cookie.c | 15 +++++++-------- .../bpf/progs/uprobe_multi_session_recursive.c | 11 +++++------ 7 files changed, 29 insertions(+), 32 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2081343a848d..0fa73d56cb8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12484,6 +12484,7 @@ enum special_kfunc_type { KF_bpf_arena_alloc_pages, KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, + KF_bpf_session_is_return, }; BTF_ID_LIST(special_kfunc_list) @@ -12561,6 +12562,7 @@ BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) +BTF_ID(func, bpf_session_is_return) static bool is_task_work_add_kfunc(u32 func_id) { @@ -12615,7 +12617,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; - if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + meta->func_id == special_kfunc_list[KF_bpf_session_is_return] || + meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; if (argno + 1 < nargs && diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d466a1503da3..13f0a2de33b7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3323,7 +3323,7 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) __bpf_kfunc_start_defs(); -__bpf_kfunc bool bpf_session_is_return(void) +__bpf_kfunc bool bpf_session_is_return(void *ctx) { struct bpf_session_run_ctx *session_ctx; @@ -3331,7 +3331,7 @@ __bpf_kfunc bool bpf_session_is_return(void) return session_ctx->is_return; } -__bpf_kfunc __u64 *bpf_session_cookie(void) +__bpf_kfunc __u64 *bpf_session_cookie(void *ctx) { struct bpf_session_run_ctx *session_ctx; diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index e0189254bb6e..7dad01439391 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -79,9 +79,6 @@ extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr, struct bpf_dynptr *sig_ptr, struct bpf_key *trusted_keyring) __ksym; -extern bool bpf_session_is_return(void) __ksym __weak; -extern __u64 *bpf_session_cookie(void) __ksym __weak; - struct dentry; /* Description * Returns xattr of a dentry diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c index 0835b5edf685..ad627016e3e5 100644 --- a/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/kprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -23,16 +22,16 @@ int BPF_PROG(trigger) return 0; } -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -42,17 +41,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_kprobe_1_result); + return check_cookie(ctx, 1, &test_kprobe_1_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_kprobe_2_result); + return check_cookie(ctx, 2, &test_kprobe_2_result); } SEC("kprobe.session/bpf_fentry_test1") int test_kprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_kprobe_3_result); + return check_cookie(ctx, 3, &test_kprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c index 30bff90b68dc..6e46bb00ff58 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -51,7 +50,7 @@ static int uprobe_multi_check(void *ctx, bool is_return) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_*") int uprobe(struct pt_regs *ctx) { - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } static __always_inline bool verify_sleepable_user_copy(void) @@ -67,5 +66,5 @@ int uprobe_sleepable(struct pt_regs *ctx) { if (verify_sleepable_user_copy()) uprobe_multi_sleep_result++; - return uprobe_multi_check(ctx, bpf_session_is_return()); + return uprobe_multi_check(ctx, bpf_session_is_return(ctx)); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c index 5befdf944dc6..b5db196614a9 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_cookie.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" char _license[] SEC("license") = "GPL"; @@ -13,16 +12,16 @@ __u64 test_uprobe_1_result = 0; __u64 test_uprobe_2_result = 0; __u64 test_uprobe_3_result = 0; -static int check_cookie(__u64 val, __u64 *result) +static int check_cookie(struct pt_regs *ctx, __u64 val, __u64 *result) { __u64 *cookie; if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - cookie = bpf_session_cookie(); + cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) + if (bpf_session_is_return(ctx)) *result = *cookie == val ? val : 0; else *cookie = val; @@ -32,17 +31,17 @@ static int check_cookie(__u64 val, __u64 *result) SEC("uprobe.session//proc/self/exe:uprobe_multi_func_1") int uprobe_1(struct pt_regs *ctx) { - return check_cookie(1, &test_uprobe_1_result); + return check_cookie(ctx, 1, &test_uprobe_1_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_2") int uprobe_2(struct pt_regs *ctx) { - return check_cookie(2, &test_uprobe_2_result); + return check_cookie(ctx, 2, &test_uprobe_2_result); } SEC("uprobe.session//proc/self/exe:uprobe_multi_func_3") int uprobe_3(struct pt_regs *ctx) { - return check_cookie(3, &test_uprobe_3_result); + return check_cookie(ctx, 3, &test_uprobe_3_result); } diff --git a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c index 8fbcd69fae22..3ce309248a04 100644 --- a/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c +++ b/tools/testing/selftests/bpf/progs/uprobe_multi_session_recursive.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -#include "bpf_kfuncs.h" #include "bpf_misc.h" char _license[] SEC("license") = "GPL"; @@ -16,11 +15,11 @@ int idx_return = 0; __u64 test_uprobe_cookie_entry[6]; __u64 test_uprobe_cookie_return[3]; -static int check_cookie(void) +static int check_cookie(struct pt_regs *ctx) { - __u64 *cookie = bpf_session_cookie(); + __u64 *cookie = bpf_session_cookie(ctx); - if (bpf_session_is_return()) { + if (bpf_session_is_return(ctx)) { if (idx_return >= ARRAY_SIZE(test_uprobe_cookie_return)) return 1; test_uprobe_cookie_return[idx_return++] = *cookie; @@ -40,5 +39,5 @@ int uprobe_recursive(struct pt_regs *ctx) if (bpf_get_current_pid_tgid() >> 32 != pid) return 1; - return check_cookie(); + return check_cookie(ctx); } From 27d89baa6da8e5e546585c53a959176d1302d46e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:19:59 +0800 Subject: [PATCH 163/268] bpf: support fsession for bpf_session_is_return If fsession exists, we will use the bit (1 << BPF_TRAMP_IS_RETURN_SHIFT) in ((u64 *)ctx)[-1] to store the "is_return" flag. The logic of bpf_session_is_return() for fsession is implemented in the verifier by inline following code: bool bpf_session_is_return(void *ctx) { return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; } Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-5-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/verifier.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 39 ++++++++++++++++++++++++++------------- 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 41228b0add52..29eecd79352e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1229,6 +1229,8 @@ enum { #endif }; +#define BPF_TRAMP_IS_RETURN_SHIFT 63 + struct bpf_tramp_links { struct bpf_tramp_link *links[BPF_MAX_TRAMP_LINKS]; int nr_links; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0fa73d56cb8b..d04aea235a12 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23011,6 +23011,19 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline the bpf_session_is_return() for fsession: + * bool bpf_session_is_return(void *ctx) + * { + * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); + *cnt = 3; } if (env->insn_aux_data[insn_idx].arg_prog) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 13f0a2de33b7..f7baeb8278ca 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1286,7 +1286,8 @@ static bool is_kprobe_multi(const struct bpf_prog *prog) static inline bool is_kprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION; } static inline bool is_uprobe_multi(const struct bpf_prog *prog) @@ -1297,7 +1298,14 @@ static inline bool is_uprobe_multi(const struct bpf_prog *prog) static inline bool is_uprobe_session(const struct bpf_prog *prog) { - return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; + return prog->type == BPF_PROG_TYPE_KPROBE && + prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION; +} + +static inline bool is_trace_fsession(const struct bpf_prog *prog) +{ + return prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_FSESSION; } static const struct bpf_func_proto * @@ -3341,34 +3349,39 @@ __bpf_kfunc __u64 *bpf_session_cookie(void *ctx) __bpf_kfunc_end_defs(); -BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids) +BTF_KFUNCS_START(session_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_session_is_return) BTF_ID_FLAGS(func, bpf_session_cookie) -BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids) +BTF_KFUNCS_END(session_kfunc_set_ids) -static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id) +static int bpf_session_filter(const struct bpf_prog *prog, u32 kfunc_id) { - if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id)) + if (!btf_id_set8_contains(&session_kfunc_set_ids, kfunc_id)) return 0; - if (!is_kprobe_session(prog) && !is_uprobe_session(prog)) + if (!is_kprobe_session(prog) && !is_uprobe_session(prog) && !is_trace_fsession(prog)) return -EACCES; return 0; } -static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = { +static const struct btf_kfunc_id_set bpf_session_kfunc_set = { .owner = THIS_MODULE, - .set = &kprobe_multi_kfunc_set_ids, - .filter = bpf_kprobe_multi_filter, + .set = &session_kfunc_set_ids, + .filter = bpf_session_filter, }; -static int __init bpf_kprobe_multi_kfuncs_init(void) +static int __init bpf_trace_kfuncs_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set); + int err = 0; + + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_session_kfunc_set); + err = err ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_session_kfunc_set); + + return err; } -late_initcall(bpf_kprobe_multi_kfuncs_init); +late_initcall(bpf_trace_kfuncs_init); typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk); From eeee4239dbb155a98f8ba737324ac081acde8417 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:00 +0800 Subject: [PATCH 164/268] bpf: support fsession for bpf_session_cookie Implement session cookie for fsession. The session cookies will be stored in the stack, and the layout of the stack will look like this: return value -> 8 bytes argN -> 8 bytes ... arg1 -> 8 bytes nr_args -> 8 bytes ip (optional) -> 8 bytes cookie2 -> 8 bytes cookie1 -> 8 bytes The offset of the cookie for the current bpf program, which is in 8-byte units, is stored in the "(((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF". Therefore, we can get the session cookie with ((u64 *)ctx)[-offset]. Implement and inline the bpf_session_cookie() for the fsession in the verifier. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-6-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 +++++++++++++++ kernel/bpf/verifier.c | 20 ++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 29eecd79352e..4427c6e98331 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1229,6 +1229,7 @@ enum { #endif }; +#define BPF_TRAMP_COOKIE_INDEX_SHIFT 8 #define BPF_TRAMP_IS_RETURN_SHIFT 63 struct bpf_tramp_links { @@ -1782,6 +1783,7 @@ struct bpf_prog { enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ + call_session_cookie:1, /* Do we call bpf_session_cookie() */ tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */ sleepable:1; /* BPF program is sleepable */ enum bpf_prog_type type; /* Type of BPF program */ @@ -2190,6 +2192,19 @@ static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) return cnt; } +static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) +{ + struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; + int cnt = 0; + + for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { + if (fentries.links[i]->link.prog->call_session_cookie) + cnt++; + } + + return cnt; +} + int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d04aea235a12..c2f2650db9fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14406,6 +14406,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } + if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) + env->prog->call_session_cookie = true; + return 0; } @@ -23024,6 +23027,23 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline bpf_session_cookie() for fsession: + * __u64 *bpf_session_cookie(void *ctx) + * { + * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF; + * return &((u64 *)ctx)[-off]; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1); + insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0); + *cnt = 6; } if (env->insn_aux_data[insn_idx].arg_prog) { From 37c7ba1b39c493229d1b3c1f6ab466a23362e21e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:01 +0800 Subject: [PATCH 165/268] bpf,x86: introduce emit_store_stack_imm64() for trampoline Introduce the helper emit_store_stack_imm64(), which is used to store a imm64 to the stack with the help of a register. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-7-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e3b1c4b1d550..2f31331955b5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1300,6 +1300,16 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); } +static void emit_store_stack_imm64(u8 **pprog, int reg, int stack_off, u64 imm64) +{ + /* + * mov reg, imm64 + * mov QWORD PTR [rbp + stack_off], reg + */ + emit_mov_imm64(pprog, reg, imm64 >> 32, (u32) imm64); + emit_stx(pprog, BPF_DW, BPF_REG_FP, reg, stack_off); +} + static int emit_atomic_rmw(u8 **pprog, u32 atomic_op, u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { @@ -3348,20 +3358,12 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* mov QWORD PTR [rbp - rbx_off], rbx */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off); - /* Store number of argument registers of the traced function: - * mov rax, nr_regs - * mov QWORD PTR [rbp - nregs_off], rax - */ - emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs); - emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off); + /* Store number of argument registers of the traced function */ + emit_store_stack_imm64(&prog, BPF_REG_0, -nregs_off, nr_regs); if (flags & BPF_TRAMP_F_IP_ARG) { - /* Store IP address of the traced function: - * movabsq rax, func_addr - * mov QWORD PTR [rbp - ip_off], rax - */ - emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr); - emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); + /* Store IP address of the traced function */ + emit_store_stack_imm64(&prog, BPF_REG_0, -ip_off, (long)func_addr); } save_args(m, &prog, regs_off, false, flags); From 98770bd4e6df50db81cbdf5876f8836baefa3005 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:02 +0800 Subject: [PATCH 166/268] bpf,x86: add fsession support for x86_64 Add BPF_TRACE_FSESSION supporting to x86_64, including: 1. clear the return value in the stack before fentry to make the fentry of the fsession can only get 0 with bpf_get_func_ret(). 2. clear all the session cookies' value in the stack. 2. store the index of the cookie to ctx[-1] before the calling to fsession 3. store the "is_return" flag to ctx[-1] before the calling to fexit of the fsession. Signed-off-by: Menglong Dong Co-developed-by: Leon Hwang Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260124062008.8657-8-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 52 ++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2f31331955b5..5a075e06cf45 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3094,13 +3094,19 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_links *tl, int stack_size, - int run_ctx_off, bool save_ret, - void *image, void *rw_image) + int run_ctx_off, int func_meta_off, bool save_ret, + void *image, void *rw_image, u64 func_meta, + int cookie_off) { - int i; + int i, cur_cookie = (cookie_off - stack_size) / 8; u8 *prog = *pprog; for (i = 0; i < tl->nr_links; i++) { + if (tl->links[i]->link.prog->call_session_cookie) { + emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, + func_meta | (cur_cookie << BPF_TRAMP_COOKIE_INDEX_SHIFT)); + cur_cookie--; + } if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, save_ret, image, rw_image)) return -EINVAL; @@ -3218,12 +3224,14 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im void *func_addr) { int i, ret, nr_regs = m->nr_args, stack_size = 0; - int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off; + int regs_off, func_meta_off, ip_off, run_ctx_off, arg_stack_off, rbx_off; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; void *orig_call = func_addr; + int cookie_off, cookie_cnt; u8 **branches = NULL; + u64 func_meta; u8 *prog; bool save_ret; @@ -3259,7 +3267,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im * [ ... ] * RBP - regs_off [ reg_arg1 ] program's ctx pointer * - * RBP - nregs_off [ regs count ] always + * RBP - func_meta_off [ regs count, etc ] always * * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * @@ -3282,15 +3290,20 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im stack_size += nr_regs * 8; regs_off = stack_size; - /* regs count */ + /* function matedata, such as regs count */ stack_size += 8; - nregs_off = stack_size; + func_meta_off = stack_size; if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ ip_off = stack_size; + cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + /* room for session cookies */ + stack_size += cookie_cnt * 8; + cookie_off = stack_size; + stack_size += 8; rbx_off = stack_size; @@ -3358,8 +3371,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* mov QWORD PTR [rbp - rbx_off], rbx */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off); + func_meta = nr_regs; /* Store number of argument registers of the traced function */ - emit_store_stack_imm64(&prog, BPF_REG_0, -nregs_off, nr_regs); + emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta); if (flags & BPF_TRAMP_F_IP_ARG) { /* Store IP address of the traced function */ @@ -3378,9 +3392,18 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } } + if (bpf_fsession_cnt(tlinks)) { + /* clear all the session cookies' value */ + for (int i = 0; i < cookie_cnt; i++) + emit_store_stack_imm64(&prog, BPF_REG_0, -cookie_off + 8 * i, 0); + /* clear the return value to make sure fentry always get 0 */ + emit_store_stack_imm64(&prog, BPF_REG_0, -8, 0); + } + if (fentry->nr_links) { - if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, - flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image)) + if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, func_meta_off, + flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image, + func_meta, cookie_off)) return -EINVAL; } @@ -3440,9 +3463,14 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im } } + /* set the "is_return" flag for fsession */ + func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); + if (bpf_fsession_cnt(tlinks)) + emit_store_stack_imm64(&prog, BPF_REG_0, -func_meta_off, func_meta); + if (fexit->nr_links) { - if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, - false, image, rw_image)) { + if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, func_meta_off, + false, image, rw_image, func_meta, cookie_off)) { ret = -EINVAL; goto cleanup; } From 257c43688b143fd9805cdfef9d2623dde92989e6 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:03 +0800 Subject: [PATCH 167/268] libbpf: add fsession support Add BPF_TRACE_FSESSION to libbpf. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-9-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 1 + tools/lib/bpf/libbpf.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 21b57a629916..5846de364209 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -794,6 +794,7 @@ int bpf_link_create(int prog_fd, int target_fd, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: + case BPF_TRACE_FSESSION: case BPF_LSM_MAC: attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0); if (!OPTS_ZEROED(opts, tracing)) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index bbcfd72b07d5..0c8bf0b5cce4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -115,6 +115,7 @@ static const char * const attach_type_name[] = { [BPF_TRACE_FENTRY] = "trace_fentry", [BPF_TRACE_FEXIT] = "trace_fexit", [BPF_MODIFY_RETURN] = "modify_return", + [BPF_TRACE_FSESSION] = "trace_fsession", [BPF_LSM_MAC] = "lsm_mac", [BPF_LSM_CGROUP] = "lsm_cgroup", [BPF_SK_LOOKUP] = "sk_lookup", @@ -9859,6 +9860,8 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fsession+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fsession.s+", TRACING, BPF_TRACE_FSESSION, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm), From 85fc4be6d811372f8f9a2a131a092735418fdbf2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:04 +0800 Subject: [PATCH 168/268] bpftool: add fsession support Add BPF_TRACE_FSESSION to bpftool. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-10-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index e8daf963ecef..8bfcff9e2f63 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -1191,6 +1191,7 @@ const char *bpf_attach_type_input_str(enum bpf_attach_type t) case BPF_TRACE_FENTRY: return "fentry"; case BPF_TRACE_FEXIT: return "fexit"; case BPF_MODIFY_RETURN: return "mod_ret"; + case BPF_TRACE_FSESSION: return "fsession"; case BPF_SK_REUSEPORT_SELECT: return "sk_skb_reuseport_select"; case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return "sk_skb_reuseport_select_or_migrate"; default: return libbpf_bpf_attach_type_str(t); From f7afef5617b685c3491db3593ca09abc33815774 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:05 +0800 Subject: [PATCH 169/268] selftests/bpf: add testcases for fsession Add testcases for BPF_TRACE_FSESSION. The function arguments and return value are tested both in the entry and exit. And the kfunc bpf_session_is_ret() is also tested. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-11-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/fsession_test.c | 90 +++++++++++++++++ .../selftests/bpf/progs/fsession_test.c | 97 +++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/fsession_test.c create mode 100644 tools/testing/selftests/bpf/progs/fsession_test.c diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c new file mode 100644 index 000000000000..75bb42942b67 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 ChinaTelecom */ +#include +#include "fsession_test.skel.h" + +static int check_result(struct fsession_test *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + /* Trigger test function calls */ + prog_fd = bpf_program__fd(skel->progs.test1); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return err; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return topts.retval; + + for (int i = 0; i < sizeof(*skel->bss) / sizeof(__u64); i++) { + if (!ASSERT_EQ(((__u64 *)skel->bss)[i], 1, "test_result")) + return -EINVAL; + } + + return 0; +} + +static void test_fsession_basic(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + goto cleanup; + + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_attach")) + goto cleanup; + + check_result(skel); +cleanup: + fsession_test__destroy(skel); +} + +static void test_fsession_reattach(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + goto cleanup; + + /* first attach */ + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_first_attach")) + goto cleanup; + + if (check_result(skel)) + goto cleanup; + + /* detach */ + fsession_test__detach(skel); + + /* reset counters */ + memset(skel->bss, 0, sizeof(*skel->bss)); + + /* second attach */ + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_second_attach")) + goto cleanup; + + if (check_result(skel)) + goto cleanup; + +cleanup: + fsession_test__destroy(skel); +} + +void test_fsession_test(void) +{ +#if !defined(__x86_64__) + test__skip(); + return; +#endif + if (test__start_subtest("fsession_test")) + test_fsession_basic(); + if (test__start_subtest("fsession_reattach")) + test_fsession_reattach(); +} diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c new file mode 100644 index 000000000000..0e1b66b2dddc --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 ChinaTelecom */ +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u64 test1_entry_result = 0; +__u64 test1_exit_result = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test1, int a, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test1_entry_result = a == 1 && ret == 0; + return 0; + } + + test1_exit_result = a == 1 && ret == 2; + return 0; +} + +__u64 test2_entry_result = 0; +__u64 test2_exit_result = 0; + +SEC("fsession/bpf_fentry_test3") +int BPF_PROG(test2, char a, int b, __u64 c, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test2_entry_result = a == 4 && b == 5 && c == 6 && ret == 0; + return 0; + } + + test2_exit_result = a == 4 && b == 5 && c == 6 && ret == 15; + return 0; +} + +__u64 test3_entry_result = 0; +__u64 test3_exit_result = 0; + +SEC("fsession/bpf_fentry_test4") +int BPF_PROG(test3, void *a, char b, int c, __u64 d, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test3_entry_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 0; + return 0; + } + + test3_exit_result = a == (void *)7 && b == 8 && c == 9 && d == 10 && ret == 34; + return 0; +} + +__u64 test4_entry_result = 0; +__u64 test4_exit_result = 0; + +SEC("fsession/bpf_fentry_test5") +int BPF_PROG(test4, __u64 a, void *b, short c, int d, __u64 e, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + test4_entry_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15 && ret == 0; + return 0; + } + + test4_exit_result = a == 11 && b == (void *)12 && c == 13 && d == 14 && + e == 15 && ret == 65; + return 0; +} + +__u64 test5_entry_result = 0; +__u64 test5_exit_result = 0; + +SEC("fsession/bpf_fentry_test7") +int BPF_PROG(test5, struct bpf_fentry_test_t *arg, int ret) +{ + bool is_exit = bpf_session_is_return(ctx); + + if (!is_exit) { + if (!arg) + test5_entry_result = ret == 0; + return 0; + } + + if (!arg) + test5_exit_result = 1; + return 0; +} + From a5533a6eaa5b602fe54e53d85f787e09eab4e771 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:06 +0800 Subject: [PATCH 170/268] selftests/bpf: test bpf_get_func_* for fsession Test following bpf helper for fsession: bpf_get_func_arg() bpf_get_func_arg_cnt() bpf_get_func_ret() bpf_get_func_ip() Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-12-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/get_func_args_test.c | 1 + .../bpf/prog_tests/get_func_ip_test.c | 2 + .../selftests/bpf/progs/get_func_args_test.c | 40 ++++++++++++++++++- .../selftests/bpf/progs/get_func_ip_test.c | 23 +++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c index fadee95d3ae8..96b27de05524 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_args_test.c @@ -41,6 +41,7 @@ void test_get_func_args_test(void) ASSERT_EQ(skel->bss->test4_result, 1, "test4_result"); ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); ASSERT_EQ(skel->bss->test6_result, 1, "test6_result"); + ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); cleanup: get_func_args_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c index c40242dfa8fb..7772a0f288d3 100644 --- a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c @@ -46,6 +46,8 @@ static void test_function_entry(void) ASSERT_EQ(skel->bss->test5_result, 1, "test5_result"); ASSERT_EQ(skel->bss->test7_result, 1, "test7_result"); ASSERT_EQ(skel->bss->test8_result, 1, "test8_result"); + ASSERT_EQ(skel->bss->test9_entry_result, 1, "test9_entry_result"); + ASSERT_EQ(skel->bss->test9_exit_result, 1, "test9_exit_result"); cleanup: get_func_ip_test__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index 5b7233afef05..0a3236a7a109 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include @@ -165,3 +165,41 @@ int BPF_PROG(tp_test2) return 0; } + +__u64 test7_result = 0; +#ifdef __TARGET_ARCH_x86 +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test7) +{ + __u64 cnt = bpf_get_func_arg_cnt(ctx); + __u64 a = 0, z = 0, ret = 0; + __s64 err; + + test7_result = cnt == 1; + + /* valid arguments */ + err = bpf_get_func_arg(ctx, 0, &a); + test7_result &= err == 0 && ((int) a == 1); + + /* not valid argument */ + err = bpf_get_func_arg(ctx, 1, &z); + test7_result &= err == -EINVAL; + + if (bpf_session_is_return(ctx)) { + err = bpf_get_func_ret(ctx, &ret); + test7_result &= err == 0 && ret == 2; + } else { + err = bpf_get_func_ret(ctx, &ret); + test7_result &= err == 0 && ret == 0; + } + + return 0; +} +#else +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test7) +{ + test7_result = 1; + return 0; +} +#endif diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index 2011cacdeb18..65f7e1f182bf 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -103,3 +103,26 @@ int BPF_URETPROBE(test8, int ret) test8_result = (const void *) addr == (const void *) uprobe_trigger; return 0; } + +__u64 test9_entry_result = 0; +__u64 test9_exit_result = 0; +#ifdef __TARGET_ARCH_x86 +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test9, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + if (bpf_session_is_return(ctx)) + test9_exit_result = (const void *) addr == &bpf_fentry_test1; + else + test9_entry_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} +#else +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test9, int a) +{ + test9_entry_result = test9_exit_result = 1; + return 0; +} +#endif From 8909b3fb23e245f8ade903dfcfcc43522cf28a56 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:07 +0800 Subject: [PATCH 171/268] selftests/bpf: add testcases for fsession cookie Test session cookie for fsession. Multiple fsession BPF progs is attached to bpf_fentry_test1() and session cookie is read and write in the testcase. bpf_get_func_ip() will influence the layout of the session cookies, so we test the cookie in two case: with and without bpf_get_func_ip(). Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-13-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/fsession_test.c | 34 ++++++++++ .../selftests/bpf/progs/fsession_test.c | 66 +++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c index 75bb42942b67..0c4b428e1cee 100644 --- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -77,6 +77,38 @@ static void test_fsession_reattach(void) fsession_test__destroy(skel); } +static void test_fsession_cookie(void) +{ + struct fsession_test *skel = NULL; + int err; + + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + goto cleanup; + + /* + * The test_fsession_basic() will test the session cookie with + * bpf_get_func_ip() case, so we need only check + * the cookie without bpf_get_func_ip() case here + */ + bpf_program__set_autoload(skel->progs.test6, false); + + err = fsession_test__load(skel); + if (!ASSERT_OK(err, "fsession_test__load")) + goto cleanup; + + err = fsession_test__attach(skel); + if (!ASSERT_OK(err, "fsession_attach")) + goto cleanup; + + skel->bss->test6_entry_result = 1; + skel->bss->test6_exit_result = 1; + + check_result(skel); +cleanup: + fsession_test__destroy(skel); +} + void test_fsession_test(void) { #if !defined(__x86_64__) @@ -87,4 +119,6 @@ void test_fsession_test(void) test_fsession_basic(); if (test__start_subtest("fsession_reattach")) test_fsession_reattach(); + if (test__start_subtest("fsession_cookie")) + test_fsession_cookie(); } diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c index 0e1b66b2dddc..211332bdcccb 100644 --- a/tools/testing/selftests/bpf/progs/fsession_test.c +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -95,3 +95,69 @@ int BPF_PROG(test5, struct bpf_fentry_test_t *arg, int ret) return 0; } +__u64 test6_entry_result = 0; +__u64 test6_exit_result = 0; +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test6, int a) +{ + __u64 addr = bpf_get_func_ip(ctx); + + if (bpf_session_is_return(ctx)) + test6_exit_result = (const void *) addr == &bpf_fentry_test1; + else + test6_entry_result = (const void *) addr == &bpf_fentry_test1; + return 0; +} + +__u64 test7_entry_ok = 0; +__u64 test7_exit_ok = 0; +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test7, int a) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + *cookie = 0xAAAABBBBCCCCDDDDull; + test7_entry_ok = *cookie == 0xAAAABBBBCCCCDDDDull; + return 0; + } + + test7_exit_ok = *cookie == 0xAAAABBBBCCCCDDDDull; + return 0; +} + +__u64 test8_entry_ok = 0; +__u64 test8_exit_ok = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test8, int a) +{ + volatile __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + *cookie = 0x1111222233334444ull; + test8_entry_ok = *cookie == 0x1111222233334444ull; + return 0; + } + + test8_exit_ok = *cookie == 0x1111222233334444ull; + return 0; +} + +__u64 test9_entry_result = 0; +__u64 test9_exit_result = 0; + +SEC("fsession/bpf_fentry_test1") +int BPF_PROG(test9, int a, int ret) +{ + __u64 *cookie = bpf_session_cookie(ctx); + + if (!bpf_session_is_return(ctx)) { + test9_entry_result = a == 1 && ret == 0; + *cookie = 0x123456ULL; + return 0; + } + + test9_exit_result = a == 1 && ret == 2 && *cookie == 0x123456ULL; + return 0; +} From cb4bfacfb0110aa1b10ab60c64a3df0e176998c5 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 24 Jan 2026 14:20:08 +0800 Subject: [PATCH 172/268] selftests/bpf: test fsession mixed with fentry and fexit Test the fsession when it is used together with fentry, fexit. Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20260124062008.8657-14-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/progs/fsession_test.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/fsession_test.c b/tools/testing/selftests/bpf/progs/fsession_test.c index 211332bdcccb..86e8a2fe467e 100644 --- a/tools/testing/selftests/bpf/progs/fsession_test.c +++ b/tools/testing/selftests/bpf/progs/fsession_test.c @@ -161,3 +161,19 @@ int BPF_PROG(test9, int a, int ret) test9_exit_result = a == 1 && ret == 2 && *cookie == 0x123456ULL; return 0; } + +__u64 test10_result = 0; +SEC("fexit/bpf_fentry_test1") +int BPF_PROG(test10, int a, int ret) +{ + test10_result = a == 1 && ret == 2; + return 0; +} + +__u64 test11_result = 0; +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(test11, int a) +{ + test11_result = a == 1; + return 0; +} From c31df36bd26a5ed8898bb3fcc8c37ea9157ba784 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sun, 25 Jan 2026 20:54:12 +0900 Subject: [PATCH 173/268] selftests/bpf: Introduce execution context detection helpers Introduce bpf_in_nmi(), bpf_in_hardirq(), bpf_in_serving_softirq(), and bpf_in_task() inline helpers in bpf_experimental.h. These allow BPF programs to query the current execution context with higher granularity than the existing bpf_in_interrupt() helper. While BPF programs can often infer their context from attachment points, subsystems like sched_ext may call the same BPF logic from multiple contexts (e.g., task-to-task wake-ups vs. interrupt-to-task wake-ups). These helpers provide a reliable way for logic to branch based on the current CPU execution state. Implementing these as BPF-native inline helpers wrapping get_preempt_count() allows the compiler and JIT to inline the logic. The implementation accounts for differences in preempt_count layout between standard and PREEMPT_RT kernels. Reviewed-by: Alexei Starovoitov Signed-off-by: Changwoo Min Link: https://lore.kernel.org/r/20260125115413.117502-2-changwoo@igalia.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/bpf_experimental.h | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 68a49b1f77ae..a39576c8ba04 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -610,6 +610,8 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) + extern bool CONFIG_PREEMPT_RT __kconfig __weak; #ifdef bpf_target_x86 extern const int __preempt_count __ksym; @@ -648,4 +650,60 @@ static inline int bpf_in_interrupt(void) (tsk->softirq_disable_cnt & SOFTIRQ_MASK); } +/* Description + * Report whether it is in NMI context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_nmi(void) +{ + return get_preempt_count() & NMI_MASK; +} + +/* Description + * Report whether it is in hard IRQ context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_hardirq(void) +{ + return get_preempt_count() & HARDIRQ_MASK; +} + +/* Description + * Report whether it is in softirq context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_serving_softirq(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return (pcnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET; + + tsk = (void *) bpf_get_current_task_btf(); + return (tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET; +} + +/* Description + * Report whether it is in task context. Only works on the following archs: + * * x86 + * * arm64 + */ +static inline int bpf_in_task(void) +{ + struct task_struct___preempt_rt *tsk; + int pcnt; + + pcnt = get_preempt_count(); + if (!CONFIG_PREEMPT_RT) + return !(pcnt & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + tsk = (void *) bpf_get_current_task_btf(); + return !((pcnt & (NMI_MASK | HARDIRQ_MASK)) | + ((tsk->softirq_disable_cnt & SOFTIRQ_MASK) & SOFTIRQ_OFFSET)); +} #endif From 221b5e76c1c6e8ad4fa7c95a689e44ff45daab1c Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sun, 25 Jan 2026 20:54:13 +0900 Subject: [PATCH 174/268] selftests/bpf: Add tests for execution context helpers Add a new selftest suite `exe_ctx` to verify the accuracy of the bpf_in_task(), bpf_in_hardirq(), and bpf_in_serving_softirq() helpers introduced in bpf_experimental.h. Testing these execution contexts deterministically requires crossing context boundaries within a single CPU. To achieve this, the test implements a "Trigger-Observer" pattern using bpf_testmod: 1. Trigger: A BPF syscall program calls a new bpf_testmod kfunc bpf_kfunc_trigger_ctx_check(). 2. Task to HardIRQ: The kfunc uses irq_work_queue() to trigger a self-IPI on the local CPU. 3. HardIRQ to SoftIRQ: The irq_work handler calls a dummy function (observed by BPF fentry) and then schedules a tasklet to transition into SoftIRQ context. The user-space runner ensures determinism by pinning itself to CPU 0 before execution, forcing the entire interrupt chain to remain on a single core. Dummy noinline functions with compiler barriers are added to bpf_testmod.c to serve as stable attachment points for fentry programs. A retry loop is used in user-space to wait for the asynchronous SoftIRQ to complete. Note that testing on s390x is avoided because supporting those helpers purely in BPF on s390x is not possible at this point. Reviewed-by: Alexei Starovoitov Signed-off-by: Changwoo Min Link: https://lore.kernel.org/r/20260125115413.117502-3-changwoo@igalia.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/DENYLIST.s390x | 1 + .../selftests/bpf/prog_tests/exe_ctx.c | 59 +++++++++++++++++++ tools/testing/selftests/bpf/progs/test_ctx.c | 48 +++++++++++++++ .../selftests/bpf/test_kmods/bpf_testmod.c | 32 ++++++++++ .../bpf/test_kmods/bpf_testmod_kfunc.h | 4 ++ 5 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/exe_ctx.c create mode 100644 tools/testing/selftests/bpf/progs/test_ctx.c diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x index a17baf8c6fd7..f7e1e5f5511c 100644 --- a/tools/testing/selftests/bpf/DENYLIST.s390x +++ b/tools/testing/selftests/bpf/DENYLIST.s390x @@ -1,4 +1,5 @@ # TEMPORARY # Alphabetical order +exe_ctx # execution context check (e.g., hardirq, softirq, etc) get_stack_raw_tp # user_stack corrupted user stack (no backchain userspace) stacktrace_build_id # compare_map_keys stackid_hmap vs. stackmap err -2 errno 2 (?) diff --git a/tools/testing/selftests/bpf/prog_tests/exe_ctx.c b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c new file mode 100644 index 000000000000..aed6a6ef0876 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/exe_ctx.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Valve Corporation. + * Author: Changwoo Min + */ + +#include +#include +#include "test_ctx.skel.h" + +void test_exe_ctx(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + cpu_set_t old_cpuset, target_cpuset; + struct test_ctx *skel; + int err, prog_fd; + + /* 1. Pin the current process to CPU 0. */ + if (sched_getaffinity(0, sizeof(old_cpuset), &old_cpuset) == 0) { + CPU_ZERO(&target_cpuset); + CPU_SET(0, &target_cpuset); + ASSERT_OK(sched_setaffinity(0, sizeof(target_cpuset), + &target_cpuset), "setaffinity"); + } + + skel = test_ctx__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto restore_affinity; + + err = test_ctx__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* 2. When we run this, the kernel will execute the BPF prog on CPU 0. */ + prog_fd = bpf_program__fd(skel->progs.trigger_all_contexts); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(err, "test_run_trigger"); + + /* 3. Wait for the local CPU's softirq/tasklet to finish. */ + for (int i = 0; i < 1000; i++) { + if (skel->bss->count_task > 0 && + skel->bss->count_hardirq > 0 && + skel->bss->count_softirq > 0) + break; + usleep(1000); /* Wait 1ms per iteration, up to 1 sec total */ + } + + /* On CPU 0, these should now all be non-zero. */ + ASSERT_GT(skel->bss->count_task, 0, "task_ok"); + ASSERT_GT(skel->bss->count_hardirq, 0, "hardirq_ok"); + ASSERT_GT(skel->bss->count_softirq, 0, "softirq_ok"); + +cleanup: + test_ctx__destroy(skel); + +restore_affinity: + ASSERT_OK(sched_setaffinity(0, sizeof(old_cpuset), &old_cpuset), + "restore_affinity"); +} diff --git a/tools/testing/selftests/bpf/progs/test_ctx.c b/tools/testing/selftests/bpf/progs/test_ctx.c new file mode 100644 index 000000000000..7d4995506717 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ctx.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Valve Corporation. + * Author: Changwoo Min + */ + +#include "vmlinux.h" +#include +#include +#include "bpf_experimental.h" + +char _license[] SEC("license") = "GPL"; + +extern void bpf_kfunc_trigger_ctx_check(void) __ksym; + +int count_hardirq; +int count_softirq; +int count_task; + +/* Triggered via bpf_prog_test_run from user-space */ +SEC("syscall") +int trigger_all_contexts(void *ctx) +{ + if (bpf_in_task()) + __sync_fetch_and_add(&count_task, 1); + + /* Trigger the firing of a hardirq and softirq for test. */ + bpf_kfunc_trigger_ctx_check(); + return 0; +} + +/* Observer for HardIRQ */ +SEC("fentry/bpf_testmod_test_hardirq_fn") +int BPF_PROG(on_hardirq) +{ + if (bpf_in_hardirq()) + __sync_fetch_and_add(&count_hardirq, 1); + return 0; +} + +/* Observer for SoftIRQ */ +SEC("fentry/bpf_testmod_test_softirq_fn") +int BPF_PROG(on_softirq) +{ + if (bpf_in_serving_softirq()) + __sync_fetch_and_add(&count_softirq, 1); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index 77a81fa8ec6a..186a25ab429a 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -1168,6 +1168,33 @@ __bpf_kfunc int bpf_kfunc_implicit_arg(int a, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg_legacy(int a, int b, struct bpf_prog_aux *aux); __bpf_kfunc int bpf_kfunc_implicit_arg_legacy_impl(int a, int b, struct bpf_prog_aux *aux); +/* hook targets */ +noinline void bpf_testmod_test_hardirq_fn(void) { barrier(); } +noinline void bpf_testmod_test_softirq_fn(void) { barrier(); } + +/* Tasklet for SoftIRQ context */ +static void ctx_check_tasklet_fn(struct tasklet_struct *t) +{ + bpf_testmod_test_softirq_fn(); +} + +DECLARE_TASKLET(ctx_check_tasklet, ctx_check_tasklet_fn); + +/* IRQ Work for HardIRQ context */ +static void ctx_check_irq_fn(struct irq_work *work) +{ + bpf_testmod_test_hardirq_fn(); + tasklet_schedule(&ctx_check_tasklet); +} + +static struct irq_work ctx_check_irq = IRQ_WORK_INIT_HARD(ctx_check_irq_fn); + +/* The kfunc trigger */ +__bpf_kfunc void bpf_kfunc_trigger_ctx_check(void) +{ + irq_work_queue(&ctx_check_irq); +} + BTF_KFUNCS_START(bpf_testmod_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_testmod_test_mod_kfunc) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1213,6 +1240,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_multi_st_ops_test_1_assoc, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_implicit_arg_legacy_impl) +BTF_ID_FLAGS(func, bpf_kfunc_trigger_ctx_check) BTF_KFUNCS_END(bpf_testmod_check_kfunc_ids) static int bpf_testmod_ops_init(struct btf *btf) @@ -1844,6 +1872,10 @@ static void bpf_testmod_exit(void) while (refcount_read(&prog_test_struct.cnt) > 1) msleep(20); + /* Clean up irqwork and tasklet */ + irq_work_sync(&ctx_check_irq); + tasklet_kill(&ctx_check_tasklet); + bpf_kfunc_close_sock(); sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); unregister_bpf_testmod_uprobe(); diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h index 10f89f06245f..d5c5454e257e 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod_kfunc.h @@ -169,4 +169,8 @@ extern int bpf_kfunc_multi_st_ops_test_1_assoc(struct st_ops_args *args) __weak struct prog_test_member *bpf_kfunc_get_default_trusted_ptr_test(void) __ksym; void bpf_kfunc_put_default_trusted_ptr_test(struct prog_test_member *trusted_ptr) __ksym; +void bpf_testmod_test_hardirq_fn(void); +void bpf_testmod_test_softirq_fn(void); +void bpf_kfunc_trigger_ctx_check(void) __ksym; + #endif /* _BPF_TESTMOD_KFUNC_H */ From 78980b4c7fcb5ef74b7af65fbef5ce8d718cf791 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Mon, 19 Jan 2026 21:34:17 +0800 Subject: [PATCH 175/268] selftests/bpf: Harden cpu flags test for lru_percpu_hash map CI occasionally reports failures in the percpu_alloc/cpu_flag_lru_percpu_hash selftest, for example: First test_progs failure (test_progs_no_alu32-x86_64-llvm-21): #264/15 percpu_alloc/cpu_flag_lru_percpu_hash ... test_percpu_map_op_cpu_flag:FAIL:bpf_map_lookup_batch value on specified cpu unexpected bpf_map_lookup_batch value on specified cpu: actual 0 != expected 3735929054 The unexpected value indicates that an element was removed from the map. However, the test never calls delete_elem(), so the only possible cause is LRU eviction. This can happen when the current task migrates to another CPU: an update_elem() triggers eviction because there is no available LRU node on local freelist and global freelist. Harden the test against this behavior by provisioning sufficient spare elements. Set max_entries to 'nr_cpus * 2' and restrict the test to using the first nr_cpus entries, ensuring that updates do not spuriously trigger LRU eviction. Signed-off-by: Leon Hwang Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260119133417.19739-1-leon.hwang@linux.dev --- .../testing/selftests/bpf/prog_tests/percpu_alloc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c index c1d0949f093f..a72ae0b29f6e 100644 --- a/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c +++ b/tools/testing/selftests/bpf/prog_tests/percpu_alloc.c @@ -236,6 +236,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); if (!ASSERT_OK(err, "bpf_map_update_batch all_cpus")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count")) + goto out; /* update values on specified CPU */ for (i = 0; i < entries; i++) @@ -246,6 +248,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t err = bpf_map_update_batch(map_fd, keys, values, &count, &batch_opts); if (!ASSERT_OK(err, "bpf_map_update_batch specified cpu")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_update_batch count")) + goto out; /* lookup values on specified CPU */ batch = 0; @@ -254,6 +258,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t err = bpf_map_lookup_batch(map_fd, NULL, &batch, keys, values, &count, &batch_opts); if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch specified cpu")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count")) + goto out; for (i = 0; i < entries; i++) if (!ASSERT_EQ(values[i], value, @@ -269,6 +275,8 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t &batch_opts); if (!ASSERT_TRUE(!err || err == -ENOENT, "bpf_map_lookup_batch all_cpus")) goto out; + if (!ASSERT_EQ(count, entries, "bpf_map_lookup_batch count")) + goto out; for (i = 0; i < entries; i++) { values_row = (void *) values_percpu + @@ -287,7 +295,6 @@ static void test_percpu_map_op_cpu_flag(struct bpf_map *map, void *keys, size_t free(values); } - static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) { struct percpu_alloc_array *skel; @@ -300,7 +307,7 @@ static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) if (!ASSERT_GT(nr_cpus, 0, "libbpf_num_possible_cpus")) return; - max_entries = nr_cpus + 1; + max_entries = nr_cpus * 2; keys = calloc(max_entries, key_sz); if (!ASSERT_OK_PTR(keys, "calloc keys")) return; @@ -322,7 +329,7 @@ static void test_percpu_map_cpu_flag(enum bpf_map_type map_type) if (!ASSERT_OK(err, "test_percpu_alloc__load")) goto out; - test_percpu_map_op_cpu_flag(map, keys, key_sz, max_entries - 1, nr_cpus, true); + test_percpu_map_op_cpu_flag(map, keys, key_sz, nr_cpus, nr_cpus, true); out: percpu_alloc_array__destroy(skel); free(keys); From f21fae57744607330ae5dabdd996538faac7f5ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Fri, 23 Jan 2026 09:30:08 +0100 Subject: [PATCH 176/268] selftests/bpf: Add a few helpers for bpftool testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to integrate some bpftool tests into test_progs, define a few specific helpers that allow to execute bpftool commands, while possibly retrieving the command output. Those helpers most notably set the path to the bpftool binary under test. This version checks different possible paths relative to the directories where the different test_progs runners are executed, as we want to make sure not to accidentally use a bootstrap version of the binary. Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-1-a6653a7f28e7@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 3 +- tools/testing/selftests/bpf/bpftool_helpers.c | 74 +++++++++++++++++++ tools/testing/selftests/bpf/bpftool_helpers.h | 11 +++ 3 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/bpftool_helpers.c create mode 100644 tools/testing/selftests/bpf/bpftool_helpers.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9488d076c740..2cc559ecc723 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -747,7 +747,8 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \ json_writer.c \ $(VERIFY_SIG_HDR) \ flow_dissector_load.h \ - ip_check_defrag_frags.h + ip_check_defrag_frags.h \ + bpftool_helpers.c TRUNNER_LIB_SOURCES := find_bit.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \ $(OUTPUT)/liburandom_read.so \ diff --git a/tools/testing/selftests/bpf/bpftool_helpers.c b/tools/testing/selftests/bpf/bpftool_helpers.c new file mode 100644 index 000000000000..a5824945a4a5 --- /dev/null +++ b/tools/testing/selftests/bpf/bpftool_helpers.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "bpftool_helpers.h" +#include +#include +#include + +#define BPFTOOL_PATH_MAX_LEN 64 +#define BPFTOOL_FULL_CMD_MAX_LEN 512 + +#define BPFTOOL_DEFAULT_PATH "tools/sbin/bpftool" + +static int detect_bpftool_path(char *buffer) +{ + char tmp[BPFTOOL_PATH_MAX_LEN]; + + /* Check default bpftool location (will work if we are running the + * default flavor of test_progs) + */ + snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "./%s", BPFTOOL_DEFAULT_PATH); + if (access(tmp, X_OK) == 0) { + strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + return 0; + } + + /* Check alternate bpftool location (will work if we are running a + * specific flavor of test_progs, e.g. cpuv4 or no_alu32) + */ + snprintf(tmp, BPFTOOL_PATH_MAX_LEN, "../%s", BPFTOOL_DEFAULT_PATH); + if (access(tmp, X_OK) == 0) { + strncpy(buffer, tmp, BPFTOOL_PATH_MAX_LEN); + return 0; + } + + /* Failed to find bpftool binary */ + return 1; +} + +static int run_command(char *args, char *output_buf, size_t output_max_len) +{ + static char bpftool_path[BPFTOOL_PATH_MAX_LEN] = {0}; + bool suppress_output = !(output_buf && output_max_len); + char command[BPFTOOL_FULL_CMD_MAX_LEN]; + FILE *f; + int ret; + + /* Detect and cache bpftool binary location */ + if (bpftool_path[0] == 0 && detect_bpftool_path(bpftool_path)) + return 1; + + ret = snprintf(command, BPFTOOL_FULL_CMD_MAX_LEN, "%s %s%s", + bpftool_path, args, + suppress_output ? " > /dev/null 2>&1" : ""); + + f = popen(command, "r"); + if (!f) + return 1; + + if (!suppress_output) + fread(output_buf, 1, output_max_len, f); + ret = pclose(f); + + return ret; +} + +int run_bpftool_command(char *args) +{ + return run_command(args, NULL, 0); +} + +int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len) +{ + return run_command(args, output_buf, output_max_len); +} + diff --git a/tools/testing/selftests/bpf/bpftool_helpers.h b/tools/testing/selftests/bpf/bpftool_helpers.h new file mode 100644 index 000000000000..dec1ba201410 --- /dev/null +++ b/tools/testing/selftests/bpf/bpftool_helpers.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#pragma once + +#include +#include +#include + +#define MAX_BPFTOOL_CMD_LEN (256) + +int run_bpftool_command(char *args); +int get_bpftool_command_output(char *args, char *output_buf, size_t output_max_len); From 1c0b505908a201054dadc87930f550bea2631c1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Fri, 23 Jan 2026 09:30:09 +0100 Subject: [PATCH 177/268] selftests/bpf: convert test_bpftool_metadata.sh into test_progs framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_bpftool_metadata.sh script validates that bpftool properly returns in its ouptput any metadata generated by bpf programs through some .rodata sections. Port this test to the test_progs framework so that it can be executed automatically in CI. The new test, similarly to the former script, checks that valid data appears both for textual output and json output, as well as for both data not used at all and used data. For the json check part, the expected json string is hardcoded to avoid bringing a new external dependency (eg: a json deserializer) for test_progs. As the test is now converted into test_progs, remove the former script. The newly converted test brings two new subtests: #37/1 bpftool_metadata/metadata_unused:OK #37/2 bpftool_metadata/metadata_used:OK #37 bpftool_metadata:OK Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Alexis Lothoré (eBPF Foundation) Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-2-a6653a7f28e7@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 - .../bpf/prog_tests/bpftool_metadata.c | 144 ++++++++++++++++++ .../selftests/bpf/test_bpftool_metadata.sh | 85 ----------- 3 files changed, 144 insertions(+), 86 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c delete mode 100755 tools/testing/selftests/bpf/test_bpftool_metadata.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2cc559ecc723..1bb7db1ed6ea 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -109,7 +109,6 @@ TEST_PROGS := test_kmod.sh \ test_bpftool_build.sh \ test_bpftool.sh \ test_bpftool_map.sh \ - test_bpftool_metadata.sh \ test_doc_build.sh \ test_xsk.sh \ test_xdp_features.sh diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c new file mode 100644 index 000000000000..408ace90dc7e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpftool_metadata.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +#define BPFFS_DIR "/sys/fs/bpf/test_metadata" +#define BPFFS_USED BPFFS_DIR "/used" +#define BPFFS_UNUSED BPFFS_DIR "/unused" + +#define BPF_FILE_USED "metadata_used.bpf.o" +#define BPF_FILE_UNUSED "metadata_unused.bpf.o" +#define METADATA_MAP_NAME "metadata.rodata" + +#define MAX_BPFTOOL_OUTPUT_LEN (64*1024) + +#define MAX_TOKENS_TO_CHECK 3 +static char output[MAX_BPFTOOL_OUTPUT_LEN]; + +struct test_desc { + char *name; + char *bpf_prog; + char *bpffs_path; + char *expected_output[MAX_TOKENS_TO_CHECK]; + char *expected_output_json[MAX_TOKENS_TO_CHECK]; + char *metadata_map_name; +}; + +static int setup(struct test_desc *test) +{ + return mkdir(BPFFS_DIR, 0700); +} + +static void cleanup(struct test_desc *test) +{ + unlink(test->bpffs_path); + rmdir(BPFFS_DIR); +} + +static int check_metadata(char *buf, char * const *tokens, int count) +{ + int i; + + for (i = 0; i < count && tokens[i]; i++) + if (!strstr(buf, tokens[i])) + return 1; + + return 0; +} + +static void run_test(struct test_desc *test) +{ + int ret; + char cmd[MAX_BPFTOOL_CMD_LEN]; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog load %s %s", + test->bpf_prog, test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format prog insert command")) + return; + ret = run_bpftool_command(cmd); + if (!ASSERT_OK(ret, "load program")) + return; + + /* Check output with default format */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog show pinned %s", + test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format pinned prog check command")) + return; + ret = get_bpftool_command_output(cmd, output, + MAX_BPFTOOL_OUTPUT_LEN); + if (ASSERT_OK(ret, "get program info")) { + ret = check_metadata(output, test->expected_output, + ARRAY_SIZE(test->expected_output)); + ASSERT_OK(ret, "find metadata"); + } + + /* Check output with json format */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "prog -j show pinned %s", + test->bpffs_path); + if (!ASSERT_GT(ret, 0, "format pinned prog check command in json")) + return; + ret = get_bpftool_command_output(cmd, output, + MAX_BPFTOOL_OUTPUT_LEN); + if (ASSERT_OK(ret, "get program info in json")) { + ret = check_metadata(output, test->expected_output_json, + ARRAY_SIZE(test->expected_output_json)); + ASSERT_OK(ret, "find metadata in json"); + } + + /* Check that the corresponding map can be found and accessed */ + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map show name %s", + test->metadata_map_name); + if (!ASSERT_GT(ret, 0, "format map check command")) + return; + ASSERT_OK(run_bpftool_command(cmd), "access metadata map"); +} + +static struct test_desc tests[] = { + { + .name = "metadata_unused", + .bpf_prog = BPF_FILE_UNUSED, + .bpffs_path = BPFFS_UNUSED, + .expected_output = { + "a = \"foo\"", + "b = 1" + }, + .expected_output_json = { + "\"metadata\":{\"a\":\"foo\",\"b\":1}" + }, + .metadata_map_name = METADATA_MAP_NAME + }, + { + .name = "metadata_used", + .bpf_prog = BPF_FILE_USED, + .bpffs_path = BPFFS_USED, + .expected_output = { + "a = \"bar\"", + "b = 2" + }, + .expected_output_json = { + "\"metadata\":{\"a\":\"bar\",\"b\":2}" + }, + .metadata_map_name = METADATA_MAP_NAME + } +}; +static const int tests_count = ARRAY_SIZE(tests); + +void test_bpftool_metadata(void) +{ + int i; + + for (i = 0; i < tests_count; i++) { + if (!test__start_subtest(tests[i].name)) + continue; + if (ASSERT_OK(setup(&tests[i]), "setup bpffs pin dir")) { + run_test(&tests[i]); + cleanup(&tests[i]); + } + } +} diff --git a/tools/testing/selftests/bpf/test_bpftool_metadata.sh b/tools/testing/selftests/bpf/test_bpftool_metadata.sh deleted file mode 100755 index b5520692f41b..000000000000 --- a/tools/testing/selftests/bpf/test_bpftool_metadata.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -BPF_FILE_USED="metadata_used.bpf.o" -BPF_FILE_UNUSED="metadata_unused.bpf.o" - -TESTNAME=bpftool_metadata -BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_DIR=$BPF_FS/test_$TESTNAME - -_cleanup() -{ - set +e - rm -rf $BPF_DIR 2> /dev/null -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -if [ $(id -u) -ne 0 ]; then - echo "selftests: $TESTNAME [SKIP] Need root privileges" - exit $ksft_skip -fi - -if [ -z "$BPF_FS" ]; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted" - exit $ksft_skip -fi - -if ! bpftool version > /dev/null 2>&1; then - echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool" - exit $ksft_skip -fi - -set -e - -trap cleanup_skip EXIT - -mkdir $BPF_DIR - -trap cleanup EXIT - -bpftool prog load $BPF_FILE_UNUSED $BPF_DIR/unused - -METADATA_PLAIN="$(bpftool prog)" -echo "$METADATA_PLAIN" | grep 'a = "foo"' > /dev/null -echo "$METADATA_PLAIN" | grep 'b = 1' > /dev/null - -bpftool prog --json | grep '"metadata":{"a":"foo","b":1}' > /dev/null - -bpftool map | grep 'metadata.rodata' > /dev/null - -rm $BPF_DIR/unused - -bpftool prog load $BPF_FILE_USED $BPF_DIR/used - -METADATA_PLAIN="$(bpftool prog)" -echo "$METADATA_PLAIN" | grep 'a = "bar"' > /dev/null -echo "$METADATA_PLAIN" | grep 'b = 2' > /dev/null - -bpftool prog --json | grep '"metadata":{"a":"bar","b":2}' > /dev/null - -bpftool map | grep 'metadata.rodata' > /dev/null - -rm $BPF_DIR/used - -exit 0 From 2d96bbdfd3b5d28306001036c0161fcb1713f964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Fri, 23 Jan 2026 09:30:10 +0100 Subject: [PATCH 178/268] selftests/bpf: convert test_bpftool_map_access.sh into test_progs framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test_bpftool_map.sh script tests that maps read/write accesses are being properly allowed/refused by the kernel depending on a specific fmod_ret program being attached on security_bpf_map function. Rewrite this test to integrate it in the test_progs. The new test spawns a few subtests: #36/1 bpftool_maps_access/unprotected_unpinned:OK #36/2 bpftool_maps_access/unprotected_pinned:OK #36/3 bpftool_maps_access/protected_unpinned:OK #36/4 bpftool_maps_access/protected_pinned:OK #36/5 bpftool_maps_access/nested_maps:OK #36/6 bpftool_maps_access/btf_list:OK #36 bpftool_maps_access:OK Summary: 1/6 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Alexis Lothoré (eBPF Foundation) Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20260123-bpftool-tests-v4-3-a6653a7f28e7@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 - .../bpf/prog_tests/bpftool_maps_access.c | 371 ++++++++++++++++ .../testing/selftests/bpf/test_bpftool_map.sh | 398 ------------------ 3 files changed, 371 insertions(+), 399 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c delete mode 100755 tools/testing/selftests/bpf/test_bpftool_map.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 1bb7db1ed6ea..2c2f68a171ed 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -108,7 +108,6 @@ TEST_PROGS := test_kmod.sh \ test_xdping.sh \ test_bpftool_build.sh \ test_bpftool.sh \ - test_bpftool_map.sh \ test_doc_build.sh \ test_xsk.sh \ test_xdp_features.sh diff --git a/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c new file mode 100644 index 000000000000..e0eb869cb1b4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpftool_maps_access.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "security_bpf_map.skel.h" + +#define PROTECTED_MAP_NAME "prot_map" +#define UNPROTECTED_MAP_NAME "not_prot_map" +#define BPF_ITER_FILE "bpf_iter_map_elem.bpf.o" +#define BPFFS_PIN_DIR "/sys/fs/bpf/test_bpftool_map" +#define INNER_MAP_NAME "inner_map_tt" +#define OUTER_MAP_NAME "outer_map_tt" + +#define MAP_NAME_MAX_LEN 64 +#define PATH_MAX_LEN 128 + +enum map_protection { + PROTECTED, + UNPROTECTED +}; + +struct test_desc { + char *name; + enum map_protection protection; + struct bpf_map *map; + char *map_name; + bool pinned; + char pin_path[PATH_MAX_LEN]; + bool write_must_fail; +}; + +static struct security_bpf_map *general_setup(void) +{ + struct security_bpf_map *skel; + uint32_t key, value; + int ret, i; + + skel = security_bpf_map__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + goto end; + + struct bpf_map *maps[] = {skel->maps.prot_map, skel->maps.not_prot_map}; + + ret = security_bpf_map__attach(skel); + if (!ASSERT_OK(ret, "attach maps security programs")) + goto end_destroy; + + for (i = 0; i < sizeof(maps)/sizeof(struct bpf_map *); i++) { + for (key = 0; key < 2; key++) { + int ret = bpf_map__update_elem(maps[i], &key, + sizeof(key), &key, sizeof(key), + 0); + if (!ASSERT_OK(ret, "set initial map value")) + goto end_destroy; + } + } + + key = 0; + value = 1; + ret = bpf_map__update_elem(skel->maps.prot_status_map, &key, + sizeof(key), &value, sizeof(value), 0); + if (!ASSERT_OK(ret, "configure map protection")) + goto end_destroy; + + if (!ASSERT_OK(mkdir(BPFFS_PIN_DIR, S_IFDIR), "create bpffs pin dir")) + goto end_destroy; + + return skel; +end_destroy: + security_bpf_map__destroy(skel); +end: + return NULL; +} + +static void general_cleanup(struct security_bpf_map *skel) +{ + rmdir(BPFFS_PIN_DIR); + security_bpf_map__destroy(skel); +} + +static void update_test_desc(struct security_bpf_map *skel, + struct test_desc *test) +{ + /* Now that the skeleton is loaded, update all missing fields to + * have the subtest properly configured + */ + if (test->protection == PROTECTED) { + test->map = skel->maps.prot_map; + test->map_name = PROTECTED_MAP_NAME; + } else { + test->map = skel->maps.not_prot_map; + test->map_name = UNPROTECTED_MAP_NAME; + } +} + +static int test_setup(struct security_bpf_map *skel, struct test_desc *desc) +{ + int ret; + + update_test_desc(skel, desc); + + if (desc->pinned) { + ret = snprintf(desc->pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR, + desc->name); + if (!ASSERT_GT(ret, 0, "format pin path")) + return 1; + ret = bpf_map__pin(desc->map, desc->pin_path); + if (!ASSERT_OK(ret, "pin map")) + return 1; + } + + return 0; +} + +static void test_cleanup(struct test_desc *desc) +{ + if (desc->pinned) + bpf_map__unpin(desc->map, NULL); +} + +static int lookup_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "map lookup %s key 0 0 0 0", + map_handle); + if (!ASSERT_GT(ret, 0, "format map lookup cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int read_map_btf_data(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "btf dump map %s", + map_handle); + if (!ASSERT_GT(ret, 0, "format map btf dump cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int write_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, + "map update %s key 0 0 0 0 value 1 1 1 1", map_handle); + if (!ASSERT_GT(ret, 0, "format value write cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int delete_map_value(char *map_handle) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, + "map delete %s key 0 0 0 0", map_handle); + if (!ASSERT_GT(ret, 0, "format value deletion cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int iterate_on_map_values(char *map_handle, char *iter_pin_path) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + + ret = snprintf(cmd, MAX_BPFTOOL_CMD_LEN, "iter pin %s %s map %s", + BPF_ITER_FILE, iter_pin_path, map_handle); + if (!ASSERT_GT(ret, 0, "format iterator creation cmd")) + return 1; + ret = run_bpftool_command(cmd); + if (ret) + return ret; + ret = snprintf(cmd, MAP_NAME_MAX_LEN, "cat %s", iter_pin_path); + if (ret < 0) + goto cleanup; + ret = system(cmd); + +cleanup: + unlink(iter_pin_path); + return ret; +} + +static int create_inner_map(void) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map create %s/%s type array key 4 value 4 entries 4 name %s", + BPFFS_PIN_DIR, INNER_MAP_NAME, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format inner map create cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static int create_outer_map(void) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map create %s/%s type hash_of_maps key 4 value 4 entries 2 name %s inner_map name %s", + BPFFS_PIN_DIR, OUTER_MAP_NAME, OUTER_MAP_NAME, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format outer map create cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static void delete_pinned_map(char *map_name) +{ + char pin_path[PATH_MAX_LEN]; + int ret; + + ret = snprintf(pin_path, PATH_MAX_LEN, "%s/%s", BPFFS_PIN_DIR, + map_name); + if (ret >= 0) + unlink(pin_path); +} + +static int add_outer_map_entry(int key) +{ + char cmd[MAX_BPFTOOL_CMD_LEN]; + int ret = 0; + + ret = snprintf( + cmd, MAX_BPFTOOL_CMD_LEN, + "map update pinned %s/%s key %d 0 0 0 value name %s", + BPFFS_PIN_DIR, OUTER_MAP_NAME, key, INNER_MAP_NAME); + if (!ASSERT_GT(ret, 0, "format outer map value addition cmd")) + return 1; + return run_bpftool_command(cmd); +} + +static void test_basic_access(struct test_desc *desc) +{ + char map_handle[MAP_NAME_MAX_LEN]; + char iter_pin_path[PATH_MAX_LEN]; + int ret; + + if (desc->pinned) + ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "pinned %s", + desc->pin_path); + else + ret = snprintf(map_handle, MAP_NAME_MAX_LEN, "name %s", + desc->map_name); + if (!ASSERT_GT(ret, 0, "format map handle")) + return; + + ret = lookup_map_value(map_handle); + ASSERT_OK(ret, "read map value"); + + ret = read_map_btf_data(map_handle); + ASSERT_OK(ret, "read map btf data"); + + ret = write_map_value(map_handle); + ASSERT_OK(desc->write_must_fail ? !ret : ret, "write map value"); + + ret = delete_map_value(map_handle); + ASSERT_OK(desc->write_must_fail ? !ret : ret, "delete map value"); + /* Restore deleted value */ + if (!ret) + write_map_value(map_handle); + + ret = snprintf(iter_pin_path, PATH_MAX_LEN, "%s/iter", BPFFS_PIN_DIR); + if (ASSERT_GT(ret, 0, "format iter pin path")) { + ret = iterate_on_map_values(map_handle, iter_pin_path); + ASSERT_OK(ret, "iterate on map values"); + } +} + +static void test_create_nested_maps(void) +{ + if (!ASSERT_OK(create_inner_map(), "create inner map")) + return; + if (!ASSERT_OK(create_outer_map(), "create outer map")) + goto end_cleanup_inner; + ASSERT_OK(add_outer_map_entry(0), "add a first entry in outer map"); + ASSERT_OK(add_outer_map_entry(1), "add a second entry in outer map"); + ASSERT_NEQ(add_outer_map_entry(2), 0, "add a third entry in outer map"); + + delete_pinned_map(OUTER_MAP_NAME); +end_cleanup_inner: + delete_pinned_map(INNER_MAP_NAME); +} + +static void test_btf_list(void) +{ + ASSERT_OK(run_bpftool_command("btf list"), "list btf data"); +} + +static struct test_desc tests[] = { + { + .name = "unprotected_unpinned", + .protection = UNPROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = false, + .write_must_fail = false, + }, + { + .name = "unprotected_pinned", + .protection = UNPROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = true, + .write_must_fail = false, + }, + { + .name = "protected_unpinned", + .protection = PROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = false, + .write_must_fail = true, + }, + { + .name = "protected_pinned", + .protection = PROTECTED, + .map_name = UNPROTECTED_MAP_NAME, + .pinned = true, + .write_must_fail = true, + } +}; + +static const size_t tests_count = ARRAY_SIZE(tests); + +void test_bpftool_maps_access(void) +{ + struct security_bpf_map *skel; + struct test_desc *current; + int i; + + skel = general_setup(); + if (!ASSERT_OK_PTR(skel, "prepare programs")) + goto cleanup; + + for (i = 0; i < tests_count; i++) { + current = &tests[i]; + if (!test__start_subtest(current->name)) + continue; + if (ASSERT_OK(test_setup(skel, current), "subtest setup")) { + test_basic_access(current); + test_cleanup(current); + } + } + if (test__start_subtest("nested_maps")) + test_create_nested_maps(); + if (test__start_subtest("btf_list")) + test_btf_list(); + +cleanup: + general_cleanup(skel); +} + diff --git a/tools/testing/selftests/bpf/test_bpftool_map.sh b/tools/testing/selftests/bpf/test_bpftool_map.sh deleted file mode 100755 index 515b1df0501e..000000000000 --- a/tools/testing/selftests/bpf/test_bpftool_map.sh +++ /dev/null @@ -1,398 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -TESTNAME="bpftool_map" -BPF_FILE="security_bpf_map.bpf.o" -BPF_ITER_FILE="bpf_iter_map_elem.bpf.o" -PROTECTED_MAP_NAME="prot_map" -NOT_PROTECTED_MAP_NAME="not_prot_map" -BPF_FS_TMP_PARENT="/tmp" -BPF_FS_PARENT=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts) -BPF_FS_PARENT=${BPF_FS_PARENT:-$BPF_FS_TMP_PARENT} -# bpftool will mount bpf file system under BPF_DIR if it is not mounted -# under BPF_FS_PARENT. -BPF_DIR="$BPF_FS_PARENT/test_$TESTNAME" -SCRIPT_DIR=$(dirname $(realpath "$0")) -BPF_FILE_PATH="$SCRIPT_DIR/$BPF_FILE" -BPF_ITER_FILE_PATH="$SCRIPT_DIR/$BPF_ITER_FILE" -BPFTOOL_PATH="bpftool" -# Assume the script is located under tools/testing/selftests/bpf/ -KDIR_ROOT_DIR=$(realpath "$SCRIPT_DIR"/../../../../) - -_cleanup() -{ - set +eu - - # If BPF_DIR is a mount point this will not remove the mount point itself. - [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2> /dev/null - - # Unmount if BPF filesystem was temporarily created. - if [ "$BPF_FS_PARENT" = "$BPF_FS_TMP_PARENT" ]; then - # A loop and recursive unmount are required as bpftool might - # create multiple mounts. For example, a bind mount of the directory - # to itself. The bind mount is created to change mount propagation - # flags on an actual mount point. - max_attempts=3 - attempt=0 - while mountpoint -q "$BPF_DIR" && [ $attempt -lt $max_attempts ]; do - umount -R "$BPF_DIR" 2>/dev/null - attempt=$((attempt+1)) - done - - # The directory still exists. Remove it now. - [ -d "$BPF_DIR" ] && rm -rf "$BPF_DIR" 2>/dev/null - fi -} - -cleanup_skip() -{ - echo "selftests: $TESTNAME [SKIP]" - _cleanup - - exit $ksft_skip -} - -cleanup() -{ - if [ "$?" = 0 ]; then - echo "selftests: $TESTNAME [PASS]" - else - echo "selftests: $TESTNAME [FAILED]" - fi - _cleanup -} - -check_root_privileges() { - if [ $(id -u) -ne 0 ]; then - echo "Need root privileges" - exit $ksft_skip - fi -} - -# Function to verify bpftool path. -# Parameters: -# $1: bpftool path -verify_bpftool_path() { - local bpftool_path="$1" - if ! "$bpftool_path" version > /dev/null 2>&1; then - echo "Could not run test without bpftool" - exit $ksft_skip - fi -} - -# Function to verify BTF support. -# The test requires BTF support for fmod_ret programs. -verify_btf_support() { - if [ ! -f /sys/kernel/btf/vmlinux ]; then - echo "Could not run test without BTF support" - exit $ksft_skip - fi -} - -# Function to initialize map entries with keys [0..2] and values set to 0. -# Parameters: -# $1: Map name -# $2: bpftool path -initialize_map_entries() { - local map_name="$1" - local bpftool_path="$2" - - for key in 0 1 2; do - "$bpftool_path" map update name "$map_name" key $key 0 0 0 value 0 0 0 $key - done -} - -# Test read access to the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -access_for_read() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - - # Test read access to the map. - if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - echo " Read access to $key in $map_name failed" - exit 1 - fi - - # Test read access to map's BTF data. - if ! "$bpftool_path" btf dump map "$name_cmd" "$map_name" 1>/dev/null; then - echo " Read access to $map_name for BTF data failed" - exit 1 - fi -} - -# Test write access to the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -# $5: Whether write should succeed (true/false) -access_for_write() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - local write_should_succeed="$5" - local value="1 1 1 1" - - if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \ - $value 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Write access to $key in $map_name succeeded but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Write access to $key in $map_name failed but should have succeeded" - exit 1 - fi - fi -} - -# Test entry deletion for the map. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key -# $5: Whether write should succeed (true/false) -access_for_deletion() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key="$4" - local write_should_succeed="$5" - local value="1 1 1 1" - - # Test deletion by key for the map. - # Before deleting, check the key exists. - if ! "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - echo " Key $key does not exist in $map_name" - exit 1 - fi - - # Delete by key. - if "$bpftool_path" map delete "$name_cmd" "$map_name" key $key 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Deletion for $key in $map_name succeeded but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Deletion for $key in $map_name failed but should have succeeded" - exit 1 - fi - fi - - # After deleting, check the entry existence according to the expected status. - if "$bpftool_path" map lookup "$name_cmd" "$map_name" key $key 1>/dev/null; then - if [ "$write_should_succeed" = "true" ]; then - echo " Key $key for $map_name was not deleted but should have been deleted" - exit 1 - fi - else - if [ "$write_should_succeed" = "false" ]; then - echo "Key $key for $map_name was deleted but should have not been deleted" - exit 1 - fi - fi - - # Test creation of map's deleted entry, if deletion was successful. - # Otherwise, the entry exists. - if "$bpftool_path" map update "$name_cmd" "$map_name" key $key value \ - $value 2>/dev/null; then - if [ "$write_should_succeed" = "false" ]; then - echo " Write access to $key in $map_name succeeded after deletion attempt but should have failed" - exit 1 - fi - else - if [ "$write_should_succeed" = "true" ]; then - echo " Write access to $key in $map_name failed after deletion attempt but should have succeeded" - exit 1 - fi - fi -} - -# Test map elements iterator. -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: BPF_DIR -# $5: bpf iterator object file path -iterate_map_elem() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local bpf_dir="$4" - local bpf_file="$5" - local pin_path="$bpf_dir/map_iterator" - - "$bpftool_path" iter pin "$bpf_file" "$pin_path" map "$name_cmd" "$map_name" - if [ ! -f "$pin_path" ]; then - echo " Failed to pin iterator to $pin_path" - exit 1 - fi - - cat "$pin_path" 1>/dev/null - rm "$pin_path" 2>/dev/null -} - -# Function to test map access with configurable write expectations -# Parameters: -# $1: Name command (name/pinned) -# $2: Map name -# $3: bpftool path -# $4: key for rw -# $5: key to delete -# $6: Whether write should succeed (true/false) -# $7: BPF_DIR -# $8: bpf iterator object file path -access_map() { - local name_cmd="$1" - local map_name="$2" - local bpftool_path="$3" - local key_for_rw="$4" - local key_to_del="$5" - local write_should_succeed="$6" - local bpf_dir="$7" - local bpf_iter_file_path="$8" - - access_for_read "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw" - access_for_write "$name_cmd" "$map_name" "$bpftool_path" "$key_for_rw" \ - "$write_should_succeed" - access_for_deletion "$name_cmd" "$map_name" "$bpftool_path" "$key_to_del" \ - "$write_should_succeed" - iterate_map_elem "$name_cmd" "$map_name" "$bpftool_path" "$bpf_dir" \ - "$bpf_iter_file_path" -} - -# Function to test map access with configurable write expectations -# Parameters: -# $1: Map name -# $2: bpftool path -# $3: BPF_DIR -# $4: Whether write should succeed (true/false) -# $5: bpf iterator object file path -test_map_access() { - local map_name="$1" - local bpftool_path="$2" - local bpf_dir="$3" - local pin_path="$bpf_dir/${map_name}_pinned" - local write_should_succeed="$4" - local bpf_iter_file_path="$5" - - # Test access to the map by name. - access_map "name" "$map_name" "$bpftool_path" "0 0 0 0" "1 0 0 0" \ - "$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path" - - # Pin the map to the BPF filesystem - "$bpftool_path" map pin name "$map_name" "$pin_path" - if [ ! -e "$pin_path" ]; then - echo " Failed to pin $map_name" - exit 1 - fi - - # Test access to the pinned map. - access_map "pinned" "$pin_path" "$bpftool_path" "0 0 0 0" "2 0 0 0" \ - "$write_should_succeed" "$bpf_dir" "$bpf_iter_file_path" -} - -# Function to test map creation and map-of-maps -# Parameters: -# $1: bpftool path -# $2: BPF_DIR -test_map_creation_and_map_of_maps() { - local bpftool_path="$1" - local bpf_dir="$2" - local outer_map_name="outer_map_tt" - local inner_map_name="inner_map_tt" - - "$bpftool_path" map create "$bpf_dir/$inner_map_name" type array key 4 \ - value 4 entries 4 name "$inner_map_name" - if [ ! -f "$bpf_dir/$inner_map_name" ]; then - echo " Failed to create inner map file at $bpf_dir/$outer_map_name" - return 1 - fi - - "$bpftool_path" map create "$bpf_dir/$outer_map_name" type hash_of_maps \ - key 4 value 4 entries 2 name "$outer_map_name" inner_map name "$inner_map_name" - if [ ! -f "$bpf_dir/$outer_map_name" ]; then - echo " Failed to create outer map file at $bpf_dir/$outer_map_name" - return 1 - fi - - # Add entries to the outer map by name and by pinned path. - "$bpftool_path" map update pinned "$bpf_dir/$outer_map_name" key 0 0 0 0 \ - value pinned "$bpf_dir/$inner_map_name" - "$bpftool_path" map update name "$outer_map_name" key 1 0 0 0 value \ - name "$inner_map_name" - - # The outer map should be full by now. - # The following map update command is expected to fail. - if "$bpftool_path" map update name "$outer_map_name" key 2 0 0 0 value name \ - "$inner_map_name" 2>/dev/null; then - echo " Update for $outer_map_name succeeded but should have failed" - exit 1 - fi -} - -# Function to test map access with the btf list command -# Parameters: -# $1: bpftool path -test_map_access_with_btf_list() { - local bpftool_path="$1" - - # The btf list command iterates over maps for - # loaded BPF programs. - if ! "$bpftool_path" btf list 1>/dev/null; then - echo " Failed to access btf data" - exit 1 - fi -} - -set -eu - -trap cleanup_skip EXIT - -check_root_privileges - -verify_bpftool_path "$BPFTOOL_PATH" - -verify_btf_support - -trap cleanup EXIT - -# Load and attach the BPF programs to control maps access. -"$BPFTOOL_PATH" prog loadall "$BPF_FILE_PATH" "$BPF_DIR" autoattach - -initialize_map_entries "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH" -initialize_map_entries "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH" - -# Activate the map protection mechanism. Protection status is controlled -# by a value stored in the prot_status_map at index 0. -"$BPFTOOL_PATH" map update name prot_status_map key 0 0 0 0 value 1 0 0 0 - -# Test protected map (write should fail). -test_map_access "$PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "false" \ - "$BPF_ITER_FILE_PATH" - -# Test not protected map (write should succeed). -test_map_access "$NOT_PROTECTED_MAP_NAME" "$BPFTOOL_PATH" "$BPF_DIR" "true" \ - "$BPF_ITER_FILE_PATH" - -test_map_creation_and_map_of_maps "$BPFTOOL_PATH" "$BPF_DIR" - -test_map_access_with_btf_list "$BPFTOOL_PATH" - -exit 0 From 752b807028e63f1473b84eb1350e131eca5e5249 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:10 +0000 Subject: [PATCH 179/268] bpf: add new BPF_CGROUP_ITER_CHILDREN control option Currently, the BPF cgroup iterator supports walking descendants in either pre-order (BPF_CGROUP_ITER_DESCENDANTS_PRE) or post-order (BPF_CGROUP_ITER_DESCENDANTS_POST). These modes perform an exhaustive depth-first search (DFS) of the hierarchy. In scenarios where a BPF program may need to inspect only the direct children of a given parent cgroup, a full DFS is unnecessarily expensive. This patch introduces a new BPF cgroup iterator control option, BPF_CGROUP_ITER_CHILDREN. This control option restricts the traversal to the immediate children of a specified parent cgroup, allowing for more targeted and efficient iteration, particularly when exhaustive depth-first search (DFS) traversal is not required. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-1-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/cgroup_iter.c | 26 +++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 8 ++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 44e7dbc278e3..c8d400b7680a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index f04a468cf6a7..fd51fe3d92cc 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -8,12 +8,13 @@ #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ -/* cgroup_iter provides four modes of traversal to the cgroup hierarchy. +/* cgroup_iter provides five modes of traversal to the cgroup hierarchy. * * 1. Walk the descendants of a cgroup in pre-order. * 2. Walk the descendants of a cgroup in post-order. * 3. Walk the ancestors of a cgroup. * 4. Show the given cgroup only. + * 5. Walk the children of a given parent cgroup. * * For walking descendants, cgroup_iter can walk in either pre-order or * post-order. For walking ancestors, the iter walks up from a cgroup to @@ -78,6 +79,8 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) return css_next_descendant_pre(NULL, p->start_css); else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) return css_next_descendant_post(NULL, p->start_css); + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(NULL, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ return p->start_css; } @@ -113,6 +116,8 @@ static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) return css_next_descendant_post(curr, p->start_css); else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) return curr->parent; + else if (p->order == BPF_CGROUP_ITER_CHILDREN) + return css_next_child(curr, p->start_css); else /* BPF_CGROUP_ITER_SELF_ONLY */ return NULL; } @@ -200,11 +205,16 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, int order = linfo->cgroup.order; struct cgroup *cgrp; - if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && - order != BPF_CGROUP_ITER_DESCENDANTS_POST && - order != BPF_CGROUP_ITER_ANCESTORS_UP && - order != BPF_CGROUP_ITER_SELF_ONLY) + switch (order) { + case BPF_CGROUP_ITER_DESCENDANTS_PRE: + case BPF_CGROUP_ITER_DESCENDANTS_POST: + case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_SELF_ONLY: + case BPF_CGROUP_ITER_CHILDREN: + break; + default: return -EINVAL; + } if (fd && id) return -EINVAL; @@ -257,6 +267,8 @@ static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, seq_puts(seq, "order: descendants_post\n"); else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) seq_puts(seq, "order: ancestors_up\n"); + else if (aux->cgroup.order == BPF_CGROUP_ITER_CHILDREN) + seq_puts(seq, "order: children\n"); else /* BPF_CGROUP_ITER_SELF_ONLY */ seq_puts(seq, "order: self_only\n"); } @@ -320,6 +332,7 @@ __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, case BPF_CGROUP_ITER_DESCENDANTS_PRE: case BPF_CGROUP_ITER_DESCENDANTS_POST: case BPF_CGROUP_ITER_ANCESTORS_UP: + case BPF_CGROUP_ITER_CHILDREN: break; default: return -EINVAL; @@ -345,6 +358,9 @@ __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *i case BPF_CGROUP_ITER_DESCENDANTS_POST: kit->pos = css_next_descendant_post(kit->pos, kit->start); break; + case BPF_CGROUP_ITER_CHILDREN: + kit->pos = css_next_child(kit->pos, kit->start); + break; case BPF_CGROUP_ITER_ANCESTORS_UP: kit->pos = kit->pos ? kit->pos->parent : kit->start; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca7d76e05f0..5e38b4887de6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -119,6 +119,14 @@ enum bpf_cgroup_iter_order { BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + /* + * Walks the immediate children of the specified parent + * cgroup_subsys_state. Unlike BPF_CGROUP_ITER_DESCENDANTS_PRE, + * BPF_CGROUP_ITER_DESCENDANTS_POST, and BPF_CGROUP_ITER_ANCESTORS_UP + * the iterator does not include the specified parent as one of the + * returned iterator elements. + */ + BPF_CGROUP_ITER_CHILDREN, }; union bpf_iter_link_info { From 1456ebb291ddee67c9144c8f7f38a6dddcd32ed7 Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 27 Jan 2026 08:51:11 +0000 Subject: [PATCH 180/268] selftests/bpf: cover BPF_CGROUP_ITER_CHILDREN control option Extend some of the existing CSS iterator selftests such that they cover the newly introduced BPF_CGROUP_ITER_CHILDREN iterator control option. Signed-off-by: Matt Bobrowski Link: https://lore.kernel.org/r/20260127085112.3608687-2-mattbobrowski@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/cgroup_iter.c | 12 ++++++++++++ tools/testing/selftests/bpf/prog_tests/iters.c | 8 +++++++- tools/testing/selftests/bpf/progs/iters_css.c | 9 ++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c index 574d9a0cdc8e..0f88a9d00a22 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter.c @@ -190,6 +190,16 @@ static void test_walk_self_only(struct cgroup_iter *skel) BPF_CGROUP_ITER_SELF_ONLY, "self_only"); } +static void test_walk_children(struct cgroup_iter *skel) +{ + snprintf(expected_output, sizeof(expected_output), + PROLOGUE "%8llu\n%8llu\n" EPILOGUE, cg_id[CHILD1], + cg_id[CHILD2]); + + read_from_cgroup_iter(skel->progs.cgroup_id_printer, cg_fd[PARENT], + BPF_CGROUP_ITER_CHILDREN, "children"); +} + static void test_walk_dead_self_only(struct cgroup_iter *skel) { DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); @@ -325,6 +335,8 @@ void test_cgroup_iter(void) test_walk_dead_self_only(skel); if (test__start_subtest("cgroup_iter__self_only_css_task")) test_walk_self_only_css_task(); + if (test__start_subtest("cgroup_iter__children")) + test_walk_children(skel); out: cgroup_iter__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/iters.c b/tools/testing/selftests/bpf/prog_tests/iters.c index 3cea71f9c500..a539980a2fbe 100644 --- a/tools/testing/selftests/bpf/prog_tests/iters.c +++ b/tools/testing/selftests/bpf/prog_tests/iters.c @@ -253,6 +253,11 @@ static void subtest_css_iters(void) { "/cg1/cg2" }, { "/cg1/cg2/cg3" }, { "/cg1/cg2/cg3/cg4" }, + { "/cg1/cg5" }, + { "/cg1/cg5/cg6" }, + { "/cg1/cg7" }, + { "/cg1/cg7/cg8" }, + { "/cg1/cg7/cg8/cg9" }, }; int err, cg_nr = ARRAY_SIZE(cgs); int i; @@ -284,7 +289,8 @@ static void subtest_css_iters(void) ASSERT_EQ(skel->bss->post_order_cnt, cg_nr, "post_order_cnt"); ASSERT_EQ(skel->bss->last_cg_id, get_cgroup_id(cgs[0].path), "last_cg_id"); - ASSERT_EQ(skel->bss->tree_high, cg_nr - 1, "tree_high"); + ASSERT_EQ(skel->bss->children_cnt, 3, "children_cnt"); + ASSERT_EQ(skel->bss->tree_high, 3, "tree_high"); iters_css__detach(skel); cleanup: cleanup_cgroup_environment(); diff --git a/tools/testing/selftests/bpf/progs/iters_css.c b/tools/testing/selftests/bpf/progs/iters_css.c index ec1f6c2f590b..5a1d87d186a9 100644 --- a/tools/testing/selftests/bpf/progs/iters_css.c +++ b/tools/testing/selftests/bpf/progs/iters_css.c @@ -12,8 +12,7 @@ char _license[] SEC("license") = "GPL"; pid_t target_pid; u64 root_cg_id, leaf_cg_id; u64 first_cg_id, last_cg_id; - -int pre_order_cnt, post_order_cnt, tree_high; +int pre_order_cnt, post_order_cnt, children_cnt, tree_high; struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; void bpf_cgroup_release(struct cgroup *p) __ksym; @@ -43,7 +42,7 @@ int iter_css_for_each(const void *ctx) } root_css = &root_cgrp->self; leaf_css = &leaf_cgrp->self; - pre_order_cnt = post_order_cnt = tree_high = 0; + pre_order_cnt = post_order_cnt = children_cnt = tree_high = 0; first_cg_id = last_cg_id = 0; bpf_rcu_read_lock(); @@ -60,6 +59,10 @@ int iter_css_for_each(const void *ctx) first_cg_id = cur_cgrp->kn->id; } + bpf_for_each(css, pos, root_css, BPF_CGROUP_ITER_CHILDREN) { + children_cnt++; + } + bpf_for_each(css, pos, leaf_css, BPF_CGROUP_ITER_ANCESTORS_UP) tree_high++; From b40cc5adaa80e1471095a62d78233b611d7a558c Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:43 +0800 Subject: [PATCH 181/268] bpf, sockmap: Fix incorrect copied_seq calculation A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. The issue is that when reading from ingress_msg, we update tp->copied_seq by default. However, if the data is not from its own protocol stack, tcp->rcv_nxt is not increased. Later, if we convert this socket to a native socket, reading from this socket may fail because copied_seq might be significantly larger than rcv_nxt. This fix also addresses the syzkaller-reported bug referenced in the Closes tag. This patch marks the skmsg objects in ingress_msg. When reading, we update copied_seq only if the data is from its own protocol stack. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Closes: https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983 Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Reviewed-by: Jakub Sitnicki Reviewed-by: John Fastabend Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260124113314.113584-2-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 2 ++ net/core/skmsg.c | 27 ++++++++++++++++++++++++--- net/ipv4/tcp_bpf.c | 5 +++-- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 49847888c287..dfdc158ab88c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -141,6 +141,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); +int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags, int *copied_from_self); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2ac7731e1e0a..d402da5caadd 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -409,22 +409,26 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -/* Receive sk_msg from psock->ingress_msg to @msg. */ -int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, - int len, int flags) +int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags, int *copied_from_self) { struct iov_iter *iter = &msg->msg_iter; int peek = flags & MSG_PEEK; struct sk_msg *msg_rx; int i, copied = 0; + bool from_self; msg_rx = sk_psock_peek_msg(psock); + if (copied_from_self) + *copied_from_self = 0; + while (copied != len) { struct scatterlist *sge; if (unlikely(!msg_rx)) break; + from_self = msg_rx->sk == sk; i = msg_rx->sg.start; do { struct page *page; @@ -443,6 +447,9 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } copied += copy; + if (from_self && copied_from_self) + *copied_from_self += copy; + if (likely(!peek)) { sge->offset += copy; sge->length -= copy; @@ -487,6 +494,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, out: return copied; } + +/* Receive sk_msg from psock->ingress_msg to @msg. */ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + return __sk_msg_recvmsg(sk, psock, msg, len, flags, NULL); +} EXPORT_SYMBOL_GPL(sk_msg_recvmsg); bool sk_msg_is_readable(struct sock *sk) @@ -616,6 +630,12 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); + + /* This is used in tcp_bpf_recvmsg_parser() to determine whether the + * data originates from the socket's own protocol stack. No need to + * refcount sk because msg's lifetime is bound to sk via the ingress_msg. + */ + msg->sk = sk; err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); @@ -909,6 +929,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, sk_msg_compute_data_pointers(msg); msg->sk = sk; ret = bpf_prog_run_pin_on_cpu(prog, msg); + msg->sk = NULL; ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a268e1595b22..5c698fd7fbf8 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -226,6 +226,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, int peek = flags & MSG_PEEK; struct sk_psock *psock; struct tcp_sock *tcp; + int copied_from_self = 0; int copied = 0; u32 seq; @@ -262,7 +263,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, } msg_bytes_ready: - copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + copied = __sk_msg_recvmsg(sk, psock, msg, len, flags, &copied_from_self); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. @@ -277,7 +278,7 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, goto out; } } - seq += copied; + seq += copied_from_self; if (!copied) { long timeo; int data; From 929e30f9312514902133c45e51c79088421ab084 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:44 +0800 Subject: [PATCH 182/268] bpf, sockmap: Fix FIONREAD for sockmap A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to calculate FIONREAD is not enough. This patch adds a new msg_tot_len field in the psock structure to record the data length in ingress_msg. Additionally, we implement new ioctl interfaces for TCP and UDP to intercept FIONREAD operations. Note that we intentionally do not include sk_receive_queue data in the FIONREAD result. Data in sk_receive_queue has not yet been processed by the BPF verdict program, and may be redirected to other sockets or dropped. Including it would create semantic ambiguity since this data may never be readable by the user. Unix and VSOCK sockets have similar issues, but fixing them is outside the scope of this patch as it would require more intrusive changes. Previous work by John Fastabend made some efforts towards FIONREAD support: commit e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") Although the current patch is based on the previous work by John Fastabend, it is acceptable for our Fixes tag to point to the same commit. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: Jiayuan Chen Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20260124113314.113584-3-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 68 +++++++++++++++++++++++++++++++++++++++++-- net/core/skmsg.c | 3 ++ net/ipv4/tcp_bpf.c | 20 +++++++++++++ net/ipv4/udp_bpf.c | 23 ++++++++++++--- 4 files changed, 108 insertions(+), 6 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index dfdc158ab88c..829b281d6c9c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -97,6 +97,8 @@ struct sk_psock { struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; + /** @msg_tot_len: Total bytes queued in ingress_msg list. */ + u32 msg_tot_len; unsigned long state; struct list_head link; spinlock_t link_lock; @@ -321,6 +323,27 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } +static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock) +{ + /* Used by ioctl to read msg_tot_len only; lock-free for performance */ + return READ_ONCE(psock->msg_tot_len); +} + +static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff) +{ + /* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock(). + * ingress_lock should be held to prevent concurrent updates to msg_tot_len + */ + WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff); +} + +static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_msg_len_add_locked(psock, diff); + spin_unlock_bh(&psock->ingress_lock); +} + static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { @@ -329,6 +352,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock, spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); + sk_psock_msg_len_add_locked(psock, msg->sg.size); ret = true; } else { sk_msg_free(psock->sk, msg); @@ -345,18 +369,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); - if (msg) + if (msg) { list_del(&msg->list); + sk_psock_msg_len_add_locked(psock, -msg->sg.size); + } spin_unlock_bh(&psock->ingress_lock); return msg; } +static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock) +{ + return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); +} + static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); - msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + msg = sk_psock_peek_msg_locked(psock); spin_unlock_bh(&psock->ingress_lock); return msg; } @@ -523,6 +554,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } +/* for tcp only, sk is locked */ +static inline ssize_t sk_psock_msg_inq(struct sock *sk) +{ + struct sk_psock *psock; + ssize_t inq = 0; + + psock = sk_psock_get(sk); + if (likely(psock)) { + inq = sk_psock_get_msg_len_nolock(psock); + sk_psock_put(sk, psock); + } + return inq; +} + +/* for udp only, sk is not locked */ +static inline ssize_t sk_msg_first_len(struct sock *sk) +{ + struct sk_psock *psock; + struct sk_msg *msg; + ssize_t inq = 0; + + psock = sk_psock_get(sk); + if (likely(psock)) { + spin_lock_bh(&psock->ingress_lock); + msg = sk_psock_peek_msg_locked(psock); + if (msg) + inq = msg->sg.size; + spin_unlock_bh(&psock->ingress_lock); + sk_psock_put(sk, psock); + } + return inq; +} + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d402da5caadd..ddde93dd8bc6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -458,6 +458,7 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg atomic_sub(copy, &sk->sk_rmem_alloc); } msg_rx->sg.size -= copy; + sk_psock_msg_len_add(psock, -copy); if (!sge->length) { sk_msg_iter_var_next(i); @@ -821,9 +822,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_del(&msg->list); if (!msg->skb) atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); + sk_psock_msg_len_add(psock, -msg->sg.size); sk_msg_free(psock->sk, msg); kfree(msg); } + WARN_ON_ONCE(psock->msg_tot_len); } static void __sk_psock_zap_ingress(struct sk_psock *psock) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 5c698fd7fbf8..ca8a5cb8e569 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,6 +10,7 @@ #include #include +#include void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { @@ -332,6 +333,24 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, return copied; } +static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg) +{ + bool slow; + + if (cmd != SIOCINQ) + return tcp_ioctl(sk, cmd, karg); + + /* works similar as tcp_ioctl */ + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + slow = lock_sock_fast(sk); + *karg = sk_psock_msg_inq(sk); + unlock_sock_fast(sk, slow); + + return 0; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { @@ -610,6 +629,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; + prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 0735d820e413..91233e37cd97 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "udp_impl.h" @@ -111,12 +112,26 @@ enum { static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; +static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg) +{ + if (cmd != SIOCINQ) + return udp_ioctl(sk, cmd, karg); + + /* Since we don't hold a lock, sk_receive_queue may contain data. + * BPF might only be processing this data at the moment. We only + * care about the data in the ingress_msg here. + */ + *karg = sk_msg_first_len(sk); + return 0; +} + static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { - *prot = *base; - prot->close = sock_map_close; - prot->recvmsg = udp_bpf_recvmsg; - prot->sock_is_readable = sk_msg_is_readable; + *prot = *base; + prot->close = sock_map_close; + prot->recvmsg = udp_bpf_recvmsg; + prot->sock_is_readable = sk_msg_is_readable; + prot->ioctl = udp_bpf_ioctl; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) From 17e2ce02bf5669dfa659976e93d409228cba98f9 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:45 +0800 Subject: [PATCH 183/268] selftests/bpf: Add tests for FIONREAD and copied_seq This commit adds two new test functions: one to reproduce the bug reported by syzkaller [1], and another to cover the calculation of copied_seq. The tests primarily involve installing and uninstalling sockmap on sockets, then reading data to verify proper functionality. Additionally, extend the do_test_sockmap_skb_verdict_fionread() function to support UDP FIONREAD testing. [1] https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983 Signed-off-by: Jiayuan Chen Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20260124113314.113584-4-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/sockmap_basic.c | 294 +++++++++++++++++- .../bpf/progs/test_sockmap_pass_prog.c | 14 + 2 files changed, 302 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 1e3e4392dcca..256707e7d20d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare #include -#include +#include +#include #include #include "test_progs.h" @@ -22,6 +23,15 @@ #define TCP_REPAIR_ON 1 #define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ +/** + * SOL_TCP is defined in (glibc), but the copybuf_address + * field of tcp_zerocopy_receive is not yet included in older versions. + * This workaround remains necessary until the glibc update propagates. + */ +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + static int connected_socket_v4(void) { struct sockaddr_in addr = { @@ -536,13 +546,14 @@ static void test_sockmap_skb_verdict_shutdown(void) } -static void test_sockmap_skb_verdict_fionread(bool pass_prog) +static void do_test_sockmap_skb_verdict_fionread(int sotype, bool pass_prog) { int err, map, verdict, c0 = -1, c1 = -1, p0 = -1, p1 = -1; int expected, zero = 0, sent, recvd, avail; struct test_sockmap_pass_prog *pass = NULL; struct test_sockmap_drop_prog *drop = NULL; char buf[256] = "0123456789"; + int split_len = sizeof(buf) / 2; if (pass_prog) { pass = test_sockmap_pass_prog__open_and_load(); @@ -550,7 +561,10 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) return; verdict = bpf_program__fd(pass->progs.prog_skb_verdict); map = bpf_map__fd(pass->maps.sock_map_rx); - expected = sizeof(buf); + if (sotype == SOCK_DGRAM) + expected = split_len; /* FIONREAD for UDP is different from TCP */ + else + expected = sizeof(buf); } else { drop = test_sockmap_drop_prog__open_and_load(); if (!ASSERT_OK_PTR(drop, "open_and_load")) @@ -566,7 +580,7 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_prog_attach")) goto out; - err = create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1); + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); if (!ASSERT_OK(err, "create_socket_pairs()")) goto out; @@ -574,8 +588,9 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) if (!ASSERT_OK(err, "bpf_map_update_elem(c1)")) goto out_close; - sent = xsend(p1, &buf, sizeof(buf), 0); - ASSERT_EQ(sent, sizeof(buf), "xsend(p0)"); + sent = xsend(p1, &buf, split_len, 0); + sent += xsend(p1, &buf, sizeof(buf) - split_len, 0); + ASSERT_EQ(sent, sizeof(buf), "xsend(p1)"); err = ioctl(c1, FIONREAD, &avail); ASSERT_OK(err, "ioctl(FIONREAD) error"); ASSERT_EQ(avail, expected, "ioctl(FIONREAD)"); @@ -597,6 +612,12 @@ static void test_sockmap_skb_verdict_fionread(bool pass_prog) test_sockmap_drop_prog__destroy(drop); } +static void test_sockmap_skb_verdict_fionread(bool pass_prog) +{ + do_test_sockmap_skb_verdict_fionread(SOCK_STREAM, pass_prog); + do_test_sockmap_skb_verdict_fionread(SOCK_DGRAM, pass_prog); +} + static void test_sockmap_skb_verdict_change_tail(void) { struct test_sockmap_change_tail *skel; @@ -1042,6 +1063,257 @@ static void test_sockmap_vsock_unconnected(void) xclose(map); } +/* it is used to reproduce WARNING */ +static void test_sockmap_zc(void) +{ + int map, err, sent, recvd, zero = 0, one = 1, on = 1; + char buf[10] = "0123456789", rcv[11], addr[100]; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + struct tcp_zerocopy_receive zc; + socklen_t zc_len = sizeof(zc); + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach")) + goto end; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend")) + goto end; + + /* trigger tcp_bpf_recvmsg_parser and inc copied_seq of p1 */ + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1)")) + goto end; + + /* uninstall sockmap of p1 */ + bpf_map_delete_elem(map, &one); + + /* trigger tcp stack and the rcv_nxt of p1 is less than copied_seq */ + sent = xsend(c1, buf, sizeof(buf) - 1, 0); + if (!ASSERT_EQ(sent, sizeof(buf) - 1, "xsend")) + goto end; + + err = setsockopt(p1, SOL_SOCKET, SO_ZEROCOPY, &on, sizeof(on)); + if (!ASSERT_OK(err, "setsockopt")) + goto end; + + memset(&zc, 0, sizeof(zc)); + zc.copybuf_address = (__u64)((unsigned long)addr); + zc.copybuf_len = sizeof(addr); + + err = getsockopt(p1, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &zc_len); + if (!ASSERT_OK(err, "getsockopt")) + goto end; + +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + +/* it is used to check whether copied_seq of sk is correct */ +static void test_sockmap_copied_seq(bool strp) +{ + int i, map, err, sent, recvd, zero = 0, one = 1; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + char buf[10] = "0123456789", rcv[11]; + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (create_socket_pairs(AF_INET, SOCK_STREAM, &c0, &c1, &p0, &p1)) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) + goto end; + + if (strp) { + prog = skel->progs.prog_skb_verdict_ingress_strp; + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_PARSER, 0); + if (!ASSERT_OK(err, "bpf_prog_attach parser")) + goto end; + } + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p1)")) + goto end; + + /* just trigger sockamp: data sent by c0 will be received by p1 */ + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), bpf")) + goto end; + + /* do partial read */ + recvd = recv_timeout(p1, rcv, 1, MSG_DONTWAIT, 1); + recvd += recv_timeout(p1, rcv + 1, sizeof(rcv) - 1, MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), bpf") || + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) + goto end; + + /* uninstall sockmap of p1 and p0 */ + err = bpf_map_delete_elem(map, &one); + if (!ASSERT_OK(err, "bpf_map_delete_elem(1)")) + goto end; + + err = bpf_map_delete_elem(map, &zero); + if (!ASSERT_OK(err, "bpf_map_delete_elem(0)")) + goto end; + + /* now all sockets become plain socket, they should still work */ + for (i = 0; i < 5; i++) { + /* test copied_seq of p1 by running tcp native stack */ + sent = xsend(c1, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c1), native")) + goto end; + + recvd = recv(p1, rcv, sizeof(rcv), MSG_DONTWAIT); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p1), native")) + goto end; + + /* p0 previously redirected skb to p1, we also check copied_seq of p0 */ + sent = xsend(c0, buf, sizeof(buf), 0); + if (!ASSERT_EQ(sent, sizeof(buf), "xsend(c0), native")) + goto end; + + recvd = recv(p0, rcv, sizeof(rcv), MSG_DONTWAIT); + if (!ASSERT_EQ(recvd, sent, "recv_timeout(p0), native")) + goto end; + } + +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + +/* Wait until FIONREAD returns the expected value or timeout */ +static int wait_for_fionread(int fd, int expected, unsigned int timeout_ms) +{ + unsigned int elapsed = 0; + int avail = 0; + + while (elapsed < timeout_ms) { + if (ioctl(fd, FIONREAD, &avail) < 0) + return -errno; + if (avail >= expected) + return avail; + usleep(1000); + elapsed++; + } + return avail; +} + +/* it is used to send data to via native stack and BPF redirecting */ +static void test_sockmap_multi_channels(int sotype) +{ + int map, err, sent, recvd, zero = 0, one = 1, avail = 0, expected; + struct test_sockmap_pass_prog *skel = NULL; + int c0 = -1, p0 = -1, c1 = -1, p1 = -1; + char buf[10] = "0123456789", rcv[11]; + struct bpf_program *prog; + + skel = test_sockmap_pass_prog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + err = create_socket_pairs(AF_INET, sotype, &c0, &c1, &p0, &p1); + if (err) + goto end; + + prog = skel->progs.prog_skb_verdict_ingress; + map = bpf_map__fd(skel->maps.sock_map_rx); + + err = bpf_prog_attach(bpf_program__fd(prog), map, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!ASSERT_OK(err, "bpf_prog_attach verdict")) + goto end; + + err = bpf_map_update_elem(map, &zero, &p0, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem(p0)")) + goto end; + + err = bpf_map_update_elem(map, &one, &p1, BPF_ANY); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + goto end; + + /* send data to p1 via native stack */ + sent = xsend(c1, buf, 2, 0); + if (!ASSERT_EQ(sent, 2, "xsend(2)")) + goto end; + + avail = wait_for_fionread(p1, 2, IO_TIMEOUT_SEC); + ASSERT_EQ(avail, 2, "ioctl(FIONREAD) partial return"); + + /* send data to p1 via bpf redirecting */ + sent = xsend(c0, buf + 2, sizeof(buf) - 2, 0); + if (!ASSERT_EQ(sent, sizeof(buf) - 2, "xsend(remain-data)")) + goto end; + + /* Poll FIONREAD until expected bytes arrive, poll_read() is unreliable + * here since it may return immediately if prior data is already queued. + */ + expected = sotype == SOCK_DGRAM ? 2 : sizeof(buf); + avail = wait_for_fionread(p1, expected, IO_TIMEOUT_SEC); + ASSERT_EQ(avail, expected, "ioctl(FIONREAD) full return"); + + recvd = recv_timeout(p1, rcv, sizeof(rcv), MSG_DONTWAIT, 1); + if (!ASSERT_EQ(recvd, sizeof(buf), "recv_timeout(p1)") || + !ASSERT_OK(memcmp(buf, rcv, recvd), "data mismatch")) + goto end; +end: + if (c0 >= 0) + close(c0); + if (p0 >= 0) + close(p0); + if (c1 >= 0) + close(c1); + if (p1 >= 0) + close(p1); + test_sockmap_pass_prog__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -1108,4 +1380,14 @@ void test_sockmap_basic(void) test_sockmap_skb_verdict_vsock_poll(); if (test__start_subtest("sockmap vsock unconnected")) test_sockmap_vsock_unconnected(); + if (test__start_subtest("sockmap with zc")) + test_sockmap_zc(); + if (test__start_subtest("sockmap recover")) + test_sockmap_copied_seq(false); + if (test__start_subtest("sockmap recover with strp")) + test_sockmap_copied_seq(true); + if (test__start_subtest("sockmap tcp multi channels")) + test_sockmap_multi_channels(SOCK_STREAM); + if (test__start_subtest("sockmap udp multi channels")) + test_sockmap_multi_channels(SOCK_DGRAM); } diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c index 69aacc96db36..ef9edca184ea 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_pass_prog.c @@ -44,4 +44,18 @@ int prog_skb_parser(struct __sk_buff *skb) return SK_PASS; } +SEC("sk_skb/stream_verdict") +int prog_skb_verdict_ingress(struct __sk_buff *skb) +{ + int one = 1; + + return bpf_sk_redirect_map(skb, &sock_map_rx, one, BPF_F_INGRESS); +} + +SEC("sk_skb/stream_parser") +int prog_skb_verdict_ingress_strp(struct __sk_buff *skb) +{ + return skb->len; +} + char _license[] SEC("license") = "GPL"; From ae23bc81ddf7c17b663c4ed1b21e35527b0a7131 Mon Sep 17 00:00:00 2001 From: Guillaume Gonnet Date: Tue, 27 Jan 2026 17:02:00 +0100 Subject: [PATCH 184/268] bpf: Fix tcx/netkit detach permissions when prog fd isn't given This commit fixes a security issue where BPF_PROG_DETACH on tcx or netkit devices could be executed by any user when no program fd was provided, bypassing permission checks. The fix adds a capability check for CAP_NET_ADMIN or CAP_SYS_ADMIN in this case. Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support") Signed-off-by: Guillaume Gonnet Link: https://lore.kernel.org/r/20260127160200.10395-1-ggonnet.linux@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ include/linux/bpf_mprog.h | 10 ++++++++++ kernel/bpf/syscall.c | 7 ++----- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4427c6e98331..9272a237cced 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3362,6 +3362,11 @@ static inline void bpf_prog_report_arena_violation(bool write, unsigned long add } #endif /* CONFIG_BPF_SYSCALL */ +static inline bool bpf_net_capable(void) +{ + return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); +} + static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { diff --git a/include/linux/bpf_mprog.h b/include/linux/bpf_mprog.h index 929225f7b095..0b9f4caeeb0a 100644 --- a/include/linux/bpf_mprog.h +++ b/include/linux/bpf_mprog.h @@ -340,4 +340,14 @@ static inline bool bpf_mprog_supported(enum bpf_prog_type type) return false; } } + +static inline bool bpf_mprog_detach_empty(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SCHED_CLS: + return bpf_net_capable(); + default: + return false; + } +} #endif /* __BPF_MPROG_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b9184545c3fd..5f59dd47a5b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1363,11 +1363,6 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, return ret; } -static bool bpf_net_capable(void) -{ - return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); -} - #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ static int map_create(union bpf_attr *attr, bpfptr_t uattr) @@ -4579,6 +4574,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); + } else if (!bpf_mprog_detach_empty(ptype)) { + return -EPERM; } } else if (is_cgroup_prog_type(ptype, 0, false)) { if (attr->attach_flags || attr->relative_fd) From 4be42c92220128b3128854a2d6b0f0ad0bcedbdb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:02 +0100 Subject: [PATCH 185/268] ftrace,bpf: Remove FTRACE_OPS_FL_JMP ftrace_ops flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment the we allow the jmp attach only for ftrace_ops that has FTRACE_OPS_FL_JMP set. This conflicts with following changes where we use single ftrace_ops object for all direct call sites, so all could be be attached via just call or jmp. We already limit the jmp attach support with config option and bit (LSB) set on the trampoline address. It turns out that's actually enough to limit the jmp attach for architecture and only for chosen addresses (with LSB bit set). Each user of register_ftrace_direct or modify_ftrace_direct can set the trampoline bit (LSB) to indicate it has to be attached by jmp. The bpf trampoline generation code uses trampoline flags to generate jmp-attach specific code and ftrace inner code uses the trampoline bit (LSB) to handle return from jmp attachment, so there's no harm to remove the FTRACE_OPS_FL_JMP bit. The fexit/fmodret performance stays the same (did not drop), current code: fentry : 77.904 ± 0.546M/s fexit : 62.430 ± 0.554M/s fmodret : 66.503 ± 0.902M/s with this change: fentry : 80.472 ± 0.061M/s fexit : 63.995 ± 0.127M/s fmodret : 67.362 ± 0.175M/s Fixes: 25e4e3565d45 ("ftrace: Introduce FTRACE_OPS_FL_JMP") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-2-jolsa@kernel.org --- include/linux/ftrace.h | 1 - kernel/bpf/trampoline.c | 32 ++++++++++++++------------------ kernel/trace/ftrace.c | 14 -------------- 3 files changed, 14 insertions(+), 33 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a3a8989e3268..cc869c59c1a6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -359,7 +359,6 @@ enum { FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), FTRACE_OPS_FL_GRAPH = BIT(19), - FTRACE_OPS_FL_JMP = BIT(20), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index edf9da43762d..468de930b2ed 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -221,10 +221,15 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, int ret; if (tr->func.ftrace_managed) { + unsigned long addr = (unsigned long) new_addr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + if (lock_direct_mutex) - ret = modify_ftrace_direct(tr->fops, (long)new_addr); + ret = modify_ftrace_direct(tr->fops, addr); else - ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); + ret = modify_ftrace_direct_nolock(tr->fops, addr); } else { ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); @@ -247,10 +252,15 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { + unsigned long addr = (unsigned long) new_addr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); if (ret) return ret; - ret = register_ftrace_direct(tr->fops, (long)new_addr); + ret = register_ftrace_direct(tr->fops, addr); } else { ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } @@ -506,13 +516,6 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut if (err) goto out_free; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP - if (bpf_trampoline_use_jmp(tr->flags)) - tr->fops->flags |= FTRACE_OPS_FL_JMP; - else - tr->fops->flags &= ~FTRACE_OPS_FL_JMP; -#endif - WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ @@ -540,15 +543,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut tr->cur_image = im; out: /* If any error happens, restore previous flags */ - if (err) { + if (err) tr->flags = orig_flags; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP - if (bpf_trampoline_use_jmp(tr->flags)) - tr->fops->flags |= FTRACE_OPS_FL_JMP; - else - tr->fops->flags &= ~FTRACE_OPS_FL_JMP; -#endif - } kfree(tlinks); return err; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ef2d5dca6f70..6040aacc97da 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6046,15 +6046,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) if (ftrace_hash_empty(hash)) return -EINVAL; - /* This is a "raw" address, and this should never happen. */ - if (WARN_ON_ONCE(ftrace_is_jmp(addr))) - return -EINVAL; - mutex_lock(&direct_mutex); - if (ops->flags & FTRACE_OPS_FL_JMP) - addr = ftrace_jmp_set(addr); - /* Make sure requested entries are not already registered.. */ size = 1 << hash->size_bits; for (i = 0; i < size; i++) { @@ -6175,13 +6168,6 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) lockdep_assert_held_once(&direct_mutex); - /* This is a "raw" address, and this should never happen. */ - if (WARN_ON_ONCE(ftrace_is_jmp(addr))) - return -EINVAL; - - if (ops->flags & FTRACE_OPS_FL_JMP) - addr = ftrace_jmp_set(addr); - /* Enable the tmp_ops to have the same functions as the direct ops */ ftrace_ops_init(&tmp_ops); tmp_ops.func_hash = ops->func_hash; From 676bfeae7bd55ca405670798711ac7c46b48655d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:03 +0100 Subject: [PATCH 186/268] ftrace: Make alloc_and_copy_ftrace_hash direct friendly Make alloc_and_copy_ftrace_hash to copy also direct address for each hash entry. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-3-jolsa@kernel.org --- kernel/trace/ftrace.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6040aacc97da..2a63860c172e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1212,7 +1212,7 @@ static void __add_hash_entry(struct ftrace_hash *hash, } static struct ftrace_func_entry * -add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) { struct ftrace_func_entry *entry; @@ -1221,11 +1221,18 @@ add_hash_entry(struct ftrace_hash *hash, unsigned long ip) return NULL; entry->ip = ip; + entry->direct = direct; __add_hash_entry(hash, entry); return entry; } +static struct ftrace_func_entry * +add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + return add_hash_entry_direct(hash, ip, 0); +} + static void free_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) @@ -1398,7 +1405,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - if (add_hash_entry(new_hash, entry->ip) == NULL) + if (add_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) goto free_hash; } } From 0e860d07c29d70205d5ad48456ac7a133ccfb293 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:04 +0100 Subject: [PATCH 187/268] ftrace: Export some of hash related functions We are going to use these functions in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-4-jolsa@kernel.org --- include/linux/ftrace.h | 9 +++++++++ kernel/trace/ftrace.c | 13 ++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cc869c59c1a6..a77b57a4aba2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -82,6 +82,7 @@ static inline void early_trace_init(void) { } struct module; struct ftrace_hash; +struct ftrace_func_entry; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) @@ -405,6 +406,14 @@ enum ftrace_ops_cmd { typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); #ifdef CONFIG_DYNAMIC_FTRACE + +#define FTRACE_HASH_DEFAULT_BITS 10 + +struct ftrace_hash *alloc_ftrace_hash(int size_bits); +void free_ftrace_hash(struct ftrace_hash *hash); +struct ftrace_func_entry *add_ftrace_hash_entry_direct(struct ftrace_hash *hash, + unsigned long ip, unsigned long direct); + /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { struct ftrace_hash __rcu *notrace_hash; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2a63860c172e..bf0ca563e91f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -68,7 +68,6 @@ }) /* hash bits for specific function selection */ -#define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 #ifdef CONFIG_DYNAMIC_FTRACE @@ -1211,8 +1210,8 @@ static void __add_hash_entry(struct ftrace_hash *hash, hash->count++; } -static struct ftrace_func_entry * -add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) +struct ftrace_func_entry * +add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long direct) { struct ftrace_func_entry *entry; @@ -1230,7 +1229,7 @@ add_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigned long static struct ftrace_func_entry * add_hash_entry(struct ftrace_hash *hash, unsigned long ip) { - return add_hash_entry_direct(hash, ip, 0); + return add_ftrace_hash_entry_direct(hash, ip, 0); } static void @@ -1291,7 +1290,7 @@ static void clear_ftrace_mod_list(struct list_head *head) mutex_unlock(&ftrace_lock); } -static void free_ftrace_hash(struct ftrace_hash *hash) +void free_ftrace_hash(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) return; @@ -1331,7 +1330,7 @@ void ftrace_free_filter(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(ftrace_free_filter); -static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; int size; @@ -1405,7 +1404,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - if (add_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) + if (add_ftrace_hash_entry_direct(new_hash, entry->ip, entry->direct) == NULL) goto free_hash; } } From 05dc5e9c1fe156fd9dddc4c2f81e8fc6c7e50eb5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:05 +0100 Subject: [PATCH 188/268] ftrace: Add update_ftrace_direct_add function Adding update_ftrace_direct_add function that adds all entries (ip -> addr) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current register_ftrace_direct is - hash argument that allows to register multiple ip -> direct entries at once - we can call update_ftrace_direct_add multiple times on the same ftrace_ops object, becase after first registration with register_ftrace_function_nolock, it uses ftrace_update_ops to update the ftrace_ops object This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-5-jolsa@kernel.org --- include/linux/ftrace.h | 7 +++ kernel/trace/ftrace.c | 140 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a77b57a4aba2..43a6cfaf5aae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -543,6 +543,8 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); +int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); + void ftrace_stub_direct_tramp(void); #else @@ -569,6 +571,11 @@ static inline int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned l return -ENODEV; } +static inline int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf0ca563e91f..97bd988b0a62 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6278,6 +6278,146 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; } EXPORT_SYMBOL_GPL(modify_ftrace_direct); + +static unsigned long hash_count(struct ftrace_hash *hash) +{ + return hash ? hash->count : 0; +} + +/** + * hash_add - adds two struct ftrace_hash and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_add(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *add; + int size; + + size = hash_count(a) + hash_count(b); + if (size > 32) + size = 32; + + add = alloc_and_copy_ftrace_hash(fls(size), a); + if (!add) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + if (add_ftrace_hash_entry_direct(add, entry->ip, entry->direct) == NULL) { + free_ftrace_hash(add); + return NULL; + } + } + } + return add; +} + +/** + * update_ftrace_direct_add - Updates @ops by adding direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to add custom direct callers (ip -> addr) to @ops, + * specified in @hash. The @ops will be either registered or updated. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + */ +int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *old_filter_hash; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_func_entry *entry; + int err = -EINVAL; + int size; + bool reg; + + if (!hash_count(hash)) + return -EINVAL; + + mutex_lock(&direct_mutex); + + /* Make sure requested entries are not already registered. */ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (__ftrace_lookup_ip(direct_functions, entry->ip)) + goto out_unlock; + } + } + + old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + + /* If there's nothing in filter_hash we need to register the ops. */ + reg = hash_count(old_filter_hash) == 0; + if (reg) { + if (ops->func || ops->trampoline) + goto out_unlock; + if (ops->flags & FTRACE_OPS_FL_ENABLED) + goto out_unlock; + } + + err = -ENOMEM; + new_filter_hash = hash_add(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_add(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + old_direct_functions = direct_functions; + rcu_assign_pointer(direct_functions, new_direct_functions); + + if (reg) { + ops->func = call_direct_funcs; + ops->flags |= MULTI_FLAGS; + ops->trampoline = FTRACE_REGS_ADDR; + ops->local_hash.filter_hash = new_filter_hash; + + err = register_ftrace_function_nolock(ops); + if (err) { + /* restore old filter on error */ + ops->local_hash.filter_hash = old_filter_hash; + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + } else { + new_filter_hash = old_filter_hash; + } + } else { + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* reset direct_functions and free the new one */ + rcu_assign_pointer(direct_functions, old_direct_functions); + old_direct_functions = new_direct_functions; + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** From 8d2c1233f37149e4d1223d06ca7054a7f88c0a24 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:06 +0100 Subject: [PATCH 189/268] ftrace: Add update_ftrace_direct_del function Adding update_ftrace_direct_del function that removes all entries (ip -> addr) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current unregister_ftrace_direct is - hash argument that allows to unregister multiple ip -> direct entries at once - we can call update_ftrace_direct_del multiple times on the same ftrace_ops object, becase we do not need to unregister all entries at once, we can do it gradualy with the help of ftrace_update_ops function This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-6-jolsa@kernel.org --- include/linux/ftrace.h | 6 ++ kernel/trace/ftrace.c | 127 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 43a6cfaf5aae..cfb957731433 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -544,6 +544,7 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr); int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); +int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash); void ftrace_stub_direct_tramp(void); @@ -576,6 +577,11 @@ static inline int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97bd988b0a62..244bb7553d3d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6418,6 +6418,133 @@ int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) return err; } +/** + * hash_sub - substracts @b from @a and returns the result + * @a: struct ftrace_hash object + * @b: struct ftrace_hash object + * + * Returns struct ftrace_hash object on success, NULL on error. + */ +static struct ftrace_hash *hash_sub(struct ftrace_hash *a, struct ftrace_hash *b) +{ + struct ftrace_func_entry *entry, *del; + struct ftrace_hash *sub; + int size; + + sub = alloc_and_copy_ftrace_hash(a->size_bits, a); + if (!sub) + return NULL; + + size = 1 << b->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &b->buckets[i], hlist) { + del = __ftrace_lookup_ip(sub, entry->ip); + if (WARN_ON_ONCE(!del)) { + free_ftrace_hash(sub); + return NULL; + } + remove_hash_entry(sub, del); + kfree(del); + } + } + return sub; +} + +/** + * update_ftrace_direct_del - Updates @ops by removing its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * + * This is used to delete custom direct callers (ip -> addr) in + * @ops specified via @hash. The @ops will be either unregistered + * updated. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) +{ + struct ftrace_hash *old_direct_functions = NULL; + struct ftrace_hash *new_direct_functions; + struct ftrace_hash *new_filter_hash = NULL; + struct ftrace_hash *old_filter_hash; + struct ftrace_func_entry *entry; + struct ftrace_func_entry *del; + unsigned long size; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + mutex_lock(&direct_mutex); + + old_filter_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + + if (!hash_count(old_filter_hash)) + goto out_unlock; + + /* Make sure requested entries are already registered. */ + size = 1 << hash->size_bits; + for (int i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + del = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!del || del->direct != entry->direct) + goto out_unlock; + } + } + + err = -ENOMEM; + new_filter_hash = hash_sub(old_filter_hash, hash); + if (!new_filter_hash) + goto out_unlock; + + new_direct_functions = hash_sub(direct_functions, hash); + if (!new_direct_functions) + goto out_unlock; + + /* If there's nothing left, we need to unregister the ops. */ + if (ftrace_hash_empty(new_filter_hash)) { + err = unregister_ftrace_function(ops); + if (!err) { + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + ftrace_free_filter(ops); + ops->func_hash->filter_hash = NULL; + } + } else { + err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); + /* + * new_filter_hash is dup-ed, so we need to release it anyway, + * old_filter_hash either stays on error or is already released + */ + } + + if (err) { + /* free the new_direct_functions */ + old_direct_functions = new_direct_functions; + } else { + rcu_assign_pointer(direct_functions, new_direct_functions); + } + + out_unlock: + mutex_unlock(&direct_mutex); + + if (old_direct_functions && old_direct_functions != EMPTY_HASH) + call_rcu_tasks(&old_direct_functions->rcu, register_ftrace_direct_cb); + free_ftrace_hash(new_filter_hash); + + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** From e93672f770d72f4c33d1dd9fb0633f95948c0cc9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:07 +0100 Subject: [PATCH 190/268] ftrace: Add update_ftrace_direct_mod function Adding update_ftrace_direct_mod function that modifies all entries (ip -> direct) provided in hash argument to direct ftrace ops and updates its attachments. The difference to current modify_ftrace_direct is: - hash argument that allows to modify multiple ip -> direct entries at once This change will allow us to have simple ftrace_ops for all bpf direct interface users in following changes. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-7-jolsa@kernel.org --- include/linux/ftrace.h | 6 +++ kernel/trace/ftrace.c | 94 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cfb957731433..b8461e541b9d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -545,6 +545,7 @@ int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr); int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash); int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash); +int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock); void ftrace_stub_direct_tramp(void); @@ -582,6 +583,11 @@ static inline int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace return -ENODEV; } +static inline int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock) +{ + return -ENODEV; +} + /* * This must be implemented by the architecture. * It is the way the ftrace direct_ops helper, when called diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 244bb7553d3d..794ab5e22398 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6545,6 +6545,100 @@ int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) return err; } +/** + * update_ftrace_direct_mod - Updates @ops by modifing its direct + * callers provided in @hash + * @ops: The address of the struct ftrace_ops object + * @hash: The address of the struct ftrace_hash object + * @do_direct_lock: If true lock the direct_mutex + * + * This is used to modify custom direct callers (ip -> addr) in + * @ops specified via @hash. + * + * This can be called from within ftrace ops_func callback with + * direct_mutex already locked, in which case @do_direct_lock + * needs to be false. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @hash is empty + * -EINVAL - The @ops is not registered + */ +int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, bool do_direct_lock) +{ + struct ftrace_func_entry *entry, *tmp; + static struct ftrace_ops tmp_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + }; + struct ftrace_hash *orig_hash; + unsigned long size, i; + int err = -EINVAL; + + if (!hash_count(hash)) + return -EINVAL; + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + if (direct_functions == EMPTY_HASH) + return -EINVAL; + + /* + * We can be called from within ops_func callback with direct_mutex + * already taken. + */ + if (do_direct_lock) + mutex_lock(&direct_mutex); + + orig_hash = ops->func_hash ? ops->func_hash->filter_hash : NULL; + if (!orig_hash) + goto unlock; + + /* Enable the tmp_ops to have the same functions as the direct ops */ + ftrace_ops_init(&tmp_ops); + tmp_ops.func_hash = ops->func_hash; + + err = register_ftrace_function_nolock(&tmp_ops); + if (err) + goto unlock; + + /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, orig_hash, orig_hash, true); + if (err) + goto out; + + /* + * Now the ftrace_ops_list_func() is called to do the direct callers. + * We can safely change the direct functions attached to each entry. + */ + mutex_lock(&ftrace_lock); + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + tmp = __ftrace_lookup_ip(direct_functions, entry->ip); + if (!tmp) + continue; + tmp->direct = entry->direct; + } + } + + mutex_unlock(&ftrace_lock); + +out: + /* Removing the tmp_ops will add the updated direct callers to the functions */ + unregister_ftrace_function(&tmp_ops); + +unlock: + if (do_direct_lock) + mutex_unlock(&direct_mutex); + return err; +} + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** From 7d0452497c292153e690652e6df218fead21185f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:08 +0100 Subject: [PATCH 191/268] bpf: Add trampoline ip hash table Following changes need to lookup trampoline based on its ip address, adding hash table for that. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251230145010.103439-8-jolsa@kernel.org --- include/linux/bpf.h | 7 +++++-- kernel/bpf/trampoline.c | 30 +++++++++++++++++++----------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9272a237cced..5524f9429e76 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1329,14 +1329,17 @@ struct bpf_tramp_image { }; struct bpf_trampoline { - /* hlist for trampoline_table */ - struct hlist_node hlist; + /* hlist for trampoline_key_table */ + struct hlist_node hlist_key; + /* hlist for trampoline_ip_table */ + struct hlist_node hlist_ip; struct ftrace_ops *fops; /* serializes access to fields of this trampoline */ struct mutex mutex; refcount_t refcnt; u32 flags; u64 key; + unsigned long ip; struct { struct btf_func_model model; void *addr; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 468de930b2ed..1d4f618c2402 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -24,9 +24,10 @@ const struct bpf_prog_ops bpf_extension_prog_ops = { #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) -static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; +static struct hlist_head trampoline_key_table[TRAMPOLINE_TABLE_SIZE]; +static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE]; -/* serializes access to trampoline_table */ +/* serializes access to trampoline tables */ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -142,15 +143,15 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym) PAGE_SIZE, true, ksym->name); } -static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) { struct bpf_trampoline *tr; struct hlist_head *head; int i; mutex_lock(&trampoline_mutex); - head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; - hlist_for_each_entry(tr, head, hlist) { + head = &trampoline_key_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head, hlist_key) { if (tr->key == key) { refcount_inc(&tr->refcnt); goto out; @@ -171,8 +172,12 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) #endif tr->key = key; - INIT_HLIST_NODE(&tr->hlist); - hlist_add_head(&tr->hlist, head); + tr->ip = ftrace_location(ip); + INIT_HLIST_NODE(&tr->hlist_key); + INIT_HLIST_NODE(&tr->hlist_ip); + hlist_add_head(&tr->hlist_key, head); + head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)]; + hlist_add_head(&tr->hlist_ip, head); refcount_set(&tr->refcnt, 1); mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) @@ -883,7 +888,7 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); - tr = bpf_trampoline_lookup(key); + tr = bpf_trampoline_lookup(key, 0); if (WARN_ON_ONCE(!tr)) return; @@ -903,7 +908,7 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, { struct bpf_trampoline *tr; - tr = bpf_trampoline_lookup(key); + tr = bpf_trampoline_lookup(key, tgt_info->tgt_addr); if (!tr) return NULL; @@ -939,7 +944,8 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) * fexit progs. The fentry-only trampoline will be freed via * multiple rcu callbacks. */ - hlist_del(&tr->hlist); + hlist_del(&tr->hlist_key); + hlist_del(&tr->hlist_ip); if (tr->fops) { ftrace_free_filter(tr->fops); kfree(tr->fops); @@ -1212,7 +1218,9 @@ static int __init init_trampolines(void) int i; for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) - INIT_HLIST_HEAD(&trampoline_table[i]); + INIT_HLIST_HEAD(&trampoline_key_table[i]); + for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&trampoline_ip_table[i]); return 0; } late_initcall(init_trampolines); From 956747efd82aa60e0ac3e6aef2b9a17adda6f3b1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:09 +0100 Subject: [PATCH 192/268] ftrace: Factor ftrace_ops ops_func interface We are going to remove "ftrace_ops->private == bpf_trampoline" setup in following changes. Adding ip argument to ftrace_ops_func_t callback function, so we can use it to look up the trampoline. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-9-jolsa@kernel.org --- include/linux/ftrace.h | 2 +- kernel/bpf/trampoline.c | 3 ++- kernel/trace/ftrace.c | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b8461e541b9d..705db0a6d995 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -403,7 +403,7 @@ enum ftrace_ops_cmd { * Negative on failure. The return value is dependent on the * callback. */ -typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, enum ftrace_ops_cmd cmd); +typedef int (*ftrace_ops_func_t)(struct ftrace_ops *op, unsigned long ip, enum ftrace_ops_cmd cmd); #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 1d4f618c2402..64556aa0007b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -33,7 +33,8 @@ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); -static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) +static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, + enum ftrace_ops_cmd cmd) { struct bpf_trampoline *tr = ops->private; int ret = 0; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 794ab5e22398..ee2d7b8a4372 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2075,7 +2075,7 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, */ if (!ops->ops_func) return -EBUSY; - ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); + ret = ops->ops_func(ops, rec->ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); if (ret) return ret; } else if (is_ipmodify) { @@ -9061,7 +9061,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) if (!op->ops_func) return -EBUSY; - ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); + ret = op->ops_func(op, ip, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); if (ret) return ret; } @@ -9108,7 +9108,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) /* The cleanup is optional, ignore any errors */ if (found_op && op->ops_func) - op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); + op->ops_func(op, ip, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); } } mutex_unlock(&direct_mutex); From 424f6a3610965d125634bc65c5d6d506582d4f7e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 30 Dec 2025 15:50:10 +0100 Subject: [PATCH 193/268] bpf,x86: Use single ftrace_ops for direct calls Using single ftrace_ops for direct calls update instead of allocating ftrace_ops object for each trampoline. With single ftrace_ops object we can use update_ftrace_direct_* api that allows multiple ip sites updates on single ftrace_ops object. Adding HAVE_SINGLE_FTRACE_DIRECT_OPS config option to be enabled on each arch that supports this. At the moment we can enable this only on x86 arch, because arm relies on ftrace_ops object representing just single trampoline image (stored in ftrace_ops::direct_call). Archs that do not support this will continue to use *_ftrace_direct api. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20251230145010.103439-10-jolsa@kernel.org --- arch/x86/Kconfig | 1 + kernel/bpf/trampoline.c | 220 ++++++++++++++++++++++++++++++++++------ kernel/trace/Kconfig | 3 + kernel/trace/ftrace.c | 7 +- 4 files changed, 200 insertions(+), 31 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 80527299f859..53bf2cf7ff6f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -336,6 +336,7 @@ config X86 select SCHED_SMT if SMP select ARCH_SUPPORTS_SCHED_CLUSTER if SMP select ARCH_SUPPORTS_SCHED_MC if SMP + select HAVE_SINGLE_FTRACE_DIRECT_OPS if X86_64 && DYNAMIC_FTRACE_WITH_DIRECT_CALLS config INSTRUCTION_DECODER def_bool y diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 64556aa0007b..952cd7932461 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -33,12 +33,40 @@ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS +static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) +{ + struct hlist_head *head_ip; + struct bpf_trampoline *tr; + + mutex_lock(&trampoline_mutex); + head_ip = &trampoline_ip_table[hash_64(ip, TRAMPOLINE_HASH_BITS)]; + hlist_for_each_entry(tr, head_ip, hlist_ip) { + if (tr->ip == ip) + goto out; + } + tr = NULL; +out: + mutex_unlock(&trampoline_mutex); + return tr; +} +#else +static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) +{ + return ops->private; +} +#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ + static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, enum ftrace_ops_cmd cmd) { - struct bpf_trampoline *tr = ops->private; + struct bpf_trampoline *tr; int ret = 0; + tr = direct_ops_ip_lookup(ops, ip); + if (!tr) + return -EINVAL; + if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so * tr->mutex is already locked. @@ -144,6 +172,162 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym) PAGE_SIZE, true, ksym->name); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS +/* + * We have only single direct_ops which contains all the direct call + * sites and is the only global ftrace_ops for all trampolines. + * + * We use 'update_ftrace_direct_*' api for attachment. + */ +struct ftrace_ops direct_ops = { + .ops_func = bpf_tramp_ftrace_ops_func, +}; + +static int direct_ops_alloc(struct bpf_trampoline *tr) +{ + tr->fops = &direct_ops; + return 0; +} + +static void direct_ops_free(struct bpf_trampoline *tr) { } + +static struct ftrace_hash *hash_from_ip(struct bpf_trampoline *tr, void *ptr) +{ + unsigned long ip, addr = (unsigned long) ptr; + struct ftrace_hash *hash; + + ip = ftrace_location(tr->ip); + if (!ip) + return NULL; + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + if (!hash) + return NULL; + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + if (!add_ftrace_hash_entry_direct(hash, ip, addr)) { + free_ftrace_hash(hash); + return NULL; + } + return hash; +} + +static int direct_ops_add(struct bpf_trampoline *tr, void *addr) +{ + struct ftrace_hash *hash = hash_from_ip(tr, addr); + int err; + + if (!hash) + return -ENOMEM; + err = update_ftrace_direct_add(tr->fops, hash); + free_ftrace_hash(hash); + return err; +} + +static int direct_ops_del(struct bpf_trampoline *tr, void *addr) +{ + struct ftrace_hash *hash = hash_from_ip(tr, addr); + int err; + + if (!hash) + return -ENOMEM; + err = update_ftrace_direct_del(tr->fops, hash); + free_ftrace_hash(hash); + return err; +} + +static int direct_ops_mod(struct bpf_trampoline *tr, void *addr, bool lock_direct_mutex) +{ + struct ftrace_hash *hash = hash_from_ip(tr, addr); + int err; + + if (!hash) + return -ENOMEM; + err = update_ftrace_direct_mod(tr->fops, hash, lock_direct_mutex); + free_ftrace_hash(hash); + return err; +} +#else +/* + * We allocate ftrace_ops object for each trampoline and it contains + * call site specific for that trampoline. + * + * We use *_ftrace_direct api for attachment. + */ +static int direct_ops_alloc(struct bpf_trampoline *tr) +{ + tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); + if (!tr->fops) + return -ENOMEM; + tr->fops->private = tr; + tr->fops->ops_func = bpf_tramp_ftrace_ops_func; + return 0; +} + +static void direct_ops_free(struct bpf_trampoline *tr) +{ + if (!tr->fops) + return; + ftrace_free_filter(tr->fops); + kfree(tr->fops); +} + +static int direct_ops_add(struct bpf_trampoline *tr, void *ptr) +{ + unsigned long addr = (unsigned long) ptr; + struct ftrace_ops *ops = tr->fops; + int ret; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ret = ftrace_set_filter_ip(ops, tr->ip, 0, 1); + if (ret) + return ret; + return register_ftrace_direct(ops, addr); +} + +static int direct_ops_del(struct bpf_trampoline *tr, void *addr) +{ + return unregister_ftrace_direct(tr->fops, (long)addr, false); +} + +static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex) +{ + unsigned long addr = (unsigned long) ptr; + struct ftrace_ops *ops = tr->fops; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + if (lock_direct_mutex) + return modify_ftrace_direct(ops, addr); + return modify_ftrace_direct_nolock(ops, addr); +} +#endif /* CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ +#else +static void direct_ops_free(struct bpf_trampoline *tr) { } + +static int direct_ops_alloc(struct bpf_trampoline *tr) +{ + return 0; +} + +static int direct_ops_add(struct bpf_trampoline *tr, void *addr) +{ + return -ENODEV; +} + +static int direct_ops_del(struct bpf_trampoline *tr, void *addr) +{ + return -ENODEV; +} + +static int direct_ops_mod(struct bpf_trampoline *tr, void *ptr, bool lock_direct_mutex) +{ + return -ENODEV; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) { struct bpf_trampoline *tr; @@ -161,16 +345,11 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS - tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); - if (!tr->fops) { + if (direct_ops_alloc(tr)) { kfree(tr); tr = NULL; goto out; } - tr->fops->private = tr; - tr->fops->ops_func = bpf_tramp_ftrace_ops_func; -#endif tr->key = key; tr->ip = ftrace_location(ip); @@ -213,7 +392,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, int ret; if (tr->func.ftrace_managed) - ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); + ret = direct_ops_del(tr, old_addr); else ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); @@ -227,15 +406,7 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, int ret; if (tr->func.ftrace_managed) { - unsigned long addr = (unsigned long) new_addr; - - if (bpf_trampoline_use_jmp(tr->flags)) - addr = ftrace_jmp_set(addr); - - if (lock_direct_mutex) - ret = modify_ftrace_direct(tr->fops, addr); - else - ret = modify_ftrace_direct_nolock(tr->fops, addr); + ret = direct_ops_mod(tr, new_addr, lock_direct_mutex); } else { ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); @@ -258,15 +429,7 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) } if (tr->func.ftrace_managed) { - unsigned long addr = (unsigned long) new_addr; - - if (bpf_trampoline_use_jmp(tr->flags)) - addr = ftrace_jmp_set(addr); - - ret = ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); - if (ret) - return ret; - ret = register_ftrace_direct(tr->fops, addr); + ret = direct_ops_add(tr, new_addr); } else { ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } @@ -947,10 +1110,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) */ hlist_del(&tr->hlist_key); hlist_del(&tr->hlist_ip); - if (tr->fops) { - ftrace_free_filter(tr->fops); - kfree(tr->fops); - } + direct_ops_free(tr); kfree(tr); out: mutex_unlock(&trampoline_mutex); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index bfa2ec46e075..d7042a09fe46 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -50,6 +50,9 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS bool +config HAVE_SINGLE_FTRACE_DIRECT_OPS + bool + config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS bool diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ee2d7b8a4372..8574932e66b6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2631,8 +2631,13 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { - unsigned long addr = READ_ONCE(ops->direct_call); + unsigned long addr; +#ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS + addr = ftrace_find_rec_direct(ip); +#else + addr = READ_ONCE(ops->direct_call); +#endif if (!addr) return; From b640d556a2b354863a9962747a01f67f31cbf4d8 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 28 Jan 2026 19:05:51 +0000 Subject: [PATCH 194/268] selftests/bpf: Remove xxd util dependency The verification signature header generation requires converting a binary certificate to a C array. Previously this only worked with xxd (part of vim-common package). As xxd may not be available on some systems building selftests, it makes sense to substitute it with more common utils: hexdump, wc, sed to generate equivalent C array output. Tested by generating header with both xxd and hexdump and comparing them. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Tested-by: Alan Maguire Link: https://lore.kernel.org/bpf/20260128190552.242335-1-mykyta.yatsenko5@gmail.com --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 10 ++++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index b8bf51b7a0b0..a3ea98211ea6 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,7 +23,6 @@ test_tcpnotify_user test_libbpf xdping test_cpp -test_progs_verification_cert *.d *.subskel.h *.skel.h diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 2c2f68a171ed..c6bf4dfb1495 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -720,9 +720,12 @@ $(VERIFICATION_CERT) $(PRIVATE_KEY): $(VERIFY_SIG_SETUP) $(Q)mkdir -p $(BUILD_DIR) $(Q)$(VERIFY_SIG_SETUP) genkey $(BUILD_DIR) +# Generates a header with C array declaration, containing test_progs_verification_cert bytes $(VERIFY_SIG_HDR): $(VERIFICATION_CERT) - $(Q)ln -fs $< test_progs_verification_cert && \ - xxd -i test_progs_verification_cert > $@ + $(Q)(echo "unsigned char test_progs_verification_cert[] = {"; \ + hexdump -v -e '12/1 " 0x%02x," "\n"' $< | sed 's/0x ,//g; $$s/,$$//'; \ + echo "};"; \ + echo "unsigned int test_progs_verification_cert_len = $$(wc -c < $<);") > $@ # Define test_progs test runner. TRUNNER_TESTS_DIR := prog_tests @@ -898,8 +901,7 @@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ *.BTF *.BTF_ids *.BTF.base \ no_alu32 cpuv4 bpf_gcc \ liburandom_read.so) \ - $(OUTPUT)/FEATURE-DUMP.selftests \ - test_progs_verification_cert + $(OUTPUT)/FEATURE-DUMP.selftests .PHONY: docs docs-clean From 08a7491843224f8b96518fbe70d9e48163046054 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 28 Jan 2026 13:12:55 -0800 Subject: [PATCH 195/268] bpftool: Fix dependencies for static build When building selftests/bpf with EXTRA_LDFLAGS=-static the follwoing error happens: LINK /ws/linux/tools/testing/selftests/bpf/tools/build/bpftool/bootstrap/bpftool /usr/bin/x86_64-linux-gnu-ld.bfd: /usr/lib/gcc/x86_64-linux-gnu/15/../../../x86_64-linux-gnu/libcrypto.a(libcrypto-lib-dso_dlfcn.o): in function `dlfcn_globallookup': [...] /usr/bin/x86_64-linux-gnu-ld.bfd: /usr/lib/gcc/x86_64-linux-gnu/15/../../../x86_64-linux-gnu/libcrypto.a(libcrypto-lib-c_zlib.o): in function `zlib_oneshot_expand_block': (.text+0xc64): undefined reference to `uncompress' /usr/bin/x86_64-linux-gnu-ld.bfd: /usr/lib/gcc/x86_64-linux-gnu/15/../../../x86_64-linux-gnu/libcrypto.a(libcrypto-lib-c_zlib.o): in function `zlib_oneshot_compress_block': (.text+0xce4): undefined reference to `compress' collect2: error: ld returned 1 exit status make[1]: *** [Makefile:252: /ws/linux/tools/testing/selftests/bpf/tools/build/bpftool/bootstrap/bpftool] Error 1 make: *** [Makefile:327: /ws/linux/tools/testing/selftests/bpf/tools/sbin/bpftool] Error 2 make: *** Waiting for unfinished jobs.... This is caused by wrong order of dependencies in the Makefile. Fix it. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260128211255.376933-1-ihor.solodrai@linux.dev --- tools/bpf/bpftool/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 5442073a2e42..519ea5cb8ab1 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -130,8 +130,8 @@ include $(FEATURES_DUMP) endif endif -LIBS = $(LIBBPF) -lelf -lz -lcrypto -LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lz -lcrypto +LIBS = $(LIBBPF) -lelf -lcrypto -lz +LIBS_BOOTSTRAP = $(LIBBPF_BOOTSTRAP) -lelf -lcrypto -lz ifeq ($(feature-libelf-zstd),1) LIBS += -lzstd From cd3b6a3d49f8061d0c4c7e4226783051fe592ae7 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 27 Jan 2026 12:59:11 +0100 Subject: [PATCH 196/268] bpf: Fix verifier_bug_if to account for BPF_CALL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BPF verifier assumes `insn_aux->nospec_result` is only set for direct memory writes (e.g., `*(u32*)(r1+off) = r2`). However, the assertion fails to account for helper calls (e.g., `bpf_skb_load_bytes_relative`) that perform writes to stack memory. Make the check more precise to resolve this. The problem is that `BPF_CALL` instructions have `BPF_CLASS(insn->code) == BPF_JMP`, which triggers the warning check: - Helpers like `bpf_skb_load_bytes_relative` write to stack memory - `check_helper_call()` loops through `meta.access_size`, calling `check_mem_access(..., BPF_WRITE)` - `check_stack_write()` sets `insn_aux->nospec_result = 1` - Since `BPF_CALL` is encoded as `BPF_JMP | BPF_CALL`, the warning fires Execution flow: ``` 1. Drop capabilities → Enable Spectre mitigation 2. Load BPF program └─> do_check() ├─> check_cond_jmp_op() → Marks dead branch as speculative │ └─> push_stack(..., speculative=true) ├─> pop_stack() → state->speculative = 1 ├─> check_helper_call() → Processes helper in dead branch │ └─> check_mem_access(..., BPF_WRITE) │ └─> insn_aux->nospec_result = 1 └─> Checks: state->speculative && insn_aux->nospec_result └─> BPF_CLASS(insn->code) == BPF_JMP → WARNING ``` To fix the assert, it would be nice to be able to reuse bpf_insn_successors() here, but bpf_insn_successors()->cnt is not exactly what we want as it may also be 1 for BPF_JA. Instead, we could check opcode_info.can_jump, but then we would have to share the table between the functions. This would mean moving the table out of the function and adding bpf_opcode_info(). As the verifier_bug_if() only runs for insns with nospec_result set, the impact on verification time would likely still be negligible. However, I assume sharing bpf_opcode_info() between liveness.c and verifier.c will not be worth it. It seems as only adjust_jmp_off() could also be simplified using it, and there imm/off is touched. Thus it is maybe better to rely on exact opcode/class matching there. Therefore, to avoid this sharing only for a verifier_bug_if(), just check the opcode. This should now cover all opcodes for which can_jump in bpf_insn_successors() is true. Parts of the description and example are taken from the bug report. Fixes: dadb59104c64 ("bpf: Fix aux usage after do_check_insn()") Signed-off-by: Luis Gerhorst Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Reported-by: Dongliang Mu Closes: https://lore.kernel.org/bpf/7678017d-b760-4053-a2d8-a6879b0dbeeb@hust.edu.cn/ Link: https://lore.kernel.org/r/20260127115912.3026761-2-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c2f2650db9fd..e7ff8394e0da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21065,17 +21065,19 @@ static int do_check(struct bpf_verifier_env *env) * may skip a nospec patched-in after the jump. This can * currently never happen because nospec_result is only * used for the write-ops - * `*(size*)(dst_reg+off)=src_reg|imm32` which must - * never skip the following insn. Still, add a warning - * to document this in case nospec_result is used - * elsewhere in the future. + * `*(size*)(dst_reg+off)=src_reg|imm32` and helper + * calls. These must never skip the following insn + * (i.e., bpf_insn_successors()'s opcode_info.can_jump + * is false). Still, add a warning to document this in + * case nospec_result is used elsewhere in the future. * * All non-branch instructions have a single * fall-through edge. For these, nospec_result should * already work. */ - if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP || - BPF_CLASS(insn->code) == BPF_JMP32, env, + if (verifier_bug_if((BPF_CLASS(insn->code) == BPF_JMP || + BPF_CLASS(insn->code) == BPF_JMP32) && + BPF_OP(insn->code) != BPF_CALL, env, "speculation barrier after jump instruction may not have the desired effect")) return -EFAULT; process_bpf_exit: From 60d2c438c1bb705cdbf74ce8f12e6e141a4719b0 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 27 Jan 2026 12:59:12 +0100 Subject: [PATCH 197/268] bpf: Test nospec after dead stack write in helper Without the fix from the previous commit, the selftest fails: $ ./tools/testing/selftests/bpf/vmtest.sh -- \ ./test_progs -t verifier_unpriv [...] run_subtest:PASS:obj_open_mem 0 nsec libbpf: BTF loading error: -EPERM libbpf: Error loading .BTF into kernel: -EPERM. BTF is optional, ignoring. libbpf: prog 'unpriv_nospec_after_helper_stack_write': BPF program load failed: -EFAULT libbpf: prog 'unpriv_nospec_after_helper_stack_write': failed to load: -EFAULT libbpf: failed to load object 'verifier_unpriv' run_subtest:FAIL:unexpected_load_failure unexpected error: -14 (errno 14) VERIFIER LOG: ============= 0: R1=ctx() R10=fp0 0: (b7) r0 = 0 ; R0=P0 1: (55) if r0 != 0x1 goto pc+6 2: R0=Pscalar() R1=ctx() R10=fp0 2: (b7) r2 = 0 ; R2=P0 3: (bf) r3 = r10 ; R3=fp0 R10=fp0 4: (07) r3 += -16 ; R3=fp-16 5: (b7) r4 = 4 ; R4=P4 6: (b7) r5 = 0 ; R5=P0 7: (85) call bpf_skb_load_bytes_relative#68 verifier bug: speculation barrier after jump instruction may not have the desired effect (BPF_CLASS(insn->code) == BPF_JMP || BPF_CLASS(insn->code) == BPF_JMP32) processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 ============= [...] The test is based on the PoC from the report. Signed-off-by: Luis Gerhorst Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Reported-by: Dongliang Mu Link: https://lore.kernel.org/bpf/7678017d-b760-4053-a2d8-a6879b0dbeeb@hust.edu.cn/ Link: https://lore.kernel.org/r/20260127115912.3026761-3-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_unpriv.c | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_unpriv.c b/tools/testing/selftests/bpf/progs/verifier_unpriv.c index 28b4f7035ceb..8ee1243e62a8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_unpriv.c +++ b/tools/testing/selftests/bpf/progs/verifier_unpriv.c @@ -950,4 +950,26 @@ l3_%=: r0 = 0; \ " ::: __clobber_all); } +SEC("socket") +__description("unpriv: nospec after dead stack write in helper") +__success __success_unpriv +__retval(0) +/* Dead code sanitizer rewrites the call to `goto -1`. */ +__naked void unpriv_dead_helper_stack_write_nospec_result(void) +{ + asm volatile (" \ + r0 = 0; \ + if r0 != 1 goto l0_%=; \ + r2 = 0; \ + r3 = r10; \ + r3 += -16; \ + r4 = 4; \ + r5 = 0; \ + call %[bpf_skb_load_bytes_relative]; \ +l0_%=: exit; \ +" : + : __imm(bpf_skb_load_bytes_relative) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From 0f0c332992b8a5d2ae7b611b94c4e02ef8d54b97 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 30 Jan 2026 09:12:07 +0100 Subject: [PATCH 198/268] bpf: Allow sleepable programs to use tail calls Allowing sleepable programs to use tail calls. Making sure we can't mix sleepable and non-sleepable bpf programs in tail call map (BPF_MAP_TYPE_PROG_ARRAY) and allowing it to be used in sleepable programs. Sleepable programs can be preempted and sleep which might bring new source of race conditions, but both direct and indirect tail calls should not be affected. Direct tail calls work by patching direct jump to callee into bpf caller program, so no problem there. We atomically switch from nop to jump instruction. Indirect tail call reads the callee from the map and then jumps to it. The callee bpf program can't disappear (be released) from the caller, because it is executed under rcu lock (rcu_read_lock_trace). Signed-off-by: Jiri Olsa Acked-by: Leon Hwang Link: https://lore.kernel.org/r/20260130081208.1130204-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 4 +++- kernel/bpf/verifier.c | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5524f9429e76..3b0ceb759075 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -287,6 +287,7 @@ struct bpf_map_owner { enum bpf_prog_type type; bool jited; bool xdp_has_frags; + bool sleepable; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; enum bpf_attach_type expected_attach_type; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e0b8a8a5aaa9..5ebece600aeb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2401,6 +2401,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->sleepable = fp->sleepable; map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { @@ -2412,7 +2413,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, } else { ret = map->owner->type == prog_type && map->owner->jited == fp->jited && - map->owner->xdp_has_frags == aux->xdp_has_frags; + map->owner->xdp_has_frags == aux->xdp_has_frags && + map->owner->sleepable == fp->sleepable; if (ret && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && map->owner->expected_attach_type != fp->expected_attach_type) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7ff8394e0da..f185ebc6748d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21386,6 +21386,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_ARENA: case BPF_MAP_TYPE_INSN_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: break; default: verbose(env, From 15ac1adf0f84a90605121fbe4a6238b24c865f92 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 30 Jan 2026 09:12:08 +0100 Subject: [PATCH 199/268] selftests/bpf: Add test for sleepable program tailcalls Adding test that makes sure we can't mix sleepable and non-sleepable bpf programs in the BPF_MAP_TYPE_PROG_ARRAY map and that we can do tail call in the sleepable program. Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20260130081208.1130204-3-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/tailcalls.c | 74 +++++++++++++++++++ .../selftests/bpf/progs/tailcall_sleepable.c | 43 +++++++++++ 2 files changed, 117 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/tailcall_sleepable.c diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c index 0ab36503c3b2..7d534fde0af9 100644 --- a/tools/testing/selftests/bpf/prog_tests/tailcalls.c +++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c @@ -8,6 +8,7 @@ #include "tailcall_freplace.skel.h" #include "tc_bpf2bpf.skel.h" #include "tailcall_fail.skel.h" +#include "tailcall_sleepable.skel.h" /* test_tailcall_1 checks basic functionality by patching multiple locations * in a single program for a single tail call slot with nop->jmp, jmp->nop @@ -1653,6 +1654,77 @@ static void test_tailcall_failure() RUN_TESTS(tailcall_fail); } +noinline void uprobe_sleepable_trigger(void) +{ + asm volatile (""); +} + +static void test_tailcall_sleepable(void) +{ + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct tailcall_sleepable *skel; + int prog_fd, map_fd; + int err, key; + + skel = tailcall_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open")) + return; + + /* + * Test that we can't load uprobe_normal and uprobe_sleepable_1, + * because they share tailcall map. + */ + bpf_program__set_autoload(skel->progs.uprobe_normal, true); + bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true); + + err = tailcall_sleepable__load(skel); + if (!ASSERT_ERR(err, "tailcall_sleepable__load")) + goto out; + + tailcall_sleepable__destroy(skel); + + /* + * Test that we can tail call from sleepable to sleepable program. + */ + skel = tailcall_sleepable__open(); + if (!ASSERT_OK_PTR(skel, "tailcall_sleepable__open")) + return; + + bpf_program__set_autoload(skel->progs.uprobe_sleepable_1, true); + bpf_program__set_autoload(skel->progs.uprobe_sleepable_2, true); + + err = tailcall_sleepable__load(skel); + if (!ASSERT_OK(err, "tailcall_sleepable__load")) + goto out; + + /* Add sleepable uprobe_sleepable_2 to jmp_table[0]. */ + key = 0; + prog_fd = bpf_program__fd(skel->progs.uprobe_sleepable_2); + map_fd = bpf_map__fd(skel->maps.jmp_table); + err = bpf_map_update_elem(map_fd, &key, &prog_fd, BPF_ANY); + if (!ASSERT_OK(err, "update jmp_table")) + goto out; + + skel->bss->my_pid = getpid(); + + /* Attach uprobe_sleepable_1 to uprobe_sleepable_trigger and hit it. */ + opts.func_name = "uprobe_sleepable_trigger"; + skel->links.uprobe_sleepable_1 = bpf_program__attach_uprobe_opts( + skel->progs.uprobe_sleepable_1, + -1, + "/proc/self/exe", + 0 /* offset */, + &opts); + if (!ASSERT_OK_PTR(skel->links.uprobe_sleepable_1, "bpf_program__attach_uprobe_opts")) + goto out; + + uprobe_sleepable_trigger(); + ASSERT_EQ(skel->bss->executed, 1, "executed"); + +out: + tailcall_sleepable__destroy(skel); +} + void test_tailcalls(void) { if (test__start_subtest("tailcall_1")) @@ -1707,4 +1779,6 @@ void test_tailcalls(void) test_tailcall_bpf2bpf_freplace(); if (test__start_subtest("tailcall_failure")) test_tailcall_failure(); + if (test__start_subtest("tailcall_sleepable")) + test_tailcall_sleepable(); } diff --git a/tools/testing/selftests/bpf/progs/tailcall_sleepable.c b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c new file mode 100644 index 000000000000..d959a9eaaa9c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/tailcall_sleepable.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include "bpf_misc.h" +#include "bpf_test_utils.h" + +struct { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __uint(max_entries, 1); + __uint(key_size, sizeof(__u32)); + __array(values, void (void)); +} jmp_table SEC(".maps"); + +SEC("?uprobe") +int uprobe_normal(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +SEC("?uprobe.s") +int uprobe_sleepable_1(void *ctx) +{ + bpf_tail_call_static(ctx, &jmp_table, 0); + return 0; +} + +int executed = 0; +int my_pid = 0; + +SEC("?uprobe.s") +int uprobe_sleepable_2(void *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + + if (pid != my_pid) + return 0; + + executed++; + return 0; +} + +char __license[] SEC("license") = "GPL"; From cd77618c418254b827f2a807b4c27b97088fdb52 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Fri, 30 Jan 2026 11:18:43 +0900 Subject: [PATCH 200/268] selftests/bpf: Make bpf get_preempt_count() work for v6.14+ kernels Recent x86 kernels export __preempt_count as a ksym, while some old kernels between v6.1 and v6.14 expose the preemption counter via pcpu_hot.preempt_count. The existing selftest helper unconditionally dereferenced __preempt_count, which breaks BPF program loading on such old kernels. Make the x86 preemption count lookup version-agnostic by: - Marking __preempt_count and pcpu_hot as weak ksyms. - Introducing a BTF-described pcpu_hot___local layout with preserve_access_index. - Selecting the appropriate access path at runtime using ksym availability and bpf_ksym_exists() and bpf_core_field_exists(). This allows a single BPF binary to run correctly across kernel versions (e.g., v6.18 vs. v6.13) without relying on compile-time version checks. Signed-off-by: Changwoo Min Link: https://lore.kernel.org/r/20260130021843.154885-1-changwoo@igalia.com Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/bpf_experimental.h | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index a39576c8ba04..4b7210c318dd 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -614,7 +614,13 @@ extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str, extern bool CONFIG_PREEMPT_RT __kconfig __weak; #ifdef bpf_target_x86 -extern const int __preempt_count __ksym; +extern const int __preempt_count __ksym __weak; + +struct pcpu_hot___local { + int preempt_count; +} __attribute__((preserve_access_index)); + +extern struct pcpu_hot___local pcpu_hot __ksym __weak; #endif struct task_struct___preempt_rt { @@ -624,7 +630,19 @@ struct task_struct___preempt_rt { static inline int get_preempt_count(void) { #if defined(bpf_target_x86) - return *(int *) bpf_this_cpu_ptr(&__preempt_count); + /* By default, read the per-CPU __preempt_count. */ + if (bpf_ksym_exists(&__preempt_count)) + return *(int *) bpf_this_cpu_ptr(&__preempt_count); + + /* + * If __preempt_count does not exist, try to read preempt_count under + * struct pcpu_hot. Between v6.1 and v6.14 -- more specifically, + * [64701838bf057, 46e8fff6d45fe), preempt_count had been managed + * under struct pcpu_hot. + */ + if (bpf_core_field_exists(pcpu_hot.preempt_count)) + return ((struct pcpu_hot___local *) + bpf_this_cpu_ptr(&pcpu_hot))->preempt_count; #elif defined(bpf_target_arm64) return bpf_get_current_task_btf()->thread_info.preempt.count; #endif From 8bc11700e0d23d4fdb7d8d5a73b2e95de427cabc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:32 +0100 Subject: [PATCH 201/268] x86/fgraph: Fix return_to_handler regs.rsp value The previous change (Fixes commit) messed up the rsp register value, which is wrong because it's already adjusted with FRAME_SIZE, we need the original rsp value. This change does not affect fprobe current kernel unwind, the !perf_hw_regs path perf_callchain_kernel: if (perf_hw_regs(regs)) { if (perf_callchain_store(entry, regs->ip)) return; unwind_start(&state, current, regs, NULL); } else { unwind_start(&state, current, NULL, (void *)regs->sp); } which uses pt_regs.sp as first_frame boundary (FRAME_SIZE shift makes no difference, unwind stil stops at the right frame). This change fixes the other path when we want to unwind directly from pt_regs sp/fp/ip state, which is coming in following change. Fixes: 20a0bc10272f ("x86/fgraph,bpf: Fix stack ORC unwind from kprobe_multi return probe") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20260126211837.472802-2-jolsa@kernel.org --- arch/x86/kernel/ftrace_64.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index a132608265f6..62c1c93aa1c6 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -364,6 +364,9 @@ SYM_CODE_START(return_to_handler) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR + /* Store original rsp for pt_regs.sp value. */ + movq %rsp, %rdi + /* Restore return_to_handler value that got eaten by previous ret instruction. */ subq $8, %rsp UNWIND_HINT_FUNC @@ -374,7 +377,7 @@ SYM_CODE_START(return_to_handler) movq %rax, RAX(%rsp) movq %rdx, RDX(%rsp) movq %rbp, RBP(%rsp) - movq %rsp, RSP(%rsp) + movq %rdi, RSP(%rsp) movq %rsp, %rdi call ftrace_return_to_handler From aea251799998aa1b78eacdfb308f18ea114ea5b3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:33 +0100 Subject: [PATCH 202/268] x86/fgraph,bpf: Switch kprobe_multi program stack unwind to hw_regs path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mahe reported missing function from stack trace on top of kprobe multi program. The missing function is the very first one in the stacktrace, the one that the bpf program is attached to. # bpftrace -e 'kprobe:__x64_sys_newuname* { print(kstack)}' Attaching 1 probe... do_syscall_64+134 entry_SYSCALL_64_after_hwframe+118 ('*' is used for kprobe_multi attachment) The reason is that the previous change (the Fixes commit) fixed stack unwind for tracepoint, but removed attached function address from the stack trace on top of kprobe multi programs, which I also overlooked in the related test (check following patch). The tracepoint and kprobe_multi have different stack setup, but use same unwind path. I think it's better to keep the previous change, which fixed tracepoint unwind and instead change the kprobe multi unwind as explained below. The bpf program stack unwind calls perf_callchain_kernel for kernel portion and it follows two unwind paths based on X86_EFLAGS_FIXED bit in pt_regs.flags. When the bit set we unwind from stack represented by pt_regs argument, otherwise we unwind currently executed stack up to 'first_frame' boundary. The 'first_frame' value is taken from regs.rsp value, but ftrace_caller and ftrace_regs_caller (ftrace trampoline) functions set the regs.rsp to the previous stack frame, so we skip the attached function entry. If we switch kprobe_multi unwind to use the X86_EFLAGS_FIXED bit, we set the start of the unwind to the attached function address. As another benefit we also cut extra unwind cycles needed to reach the 'first_frame' boundary. The speedup can be measured with trigger bench for kprobe_multi program and stacktrace support. - trigger bench with stacktrace on current code: kprobe-multi : 0.810 ± 0.001M/s kretprobe-multi: 0.808 ± 0.001M/s - and with the fix: kprobe-multi : 1.264 ± 0.001M/s kretprobe-multi: 1.401 ± 0.002M/s With the fix, the entry probe stacktrace: # bpftrace -e 'kprobe:__x64_sys_newuname* { print(kstack)}' Attaching 1 probe... __x64_sys_newuname+9 do_syscall_64+134 entry_SYSCALL_64_after_hwframe+118 The return probe skips the attached function, because it's no longer on the stack at the point of the unwind and this way is the same how standard kretprobe works. # bpftrace -e 'kretprobe:__x64_sys_newuname* { print(kstack)}' Attaching 1 probe... do_syscall_64+134 entry_SYSCALL_64_after_hwframe+118 Fixes: 6d08340d1e35 ("Revert "perf/x86: Always store regs->ip in perf_callchain_kernel()"") Reported-by: Mahe Tardy Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Steven Rostedt (Google) Link: https://lore.kernel.org/bpf/20260126211837.472802-3-jolsa@kernel.org --- arch/x86/include/asm/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b08c95872eed..c56e1e63b893 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -57,7 +57,7 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) } #define arch_ftrace_partial_regs(regs) do { \ - regs->flags &= ~X86_EFLAGS_FIXED; \ + regs->flags |= X86_EFLAGS_FIXED; \ regs->cs = __KERNEL_CS; \ } while (0) From 0207f94971e72a13380e28022c86da147e8e090f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:34 +0100 Subject: [PATCH 203/268] selftests/bpf: Fix kprobe multi stacktrace_ips test We now include the attached function in the stack trace, fixing the test accordingly. Fixes: c9e208fa93cd ("selftests/bpf: Add stacktrace ips test for kprobe_multi/kretprobe_multi") Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-4-jolsa@kernel.org --- .../selftests/bpf/prog_tests/stacktrace_ips.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index c9efdd2a5b18..c93718dafd9b 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -74,11 +74,20 @@ static void test_stacktrace_ips_kprobe_multi(bool retprobe) load_kallsyms(); - check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, - ksym_get_addr("bpf_testmod_stacktrace_test_3"), - ksym_get_addr("bpf_testmod_stacktrace_test_2"), - ksym_get_addr("bpf_testmod_stacktrace_test_1"), - ksym_get_addr("bpf_testmod_test_read")); + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } cleanup: stacktrace_ips__destroy(skel); From 7373f97e868ad01fc73de8e7b71834eeba25d4f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:35 +0100 Subject: [PATCH 204/268] selftests/bpf: Add stacktrace ips test for kprobe/kretprobe Adding test that attaches kprobe/kretprobe and verifies the ORC stacktrace matches expected functions. The test is only for ORC unwinder to keep it simple. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-5-jolsa@kernel.org --- .../selftests/bpf/prog_tests/stacktrace_ips.c | 50 +++++++++++++++++++ .../selftests/bpf/progs/stacktrace_ips.c | 7 +++ 2 files changed, 57 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index c93718dafd9b..852830536109 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -137,6 +137,52 @@ static void test_stacktrace_ips_raw_tp(void) stacktrace_ips__destroy(skel); } +static void test_stacktrace_ips_kprobe(bool retprobe) +{ + LIBBPF_OPTS(bpf_kprobe_opts, opts, + .retprobe = retprobe + ); + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + skel->links.kprobe_test = bpf_program__attach_kprobe_opts( + skel->progs.kprobe_test, + "bpf_testmod_stacktrace_test", &opts); + if (!ASSERT_OK_PTR(skel->links.kprobe_test, "bpf_program__attach_kprobe_opts")) + goto cleanup; + + trigger_module_test_read(1); + + load_kallsyms(); + + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } + +cleanup: + stacktrace_ips__destroy(skel); +} + static void __test_stacktrace_ips(void) { if (test__start_subtest("kprobe_multi")) @@ -145,6 +191,10 @@ static void __test_stacktrace_ips(void) test_stacktrace_ips_kprobe_multi(true); if (test__start_subtest("raw_tp")) test_stacktrace_ips_raw_tp(); + if (test__start_subtest("kprobe")) + test_stacktrace_ips_kprobe(false); + if (test__start_subtest("kretprobe")) + test_stacktrace_ips_kprobe(true); } #else static void __test_stacktrace_ips(void) diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c index a96c8150d7f5..cae077a4061b 100644 --- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -31,6 +31,13 @@ int unused(void) __u32 stack_key; +SEC("kprobe") +int kprobe_test(struct pt_regs *ctx) +{ + stack_key = bpf_get_stackid(ctx, &stackmap, 0); + return 0; +} + SEC("kprobe.multi") int kprobe_multi_test(struct pt_regs *ctx) { From e5d532be4a3b0d1f0a9210c0da2c04a6b4605904 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:36 +0100 Subject: [PATCH 205/268] selftests/bpf: Add stacktrace ips test for fentry/fexit Adding test that attaches fentry/fexitand verifies the ORC stacktrace matches expected functions. The test is only for ORC unwinder to keep it simple. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-6-jolsa@kernel.org --- .../selftests/bpf/prog_tests/stacktrace_ips.c | 51 +++++++++++++++++++ .../selftests/bpf/progs/stacktrace_ips.c | 20 ++++++++ 2 files changed, 71 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c index 852830536109..da42b00e3d1f 100644 --- a/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_ips.c @@ -183,6 +183,53 @@ static void test_stacktrace_ips_kprobe(bool retprobe) stacktrace_ips__destroy(skel); } +static void test_stacktrace_ips_trampoline(bool retprobe) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct stacktrace_ips *skel; + + skel = stacktrace_ips__open_and_load(); + if (!ASSERT_OK_PTR(skel, "stacktrace_ips__open_and_load")) + return; + + if (!skel->kconfig->CONFIG_UNWINDER_ORC) { + test__skip(); + goto cleanup; + } + + if (retprobe) { + skel->links.fexit_test = bpf_program__attach_trace(skel->progs.fexit_test); + if (!ASSERT_OK_PTR(skel->links.fexit_test, "bpf_program__attach_trace")) + goto cleanup; + } else { + skel->links.fentry_test = bpf_program__attach_trace(skel->progs.fentry_test); + if (!ASSERT_OK_PTR(skel->links.fentry_test, "bpf_program__attach_trace")) + goto cleanup; + } + + trigger_module_test_read(1); + + load_kallsyms(); + + if (retprobe) { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 4, + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } else { + check_stacktrace_ips(bpf_map__fd(skel->maps.stackmap), skel->bss->stack_key, 5, + ksym_get_addr("bpf_testmod_stacktrace_test"), + ksym_get_addr("bpf_testmod_stacktrace_test_3"), + ksym_get_addr("bpf_testmod_stacktrace_test_2"), + ksym_get_addr("bpf_testmod_stacktrace_test_1"), + ksym_get_addr("bpf_testmod_test_read")); + } + +cleanup: + stacktrace_ips__destroy(skel); +} + static void __test_stacktrace_ips(void) { if (test__start_subtest("kprobe_multi")) @@ -195,6 +242,10 @@ static void __test_stacktrace_ips(void) test_stacktrace_ips_kprobe(false); if (test__start_subtest("kretprobe")) test_stacktrace_ips_kprobe(true); + if (test__start_subtest("fentry")) + test_stacktrace_ips_trampoline(false); + if (test__start_subtest("fexit")) + test_stacktrace_ips_trampoline(true); } #else static void __test_stacktrace_ips(void) diff --git a/tools/testing/selftests/bpf/progs/stacktrace_ips.c b/tools/testing/selftests/bpf/progs/stacktrace_ips.c index cae077a4061b..6830f2978613 100644 --- a/tools/testing/selftests/bpf/progs/stacktrace_ips.c +++ b/tools/testing/selftests/bpf/progs/stacktrace_ips.c @@ -53,4 +53,24 @@ int rawtp_test(void *ctx) return 0; } +SEC("fentry/bpf_testmod_stacktrace_test") +int fentry_test(struct pt_regs *ctx) +{ + /* + * Skip 2 bpf_program/trampoline stack entries: + * - bpf_prog_bd1f7a949f55fb03_fentry_test + * - bpf_trampoline_182536277701 + */ + stack_key = bpf_get_stackid(ctx, &stackmap, 2); + return 0; +} + +SEC("fexit/bpf_testmod_stacktrace_test") +int fexit_test(struct pt_regs *ctx) +{ + /* Skip 2 bpf_program/trampoline stack entries, check fentry_test. */ + stack_key = bpf_get_stackid(ctx, &stackmap, 2); + return 0; +} + char _license[] SEC("license") = "GPL"; From 4173b494d93a8057d3ed23e65853cd76b647f870 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Jan 2026 22:18:37 +0100 Subject: [PATCH 206/268] selftests/bpf: Allow to benchmark trigger with stacktrace Adding support to call bpf_get_stackid helper from trigger programs, so far added for kprobe multi. Adding the --stacktrace/-g option to enable it. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260126211837.472802-7-jolsa@kernel.org --- tools/testing/selftests/bpf/bench.c | 4 ++ tools/testing/selftests/bpf/bench.h | 1 + .../selftests/bpf/benchs/bench_trigger.c | 1 + .../selftests/bpf/progs/trigger_bench.c | 46 +++++++++++++++---- 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c index bd29bb2e6cb5..8368bd3a0665 100644 --- a/tools/testing/selftests/bpf/bench.c +++ b/tools/testing/selftests/bpf/bench.c @@ -265,6 +265,7 @@ static const struct argp_option opts[] = { { "verbose", 'v', NULL, 0, "Verbose debug output"}, { "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"}, { "quiet", 'q', NULL, 0, "Be more quiet"}, + { "stacktrace", 's', NULL, 0, "Get stack trace"}, { "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0, "Set of CPUs for producer threads; implies --affinity"}, { "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0, @@ -350,6 +351,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) case 'q': env.quiet = true; break; + case 's': + env.stacktrace = true; + break; case ARG_PROD_AFFINITY_SET: env.affinity = true; if (parse_num_list(arg, &env.prod_cpus.cpus, diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h index bea323820ffb..7cf21936e7ed 100644 --- a/tools/testing/selftests/bpf/bench.h +++ b/tools/testing/selftests/bpf/bench.h @@ -26,6 +26,7 @@ struct env { bool list; bool affinity; bool quiet; + bool stacktrace; int consumer_cnt; int producer_cnt; int nr_cpus; diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c index 34018fc3927f..aeec9edd3851 100644 --- a/tools/testing/selftests/bpf/benchs/bench_trigger.c +++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c @@ -146,6 +146,7 @@ static void setup_ctx(void) bpf_program__set_autoload(ctx.skel->progs.trigger_driver, true); ctx.skel->rodata->batch_iters = args.batch_iters; + ctx.skel->rodata->stacktrace = env.stacktrace; } static void load_ctx(void) diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 2898b3749d07..4ea0422d1042 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -25,6 +25,34 @@ static __always_inline void inc_counter(void) __sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1); } +volatile const int stacktrace; + +typedef __u64 stack_trace_t[128]; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, stack_trace_t); +} stack_heap SEC(".maps"); + +static __always_inline void do_stacktrace(void *ctx) +{ + if (!stacktrace) + return; + + __u64 *ptr = bpf_map_lookup_elem(&stack_heap, &(__u32){0}); + + if (ptr) + bpf_get_stack(ctx, ptr, sizeof(stack_trace_t), 0); +} + +static __always_inline void handle(void *ctx) +{ + inc_counter(); + do_stacktrace(ctx); +} + SEC("?uprobe") int bench_trigger_uprobe(void *ctx) { @@ -81,21 +109,21 @@ int trigger_driver_kfunc(void *ctx) SEC("?kprobe/bpf_get_numa_node_id") int bench_trigger_kprobe(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?kretprobe/bpf_get_numa_node_id") int bench_trigger_kretprobe(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?kprobe.multi/bpf_get_numa_node_id") int bench_trigger_kprobe_multi(void *ctx) { - inc_counter(); + handle(ctx); return 0; } @@ -108,7 +136,7 @@ int bench_kprobe_multi_empty(void *ctx) SEC("?kretprobe.multi/bpf_get_numa_node_id") int bench_trigger_kretprobe_multi(void *ctx) { - inc_counter(); + handle(ctx); return 0; } @@ -121,34 +149,34 @@ int bench_kretprobe_multi_empty(void *ctx) SEC("?fentry/bpf_get_numa_node_id") int bench_trigger_fentry(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?fexit/bpf_get_numa_node_id") int bench_trigger_fexit(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?fmod_ret/bpf_modify_return_test_tp") int bench_trigger_fmodret(void *ctx) { - inc_counter(); + handle(ctx); return -22; } SEC("?tp/bpf_test_run/bpf_trigger_tp") int bench_trigger_tp(void *ctx) { - inc_counter(); + handle(ctx); return 0; } SEC("?raw_tp/bpf_trigger_tp") int bench_trigger_rawtp(void *ctx) { - inc_counter(); + handle(ctx); return 0; } From 98c4fd2963cb2bc5eae22b5365e6bbf3a5cd0f14 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 30 Jan 2026 20:42:10 +0000 Subject: [PATCH 207/268] bpf: Introduce struct bpf_map_desc in verifier Introduce struct bpf_map_desc to hold bpf_map pointer and map uid. Use this struct in both bpf_call_arg_meta and bpf_kfunc_call_arg_meta instead of having different representations: - bpf_call_arg_meta had separate map_ptr and map_uid fields - bpf_kfunc_call_arg_meta had an anonymous inline struct This unifies the map fields layout across both metadata structures, making the code more consistent and preparing for further refactoring of map field pointer validation. Acked-by: Eduard Zingerman Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260130-verif_special_fields-v2-1-2c59e637da7d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 79 ++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f185ebc6748d..c79463d50b37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -272,8 +272,13 @@ static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_KFUNC_CALL; } +struct bpf_map_desc { + struct bpf_map *ptr; + int uid; +}; + struct bpf_call_arg_meta { - struct bpf_map *map_ptr; + struct bpf_map_desc map; bool raw_mode; bool pkt_access; u8 release_regno; @@ -283,7 +288,6 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int dynptr_id; - int map_uid; int func_id; struct btf *btf; u32 btf_id; @@ -351,10 +355,7 @@ struct bpf_kfunc_call_arg_meta { u8 spi; u8 frameno; } iter; - struct { - struct bpf_map *ptr; - int uid; - } map; + struct bpf_map_desc map; u64 mem_size; }; @@ -8666,7 +8667,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, if (err) return err; - if (meta->map_ptr) { + if (meta->map.ptr) { verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } @@ -8674,8 +8675,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - meta->map_uid = reg->map_uid; - meta->map_ptr = map; + meta->map.uid = reg->map_uid; + meta->map.ptr = map; return 0; } @@ -8739,7 +8740,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } rec = map_ptr->record; - meta->map_ptr = map_ptr; + meta->map.ptr = map_ptr; } if (!tnum_is_const(reg->var_off)) { @@ -9246,13 +9247,13 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_arg_type *arg_type) { - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ verifier_bug(env, "invalid map_ptr to access map->type"); return -EFAULT; } - switch (meta->map_ptr->map_type) { + switch (meta->map.ptr->map_type) { case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: if (*arg_type == ARG_PTR_TO_MAP_VALUE) { @@ -9906,7 +9907,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, switch (base_type(arg_type)) { case ARG_CONST_MAP_PTR: /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - if (meta->map_ptr) { + if (meta->map.ptr) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -9919,23 +9920,23 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, * * Comparing map_ptr is enough to distinguish normal and outer maps. */ - if (meta->map_ptr != reg->map_ptr || - meta->map_uid != reg->map_uid) { + if (meta->map.ptr != reg->map_ptr || + meta->map.uid != reg->map_uid) { verbose(env, "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", - meta->map_uid, reg->map_uid); + meta->map.uid, reg->map_uid); return -EINVAL; } } - meta->map_ptr = reg->map_ptr; - meta->map_uid = reg->map_uid; + meta->map.ptr = reg->map_ptr; + meta->map.uid = reg->map_uid; break; case ARG_PTR_TO_MAP_KEY: /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized */ - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. Otherwise it means @@ -9944,11 +9945,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, verifier_bug(env, "invalid map_ptr to access map->key"); return -EFAULT; } - key_size = meta->map_ptr->key_size; + key_size = meta->map.ptr->key_size; err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); if (err) return err; - if (can_elide_value_nullness(meta->map_ptr->map_type)) { + if (can_elide_value_nullness(meta->map.ptr->map_type)) { err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) { meta->const_map_key = -1; @@ -9966,13 +9967,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ - if (!meta->map_ptr) { + if (!meta->map.ptr) { /* kernel subsystem misconfigured verifier */ verifier_bug(env, "invalid map_ptr to access map->value"); return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + err = check_helper_mem_access(env, regno, meta->map.ptr->value_size, arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; @@ -11310,7 +11311,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; - struct bpf_map *map = meta->map_ptr; + struct bpf_map *map = meta->map.ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -11343,11 +11344,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, } if (!aux->map_ptr_state.map_ptr) - bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1, false); - else if (aux->map_ptr_state.map_ptr != meta->map_ptr) - bpf_map_ptr_store(aux, meta->map_ptr, - !meta->map_ptr->bypass_spec_v1, true); + bpf_map_ptr_store(aux, meta->map.ptr, + !meta->map.ptr->bypass_spec_v1, false); + else if (aux->map_ptr_state.map_ptr != meta->map.ptr) + bpf_map_ptr_store(aux, meta->map.ptr, + !meta->map.ptr->bypass_spec_v1, true); return 0; } @@ -11357,7 +11358,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; struct bpf_reg_state *reg; - struct bpf_map *map = meta->map_ptr; + struct bpf_map *map = meta->map.ptr; u64 val, max; int err; @@ -11913,22 +11914,22 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ - if (meta.map_ptr == NULL) { + if (meta.map.ptr == NULL) { verifier_bug(env, "unexpected null map_ptr"); return -EFAULT; } if (func_id == BPF_FUNC_map_lookup_elem && - can_elide_value_nullness(meta.map_ptr->map_type) && + can_elide_value_nullness(meta.map.ptr->map_type) && meta.const_map_key >= 0 && - meta.const_map_key < meta.map_ptr->max_entries) + meta.const_map_key < meta.map.ptr->max_entries) ret_flag &= ~PTR_MAYBE_NULL; - regs[BPF_REG_0].map_ptr = meta.map_ptr; - regs[BPF_REG_0].map_uid = meta.map_uid; + regs[BPF_REG_0].map_ptr = meta.map.ptr; + regs[BPF_REG_0].map_uid = meta.map.uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_flag) && - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { + btf_record_has_field(meta.map.ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -12031,7 +12032,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { + if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) { verifier_bug(env, "func %s#%d sets ref_obj_id more than once", func_id_name(func_id), func_id); return -EFAULT; @@ -12043,7 +12044,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id, meta.map_ptr)) { + } else if (is_acquire_function(func_id, meta.map.ptr)) { int id = acquire_reference(env, insn_idx); if (id < 0) @@ -12058,7 +12059,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (err) return err; - err = check_map_func_compatibility(env, meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map.ptr, func_id); if (err) return err; From f4e72ad7c161d6ee1466b42ce0acc5c4eb6164dd Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 30 Jan 2026 20:42:11 +0000 Subject: [PATCH 208/268] bpf: Consolidate special map field validation in verifier Consolidate all logic for verifying special map fields in the single function check_map_field_pointer(). Acked-by: Eduard Zingerman Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20260130-verif_special_fields-v2-2-2c59e637da7d@meta.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 70 +++++++------------------------------------ 1 file changed, 11 insertions(+), 59 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c79463d50b37..256cc5c1a7df 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8610,7 +8610,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) /* Check if @regno is a pointer to a specific field in a map value */ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, - enum btf_field_type field_type) + enum btf_field_type field_type, + struct bpf_map_desc *map_desc) { struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); @@ -8653,72 +8654,23 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, val + reg->off, struct_name, field_off); return -EINVAL; } + if (map_desc->ptr) { + verifier_bug(env, "Two map pointers in a %s helper", struct_name); + return -EFAULT; + } + map_desc->uid = reg->map_uid; + map_desc->ptr = map; return 0; } static int process_timer_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_map *map = reg->map_ptr; - int err; - - err = check_map_field_pointer(env, regno, BPF_TIMER); - if (err) - return err; - - if (meta->map.ptr) { - verifier_bug(env, "Two map pointers in a timer helper"); - return -EFAULT; - } if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - meta->map.uid = reg->map_uid; - meta->map.ptr = map; - return 0; -} - -static int process_wq_func(struct bpf_verifier_env *env, int regno, - struct bpf_kfunc_call_arg_meta *meta) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_map *map = reg->map_ptr; - int err; - - err = check_map_field_pointer(env, regno, BPF_WORKQUEUE); - if (err) - return err; - - if (meta->map.ptr) { - verifier_bug(env, "Two map pointers in a bpf_wq helper"); - return -EFAULT; - } - - meta->map.uid = reg->map_uid; - meta->map.ptr = map; - return 0; -} - -static int process_task_work_func(struct bpf_verifier_env *env, int regno, - struct bpf_kfunc_call_arg_meta *meta) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_map *map = reg->map_ptr; - int err; - - err = check_map_field_pointer(env, regno, BPF_TASK_WORK); - if (err) - return err; - - if (meta->map.ptr) { - verifier_bug(env, "Two map pointers in a bpf_task_work helper"); - return -EFAULT; - } - meta->map.uid = reg->map_uid; - meta->map.ptr = map; - return 0; + return check_map_field_pointer(env, regno, BPF_TIMER, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -13754,7 +13706,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = process_wq_func(env, regno, meta); + ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map); if (ret < 0) return ret; break; @@ -13763,7 +13715,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a map value\n", i); return -EINVAL; } - ret = process_task_work_func(env, regno, meta); + ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; From 6557f1565d779851c4db9c488c49c05a47a6e72f Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Sat, 31 Jan 2026 17:08:37 +0100 Subject: [PATCH 209/268] bpf: Fix bpf_xdp_store_bytes proto for read-only arg While making some maps in Cilium read-only from the BPF side, we noticed that the bpf_xdp_store_bytes proto is incorrect. In particular, the verifier was throwing the following error: ; ret = ctx_store_bytes(ctx, l3_off + offsetof(struct iphdr, saddr), &nat->address, 4, 0); 635: (79) r1 = *(u64 *)(r10 -144) ; R1=ctx() R10=fp0 fp-144=ctx() 636: (b4) w2 = 26 ; R2=26 637: (b4) w4 = 4 ; R4=4 638: (b4) w5 = 0 ; R5=0 639: (85) call bpf_xdp_store_bytes#190 write into map forbidden, value_size=6 off=0 size=4 nat comes from a BPF_F_RDONLY_PROG map, so R3 is a PTR_TO_MAP_VALUE. The verifier checks the helper's memory access to R3 in check_mem_size_reg, as it reaches ARG_CONST_SIZE argument. The third argument has expected type ARG_PTR_TO_UNINIT_MEM, which includes the MEM_WRITE flag. The verifier thus checks for a BPF_WRITE access on R3. Given R3 points to a read-only map, the check fails. Conversely, ARG_PTR_TO_UNINIT_MEM can also lead to the helper reading from uninitialized memory. This patch simply fixes the expected argument type to match that of bpf_skb_store_bytes. Fixes: 3f364222d032 ("net: xdp: introduce bpf_xdp_pointer utility routine") Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/9fa3c9f72d806e82541071c4df88b8cba28ad6a9.1769875479.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index d14401193b01..f04982d79d72 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4135,7 +4135,7 @@ static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, }; From f0b5b3d6b56f8717e255406366d81bbcd3631660 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Sat, 31 Jan 2026 17:09:02 +0100 Subject: [PATCH 210/268] selftests/bpf: Test access from RO map from xdp_store_bytes This new test simply checks that helper bpf_xdp_store_bytes can successfully read from a read-only map. Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/4fdb934a713b2d7cf133288c77f6cfefe9856440.1769875479.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_xdp.c | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_xdp.c b/tools/testing/selftests/bpf/progs/verifier_xdp.c index 50768ed179b3..7dc9226aeb34 100644 --- a/tools/testing/selftests/bpf/progs/verifier_xdp.c +++ b/tools/testing/selftests/bpf/progs/verifier_xdp.c @@ -5,6 +5,14 @@ #include #include "bpf_misc.h" +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, __u64); + __uint(map_flags, BPF_F_RDONLY_PROG); +} map_array_ro SEC(".maps"); + SEC("xdp") __description("XDP, using ifindex from netdev") __success __retval(1) @@ -21,4 +29,31 @@ l0_%=: exit; \ : __clobber_all); } +SEC("xdp") +__description("XDP, using xdp_store_bytes from RO map") +__success __retval(0) +__naked void xdp_store_bytes_from_ro_map(void) +{ + asm volatile (" \ + r6 = r1; \ + r1 = 0; \ + *(u64*)(r10 - 8) = r1; \ + r2 = r10; \ + r2 += -8; \ + r1 = %[map_array_ro] ll; \ + call %[bpf_map_lookup_elem]; \ + if r0 == 0 goto l0_%=; \ + r1 = r6; \ + r2 = 0; \ + r3 = r0; \ + r4 = 8; \ + call %[bpf_xdp_store_bytes]; \ +l0_%=: exit; \ +" : + : __imm(bpf_map_lookup_elem), + __imm(bpf_xdp_store_bytes), + __imm_addr(map_array_ro) + : __clobber_all); +} + char _license[] SEC("license") = "GPL"; From 8798902f2b8bcae6f90229a1a1496b48ddda2972 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:48 +0800 Subject: [PATCH 211/268] bpf: Add bpf_jit_supports_fsession() The added fsession does not prevent running on those architectures, that haven't added fsession support. For example, try to run fsession tests on arm64: test_fsession_basic:PASS:fsession_test__open_and_load 0 nsec test_fsession_basic:PASS:fsession_attach 0 nsec check_result:FAIL:test_run_opts err unexpected error: -14 (errno 14) In order to prevent such errors, add bpf_jit_supports_fsession() to guard those architectures. Fixes: 2d419c44658f ("bpf: add fsession support") Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 5 +++ include/linux/filter.h | 1 + kernel/bpf/core.c | 5 +++ kernel/bpf/verifier.c | 5 +++ .../selftests/bpf/prog_tests/fsession_test.c | 32 ++++++++++++++----- 5 files changed, 40 insertions(+), 8 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 5a075e06cf45..070ba80e39d7 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -4112,3 +4112,8 @@ bool bpf_jit_supports_timed_may_goto(void) { return true; } + +bool bpf_jit_supports_fsession(void) +{ + return true; +} diff --git a/include/linux/filter.h b/include/linux/filter.h index fd54fed8f95f..4e1cb4f91f49 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1167,6 +1167,7 @@ bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); bool bpf_jit_supports_timed_may_goto(void); +bool bpf_jit_supports_fsession(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); u64 arch_bpf_timed_may_goto(void); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5ebece600aeb..dc906dfdff94 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3144,6 +3144,11 @@ bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) return false; } +bool __weak bpf_jit_supports_fsession(void) +{ + return false; +} + u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 256cc5c1a7df..6b62b6d57175 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24828,6 +24828,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + if (prog->expected_attach_type == BPF_TRACE_FSESSION && + !bpf_jit_supports_fsession()) { + bpf_log(log, "JIT does not support fsession\n"); + return -EOPNOTSUPP; + } if (!btf_type_is_func(t)) { bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); diff --git a/tools/testing/selftests/bpf/prog_tests/fsession_test.c b/tools/testing/selftests/bpf/prog_tests/fsession_test.c index 0c4b428e1cee..a299aeb8cc2e 100644 --- a/tools/testing/selftests/bpf/prog_tests/fsession_test.c +++ b/tools/testing/selftests/bpf/prog_tests/fsession_test.c @@ -29,8 +29,16 @@ static void test_fsession_basic(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; err = fsession_test__attach(skel); @@ -47,8 +55,16 @@ static void test_fsession_reattach(void) struct fsession_test *skel = NULL; int err; - skel = fsession_test__open_and_load(); - if (!ASSERT_OK_PTR(skel, "fsession_test__open_and_load")) + skel = fsession_test__open(); + if (!ASSERT_OK_PTR(skel, "fsession_test__open")) + return; + + err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } + if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; /* first attach */ @@ -94,6 +110,10 @@ static void test_fsession_cookie(void) bpf_program__set_autoload(skel->progs.test6, false); err = fsession_test__load(skel); + if (err == -EOPNOTSUPP) { + test__skip(); + goto cleanup; + } if (!ASSERT_OK(err, "fsession_test__load")) goto cleanup; @@ -111,10 +131,6 @@ static void test_fsession_cookie(void) void test_fsession_test(void) { -#if !defined(__x86_64__) - test__skip(); - return; -#endif if (test__start_subtest("fsession_test")) test_fsession_basic(); if (test__start_subtest("fsession_reattach")) From e3aa56b3ac175bccb8fe60d652a3df2ea6a68a1e Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:49 +0800 Subject: [PATCH 212/268] bpf, arm64: Add fsession support Implement fsession support in the arm64 BPF JIT trampoline. Extend the trampoline stack layout to store function metadata and session cookies, and pass the appropriate metadata to fentry and fexit programs. This mirrors the existing x86 behavior and enables session cookies on arm64. Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-3-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 71 ++++++++++++++++++++++++++++++----- include/linux/bpf.h | 7 +++- 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 0c4d44bcfbf4..2dc5037694ba 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2510,6 +2510,12 @@ static bool is_struct_ops_tramp(const struct bpf_tramp_links *fentry_links) fentry_links->links[0]->link.type == BPF_LINK_TYPE_STRUCT_OPS; } +static void store_func_meta(struct jit_ctx *ctx, u64 func_meta, int func_meta_off) +{ + emit_a64_mov_i64(A64_R(10), func_meta, ctx); + emit(A64_STR64I(A64_R(10), A64_SP, func_meta_off), ctx); +} + /* Based on the x86's implementation of arch_prepare_bpf_trampoline(). * * bpf prog and function entry before bpf trampoline hooked: @@ -2533,7 +2539,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, int regs_off; int retval_off; int bargs_off; - int nfuncargs_off; + int func_meta_off; int ip_off; int run_ctx_off; int oargs_off; @@ -2544,6 +2550,9 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, bool save_ret; __le32 **branches = NULL; bool is_struct_ops = is_struct_ops_tramp(fentry); + int cookie_off, cookie_cnt, cookie_bargs_off; + int fsession_cnt = bpf_fsession_cnt(tlinks); + u64 func_meta; /* trampoline stack layout: * [ parent ip ] @@ -2562,10 +2571,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, * [ ... ] * SP + bargs_off [ arg reg 1 ] for bpf * - * SP + nfuncargs_off [ arg regs count ] + * SP + func_meta_off [ regs count, etc ] * * SP + ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * + * [ stack cookie N ] + * [ ... ] + * SP + cookie_off [ stack cookie 1 ] + * * SP + run_ctx_off [ bpf_tramp_run_ctx ] * * [ stack arg N ] @@ -2582,13 +2595,18 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, /* room for bpf_tramp_run_ctx */ stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8); + cookie_off = stack_size; + /* room for session cookies */ + cookie_cnt = bpf_fsession_cookie_cnt(tlinks); + stack_size += cookie_cnt * 8; + ip_off = stack_size; /* room for IP address argument */ if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; - nfuncargs_off = stack_size; - /* room for args count */ + func_meta_off = stack_size; + /* room for function metadata, such as regs count */ stack_size += 8; bargs_off = stack_size; @@ -2646,9 +2664,9 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, emit(A64_STR64I(A64_R(10), A64_SP, ip_off), ctx); } - /* save arg regs count*/ - emit(A64_MOVZ(1, A64_R(10), nfuncargs, 0), ctx); - emit(A64_STR64I(A64_R(10), A64_SP, nfuncargs_off), ctx); + /* save function metadata */ + func_meta = nfuncargs; + store_func_meta(ctx, func_meta, func_meta_off); /* save args for bpf */ save_args(ctx, bargs_off, oargs_off, m, a, false); @@ -2666,10 +2684,27 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, emit_call((const u64)__bpf_tramp_enter, ctx); } - for (i = 0; i < fentry->nr_links; i++) + if (fsession_cnt) { + /* clear all the session cookies' value */ + emit(A64_MOVZ(1, A64_R(10), 0, 0), ctx); + for (int i = 0; i < cookie_cnt; i++) + emit(A64_STR64I(A64_R(10), A64_SP, cookie_off + 8 * i), ctx); + /* clear the return value to make sure fentry always gets 0 */ + emit(A64_STR64I(A64_R(10), A64_SP, retval_off), ctx); + } + + cookie_bargs_off = (bargs_off - cookie_off) / 8; + for (i = 0; i < fentry->nr_links; i++) { + if (bpf_prog_calls_session_cookie(fentry->links[i])) { + u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); + + store_func_meta(ctx, meta, func_meta_off); + cookie_bargs_off--; + } invoke_bpf_prog(ctx, fentry->links[i], bargs_off, retval_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET); + } if (fmod_ret->nr_links) { branches = kcalloc(fmod_ret->nr_links, sizeof(__le32 *), @@ -2701,9 +2736,22 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, *branches[i] = cpu_to_le32(A64_CBNZ(1, A64_R(10), offset)); } - for (i = 0; i < fexit->nr_links; i++) + /* set the "is_return" flag for fsession */ + func_meta |= (1ULL << BPF_TRAMP_IS_RETURN_SHIFT); + if (fsession_cnt) + store_func_meta(ctx, func_meta, func_meta_off); + + cookie_bargs_off = (bargs_off - cookie_off) / 8; + for (i = 0; i < fexit->nr_links; i++) { + if (bpf_prog_calls_session_cookie(fexit->links[i])) { + u64 meta = func_meta | (cookie_bargs_off << BPF_TRAMP_COOKIE_INDEX_SHIFT); + + store_func_meta(ctx, meta, func_meta_off); + cookie_bargs_off--; + } invoke_bpf_prog(ctx, fexit->links[i], bargs_off, retval_off, run_ctx_off, false); + } if (flags & BPF_TRAMP_F_CALL_ORIG) { im->ip_epilogue = ctx->ro_image + ctx->idx; @@ -2753,6 +2801,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, return ctx->idx; } +bool bpf_jit_supports_fsession(void) +{ + return true; +} + int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b0ceb759075..cd9b96434904 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2196,13 +2196,18 @@ static inline int bpf_fsession_cnt(struct bpf_tramp_links *links) return cnt; } +static inline bool bpf_prog_calls_session_cookie(struct bpf_tramp_link *link) +{ + return link->link.prog->call_session_cookie; +} + static inline int bpf_fsession_cookie_cnt(struct bpf_tramp_links *links) { struct bpf_tramp_links fentries = links[BPF_TRAMP_FENTRY]; int cnt = 0; for (int i = 0; i < links[BPF_TRAMP_FENTRY].nr_links; i++) { - if (fentries.links[i]->link.prog->call_session_cookie) + if (bpf_prog_calls_session_cookie(fentries.links[i])) cnt++; } From 7f10da2133b18b0f1bc02d58671883537e212279 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Sat, 31 Jan 2026 22:49:50 +0800 Subject: [PATCH 213/268] selftests/bpf: Enable get_func_args and get_func_ip tests on arm64 Allow get_func_args, and get_func_ip fsession selftests to run on arm64. Acked-by: Puranjay Mohan Tested-by: Puranjay Mohan Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20260131144950.16294-4-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/get_func_args_test.c | 2 +- tools/testing/selftests/bpf/progs/get_func_ip_test.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/get_func_args_test.c b/tools/testing/selftests/bpf/progs/get_func_args_test.c index 0a3236a7a109..180ba5098ca1 100644 --- a/tools/testing/selftests/bpf/progs/get_func_args_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_args_test.c @@ -167,7 +167,7 @@ int BPF_PROG(tp_test2) } __u64 test7_result = 0; -#ifdef __TARGET_ARCH_x86 +#if defined(bpf_target_x86) || defined(bpf_target_arm64) SEC("fsession/bpf_fentry_test1") int BPF_PROG(test7) { diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c index 65f7e1f182bf..43ff836a8ed8 100644 --- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c +++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c @@ -106,7 +106,7 @@ int BPF_URETPROBE(test8, int ret) __u64 test9_entry_result = 0; __u64 test9_exit_result = 0; -#ifdef __TARGET_ARCH_x86 +#if defined(bpf_target_x86) || defined(bpf_target_arm64) SEC("fsession/bpf_fentry_test1") int BPF_PROG(test9, int a) { From 6b95cc562de2889a9333843dda073ba875f9e808 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Feb 2026 08:58:49 +0100 Subject: [PATCH 214/268] ftrace: Fix direct_functions leak in update_ftrace_direct_del Alexei reported memory leak in update_ftrace_direct_del. We miss cleanup of the replaced direct_functions in the success path in update_ftrace_direct_del, adding that. Fixes: 8d2c1233f371 ("ftrace: Add update_ftrace_direct_del function") Reported-by: Alexei Starovoitov Closes: https://lore.kernel.org/bpf/aX_BxG5EJTJdCMT9@krava/T/#m7c13f5a95f862ed7ab78e905fbb678d635306a0c Signed-off-by: Jiri Olsa Acked-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20260202075849.1684369-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8574932e66b6..b12dbd93ae1c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6537,6 +6537,7 @@ int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) /* free the new_direct_functions */ old_direct_functions = new_direct_functions; } else { + old_direct_functions = direct_functions; rcu_assign_pointer(direct_functions, new_direct_functions); } From d95d76aa772bf94df353b015b1cb38303d4a415d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 1 Feb 2026 22:52:48 +0100 Subject: [PATCH 215/268] bpf: Replace snprintf("%s") with strscpy Replace snprintf("%s") with the faster and more direct strscpy(). Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20260201215247.677121-2-thorsten.blum@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8959f3bc1e92..7708958e3fb8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -6324,7 +6325,7 @@ static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name btf->data_size = data_size; btf->kernel_btf = true; btf->named_start_id = 0; - snprintf(btf->name, sizeof(btf->name), "%s", name); + strscpy(btf->name, name); err = btf_parse_hdr(env); if (err) @@ -6443,7 +6444,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, btf->start_str_off = base_btf->hdr.str_len; btf->kernel_btf = true; btf->named_start_id = 0; - snprintf(btf->name, sizeof(btf->name), "%s", module_name); + strscpy(btf->name, module_name); btf->data = kvmemdup(data, data_size, GFP_KERNEL | __GFP_NOWARN); if (!btf->data) { From 3cd5c890652ba1f0682adc291b5446245259b692 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:50:57 -0800 Subject: [PATCH 216/268] bpf: Let the verifier assign ids on stack fills The next commit will allow clearing of scalar ids if no other register/stack slot has that id. This is because if only one register has a unique id, it can't participate in bounds propagation and is equivalent to having no id. But if the id of a stack slot is cleared by clear_singular_ids() in the next commit, reading that stack slot into a register will not establish a link because the stack slot's id is cleared. This can happen in a situation where a register is spilled and later loses its id due to a multiply operation (for example) and then the stack slot's id becomes singular and can be cleared. Make sure that scalar stack slots have an id before we read them into a register. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6b62b6d57175..17b499956156 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5518,6 +5518,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, */ s32 subreg_def = state->regs[dst_regno].subreg_def; + if (env->bpf_capable && size == 4 && spill_size == 4 && + get_reg_width(reg) <= 32) + /* Ensure stack slot has an ID to build a relation + * with the destination register on fill. + */ + assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); state->regs[dst_regno].subreg_def = subreg_def; @@ -5563,6 +5569,11 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, } } else if (dst_regno >= 0) { /* restore register state from stack */ + if (env->bpf_capable) + /* Ensure stack slot has an ID to build a relation + * with the destination register on fill. + */ + assign_scalar_id_before_mov(env, reg); copy_register_state(&state->regs[dst_regno], reg); /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() From b2a0aa3a87396483b468b7c81be2fddb29171d74 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:50:58 -0800 Subject: [PATCH 217/268] bpf: Clear singular ids for scalars in is_state_visited() The verifier assigns ids to scalar registers/stack slots when they are linked through a mov or stack spill/fill instruction. These ids are later used to propagate newly found bounds from one register to all registers that share the same id. The verifier also compares the ids of these registers in current state and cached state when making pruning decisions. When an ID becomes singular (i.e., only a single register or stack slot has that ID), it can no longer participate in bounds propagation. During comparisons between current and cached states for pruning decisions, however, such stale IDs can prevent pruning of otherwise equivalent states. Find and clear all singular ids before caching a state in is_state_visited(). struct bpf_idset which is currently unused has been repurposed for this use case. Acked-by: Eduard Zingerman Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 ++-- kernel/bpf/verifier.c | 68 ++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8355b585cd18..746025df82c8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -697,8 +697,11 @@ struct bpf_idmap { }; struct bpf_idset { - u32 count; - u32 ids[BPF_ID_MAP_SIZE]; + u32 num_ids; + struct { + u32 id; + u32 cnt; + } entries[BPF_ID_MAP_SIZE]; }; /* see verifier.c:compute_scc_callchain() */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17b499956156..d92e10d4c2cc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19461,6 +19461,72 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * doesn't meant that the states are DONE. The verifier has to compare * the callsites */ + +/* Find id in idset and increment its count, or add new entry */ +static void idset_cnt_inc(struct bpf_idset *idset, u32 id) +{ + u32 i; + + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) { + idset->entries[i].cnt++; + return; + } + } + /* New id */ + if (idset->num_ids < BPF_ID_MAP_SIZE) { + idset->entries[idset->num_ids].id = id; + idset->entries[idset->num_ids].cnt = 1; + idset->num_ids++; + } +} + +/* Find id in idset and return its count, or 0 if not found */ +static u32 idset_cnt_get(struct bpf_idset *idset, u32 id) +{ + u32 i; + + for (i = 0; i < idset->num_ids; i++) { + if (idset->entries[i].id == id) + return idset->entries[i].cnt; + } + return 0; +} + +/* + * Clear singular scalar ids in a state. + * A register with a non-zero id is called singular if no other register shares + * the same base id. Such registers can be treated as independent (id=0). + */ +static void clear_singular_ids(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_idset *idset = &env->idset_scratch; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + + idset->num_ids = 0; + + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + idset_cnt_inc(idset, reg->id & ~BPF_ADD_CONST); + })); + + bpf_for_each_reg_in_vstate(st, func, reg, ({ + if (reg->type != SCALAR_VALUE) + continue; + if (!reg->id) + continue; + if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) { + reg->id = 0; + reg->off = 0; + } + })); +} + static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { @@ -20459,6 +20525,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (env->bpf_capable) mark_all_scalars_imprecise(env, cur); + clear_singular_ids(env, cur); + /* add new state to the head of linked list */ new = &new_sl->state; err = copy_verifier_state(new, cur); From a24d6f955d4f68a98daa905a7dd090675a50eca8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:50:59 -0800 Subject: [PATCH 218/268] bpf: Relax maybe_widen_reg() constraints The maybe_widen_reg() function widens imprecise scalar registers to unknown when their values differ between the cached and current states. Previously, it used regs_exact() which also compared register IDs via check_ids(), requiring registers to have matching IDs (or mapped IDs) to be considered exact. For scalar widening purposes, what matters is whether the value tracking (bounds, tnum, var_off) is the same, not whether the IDs match. Two scalars with identical value constraints but different IDs represent the same abstract value and don't need to be widened. Introduce scalars_exact_for_widen() that only compares the value-tracking portion of bpf_reg_state (fields before 'id'). This allows the verifier to preserve more scalar value information during state merging when IDs differ but actual tracked values are identical, reducing unnecessary widening and potentially improving verification precision. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-4-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d92e10d4c2cc..80dc01350c77 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8995,15 +8995,24 @@ static bool regs_exact(const struct bpf_reg_state *rold, const struct bpf_reg_state *rcur, struct bpf_idmap *idmap); +/* + * Check if scalar registers are exact for the purpose of not widening. + * More lenient than regs_exact() + */ +static bool scalars_exact_for_widen(const struct bpf_reg_state *rold, + const struct bpf_reg_state *rcur) +{ + return !memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)); +} + static void maybe_widen_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_idmap *idmap) + struct bpf_reg_state *rold, struct bpf_reg_state *rcur) { if (rold->type != SCALAR_VALUE) return; if (rold->type != rcur->type) return; - if (rold->precise || rcur->precise || regs_exact(rold, rcur, idmap)) + if (rold->precise || rcur->precise || scalars_exact_for_widen(rold, rcur)) return; __mark_reg_unknown(env, rcur); } @@ -9015,7 +9024,6 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_func_state *fold, *fcur; int i, fr, num_slots; - reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { fold = old->frame[fr]; fcur = cur->frame[fr]; @@ -9023,8 +9031,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) maybe_widen_reg(env, &fold->regs[i], - &fcur->regs[i], - &env->idmap_scratch); + &fcur->regs[i]); num_slots = min(fold->allocated_stack / BPF_REG_SIZE, fcur->allocated_stack / BPF_REG_SIZE); @@ -9035,8 +9042,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, maybe_widen_reg(env, &fold->stack[i].spilled_ptr, - &fcur->stack[i].spilled_ptr, - &env->idmap_scratch); + &fcur->stack[i].spilled_ptr); } } return 0; From b0388bafa4949bd30af7b3be5ee415f2a25ac014 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:51:00 -0800 Subject: [PATCH 219/268] bpf: Relax scalar id equivalence for state pruning Scalar register IDs are used by the verifier to track relationships between registers and enable bounds propagation across those relationships. Once an ID becomes singular (i.e. only a single register/stack slot carries it), it can no longer contribute to bounds propagation and effectively becomes stale. The previous commit makes the verifier clear such ids before caching the state. When comparing the current and cached states for pruning, these stale IDs can cause technically equivalent states to be considered different and thus prevent pruning. For example, in the selftest added in the next commit, two registers - r6 and r7 are not linked to any other registers and get cached with id=0, in the current state, they are both linked to each other with id=A. Before this commit, check_scalar_ids would give temporary ids to r6 and r7 (say tid1 and tid2) and then check_ids() would map tid1->A, and when it would see tid2->A, it would not consider these state equivalent. Relax scalar ID equivalence by treating rold->id == 0 as "independent": if the old state did not rely on any ID relationships for a register, then any ID/linking present in the current state only adds constraints and is always safe to accept for pruning. Implement this by returning true immediately in check_scalar_ids() when old_id == 0. Maintain correctness for the opposite direction (old_id != 0 && cur_id == 0) by still allocating a temporary ID for cur_id == 0. This avoids incorrectly allowing multiple independent current registers (id==0) to satisfy a single linked old ID during mapping. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-5-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 63 +++++++++++++++---- .../selftests/bpf/progs/verifier_scalar_ids.c | 8 ++- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 80dc01350c77..da03bbbc1620 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19387,13 +19387,29 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) return false; } -/* Similar to check_ids(), but allocate a unique temporary ID - * for 'old_id' or 'cur_id' of zero. - * This makes pairs like '0 vs unique ID', 'unique ID vs 0' valid. +/* + * Compare scalar register IDs for state equivalence. + * + * When old_id == 0, the old register is independent - not linked to any + * other register. Any linking in the current state only adds constraints, + * making it more restrictive. Since the old state didn't rely on any ID + * relationships for this register, it's always safe to accept cur regardless + * of its ID. Hence, return true immediately. + * + * When old_id != 0 but cur_id == 0, we need to ensure that different + * independent registers in cur don't incorrectly satisfy the ID matching + * requirements of linked registers in old. + * + * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0 + * and r7.id=0 (both independent), without temp IDs both would map old_id=X + * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map + * X->temp2, but X is already mapped to temp1, so the check fails correctly. */ static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) { - old_id = old_id ? old_id : ++idmap->tmp_id_gen; + if (!old_id) + return true; + cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen; return check_ids(old_id, cur_id, idmap); @@ -19618,11 +19634,21 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } if (!rold->precise && exact == NOT_EXACT) return true; - if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) - return false; - if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off)) - return false; - /* Why check_ids() for scalar registers? + /* + * Linked register tracking uses rold->id to detect relationships. + * When rold->id == 0, the register is independent and any linking + * in rcur only adds constraints. When rold->id != 0, we must verify + * id mapping and (for BPF_ADD_CONST) offset consistency. + * + * +------------------+-----------+------------------+---------------+ + * | | rold->id | rold + ADD_CONST | rold->id == 0 | + * |------------------+-----------+------------------+---------------| + * | rcur->id | range,ids | false | range | + * | rcur + ADD_CONST | false | range,ids,off | range | + * | rcur->id == 0 | range,ids | false | range | + * +------------------+-----------+------------------+---------------+ + * + * Why check_ids() for scalar registers? * * Consider the following BPF code: * 1: r6 = ... unbound scalar, ID=a ... @@ -19646,9 +19672,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, * --- * Also verify that new value satisfies old value range knowledge. */ - return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off) && - check_scalar_ids(rold->id, rcur->id, idmap); + + /* ADD_CONST mismatch: different linking semantics */ + if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST)) + return false; + + if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST)) + return false; + + /* Both have offset linkage: offsets must match */ + if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off) + return false; + + if (!check_scalar_ids(rold->id, rcur->id, idmap)) + return false; + + return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index c0ce690ddb68..c8f8820336b7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -723,9 +723,9 @@ __success __log_level(2) /* The exit instruction should be reachable from two states, * use two matches and "processed .. insns" to ensure this. */ -__msg("13: (95) exit") -__msg("13: (95) exit") -__msg("processed 18 insns") +__msg("15: (95) exit") +__msg("15: (95) exit") +__msg("processed 20 insns") __flag(BPF_F_TEST_STATE_FREQ) __naked void two_old_ids_one_cur_id(void) { @@ -734,9 +734,11 @@ __naked void two_old_ids_one_cur_id(void) "call %[bpf_ktime_get_ns];" "r0 &= 0xff;" "r6 = r0;" + "r8 = r0;" "call %[bpf_ktime_get_ns];" "r0 &= 0xff;" "r7 = r0;" + "r9 = r0;" "r0 = 0;" /* Maybe make r{6,7} IDs identical */ "if r6 > r7 goto l0_%=;" From f6ef5584ccb5683542abb38461970e969b580fba Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 3 Feb 2026 08:51:01 -0800 Subject: [PATCH 220/268] selftests/bpf: Add a test for ids=0 to verifier_scalar_ids test Test that two registers with their id=0 (unlinked) in the cached state can be mapped to a single id (linked) in the current state. Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20260203165102.2302462-6-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_scalar_ids.c | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index c8f8820336b7..3072fee9a448 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -715,6 +715,51 @@ __naked void ignore_unique_scalar_ids_old(void) : __clobber_all); } +/* Check that two registers with 0 scalar IDs in a verified state can be mapped + * to the same scalar ID in current state. + */ +SEC("socket") +__success __log_level(2) +/* The states should be equivalent on reaching insn 12. + */ +__msg("12: safe") +__msg("processed 17 insns") +__flag(BPF_F_TEST_STATE_FREQ) +__naked void two_nil_old_ids_one_cur_id(void) +{ + asm volatile ( + /* Give unique scalar IDs to r{6,7} */ + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + "r6 = r0;" + "r6 *= 1;" + "call %[bpf_ktime_get_ns];" + "r0 &= 0xff;" + "r7 = r0;" + "r7 *= 1;" + "r0 = 0;" + /* Maybe make r{6,7} IDs identical */ + "if r6 > r7 goto l0_%=;" + "goto l1_%=;" +"l0_%=:" + "r6 = r7;" +"l1_%=:" + /* Mark r{6,7} precise. + * Get here in two states: + * - first: r6{.id=0}, r7{.id=0} (cached state) + * - second: r6{.id=A}, r7{.id=A} + * Verifier considers such states equivalent. + * Thus "exit;" would be verified only once. + */ + "r2 = r10;" + "r2 += r6;" + "r2 += r7;" + "exit;" + : + : __imm(bpf_ktime_get_ns) + : __clobber_all); +} + /* Check that two different scalar IDs in a verified state can't be * mapped to the same scalar ID in current state. */ From 63328bb23f2693fe36e8bcdb972c6040e84d16e4 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 3 Feb 2026 13:04:21 -0500 Subject: [PATCH 221/268] bpf: Add bpf_stream_print_stack stack dumping kfunc Add a new kfunc called bpf_stream_print_stack to be used by programs that need to print out their current BPF stack. The kfunc is essentially a wrapper around the existing bpf_stream_dump_stack functionality used to generate stack traces for error events like may_goto violations and BPF-side arena page faults. Signed-off-by: Emil Tsalapatis Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260203180424.14057-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + kernel/bpf/stream.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b54ec0e945aa..c30a9f68af6b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4562,6 +4562,7 @@ BTF_ID_FLAGS(func, bpf_strncasestr); BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index 24730df55e69..be9ce98e9469 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -245,6 +245,25 @@ __bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const vo return ret; } +/* Directly trigger a stack dump from the program. */ +__bpf_kfunc int bpf_stream_print_stack(int stream_id, struct bpf_prog_aux *aux) +{ + struct bpf_stream_stage ss; + struct bpf_prog *prog; + + /* Make sure the stream ID is valid. */ + if (!bpf_stream_get(stream_id, aux)) + return -ENOENT; + + prog = aux->main_prog_aux->prog; + + bpf_stream_stage(ss, prog, stream_id, ({ + bpf_stream_dump_stack(ss); + })); + + return 0; +} + __bpf_kfunc_end_defs(); /* Added kfunc to common_btf_ids */ From 954fa97e215ea8fb1fe70d117d25875f3d3938ea Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 3 Feb 2026 13:04:22 -0500 Subject: [PATCH 222/268] selftests/bpf: Add selftests for bpf_stream_print_stack Add selftests for the new bpf_stream_print_stack kfunc. Signed-off-by: Emil Tsalapatis Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260203180424.14057-3-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/stream.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index 4a5bd852f10c..f63b378de090 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -234,4 +234,25 @@ int stream_arena_callback_fault(void *ctx) return 0; } +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_print_stack_kfunc(void *ctx) +{ + return bpf_stream_print_stack(BPF_STDERR); +} + +SEC("syscall") +__success __retval(-2) +int stream_print_stack_invalid_id(void *ctx) +{ + /* Try to pass an invalid stream ID. */ + return bpf_stream_print_stack((enum bpf_stream_id)0xbadcafe); +} + char _license[] SEC("license") = "GPL"; From 9ddfa24e16747da8d98464b4285ee66e37ddc5c0 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 3 Feb 2026 13:04:23 -0500 Subject: [PATCH 223/268] bpf: Allow BPF stream kfuncs while holding a lock The BPF stream kfuncs bpf_stream_vprintk and bpf_stream_print_stack do not sleep and so are safe to call while holding a lock. Amend the verifier to allow that. Signed-off-by: Emil Tsalapatis Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260203180424.14057-4-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index da03bbbc1620..6a616dc4dc54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12455,6 +12455,8 @@ enum special_kfunc_type { KF_bpf_arena_free_pages, KF_bpf_arena_reserve_pages, KF_bpf_session_is_return, + KF_bpf_stream_vprintk, + KF_bpf_stream_print_stack, }; BTF_ID_LIST(special_kfunc_list) @@ -12533,6 +12535,8 @@ BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) BTF_ID(func, bpf_session_is_return) +BTF_ID(func, bpf_stream_vprintk) +BTF_ID(func, bpf_stream_print_stack) static bool is_task_work_add_kfunc(u32 func_id) { @@ -12977,10 +12981,17 @@ static bool is_bpf_arena_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_arena_reserve_pages]; } +static bool is_bpf_stream_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_stream_vprintk] || + btf_id == special_kfunc_list[KF_bpf_stream_print_stack]; +} + static bool kfunc_spin_allowed(u32 btf_id) { return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || - is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id); + is_bpf_res_spin_lock_kfunc(btf_id) || is_bpf_arena_kfunc(btf_id) || + is_bpf_stream_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) From 4d99137eea48b18387d8d17443e28d124177ab7b Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Tue, 3 Feb 2026 13:04:24 -0500 Subject: [PATCH 224/268] selftests/bpf: Add selftests for stream functions under lock Add a selftest to ensure BPF stream functions can now be called while holding a lock. Signed-off-by: Emil Tsalapatis Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260203180424.14057-5-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/stream.c | 32 ++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/stream.c b/tools/testing/selftests/bpf/progs/stream.c index f63b378de090..6f999ba951a3 100644 --- a/tools/testing/selftests/bpf/progs/stream.c +++ b/tools/testing/selftests/bpf/progs/stream.c @@ -42,6 +42,10 @@ int size; u64 fault_addr; void *arena_ptr; +#define private(name) SEC(".bss." #name) __hidden __attribute__((aligned(8))) + +private(STREAM) struct bpf_spin_lock block; + SEC("syscall") __success __retval(0) int stream_exhaust(void *ctx) @@ -255,4 +259,32 @@ int stream_print_stack_invalid_id(void *ctx) return bpf_stream_print_stack((enum bpf_stream_id)0xbadcafe); } +SEC("syscall") +__arch_x86_64 +__arch_arm64 +__success __retval(0) +__stdout(_STR) +__stderr("CPU: {{[0-9]+}} UID: 0 PID: {{[0-9]+}} Comm: {{.*}}") +__stderr("Call trace:\n" +"{{([a-zA-Z_][a-zA-Z0-9_]*\\+0x[0-9a-fA-F]+/0x[0-9a-fA-F]+\n" +"|[ \t]+[^\n]+\n)*}}") +int stream_print_kfuncs_locked(void *ctx) +{ + int ret; + + bpf_spin_lock(&block); + + ret = bpf_stream_printk(BPF_STDOUT, _STR); + if (ret) + goto out; + + ret = bpf_stream_print_stack(BPF_STDERR); + +out: + bpf_spin_unlock(&block); + + return ret; +} + + char _license[] SEC("license") = "GPL"; From 1bfbc267ec915e58c3723a2237bf067f0c4dffa8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 31 Jan 2026 18:53:55 -0800 Subject: [PATCH 225/268] bpf: Enable bpf_timer and bpf_wq in any context Refactor bpf_timer and bpf_wq to allow calling them from any context: - add refcnt to bpf_async_cb - map_delete_elem or map_free will drop refcnt to zero via bpf_async_cancel_and_free() - once refcnt is zero timer/wq_start is not allowed to make sure that callback cannot rearm itself - if in_hardirq defer to start/cancel operations to irq_work Co-developed-by: Mykyta Yatsenko Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20260201025403.66625-2-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 424 ++++++++++++++++++++++++------------------- 1 file changed, 233 insertions(+), 191 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c30a9f68af6b..354aa607df3e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1095,16 +1095,34 @@ static void *map_key_from_value(struct bpf_map *map, void *value, u32 *arr_idx) return (void *)value - round_up(map->key_size, 8); } +enum bpf_async_type { + BPF_ASYNC_TYPE_TIMER = 0, + BPF_ASYNC_TYPE_WQ, +}; + +enum bpf_async_op { + BPF_ASYNC_START, + BPF_ASYNC_CANCEL +}; + +struct bpf_async_cmd { + struct llist_node node; + u64 nsec; + u32 mode; + enum bpf_async_op op; +}; + struct bpf_async_cb { struct bpf_map *map; struct bpf_prog *prog; void __rcu *callback_fn; void *value; - union { - struct rcu_head rcu; - struct work_struct delete_work; - }; + struct rcu_head rcu; u64 flags; + struct irq_work worker; + refcount_t refcnt; + enum bpf_async_type type; + struct llist_head async_cmds; }; /* BPF map elements can contain 'struct bpf_timer'. @@ -1132,7 +1150,6 @@ struct bpf_hrtimer { struct bpf_work { struct bpf_async_cb cb; struct work_struct work; - struct work_struct delete_work; }; /* the actual struct hidden inside uapi struct bpf_timer and bpf_wq */ @@ -1142,20 +1159,12 @@ struct bpf_async_kern { struct bpf_hrtimer *timer; struct bpf_work *work; }; - /* bpf_spin_lock is used here instead of spinlock_t to make - * sure that it always fits into space reserved by struct bpf_timer - * regardless of LOCKDEP and spinlock debug flags. - */ - struct bpf_spin_lock lock; } __attribute__((aligned(8))); -enum bpf_async_type { - BPF_ASYNC_TYPE_TIMER = 0, - BPF_ASYNC_TYPE_WQ, -}; - static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running); +static void bpf_async_refcount_put(struct bpf_async_cb *cb); + static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) { struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer); @@ -1219,45 +1228,73 @@ static void bpf_async_cb_rcu_free(struct rcu_head *rcu) { struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + /* + * Drop the last reference to prog only after RCU GP, as set_callback() + * may race with cancel_and_free() + */ + if (cb->prog) + bpf_prog_put(cb->prog); + kfree_nolock(cb); } -static void bpf_wq_delete_work(struct work_struct *work) +/* Callback from call_rcu_tasks_trace, chains to call_rcu for final free */ +static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) { - struct bpf_work *w = container_of(work, struct bpf_work, delete_work); + struct bpf_async_cb *cb = container_of(rcu, struct bpf_async_cb, rcu); + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); + struct bpf_work *w = container_of(cb, struct bpf_work, cb); + bool retry = false; - cancel_work_sync(&w->work); - - call_rcu(&w->cb.rcu, bpf_async_cb_rcu_free); -} - -static void bpf_timer_delete_work(struct work_struct *work) -{ - struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); - - /* Cancel the timer and wait for callback to complete if it was running. - * If hrtimer_cancel() can be safely called it's safe to call - * call_rcu() right after for both preallocated and non-preallocated - * maps. The async->cb = NULL was already done and no code path can see - * address 't' anymore. Timer if armed for existing bpf_hrtimer before - * bpf_timer_cancel_and_free will have been cancelled. + /* + * bpf_async_cancel_and_free() tried to cancel timer/wq, but it + * could have raced with timer/wq_start. Now refcnt is zero and + * srcu/rcu GP completed. Cancel timer/wq again. */ - hrtimer_cancel(&t->timer); - call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); + switch (cb->type) { + case BPF_ASYNC_TYPE_TIMER: + if (hrtimer_try_to_cancel(&t->timer) < 0) + retry = true; + break; + case BPF_ASYNC_TYPE_WQ: + if (!cancel_work(&w->work)) + retry = true; + break; + } + if (retry) { + /* + * hrtimer or wq callback may still be running. It must be + * in rcu_tasks_trace or rcu CS, so wait for GP again. + * It won't retry forever, since refcnt zero prevents all + * operations on timer/wq. + */ + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); + return; + } + + /* rcu_trace_implies_rcu_gp() is true and will remain so */ + bpf_async_cb_rcu_free(rcu); } +static void bpf_async_refcount_put(struct bpf_async_cb *cb) +{ + if (!refcount_dec_and_test(&cb->refcnt)) + return; + + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); +} + +static void bpf_async_cancel_and_free(struct bpf_async_kern *async); +static void bpf_async_irq_worker(struct irq_work *work); + static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, enum bpf_async_type type) { - struct bpf_async_cb *cb; + struct bpf_async_cb *cb, *old_cb; struct bpf_hrtimer *t; struct bpf_work *w; clockid_t clockid; size_t size; - int ret = 0; - - if (in_nmi()) - return -EOPNOTSUPP; switch (type) { case BPF_ASYNC_TYPE_TIMER: @@ -1270,18 +1307,13 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u return -EINVAL; } - __bpf_spin_lock_irqsave(&async->lock); - t = async->timer; - if (t) { - ret = -EBUSY; - goto out; - } + old_cb = READ_ONCE(async->cb); + if (old_cb) + return -EBUSY; cb = bpf_map_kmalloc_nolock(map, size, 0, map->numa_node); - if (!cb) { - ret = -ENOMEM; - goto out; - } + if (!cb) + return -ENOMEM; switch (type) { case BPF_ASYNC_TYPE_TIMER: @@ -1289,7 +1321,6 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u t = (struct bpf_hrtimer *)cb; atomic_set(&t->cancelling, 0); - INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; @@ -1297,16 +1328,24 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u w = (struct bpf_work *)cb; INIT_WORK(&w->work, bpf_wq_work); - INIT_WORK(&w->delete_work, bpf_wq_delete_work); cb->value = (void *)async - map->record->wq_off; break; } cb->map = map; cb->prog = NULL; cb->flags = flags; + cb->worker = IRQ_WORK_INIT(bpf_async_irq_worker); + init_llist_head(&cb->async_cmds); + refcount_set(&cb->refcnt, 1); /* map's reference */ + cb->type = type; rcu_assign_pointer(cb->callback_fn, NULL); - WRITE_ONCE(async->cb, cb); + old_cb = cmpxchg(&async->cb, NULL, cb); + if (old_cb) { + /* Lost the race to initialize this bpf_async_kern, drop the allocated object */ + kfree_nolock(cb); + return -EBUSY; + } /* Guarantee the order between async->cb and map->usercnt. So * when there are concurrent uref release and bpf timer init, either * bpf_timer_cancel_and_free() called by uref release reads a no-NULL @@ -1317,13 +1356,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u /* maps with timers must be either held by user space * or pinned in bpffs. */ - WRITE_ONCE(async->cb, NULL); - kfree_nolock(cb); - ret = -EPERM; + bpf_async_cancel_and_free(async); + return -EPERM; } -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return ret; + + return 0; } BPF_CALL_3(bpf_timer_init, struct bpf_async_kern *, timer, struct bpf_map *, map, @@ -1354,8 +1391,9 @@ static const struct bpf_func_proto bpf_timer_init_proto = { .arg3_type = ARG_ANYTHING, }; -static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callback_fn, - struct bpf_prog *prog) +static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, + struct bpf_prog *prog, + void *callback_fn) { struct bpf_prog *prev; @@ -1380,7 +1418,8 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callbac if (prev) bpf_prog_put(prev); - } while (READ_ONCE(cb->prog) != prog || READ_ONCE(cb->callback_fn) != callback_fn); + } while (READ_ONCE(cb->prog) != prog || + (void __force *)READ_ONCE(cb->callback_fn) != callback_fn); if (prog) bpf_prog_put(prog); @@ -1388,33 +1427,36 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, void *callbac return 0; } +static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, + u64 nsec, u32 timer_mode) +{ + WARN_ON_ONCE(!in_hardirq()); + + struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); + + if (!cmd) { + bpf_async_refcount_put(cb); + return -ENOMEM; + } + init_llist_node(&cmd->node); + cmd->nsec = nsec; + cmd->mode = timer_mode; + cmd->op = op; + if (llist_add(&cmd->node, &cb->async_cmds)) + irq_work_queue(&cb->worker); + return 0; +} + static int __bpf_async_set_callback(struct bpf_async_kern *async, void *callback_fn, struct bpf_prog *prog) { struct bpf_async_cb *cb; - int ret = 0; - if (in_nmi()) - return -EOPNOTSUPP; - __bpf_spin_lock_irqsave(&async->lock); - cb = async->cb; - if (!cb) { - ret = -EINVAL; - goto out; - } - if (!atomic64_read(&cb->map->usercnt)) { - /* maps with timers must be either held by user space - * or pinned in bpffs. Otherwise timer might still be - * running even when bpf prog is detached and user space - * is gone, since map_release_uref won't ever be called. - */ - ret = -EPERM; - goto out; - } - ret = bpf_async_update_prog_callback(cb, callback_fn, prog); -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return ret; + cb = READ_ONCE(async->cb); + if (!cb) + return -EINVAL; + + return bpf_async_update_prog_callback(cb, prog, callback_fn); } BPF_CALL_3(bpf_timer_set_callback, struct bpf_async_kern *, timer, void *, callback_fn, @@ -1431,22 +1473,17 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; -BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, flags) +BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; - int ret = 0; - enum hrtimer_mode mode; + u32 mode; - if (in_nmi()) - return -EOPNOTSUPP; if (flags & ~(BPF_F_TIMER_ABS | BPF_F_TIMER_CPU_PIN)) return -EINVAL; - __bpf_spin_lock_irqsave(&timer->lock); - t = timer->timer; - if (!t || !t->cb.prog) { - ret = -EINVAL; - goto out; - } + + t = READ_ONCE(async->timer); + if (!t || !READ_ONCE(t->cb.prog)) + return -EINVAL; if (flags & BPF_F_TIMER_ABS) mode = HRTIMER_MODE_ABS_SOFT; @@ -1456,10 +1493,20 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, timer, u64, nsecs, u64, fla if (flags & BPF_F_TIMER_CPU_PIN) mode |= HRTIMER_MODE_PINNED; - hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); -out: - __bpf_spin_unlock_irqrestore(&timer->lock); - return ret; + /* + * bpf_async_cancel_and_free() could have dropped refcnt to zero. In + * such case BPF progs are not allowed to arm the timer to prevent UAF. + */ + if (!refcount_inc_not_zero(&t->cb.refcnt)) + return -ENOENT; + + if (!in_hardirq()) { + hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); + bpf_async_refcount_put(&t->cb); + return 0; + } else { + return bpf_async_schedule_op(&t->cb, BPF_ASYNC_START, nsecs, mode); + } } static const struct bpf_func_proto bpf_timer_start_proto = { @@ -1477,11 +1524,9 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) bool inc = false; int ret = 0; - if (in_nmi()) + if (in_hardirq()) return -EOPNOTSUPP; - guard(rcu)(); - t = READ_ONCE(async->timer); if (!t) return -EINVAL; @@ -1536,78 +1581,85 @@ static const struct bpf_func_proto bpf_timer_cancel_proto = { .arg1_type = ARG_PTR_TO_TIMER, }; -static struct bpf_async_cb *__bpf_async_cancel_and_free(struct bpf_async_kern *async) +static void bpf_async_process_op(struct bpf_async_cb *cb, u32 op, + u64 timer_nsec, u32 timer_mode) +{ + switch (cb->type) { + case BPF_ASYNC_TYPE_TIMER: { + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); + + switch (op) { + case BPF_ASYNC_START: + hrtimer_start(&t->timer, ns_to_ktime(timer_nsec), timer_mode); + break; + case BPF_ASYNC_CANCEL: + hrtimer_try_to_cancel(&t->timer); + break; + } + break; + } + case BPF_ASYNC_TYPE_WQ: { + struct bpf_work *w = container_of(cb, struct bpf_work, cb); + + switch (op) { + case BPF_ASYNC_START: + schedule_work(&w->work); + break; + case BPF_ASYNC_CANCEL: + cancel_work(&w->work); + break; + } + break; + } + } + bpf_async_refcount_put(cb); +} + +static void bpf_async_irq_worker(struct irq_work *work) +{ + struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); + struct llist_node *pos, *n, *list; + + list = llist_del_all(&cb->async_cmds); + if (!list) + return; + + list = llist_reverse_order(list); + llist_for_each_safe(pos, n, list) { + struct bpf_async_cmd *cmd; + + cmd = container_of(pos, struct bpf_async_cmd, node); + bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode); + kfree_nolock(cmd); + } +} + +static void bpf_async_cancel_and_free(struct bpf_async_kern *async) { struct bpf_async_cb *cb; - /* Performance optimization: read async->cb without lock first. */ if (!READ_ONCE(async->cb)) - return NULL; - - __bpf_spin_lock_irqsave(&async->lock); - /* re-read it under lock */ - cb = async->cb; - if (!cb) - goto out; - bpf_async_update_prog_callback(cb, NULL, NULL); - /* The subsequent bpf_timer_start/cancel() helpers won't be able to use - * this timer, since it won't be initialized. - */ - WRITE_ONCE(async->cb, NULL); -out: - __bpf_spin_unlock_irqrestore(&async->lock); - return cb; -} - -static void bpf_timer_delete(struct bpf_hrtimer *t) -{ - /* - * We check that bpf_map_delete/update_elem() was called from timer - * callback_fn. In such case we don't call hrtimer_cancel() (since it - * will deadlock) and don't call hrtimer_try_to_cancel() (since it will - * just return -1). Though callback_fn is still running on this cpu it's - * safe to do kfree(t) because bpf_timer_cb() read everything it needed - * from 't'. The bpf subprog callback_fn won't be able to access 't', - * since async->cb = NULL was already done. The timer will be - * effectively cancelled because bpf_timer_cb() will return - * HRTIMER_NORESTART. - * - * However, it is possible the timer callback_fn calling us armed the - * timer _before_ calling us, such that failing to cancel it here will - * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. - * Therefore, we _need_ to cancel any outstanding timers before we do - * call_rcu, even though no more timers can be armed. - * - * Moreover, we need to schedule work even if timer does not belong to - * the calling callback_fn, as on two different CPUs, we can end up in a - * situation where both sides run in parallel, try to cancel one - * another, and we end up waiting on both sides in hrtimer_cancel - * without making forward progress, since timer1 depends on time2 - * callback to finish, and vice versa. - * - * CPU 1 (timer1_cb) CPU 2 (timer2_cb) - * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) - * - * To avoid these issues, punt to workqueue context when we are in a - * timer callback. - */ - if (this_cpu_read(hrtimer_running)) { - queue_work(system_dfl_wq, &t->cb.delete_work); return; - } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - /* If the timer is running on other CPU, also use a kworker to - * wait for the completion of the timer instead of trying to - * acquire a sleepable lock in hrtimer_cancel() to wait for its - * completion. - */ - if (hrtimer_try_to_cancel(&t->timer) >= 0) - call_rcu(&t->cb.rcu, bpf_async_cb_rcu_free); - else - queue_work(system_dfl_wq, &t->cb.delete_work); + cb = xchg(&async->cb, NULL); + if (!cb) + return; + + /* + * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last + * refcnt. Either synchronously or asynchronously in irq_work. + */ + + if (!in_hardirq()) { + bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0); } else { - bpf_timer_delete_work(&t->cb.delete_work); + (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); + /* + * bpf_async_schedule_op() either enqueues allocated cmd into llist + * or fails with ENOMEM and drop the last refcnt. + * This is unlikely, but safe, since bpf_async_cb_rcu_tasks_trace_free() + * callback will do additional timer/wq_cancel due to races anyway. + */ } } @@ -1617,33 +1669,16 @@ static void bpf_timer_delete(struct bpf_hrtimer *t) */ void bpf_timer_cancel_and_free(void *val) { - struct bpf_hrtimer *t; - - t = (struct bpf_hrtimer *)__bpf_async_cancel_and_free(val); - if (!t) - return; - - bpf_timer_delete(t); + bpf_async_cancel_and_free(val); } -/* This function is called by map_delete/update_elem for individual element and +/* + * This function is called by map_delete/update_elem for individual element and * by ops->map_release_uref when the user space reference to a map reaches zero. */ void bpf_wq_cancel_and_free(void *val) { - struct bpf_work *work; - - BTF_TYPE_EMIT(struct bpf_wq); - - work = (struct bpf_work *)__bpf_async_cancel_and_free(val); - if (!work) - return; - /* Trigger cancel of the sleepable work, but *do not* wait for - * it to finish if it was running as we might not be in a - * sleepable context. - * kfree will be called once the work has finished. - */ - schedule_work(&work->delete_work); + bpf_async_cancel_and_free(val); } BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr) @@ -3116,16 +3151,23 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) struct bpf_async_kern *async = (struct bpf_async_kern *)wq; struct bpf_work *w; - if (in_nmi()) - return -EOPNOTSUPP; if (flags) return -EINVAL; + w = READ_ONCE(async->work); if (!w || !READ_ONCE(w->cb.prog)) return -EINVAL; - schedule_work(&w->work); - return 0; + if (!refcount_inc_not_zero(&w->cb.refcnt)) + return -ENOENT; + + if (!in_hardirq()) { + schedule_work(&w->work); + bpf_async_refcount_put(&w->cb); + return 0; + } else { + return bpf_async_schedule_op(&w->cb, BPF_ASYNC_START, 0, 0); + } } __bpf_kfunc int bpf_wq_set_callback(struct bpf_wq *wq, From 19bd300e22c2be7df838fc4ea41b4e20ccd5c20f Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sat, 31 Jan 2026 18:53:56 -0800 Subject: [PATCH 226/268] bpf: Add verifier support for bpf_timer argument in kfuncs Extend the verifier to recognize struct bpf_timer as a valid kfunc argument type. Previously, bpf_timer was only supported in BPF helpers. This prepares for adding timer-related kfuncs in subsequent patches. Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-3-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 55 +++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a616dc4dc54..40a8252140fb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8675,13 +8675,25 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, } static int process_timer_func(struct bpf_verifier_env *env, int regno, - struct bpf_call_arg_meta *meta) + struct bpf_map_desc *map) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - return check_map_field_pointer(env, regno, BPF_TIMER, &meta->map); + return check_map_field_pointer(env, regno, BPF_TIMER, map); +} + +static int process_timer_helper(struct bpf_verifier_env *env, int regno, + struct bpf_call_arg_meta *meta) +{ + return process_timer_func(env, regno, &meta->map); +} + +static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + return process_timer_func(env, regno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -9973,7 +9985,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } break; case ARG_PTR_TO_TIMER: - err = process_timer_func(env, regno, meta); + err = process_timer_helper(env, regno, meta); if (err) return err; break; @@ -12238,7 +12250,8 @@ enum { KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, KF_ARG_TASK_WORK_ID, - KF_ARG_PROG_AUX_ID + KF_ARG_PROG_AUX_ID, + KF_ARG_TIMER_ID }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12251,6 +12264,7 @@ BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) BTF_ID(struct, bpf_task_work) BTF_ID(struct, bpf_prog_aux) +BTF_ID(struct, bpf_timer) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12294,6 +12308,11 @@ static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_par return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); } +static bool is_kfunc_arg_timer(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TIMER_ID); +} + static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); @@ -12393,6 +12412,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_NULL, KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, + KF_ARG_PTR_TO_TIMER, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, @@ -12646,6 +12666,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_timer(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TIMER; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) return KF_ARG_PTR_TO_TASK_WORK; @@ -13439,6 +13462,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TIMER: case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: @@ -13738,6 +13762,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_TIMER: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_timer_kfunc(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_TASK_WORK: if (reg->type != PTR_TO_MAP_VALUE) { verbose(env, "arg#%d doesn't point to a map value\n", i); @@ -21429,20 +21462,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (btf_record_has_field(map->record, BPF_TIMER)) { - if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing progs cannot use bpf_timer yet\n"); - return -EINVAL; - } - } - - if (btf_record_has_field(map->record, BPF_WORKQUEUE)) { - if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing progs cannot use bpf_wq yet\n"); - return -EINVAL; - } - } - if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); From a7e172aa4ca276d12fe87ffddff9cbd2d95ea51c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 31 Jan 2026 18:53:57 -0800 Subject: [PATCH 227/268] bpf: Introduce bpf_timer_cancel_async() kfunc Introduce bpf_timer_cancel_async() that wraps hrtimer_try_to_cancel() and executes it either synchronously or defers to irq_work. Co-developed-by: Mykyta Yatsenko Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-4-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 48 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 354aa607df3e..d4aedac14a60 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4426,6 +4426,53 @@ __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) return 0; } +/** + * bpf_timer_cancel_async - try to deactivate a timer + * @timer: bpf_timer to stop + * + * Returns: + * + * * 0 when the timer was not active + * * 1 when the timer was active + * * -1 when the timer is currently executing the callback function and + * cannot be stopped + * * -ECANCELED when the timer will be cancelled asynchronously + * * -ENOMEM when out of memory + * * -EINVAL when the timer was not initialized + * * -ENOENT when this kfunc is racing with timer deletion + */ +__bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer) +{ + struct bpf_async_kern *async = (void *)timer; + struct bpf_async_cb *cb; + int ret; + + cb = READ_ONCE(async->cb); + if (!cb) + return -EINVAL; + + /* + * Unlike hrtimer_start() it's ok to synchronously call + * hrtimer_try_to_cancel() when refcnt reached zero, but deferring to + * irq_work is not, since irq callback may execute after RCU GP and + * cb could be freed at that time. Check for refcnt zero for + * consistency. + */ + if (!refcount_inc_not_zero(&cb->refcnt)) + return -ENOENT; + + if (!in_hardirq()) { + struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); + + ret = hrtimer_try_to_cancel(&t->timer); + bpf_async_refcount_put(cb); + return ret; + } else { + ret = bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); + return ret ? ret : -ECANCELED; + } +} + __bpf_kfunc_end_defs(); static void bpf_task_work_cancel_scheduled(struct irq_work *irq_work) @@ -4609,6 +4656,7 @@ BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) BTF_ID_FLAGS(func, bpf_dynptr_file_discard) +BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { From 10653c0dd86812981a9cf7112f0711f9f1f153a8 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sat, 31 Jan 2026 18:53:58 -0800 Subject: [PATCH 228/268] selftests/bpf: Refactor timer selftests Refactor timer selftests, extracting stress test into a separate test. This makes it easier to debug test failures and allows to extend. Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-5-alexei.starovoitov@gmail.com --- .../testing/selftests/bpf/prog_tests/timer.c | 55 ++++++++++++------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index 34f9ccce2602..4d853d1bd2a7 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -22,13 +22,35 @@ static void *spin_lock_thread(void *arg) pthread_exit(arg); } -static int timer(struct timer *timer_skel) + +static int timer_stress(struct timer *timer_skel) { - int i, err, prog_fd; + int i, err = 1, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts); pthread_t thread_id[NUM_THR]; void *ret; + prog_fd = bpf_program__fd(timer_skel->progs.race); + for (i = 0; i < NUM_THR; i++) { + err = pthread_create(&thread_id[i], NULL, + &spin_lock_thread, &prog_fd); + if (!ASSERT_OK(err, "pthread_create")) + break; + } + + while (i) { + err = pthread_join(thread_id[--i], &ret); + if (ASSERT_OK(err, "pthread_join")) + ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join"); + } + return err; +} + +static int timer(struct timer *timer_skel) +{ + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts); + err = timer__attach(timer_skel); if (!ASSERT_OK(err, "timer_attach")) return err; @@ -63,25 +85,10 @@ static int timer(struct timer *timer_skel) /* check that code paths completed */ ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); - prog_fd = bpf_program__fd(timer_skel->progs.race); - for (i = 0; i < NUM_THR; i++) { - err = pthread_create(&thread_id[i], NULL, - &spin_lock_thread, &prog_fd); - if (!ASSERT_OK(err, "pthread_create")) - break; - } - - while (i) { - err = pthread_join(thread_id[--i], &ret); - if (ASSERT_OK(err, "pthread_join")) - ASSERT_EQ(ret, (void *)&prog_fd, "pthread_join"); - } - return 0; } -/* TODO: use pid filtering */ -void serial_test_timer(void) +static void test_timer(int (*timer_test_fn)(struct timer *timer_skel)) { struct timer *timer_skel = NULL; int err; @@ -94,13 +101,23 @@ void serial_test_timer(void) if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load")) return; - err = timer(timer_skel); + err = timer_test_fn(timer_skel); ASSERT_OK(err, "timer"); timer__destroy(timer_skel); +} + +void serial_test_timer(void) +{ + test_timer(timer); RUN_TESTS(timer_failure); } +void serial_test_timer_stress(void) +{ + test_timer(timer_stress); +} + void test_timer_interrupt(void) { struct timer_interrupt *skel = NULL; From d02fdd7195ca24005e36a6f7efed41f9c0ca7f72 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sat, 31 Jan 2026 18:53:59 -0800 Subject: [PATCH 229/268] selftests/bpf: Add stress test for timer async cancel Extend BPF timer selftest to run stress test for async cancel. Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-6-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/prog_tests/timer.c | 18 +++++++++++++++++- tools/testing/selftests/bpf/progs/timer.c | 14 +++++++++++--- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index 4d853d1bd2a7..a157a2a699e6 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -23,13 +23,14 @@ static void *spin_lock_thread(void *arg) } -static int timer_stress(struct timer *timer_skel) +static int timer_stress_runner(struct timer *timer_skel, bool async_cancel) { int i, err = 1, prog_fd; LIBBPF_OPTS(bpf_test_run_opts, topts); pthread_t thread_id[NUM_THR]; void *ret; + timer_skel->bss->async_cancel = async_cancel; prog_fd = bpf_program__fd(timer_skel->progs.race); for (i = 0; i < NUM_THR; i++) { err = pthread_create(&thread_id[i], NULL, @@ -46,6 +47,16 @@ static int timer_stress(struct timer *timer_skel) return err; } +static int timer_stress(struct timer *timer_skel) +{ + return timer_stress_runner(timer_skel, false); +} + +static int timer_stress_async_cancel(struct timer *timer_skel) +{ + return timer_stress_runner(timer_skel, true); +} + static int timer(struct timer *timer_skel) { int err, prog_fd; @@ -118,6 +129,11 @@ void serial_test_timer_stress(void) test_timer(timer_stress); } +void serial_test_timer_stress_async_cancel(void) +{ + test_timer(timer_stress_async_cancel); +} + void test_timer_interrupt(void) { struct timer_interrupt *skel = NULL; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index 4c677c001258..a81413514e4b 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -1,13 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ -#include -#include + +#include #include #include #include #include +#define CLOCK_MONOTONIC 1 +#define CLOCK_BOOTTIME 7 + char _license[] SEC("license") = "GPL"; + struct hmap_elem { int counter; struct bpf_timer timer; @@ -63,6 +67,7 @@ __u64 callback_check = 52; __u64 callback2_check = 52; __u64 pinned_callback_check; __s32 pinned_cpu; +bool async_cancel = 0; #define ARRAY 1 #define HTAB 2 @@ -419,7 +424,10 @@ int race(void *ctx) bpf_timer_set_callback(timer, race_timer_callback); bpf_timer_start(timer, 0, 0); - bpf_timer_cancel(timer); + if (async_cancel) + bpf_timer_cancel_async(timer); + else + bpf_timer_cancel(timer); return 0; } From fe9d205cec8ccdd13171a056257101374306e802 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sat, 31 Jan 2026 18:54:00 -0800 Subject: [PATCH 230/268] selftests/bpf: Verify bpf_timer_cancel_async works Add test that verifies that bpf_timer_cancel_async works: can cancel callback successfully. Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-7-alexei.starovoitov@gmail.com --- .../testing/selftests/bpf/prog_tests/timer.c | 25 +++++++++++++++++++ tools/testing/selftests/bpf/progs/timer.c | 23 +++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index a157a2a699e6..2b932d4dfd43 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -99,6 +99,26 @@ static int timer(struct timer *timer_skel) return 0; } +static int timer_cancel_async(struct timer *timer_skel) +{ + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts); + + prog_fd = bpf_program__fd(timer_skel->progs.test_async_cancel_succeed); + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run"); + ASSERT_EQ(topts.retval, 0, "test_run"); + + usleep(500); + /* check that there were no errors in timer execution */ + ASSERT_EQ(timer_skel->bss->err, 0, "err"); + + /* check that code paths completed */ + ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok"); + + return 0; +} + static void test_timer(int (*timer_test_fn)(struct timer *timer_skel)) { struct timer *timer_skel = NULL; @@ -134,6 +154,11 @@ void serial_test_timer_stress_async_cancel(void) test_timer(timer_stress_async_cancel); } +void serial_test_timer_async_cancel(void) +{ + test_timer(timer_cancel_async); +} + void test_timer_interrupt(void) { struct timer_interrupt *skel = NULL; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index a81413514e4b..4b4ca781e7cd 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -169,6 +169,29 @@ int BPF_PROG2(test1, int, a) return 0; } +static int timer_error(void *map, int *key, struct bpf_timer *timer) +{ + err = 42; + return 0; +} + +SEC("syscall") +int test_async_cancel_succeed(void *ctx) +{ + struct bpf_timer *arr_timer; + int array_key = ARRAY; + + arr_timer = bpf_map_lookup_elem(&array, &array_key); + if (!arr_timer) + return 0; + bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC); + bpf_timer_set_callback(arr_timer, timer_error); + bpf_timer_start(arr_timer, 100000 /* 100us */, 0); + bpf_timer_cancel_async(arr_timer); + ok = 7; + return 0; +} + /* callback for prealloc and non-prealloca hashtab timers */ static int timer_cb2(void *map, int *key, struct hmap_elem *val) { From 083c5a4babad4af89dd07e2cc5f7004343d4c1f0 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sat, 31 Jan 2026 18:54:01 -0800 Subject: [PATCH 231/268] selftests/bpf: Add timer stress test in NMI context Add stress tests for BPF timers that run in NMI context using perf_event programs attached to PERF_COUNT_HW_CPU_CYCLES. The tests cover three scenarios: - nmi_race: Tests concurrent timer start and async cancel operations - nmi_update: Tests updating a map element (effectively deleting and inserting new for array map) from within a timer callback - nmi_cancel: Tests timer self-cancellation attempt. A common test_common() helper is used to share timer setup logic across all test modes. The tests spawn multiple threads in a child process to generate perf events, which trigger the BPF programs in NMI context. Hit counters verify that the NMI code paths were actually exercised. Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-8-alexei.starovoitov@gmail.com --- .../testing/selftests/bpf/prog_tests/timer.c | 158 ++++++++++++++++++ tools/testing/selftests/bpf/progs/timer.c | 87 ++++++++-- 2 files changed, 232 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c index 2b932d4dfd43..09ff21e1ad2f 100644 --- a/tools/testing/selftests/bpf/prog_tests/timer.c +++ b/tools/testing/selftests/bpf/prog_tests/timer.c @@ -1,12 +1,27 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Facebook */ +#include #include +#include +#include #include "timer.skel.h" #include "timer_failure.skel.h" #include "timer_interrupt.skel.h" #define NUM_THR 8 +static int perf_event_open(__u32 type, __u64 config, int pid, int cpu) +{ + struct perf_event_attr attr = { + .type = type, + .config = config, + .size = sizeof(struct perf_event_attr), + .sample_period = 10000, + }; + + return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0); +} + static void *spin_lock_thread(void *arg) { int i, err, prog_fd = *(int *)arg; @@ -57,6 +72,134 @@ static int timer_stress_async_cancel(struct timer *timer_skel) return timer_stress_runner(timer_skel, true); } +static void *nmi_cpu_worker(void *arg) +{ + volatile __u64 num = 1; + int i; + + for (i = 0; i < 500000000; ++i) + num *= (i % 7) + 1; + (void)num; + + return NULL; +} + +static int run_nmi_test(struct timer *timer_skel, struct bpf_program *prog) +{ + struct bpf_link *link = NULL; + int pe_fd = -1, pipefd[2] = {-1, -1}, pid = 0, status; + char buf = 0; + int ret = -1; + + if (!ASSERT_OK(pipe(pipefd), "pipe")) + goto cleanup; + + pid = fork(); + if (pid == 0) { + /* Child: spawn multiple threads to consume multiple CPUs */ + pthread_t threads[NUM_THR]; + int i; + + close(pipefd[1]); + read(pipefd[0], &buf, 1); + close(pipefd[0]); + + for (i = 0; i < NUM_THR; i++) + pthread_create(&threads[i], NULL, nmi_cpu_worker, NULL); + for (i = 0; i < NUM_THR; i++) + pthread_join(threads[i], NULL); + exit(0); + } + + if (!ASSERT_GE(pid, 0, "fork")) + goto cleanup; + + /* Open perf event for child process across all CPUs */ + pe_fd = perf_event_open(PERF_TYPE_HARDWARE, + PERF_COUNT_HW_CPU_CYCLES, + pid, /* measure child process */ + -1); /* on any CPU */ + if (pe_fd < 0) { + if (errno == ENOENT || errno == EOPNOTSUPP) { + printf("SKIP:no PERF_COUNT_HW_CPU_CYCLES\n"); + test__skip(); + ret = EOPNOTSUPP; + goto cleanup; + } + ASSERT_GE(pe_fd, 0, "perf_event_open"); + goto cleanup; + } + + link = bpf_program__attach_perf_event(prog, pe_fd); + if (!ASSERT_OK_PTR(link, "attach_perf_event")) + goto cleanup; + pe_fd = -1; /* Ownership transferred to link */ + + /* Signal child to start CPU work */ + close(pipefd[0]); + pipefd[0] = -1; + write(pipefd[1], &buf, 1); + close(pipefd[1]); + pipefd[1] = -1; + + waitpid(pid, &status, 0); + pid = 0; + + /* Verify NMI context was hit */ + ASSERT_GT(timer_skel->bss->test_hits, 0, "test_hits"); + ret = 0; + +cleanup: + bpf_link__destroy(link); + if (pe_fd >= 0) + close(pe_fd); + if (pid > 0) { + write(pipefd[1], &buf, 1); + waitpid(pid, &status, 0); + } + if (pipefd[0] >= 0) + close(pipefd[0]); + if (pipefd[1] >= 0) + close(pipefd[1]); + return ret; +} + +static int timer_stress_nmi_race(struct timer *timer_skel) +{ + int err; + + err = run_nmi_test(timer_skel, timer_skel->progs.nmi_race); + if (err == EOPNOTSUPP) + return 0; + return err; +} + +static int timer_stress_nmi_update(struct timer *timer_skel) +{ + int err; + + err = run_nmi_test(timer_skel, timer_skel->progs.nmi_update); + if (err == EOPNOTSUPP) + return 0; + if (err) + return err; + ASSERT_GT(timer_skel->bss->update_hits, 0, "update_hits"); + return 0; +} + +static int timer_stress_nmi_cancel(struct timer *timer_skel) +{ + int err; + + err = run_nmi_test(timer_skel, timer_skel->progs.nmi_cancel); + if (err == EOPNOTSUPP) + return 0; + if (err) + return err; + ASSERT_GT(timer_skel->bss->cancel_hits, 0, "cancel_hits"); + return 0; +} + static int timer(struct timer *timer_skel) { int err, prog_fd; @@ -159,6 +302,21 @@ void serial_test_timer_async_cancel(void) test_timer(timer_cancel_async); } +void serial_test_timer_stress_nmi_race(void) +{ + test_timer(timer_stress_nmi_race); +} + +void serial_test_timer_stress_nmi_update(void) +{ + test_timer(timer_stress_nmi_update); +} + +void serial_test_timer_stress_nmi_cancel(void) +{ + test_timer(timer_stress_nmi_cancel); +} + void test_timer_interrupt(void) { struct timer_interrupt *skel = NULL; diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c index 4b4ca781e7cd..d6d5fefcd9b1 100644 --- a/tools/testing/selftests/bpf/progs/timer.c +++ b/tools/testing/selftests/bpf/progs/timer.c @@ -63,6 +63,9 @@ __u64 bss_data; __u64 abs_data; __u64 err; __u64 ok; +__u64 test_hits; +__u64 update_hits; +__u64 cancel_hits; __u64 callback_check = 52; __u64 callback2_check = 52; __u64 pinned_callback_check; @@ -427,30 +430,88 @@ static int race_timer_callback(void *race_array, int *race_key, struct bpf_timer return 0; } -SEC("syscall") -int race(void *ctx) +/* Callback that updates its own map element */ +static int update_self_callback(void *map, int *key, struct bpf_timer *timer) +{ + struct elem init = {}; + + bpf_map_update_elem(map, key, &init, BPF_ANY); + __sync_fetch_and_add(&update_hits, 1); + return 0; +} + +/* Callback that cancels itself using async cancel */ +static int cancel_self_callback(void *map, int *key, struct bpf_timer *timer) +{ + bpf_timer_cancel_async(timer); + __sync_fetch_and_add(&cancel_hits, 1); + return 0; +} + +enum test_mode { + TEST_RACE_SYNC, + TEST_RACE_ASYNC, + TEST_UPDATE, + TEST_CANCEL, +}; + +static __always_inline int test_common(enum test_mode mode) { struct bpf_timer *timer; - int err, race_key = 0; struct elem init; + int ret, key = 0; __builtin_memset(&init, 0, sizeof(struct elem)); - bpf_map_update_elem(&race_array, &race_key, &init, BPF_ANY); - timer = bpf_map_lookup_elem(&race_array, &race_key); + bpf_map_update_elem(&race_array, &key, &init, BPF_ANY); + timer = bpf_map_lookup_elem(&race_array, &key); if (!timer) - return 1; + return 0; - err = bpf_timer_init(timer, &race_array, CLOCK_MONOTONIC); - if (err && err != -EBUSY) - return 1; + ret = bpf_timer_init(timer, &race_array, CLOCK_MONOTONIC); + if (ret && ret != -EBUSY) + return 0; - bpf_timer_set_callback(timer, race_timer_callback); - bpf_timer_start(timer, 0, 0); - if (async_cancel) - bpf_timer_cancel_async(timer); + if (mode == TEST_RACE_SYNC || mode == TEST_RACE_ASYNC) + bpf_timer_set_callback(timer, race_timer_callback); + else if (mode == TEST_UPDATE) + bpf_timer_set_callback(timer, update_self_callback); else + bpf_timer_set_callback(timer, cancel_self_callback); + + bpf_timer_start(timer, 0, 0); + + if (mode == TEST_RACE_ASYNC) + bpf_timer_cancel_async(timer); + else if (mode == TEST_RACE_SYNC) bpf_timer_cancel(timer); return 0; } + +SEC("syscall") +int race(void *ctx) +{ + return test_common(async_cancel ? TEST_RACE_ASYNC : TEST_RACE_SYNC); +} + +SEC("perf_event") +int nmi_race(void *ctx) +{ + __sync_fetch_and_add(&test_hits, 1); + return test_common(TEST_RACE_ASYNC); +} + +SEC("perf_event") +int nmi_update(void *ctx) +{ + __sync_fetch_and_add(&test_hits, 1); + return test_common(TEST_UPDATE); +} + +SEC("perf_event") +int nmi_cancel(void *ctx) +{ + __sync_fetch_and_add(&test_hits, 1); + return test_common(TEST_CANCEL); +} From 3f7a8415209eeca4b1fda2a57f9d7ec49413e2eb Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sat, 31 Jan 2026 18:54:02 -0800 Subject: [PATCH 232/268] selftests/bpf: Removed obsolete tests Now bpf_timer can be used in tracepoints, so these tests are no longer relevant. Signed-off-by: Mykyta Yatsenko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-9-alexei.starovoitov@gmail.com --- .../bpf/progs/verifier_helper_restricted.c | 111 ------------------ 1 file changed, 111 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c index 059aa716e3d0..889c9b78b912 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_restricted.c @@ -17,17 +17,6 @@ struct { __type(value, struct val); } map_spin_lock SEC(".maps"); -struct timer { - struct bpf_timer t; -}; - -struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 1); - __type(key, int); - __type(value, struct timer); -} map_timer SEC(".maps"); - SEC("kprobe") __description("bpf_ktime_get_coarse_ns is forbidden in BPF_PROG_TYPE_KPROBE") __failure __msg("program of this type cannot use helper bpf_ktime_get_coarse_ns") @@ -84,106 +73,6 @@ __naked void bpf_prog_type_raw_tracepoint_1(void) : __clobber_all); } -SEC("kprobe") -__description("bpf_timer_init isn restricted in BPF_PROG_TYPE_KPROBE") -__failure __msg("tracing progs cannot use bpf_timer yet") -__naked void in_bpf_prog_type_kprobe_2(void) -{ - asm volatile (" \ - r2 = r10; \ - r2 += -8; \ - r1 = 0; \ - *(u64*)(r2 + 0) = r1; \ - r1 = %[map_timer] ll; \ - call %[bpf_map_lookup_elem]; \ - if r0 == 0 goto l0_%=; \ - r1 = r0; \ - r2 = %[map_timer] ll; \ - r3 = 1; \ -l0_%=: call %[bpf_timer_init]; \ - exit; \ -" : - : __imm(bpf_map_lookup_elem), - __imm(bpf_timer_init), - __imm_addr(map_timer) - : __clobber_all); -} - -SEC("perf_event") -__description("bpf_timer_init is forbidden in BPF_PROG_TYPE_PERF_EVENT") -__failure __msg("tracing progs cannot use bpf_timer yet") -__naked void bpf_prog_type_perf_event_2(void) -{ - asm volatile (" \ - r2 = r10; \ - r2 += -8; \ - r1 = 0; \ - *(u64*)(r2 + 0) = r1; \ - r1 = %[map_timer] ll; \ - call %[bpf_map_lookup_elem]; \ - if r0 == 0 goto l0_%=; \ - r1 = r0; \ - r2 = %[map_timer] ll; \ - r3 = 1; \ -l0_%=: call %[bpf_timer_init]; \ - exit; \ -" : - : __imm(bpf_map_lookup_elem), - __imm(bpf_timer_init), - __imm_addr(map_timer) - : __clobber_all); -} - -SEC("tracepoint") -__description("bpf_timer_init is forbidden in BPF_PROG_TYPE_TRACEPOINT") -__failure __msg("tracing progs cannot use bpf_timer yet") -__naked void in_bpf_prog_type_tracepoint_2(void) -{ - asm volatile (" \ - r2 = r10; \ - r2 += -8; \ - r1 = 0; \ - *(u64*)(r2 + 0) = r1; \ - r1 = %[map_timer] ll; \ - call %[bpf_map_lookup_elem]; \ - if r0 == 0 goto l0_%=; \ - r1 = r0; \ - r2 = %[map_timer] ll; \ - r3 = 1; \ -l0_%=: call %[bpf_timer_init]; \ - exit; \ -" : - : __imm(bpf_map_lookup_elem), - __imm(bpf_timer_init), - __imm_addr(map_timer) - : __clobber_all); -} - -SEC("raw_tracepoint") -__description("bpf_timer_init is forbidden in BPF_PROG_TYPE_RAW_TRACEPOINT") -__failure __msg("tracing progs cannot use bpf_timer yet") -__naked void bpf_prog_type_raw_tracepoint_2(void) -{ - asm volatile (" \ - r2 = r10; \ - r2 += -8; \ - r1 = 0; \ - *(u64*)(r2 + 0) = r1; \ - r1 = %[map_timer] ll; \ - call %[bpf_map_lookup_elem]; \ - if r0 == 0 goto l0_%=; \ - r1 = r0; \ - r2 = %[map_timer] ll; \ - r3 = 1; \ -l0_%=: call %[bpf_timer_init]; \ - exit; \ -" : - : __imm(bpf_map_lookup_elem), - __imm(bpf_timer_init), - __imm_addr(map_timer) - : __clobber_all); -} - SEC("kprobe") __description("bpf_spin_lock is forbidden in BPF_PROG_TYPE_KPROBE") __failure __msg("tracing progs cannot use bpf_spin_lock yet") From b135beb07758a854160e5421b6f4d5bde72e0da6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 31 Jan 2026 18:54:03 -0800 Subject: [PATCH 233/268] selftests/bpf: Add a test to stress bpf_timer_start and map_delete race Add a test to stress bpf_timer_start and map_delete race Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260201025403.66625-10-alexei.starovoitov@gmail.com --- .../bpf/prog_tests/timer_start_delete_race.c | 137 ++++++++++++++++++ .../bpf/progs/timer_start_delete_race.c | 66 +++++++++ 2 files changed, 203 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c create mode 100644 tools/testing/selftests/bpf/progs/timer_start_delete_race.c diff --git a/tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c b/tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c new file mode 100644 index 000000000000..29a46e96f660 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_start_delete_race.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#define _GNU_SOURCE +#include +#include +#include +#include "timer_start_delete_race.skel.h" + +/* + * Test for race between bpf_timer_start() and map element deletion. + * + * The race scenario: + * - CPU 1: bpf_timer_start() proceeds to bpf_async_process() and is about + * to call hrtimer_start() but hasn't yet + * - CPU 2: map_delete_elem() calls __bpf_async_cancel_and_free(), since + * timer is not scheduled yet hrtimer_try_to_cancel() is a nop, + * then calls bpf_async_refcount_put() dropping refcnt to zero + * and scheduling call_rcu_tasks_trace() + * - CPU 1: continues and calls hrtimer_start() + * - After RCU tasks trace grace period: memory is freed + * - Timer callback fires on freed memory: UAF! + * + * This test stresses this race by having two threads: + * - Thread 1: repeatedly starts timers + * - Thread 2: repeatedly deletes map elements + * + * KASAN should detect use-after-free. + */ + +#define ITERATIONS 1000 + +struct ctx { + struct timer_start_delete_race *skel; + volatile bool start; + volatile bool stop; + int errors; +}; + +static void *start_timer_thread(void *arg) +{ + struct ctx *ctx = arg; + cpu_set_t cpuset; + int fd, i; + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + + while (!ctx->start && !ctx->stop) + usleep(1); + if (ctx->stop) + return NULL; + + fd = bpf_program__fd(ctx->skel->progs.start_timer); + + for (i = 0; i < ITERATIONS && !ctx->stop; i++) { + LIBBPF_OPTS(bpf_test_run_opts, opts); + int err; + + err = bpf_prog_test_run_opts(fd, &opts); + if (err || opts.retval) { + ctx->errors++; + break; + } + } + + return NULL; +} + +static void *delete_elem_thread(void *arg) +{ + struct ctx *ctx = arg; + cpu_set_t cpuset; + int fd, i; + + CPU_ZERO(&cpuset); + CPU_SET(1, &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + + while (!ctx->start && !ctx->stop) + usleep(1); + if (ctx->stop) + return NULL; + + fd = bpf_program__fd(ctx->skel->progs.delete_elem); + + for (i = 0; i < ITERATIONS && !ctx->stop; i++) { + LIBBPF_OPTS(bpf_test_run_opts, opts); + int err; + + err = bpf_prog_test_run_opts(fd, &opts); + if (err || opts.retval) { + ctx->errors++; + break; + } + } + + return NULL; +} + +void test_timer_start_delete_race(void) +{ + struct timer_start_delete_race *skel; + pthread_t threads[2]; + struct ctx ctx = {}; + int err; + + skel = timer_start_delete_race__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + ctx.skel = skel; + + err = pthread_create(&threads[0], NULL, start_timer_thread, &ctx); + if (!ASSERT_OK(err, "create start_timer_thread")) { + ctx.stop = true; + goto cleanup; + } + + err = pthread_create(&threads[1], NULL, delete_elem_thread, &ctx); + if (!ASSERT_OK(err, "create delete_elem_thread")) { + ctx.stop = true; + pthread_join(threads[0], NULL); + goto cleanup; + } + + ctx.start = true; + + pthread_join(threads[0], NULL); + pthread_join(threads[1], NULL); + + ASSERT_EQ(ctx.errors, 0, "thread_errors"); + + /* Either KASAN will catch UAF or kernel will crash or nothing happens */ +cleanup: + timer_start_delete_race__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/timer_start_delete_race.c b/tools/testing/selftests/bpf/progs/timer_start_delete_race.c new file mode 100644 index 000000000000..2ff6357f33f9 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_start_delete_race.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + +#define ITER_CNT 2000 + +struct map_value { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} timer_map SEC(".maps"); + +long cb_cnt; + +/* + * Timer callback that accesses the map value. + * If the race bug exists and this runs on freed memory, + * KASAN should detect it. + */ +static int timer_cb(void *map, int *key, struct map_value *value) +{ + __sync_fetch_and_add(&cb_cnt, 1); + return 0; +} + +SEC("syscall") +int start_timer(void *ctx) +{ + struct map_value *value; + int i; + + for (i = 0; i < ITER_CNT; i++) { + int key = 0; + + value = bpf_map_lookup_elem(&timer_map, &key); + /* hey claude, no if (!value) here. The verifier got smarter */ + + bpf_timer_init(&value->timer, &timer_map, CLOCK_MONOTONIC); + bpf_timer_set_callback(&value->timer, timer_cb); + bpf_timer_start(&value->timer, 100000000, 0); + } + return 0; +} + +SEC("syscall") +int delete_elem(void *ctx) +{ + int i; + + for (i = 0; i < ITER_CNT; i++) { + int key = 0; + + bpf_map_delete_elem(&timer_map, &key); + } + + return 0; +} + +char _license[] SEC("license") = "GPL"; From f06581392e9d56ac86d8fcc29c0931441ee82f4a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 3 Feb 2026 21:30:06 +0000 Subject: [PATCH 234/268] bpf: Use sk_is_inet() and sk_is_unix() in __cgroup_bpf_run_filter_sock_addr(). sk->sk_family should be read with READ_ONCE() in __cgroup_bpf_run_filter_sock_addr() due to IPV6_ADDRFORM. Also, the comment there is a bit stale since commit 859051dd165e ("bpf: Implement cgroup sockaddr hooks for unix sockets"), and the kdoc has the same comment. Let's use sk_is_inet() and sk_is_unix() and remove the comment. Acked-by: Stanislav Fomichev Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260203213442.682838-2-kuniyu@google.com --- kernel/bpf/cgroup.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 69988af44b37..b029f0369ecf 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1680,11 +1680,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct cgroup *cgrp; int ret; - /* Check socket family since not all sockets represent network - * endpoint (e.g. AF_UNIX). - */ - if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 && - sk->sk_family != AF_UNIX) + if (!sk_is_inet(sk) && !sk_is_unix(sk)) return 0; if (!ctx.uaddr) { From c26b098bf42c71111ca976ce330ea5c66b9a5d8e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 3 Feb 2026 21:30:07 +0000 Subject: [PATCH 235/268] bpf: Don't check sk_fullsock() in bpf_skc_to_unix_sock(). AF_UNIX does not use TCP_NEW_SYN_RECV nor TCP_TIME_WAIT and checking sk->sk_family is sufficient. Let's remove sk_fullsock() and use sk_is_unix() in bpf_skc_to_unix_sock(). Acked-by: Stanislav Fomichev Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260203213442.682838-3-kuniyu@google.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index f04982d79d72..90bfe5a6c1db 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12021,7 +12021,7 @@ BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) * trigger an explicit type generation here. */ BTF_TYPE_EMIT(struct unix_sock); - if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX) + if (sk && sk_is_unix(sk)) return (unsigned long)sk; return (unsigned long)NULL; From 5e6e1dc43a217624087ce45bafd20ac2cfb3c190 Mon Sep 17 00:00:00 2001 From: Donglin Peng Date: Mon, 2 Feb 2026 20:01:14 +0800 Subject: [PATCH 236/268] resolve_btfids: Refactor the sort_btf_by_name function Preserve original relative order of anonymous or same-named types to improve the consistency. No functional changes. Signed-off-by: Donglin Peng Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260202120114.3707141-1-dolinux.peng@gmail.com --- tools/bpf/resolve_btfids/main.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index db8d1554bdcc..ca7fcd03efb6 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -1226,22 +1226,26 @@ static int cmp_type_names(const void *a, const void *b, void *priv) const struct btf_type *ta = btf__type_by_id(btf, *(__u32 *)a); const struct btf_type *tb = btf__type_by_id(btf, *(__u32 *)b); const char *na, *nb; + int r; na = btf__str_by_offset(btf, ta->name_off); nb = btf__str_by_offset(btf, tb->name_off); - return strcmp(na, nb); + r = strcmp(na, nb); + if (r != 0) + return r; + + /* preserve original relative order of anonymous or same-named types */ + return *(__u32 *)a < *(__u32 *)b ? -1 : 1; } static int sort_btf_by_name(struct btf *btf) { __u32 *permute_ids = NULL, *id_map = NULL; int nr_types, i, err = 0; - __u32 start_id = 0, start_offs = 1, id; + __u32 start_id = 0, id; - if (btf__base_btf(btf)) { + if (btf__base_btf(btf)) start_id = btf__type_cnt(btf__base_btf(btf)); - start_offs = 0; - } nr_types = btf__type_cnt(btf) - start_id; permute_ids = calloc(nr_types, sizeof(*permute_ids)); @@ -1259,8 +1263,8 @@ static int sort_btf_by_name(struct btf *btf) for (i = 0, id = start_id; i < nr_types; i++, id++) permute_ids[i] = id; - qsort_r(permute_ids + start_offs, nr_types - start_offs, - sizeof(*permute_ids), cmp_type_names, btf); + qsort_r(permute_ids, nr_types, sizeof(*permute_ids), cmp_type_names, + btf); for (i = 0; i < nr_types; i++) { id = permute_ids[i] - start_id; From 7d49635e3775da946e536bc81ab55b2bca6b791d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Feb 2026 21:51:44 -0800 Subject: [PATCH 237/268] bpf: Tighten conditions when timer/wq can be called synchronously Though hrtimer_start/cancel() inlines all of the smaller helpers in hrtimer.c and only call timerqueue_add/del() from lib/timerqueue.c where everything is not traceable and not kprobe-able (because all files in lib/ are not traceable), there are tracepoints within hrtimer that are called with locks held. Therefore prevent the deadlock by tightening conditions when timer/wq can be called synchronously. hrtimer/wq are using raw_spin_lock_irqsave(), so irqs_disabled() is enough. Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260204055147.54960-2-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index d4aedac14a60..0517e9a8fc7c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1430,8 +1430,6 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, u64 nsec, u32 timer_mode) { - WARN_ON_ONCE(!in_hardirq()); - struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); if (!cmd) { @@ -1473,6 +1471,11 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = { .arg2_type = ARG_PTR_TO_FUNC, }; +static bool defer_timer_wq_op(void) +{ + return in_hardirq() || irqs_disabled(); +} + BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, flags) { struct bpf_hrtimer *t; @@ -1500,7 +1503,7 @@ BPF_CALL_3(bpf_timer_start, struct bpf_async_kern *, async, u64, nsecs, u64, fla if (!refcount_inc_not_zero(&t->cb.refcnt)) return -ENOENT; - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode); bpf_async_refcount_put(&t->cb); return 0; @@ -1524,7 +1527,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, async) bool inc = false; int ret = 0; - if (in_hardirq()) + if (defer_timer_wq_op()) return -EOPNOTSUPP; t = READ_ONCE(async->timer); @@ -1650,7 +1653,7 @@ static void bpf_async_cancel_and_free(struct bpf_async_kern *async) * refcnt. Either synchronously or asynchronously in irq_work. */ - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { bpf_async_process_op(cb, BPF_ASYNC_CANCEL, 0, 0); } else { (void)bpf_async_schedule_op(cb, BPF_ASYNC_CANCEL, 0, 0); @@ -3161,7 +3164,7 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) if (!refcount_inc_not_zero(&w->cb.refcnt)) return -ENOENT; - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { schedule_work(&w->work); bpf_async_refcount_put(&w->cb); return 0; @@ -4461,7 +4464,7 @@ __bpf_kfunc int bpf_timer_cancel_async(struct bpf_timer *timer) if (!refcount_inc_not_zero(&cb->refcnt)) return -ENOENT; - if (!in_hardirq()) { + if (!defer_timer_wq_op()) { struct bpf_hrtimer *t = container_of(cb, struct bpf_hrtimer, cb); ret = hrtimer_try_to_cancel(&t->timer); From 67ee5ad27d5101be4e9e8980c0734a0423bfd0a7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Feb 2026 21:51:45 -0800 Subject: [PATCH 238/268] selftests/bpf: Add a testcase for deadlock avoidance Add a testcase that checks that deadlock avoidance is working as expected. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260204055147.54960-3-alexei.starovoitov@gmail.com --- .../bpf/prog_tests/timer_start_deadlock.c | 33 ++++++++ .../bpf/progs/timer_start_deadlock.c | 75 +++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c create mode 100644 tools/testing/selftests/bpf/progs/timer_start_deadlock.c diff --git a/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c b/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c new file mode 100644 index 000000000000..9f1f9aec8888 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/timer_start_deadlock.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include "timer_start_deadlock.skel.h" + +void test_timer_start_deadlock(void) +{ + struct timer_start_deadlock *skel; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, opts); + + skel = timer_start_deadlock__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open_and_load")) + return; + + err = timer_start_deadlock__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.start_timer); + + /* + * Run the syscall program that attempts to deadlock. + * If the kernel deadlocks, this call will never return. + */ + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_OK(err, "prog_test_run"); + ASSERT_EQ(opts.retval, 0, "prog_retval"); + + ASSERT_EQ(skel->bss->tp_called, 1, "tp_called"); +cleanup: + timer_start_deadlock__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c new file mode 100644 index 000000000000..368563747a46 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + +#define CLOCK_MONOTONIC 1 + +char _license[] SEC("license") = "GPL"; + +struct elem { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, struct elem); +} timer_map SEC(".maps"); + +volatile int in_timer_start; +volatile int tp_called; + +static int timer_cb(void *map, int *key, struct elem *value) +{ + return 0; +} + +SEC("tp_btf/hrtimer_cancel") +int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer) +{ + struct bpf_timer *timer; + static bool called = false; + int key = 0; + + if (!in_timer_start) + return 0; + + tp_called = 1; + timer = bpf_map_lookup_elem(&timer_map, &key); + + /* + * Call bpf_timer_start() from the tracepoint within hrtimer logic + * on the same timer to make sure it doesn't deadlock, + * and do it once. + */ + if (!called) { + called = true; + bpf_timer_start(timer, 1000000000, 0); + } + return 0; +} + +SEC("syscall") +int start_timer(void *ctx) +{ + struct bpf_timer *timer; + int key = 0; + + timer = bpf_map_lookup_elem(&timer_map, &key); + /* claude may complain here that there is no NULL check. Ignoring it. */ + bpf_timer_init(timer, &timer_map, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, timer_cb); + + /* + * call hrtimer_start() twice, so that 2nd call does + * remove_hrtimer() and trace_hrtimer_cancel() tracepoint. + */ + in_timer_start = 1; + bpf_timer_start(timer, 1000000000, 0); + bpf_timer_start(timer, 1000000000, 0); + in_timer_start = 0; + return 0; +} From 64873307e888505ccc45ef049dccdcfef42d2f54 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Feb 2026 21:51:46 -0800 Subject: [PATCH 239/268] bpf: Add a recursion check to prevent loops in bpf_timer Do not schedule timer/wq operation on a cpu that is in irq_work callback that is processing async_cmds queue. Otherwise the following loop is possible: bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue(). irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start(). Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260204055147.54960-4-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0517e9a8fc7c..01052f8664eb 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1427,9 +1427,23 @@ static int bpf_async_update_prog_callback(struct bpf_async_cb *cb, return 0; } +static DEFINE_PER_CPU(struct bpf_async_cb *, async_cb_running); + static int bpf_async_schedule_op(struct bpf_async_cb *cb, enum bpf_async_op op, u64 nsec, u32 timer_mode) { + /* + * Do not schedule another operation on this cpu if it's in irq_work + * callback that is processing async_cmds queue. Otherwise the following + * loop is possible: + * bpf_timer_start() -> bpf_async_schedule_op() -> irq_work_queue(). + * irqrestore -> bpf_async_irq_worker() -> tracepoint -> bpf_timer_start(). + */ + if (this_cpu_read(async_cb_running) == cb) { + bpf_async_refcount_put(cb); + return -EDEADLK; + } + struct bpf_async_cmd *cmd = kmalloc_nolock(sizeof(*cmd), 0, NUMA_NO_NODE); if (!cmd) { @@ -1628,6 +1642,7 @@ static void bpf_async_irq_worker(struct irq_work *work) return; list = llist_reverse_order(list); + this_cpu_write(async_cb_running, cb); llist_for_each_safe(pos, n, list) { struct bpf_async_cmd *cmd; @@ -1635,6 +1650,7 @@ static void bpf_async_irq_worker(struct irq_work *work) bpf_async_process_op(cb, cmd->op, cmd->nsec, cmd->mode); kfree_nolock(cmd); } + this_cpu_write(async_cb_running, NULL); } static void bpf_async_cancel_and_free(struct bpf_async_kern *async) From 6e65cf81accf908d2480739b85dba4731048290d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 3 Feb 2026 21:51:47 -0800 Subject: [PATCH 240/268] selftests/bpf: Strengthen timer_start_deadlock test Strengthen timer_start_deadlock test and check for recursion now Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260204055147.54960-5-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/progs/timer_start_deadlock.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c index 368563747a46..019518ee18cd 100644 --- a/tools/testing/selftests/bpf/progs/timer_start_deadlock.c +++ b/tools/testing/selftests/bpf/progs/timer_start_deadlock.c @@ -31,7 +31,6 @@ SEC("tp_btf/hrtimer_cancel") int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer) { struct bpf_timer *timer; - static bool called = false; int key = 0; if (!in_timer_start) @@ -42,13 +41,9 @@ int BPF_PROG(tp_hrtimer_cancel, struct hrtimer *hrtimer) /* * Call bpf_timer_start() from the tracepoint within hrtimer logic - * on the same timer to make sure it doesn't deadlock, - * and do it once. + * on the same timer to make sure it doesn't deadlock. */ - if (!called) { - called = true; - bpf_timer_start(timer, 1000000000, 0); - } + bpf_timer_start(timer, 1000000000, 0); return 0; } From 9d21199842247ab05c675fb9b6c6ca393a5c0024 Mon Sep 17 00:00:00 2001 From: Tianci Cao Date: Wed, 4 Feb 2026 19:15:02 +0800 Subject: [PATCH 241/268] bpf: Add bitwise tracking for BPF_END This patch implements bitwise tracking (tnum analysis) for BPF_END (byte swap) operation. Currently, the BPF verifier does not track value for BPF_END operation, treating the result as completely unknown. This limits the verifier's ability to prove safety of programs that perform endianness conversions, which are common in networking code. For example, the following code pattern for port number validation: int test(struct pt_regs *ctx) { __u64 x = bpf_get_prandom_u32(); x &= 0x3f00; // Range: [0, 0x3f00], var_off: (0x0; 0x3f00) x = bswap16(x); // Should swap to range [0, 0x3f], var_off: (0x0; 0x3f) if (x > 0x3f) goto trap; return 0; trap: return *(u64 *)NULL; // Should be unreachable } Currently generates verifier output: 1: (54) w0 &= 16128 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=16128,var_off=(0x0; 0x3f00)) 2: (d7) r0 = bswap16 r0 ; R0=scalar() 3: (25) if r0 > 0x3f goto pc+2 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=63,var_off=(0x0; 0x3f)) Without this patch, even though the verifier knows `x` has certain bits set, after bswap16, it loses all tracking information and treats port as having a completely unknown value [0, 65535]. According to the BPF instruction set[1], there are 3 kinds of BPF_END: 1. `bswap(16|32|64)`: opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE) - do unconditional swap 2. `le(16|32|64)`: opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE) - on big-endian: do swap - on little-endian: truncation (16/32-bit) or no-op (64-bit) 3. `be(16|32|64)`: opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) - on little-endian: do swap - on big-endian: truncation (16/32-bit) or no-op (64-bit) Since BPF_END operations are inherently bit-wise permutations, tnum (bitwise tracking) offers the most efficient and precise mechanism for value analysis. By implementing `tnum_bswap16`, `tnum_bswap32`, and `tnum_bswap64`, we can derive exact `var_off` values concisely, directly reflecting the bit-level changes. Here is the overview of changes: 1. In `tnum_bswap(16|32|64)` (kernel/bpf/tnum.c): Call `swab(16|32|64)` function on the value and mask of `var_off`, and do truncation for 16/32-bit cases. 2. In `adjust_scalar_min_max_vals` (kernel/bpf/verifier.c): Call helper function `scalar_byte_swap`. - Only do byte swap when * alu64 (unconditional swap) OR * switching between big-endian and little-endian machines. - If need do byte swap: * Firstly call `tnum_bswap(16|32|64)` to update `var_off`. * Then reset the bound since byte swap scrambles the range. - For 16/32-bit cases, truncate dst register to match the swapped size. This enables better verification of networking code that frequently uses byte swaps for protocol processing, reducing false positive rejections. [1] https://www.kernel.org/doc/Documentation/bpf/standardization/instruction-set.rst Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Yazhou Tang Signed-off-by: Yazhou Tang Signed-off-by: Tianci Cao Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204111503.77871-2-ziye@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 5 ++++ kernel/bpf/tnum.c | 16 ++++++++++++ kernel/bpf/verifier.c | 60 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/include/linux/tnum.h b/include/linux/tnum.h index c52b862dad45..fa4654ffb621 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -63,6 +63,11 @@ struct tnum tnum_union(struct tnum t1, struct tnum t2); /* Return @a with all but the lowest @size bytes cleared */ struct tnum tnum_cast(struct tnum a, u8 size); +/* Swap the bytes of a tnum */ +struct tnum tnum_bswap16(struct tnum a); +struct tnum tnum_bswap32(struct tnum a); +struct tnum tnum_bswap64(struct tnum a); + /* Returns true if @a is a known constant */ static inline bool tnum_is_const(struct tnum a) { diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index f8e70e9c3998..26fbfbb01700 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -8,6 +8,7 @@ */ #include #include +#include #define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} /* A completely unknown value */ @@ -253,3 +254,18 @@ struct tnum tnum_const_subreg(struct tnum a, u32 value) { return tnum_with_subreg(a, tnum_const(value)); } + +struct tnum tnum_bswap16(struct tnum a) +{ + return TNUM(swab16(a.value & 0xFFFF), swab16(a.mask & 0xFFFF)); +} + +struct tnum tnum_bswap32(struct tnum a) +{ + return TNUM(swab32(a.value & 0xFFFFFFFF), swab32(a.mask & 0xFFFFFFFF)); +} + +struct tnum tnum_bswap64(struct tnum a) +{ + return TNUM(swab64(a.value), swab64(a.mask)); +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 40a8252140fb..92e03a5a50f5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15832,6 +15832,48 @@ static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *insn) +{ + /* + * Byte swap operation - update var_off using tnum_bswap. + * Three cases: + * 1. bswap(16|32|64): opcode=0xd7 (BPF_END | BPF_ALU64 | BPF_TO_LE) + * unconditional swap + * 2. to_le(16|32|64): opcode=0xd4 (BPF_END | BPF_ALU | BPF_TO_LE) + * swap on big-endian, truncation or no-op on little-endian + * 3. to_be(16|32|64): opcode=0xdc (BPF_END | BPF_ALU | BPF_TO_BE) + * swap on little-endian, truncation or no-op on big-endian + */ + + bool alu64 = BPF_CLASS(insn->code) == BPF_ALU64; + bool to_le = BPF_SRC(insn->code) == BPF_TO_LE; + bool is_big_endian; +#ifdef CONFIG_CPU_BIG_ENDIAN + is_big_endian = true; +#else + is_big_endian = false; +#endif + /* Apply bswap if alu64 or switch between big-endian and little-endian machines */ + bool need_bswap = alu64 || (to_le == is_big_endian); + + if (need_bswap) { + if (insn->imm == 16) + dst_reg->var_off = tnum_bswap16(dst_reg->var_off); + else if (insn->imm == 32) + dst_reg->var_off = tnum_bswap32(dst_reg->var_off); + else if (insn->imm == 64) + dst_reg->var_off = tnum_bswap64(dst_reg->var_off); + /* + * Byteswap scrambles the range, so we must reset bounds. + * Bounds will be re-derived from the new tnum later. + */ + __mark_reg_unbounded(dst_reg); + } + /* For bswap16/32, truncate dst register to match the swapped size */ + if (insn->imm == 16 || insn->imm == 32) + coerce_reg_to_size(dst_reg, insn->imm / 8); +} + static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, const struct bpf_reg_state *src_reg) { @@ -15858,6 +15900,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_XOR: case BPF_OR: case BPF_MUL: + case BPF_END: return true; /* @@ -16047,12 +16090,23 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, else scalar_min_max_arsh(dst_reg, &src_reg); break; + case BPF_END: + scalar_byte_swap(dst_reg, insn); + break; default: break; } - /* ALU32 ops are zero extended into 64bit register */ - if (alu32) + /* + * ALU32 ops are zero extended into 64bit register. + * + * BPF_END is already handled inside the helper (truncation), + * so skip zext here to avoid unexpected zero extension. + * e.g., le64: opcode=(BPF_END|BPF_ALU|BPF_TO_LE), imm=0x40 + * This is a 64bit byte swap operation with alu32==true, + * but we should not zero extend the result. + */ + if (alu32 && opcode != BPF_END) zext_32_to_64(dst_reg); reg_bounds_sync(dst_reg); return 0; @@ -16232,7 +16286,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - if (opcode == BPF_NEG && + if ((opcode == BPF_NEG || opcode == BPF_END) && regs[insn->dst_reg].type == SCALAR_VALUE) { err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_scalar_min_max_vals(env, insn, From 56415363e02f0f561ecc5bda6a4318438f888b43 Mon Sep 17 00:00:00 2001 From: Tianci Cao Date: Wed, 4 Feb 2026 19:15:03 +0800 Subject: [PATCH 242/268] selftests/bpf: Add tests for BPF_END bitwise tracking Now BPF_END has bitwise tracking support. This patch adds selftests to cover various cases of BPF_END (`bswap(16|32|64)`, `be(16|32|64)`, `le(16|32|64)`) with bitwise propagation. This patch is based on existing `verifier_bswap.c`, and add several types of new tests: 1. Unconditional byte swap operations: - bswap16/bswap32/bswap64 with unknown bytes 2. Endian conversion operations (architecture-aware): - be16/be32/be64: convert to big-endian * on little-endian: do swap * on big-endian: truncation (16/32-bit) or no-op (64-bit) - le16/le32/le64: convert to little-endian * on big-endian: do swap * on little-endian: truncation (16/32-bit) or no-op (64-bit) Each test simulates realistic networking scenarios where a value is masked with unknown bits (e.g., var_off=(0x0; 0x3f00), range=[0,0x3f00]), then byte-swapped, and the verifier must prove the result stays within expected bounds. Specifically, these selftests are based on dead code elimination: If the BPF verifier can precisely track bitwise through byte swap operations, it can prune the trap path (invalid memory access) that should be unreachable, allowing the program to pass verification. If bitwise tracking is incorrect, the verifier cannot prove the trap is unreachable, causing verification failure. The tests use preprocessor conditionals (#ifdef __BYTE_ORDER__) to verify correct behavior on both little-endian and big-endian architectures, and require Clang 18+ for bswap instruction support. Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Co-developed-by: Yazhou Tang Signed-off-by: Yazhou Tang Signed-off-by: Tianci Cao Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204111503.77871-3-ziye@zju.edu.cn Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/verifier_bswap.c | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/verifier_bswap.c b/tools/testing/selftests/bpf/progs/verifier_bswap.c index e61755656e8d..4b779deee767 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bswap.c +++ b/tools/testing/selftests/bpf/progs/verifier_bswap.c @@ -48,6 +48,49 @@ __naked void bswap_64(void) : __clobber_all); } +#define BSWAP_RANGE_TEST(name, op, in_value, out_value) \ + SEC("socket") \ + __success __log_level(2) \ + __msg("r0 &= {{.*}}; R0=scalar({{.*}},var_off=(0x0; " #in_value "))") \ + __msg("r0 = " op " r0 {{.*}}; R0=scalar({{.*}},var_off=(0x0; " #out_value "))") \ + __naked void name(void) \ + { \ + asm volatile ( \ + "call %[bpf_get_prandom_u32];" \ + "r0 &= " #in_value ";" \ + "r0 = " op " r0;" \ + "r2 = " #out_value " ll;" \ + "if r0 > r2 goto trap_%=;" \ + "r0 = 0;" \ + "exit;" \ + "trap_%=:" \ + "r1 = 42;" \ + "r0 = *(u64 *)(r1 + 0);" \ + "exit;" \ + : \ + : __imm(bpf_get_prandom_u32) \ + : __clobber_all); \ + } + +BSWAP_RANGE_TEST(bswap16_range, "bswap16", 0x3f00, 0x3f) +BSWAP_RANGE_TEST(bswap32_range, "bswap32", 0x3f00, 0x3f0000) +BSWAP_RANGE_TEST(bswap64_range, "bswap64", 0x3f00, 0x3f000000000000) +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +BSWAP_RANGE_TEST(be16_range, "be16", 0x3f00, 0x3f) +BSWAP_RANGE_TEST(be32_range, "be32", 0x3f00, 0x3f0000) +BSWAP_RANGE_TEST(be64_range, "be64", 0x3f00, 0x3f000000000000) +BSWAP_RANGE_TEST(le16_range, "le16", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(le32_range, "le32", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(le64_range, "le64", 0x3f00, 0x3f00) +#else +BSWAP_RANGE_TEST(be16_range, "be16", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(be32_range, "be32", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(be64_range, "be64", 0x3f00, 0x3f00) +BSWAP_RANGE_TEST(le16_range, "le16", 0x3f00, 0x3f) +BSWAP_RANGE_TEST(le32_range, "le32", 0x3f00, 0x3f0000) +BSWAP_RANGE_TEST(le64_range, "le64", 0x3f00, 0x3f000000000000) +#endif + #else SEC("socket") From 7a433e519364c3c19643e5c857f4fbfaebec441c Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 4 Feb 2026 07:17:37 -0800 Subject: [PATCH 243/268] bpf: Support negative offsets, BPF_SUB, and alu32 for linked register tracking Previously, the verifier only tracked positive constant deltas between linked registers using BPF_ADD. This limitation meant patterns like: r1 = r0; r1 += -4; if r1 s>= 0 goto l0_%=; // r1 >= 0 implies r0 >= 4 // verifier couldn't propagate bounds back to r0 if r0 != 0 goto l0_%=; r0 /= 0; // Verifier thinks this is reachable l0_%=: Similar limitation exists for 32-bit registers. With this change, the verifier can now track negative deltas in reg->off enabling bound propagation for the above pattern. For alu32, we make sure the destination register has the upper 32 bits as 0s before creating the link. BPF_ADD_CONST is split into BPF_ADD_CONST64 and BPF_ADD_CONST32, the latter is used in case of alu32 and sync_linked_regs uses this to zext the result if known_reg has this flag. Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204151741.2678118-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 ++- kernel/bpf/verifier.c | 50 +++++++++++++++---- .../selftests/bpf/progs/verifier_bounds.c | 2 +- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 746025df82c8..ef8e45a362d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -147,8 +147,12 @@ struct bpf_reg_state { * registers. Example: * r1 = r2; both will have r1->id == r2->id == N * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 + * r3 = r2; both will have r3->id == r2->id == N + * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->off == 10 */ -#define BPF_ADD_CONST (1U << 31) +#define BPF_ADD_CONST64 (1U << 31) +#define BPF_ADD_CONST32 (1U << 30) +#define BPF_ADD_CONST (BPF_ADD_CONST64 | BPF_ADD_CONST32) u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned * from a pointer-cast helper, bpf_sk_fullsock() and diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92e03a5a50f5..edf5342b982f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16209,6 +16209,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, verbose(env, "verifier internal error: no src_reg\n"); return -EFAULT; } + /* + * For alu32 linked register tracking, we need to check dst_reg's + * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), + * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. + */ + u64 dst_umax = dst_reg->umax_value; + err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) return err; @@ -16218,26 +16225,44 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So for 64-bit alu remember constant delta between r2 and r1 and - * update r1 after 'if' condition. + * So remember constant delta between r2 and r1 and update r1 after + * 'if' condition. */ if (env->bpf_capable && - BPF_OP(insn->code) == BPF_ADD && !alu32 && - dst_reg->id && is_reg_const(src_reg, false)) { - u64 val = reg_const_value(src_reg, false); + (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) && + dst_reg->id && is_reg_const(src_reg, alu32)) { + u64 val = reg_const_value(src_reg, alu32); + s32 off; - if ((dst_reg->id & BPF_ADD_CONST) || - /* prevent overflow in sync_linked_regs() later */ - val > (u32)S32_MAX) { + if (!alu32 && ((s64)val < S32_MIN || (s64)val > S32_MAX)) + goto clear_id; + + if (alu32 && (dst_umax > U32_MAX)) + goto clear_id; + + off = (s32)val; + + if (BPF_OP(insn->code) == BPF_SUB) { + /* Negating S32_MIN would overflow */ + if (off == S32_MIN) + goto clear_id; + off = -off; + } + + if (dst_reg->id & BPF_ADD_CONST) { /* * If the register already went through rX += val * we cannot accumulate another val into rx->off. */ +clear_id: dst_reg->off = 0; dst_reg->id = 0; } else { - dst_reg->id |= BPF_ADD_CONST; - dst_reg->off = val; + if (alu32) + dst_reg->id |= BPF_ADD_CONST32; + else + dst_reg->id |= BPF_ADD_CONST64; + dst_reg->off = off; } } else { /* @@ -17334,7 +17359,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; - __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off); + __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off); /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); @@ -17349,6 +17374,9 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); + if (known_reg->id & BPF_ADD_CONST32) + zext_32_to_64(reg); + reg_bounds_sync(reg); } if (e->is_reg) mark_reg_scratched(env, e->regno); diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 411a18437d7e..560531404bce 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void) SEC("socket") __description("64-bit subtraction, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()") +__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar(id=1-1)") __retval(0) __naked void sub64_partial_overflow(void) { From 47fcf4dc0a346dd0b873a679c547d6848bd85a37 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Wed, 4 Feb 2026 07:17:38 -0800 Subject: [PATCH 244/268] selftests/bpf: Add tests for improved linked register tracking Add tests for linked register tracking with negative offsets, BPF_SUB, and alu32. These test for all edge cases like overflows, etc. Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260204151741.2678118-3-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- .../bpf/progs/verifier_linked_scalars.c | 303 +++++++++++++++++- 1 file changed, 301 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c index 5f41bbb730a7..2ef346c827c2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c +++ b/tools/testing/selftests/bpf/progs/verifier_linked_scalars.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include "bpf_misc.h" @@ -18,9 +19,9 @@ __naked void scalars(void) r4 = r1; \ w2 += 0x7FFFFFFF; \ w4 += 0; \ - if r2 == 0 goto l1; \ + if r2 == 0 goto l0_%=; \ exit; \ -l1: \ +l0_%=: \ r4 >>= 63; \ r3 = 1; \ r3 -= r4; \ @@ -64,4 +65,302 @@ l0_%=: \ : __clobber_all); } +SEC("socket") +__success +__naked void scalars_neg(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 += -4; \ + if r1 s< 0 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Same test but using BPF_SUB instead of BPF_ADD with negative immediate */ +SEC("socket") +__success +__naked void scalars_neg_sub(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 -= 4; \ + if r1 s< 0 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* alu32 with negative offset */ +SEC("socket") +__success +__naked void scalars_neg_alu32_add(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w0 &= 0xff; \ + w1 = w0; \ + w1 += -4; \ + if w1 s< 0 goto l0_%=; \ + if w0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* alu32 with negative offset using SUB */ +SEC("socket") +__success +__naked void scalars_neg_alu32_sub(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w0 &= 0xff; \ + w1 = w0; \ + w1 -= 4; \ + if w1 s< 0 goto l0_%=; \ + if w0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Positive offset: r1 = r0 + 4, then if r1 >= 6, r0 >= 2, so r0 != 0 */ +SEC("socket") +__success +__naked void scalars_pos(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 += 4; \ + if r1 < 6 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* SUB with negative immediate: r1 -= -4 is equivalent to r1 += 4 */ +SEC("socket") +__success +__naked void scalars_sub_neg_imm(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 -= -4; \ + if r1 < 6 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* Double ADD clears the ID (can't accumulate offsets) */ +SEC("socket") +__failure +__msg("div by zero") +__naked void scalars_double_add(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r1 += 2; \ + r1 += 2; \ + if r1 < 6 goto l0_%=; \ + if r0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* + * Test that sync_linked_regs() correctly handles large offset differences. + * r1.off = S32_MIN, r2.off = 1, delta = S32_MIN - 1 requires 64-bit math. + */ +SEC("socket") +__success +__naked void scalars_sync_delta_overflow(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r2 = r0; \ + r1 += %[s32_min]; \ + r2 += 1; \ + if r2 s< 100 goto l0_%=; \ + if r1 s< 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32), + [s32_min]"i"(INT_MIN) + : __clobber_all); +} + +/* + * Another large delta case: r1.off = S32_MAX, r2.off = -1. + * delta = S32_MAX - (-1) = S32_MAX + 1 requires 64-bit math. + */ +SEC("socket") +__success +__naked void scalars_sync_delta_overflow_large_range(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r0 &= 0xff; \ + r1 = r0; \ + r2 = r0; \ + r1 += %[s32_max]; \ + r2 += -1; \ + if r2 s< 0 goto l0_%=; \ + if r1 s>= 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32), + [s32_max]"i"(INT_MAX) + : __clobber_all); +} + +/* + * Test linked scalar tracking with alu32 and large positive offset (0x7FFFFFFF). + * After w1 += 0x7FFFFFFF, w1 wraps to negative for any r0 >= 1. + * If w1 is signed-negative, then r0 >= 1, so r0 != 0. + */ +SEC("socket") +__success +__naked void scalars_alu32_big_offset(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w0 &= 0xff; \ + w1 = w0; \ + w1 += 0x7FFFFFFF; \ + if w1 s>= 0 goto l0_%=; \ + if w0 != 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__failure +__msg("div by zero") +__naked void scalars_alu32_basic(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + r1 = r0; \ + w1 += 1; \ + if r1 > 10 goto 1f; \ + r0 >>= 32; \ + if r0 == 0 goto 1f; \ + r0 /= 0; \ +1: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +/* + * Test alu32 linked register tracking with wrapping. + * R0 is bounded to [0xffffff00, 0xffffffff] (high 32-bit values) + * w1 += 0x100 causes R1 to wrap to [0, 0xff] + * + * After sync_linked_regs, if bounds are computed correctly: + * R0 should be [0x00000000_ffffff00, 0x00000000_ffffff80] + * R0 >> 32 == 0, so div by zero is unreachable + * + * If bounds are computed incorrectly (64-bit underflow): + * R0 becomes [0xffffffff_ffffff00, 0xffffffff_ffffff80] + * R0 >> 32 == 0xffffffff != 0, so div by zero is reachable + */ +SEC("socket") +__success +__naked void scalars_alu32_wrap(void) +{ + asm volatile (" \ + call %[bpf_get_prandom_u32]; \ + w0 |= 0xffffff00; \ + r1 = r0; \ + w1 += 0x100; \ + if r1 > 0x80 goto l0_%=; \ + r2 = r0; \ + r2 >>= 32; \ + if r2 == 0 goto l0_%=; \ + r0 /= 0; \ +l0_%=: \ + r0 = 0; \ + exit; \ +" : + : __imm(bpf_get_prandom_u32) + : __clobber_all); +} + +SEC("socket") +__success +void alu32_negative_offset(void) +{ + volatile char path[5]; + volatile int offset = bpf_get_prandom_u32(); + int off = offset; + + if (off >= 5 && off < 10) + path[off - 5] = '.'; + + /* So compiler doesn't say: error: variable 'path' set but not used */ + __sink(path[0]); +} + char _license[] SEC("license") = "GPL"; From 81502d7f20bf862b706f5174979bed88d3ab82b3 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 4 Feb 2026 16:38:52 -0800 Subject: [PATCH 245/268] bpf: Check for running wq callback when freeing bpf_async_cb When freeing a bpf_async_cb in bpf_async_cb_rcu_tasks_trace_free(), in case the wq callback is not scheduled, doing cancel_work() currently returns false and leads to retry of RCU tasks trace grace period. If the callback is never scheduled, we keep retrying indefinitely and don't put the prog reference. Since the only race we care about here is against a potentially running wq callback in the first grace period, it should finish by the second grace period, hence check work_busy() result to detect presence of running wq callback if it's not pending, otherwise free the object immediately without retrying. Reasoning behind the check and its correctness with racing wq callback invocation: cancel_work is supposed to be synchronized, hence calling it first and getting false would mean that work is definitely not pending, at this point, either the work is not scheduled at all or already running, or we race and it already finished by the time we checked for it using work_busy(). In case it is running, we synchronize using pool->lock to check the current work running there, if we match, it means we extend the wait by another grace period using retry = true, otherwise either the work already finished running or was never scheduled, so we can free the bpf_async_cb right away. Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Reported-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260205003853.527571-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01052f8664eb..b7aec34540c2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1257,7 +1257,7 @@ static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) retry = true; break; case BPF_ASYNC_TYPE_WQ: - if (!cancel_work(&w->work)) + if (!cancel_work(&w->work) && work_busy(&w->work)) retry = true; break; } From 5000a097f82c7695b7760c5b67c95f0eab4d209b Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 4 Feb 2026 16:38:53 -0800 Subject: [PATCH 246/268] bpf: Reset prog callback in bpf_async_cancel_and_free() Replace prog and callback in bpf_async_cb after removing visibility of bpf_async_cb in bpf_async_cancel_and_free() to increase the chances the scheduled async callbacks short-circuit execution and exit early, and not starting a RCU tasks trace section. This improves the overall time spent in running the wq selftest. Suggested-by: Alexei Starovoitov Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260205003853.527571-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b7aec34540c2..a4f039cee88b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1664,6 +1664,7 @@ static void bpf_async_cancel_and_free(struct bpf_async_kern *async) if (!cb) return; + bpf_async_update_prog_callback(cb, NULL, NULL); /* * No refcount_inc_not_zero(&cb->refcnt) here. Dropping the last * refcnt. Either synchronously or asynchronously in irq_work. From ea1535e28bb3773fc0b3cbd1f3842b808016990c Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 5 Feb 2026 07:38:07 +0100 Subject: [PATCH 247/268] bpf: Limit bpf program signature size Practical BPF signatures are significantly smaller than KMALLOC_MAX_CACHE_SIZE Allowing larger sizes opens the door for abuse by passing excessive size values and forcing the kernel into expensive allocation paths (via kmalloc_large or vmalloc). Fixes: 349271568303 ("bpf: Implement signature verification for BPF programs") Reported-by: Chris Mason Signed-off-by: KP Singh Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260205063807.690823-1-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5f59dd47a5b1..93bc0f4c65c5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2813,6 +2813,13 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr void *sig; int err = 0; + /* + * Don't attempt to use kmalloc_large or vmalloc for signatures. + * Practical signature for BPF program should be below this limit. + */ + if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE) + return -EINVAL; + if (system_keyring_id_check(attr->keyring_id) == 0) key = bpf_lookup_system_key(attr->keyring_id); else From a2c86aa621c22f2a7e26c654f936d65cfff0aa91 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 5 Feb 2026 08:07:55 +0100 Subject: [PATCH 248/268] bpf: Require frozen map for calculating map hash Currently, bpf_map_get_info_by_fd calculates and caches the hash of the map regardless of the map's frozen state. This leads to a TOCTOU bug where userspace can call BPF_OBJ_GET_INFO_BY_FD to cache the hash and then modify the map contents before freezing. Therefore, a trusted loader can be tricked into verifying the stale hash while loading the modified contents. Fix this by returning -EPERM if the map is not frozen when the hash is requested. This ensures the hash is only generated for the final, immutable state of the map. Fixes: ea2e6467ac36 ("bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD") Reported-by: Toshi Piazza Signed-off-by: KP Singh Acked-by: Daniel Borkmann Link: https://lore.kernel.org/r/20260205070755.695776-1-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 93bc0f4c65c5..683c332dbafb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5328,6 +5328,9 @@ static int bpf_map_get_info_by_fd(struct file *file, if (info.hash_size != SHA256_DIGEST_SIZE) return -EINVAL; + if (!READ_ONCE(map->frozen)) + return -EPERM; + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); if (err != 0) return err; From 1ace9bac1ad2bc6a0a70baaa16d22b7e783e88c5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 5 Feb 2026 11:02:33 -0800 Subject: [PATCH 249/268] bpf: Prevent reentrance into call_rcu_tasks_trace() call_rcu_tasks_trace() is not safe from in_nmi() and not reentrant. To prevent deadlock on raw_spin_lock_rcu_node(rtpcp) or memory corruption defer to irq_work when IRQs are disabled. call_rcu_tasks_generic() protects itself with local_irq_save(). Note when bpf_async_cb->refcnt drops to zero it's safe to reuse bpf_async_cb->worker for a different irq_work callback, since bpf_async_schedule_op() -> irq_work_queue(&cb->worker); is only called when refcnt >= 1. Fixes: 1bfbc267ec91 ("bpf: Enable bpf_timer and bpf_wq in any context") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20260205190233.912-1-alexei.starovoitov@gmail.com --- kernel/bpf/helpers.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a4f039cee88b..0458597134da 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1276,12 +1276,24 @@ static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) bpf_async_cb_rcu_free(rcu); } +static void worker_for_call_rcu(struct irq_work *work) +{ + struct bpf_async_cb *cb = container_of(work, struct bpf_async_cb, worker); + + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); +} + static void bpf_async_refcount_put(struct bpf_async_cb *cb) { if (!refcount_dec_and_test(&cb->refcnt)) return; - call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); + if (irqs_disabled()) { + cb->worker = IRQ_WORK_INIT(worker_for_call_rcu); + irq_work_queue(&cb->worker); + } else { + call_rcu_tasks_trace(&cb->rcu, bpf_async_cb_rcu_tasks_trace_free); + } } static void bpf_async_cancel_and_free(struct bpf_async_kern *async); From 42e41b2a0afa04ca49ee2725aadf90ccb058ed28 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 3 Feb 2026 16:50:57 +0100 Subject: [PATCH 250/268] selftests/xsk: properly handle batch ending in the middle of a packet Referenced commit reduced the scope of the variable pkt, so now it has to be reinitialized via pkt_stream_get_next_rx_pkt(), which also increments some counters. When the packet is interrupted by the batch ending, pkt stream therefore proceeds to the next packet, while xsk ring still contains the previous one, this results in a pkt_nb mismatch. Decrement the affected counters when packet is interrupted. Fixes: 8913e653e9b8 ("selftests/xsk: Iterate over all the sockets in the receive pkts function") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://lore.kernel.org/r/20260203155103.2305816-2-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/test_xsk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c index 5af28f359cfd..69a5a9a5189b 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c @@ -1090,6 +1090,8 @@ static int __receive_pkts(struct test_spec *test, struct xsk_socket_info *xsk) xsk_ring_prod__cancel(&umem->fq, nb_frags); } frags_processed -= nb_frags; + pkt_stream_cancel(pkt_stream); + pkts_sent--; } if (ifobj->use_fill_ring) From 88af9fefed412e4bea9a1a771cbe6fe347fa3507 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 3 Feb 2026 16:50:58 +0100 Subject: [PATCH 251/268] selftests/xsk: fix number of Tx frags in invalid packet The issue occurs in TOO_MANY_FRAGS test case when xdp_zc_max_segs is set to an odd number. TOO_MANY_FRAGS test case contains an invalid packet consisting of (xdp_zc_max_segs) frags. Every frag, even the last one has XDP_PKT_CONTD flag set. This packet is expected to be dropped. After that, there is a valid linear packet, which is expected to be received back. Once (xdp_zc_max_segs) is an odd number, the last packet cannot be received, if packet forwarding between Rx and Tx interfaces relies on the ethernet header, e.g. checks for ETH_P_LOOPBACK. Packet is malformed, if all traffic is looped. Turns out, sending function processes multiple invalid frags as if they were in 2-frag packets. So once the invalid mbuf packet contains an odd number of those, the valid packet after gets paired with the previous invalid descriptor, and hence does not get an ethernet header generated, so it is either dropped or malformed. Make invalid packets in verbatim mode always have only a single frag. For such packets, number of frags is otherwise meaningless, as descriptor flags are pre-configured in verbatim mode and packet data is not generated for invalid descriptors. Fixes: 697604492b64 ("selftests/xsk: add invalid descriptor test for multi-buffer") Reviewed-by: Aleksandr Loktionov Signed-off-by: Larysa Zaremba Link: https://lore.kernel.org/r/20260203155103.2305816-3-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/test_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c index 69a5a9a5189b..bab4a31621c7 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c @@ -433,7 +433,7 @@ static u32 pkt_nb_frags(u32 frame_size, struct pkt_stream *pkt_stream, struct pk } /* Search for the end of the packet in verbatim mode */ - if (!pkt_continues(pkt->options)) + if (!pkt_continues(pkt->options) || !pkt->valid) return nb_frags; next_frag = pkt_stream->current_pkt_nb; From 0ccef7079ea8d5f7b896b14be7e400022ff240c6 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:28:59 -0800 Subject: [PATCH 252/268] bpf: Select bpf_local_storage_map_bucket based on bpf_local_storage A later bpf_local_storage refactor will acquire all locks before performing any update. To simplified the number of locks needed to take in bpf_local_storage_map_update(), determine the bucket based on the local_storage an selem belongs to instead of the selem pointer. Currently, when a new selem needs to be created to replace the old selem in bpf_local_storage_map_update(), locks of both buckets need to be acquired to prevent racing. This can be simplified if the two selem belongs to the same bucket so that only one bucket needs to be locked. Therefore, instead of hashing selem, hashing the local_storage pointer the selem belongs. Performance wise, this is slightly better as update now requires locking one bucket. It should not change the level of contention on one bucket as the pointers to local storages of selems in a map are just as unique as pointers to selems. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-2-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 1 + kernel/bpf/bpf_local_storage.c | 17 +++++++++++------ net/core/bpf_sk_storage.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 66432248cd81..2638487425b8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -179,6 +179,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index e2fe6c32822b..91b28f4e3130 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -19,9 +19,9 @@ static struct bpf_local_storage_map_bucket * select_bucket(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem) + struct bpf_local_storage *local_storage) { - return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; + return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)]; } static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) @@ -349,6 +349,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { + struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; @@ -357,8 +358,10 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) /* selem has already be unlinked from smap */ return; + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - b = select_bucket(smap, selem); + b = select_bucket(smap, local_storage); raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); @@ -366,11 +369,13 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) } void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { - struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + struct bpf_local_storage_map_bucket *b; unsigned long flags; + b = select_bucket(smap, local_storage); raw_spin_lock_irqsave(&b->lock, flags); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); @@ -448,7 +453,7 @@ int bpf_local_storage_alloc(void *owner, storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); - bpf_selem_link_map(smap, first_selem); + bpf_selem_link_map(smap, storage, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); @@ -576,7 +581,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, alloc_selem = NULL; /* First, link the new selem to the map */ - bpf_selem_link_map(smap, selem); + bpf_selem_link_map(smap, local_storage, selem); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index de111818f3a0..e36273e4fcbd 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -191,7 +191,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - bpf_selem_link_map(smap, copy_selem); + bpf_selem_link_map(smap, new_sk_storage, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); From 1b7e0cae85accc9e728004511193e995cd040300 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:00 -0800 Subject: [PATCH 253/268] bpf: Convert bpf_selem_unlink_map to failable To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock, convert bpf_selem_unlink_map() to failable. It still always succeeds and returns 0 for now. Since some operations updating local storage cannot fail in the middle, open-code bpf_selem_unlink_map() to take the b->lock before the operation. There are two such locations: - bpf_local_storage_alloc() The first selem will be unlinked from smap if cmpxchg owner_storage_ptr fails, which should not fail. Therefore, hold b->lock when linking until allocation complete. Helpers that assume b->lock is held by callers are introduced: bpf_selem_link_map_nolock() and bpf_selem_unlink_map_nolock(). - bpf_local_storage_update() The three step update process: link_map(new_selem), link_storage(new_selem), and unlink_map(old_selem) should not fail in the middle. In bpf_selem_unlink(), bpf_selem_unlink_map() and bpf_selem_unlink_storage() should either all succeed or fail as a whole instead of failing in the middle. So, return if unlink_map() failed. Remove the selem_linked_to_map_lockless() check as an selem in the common paths (not bpf_local_storage_map_free() or bpf_local_storage_destroy()), will be unlinked under b->lock and local_storage->lock and therefore no other threads can unlink the selem from map at the same time. In bpf_local_storage_destroy(), ignore the return of bpf_selem_unlink_map() for now. A later patch will allow bpf_local_storage_destroy() to unlink selems even when failing to acquire locks. Note that while this patch removes all callers of selem_linked_to_map(), a later patch that introduces bpf_selem_unlink_nofail() will use it again. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-3-ameryhung@gmail.com --- kernel/bpf/bpf_local_storage.c | 57 +++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 91b28f4e3130..6fa71502c7d7 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -61,11 +61,6 @@ static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) return !hlist_unhashed(&selem->snode); } -static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem) -{ - return !hlist_unhashed_lockless(&selem->map_node); -} - static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->map_node); @@ -347,25 +342,27 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, hlist_add_head_rcu(&selem->snode, &local_storage->list); } -static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; - if (unlikely(!selem_linked_to_map_lockless(selem))) - /* selem has already be unlinked from smap */ - return; - local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, local_storage); raw_spin_lock_irqsave(&b->lock, flags); - if (likely(selem_linked_to_map(selem))) - hlist_del_init_rcu(&selem->map_node); + hlist_del_init_rcu(&selem->map_node); raw_spin_unlock_irqrestore(&b->lock, flags); + + return 0; +} + +static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) +{ + hlist_del_init_rcu(&selem->map_node); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, @@ -381,13 +378,24 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_irqrestore(&b->lock, flags); } +static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, + struct bpf_local_storage_elem *selem) +{ + hlist_add_head_rcu(&selem->map_node, &b->list); +} + void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { + int err; + /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ - bpf_selem_unlink_map(selem); + err = bpf_selem_unlink_map(selem); + if (err) + return; + bpf_selem_unlink_storage(selem, reuse_now); } @@ -429,6 +437,8 @@ int bpf_local_storage_alloc(void *owner, { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; + struct bpf_local_storage_map_bucket *b; + unsigned long flags; int err; err = mem_charge(smap, owner, sizeof(*storage)); @@ -453,7 +463,10 @@ int bpf_local_storage_alloc(void *owner, storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); - bpf_selem_link_map(smap, storage, first_selem); + + b = select_bucket(smap, storage); + raw_spin_lock_irqsave(&b->lock, flags); + bpf_selem_link_map_nolock(b, first_selem); owner_storage_ptr = (struct bpf_local_storage **)owner_storage(smap, owner); @@ -469,10 +482,12 @@ int bpf_local_storage_alloc(void *owner, */ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { - bpf_selem_unlink_map(first_selem); + bpf_selem_unlink_map_nolock(first_selem); + raw_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } + raw_spin_unlock_irqrestore(&b->lock, flags); return 0; @@ -494,8 +509,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; + struct bpf_local_storage_map_bucket *b; HLIST_HEAD(old_selem_free_list); - unsigned long flags; + unsigned long flags, b_flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ @@ -579,20 +595,25 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } + b = select_bucket(smap, local_storage); + + raw_spin_lock_irqsave(&b->lock, b_flags); + alloc_selem = NULL; /* First, link the new selem to the map */ - bpf_selem_link_map(smap, local_storage, selem); + bpf_selem_link_map_nolock(b, selem); /* Second, link (and publish) the new selem to local_storage */ bpf_selem_link_storage_nolock(local_storage, selem); /* Third, remove old selem, SELEM(old_sdata) */ if (old_sdata) { - bpf_selem_unlink_map(SELEM(old_sdata)); + bpf_selem_unlink_map_nolock(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), &old_selem_free_list); } + raw_spin_unlock_irqrestore(&b->lock, b_flags); unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&old_selem_free_list, false); From fd103ffc57c9a3b8b76bc852ffae5eb630a6ded4 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:01 -0800 Subject: [PATCH 254/268] bpf: Convert bpf_selem_link_map to failable To prepare for changing bpf_local_storage_map_bucket::lock to rqspinlock, convert bpf_selem_link_map() to failable. It still always succeeds and returns 0 until the change happens. No functional change. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-4-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 6 +++--- kernel/bpf/bpf_local_storage.c | 8 +++++--- net/core/bpf_sk_storage.c | 9 ++++++++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 2638487425b8..709506e982a6 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -178,9 +178,9 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem); +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 6fa71502c7d7..2f94ca4a4475 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -365,9 +365,9 @@ static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem) hlist_del_init_rcu(&selem->map_node); } -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem) +int bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b; unsigned long flags; @@ -376,6 +376,8 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_lock_irqsave(&b->lock, flags); hlist_add_head_rcu(&selem->map_node, &b->list); raw_spin_unlock_irqrestore(&b->lock, flags); + + return 0; } static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index e36273e4fcbd..0b85d8f2c17e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -191,7 +191,14 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } if (new_sk_storage) { - bpf_selem_link_map(smap, new_sk_storage, copy_selem); + ret = bpf_selem_link_map(smap, new_sk_storage, copy_selem); + if (ret) { + bpf_selem_free(copy_selem, true); + atomic_sub(smap->elem_size, + &newsk->sk_omem_alloc); + bpf_map_put(map); + goto out; + } bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); From 403e935f915896243ff93f9a2ff44e5bb6619032 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:02 -0800 Subject: [PATCH 255/268] bpf: Convert bpf_selem_unlink to failable To prepare changing both bpf_local_storage_map_bucket::lock and bpf_local_storage::lock to rqspinlock, convert bpf_selem_unlink() to failable. It still always succeeds and returns 0 until the change happens. No functional change. Open code bpf_selem_unlink_storage() in the only caller, bpf_selem_unlink(), since unlink_map and unlink_storage must be done together after all the necessary locks are acquired. For bpf_local_storage_map_free(), ignore the return from bpf_selem_unlink() for now. A later patch will allow it to unlink selems even when failing to acquire locks. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-5-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 2 +- kernel/bpf/bpf_cgrp_storage.c | 3 +- kernel/bpf/bpf_inode_storage.c | 4 +- kernel/bpf/bpf_local_storage.c | 71 +++++++++++++++---------------- kernel/bpf/bpf_task_storage.c | 4 +- net/core/bpf_sk_storage.c | 4 +- 6 files changed, 39 insertions(+), 49 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 709506e982a6..f74e0f7656a1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 0687a760974a..8fef24fcac68 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -118,8 +118,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e54cce2b9175..cedc99184dad 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,9 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 2f94ca4a4475..38f5c3db2957 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -308,33 +308,6 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor return free_local_storage; } -static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, - bool reuse_now) -{ - struct bpf_local_storage *local_storage; - bool free_local_storage = false; - HLIST_HEAD(selem_free_list); - unsigned long flags; - - if (unlikely(!selem_linked_to_storage_lockless(selem))) - /* selem has already been unlinked from sk */ - return; - - local_storage = rcu_dereference_check(selem->local_storage, - bpf_rcu_lock_held()); - - raw_spin_lock_irqsave(&local_storage->lock, flags); - if (likely(selem_linked_to_storage(selem))) - free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &selem_free_list); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&selem_free_list, reuse_now); - - if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); -} - void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { @@ -386,19 +359,43 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, hlist_add_head_rcu(&selem->map_node, &b->list); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { - int err; + struct bpf_local_storage *local_storage; + bool free_local_storage = false; + HLIST_HEAD(selem_free_list); + unsigned long flags; + int err = 0; - /* Always unlink from map before unlinking from local_storage - * because selem will be freed after successfully unlinked from - * the local_storage. - */ - err = bpf_selem_unlink_map(selem); - if (err) - return; + if (unlikely(!selem_linked_to_storage_lockless(selem))) + /* selem has already been unlinked from sk */ + return 0; - bpf_selem_unlink_storage(selem, reuse_now); + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); + + raw_spin_lock_irqsave(&local_storage->lock, flags); + if (likely(selem_linked_to_storage(selem))) { + /* Always unlink from map before unlinking from local_storage + * because selem will be freed after successfully unlinked from + * the local_storage. + */ + err = bpf_selem_unlink_map(selem); + if (err) + goto out; + + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, &selem_free_list); + } +out: + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + + bpf_selem_free_list(&selem_free_list, reuse_now); + + if (free_local_storage) + bpf_local_storage_free(local_storage, reuse_now); + + return err; } void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index a1dc1bf0848a..ab902364ac23 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -167,9 +167,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!nobusy) return -EBUSY; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 0b85d8f2c17e..d7b5c4551997 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,9 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), false); - - return 0; + return bpf_selem_unlink(SELEM(sdata), false); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ From 8dabe34b9d5b1135b084268396ed2267107e64c1 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:03 -0800 Subject: [PATCH 256/268] bpf: Change local_storage->lock and b->lock to rqspinlock Change bpf_local_storage::lock and bpf_local_storage_map_bucket::lock from raw_spin_lock to rqspinlock. Finally, propagate errors from raw_res_spin_lock_irqsave() to syscall return or BPF helper return. In bpf_local_storage_destroy(), ignore return from raw_res_spin_lock_irqsave() for now. A later patch will correctly handle errors correctly in bpf_local_storage_destroy() so that it can unlink selems even when failing to acquire locks. For __bpf_local_storage_map_cache(), instead of handling the error, skip updating the cache. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-6-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 ++- kernel/bpf/bpf_local_storage.c | 64 +++++++++++++++++++++---------- 2 files changed, 47 insertions(+), 22 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index f74e0f7656a1..fa50b7afee18 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -15,12 +15,13 @@ #include #include #include +#include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 struct bpf_local_storage_map_bucket { struct hlist_head list; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* Thp map is not the primary owner of a bpf_local_storage_elem. @@ -94,7 +95,7 @@ struct bpf_local_storage { * bpf_local_storage_elem. */ struct rcu_head rcu; - raw_spinlock_t lock; /* Protect adding/removing from the "list" */ + rqspinlock_t lock; /* Protect adding/removing from the "list" */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 38f5c3db2957..1138e2293b50 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -321,14 +321,18 @@ static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err; local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -344,11 +348,16 @@ int bpf_selem_link_map(struct bpf_local_storage_map *smap, { struct bpf_local_storage_map_bucket *b; unsigned long flags; + int err; b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, flags); + + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + return err; + hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; } @@ -365,7 +374,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) bool free_local_storage = false; HLIST_HEAD(selem_free_list); unsigned long flags; - int err = 0; + int err; if (unlikely(!selem_linked_to_storage_lockless(selem))) /* selem has already been unlinked from sk */ @@ -374,7 +383,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return err; + if (likely(selem_linked_to_storage(selem))) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from @@ -388,7 +400,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) local_storage, selem, &selem_free_list); } out: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&selem_free_list, reuse_now); @@ -403,16 +415,20 @@ void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { unsigned long flags; + int err; /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). */ - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + return; + if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem)); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); } static int check_flags(const struct bpf_local_storage_data *old_sdata, @@ -457,14 +473,17 @@ int bpf_local_storage_alloc(void *owner, RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); - raw_spin_lock_init(&storage->lock); + raw_res_spin_lock_init(&storage->lock); storage->owner = owner; storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); b = select_bucket(smap, storage); - raw_spin_lock_irqsave(&b->lock, flags); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (err) + goto uncharge; + bpf_selem_link_map_nolock(b, first_selem); owner_storage_ptr = @@ -482,11 +501,11 @@ int bpf_local_storage_alloc(void *owner, prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); if (unlikely(prev_storage)) { bpf_selem_unlink_map_nolock(first_selem); - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); err = -EAGAIN; goto uncharge; } - raw_spin_unlock_irqrestore(&b->lock, flags); + raw_res_spin_unlock_irqrestore(&b->lock, flags); return 0; @@ -569,7 +588,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (!alloc_selem) return ERR_PTR(-ENOMEM); - raw_spin_lock_irqsave(&local_storage->lock, flags); + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (err) + goto free_selem; /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -596,7 +617,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, b = select_bucket(smap, local_storage); - raw_spin_lock_irqsave(&b->lock, b_flags); + err = raw_res_spin_lock_irqsave(&b->lock, b_flags); + if (err) + goto unlock; alloc_selem = NULL; /* First, link the new selem to the map */ @@ -612,9 +635,10 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, &old_selem_free_list); } - raw_spin_unlock_irqrestore(&b->lock, b_flags); + raw_res_spin_unlock_irqrestore(&b->lock, b_flags); unlock: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); +free_selem: bpf_selem_free_list(&old_selem_free_list, false); if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); @@ -699,7 +723,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_irqsave(&local_storage->lock, flags); + raw_res_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. @@ -714,7 +738,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) free_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, &free_selem_list); } - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_selem_free_list(&free_selem_list, true); @@ -761,7 +785,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, for (i = 0; i < nbuckets; i++) { INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); + raw_res_spin_lock_init(&smap->buckets[i].lock); } smap->elem_size = offsetof(struct bpf_local_storage_elem, From 4a98c2efa613f0b01bc3aa0acb8c3ff7ae29b6f9 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:04 -0800 Subject: [PATCH 257/268] bpf: Remove task local storage percpu counter The percpu counter in task local storage is no longer needed as the underlying bpf_local_storage can now handle deadlock with the help of rqspinlock. Remove the percpu counter and related migrate_{disable, enable}. Since the percpu counter is removed, merge back bpf_task_storage_get() and bpf_task_storage_get_recur(). This will allow the bpf syscalls and helpers to run concurrently on the same CPU, removing the spurious -EBUSY error. bpf_task_storage_get(..., F_CREATE) will now always succeed with enough free memory unless being called recursively. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-7-ameryhung@gmail.com --- kernel/bpf/bpf_task_storage.c | 152 +++++----------------------------- kernel/bpf/helpers.c | 4 - 2 files changed, 19 insertions(+), 137 deletions(-) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index ab902364ac23..dd858226ada2 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -20,29 +20,6 @@ DEFINE_BPF_STORAGE_CACHE(task_cache); -static DEFINE_PER_CPU(int, bpf_task_storage_busy); - -static void bpf_task_storage_lock(void) -{ - cant_migrate(); - this_cpu_inc(bpf_task_storage_busy); -} - -static void bpf_task_storage_unlock(void) -{ - this_cpu_dec(bpf_task_storage_busy); -} - -static bool bpf_task_storage_trylock(void) -{ - cant_migrate(); - if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { - this_cpu_dec(bpf_task_storage_busy); - return false; - } - return true; -} - static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; @@ -70,17 +47,15 @@ void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; - rcu_read_lock_dont_migrate(); + rcu_read_lock(); local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) goto out; - bpf_task_storage_lock(); bpf_local_storage_destroy(local_storage); - bpf_task_storage_unlock(); out: - rcu_read_unlock_migrate(); + rcu_read_unlock(); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) @@ -106,9 +81,7 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) goto out; } - bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); - bpf_task_storage_unlock(); put_pid(pid); return sdata ? sdata->data : NULL; out: @@ -143,11 +116,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, goto out; } - bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, true, GFP_ATOMIC); - bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: @@ -155,8 +126,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, return err; } -static int task_storage_delete(struct task_struct *task, struct bpf_map *map, - bool nobusy) +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) { struct bpf_local_storage_data *sdata; @@ -164,9 +134,6 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!sdata) return -ENOENT; - if (!nobusy) - return -EBUSY; - return bpf_selem_unlink(SELEM(sdata), false); } @@ -192,111 +159,50 @@ static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) goto out; } - bpf_task_storage_lock(); - err = task_storage_delete(task, map, true); - bpf_task_storage_unlock(); + err = task_storage_delete(task, map); out: put_pid(pid); return err; } -/* Called by bpf_task_storage_get*() helpers */ -static void *__bpf_task_storage_get(struct bpf_map *map, - struct task_struct *task, void *value, - u64 flags, gfp_t gfp_flags, bool nobusy) -{ - struct bpf_local_storage_data *sdata; - - sdata = task_storage_lookup(task, map, nobusy); - if (sdata) - return sdata->data; - - /* only allocate new storage, when the task is refcounted */ - if (refcount_read(&task->usage) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { - sdata = bpf_local_storage_update( - task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, false, gfp_flags); - return IS_ERR(sdata) ? NULL : sdata->data; - } - - return NULL; -} - -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) -{ - bool nobusy; - void *data; - - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) - return (unsigned long)NULL; - - nobusy = bpf_task_storage_trylock(); - data = __bpf_task_storage_get(map, task, value, flags, - gfp_flags, nobusy); - if (nobusy) - bpf_task_storage_unlock(); - return (unsigned long)data; -} - /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, task, void *, value, u64, flags, gfp_t, gfp_flags) { - void *data; + struct bpf_local_storage_data *sdata; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) return (unsigned long)NULL; - bpf_task_storage_lock(); - data = __bpf_task_storage_get(map, task, value, flags, - gfp_flags, true); - bpf_task_storage_unlock(); - return (unsigned long)data; -} + sdata = task_storage_lookup(task, map, true); + if (sdata) + return (unsigned long)sdata->data; -BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *, - task) -{ - bool nobusy; - int ret; + /* only allocate new storage, when the task is refcounted */ + if (refcount_read(&task->usage) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST, false, gfp_flags); + return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; + } - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (!task) - return -EINVAL; - - nobusy = bpf_task_storage_trylock(); - /* This helper must only be called from places where the lifetime of the task - * is guaranteed. Either by being refcounted or by being protected - * by an RCU read-side critical section. - */ - ret = task_storage_delete(task, map, nobusy); - if (nobusy) - bpf_task_storage_unlock(); - return ret; + return (unsigned long)NULL; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { - int ret; - WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; - bpf_task_storage_lock(); /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. */ - ret = task_storage_delete(task, map, true); - bpf_task_storage_unlock(); - return ret; + return task_storage_delete(task, map); } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -311,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static void task_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); + bpf_local_storage_map_free(map, &task_cache, NULL); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) @@ -330,17 +236,6 @@ const struct bpf_map_ops task_storage_map_ops = { .map_owner_storage_ptr = task_storage_ptr, }; -const struct bpf_func_proto bpf_task_storage_get_recur_proto = { - .func = bpf_task_storage_get_recur, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, - .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], - .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, - .arg4_type = ARG_ANYTHING, -}; - const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, @@ -352,15 +247,6 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; -const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { - .func = bpf_task_storage_delete_recur, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, - .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], -}; - const struct bpf_func_proto bpf_task_storage_delete_proto = { .func = bpf_task_storage_delete, .gpl_only = false, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0458597134da..7ac32798eb04 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2167,12 +2167,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif case BPF_FUNC_task_storage_get: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_get_recur_proto; return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: - if (bpf_prog_check_recur(prog)) - return &bpf_task_storage_delete_recur_proto; return &bpf_task_storage_delete_proto; default: break; From 5254de7b9607dd341795cd693185f719a9e7298a Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:05 -0800 Subject: [PATCH 258/268] bpf: Remove cgroup local storage percpu counter The percpu counter in cgroup local storage is no longer needed as the underlying bpf_local_storage can now handle deadlock with the help of rqspinlock. Remove the percpu counter and related migrate_{disable, enable}. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-8-ameryhung@gmail.com --- kernel/bpf/bpf_cgrp_storage.c | 59 +++++------------------------------ 1 file changed, 8 insertions(+), 51 deletions(-) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 8fef24fcac68..4d84611d8222 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -11,29 +11,6 @@ DEFINE_BPF_STORAGE_CACHE(cgroup_cache); -static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); - -static void bpf_cgrp_storage_lock(void) -{ - cant_migrate(); - this_cpu_inc(bpf_cgrp_storage_busy); -} - -static void bpf_cgrp_storage_unlock(void) -{ - this_cpu_dec(bpf_cgrp_storage_busy); -} - -static bool bpf_cgrp_storage_trylock(void) -{ - cant_migrate(); - if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { - this_cpu_dec(bpf_cgrp_storage_busy); - return false; - } - return true; -} - static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) { struct cgroup *cg = owner; @@ -45,16 +22,14 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; - rcu_read_lock_dont_migrate(); + rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; - bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); - bpf_cgrp_storage_unlock(); out: - rcu_read_unlock_migrate(); + rcu_read_unlock(); } static struct bpf_local_storage_data * @@ -83,9 +58,7 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) if (IS_ERR(cgroup)) return ERR_CAST(cgroup); - bpf_cgrp_storage_lock(); sdata = cgroup_storage_lookup(cgroup, map, true); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return sdata ? sdata->data : NULL; } @@ -102,10 +75,8 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, if (IS_ERR(cgroup)) return PTR_ERR(cgroup); - bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, false, GFP_ATOMIC); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } @@ -131,9 +102,7 @@ static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) if (IS_ERR(cgroup)) return PTR_ERR(cgroup); - bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); - bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } @@ -150,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); + bpf_local_storage_map_free(map, &cgroup_cache, NULL); } /* *gfp_flags* is a hidden argument provided by the verifier */ @@ -158,7 +127,6 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; - bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -167,38 +135,27 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!cgroup) return (unsigned long)NULL; - nobusy = bpf_cgrp_storage_trylock(); - - sdata = cgroup_storage_lookup(cgroup, map, nobusy); + sdata = cgroup_storage_lookup(cgroup, map, true); if (sdata) - goto unlock; + goto out; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); -unlock: - if (nobusy) - bpf_cgrp_storage_unlock(); +out: return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { - int ret; - WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; - if (!bpf_cgrp_storage_trylock()) - return -EBUSY; - - ret = cgroup_storage_delete(cgroup, map); - bpf_cgrp_storage_unlock(); - return ret; + return cgroup_storage_delete(cgroup, map); } const struct bpf_map_ops cgrp_storage_map_ops = { From 3417dffb58331d1e7e4f3e30ec95cc0f8114ff10 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:06 -0800 Subject: [PATCH 259/268] bpf: Remove unused percpu counter from bpf_local_storage_map_free Percpu locks have been removed from cgroup and task local storage. Now that all local storage no longer use percpu variables as locks preventing recursion, there is no need to pass them to bpf_local_storage_map_free(). Remove the argument from the function. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-9-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 3 +-- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 7 +------ kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fa50b7afee18..fba3354988d3 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -166,8 +166,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter); + struct bpf_local_storage_cache *cache); int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 4d84611d8222..853183eead2c 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -119,7 +119,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, NULL); + bpf_local_storage_map_free(map, &cgroup_cache); } /* *gfp_flags* is a hidden argument provided by the verifier */ diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index cedc99184dad..470f4b02c79e 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -184,7 +184,7 @@ static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) static void inode_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &inode_cache, NULL); + bpf_local_storage_map_free(map, &inode_cache); } const struct bpf_map_ops inode_storage_map_ops = { diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 1138e2293b50..76e812a40380 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -807,8 +807,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, } void bpf_local_storage_map_free(struct bpf_map *map, - struct bpf_local_storage_cache *cache, - int __percpu *busy_counter) + struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map_bucket *b; struct bpf_local_storage_elem *selem; @@ -841,11 +840,7 @@ void bpf_local_storage_map_free(struct bpf_map *map, while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { - if (busy_counter) - this_cpu_inc(*busy_counter); bpf_selem_unlink(selem, true); - if (busy_counter) - this_cpu_dec(*busy_counter); cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index dd858226ada2..4d53aebe6784 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -217,7 +217,7 @@ static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static void task_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &task_cache, NULL); + bpf_local_storage_map_free(map, &task_cache); } BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d7b5c4551997..d2164165a994 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -60,7 +60,7 @@ void bpf_sk_storage_free(struct sock *sk) static void bpf_sk_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &sk_cache, NULL); + bpf_local_storage_map_free(map, &sk_cache); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) From c8be3da14718f1e732afcb61e8ee5b78e8d93808 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:07 -0800 Subject: [PATCH 260/268] bpf: Prepare for bpf_selem_unlink_nofail() The next patch will introduce bpf_selem_unlink_nofail() to handle rqspinlock errors. bpf_selem_unlink_nofail() will allow an selem to be partially unlinked from map or local storage. Save memory allocation method in selem so that later an selem can be correctly freed even when SDATA(selem)->smap is init to NULL. In addition, keep track of memory charge to the owner in local storage so that later bpf_selem_unlink_nofail() can return the correct memory charge to the owner. Updating local_storage->mem_charge is protected by local_storage->lock. Finally, extract miscellaneous tasks performed when unlinking an selem from local_storage into bpf_selem_unlink_storage_nolock_misc(). It will be reused by bpf_selem_unlink_nofail(). This patch also takes the chance to remove local_storage->smap, which is no longer used since commit f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage"). Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-10-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 5 ++- kernel/bpf/bpf_local_storage.c | 69 +++++++++++++++---------------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index fba3354988d3..a34ed7fa81d8 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -80,7 +80,8 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; - /* 8 bytes hole */ + bool use_kmalloc_nolock; + /* 7 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -89,13 +90,13 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; - struct bpf_local_storage_map __rcu *smap; struct hlist_head list; /* List of bpf_local_storage_elem */ void *owner; /* The object that owns the above "list" of * bpf_local_storage_elem. */ struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ + u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 76e812a40380..0e9ae41a9759 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); + selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { /* No need to call check_and_init_map_value as memory is zero init */ @@ -214,7 +215,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - if (!smap->use_kmalloc_nolock) { + if (!selem->use_kmalloc_nolock) { /* * No uptr will be unpin even when reuse_now == false since uptr * is only supported in task local storage, where @@ -251,6 +252,30 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) bpf_selem_free(selem, reuse_now); } +static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + struct bpf_local_storage *local_storage, + bool free_local_storage) +{ + void *owner = local_storage->owner; + u32 uncharge = smap->elem_size; + + if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + + uncharge += free_local_storage ? sizeof(*local_storage) : 0; + mem_uncharge(smap, local_storage->owner, uncharge); + local_storage->mem_charge -= uncharge; + + if (free_local_storage) { + local_storage->owner = NULL; + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + } +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. @@ -261,56 +286,30 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor { struct bpf_local_storage_map *smap; bool free_local_storage; - void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - owner = local_storage->owner; - - /* All uncharging on the owner must be done first. - * The owner may be freed once the last selem is unlinked - * from local_storage. - */ - mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); - if (free_local_storage) { - mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); - local_storage->owner = NULL; - /* After this RCU_INIT, owner may be freed and cannot be used */ - RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, + free_local_storage); - /* local_storage is not freed now. local_storage->lock is - * still held and raw_spin_unlock_bh(&local_storage->lock) - * will be done by the caller. - * - * Although the unlock will be done under - * rcu_read_lock(), it is more intuitive to - * read if the freeing of the storage is done - * after the raw_spin_unlock_bh(&local_storage->lock). - * - * Hence, a "bool free_local_storage" is returned - * to the caller which then calls then frees the storage after - * all the RCU grace periods have expired. - */ - } hlist_del_init_rcu(&selem->snode); - if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == - SDATA(selem)) - RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); hlist_add_head(&selem->free_node, free_selem_list); - if (rcu_access_pointer(local_storage->smap) == smap) - RCU_INIT_POINTER(local_storage->smap, NULL); - return free_local_storage; } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem) { + struct bpf_local_storage_map *smap; + + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + local_storage->mem_charge += smap->elem_size; + RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head_rcu(&selem->snode, &local_storage->list); } @@ -471,10 +470,10 @@ int bpf_local_storage_alloc(void *owner, goto uncharge; } - RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); raw_res_spin_lock_init(&storage->lock); storage->owner = owner; + storage->mem_charge = sizeof(*storage); storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; bpf_selem_link_storage_nolock(storage, first_selem); From 5d800f87d0a5ea1b156c47a4b9fd128479335153 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:08 -0800 Subject: [PATCH 261/268] bpf: Support lockless unlink when freeing map or local storage Introduce bpf_selem_unlink_nofail() to properly handle errors returned from rqspinlock in bpf_local_storage_map_free() and bpf_local_storage_destroy() where the operation must succeeds. The idea of bpf_selem_unlink_nofail() is to allow an selem to be partially linked and use atomic operation on a bit field, selem->state, to determine when and who can free the selem if any unlink under lock fails. An selem initially is fully linked to a map and a local storage. Under normal circumstances, bpf_selem_unlink_nofail() will be able to grab locks and unlink a selem from map and local storage in sequeunce, just like bpf_selem_unlink(), and then free it after an RCU grace period. However, if any of the lock attempts fails, it will only clear SDATA(selem)->smap or selem->local_storage depending on the caller and set SELEM_MAP_UNLINKED or SELEM_STORAGE_UNLINKED according to the caller. Then, after both map_free() and destroy() see the selem and the state becomes SELEM_UNLINKED, one of two racing caller can succeed in cmpxchg the state from SELEM_UNLINKED to SELEM_TOFREE, ensuring no double free or memory leak. To make sure bpf_obj_free_fields() is done only once and when map is still present, it is called when unlinking an selem from b->list under b->lock. To make sure uncharging memory is done only when the owner is still present in map_free(), block destroy() from returning until there is no pending map_free(). Since smap may not be valid in destroy(), bpf_selem_unlink_nofail() skips bpf_selem_unlink_storage_nolock_misc() when called from destroy(). This is okay as bpf_local_storage_destroy() will return the remaining amount of memory charge tracked by mem_charge to the owner to uncharge. It is also safe to skip clearing local_storage->owner and owner_storage as the owner is being freed and no users or bpf programs should be able to reference the owner and using local_storage. Finally, access of selem, SDATA(selem)->smap and selem->local_storage are racy. Callers will protect these fields with RCU. Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-11-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 9 ++- kernel/bpf/bpf_local_storage.c | 116 ++++++++++++++++++++++++++++-- 2 files changed, 118 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a34ed7fa81d8..69a5d8aa765d 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -68,6 +68,11 @@ struct bpf_local_storage_data { u8 data[] __aligned(8); }; +#define SELEM_MAP_UNLINKED (1 << 0) +#define SELEM_STORAGE_UNLINKED (1 << 1) +#define SELEM_UNLINKED (SELEM_MAP_UNLINKED | SELEM_STORAGE_UNLINKED) +#define SELEM_TOFREE (1 << 2) + /* Linked to bpf_local_storage and bpf_local_storage_map */ struct bpf_local_storage_elem { struct hlist_node map_node; /* Linked to bpf_local_storage_map */ @@ -80,8 +85,9 @@ struct bpf_local_storage_elem { * after raw_spin_unlock */ }; + atomic_t state; bool use_kmalloc_nolock; - /* 7 bytes hole */ + /* 3 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. */ @@ -97,6 +103,7 @@ struct bpf_local_storage { struct rcu_head rcu; rqspinlock_t lock; /* Protect adding/removing from the "list" */ u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */ + refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */ bool use_kmalloc_nolock; }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 0e9ae41a9759..294605c78271 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); + atomic_set(&selem->state, 0); selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { @@ -194,9 +195,11 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - migrate_disable(); - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); + if (smap) { + migrate_disable(); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + migrate_enable(); + } kfree_nolock(selem); } @@ -221,7 +224,8 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, * is only supported in task local storage, where * smap->use_kmalloc_nolock == true. */ - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + if (smap) + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } @@ -255,7 +259,7 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now) static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, - bool free_local_storage) + bool free_local_storage, bool pin_owner) { void *owner = local_storage->owner; u32 uncharge = smap->elem_size; @@ -264,6 +268,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem * SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt)) + return; + uncharge += free_local_storage ? sizeof(*local_storage) : 0; mem_uncharge(smap, local_storage->owner, uncharge); local_storage->mem_charge -= uncharge; @@ -274,6 +281,9 @@ static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem * /* After this RCU_INIT, owner may be freed and cannot be used */ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); } + + if (pin_owner) + refcount_dec(&local_storage->owner_refcnt); } /* local_storage->lock must be held and selem->local_storage == local_storage. @@ -293,7 +303,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor &local_storage->list); bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage, - free_local_storage); + free_local_storage, false); hlist_del_init_rcu(&selem->snode); @@ -409,6 +419,94 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) return err; } +/* + * Unlink an selem from map and local storage with lockless fallback if callers + * are racing or rqspinlock returns error. It should only be called by + * bpf_local_storage_destroy() or bpf_local_storage_map_free(). + */ +static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map_bucket *b) +{ + bool in_map_free = !!b, free_storage = false; + struct bpf_local_storage *local_storage; + struct bpf_local_storage_map *smap; + unsigned long flags; + int err, unlink = 0; + + local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + + if (smap) { + b = b ? : select_bucket(smap, local_storage); + err = raw_res_spin_lock_irqsave(&b->lock, flags); + if (!err) { + /* + * Call bpf_obj_free_fields() under b->lock to make sure it is done + * exactly once for an selem. Safe to free special fields immediately + * as no BPF program should be referencing the selem. + */ + if (likely(selem_linked_to_map(selem))) { + hlist_del_init_rcu(&selem->map_node); + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + unlink++; + } + raw_res_spin_unlock_irqrestore(&b->lock, flags); + } + /* + * Highly unlikely scenario: resource leak + * + * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing + * and both selem belong to the same bucket, if destroy(selem2) acquired + * b->lock and block for too long, neither map_free(selem1) and + * destroy(selem1) will be able to free the special field associated + * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT. + */ + WARN_ON_ONCE(err && in_map_free); + if (!err || in_map_free) + RCU_INIT_POINTER(SDATA(selem)->smap, NULL); + } + + if (local_storage) { + err = raw_res_spin_lock_irqsave(&local_storage->lock, flags); + if (!err) { + if (likely(selem_linked_to_storage(selem))) { + free_storage = hlist_is_singular_node(&selem->snode, + &local_storage->list); + /* + * Okay to skip clearing owner_storage and storage->owner in + * destroy() since the owner is going away. No user or bpf + * programs should be able to reference it. + */ + if (smap && in_map_free) + bpf_selem_unlink_storage_nolock_misc( + selem, smap, local_storage, + free_storage, true); + hlist_del_init_rcu(&selem->snode); + unlink++; + } + raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); + } + if (!err || !in_map_free) + RCU_INIT_POINTER(selem->local_storage, NULL); + } + + if (unlink != 2) + atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state); + + /* + * Normally, an selem can be unlinked under local_storage->lock and b->lock, and + * then freed after an RCU grace period. However, if destroy() and map_free() are + * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free + * the selem only after both map_free() and destroy() see the selem. + */ + if (unlink == 2 || + atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED) + bpf_selem_free(selem, true); + + if (free_storage) + bpf_local_storage_free(local_storage, true); +} + void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) @@ -475,6 +573,7 @@ int bpf_local_storage_alloc(void *owner, storage->owner = owner; storage->mem_charge = sizeof(*storage); storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; + refcount_set(&storage->owner_refcnt, 1); bpf_selem_link_storage_nolock(storage, first_selem); @@ -743,6 +842,11 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) if (free_storage) bpf_local_storage_free(local_storage, true); + + if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { + while (refcount_read(&local_storage->owner_refcnt)) + cpu_relax(); + } } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) From 0be08389c7f2f9db0194ee666d2e4870886e6ffb Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:09 -0800 Subject: [PATCH 262/268] bpf: Switch to bpf_selem_unlink_nofail in bpf_local_storage_{map_free, destroy} Take care of rqspinlock error in bpf_local_storage_{map_free, destroy}() properly by switching to bpf_selem_unlink_nofail(). Both functions iterate their own RCU-protected list of selems and call bpf_selem_unlink_nofail(). In map_free(), to prevent infinite loop when both map_free() and destroy() fail to remove a selem from b->list (extremely unlikely), switch to hlist_for_each_entry_rcu(). In destroy(), also switch to hlist_for_each_entry_rcu() since we no longer iterate local_storage->list under local_storage->lock. bpf_selem_unlink() now becomes dedicated to helpers and syscalls paths so reuse_now should always be false. Remove it from the argument and hardcode it. Acked-by: Alexei Starovoitov Co-developed-by: Martin KaFai Lau Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-12-ameryhung@gmail.com --- include/linux/bpf_local_storage.h | 4 +- kernel/bpf/bpf_cgrp_storage.c | 2 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 63 ++++++++++++++----------------- kernel/bpf/bpf_task_storage.c | 2 +- net/core/bpf_sk_storage.c | 7 +++- 6 files changed, 39 insertions(+), 41 deletions(-) diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 69a5d8aa765d..85efa9772530 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -171,7 +171,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, return SDATA(selem); } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); @@ -184,7 +184,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); +int bpf_selem_unlink(struct bpf_local_storage_elem *selem); int bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage *local_storage, diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 853183eead2c..c2a2ead1f466 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -89,7 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 470f4b02c79e..e86734609f3d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -110,7 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 294605c78271..b28f07d3a0db 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -377,7 +377,11 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b, hlist_add_head_rcu(&selem->map_node, &b->list); } -int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) +/* + * Unlink an selem from map and local storage with lock held. + * This is the common path used by local storages to delete an selem. + */ +int bpf_selem_unlink(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; @@ -411,10 +415,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) out: raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); - bpf_selem_free_list(&selem_free_list, reuse_now); + bpf_selem_free_list(&selem_free_list, false); if (free_local_storage) - bpf_local_storage_free(local_storage, reuse_now); + bpf_local_storage_free(local_storage, false); return err; } @@ -804,13 +808,13 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } -void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) +/* + * Destroy local storage when the owner is going away. Caller must uncharge memory + * if memory charging is used. + */ +u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { struct bpf_local_storage_elem *selem; - bool free_storage = false; - HLIST_HEAD(free_selem_list); - struct hlist_node *n; - unsigned long flags; /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. @@ -821,32 +825,20 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_res_spin_lock_irqsave(&local_storage->lock, flags); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - /* If local_storage list has only one element, the - * bpf_selem_unlink_storage_nolock() will return true. - * Otherwise, it will return false. The current loop iteration - * intends to remove all local storage. So the last iteration - * of the loop will set the free_cgroup_storage to true. - */ - free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, &free_selem_list); - } - raw_res_spin_unlock_irqrestore(&local_storage->lock, flags); - - bpf_selem_free_list(&free_selem_list, true); - - if (free_storage) - bpf_local_storage_free(local_storage, true); + hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + bpf_selem_unlink_nofail(selem, NULL); if (!refcount_dec_and_test(&local_storage->owner_refcnt)) { while (refcount_read(&local_storage->owner_refcnt)) cpu_relax(); + /* + * Paired with refcount_dec() in bpf_selem_unlink_nofail() + * to make sure destroy() sees the correct local_storage->mem_charge. + */ + smp_mb(); } + + return local_storage->mem_charge; } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -940,11 +932,14 @@ void bpf_local_storage_map_free(struct bpf_map *map, rcu_read_lock(); /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_local_storage_elem, map_node))) { - bpf_selem_unlink(selem, true); - cond_resched_rcu(); +restart: + hlist_for_each_entry_rcu(selem, &b->list, map_node) { + bpf_selem_unlink_nofail(selem, b); + + if (need_resched()) { + cond_resched_rcu(); + goto restart; + } } rcu_read_unlock(); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 4d53aebe6784..605506792b5b 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -134,7 +134,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d2164165a994..1eb3e060994e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -40,20 +40,23 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) if (!sdata) return -ENOENT; - return bpf_selem_unlink(SELEM(sdata), false); + return bpf_selem_unlink(SELEM(sdata)); } /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; + u32 uncharge; rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) goto out; - bpf_local_storage_destroy(sk_storage); + uncharge = bpf_local_storage_destroy(sk_storage); + if (uncharge) + atomic_sub(uncharge, &sk->sk_omem_alloc); out: rcu_read_unlock_migrate(); } From d652f425d5e332125d358a92158a840084061107 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:10 -0800 Subject: [PATCH 263/268] selftests/bpf: Update sk_storage_omem_uncharge test Check sk_omem_alloc when the caller of bpf_local_storage_destroy() returns. bpf_local_storage_destroy() now returns the memory to uncharge to the caller instead of directly uncharge. Therefore, in the sk_storage_omem_uncharge, check sk_omem_alloc when bpf_sk_storage_free() returns instead of bpf_local_storage_destroy(). Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-13-ameryhung@gmail.com --- .../selftests/bpf/progs/sk_storage_omem_uncharge.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c index 46d6eb2a3b17..c8f4815c8dfb 100644 --- a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c +++ b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c @@ -6,7 +6,6 @@ #include #include -void *local_storage_ptr = NULL; void *sk_ptr = NULL; int cookie_found = 0; __u64 cookie = 0; @@ -19,21 +18,17 @@ struct { __type(value, int); } sk_storage SEC(".maps"); -SEC("fexit/bpf_local_storage_destroy") -int BPF_PROG(bpf_local_storage_destroy, struct bpf_local_storage *local_storage) +SEC("fexit/bpf_sk_storage_free") +int BPF_PROG(bpf_sk_storage_free, struct sock *sk) { - struct sock *sk; - - if (local_storage_ptr != local_storage) + if (sk_ptr != sk) return 0; - sk = bpf_core_cast(sk_ptr, struct sock); if (sk->sk_cookie.counter != cookie) return 0; cookie_found++; omem = sk->sk_omem_alloc.counter; - local_storage_ptr = NULL; return 0; } @@ -50,7 +45,6 @@ int BPF_PROG(inet6_sock_destruct, struct sock *sk) if (value && *value == 0xdeadbeef) { cookie_found++; sk_ptr = sk; - local_storage_ptr = sk->sk_bpf_storage; } return 0; From e4772031d1053e7640e3094834916ee2605f288f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:11 -0800 Subject: [PATCH 264/268] selftests/bpf: Update task_local_storage/recursion test Update the expected result of the selftest as recursion of task local storage syscall and helpers have been relaxed. Now that the percpu counter is removed, task local storage helpers, bpf_task_storage_get() and bpf_task_storage_delete() can now run on the same CPU at the same time unless they cause deadlock. Note that since there is no percpu counter preventing recursion in task local storage helpers, bpf_trampoline now catches the recursion of on_update as reported by recursion_misses. on_enter: tp_btf/sys_enter on_update: fentry/bpf_local_storage_update Old behavior New behavior ____________ ____________ on_enter on_enter bpf_task_storage_get(&map_a) bpf_task_storage_get(&map_a) bpf_task_storage_trylock succeed bpf_local_storage_update(&map_a) bpf_local_storage_update(&map_a) on_update on_update bpf_task_storage_get(&map_a) bpf_task_storage_get(&map_a) bpf_task_storage_trylock fail on_update::misses++ (1) return NULL create and return map_a::ptr map_a::ptr += 1 (1) bpf_task_storage_delete(&map_a) return 0 bpf_task_storage_get(&map_b) bpf_task_storage_get(&map_b) bpf_task_storage_trylock fail on_update::misses++ (2) return NULL create and return map_b::ptr map_b::ptr += 1 (1) create and return map_a::ptr create and return map_a::ptr map_a::ptr = 200 map_a::ptr = 200 bpf_task_storage_get(&map_b) bpf_task_storage_get(&map_b) bpf_task_storage_trylock succeed lockless lookup succeed bpf_local_storage_update(&map_b) return map_b::ptr on_update bpf_task_storage_get(&map_a) bpf_task_storage_trylock fail lockless lookup succeed return map_a::ptr map_a::ptr += 1 (201) bpf_task_storage_delete(&map_a) bpf_task_storage_trylock fail return -EBUSY nr_del_errs++ (1) bpf_task_storage_get(&map_b) bpf_task_storage_trylock fail return NULL create and return ptr map_b::ptr = 100 Expected result: map_a::ptr = 201 map_a::ptr = 200 map_b::ptr = 100 map_b::ptr = 1 nr_del_err = 1 nr_del_err = 0 on_update::recursion_misses = 0 on_update::recursion_misses = 2 On_enter::recursion_misses = 0 on_enter::recursion_misses = 0 Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-14-ameryhung@gmail.com --- .../selftests/bpf/prog_tests/task_local_storage.c | 10 +++++----- .../selftests/bpf/progs/task_ls_recursion.c | 14 ++------------ 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c index 42e822ea352f..7bee33797c71 100644 --- a/tools/testing/selftests/bpf/prog_tests/task_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/task_local_storage.c @@ -112,24 +112,24 @@ static void test_recursion(void) task_ls_recursion__detach(skel); /* Refer to the comment in BPF_PROG(on_update) for - * the explanation on the value 201 and 100. + * the explanation on the value 200 and 1. */ map_fd = bpf_map__fd(skel->maps.map_a); err = bpf_map_lookup_elem(map_fd, &task_fd, &value); ASSERT_OK(err, "lookup map_a"); - ASSERT_EQ(value, 201, "map_a value"); - ASSERT_EQ(skel->bss->nr_del_errs, 1, "bpf_task_storage_delete busy"); + ASSERT_EQ(value, 200, "map_a value"); + ASSERT_EQ(skel->bss->nr_del_errs, 0, "bpf_task_storage_delete busy"); map_fd = bpf_map__fd(skel->maps.map_b); err = bpf_map_lookup_elem(map_fd, &task_fd, &value); ASSERT_OK(err, "lookup map_b"); - ASSERT_EQ(value, 100, "map_b value"); + ASSERT_EQ(value, 1, "map_b value"); prog_fd = bpf_program__fd(skel->progs.on_update); memset(&info, 0, sizeof(info)); err = bpf_prog_get_info_by_fd(prog_fd, &info, &info_len); ASSERT_OK(err, "get prog info"); - ASSERT_EQ(info.recursion_misses, 0, "on_update prog recursion"); + ASSERT_EQ(info.recursion_misses, 2, "on_update prog recursion"); prog_fd = bpf_program__fd(skel->progs.on_enter); memset(&info, 0, sizeof(info)); diff --git a/tools/testing/selftests/bpf/progs/task_ls_recursion.c b/tools/testing/selftests/bpf/progs/task_ls_recursion.c index f1853c38aada..b37359432692 100644 --- a/tools/testing/selftests/bpf/progs/task_ls_recursion.c +++ b/tools/testing/selftests/bpf/progs/task_ls_recursion.c @@ -36,14 +36,9 @@ int BPF_PROG(on_update) if (!test_pid || task->pid != test_pid) return 0; + /* This will succeed as there is no real deadlock */ ptr = bpf_task_storage_get(&map_a, task, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); - /* ptr will not be NULL when it is called from - * the bpf_task_storage_get(&map_b,...F_CREATE) in - * the BPF_PROG(on_enter) below. It is because - * the value can be found in map_a and the kernel - * does not need to acquire any spin_lock. - */ if (ptr) { int err; @@ -53,12 +48,7 @@ int BPF_PROG(on_update) nr_del_errs++; } - /* This will still fail because map_b is empty and - * this BPF_PROG(on_update) has failed to acquire - * the percpu busy lock => meaning potential - * deadlock is detected and it will fail to create - * new storage. - */ + /* This will succeed as there is no real deadlock */ ptr = bpf_task_storage_get(&map_b, task, 0, BPF_LOCAL_STORAGE_GET_F_CREATE); if (ptr) From 902a79b6389ff39fd736c6fd1581ded1372adbf5 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:12 -0800 Subject: [PATCH 265/268] selftests/bpf: Update task_local_storage/task_storage_nodeadlock test Adjust the error code we are checking against as bpf_task_storage_delete() now returns -EDEADLK or -ETIMEDOUT when deadlock happens. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-15-ameryhung@gmail.com --- .../testing/selftests/bpf/progs/task_storage_nodeadlock.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c index 986829aaf73a..6ce98fe9f387 100644 --- a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c +++ b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c @@ -1,15 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include "vmlinux.h" +#include #include #include char _license[] SEC("license") = "GPL"; -#ifndef EBUSY -#define EBUSY 16 -#endif - extern bool CONFIG_PREEMPTION __kconfig __weak; int nr_get_errs = 0; int nr_del_errs = 0; @@ -40,7 +37,7 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, ret = bpf_task_storage_delete(&task_storage, bpf_get_current_task_btf()); - if (ret == -EBUSY) + if (ret == -EDEADLK || ret == -ETIMEDOUT) __sync_fetch_and_add(&nr_del_errs, 1); return 0; From e02cf06b85f8ae337c86db1bad5a0fd54c7bd301 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:13 -0800 Subject: [PATCH 266/268] selftests/bpf: Remove test_task_storage_map_stress_lookup Remove a test in test_maps that checks if the updating of the percpu counter in task local storage map is preemption safe as the percpu counter is now removed. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-16-ameryhung@gmail.com --- .../bpf/map_tests/task_storage_map.c | 128 ------------------ .../bpf/progs/read_bpf_task_storage_busy.c | 38 ------ 2 files changed, 166 deletions(-) delete mode 100644 tools/testing/selftests/bpf/map_tests/task_storage_map.c delete mode 100644 tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c deleted file mode 100644 index a4121d2248ac..000000000000 --- a/tools/testing/selftests/bpf/map_tests/task_storage_map.c +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2022. Huawei Technologies Co., Ltd */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "bpf_util.h" -#include "test_maps.h" -#include "task_local_storage_helpers.h" -#include "read_bpf_task_storage_busy.skel.h" - -struct lookup_ctx { - bool start; - bool stop; - int pid_fd; - int map_fd; - int loop; -}; - -static void *lookup_fn(void *arg) -{ - struct lookup_ctx *ctx = arg; - long value; - int i = 0; - - while (!ctx->start) - usleep(1); - - while (!ctx->stop && i++ < ctx->loop) - bpf_map_lookup_elem(ctx->map_fd, &ctx->pid_fd, &value); - return NULL; -} - -static void abort_lookup(struct lookup_ctx *ctx, pthread_t *tids, unsigned int nr) -{ - unsigned int i; - - ctx->stop = true; - ctx->start = true; - for (i = 0; i < nr; i++) - pthread_join(tids[i], NULL); -} - -void test_task_storage_map_stress_lookup(void) -{ -#define MAX_NR_THREAD 4096 - unsigned int i, nr = 256, loop = 8192, cpu = 0; - struct read_bpf_task_storage_busy *skel; - pthread_t tids[MAX_NR_THREAD]; - struct lookup_ctx ctx; - cpu_set_t old, new; - const char *cfg; - int err; - - cfg = getenv("TASK_STORAGE_MAP_NR_THREAD"); - if (cfg) { - nr = atoi(cfg); - if (nr > MAX_NR_THREAD) - nr = MAX_NR_THREAD; - } - cfg = getenv("TASK_STORAGE_MAP_NR_LOOP"); - if (cfg) - loop = atoi(cfg); - cfg = getenv("TASK_STORAGE_MAP_PIN_CPU"); - if (cfg) - cpu = atoi(cfg); - - skel = read_bpf_task_storage_busy__open_and_load(); - err = libbpf_get_error(skel); - CHECK(err, "open_and_load", "error %d\n", err); - - /* Only for a fully preemptible kernel */ - if (!skel->kconfig->CONFIG_PREEMPTION) { - printf("%s SKIP (no CONFIG_PREEMPTION)\n", __func__); - read_bpf_task_storage_busy__destroy(skel); - skips++; - return; - } - - /* Save the old affinity setting */ - sched_getaffinity(getpid(), sizeof(old), &old); - - /* Pinned on a specific CPU */ - CPU_ZERO(&new); - CPU_SET(cpu, &new); - sched_setaffinity(getpid(), sizeof(new), &new); - - ctx.start = false; - ctx.stop = false; - ctx.pid_fd = sys_pidfd_open(getpid(), 0); - ctx.map_fd = bpf_map__fd(skel->maps.task); - ctx.loop = loop; - for (i = 0; i < nr; i++) { - err = pthread_create(&tids[i], NULL, lookup_fn, &ctx); - if (err) { - abort_lookup(&ctx, tids, i); - CHECK(err, "pthread_create", "error %d\n", err); - goto out; - } - } - - ctx.start = true; - for (i = 0; i < nr; i++) - pthread_join(tids[i], NULL); - - skel->bss->pid = getpid(); - err = read_bpf_task_storage_busy__attach(skel); - CHECK(err, "attach", "error %d\n", err); - - /* Trigger program */ - sys_gettid(); - skel->bss->pid = 0; - - CHECK(skel->bss->busy != 0, "bad bpf_task_storage_busy", "got %d\n", skel->bss->busy); -out: - read_bpf_task_storage_busy__destroy(skel); - /* Restore affinity setting */ - sched_setaffinity(getpid(), sizeof(old), &old); - printf("%s:PASS\n", __func__); -} diff --git a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c b/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c deleted file mode 100644 index 69da05bb6c63..000000000000 --- a/tools/testing/selftests/bpf/progs/read_bpf_task_storage_busy.c +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2022. Huawei Technologies Co., Ltd */ -#include "vmlinux.h" -#include -#include - -extern bool CONFIG_PREEMPTION __kconfig __weak; -extern const int bpf_task_storage_busy __ksym; - -char _license[] SEC("license") = "GPL"; - -int pid = 0; -int busy = 0; - -struct { - __uint(type, BPF_MAP_TYPE_TASK_STORAGE); - __uint(map_flags, BPF_F_NO_PREALLOC); - __type(key, int); - __type(value, long); -} task SEC(".maps"); - -SEC("raw_tp/sys_enter") -int BPF_PROG(read_bpf_task_storage_busy) -{ - int *value; - - if (!CONFIG_PREEMPTION) - return 0; - - if (bpf_get_current_pid_tgid() >> 32 != pid) - return 0; - - value = bpf_this_cpu_ptr(&bpf_task_storage_busy); - if (value) - busy = *value; - - return 0; -} From cdce7b0848f6f2be4c6d7dbf243244981d315f6f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:14 -0800 Subject: [PATCH 267/268] selftests/bpf: Choose another percpu variable in bpf for btf_dump test bpf_cgrp_storage_busy has been removed. Use bpf_bprintf_nest_level instead. This percpu variable is also in the bpf subsystem so that if it is removed in the future, BPF-CI will catch this type of CI- breaking change. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-17-ameryhung@gmail.com --- tools/testing/selftests/bpf/prog_tests/btf_dump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c index 10cba526d3e6..f1642794f70e 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c +++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c @@ -875,8 +875,8 @@ static void test_btf_dump_var_data(struct btf *btf, struct btf_dump *d, TEST_BTF_DUMP_VAR(btf, d, NULL, str, "cpu_number", int, BTF_F_COMPACT, "int cpu_number = (int)100", 100); #endif - TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_cgrp_storage_busy", int, BTF_F_COMPACT, - "static int bpf_cgrp_storage_busy = (int)2", 2); + TEST_BTF_DUMP_VAR(btf, d, NULL, str, "bpf_bprintf_nest_level", int, BTF_F_COMPACT, + "static int bpf_bprintf_nest_level = (int)2", 2); } struct btf_dump_string_ctx { From 97b859b5ed04dbbe99be19895d8498009a19553f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 5 Feb 2026 14:29:15 -0800 Subject: [PATCH 268/268] selftests/bpf: Fix outdated test on storage->smap bpf_local_storage_free() already does not rely on local_storage->smap since switching to kmalloc_nolock(). As local_storage->smap is removed, fix the outdated test by dropping the local_storage->smap check. Keep the second map in task local storage map test to test that multiple elements can be added to the storage similar to sk storage test. Acked-by: Alexei Starovoitov Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260205222916.1788211-18-ameryhung@gmail.com --- .../selftests/bpf/progs/local_storage.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index 637e75df2e14..d0be77011a84 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -62,7 +62,6 @@ SEC("lsm/inode_unlink") int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) { __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct bpf_local_storage *local_storage; struct local_storage *storage; struct task_struct *task; bool is_self_unlink; @@ -88,15 +87,10 @@ int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) if (!storage || storage->value) return 0; - if (bpf_task_storage_delete(&task_storage_map, task)) + if (bpf_task_storage_delete(&task_storage_map2, task)) return 0; - /* Ensure that the task_storage_map is disconnected from the storage. - * The storage memory should not be freed back to the - * bpf_mem_alloc. - */ - local_storage = task->bpf_storage; - if (!local_storage || local_storage->smap) + if (bpf_task_storage_delete(&task_storage_map, task)) return 0; task_storage_result = 0; @@ -164,18 +158,9 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, if (bpf_sk_storage_delete(&sk_storage_map2, sk)) return 0; - storage = bpf_sk_storage_get(&sk_storage_map2, sk, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); - if (!storage) - return 0; - if (bpf_sk_storage_delete(&sk_storage_map, sk)) return 0; - /* Ensure that the sk_storage_map is disconnected from the storage. */ - if (!sk->sk_bpf_storage || sk->sk_bpf_storage->smap) - return 0; - sk_storage_result = 0; return 0; }