mirror of
https://github.com/torvalds/linux.git
synced 2026-05-31 10:33:41 +02:00
Merge branch 'free-htab-element-out-of-bucket-lock'
Hou Tao says: ==================== The patch set continues the previous work [1] to move all the freeings of htab elements out of bucket lock. One motivation for the patch set is the locking problem reported by Sebastian [2]: the freeing of bpf_timer under PREEMPT_RT may acquire a spin-lock (namely softirq_expiry_lock). However the freeing procedure for htab element has already held a raw-spin-lock (namely bucket lock), and it will trigger the warning: "BUG: scheduling while atomic" as demonstrated by the selftests patch. Another motivation is to reduce the locked scope of bucket lock. However, the patch set doesn't move all freeing of htab element out of bucket lock, it still keep the free of special fields in pre-allocated hash map under the protect of bucket lock in htab_map_update_elem(). The patch set is structured as follows: * Patch #1 moves the element freeing out of bucket lock for htab_lru_map_delete_node(). However the freeing is still in the locked scope of LRU raw spin lock. * Patch #2~#3 move the element freeing out of bucket lock for __htab_map_lookup_and_delete_elem() * Patch #4 cancels the bpf_timer in two steps to fix the locking problem in htab_map_update_elem() for PREEMPT_PRT. * Patch #5 adds a selftest for the locking problem Please see individual patches for more details. Comments are always welcome. --- v3: * patch #1: update the commit message to state that the freeing of special field is still in the locked scope of LRU raw spin lock * patch #4: cancel the bpf_timer in two steps only for PREEMPT_RT (suggested by Alexei) v2: https://lore.kernel.org/bpf/20250109061901.2620825-1-houtao@huaweicloud.com * cancels the bpf timer in two steps instead of breaking the reuse the refill of per-cpu ->extra_elems into two steps v1: https://lore.kernel.org/bpf/20250107085559.3081563-1-houtao@huaweicloud.com [1]: https://lore.kernel.org/bpf/20241106063542.357743-1-houtao@huaweicloud.com [2]: https://lore.kernel.org/bpf/20241106084527.4gPrMnHt@linutronix.de ==================== Link: https://patch.msgid.link/20250117101816.2101857-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
commit
d10cafc5d5
|
|
@ -824,13 +824,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
|
|||
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
|
||||
if (l == tgt_l) {
|
||||
hlist_nulls_del_rcu(&l->hash_node);
|
||||
check_and_free_fields(htab, l);
|
||||
bpf_map_dec_elem_count(&htab->map);
|
||||
break;
|
||||
}
|
||||
|
||||
htab_unlock_bucket(htab, b, tgt_l->hash, flags);
|
||||
|
||||
if (l == tgt_l)
|
||||
check_and_free_fields(htab, l);
|
||||
return l == tgt_l;
|
||||
}
|
||||
|
||||
|
|
@ -1634,41 +1635,44 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
|
|||
l = lookup_elem_raw(head, hash, key, key_size);
|
||||
if (!l) {
|
||||
ret = -ENOENT;
|
||||
} else {
|
||||
if (is_percpu) {
|
||||
u32 roundup_value_size = round_up(map->value_size, 8);
|
||||
void __percpu *pptr;
|
||||
int off = 0, cpu;
|
||||
|
||||
pptr = htab_elem_get_ptr(l, key_size);
|
||||
for_each_possible_cpu(cpu) {
|
||||
copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
|
||||
check_and_init_map_value(&htab->map, value + off);
|
||||
off += roundup_value_size;
|
||||
}
|
||||
} else {
|
||||
u32 roundup_key_size = round_up(map->key_size, 8);
|
||||
|
||||
if (flags & BPF_F_LOCK)
|
||||
copy_map_value_locked(map, value, l->key +
|
||||
roundup_key_size,
|
||||
true);
|
||||
else
|
||||
copy_map_value(map, value, l->key +
|
||||
roundup_key_size);
|
||||
/* Zeroing special fields in the temp buffer */
|
||||
check_and_init_map_value(map, value);
|
||||
}
|
||||
|
||||
hlist_nulls_del_rcu(&l->hash_node);
|
||||
if (!is_lru_map)
|
||||
free_htab_elem(htab, l);
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
if (is_percpu) {
|
||||
u32 roundup_value_size = round_up(map->value_size, 8);
|
||||
void __percpu *pptr;
|
||||
int off = 0, cpu;
|
||||
|
||||
pptr = htab_elem_get_ptr(l, key_size);
|
||||
for_each_possible_cpu(cpu) {
|
||||
copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
|
||||
check_and_init_map_value(&htab->map, value + off);
|
||||
off += roundup_value_size;
|
||||
}
|
||||
} else {
|
||||
u32 roundup_key_size = round_up(map->key_size, 8);
|
||||
|
||||
if (flags & BPF_F_LOCK)
|
||||
copy_map_value_locked(map, value, l->key +
|
||||
roundup_key_size,
|
||||
true);
|
||||
else
|
||||
copy_map_value(map, value, l->key +
|
||||
roundup_key_size);
|
||||
/* Zeroing special fields in the temp buffer */
|
||||
check_and_init_map_value(map, value);
|
||||
}
|
||||
hlist_nulls_del_rcu(&l->hash_node);
|
||||
|
||||
out_unlock:
|
||||
htab_unlock_bucket(htab, b, hash, bflags);
|
||||
|
||||
if (is_lru_map && l)
|
||||
htab_lru_push_free(htab, l);
|
||||
if (l) {
|
||||
if (is_lru_map)
|
||||
htab_lru_push_free(htab, l);
|
||||
else
|
||||
free_htab_elem(htab, l);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1593,10 +1593,24 @@ void bpf_timer_cancel_and_free(void *val)
|
|||
* To avoid these issues, punt to workqueue context when we are in a
|
||||
* timer callback.
|
||||
*/
|
||||
if (this_cpu_read(hrtimer_running))
|
||||
if (this_cpu_read(hrtimer_running)) {
|
||||
queue_work(system_unbound_wq, &t->cb.delete_work);
|
||||
else
|
||||
return;
|
||||
}
|
||||
|
||||
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
|
||||
/* If the timer is running on other CPU, also use a kworker to
|
||||
* wait for the completion of the timer instead of trying to
|
||||
* acquire a sleepable lock in hrtimer_cancel() to wait for its
|
||||
* completion.
|
||||
*/
|
||||
if (hrtimer_try_to_cancel(&t->timer) >= 0)
|
||||
kfree_rcu(t, cb.rcu);
|
||||
else
|
||||
queue_work(system_unbound_wq, &t->cb.delete_work);
|
||||
} else {
|
||||
bpf_timer_delete_work(&t->cb.delete_work);
|
||||
}
|
||||
}
|
||||
|
||||
/* This function is called by map_delete/update_elem for individual element and
|
||||
|
|
|
|||
165
tools/testing/selftests/bpf/prog_tests/free_timer.c
Normal file
165
tools/testing/selftests/bpf/prog_tests/free_timer.c
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
|
||||
#define _GNU_SOURCE
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <test_progs.h>
|
||||
|
||||
#include "free_timer.skel.h"
|
||||
|
||||
struct run_ctx {
|
||||
struct bpf_program *start_prog;
|
||||
struct bpf_program *overwrite_prog;
|
||||
pthread_barrier_t notify;
|
||||
int loop;
|
||||
bool start;
|
||||
bool stop;
|
||||
};
|
||||
|
||||
static void start_threads(struct run_ctx *ctx)
|
||||
{
|
||||
ctx->start = true;
|
||||
}
|
||||
|
||||
static void stop_threads(struct run_ctx *ctx)
|
||||
{
|
||||
ctx->stop = true;
|
||||
/* Guarantee the order between ->stop and ->start */
|
||||
__atomic_store_n(&ctx->start, true, __ATOMIC_RELEASE);
|
||||
}
|
||||
|
||||
static int wait_for_start(struct run_ctx *ctx)
|
||||
{
|
||||
while (!__atomic_load_n(&ctx->start, __ATOMIC_ACQUIRE))
|
||||
usleep(10);
|
||||
|
||||
return ctx->stop;
|
||||
}
|
||||
|
||||
static void *overwrite_timer_fn(void *arg)
|
||||
{
|
||||
struct run_ctx *ctx = arg;
|
||||
int loop, fd, err;
|
||||
cpu_set_t cpuset;
|
||||
long ret = 0;
|
||||
|
||||
/* Pin on CPU 0 */
|
||||
CPU_ZERO(&cpuset);
|
||||
CPU_SET(0, &cpuset);
|
||||
pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
|
||||
|
||||
/* Is the thread being stopped ? */
|
||||
err = wait_for_start(ctx);
|
||||
if (err)
|
||||
return NULL;
|
||||
|
||||
fd = bpf_program__fd(ctx->overwrite_prog);
|
||||
loop = ctx->loop;
|
||||
while (loop-- > 0) {
|
||||
LIBBPF_OPTS(bpf_test_run_opts, opts);
|
||||
|
||||
/* Wait for start thread to complete */
|
||||
pthread_barrier_wait(&ctx->notify);
|
||||
|
||||
/* Overwrite timers */
|
||||
err = bpf_prog_test_run_opts(fd, &opts);
|
||||
if (err)
|
||||
ret |= 1;
|
||||
else if (opts.retval)
|
||||
ret |= 2;
|
||||
|
||||
/* Notify start thread to start timers */
|
||||
pthread_barrier_wait(&ctx->notify);
|
||||
}
|
||||
|
||||
return (void *)ret;
|
||||
}
|
||||
|
||||
static void *start_timer_fn(void *arg)
|
||||
{
|
||||
struct run_ctx *ctx = arg;
|
||||
int loop, fd, err;
|
||||
cpu_set_t cpuset;
|
||||
long ret = 0;
|
||||
|
||||
/* Pin on CPU 1 */
|
||||
CPU_ZERO(&cpuset);
|
||||
CPU_SET(1, &cpuset);
|
||||
pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
|
||||
|
||||
/* Is the thread being stopped ? */
|
||||
err = wait_for_start(ctx);
|
||||
if (err)
|
||||
return NULL;
|
||||
|
||||
fd = bpf_program__fd(ctx->start_prog);
|
||||
loop = ctx->loop;
|
||||
while (loop-- > 0) {
|
||||
LIBBPF_OPTS(bpf_test_run_opts, opts);
|
||||
|
||||
/* Run the prog to start timer */
|
||||
err = bpf_prog_test_run_opts(fd, &opts);
|
||||
if (err)
|
||||
ret |= 4;
|
||||
else if (opts.retval)
|
||||
ret |= 8;
|
||||
|
||||
/* Notify overwrite thread to do overwrite */
|
||||
pthread_barrier_wait(&ctx->notify);
|
||||
|
||||
/* Wait for overwrite thread to complete */
|
||||
pthread_barrier_wait(&ctx->notify);
|
||||
}
|
||||
|
||||
return (void *)ret;
|
||||
}
|
||||
|
||||
void test_free_timer(void)
|
||||
{
|
||||
struct free_timer *skel;
|
||||
struct bpf_program *prog;
|
||||
struct run_ctx ctx;
|
||||
pthread_t tid[2];
|
||||
void *ret;
|
||||
int err;
|
||||
|
||||
skel = free_timer__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "open_load"))
|
||||
return;
|
||||
|
||||
memset(&ctx, 0, sizeof(ctx));
|
||||
|
||||
prog = bpf_object__find_program_by_name(skel->obj, "start_timer");
|
||||
if (!ASSERT_OK_PTR(prog, "find start prog"))
|
||||
goto out;
|
||||
ctx.start_prog = prog;
|
||||
|
||||
prog = bpf_object__find_program_by_name(skel->obj, "overwrite_timer");
|
||||
if (!ASSERT_OK_PTR(prog, "find overwrite prog"))
|
||||
goto out;
|
||||
ctx.overwrite_prog = prog;
|
||||
|
||||
pthread_barrier_init(&ctx.notify, NULL, 2);
|
||||
ctx.loop = 10;
|
||||
|
||||
err = pthread_create(&tid[0], NULL, start_timer_fn, &ctx);
|
||||
if (!ASSERT_OK(err, "create start_timer"))
|
||||
goto out;
|
||||
|
||||
err = pthread_create(&tid[1], NULL, overwrite_timer_fn, &ctx);
|
||||
if (!ASSERT_OK(err, "create overwrite_timer")) {
|
||||
stop_threads(&ctx);
|
||||
goto out;
|
||||
}
|
||||
|
||||
start_threads(&ctx);
|
||||
|
||||
ret = NULL;
|
||||
err = pthread_join(tid[0], &ret);
|
||||
ASSERT_EQ(err | (long)ret, 0, "start_timer");
|
||||
ret = NULL;
|
||||
err = pthread_join(tid[1], &ret);
|
||||
ASSERT_EQ(err | (long)ret, 0, "overwrite_timer");
|
||||
out:
|
||||
free_timer__destroy(skel);
|
||||
}
|
||||
71
tools/testing/selftests/bpf/progs/free_timer.c
Normal file
71
tools/testing/selftests/bpf/progs/free_timer.c
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (C) 2025. Huawei Technologies Co., Ltd */
|
||||
#include <linux/bpf.h>
|
||||
#include <time.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
#define MAX_ENTRIES 8
|
||||
|
||||
struct map_value {
|
||||
struct bpf_timer timer;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, int);
|
||||
__type(value, struct map_value);
|
||||
__uint(max_entries, MAX_ENTRIES);
|
||||
} map SEC(".maps");
|
||||
|
||||
static int timer_cb(void *map, void *key, struct map_value *value)
|
||||
{
|
||||
volatile int sum = 0;
|
||||
int i;
|
||||
|
||||
bpf_for(i, 0, 1024 * 1024) sum += i;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int start_cb(int key)
|
||||
{
|
||||
struct map_value *value;
|
||||
|
||||
value = bpf_map_lookup_elem(&map, (void *)&key);
|
||||
if (!value)
|
||||
return 0;
|
||||
|
||||
bpf_timer_init(&value->timer, &map, CLOCK_MONOTONIC);
|
||||
bpf_timer_set_callback(&value->timer, timer_cb);
|
||||
/* Hope 100us will be enough to wake-up and run the overwrite thread */
|
||||
bpf_timer_start(&value->timer, 100000, BPF_F_TIMER_CPU_PIN);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int overwrite_cb(int key)
|
||||
{
|
||||
struct map_value zero = {};
|
||||
|
||||
/* Free the timer which may run on other CPU */
|
||||
bpf_map_update_elem(&map, (void *)&key, &zero, BPF_ANY);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("syscall")
|
||||
int BPF_PROG(start_timer)
|
||||
{
|
||||
bpf_loop(MAX_ENTRIES, start_cb, NULL, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("syscall")
|
||||
int BPF_PROG(overwrite_timer)
|
||||
{
|
||||
bpf_loop(MAX_ENTRIES, overwrite_cb, NULL, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
Loading…
Reference in New Issue
Block a user