From c1970e26bdc1209974bb5cf31cc23f2b7ad6ce50 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 1 Sep 2023 11:10:37 +0800 Subject: [PATCH 1/9] selftests/bpf: Fix a CI failure caused by vsock write While commit 90f0074cd9f9 ("selftests/bpf: fix a CI failure caused by vsock sockmap test") fixes a receive failure of vsock sockmap test, there is still a write failure: Error: #211/79 sockmap_listen/sockmap VSOCK test_vsock_redir Error: #211/79 sockmap_listen/sockmap VSOCK test_vsock_redir ./test_progs:vsock_unix_redir_connectible:1501: egress: write: Transport endpoint is not connected vsock_unix_redir_connectible:FAIL:1501 ./test_progs:vsock_unix_redir_connectible:1501: ingress: write: Transport endpoint is not connected vsock_unix_redir_connectible:FAIL:1501 ./test_progs:vsock_unix_redir_connectible:1501: egress: write: Transport endpoint is not connected vsock_unix_redir_connectible:FAIL:1501 The reason is that the vsock connection in the test is set to ESTABLISHED state by function virtio_transport_recv_pkt, which is executed in a workqueue thread, so when the user space test thread runs before the workqueue thread, this problem occurs. To fix it, before writing the connection, wait for it to be connected. Fixes: d61bd8c1fd02 ("selftests/bpf: add a test case for vsock sockmap") Signed-off-by: Xu Kuohai Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230901031037.3314007-1-xukuohai@huaweicloud.com --- .../bpf/prog_tests/sockmap_helpers.h | 26 +++++++++++++++++++ .../selftests/bpf/prog_tests/sockmap_listen.c | 7 +++++ 2 files changed, 33 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h index d12665490a90..36d829a65aa4 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h @@ -179,6 +179,32 @@ __ret; \ }) +static inline int poll_connect(int fd, unsigned int timeout_sec) +{ + struct timeval timeout = { .tv_sec = timeout_sec }; + fd_set wfds; + int r, eval; + socklen_t esize = sizeof(eval); + + FD_ZERO(&wfds); + FD_SET(fd, &wfds); + + r = select(fd + 1, NULL, &wfds, NULL, &timeout); + if (r == 0) + errno = ETIME; + if (r != 1) + return -1; + + if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0) + return -1; + if (eval != 0) { + errno = eval; + return -1; + } + + return 0; +} + static inline int poll_read(int fd, unsigned int timeout_sec) { struct timeval timeout = { .tv_sec = timeout_sec }; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 5674a9d0cacf..8df8cbb447f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1452,11 +1452,18 @@ static int vsock_socketpair_connectible(int sotype, int *v0, int *v1) if (p < 0) goto close_cli; + if (poll_connect(c, IO_TIMEOUT_SEC) < 0) { + FAIL_ERRNO("poll_connect"); + goto close_acc; + } + *v0 = p; *v1 = c; return 0; +close_acc: + close(p); close_cli: close(c); close_srv: From 3888fa134eddac467b5a094949a8f0731ef6ffd5 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 1 Sep 2023 15:59:35 +0300 Subject: [PATCH 2/9] docs/bpf: Fix "file doesn't exist" warnings in {llvm_reloc,btf}.rst scripts/documentation-file-ref-check reports warnings for (valid) cross-links of form: :ref:`Documentation/bpf/btf ` Adding extension to the file name helps to avoid the warning, e.g: :ref:`Documentation/bpf/btf.rst ` Fixes: be4033d36070 ("docs/bpf: Add description for CO-RE relocations") Reported-by: kernel test robot Signed-off-by: Eduard Zingerman Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Closes: https://lore.kernel.org/oe-kbuild-all/202309010804.G3MpXo59-lkp@intel.com Link: https://lore.kernel.org/bpf/20230901125935.487972-1-eddyz87@gmail.com --- Documentation/bpf/btf.rst | 2 +- Documentation/bpf/llvm_reloc.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst index ffc11afee569..e43c2fdafcd7 100644 --- a/Documentation/bpf/btf.rst +++ b/Documentation/bpf/btf.rst @@ -803,7 +803,7 @@ structure when .BTF.ext is generated. All ``bpf_core_relo`` structures within a single ``btf_ext_info_sec`` describe relocations applied to section named by ``btf_ext_info_sec->sec_name_off``. -See :ref:`Documentation/bpf/llvm_reloc ` +See :ref:`Documentation/bpf/llvm_reloc.rst ` for more information on CO-RE relocations. 4.2 .BTF_ids section diff --git a/Documentation/bpf/llvm_reloc.rst b/Documentation/bpf/llvm_reloc.rst index 73bf805000f2..44188e219d32 100644 --- a/Documentation/bpf/llvm_reloc.rst +++ b/Documentation/bpf/llvm_reloc.rst @@ -250,7 +250,7 @@ CO-RE Relocations From object file point of view CO-RE mechanism is implemented as a set of CO-RE specific relocation records. These relocation records are not related to ELF relocations and are encoded in .BTF.ext section. -See :ref:`Documentation/bpf/btf ` for more +See :ref:`Documentation/bpf/btf.rst ` for more information on .BTF.ext structure. CO-RE relocations are applied to BPF instructions to update immediate From a454d84ee20baf7bd7be90721b9821f73c7d23d9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 1 Sep 2023 13:21:37 -0700 Subject: [PATCH 3/9] bpf, sockmap: Fix skb refcnt race after locking changes There is a race where skb's from the sk_psock_backlog can be referenced after userspace side has already skb_consumed() the sk_buff and its refcnt dropped to zer0 causing use after free. The flow is the following: while ((skb = skb_peek(&psock->ingress_skb)) sk_psock_handle_Skb(psock, skb, ..., ingress) if (!ingress) ... sk_psock_skb_ingress sk_psock_skb_ingress_enqueue(skb) msg->skb = skb sk_psock_queue_msg(psock, msg) skb_dequeue(&psock->ingress_skb) The sk_psock_queue_msg() puts the msg on the ingress_msg queue. This is what the application reads when recvmsg() is called. An application can read this anytime after the msg is placed on the queue. The recvmsg hook will also read msg->skb and then after user space reads the msg will call consume_skb(skb) on it effectively free'ing it. But, the race is in above where backlog queue still has a reference to the skb and calls skb_dequeue(). If the skb_dequeue happens after the user reads and free's the skb we have a use after free. The !ingress case does not suffer from this problem because it uses sendmsg_*(sk, msg) which does not pass the sk_buff further down the stack. The following splat was observed with 'test_progs -t sockmap_listen': [ 1022.710250][ T2556] general protection fault, ... [...] [ 1022.712830][ T2556] Workqueue: events sk_psock_backlog [ 1022.713262][ T2556] RIP: 0010:skb_dequeue+0x4c/0x80 [ 1022.713653][ T2556] Code: ... [...] [ 1022.720699][ T2556] Call Trace: [ 1022.720984][ T2556] [ 1022.721254][ T2556] ? die_addr+0x32/0x80^M [ 1022.721589][ T2556] ? exc_general_protection+0x25a/0x4b0 [ 1022.722026][ T2556] ? asm_exc_general_protection+0x22/0x30 [ 1022.722489][ T2556] ? skb_dequeue+0x4c/0x80 [ 1022.722854][ T2556] sk_psock_backlog+0x27a/0x300 [ 1022.723243][ T2556] process_one_work+0x2a7/0x5b0 [ 1022.723633][ T2556] worker_thread+0x4f/0x3a0 [ 1022.723998][ T2556] ? __pfx_worker_thread+0x10/0x10 [ 1022.724386][ T2556] kthread+0xfd/0x130 [ 1022.724709][ T2556] ? __pfx_kthread+0x10/0x10 [ 1022.725066][ T2556] ret_from_fork+0x2d/0x50 [ 1022.725409][ T2556] ? __pfx_kthread+0x10/0x10 [ 1022.725799][ T2556] ret_from_fork_asm+0x1b/0x30 [ 1022.726201][ T2556] To fix we add an skb_get() before passing the skb to be enqueued in the engress queue. This bumps the skb->users refcnt so that consume_skb() and kfree_skb will not immediately free the sk_buff. With this we can be sure the skb is still around when we do the dequeue. Then we just need to decrement the refcnt or free the skb in the backlog case which we do by calling kfree_skb() on the ingress case as well as the sendmsg case. Before locking change from fixes tag we had the sock locked so we couldn't race with user and there was no issue here. Fixes: 799aa7f98d53e ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Reported-by: Jiri Olsa Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: Xu Kuohai Tested-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20230901202137.214666-1-john.fastabend@gmail.com --- net/core/skmsg.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a0659fc29bcc..6c31eefbd777 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -612,12 +612,18 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { + int err = 0; + if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } - return sk_psock_skb_ingress(psock, skb, off, len); + skb_get(skb); + err = sk_psock_skb_ingress(psock, skb, off, len); + if (err < 0) + kfree_skb(skb); + return err; } static void sk_psock_skb_state(struct sk_psock *psock, @@ -685,9 +691,7 @@ static void sk_psock_backlog(struct work_struct *work) } while (len); skb = skb_dequeue(&psock->ingress_skb); - if (!ingress) { - kfree_skb(skb); - } + kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); From 7645629f7dc88cd777f98970134bf1a54c8d77e3 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Aug 2023 10:04:04 +0200 Subject: [PATCH 4/9] bpf: Invoke __bpf_prog_exit_sleepable_recur() on recursion in kern_sys_bpf(). If __bpf_prog_enter_sleepable_recur() detects recursion then it returns 0 without undoing rcu_read_lock_trace(), migrate_disable() or decrementing the recursion counter. This is fine in the JIT case because the JIT code will jump in the 0 case to the end and invoke the matching exit trampoline (__bpf_prog_exit_sleepable_recur()). This is not the case in kern_sys_bpf() which returns directly to the caller with an error code. Add __bpf_prog_exit_sleepable_recur() as clean up in the recursion case. Fixes: b1d18a7574d0d ("bpf: Extend sys_bpf commands for bpf_syscall programs.") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20230830080405.251926-2-bigeasy@linutronix.de --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ebeb0695305a..53a0b62464e9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5505,6 +5505,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) run_ctx.saved_run_ctx = NULL; if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ + __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); bpf_prog_put(prog); return -EBUSY; } From 6764e767f4af1e35f87f3497e1182d945de37f93 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Aug 2023 10:04:05 +0200 Subject: [PATCH 5/9] bpf: Assign bpf_tramp_run_ctx::saved_run_ctx before recursion check. __bpf_prog_enter_recur() assigns bpf_tramp_run_ctx::saved_run_ctx before performing the recursion check which means in case of a recursion __bpf_prog_exit_recur() uses the previously set bpf_tramp_run_ctx::saved_run_ctx value. __bpf_prog_enter_sleepable_recur() assigns bpf_tramp_run_ctx::saved_run_ctx after the recursion check which means in case of a recursion __bpf_prog_exit_sleepable_recur() uses an uninitialized value. This does not look right. If I read the entry trampoline code right, then bpf_tramp_run_ctx isn't initialized upfront. Align __bpf_prog_enter_sleepable_recur() with __bpf_prog_enter_recur() and set bpf_tramp_run_ctx::saved_run_ctx before the recursion check is made. Remove the assignment of saved_run_ctx in kern_sys_bpf() since it happens a few cycles later. Fixes: e384c7b7b46d0 ("bpf, x86: Create bpf_tramp_run_ctx on the caller thread's stack") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20230830080405.251926-3-bigeasy@linutronix.de --- kernel/bpf/syscall.c | 1 - kernel/bpf/trampoline.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 53a0b62464e9..eb01c31ed591 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5502,7 +5502,6 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) } run_ctx.bpf_cookie = 0; - run_ctx.saved_run_ctx = NULL; if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 78acf28d4873..53ff50cac61e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -926,13 +926,12 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, migrate_disable(); might_fault(); + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); return 0; } - - run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - return bpf_prog_start_time(); } From a192103a11465e9d517975c50f9944dc80e44d61 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 6 Sep 2023 02:44:19 +0200 Subject: [PATCH 6/9] s390/bpf: Pass through tail call counter in trampolines s390x eBPF programs use the following extension to the s390x calling convention: tail call counter is passed on stack at offset STK_OFF_TCCNT, which callees otherwise use as scratch space. Currently trampoline does not respect this and clobbers tail call counter. This breaks enforcing tail call limits in eBPF programs, which have trampolines attached to them. Fix by forwarding a copy of the tail call counter to the original eBPF program in the trampoline (for fexit), and by restoring it at the end of the trampoline (for fentry). Fixes: 528eb2cb87bc ("s390/bpf: Implement arch_prepare_bpf_trampoline()") Reported-by: Leon Hwang Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230906004448.111674-1-iii@linux.ibm.com --- arch/s390/net/bpf_jit_comp.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 5e9371fbf3d5..de2fb12120d2 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2088,6 +2088,7 @@ struct bpf_tramp_jit { */ int r14_off; /* Offset of saved %r14 */ int run_ctx_off; /* Offset of struct bpf_tramp_run_ctx */ + int tccnt_off; /* Offset of saved tailcall counter */ int do_fexit; /* do_fexit: label */ }; @@ -2258,12 +2259,16 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, tjit->r14_off = alloc_stack(tjit, sizeof(u64)); tjit->run_ctx_off = alloc_stack(tjit, sizeof(struct bpf_tramp_run_ctx)); + tjit->tccnt_off = alloc_stack(tjit, sizeof(u64)); /* The caller has already reserved STACK_FRAME_OVERHEAD bytes. */ tjit->stack_size -= STACK_FRAME_OVERHEAD; tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD; /* aghi %r15,-stack_size */ EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size); + /* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */ + _EMIT6(0xd203f000 | tjit->tccnt_off, + 0xf000 | (tjit->stack_size + STK_OFF_TCCNT)); /* stmg %r2,%rN,fwd_reg_args_off(%r15) */ if (nr_reg_args) EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2, @@ -2400,6 +2405,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, (nr_stack_args * sizeof(u64) - 1) << 16 | tjit->stack_args_off, 0xf000 | tjit->orig_stack_args_off); + /* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off); /* lgr %r1,%r8 */ EMIT4(0xb9040000, REG_1, REG_8); /* %r1() */ @@ -2456,6 +2463,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET)) EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15, tjit->retval_off); + /* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */ + _EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT), + 0xf000 | tjit->tccnt_off); /* aghi %r15,stack_size */ EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); /* Emit an expoline for the following indirect jump. */ From a96a44aba556c42b432929d37d60158aca21ad4c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Sep 2023 16:11:27 -0700 Subject: [PATCH 7/9] bpf: bpf_sk_storage: Fix invalid wait context lockdep report './test_progs -t test_local_storage' reported a splat: [ 27.137569] ============================= [ 27.138122] [ BUG: Invalid wait context ] [ 27.138650] 6.5.0-03980-gd11ae1b16b0a #247 Tainted: G O [ 27.139542] ----------------------------- [ 27.140106] test_progs/1729 is trying to lock: [ 27.140713] ffff8883ef047b88 (stock_lock){-.-.}-{3:3}, at: local_lock_acquire+0x9/0x130 [ 27.141834] other info that might help us debug this: [ 27.142437] context-{5:5} [ 27.142856] 2 locks held by test_progs/1729: [ 27.143352] #0: ffffffff84bcd9c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x4/0x40 [ 27.144492] #1: ffff888107deb2c0 (&storage->lock){..-.}-{2:2}, at: bpf_local_storage_update+0x39e/0x8e0 [ 27.145855] stack backtrace: [ 27.146274] CPU: 0 PID: 1729 Comm: test_progs Tainted: G O 6.5.0-03980-gd11ae1b16b0a #247 [ 27.147550] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 27.149127] Call Trace: [ 27.149490] [ 27.149867] dump_stack_lvl+0x130/0x1d0 [ 27.152609] dump_stack+0x14/0x20 [ 27.153131] __lock_acquire+0x1657/0x2220 [ 27.153677] lock_acquire+0x1b8/0x510 [ 27.157908] local_lock_acquire+0x29/0x130 [ 27.159048] obj_cgroup_charge+0xf4/0x3c0 [ 27.160794] slab_pre_alloc_hook+0x28e/0x2b0 [ 27.161931] __kmem_cache_alloc_node+0x51/0x210 [ 27.163557] __kmalloc+0xaa/0x210 [ 27.164593] bpf_map_kzalloc+0xbc/0x170 [ 27.165147] bpf_selem_alloc+0x130/0x510 [ 27.166295] bpf_local_storage_update+0x5aa/0x8e0 [ 27.167042] bpf_fd_sk_storage_update_elem+0xdb/0x1a0 [ 27.169199] bpf_map_update_value+0x415/0x4f0 [ 27.169871] map_update_elem+0x413/0x550 [ 27.170330] __sys_bpf+0x5e9/0x640 [ 27.174065] __x64_sys_bpf+0x80/0x90 [ 27.174568] do_syscall_64+0x48/0xa0 [ 27.175201] entry_SYSCALL_64_after_hwframe+0x6e/0xd8 [ 27.175932] RIP: 0033:0x7effb40e41ad [ 27.176357] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d8 [ 27.179028] RSP: 002b:00007ffe64c21fc8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141 [ 27.180088] RAX: ffffffffffffffda RBX: 00007ffe64c22768 RCX: 00007effb40e41ad [ 27.181082] RDX: 0000000000000020 RSI: 00007ffe64c22008 RDI: 0000000000000002 [ 27.182030] RBP: 00007ffe64c21ff0 R08: 0000000000000000 R09: 00007ffe64c22788 [ 27.183038] R10: 0000000000000064 R11: 0000000000000202 R12: 0000000000000000 [ 27.184006] R13: 00007ffe64c22788 R14: 00007effb42a1000 R15: 0000000000000000 [ 27.184958] It complains about acquiring a local_lock while holding a raw_spin_lock. It means it should not allocate memory while holding a raw_spin_lock since it is not safe for RT. raw_spin_lock is needed because bpf_local_storage supports tracing context. In particular for task local storage, it is easy to get a "current" task PTR_TO_BTF_ID in tracing bpf prog. However, task (and cgroup) local storage has already been moved to bpf mem allocator which can be used after raw_spin_lock. The splat is for the sk storage. For sk (and inode) storage, it has not been moved to bpf mem allocator. Using raw_spin_lock or not, kzalloc(GFP_ATOMIC) could theoretically be unsafe in tracing context. However, the local storage helper requires a verifier accepted sk pointer (PTR_TO_BTF_ID), it is hypothetical if that (mean running a bpf prog in a kzalloc unsafe context and also able to hold a verifier accepted sk pointer) could happen. This patch avoids kzalloc after raw_spin_lock to silent the splat. There is an existing kzalloc before the raw_spin_lock. At that point, a kzalloc is very likely required because a lookup has just been done before. Thus, this patch always does the kzalloc before acquiring the raw_spin_lock and remove the later kzalloc usage after the raw_spin_lock. After this change, it will have a charge and then uncharge during the syscall bpf_map_update_elem() code path. This patch opts for simplicity and not continue the old optimization to save one charge and uncharge. This issue is dated back to the very first commit of bpf_sk_storage which had been refactored multiple times to create task, inode, and cgroup storage. This patch uses a Fixes tag with a more recent commit that should be easier to do backport. Fixes: b00fa38a9c1c ("bpf: Enable non-atomic allocations in local storage") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230901231129.578493-2-martin.lau@linux.dev --- kernel/bpf/bpf_local_storage.c | 47 ++++++++++------------------------ 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b5149cfce7d4..37ad47d52dc5 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -553,7 +553,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem = NULL; + struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -607,11 +607,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } - if (gfp_flags == GFP_KERNEL) { - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); - if (!selem) - return ERR_PTR(-ENOMEM); - } + /* A lookup has just been done before and concluded a new selem is + * needed. The chance of an unnecessary alloc is unlikely. + */ + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!alloc_selem) + return ERR_PTR(-ENOMEM); raw_spin_lock_irqsave(&local_storage->lock, flags); @@ -623,13 +624,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, * simple. */ err = -EAGAIN; - goto unlock_err; + goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) - goto unlock_err; + goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, @@ -638,23 +639,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - if (gfp_flags != GFP_KERNEL) { - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). - */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); - if (!selem) { - err = -ENOMEM; - goto unlock_err; - } - } - + alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map(smap, selem); @@ -665,20 +650,16 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false, false); + true, false); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); - return SDATA(selem); - -unlock_err: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (selem) { + if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); - bpf_selem_free(selem, smap, true); + bpf_selem_free(alloc_selem, smap, true); } - return ERR_PTR(err); + return err ? ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) From 55d49f750b1cb1f177fb1b00ae02cba4613bcfb7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Sep 2023 16:11:28 -0700 Subject: [PATCH 8/9] bpf: bpf_sk_storage: Fix the missing uncharge in sk_omem_alloc The commit c83597fa5dc6 ("bpf: Refactor some inode/task/sk storage functions for reuse"), refactored the bpf_{sk,task,inode}_storage_free() into bpf_local_storage_unlink_nolock() which then later renamed to bpf_local_storage_destroy(). The commit accidentally passed the "bool uncharge_mem = false" argument to bpf_selem_unlink_storage_nolock() which then stopped the uncharge from happening to the sk->sk_omem_alloc. This missing uncharge only happens when the sk is going away (during __sk_destruct). This patch fixes it by always passing "uncharge_mem = true". It is a noop to the task/inode/cgroup storage because they do not have the map_local_storage_(un)charge enabled in the map_ops. A followup patch will be done in bpf-next to remove the uncharge_mem argument. A selftest is added in the next patch. Fixes: c83597fa5dc6 ("bpf: Refactor some inode/task/sk storage functions for reuse") Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230901231129.578493-3-martin.lau@linux.dev --- kernel/bpf/bpf_local_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 37ad47d52dc5..146824cc9689 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -760,7 +760,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, true); + local_storage, selem, true, true); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); From a96d1cfb2da040bdf692d22022371b249742abb2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Sep 2023 16:11:29 -0700 Subject: [PATCH 9/9] selftests/bpf: Check bpf_sk_storage has uncharged sk_omem_alloc This patch checks the sk_omem_alloc has been uncharged by bpf_sk_storage during the __sk_destruct. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20230901231129.578493-4-martin.lau@linux.dev --- .../bpf/prog_tests/sk_storage_omem_uncharge.c | 56 +++++++++++++++++ .../selftests/bpf/progs/bpf_tracing_net.h | 1 + .../bpf/progs/sk_storage_omem_uncharge.c | 61 +++++++++++++++++++ 3 files changed, 118 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_storage_omem_uncharge.c create mode 100644 tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c diff --git a/tools/testing/selftests/bpf/prog_tests/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/prog_tests/sk_storage_omem_uncharge.c new file mode 100644 index 000000000000..f35852d245e3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sk_storage_omem_uncharge.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Facebook */ +#include +#include +#include +#include +#include "sk_storage_omem_uncharge.skel.h" + +void test_sk_storage_omem_uncharge(void) +{ + struct sk_storage_omem_uncharge *skel; + int sk_fd = -1, map_fd, err, value; + socklen_t optlen; + + skel = sk_storage_omem_uncharge__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel open_and_load")) + return; + map_fd = bpf_map__fd(skel->maps.sk_storage); + + /* A standalone socket not binding to addr:port, + * so nentns is not needed. + */ + sk_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (!ASSERT_GE(sk_fd, 0, "socket")) + goto done; + + optlen = sizeof(skel->bss->cookie); + err = getsockopt(sk_fd, SOL_SOCKET, SO_COOKIE, &skel->bss->cookie, &optlen); + if (!ASSERT_OK(err, "getsockopt(SO_COOKIE)")) + goto done; + + value = 0; + err = bpf_map_update_elem(map_fd, &sk_fd, &value, 0); + if (!ASSERT_OK(err, "bpf_map_update_elem(value=0)")) + goto done; + + value = 0xdeadbeef; + err = bpf_map_update_elem(map_fd, &sk_fd, &value, 0); + if (!ASSERT_OK(err, "bpf_map_update_elem(value=0xdeadbeef)")) + goto done; + + err = sk_storage_omem_uncharge__attach(skel); + if (!ASSERT_OK(err, "attach")) + goto done; + + close(sk_fd); + sk_fd = -1; + + ASSERT_EQ(skel->bss->cookie_found, 2, "cookie_found"); + ASSERT_EQ(skel->bss->omem, 0, "omem"); + +done: + sk_storage_omem_uncharge__destroy(skel); + if (sk_fd != -1) + close(sk_fd); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index cfed4df490f3..0b793a102791 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -88,6 +88,7 @@ #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_flags __sk_common.skc_flags #define sk_reuse __sk_common.skc_reuse +#define sk_cookie __sk_common.skc_cookie #define s6_addr32 in6_u.u6_addr32 diff --git a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c new file mode 100644 index 000000000000..3e745793b27a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Facebook */ +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include +#include +#include + +void *local_storage_ptr = NULL; +void *sk_ptr = NULL; +int cookie_found = 0; +__u64 cookie = 0; +__u32 omem = 0; + +void *bpf_rdonly_cast(void *, __u32) __ksym; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_storage SEC(".maps"); + +SEC("fexit/bpf_local_storage_destroy") +int BPF_PROG(bpf_local_storage_destroy, struct bpf_local_storage *local_storage) +{ + struct sock *sk; + + if (local_storage_ptr != local_storage) + return 0; + + sk = bpf_rdonly_cast(sk_ptr, bpf_core_type_id_kernel(struct sock)); + if (sk->sk_cookie.counter != cookie) + return 0; + + cookie_found++; + omem = sk->sk_omem_alloc.counter; + local_storage_ptr = NULL; + + return 0; +} + +SEC("fentry/inet6_sock_destruct") +int BPF_PROG(inet6_sock_destruct, struct sock *sk) +{ + int *value; + + if (!cookie || sk->sk_cookie.counter != cookie) + return 0; + + value = bpf_sk_storage_get(&sk_storage, sk, 0, 0); + if (value && *value == 0xdeadbeef) { + cookie_found++; + sk_ptr = sk; + local_storage_ptr = sk->sk_bpf_storage; + } + + return 0; +} + +char _license[] SEC("license") = "GPL";