From c1970e26bdc1209974bb5cf31cc23f2b7ad6ce50 Mon Sep 17 00:00:00 2001
From: Xu Kuohai <xukuohai@huawei.com>
Date: Fri, 1 Sep 2023 11:10:37 +0800
Subject: [PATCH 1/9] selftests/bpf: Fix a CI failure caused by vsock write

While commit 90f0074cd9f9 ("selftests/bpf: fix a CI failure caused by vsock sockmap test")
fixes a receive failure of vsock sockmap test, there is still a write failure:

Error: #211/79 sockmap_listen/sockmap VSOCK test_vsock_redir
Error: #211/79 sockmap_listen/sockmap VSOCK test_vsock_redir
  ./test_progs:vsock_unix_redir_connectible:1501: egress: write: Transport endpoint is not connected
  vsock_unix_redir_connectible:FAIL:1501
  ./test_progs:vsock_unix_redir_connectible:1501: ingress: write: Transport endpoint is not connected
  vsock_unix_redir_connectible:FAIL:1501
  ./test_progs:vsock_unix_redir_connectible:1501: egress: write: Transport endpoint is not connected
  vsock_unix_redir_connectible:FAIL:1501

The reason is that the vsock connection in the test is set to ESTABLISHED state
by function virtio_transport_recv_pkt, which is executed in a workqueue thread,
so when the user space test thread runs before the workqueue thread, this
problem occurs.

To fix it, before writing the connection, wait for it to be connected.

Fixes: d61bd8c1fd02 ("selftests/bpf: add a test case for vsock sockmap")
Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230901031037.3314007-1-xukuohai@huaweicloud.com
---
 .../bpf/prog_tests/sockmap_helpers.h          | 26 +++++++++++++++++++
 .../selftests/bpf/prog_tests/sockmap_listen.c |  7 +++++
 2 files changed, 33 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h
index d12665490a90..36d829a65aa4 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_helpers.h
@@ -179,6 +179,32 @@
 		__ret;                                                         \
 	})
 
+static inline int poll_connect(int fd, unsigned int timeout_sec)
+{
+	struct timeval timeout = { .tv_sec = timeout_sec };
+	fd_set wfds;
+	int r, eval;
+	socklen_t esize = sizeof(eval);
+
+	FD_ZERO(&wfds);
+	FD_SET(fd, &wfds);
+
+	r = select(fd + 1, NULL, &wfds, NULL, &timeout);
+	if (r == 0)
+		errno = ETIME;
+	if (r != 1)
+		return -1;
+
+	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &eval, &esize) < 0)
+		return -1;
+	if (eval != 0) {
+		errno = eval;
+		return -1;
+	}
+
+	return 0;
+}
+
 static inline int poll_read(int fd, unsigned int timeout_sec)
 {
 	struct timeval timeout = { .tv_sec = timeout_sec };
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index 5674a9d0cacf..8df8cbb447f1 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -1452,11 +1452,18 @@ static int vsock_socketpair_connectible(int sotype, int *v0, int *v1)
 	if (p < 0)
 		goto close_cli;
 
+	if (poll_connect(c, IO_TIMEOUT_SEC) < 0) {
+		FAIL_ERRNO("poll_connect");
+		goto close_acc;
+	}
+
 	*v0 = p;
 	*v1 = c;
 
 	return 0;
 
+close_acc:
+	close(p);
 close_cli:
 	close(c);
 close_srv:

From 3888fa134eddac467b5a094949a8f0731ef6ffd5 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Fri, 1 Sep 2023 15:59:35 +0300
Subject: [PATCH 2/9] docs/bpf: Fix "file doesn't exist" warnings in
 {llvm_reloc,btf}.rst

scripts/documentation-file-ref-check reports warnings for (valid) cross-links
of form:

  :ref:`Documentation/bpf/btf <BTF_Ext_Section>`

Adding extension to the file name helps to avoid the warning, e.g:

  :ref:`Documentation/bpf/btf.rst <BTF_Ext_Section>`

Fixes: be4033d36070 ("docs/bpf: Add description for CO-RE relocations")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Closes: https://lore.kernel.org/oe-kbuild-all/202309010804.G3MpXo59-lkp@intel.com
Link: https://lore.kernel.org/bpf/20230901125935.487972-1-eddyz87@gmail.com
---
 Documentation/bpf/btf.rst        | 2 +-
 Documentation/bpf/llvm_reloc.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst
index ffc11afee569..e43c2fdafcd7 100644
--- a/Documentation/bpf/btf.rst
+++ b/Documentation/bpf/btf.rst
@@ -803,7 +803,7 @@ structure when .BTF.ext is generated. All ``bpf_core_relo`` structures
 within a single ``btf_ext_info_sec`` describe relocations applied to
 section named by ``btf_ext_info_sec->sec_name_off``.
 
-See :ref:`Documentation/bpf/llvm_reloc <btf-co-re-relocations>`
+See :ref:`Documentation/bpf/llvm_reloc.rst <btf-co-re-relocations>`
 for more information on CO-RE relocations.
 
 4.2 .BTF_ids section
diff --git a/Documentation/bpf/llvm_reloc.rst b/Documentation/bpf/llvm_reloc.rst
index 73bf805000f2..44188e219d32 100644
--- a/Documentation/bpf/llvm_reloc.rst
+++ b/Documentation/bpf/llvm_reloc.rst
@@ -250,7 +250,7 @@ CO-RE Relocations
 From object file point of view CO-RE mechanism is implemented as a set
 of CO-RE specific relocation records. These relocation records are not
 related to ELF relocations and are encoded in .BTF.ext section.
-See :ref:`Documentation/bpf/btf <BTF_Ext_Section>` for more
+See :ref:`Documentation/bpf/btf.rst <BTF_Ext_Section>` for more
 information on .BTF.ext structure.
 
 CO-RE relocations are applied to BPF instructions to update immediate

From a454d84ee20baf7bd7be90721b9821f73c7d23d9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 1 Sep 2023 13:21:37 -0700
Subject: [PATCH 3/9] bpf, sockmap: Fix skb refcnt race after locking changes

There is a race where skb's from the sk_psock_backlog can be referenced
after userspace side has already skb_consumed() the sk_buff and its refcnt
dropped to zer0 causing use after free.

The flow is the following:

  while ((skb = skb_peek(&psock->ingress_skb))
    sk_psock_handle_Skb(psock, skb, ..., ingress)
    if (!ingress) ...
    sk_psock_skb_ingress
       sk_psock_skb_ingress_enqueue(skb)
          msg->skb = skb
          sk_psock_queue_msg(psock, msg)
    skb_dequeue(&psock->ingress_skb)

The sk_psock_queue_msg() puts the msg on the ingress_msg queue. This is
what the application reads when recvmsg() is called. An application can
read this anytime after the msg is placed on the queue. The recvmsg hook
will also read msg->skb and then after user space reads the msg will call
consume_skb(skb) on it effectively free'ing it.

But, the race is in above where backlog queue still has a reference to
the skb and calls skb_dequeue(). If the skb_dequeue happens after the
user reads and free's the skb we have a use after free.

The !ingress case does not suffer from this problem because it uses
sendmsg_*(sk, msg) which does not pass the sk_buff further down the
stack.

The following splat was observed with 'test_progs -t sockmap_listen':

  [ 1022.710250][ T2556] general protection fault, ...
  [...]
  [ 1022.712830][ T2556] Workqueue: events sk_psock_backlog
  [ 1022.713262][ T2556] RIP: 0010:skb_dequeue+0x4c/0x80
  [ 1022.713653][ T2556] Code: ...
  [...]
  [ 1022.720699][ T2556] Call Trace:
  [ 1022.720984][ T2556]  <TASK>
  [ 1022.721254][ T2556]  ? die_addr+0x32/0x80^M
  [ 1022.721589][ T2556]  ? exc_general_protection+0x25a/0x4b0
  [ 1022.722026][ T2556]  ? asm_exc_general_protection+0x22/0x30
  [ 1022.722489][ T2556]  ? skb_dequeue+0x4c/0x80
  [ 1022.722854][ T2556]  sk_psock_backlog+0x27a/0x300
  [ 1022.723243][ T2556]  process_one_work+0x2a7/0x5b0
  [ 1022.723633][ T2556]  worker_thread+0x4f/0x3a0
  [ 1022.723998][ T2556]  ? __pfx_worker_thread+0x10/0x10
  [ 1022.724386][ T2556]  kthread+0xfd/0x130
  [ 1022.724709][ T2556]  ? __pfx_kthread+0x10/0x10
  [ 1022.725066][ T2556]  ret_from_fork+0x2d/0x50
  [ 1022.725409][ T2556]  ? __pfx_kthread+0x10/0x10
  [ 1022.725799][ T2556]  ret_from_fork_asm+0x1b/0x30
  [ 1022.726201][ T2556]  </TASK>

To fix we add an skb_get() before passing the skb to be enqueued in the
engress queue. This bumps the skb->users refcnt so that consume_skb()
and kfree_skb will not immediately free the sk_buff. With this we can
be sure the skb is still around when we do the dequeue. Then we just
need to decrement the refcnt or free the skb in the backlog case which
we do by calling kfree_skb() on the ingress case as well as the sendmsg
case.

Before locking change from fixes tag we had the sock locked so we
couldn't race with user and there was no issue here.

Fixes: 799aa7f98d53e ("skmsg: Avoid lock_sock() in sk_psock_backlog()")
Reported-by: Jiri Olsa  <jolsa@kernel.org>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Xu Kuohai <xukuohai@huawei.com>
Tested-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20230901202137.214666-1-john.fastabend@gmail.com
---
 net/core/skmsg.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index a0659fc29bcc..6c31eefbd777 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -612,12 +612,18 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 			       u32 off, u32 len, bool ingress)
 {
+	int err = 0;
+
 	if (!ingress) {
 		if (!sock_writeable(psock->sk))
 			return -EAGAIN;
 		return skb_send_sock(psock->sk, skb, off, len);
 	}
-	return sk_psock_skb_ingress(psock, skb, off, len);
+	skb_get(skb);
+	err = sk_psock_skb_ingress(psock, skb, off, len);
+	if (err < 0)
+		kfree_skb(skb);
+	return err;
 }
 
 static void sk_psock_skb_state(struct sk_psock *psock,
@@ -685,9 +691,7 @@ static void sk_psock_backlog(struct work_struct *work)
 		} while (len);
 
 		skb = skb_dequeue(&psock->ingress_skb);
-		if (!ingress) {
-			kfree_skb(skb);
-		}
+		kfree_skb(skb);
 	}
 end:
 	mutex_unlock(&psock->work_mutex);

From 7645629f7dc88cd777f98970134bf1a54c8d77e3 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 30 Aug 2023 10:04:04 +0200
Subject: [PATCH 4/9] bpf: Invoke __bpf_prog_exit_sleepable_recur() on
 recursion in kern_sys_bpf().

If __bpf_prog_enter_sleepable_recur() detects recursion then it returns
0 without undoing rcu_read_lock_trace(), migrate_disable() or
decrementing the recursion counter. This is fine in the JIT case because
the JIT code will jump in the 0 case to the end and invoke the matching
exit trampoline (__bpf_prog_exit_sleepable_recur()).

This is not the case in kern_sys_bpf() which returns directly to the
caller with an error code.

Add __bpf_prog_exit_sleepable_recur() as clean up in the recursion case.

Fixes: b1d18a7574d0d ("bpf: Extend sys_bpf commands for bpf_syscall programs.")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20230830080405.251926-2-bigeasy@linutronix.de
---
 kernel/bpf/syscall.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ebeb0695305a..53a0b62464e9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5505,6 +5505,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
 		run_ctx.saved_run_ctx = NULL;
 		if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
 			/* recursion detected */
+			__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
 			bpf_prog_put(prog);
 			return -EBUSY;
 		}

From 6764e767f4af1e35f87f3497e1182d945de37f93 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 30 Aug 2023 10:04:05 +0200
Subject: [PATCH 5/9] bpf: Assign bpf_tramp_run_ctx::saved_run_ctx before
 recursion check.

__bpf_prog_enter_recur() assigns bpf_tramp_run_ctx::saved_run_ctx before
performing the recursion check which means in case of a recursion
__bpf_prog_exit_recur() uses the previously set bpf_tramp_run_ctx::saved_run_ctx
value.

__bpf_prog_enter_sleepable_recur() assigns bpf_tramp_run_ctx::saved_run_ctx
after the recursion check which means in case of a recursion
__bpf_prog_exit_sleepable_recur() uses an uninitialized value. This does not
look right. If I read the entry trampoline code right, then bpf_tramp_run_ctx
isn't initialized upfront.

Align __bpf_prog_enter_sleepable_recur() with __bpf_prog_enter_recur() and
set bpf_tramp_run_ctx::saved_run_ctx before the recursion check is made.
Remove the assignment of saved_run_ctx in kern_sys_bpf() since it happens
a few cycles later.

Fixes: e384c7b7b46d0 ("bpf, x86: Create bpf_tramp_run_ctx on the caller thread's stack")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20230830080405.251926-3-bigeasy@linutronix.de
---
 kernel/bpf/syscall.c    | 1 -
 kernel/bpf/trampoline.c | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 53a0b62464e9..eb01c31ed591 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5502,7 +5502,6 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
 		}
 
 		run_ctx.bpf_cookie = 0;
-		run_ctx.saved_run_ctx = NULL;
 		if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
 			/* recursion detected */
 			__bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 78acf28d4873..53ff50cac61e 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -926,13 +926,12 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
 	migrate_disable();
 	might_fault();
 
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
 	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
 		bpf_prog_inc_misses_counter(prog);
 		return 0;
 	}
-
-	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
-
 	return bpf_prog_start_time();
 }
 

From a192103a11465e9d517975c50f9944dc80e44d61 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Wed, 6 Sep 2023 02:44:19 +0200
Subject: [PATCH 6/9] s390/bpf: Pass through tail call counter in trampolines

s390x eBPF programs use the following extension to the s390x calling
convention: tail call counter is passed on stack at offset
STK_OFF_TCCNT, which callees otherwise use as scratch space.

Currently trampoline does not respect this and clobbers tail call
counter. This breaks enforcing tail call limits in eBPF programs, which
have trampolines attached to them.

Fix by forwarding a copy of the tail call counter to the original eBPF
program in the trampoline (for fexit), and by restoring it at the end
of the trampoline (for fentry).

Fixes: 528eb2cb87bc ("s390/bpf: Implement arch_prepare_bpf_trampoline()")
Reported-by: Leon Hwang <hffilwlqm@gmail.com>
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230906004448.111674-1-iii@linux.ibm.com
---
 arch/s390/net/bpf_jit_comp.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 5e9371fbf3d5..de2fb12120d2 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -2088,6 +2088,7 @@ struct bpf_tramp_jit {
 				 */
 	int r14_off;		/* Offset of saved %r14 */
 	int run_ctx_off;	/* Offset of struct bpf_tramp_run_ctx */
+	int tccnt_off;		/* Offset of saved tailcall counter */
 	int do_fexit;		/* do_fexit: label */
 };
 
@@ -2258,12 +2259,16 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 	tjit->r14_off = alloc_stack(tjit, sizeof(u64));
 	tjit->run_ctx_off = alloc_stack(tjit,
 					sizeof(struct bpf_tramp_run_ctx));
+	tjit->tccnt_off = alloc_stack(tjit, sizeof(u64));
 	/* The caller has already reserved STACK_FRAME_OVERHEAD bytes. */
 	tjit->stack_size -= STACK_FRAME_OVERHEAD;
 	tjit->orig_stack_args_off = tjit->stack_size + STACK_FRAME_OVERHEAD;
 
 	/* aghi %r15,-stack_size */
 	EMIT4_IMM(0xa70b0000, REG_15, -tjit->stack_size);
+	/* mvc tccnt_off(4,%r15),stack_size+STK_OFF_TCCNT(%r15) */
+	_EMIT6(0xd203f000 | tjit->tccnt_off,
+	       0xf000 | (tjit->stack_size + STK_OFF_TCCNT));
 	/* stmg %r2,%rN,fwd_reg_args_off(%r15) */
 	if (nr_reg_args)
 		EMIT6_DISP_LH(0xeb000000, 0x0024, REG_2,
@@ -2400,6 +2405,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 				       (nr_stack_args * sizeof(u64) - 1) << 16 |
 				       tjit->stack_args_off,
 			       0xf000 | tjit->orig_stack_args_off);
+		/* mvc STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */
+		_EMIT6(0xd203f000 | STK_OFF_TCCNT, 0xf000 | tjit->tccnt_off);
 		/* lgr %r1,%r8 */
 		EMIT4(0xb9040000, REG_1, REG_8);
 		/* %r1() */
@@ -2456,6 +2463,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im,
 	if (flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET))
 		EMIT6_DISP_LH(0xe3000000, 0x0004, REG_2, REG_0, REG_15,
 			      tjit->retval_off);
+	/* mvc stack_size+STK_OFF_TCCNT(4,%r15),tccnt_off(%r15) */
+	_EMIT6(0xd203f000 | (tjit->stack_size + STK_OFF_TCCNT),
+	       0xf000 | tjit->tccnt_off);
 	/* aghi %r15,stack_size */
 	EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size);
 	/* Emit an expoline for the following indirect jump. */

From a96a44aba556c42b432929d37d60158aca21ad4c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Fri, 1 Sep 2023 16:11:27 -0700
Subject: [PATCH 7/9] bpf: bpf_sk_storage: Fix invalid wait context lockdep
 report

'./test_progs -t test_local_storage' reported a splat:

[   27.137569] =============================
[   27.138122] [ BUG: Invalid wait context ]
[   27.138650] 6.5.0-03980-gd11ae1b16b0a #247 Tainted: G           O
[   27.139542] -----------------------------
[   27.140106] test_progs/1729 is trying to lock:
[   27.140713] ffff8883ef047b88 (stock_lock){-.-.}-{3:3}, at: local_lock_acquire+0x9/0x130
[   27.141834] other info that might help us debug this:
[   27.142437] context-{5:5}
[   27.142856] 2 locks held by test_progs/1729:
[   27.143352]  #0: ffffffff84bcd9c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire+0x4/0x40
[   27.144492]  #1: ffff888107deb2c0 (&storage->lock){..-.}-{2:2}, at: bpf_local_storage_update+0x39e/0x8e0
[   27.145855] stack backtrace:
[   27.146274] CPU: 0 PID: 1729 Comm: test_progs Tainted: G           O       6.5.0-03980-gd11ae1b16b0a #247
[   27.147550] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[   27.149127] Call Trace:
[   27.149490]  <TASK>
[   27.149867]  dump_stack_lvl+0x130/0x1d0
[   27.152609]  dump_stack+0x14/0x20
[   27.153131]  __lock_acquire+0x1657/0x2220
[   27.153677]  lock_acquire+0x1b8/0x510
[   27.157908]  local_lock_acquire+0x29/0x130
[   27.159048]  obj_cgroup_charge+0xf4/0x3c0
[   27.160794]  slab_pre_alloc_hook+0x28e/0x2b0
[   27.161931]  __kmem_cache_alloc_node+0x51/0x210
[   27.163557]  __kmalloc+0xaa/0x210
[   27.164593]  bpf_map_kzalloc+0xbc/0x170
[   27.165147]  bpf_selem_alloc+0x130/0x510
[   27.166295]  bpf_local_storage_update+0x5aa/0x8e0
[   27.167042]  bpf_fd_sk_storage_update_elem+0xdb/0x1a0
[   27.169199]  bpf_map_update_value+0x415/0x4f0
[   27.169871]  map_update_elem+0x413/0x550
[   27.170330]  __sys_bpf+0x5e9/0x640
[   27.174065]  __x64_sys_bpf+0x80/0x90
[   27.174568]  do_syscall_64+0x48/0xa0
[   27.175201]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[   27.175932] RIP: 0033:0x7effb40e41ad
[   27.176357] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d8
[   27.179028] RSP: 002b:00007ffe64c21fc8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141
[   27.180088] RAX: ffffffffffffffda RBX: 00007ffe64c22768 RCX: 00007effb40e41ad
[   27.181082] RDX: 0000000000000020 RSI: 00007ffe64c22008 RDI: 0000000000000002
[   27.182030] RBP: 00007ffe64c21ff0 R08: 0000000000000000 R09: 00007ffe64c22788
[   27.183038] R10: 0000000000000064 R11: 0000000000000202 R12: 0000000000000000
[   27.184006] R13: 00007ffe64c22788 R14: 00007effb42a1000 R15: 0000000000000000
[   27.184958]  </TASK>

It complains about acquiring a local_lock while holding a raw_spin_lock.
It means it should not allocate memory while holding a raw_spin_lock
since it is not safe for RT.

raw_spin_lock is needed because bpf_local_storage supports tracing
context. In particular for task local storage, it is easy to
get a "current" task PTR_TO_BTF_ID in tracing bpf prog.
However, task (and cgroup) local storage has already been moved to
bpf mem allocator which can be used after raw_spin_lock.

The splat is for the sk storage. For sk (and inode) storage,
it has not been moved to bpf mem allocator. Using raw_spin_lock or not,
kzalloc(GFP_ATOMIC) could theoretically be unsafe in tracing context.
However, the local storage helper requires a verifier accepted
sk pointer (PTR_TO_BTF_ID), it is hypothetical if that (mean running
a bpf prog in a kzalloc unsafe context and also able to hold a verifier
accepted sk pointer) could happen.

This patch avoids kzalloc after raw_spin_lock to silent the splat.
There is an existing kzalloc before the raw_spin_lock. At that point,
a kzalloc is very likely required because a lookup has just been done
before. Thus, this patch always does the kzalloc before acquiring
the raw_spin_lock and remove the later kzalloc usage after the
raw_spin_lock. After this change, it will have a charge and then
uncharge during the syscall bpf_map_update_elem() code path.
This patch opts for simplicity and not continue the old
optimization to save one charge and uncharge.

This issue is dated back to the very first commit of bpf_sk_storage
which had been refactored multiple times to create task, inode, and
cgroup storage. This patch uses a Fixes tag with a more recent
commit that should be easier to do backport.

Fixes: b00fa38a9c1c ("bpf: Enable non-atomic allocations in local storage")
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230901231129.578493-2-martin.lau@linux.dev
---
 kernel/bpf/bpf_local_storage.c | 47 ++++++++++------------------------
 1 file changed, 14 insertions(+), 33 deletions(-)

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b5149cfce7d4..37ad47d52dc5 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -553,7 +553,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags, gfp_t gfp_flags)
 {
 	struct bpf_local_storage_data *old_sdata = NULL;
-	struct bpf_local_storage_elem *selem = NULL;
+	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
 	struct bpf_local_storage *local_storage;
 	unsigned long flags;
 	int err;
@@ -607,11 +607,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		}
 	}
 
-	if (gfp_flags == GFP_KERNEL) {
-		selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
-		if (!selem)
-			return ERR_PTR(-ENOMEM);
-	}
+	/* A lookup has just been done before and concluded a new selem is
+	 * needed. The chance of an unnecessary alloc is unlikely.
+	 */
+	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+	if (!alloc_selem)
+		return ERR_PTR(-ENOMEM);
 
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 
@@ -623,13 +624,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		 * simple.
 		 */
 		err = -EAGAIN;
-		goto unlock_err;
+		goto unlock;
 	}
 
 	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
 	err = check_flags(old_sdata, map_flags);
 	if (err)
-		goto unlock_err;
+		goto unlock;
 
 	if (old_sdata && (map_flags & BPF_F_LOCK)) {
 		copy_map_value_locked(&smap->map, old_sdata->data, value,
@@ -638,23 +639,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 		goto unlock;
 	}
 
-	if (gfp_flags != GFP_KERNEL) {
-		/* local_storage->lock is held.  Hence, we are sure
-		 * we can unlink and uncharge the old_sdata successfully
-		 * later.  Hence, instead of charging the new selem now
-		 * and then uncharge the old selem later (which may cause
-		 * a potential but unnecessary charge failure),  avoid taking
-		 * a charge at all here (the "!old_sdata" check) and the
-		 * old_sdata will not be uncharged later during
-		 * bpf_selem_unlink_storage_nolock().
-		 */
-		selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags);
-		if (!selem) {
-			err = -ENOMEM;
-			goto unlock_err;
-		}
-	}
-
+	alloc_selem = NULL;
 	/* First, link the new selem to the map */
 	bpf_selem_link_map(smap, selem);
 
@@ -665,20 +650,16 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						false, false);
+						true, false);
 	}
 
 unlock:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
-	return SDATA(selem);
-
-unlock_err:
-	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
-	if (selem) {
+	if (alloc_selem) {
 		mem_uncharge(smap, owner, smap->elem_size);
-		bpf_selem_free(selem, smap, true);
+		bpf_selem_free(alloc_selem, smap, true);
 	}
-	return ERR_PTR(err);
+	return err ? ERR_PTR(err) : SDATA(selem);
 }
 
 static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)

From 55d49f750b1cb1f177fb1b00ae02cba4613bcfb7 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Fri, 1 Sep 2023 16:11:28 -0700
Subject: [PATCH 8/9] bpf: bpf_sk_storage: Fix the missing uncharge in
 sk_omem_alloc

The commit c83597fa5dc6 ("bpf: Refactor some inode/task/sk storage functions
for reuse"), refactored the bpf_{sk,task,inode}_storage_free() into
bpf_local_storage_unlink_nolock() which then later renamed to
bpf_local_storage_destroy(). The commit accidentally passed the
"bool uncharge_mem = false" argument to bpf_selem_unlink_storage_nolock()
which then stopped the uncharge from happening to the sk->sk_omem_alloc.

This missing uncharge only happens when the sk is going away (during
__sk_destruct).

This patch fixes it by always passing "uncharge_mem = true". It is a
noop to the task/inode/cgroup storage because they do not have the
map_local_storage_(un)charge enabled in the map_ops. A followup patch
will be done in bpf-next to remove the uncharge_mem argument.

A selftest is added in the next patch.

Fixes: c83597fa5dc6 ("bpf: Refactor some inode/task/sk storage functions for reuse")
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230901231129.578493-3-martin.lau@linux.dev
---
 kernel/bpf/bpf_local_storage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 37ad47d52dc5..146824cc9689 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -760,7 +760,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 		 * of the loop will set the free_cgroup_storage to true.
 		 */
 		free_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, false, true);
+			local_storage, selem, true, true);
 	}
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 

From a96d1cfb2da040bdf692d22022371b249742abb2 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Fri, 1 Sep 2023 16:11:29 -0700
Subject: [PATCH 9/9] selftests/bpf: Check bpf_sk_storage has uncharged
 sk_omem_alloc

This patch checks the sk_omem_alloc has been uncharged by bpf_sk_storage
during the __sk_destruct.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230901231129.578493-4-martin.lau@linux.dev
---
 .../bpf/prog_tests/sk_storage_omem_uncharge.c | 56 +++++++++++++++++
 .../selftests/bpf/progs/bpf_tracing_net.h     |  1 +
 .../bpf/progs/sk_storage_omem_uncharge.c      | 61 +++++++++++++++++++
 3 files changed, 118 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_storage_omem_uncharge.c
 create mode 100644 tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c

diff --git a/tools/testing/selftests/bpf/prog_tests/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/prog_tests/sk_storage_omem_uncharge.c
new file mode 100644
index 000000000000..f35852d245e3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sk_storage_omem_uncharge.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Facebook */
+#include <test_progs.h>
+#include <bpf/libbpf.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include "sk_storage_omem_uncharge.skel.h"
+
+void test_sk_storage_omem_uncharge(void)
+{
+	struct sk_storage_omem_uncharge *skel;
+	int sk_fd = -1, map_fd, err, value;
+	socklen_t optlen;
+
+	skel = sk_storage_omem_uncharge__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel open_and_load"))
+		return;
+	map_fd = bpf_map__fd(skel->maps.sk_storage);
+
+	/* A standalone socket not binding to addr:port,
+	 * so nentns is not needed.
+	 */
+	sk_fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (!ASSERT_GE(sk_fd, 0, "socket"))
+		goto done;
+
+	optlen = sizeof(skel->bss->cookie);
+	err = getsockopt(sk_fd, SOL_SOCKET, SO_COOKIE, &skel->bss->cookie, &optlen);
+	if (!ASSERT_OK(err, "getsockopt(SO_COOKIE)"))
+		goto done;
+
+	value = 0;
+	err = bpf_map_update_elem(map_fd, &sk_fd, &value, 0);
+	if (!ASSERT_OK(err, "bpf_map_update_elem(value=0)"))
+		goto done;
+
+	value = 0xdeadbeef;
+	err = bpf_map_update_elem(map_fd, &sk_fd, &value, 0);
+	if (!ASSERT_OK(err, "bpf_map_update_elem(value=0xdeadbeef)"))
+		goto done;
+
+	err = sk_storage_omem_uncharge__attach(skel);
+	if (!ASSERT_OK(err, "attach"))
+		goto done;
+
+	close(sk_fd);
+	sk_fd = -1;
+
+	ASSERT_EQ(skel->bss->cookie_found, 2, "cookie_found");
+	ASSERT_EQ(skel->bss->omem, 0, "omem");
+
+done:
+	sk_storage_omem_uncharge__destroy(skel);
+	if (sk_fd != -1)
+		close(sk_fd);
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index cfed4df490f3..0b793a102791 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -88,6 +88,7 @@
 #define sk_v6_rcv_saddr		__sk_common.skc_v6_rcv_saddr
 #define sk_flags		__sk_common.skc_flags
 #define sk_reuse		__sk_common.skc_reuse
+#define sk_cookie		__sk_common.skc_cookie
 
 #define s6_addr32		in6_u.u6_addr32
 
diff --git a/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c
new file mode 100644
index 000000000000..3e745793b27a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sk_storage_omem_uncharge.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2023 Facebook */
+#include "vmlinux.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+void *local_storage_ptr = NULL;
+void *sk_ptr = NULL;
+int cookie_found = 0;
+__u64 cookie = 0;
+__u32 omem = 0;
+
+void *bpf_rdonly_cast(void *, __u32) __ksym;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, int);
+} sk_storage SEC(".maps");
+
+SEC("fexit/bpf_local_storage_destroy")
+int BPF_PROG(bpf_local_storage_destroy, struct bpf_local_storage *local_storage)
+{
+	struct sock *sk;
+
+	if (local_storage_ptr != local_storage)
+		return 0;
+
+	sk = bpf_rdonly_cast(sk_ptr, bpf_core_type_id_kernel(struct sock));
+	if (sk->sk_cookie.counter != cookie)
+		return 0;
+
+	cookie_found++;
+	omem = sk->sk_omem_alloc.counter;
+	local_storage_ptr = NULL;
+
+	return 0;
+}
+
+SEC("fentry/inet6_sock_destruct")
+int BPF_PROG(inet6_sock_destruct, struct sock *sk)
+{
+	int *value;
+
+	if (!cookie || sk->sk_cookie.counter != cookie)
+		return 0;
+
+	value = bpf_sk_storage_get(&sk_storage, sk, 0, 0);
+	if (value && *value == 0xdeadbeef) {
+		cookie_found++;
+		sk_ptr = sk;
+		local_storage_ptr = sk->sk_bpf_storage;
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";