From 846c76ecc02973b05ae909dd4248c11bfa277fc1 Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:01 +0800 Subject: [PATCH 01/19] bpf: Reject TCP_NODELAY in TCP header option callbacks A BPF_SOCK_OPS program can enable BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG and then call bpf_setsockopt(TCP_NODELAY) from BPF_SOCK_OPS_HDR_OPT_LEN_CB or BPF_SOCK_OPS_WRITE_HDR_OPT_CB. In these callbacks, bpf_setsockopt(TCP_NODELAY) can reach __tcp_sock_set_nodelay(), which can call tcp_push_pending_frames(). >From BPF_SOCK_OPS_HDR_OPT_LEN_CB, tcp_push_pending_frames() can call tcp_current_mss(), which calls tcp_established_options() and re-enters bpf_skops_hdr_opt_len(). BPF_SOCK_OPS_HDR_OPT_LEN_CB -> bpf_setsockopt(TCP_NODELAY) -> tcp_push_pending_frames() -> tcp_current_mss() -> tcp_established_options() -> bpf_skops_hdr_opt_len() -> BPF_SOCK_OPS_HDR_OPT_LEN_CB >From BPF_SOCK_OPS_WRITE_HDR_OPT_CB, tcp_push_pending_frames() can call tcp_write_xmit(), which calls tcp_transmit_skb(). That path recomputes header option length through tcp_established_options() and bpf_skops_hdr_opt_len() before re-entering bpf_skops_write_hdr_opt(). BPF_SOCK_OPS_WRITE_HDR_OPT_CB -> bpf_setsockopt(TCP_NODELAY) -> tcp_push_pending_frames() -> tcp_write_xmit() -> tcp_transmit_skb() -> tcp_established_options() -> bpf_skops_hdr_opt_len() -> bpf_skops_write_hdr_opt() -> BPF_SOCK_OPS_WRITE_HDR_OPT_CB This leads to unbounded recursion and can overflow the kernel stack. Reject TCP_NODELAY with -EOPNOTSUPP in bpf_sock_ops_setsockopt() when bpf_setsockopt() is called from BPF_SOCK_OPS_HDR_OPT_LEN_CB or BPF_SOCK_OPS_WRITE_HDR_OPT_CB. 
Fixes: 7e41df5dbba2 ("bpf: Add a few optnames to bpf_setsockopt") Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@std.uestc.edu.cn/ Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn> Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260421155804.135786-2-kafai.wan@linux.dev --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 5fa9189eb772..96849f4c1fbc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5833,6 +5833,12 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; + /* TCP_NODELAY triggers tcp_push_pending_frames() and re-enters these callbacks. */ + if ((bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB || + bpf_sock->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB) && + level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } From 54377fcab51f6f1f8807827d3751be42279e1a6a Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:02 +0800 Subject: [PATCH 02/19] bpf: Reject TCP_NODELAY in bpf-tcp-cc A BPF TCP congestion control program can call bpf_setsockopt() from its callbacks. In current kernels, if it calls bpf_setsockopt(TCP_NODELAY) from cwnd_event_tx_start(), the call can re-enter the TCP transmit path before the outer tcp_transmit_skb() has completed and advanced the send head. This can re-trigger CA_EVENT_TX_START and lead to unbounded recursion: tcp_transmit_skb() -> tcp_event_data_sent() -> tcp_ca_event(sk, CA_EVENT_TX_START) -> cwnd_event_tx_start() -> bpf_setsockopt(TCP_NODELAY) -> tcp_push_pending_frames() -> tcp_write_xmit() -> tcp_transmit_skb() This leads to unbounded recursion and can overflow the kernel stack. 
Reject TCP_NODELAY with -EOPNOTSUPP for bpf-tcp-cc by introducing a dedicated setsockopt proto for BPF_PROG_TYPE_STRUCT_OPS TCP congestion control programs. To keep it simple, all tcp-cc ops are rejected for TCP_NODELAY. Fixes: 7e41df5dbba2 ("bpf: Add a few optnames to bpf_setsockopt") Suggested-by: Martin KaFai Lau Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260421155804.135786-3-kafai.wan@linux.dev --- include/linux/bpf.h | 1 + net/core/filter.c | 24 ++++++++++++++++++++++++ net/ipv4/bpf_tcp_ca.c | 2 +- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b703c90ca9..01e203964892 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3725,6 +3725,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; +extern const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto; extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; diff --git a/net/core/filter.c b/net/core/filter.c index 96849f4c1fbc..2914f5330310 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5688,6 +5688,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + /* + * TCP_NODELAY triggers tcp_push_pending_frames() and re-enters + * CA_EVENT_TX_START in bpf_tcp_cc. 
+ */ + if (level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = { + .func = bpf_sk_setsockopt_nodelay, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 008edc7f6688..791e15063237 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -168,7 +168,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, */ if (prog_ops_moff(prog) != offsetof(struct tcp_congestion_ops, release)) - return &bpf_sk_setsockopt_proto; + return &bpf_sk_setsockopt_nodelay_proto; return NULL; case BPF_FUNC_getsockopt: /* Since get/setsockopt is usually expected to From 52b6b5334924d8f083a2abe8edeface9206e13ee Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:03 +0800 Subject: [PATCH 03/19] selftests/bpf: Test TCP_NODELAY in TCP hdr opt callbacks Add a sockops selftest for the TCP_NODELAY restriction in BPF_SOCK_OPS_HDR_OPT_LEN_CB and BPF_SOCK_OPS_WRITE_HDR_OPT_CB. With BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG enabled, bpf_setsockopt(TCP_NODELAY) returns -EOPNOTSUPP from BPF_SOCK_OPS_HDR_OPT_LEN_CB and BPF_SOCK_OPS_WRITE_HDR_OPT_CB, avoiding unbounded recursion and kernel stack overflow. Other cases continue to work as before, including BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB. 
Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260421155804.135786-4-kafai.wan@linux.dev --- .../selftests/bpf/prog_tests/tcp_hdr_options.c | 4 ++++ .../bpf/progs/test_misc_tcp_hdr_options.c | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index 56685fc03c7e..80e6315da2a5 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -507,6 +507,10 @@ static void misc(void) ASSERT_EQ(misc_skel->bss->nr_hwtstamp, 0, "nr_hwtstamp"); + ASSERT_TRUE(misc_skel->bss->nodelay_est_ok, "nodelay_est_ok"); + ASSERT_TRUE(misc_skel->bss->nodelay_hdr_len_reject, "nodelay_hdr_len_reject"); + ASSERT_TRUE(misc_skel->bss->nodelay_write_hdr_reject, "nodelay_write_hdr_reject"); + check_linum: ASSERT_FALSE(check_error_linum(&sk_fds), "check_error_linum"); sk_fds_close(&sk_fds); diff --git a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c index d487153a839d..ed5a0011b863 100644 --- a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c @@ -29,6 +29,10 @@ unsigned int nr_syn = 0; unsigned int nr_fin = 0; unsigned int nr_hwtstamp = 0; +bool nodelay_est_ok = false; +bool nodelay_hdr_len_reject = false; +bool nodelay_write_hdr_reject = false; + /* Check the header received from the active side */ static int __check_active_hdr_in(struct bpf_sock_ops *skops, bool check_syn) { @@ -300,7 +304,7 @@ static int handle_passive_estab(struct bpf_sock_ops *skops) SEC("sockops") int misc_estab(struct bpf_sock_ops *skops) { - int true_val = 1; + int true_val = 1, false_val = 0, ret; switch (skops->op) { case BPF_SOCK_OPS_TCP_LISTEN_CB: @@ -316,10 +320,19 @@ int 
misc_estab(struct bpf_sock_ops *skops) case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: return handle_parse_hdr(skops); case BPF_SOCK_OPS_HDR_OPT_LEN_CB: + ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_hdr_len_reject = true; return handle_hdr_opt_len(skops); case BPF_SOCK_OPS_WRITE_HDR_OPT_CB: + ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_write_hdr_reject = true; return handle_write_hdr_opt(skops); case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &false_val, sizeof(false_val)); + if (!ret) + nodelay_est_ok = true; return handle_passive_estab(skops); } From 2c7e33f1fc2e75fcfb4aa5d840bcd2e8b53c1847 Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:04 +0800 Subject: [PATCH 04/19] selftests/bpf: Verify bpf-tcp-cc rejects TCP_NODELAY Add a bpf_tcp_ca selftest for the TCP_NODELAY restriction in bpf-tcp-cc. Update bpf_cubic to exercise init() and cwnd_event_tx_start(), and check that both callbacks reject bpf_setsockopt(TCP_NODELAY) with -EOPNOTSUPP. 
Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260421155804.135786-5-kafai.wan@linux.dev --- .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 4 ++++ tools/testing/selftests/bpf/progs/bpf_cubic.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index f829b6f09bc9..fe30181e6336 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -112,6 +112,10 @@ static void test_cubic(void) ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called"); + ASSERT_TRUE(cubic_skel->bss->nodelay_init_reject, "init reject nodelay option"); + ASSERT_TRUE(cubic_skel->bss->nodelay_cwnd_event_tx_start_reject, + "cwnd_event_tx_start reject nodelay option"); + bpf_link__destroy(link); bpf_cubic__destroy(cubic_skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index ce18a4db813f..ebd5a1e69f56 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -16,6 +16,7 @@ #include "bpf_tracing_net.h" #include +#include char _license[] SEC("license") = "GPL"; @@ -170,10 +171,18 @@ static void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } +bool nodelay_init_reject = false; +bool nodelay_cwnd_event_tx_start_reject = false; + SEC("struct_ops") void BPF_PROG(bpf_cubic_init, struct sock *sk) { struct bpf_bictcp *ca = inet_csk_ca(sk); + int true_val = 1, ret; + + ret = bpf_setsockopt(sk, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_init_reject = true; bictcp_reset(ca); @@ -189,8 +198,13 @@ void BPF_PROG(bpf_cubic_cwnd_event_tx_start, struct sock *sk) { struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 now = tcp_jiffies32; + int true_val = 1, ret; __s32 delta; + ret = bpf_setsockopt(sk, 
SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_cwnd_event_tx_start_reject = true; + delta = now - tcp_sk(sk)->lsndtime; /* We were application limited (idle) for a while. From 375e4e33c18dfa05c5dfd5f3dfffeb29343dd4c7 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Tue, 21 Apr 2026 23:54:12 -0700 Subject: [PATCH 05/19] bpf: Fix NULL pointer dereference in bpf_sk_storage_clone and diag paths bpf_selem_unlink_nofail() sets SDATA(selem)->smap to NULL before removing the selem from the storage hlist. A concurrent RCU reader in bpf_sk_storage_clone() can observe the selem still on the list with smap already NULL, causing a NULL pointer dereference. general protection fault, probably for non-canonical address 0xdffffc000000000a: KASAN: null-ptr-deref in range [0x0000000000000050-0x0000000000000057] RIP: 0010:bpf_sk_storage_clone+0x1cd/0xaa0 net/core/bpf_sk_storage.c:174 Call Trace: sk_clone+0xfed/0x1980 net/core/sock.c:2591 inet_csk_clone_lock+0x30/0x760 net/ipv4/inet_connection_sock.c:1222 tcp_create_openreq_child+0x35/0x2680 net/ipv4/tcp_minisocks.c:571 tcp_v4_syn_recv_sock+0x123/0xf90 net/ipv4/tcp_ipv4.c:1729 tcp_check_req+0x8e1/0x2580 include/net/tcp.h:855 tcp_v4_rcv+0x1845/0x3b80 net/ipv4/tcp_ipv4.c:2347 Add a NULL check for smap in bpf_sk_storage_clone(). bpf_sk_storage_diag_put_all() has the same issue. Add a NULL check and pass the validated smap directly to diag_get(), which is refactored to take smap as a parameter instead of reading it internally. bpf_sk_storage_diag_put() uses diag->maps[i] which is always valid under its refcount, so diag->maps[i] is passed directly to diag_get(). 
Fixes: 5d800f87d0a5 ("bpf: Support lockless unlink when freeing map or local storage") Reported-by: Xiang Mei Acked-by: Amery Hung Signed-off-by: Weiming Shi Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260422065411.1007737-2-bestswngs@gmail.com --- net/core/bpf_sk_storage.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 14eb7812bda4..dc3e8fce8809 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -172,7 +172,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) struct bpf_map *map; smap = rcu_dereference(SDATA(selem)->smap); - if (!(smap->map.map_flags & BPF_F_CLONE)) + if (!smap || !(smap->map.map_flags & BPF_F_CLONE)) continue; /* Note that for lockless listeners adding new element @@ -531,10 +531,10 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); -static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) +static int diag_get(struct bpf_local_storage_map *smap, + struct bpf_local_storage_data *sdata, struct sk_buff *skb) { struct nlattr *nla_stg, *nla_value; - struct bpf_local_storage_map *smap; /* It cannot exceed max nlattr's payload */ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); @@ -543,7 +543,6 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) if (!nla_stg) return -EMSGSIZE; - smap = rcu_dereference(sdata->smap); if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) goto errout; @@ -596,9 +595,11 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, saved_len = skb->len; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { smap = rcu_dereference(SDATA(selem)->smap); + if (!smap) + continue; diag_size += nla_value_size(smap->map.value_size); - if (nla_stgs && diag_get(SDATA(selem), skb)) + if (nla_stgs && diag_get(smap, 
SDATA(selem), skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } @@ -665,7 +666,7 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, diag_size += nla_value_size(diag->maps[i]->value_size); - if (nla_stgs && diag_get(sdata, skb)) + if (nla_stgs && diag_get((struct bpf_local_storage_map *)diag->maps[i], sdata, skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } } From 6451d58a355642b612f2bf948ad39108c998ac2a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 20 Apr 2026 19:48:41 +0000 Subject: [PATCH 06/19] sockmap: Fix sk_psock_drop() race vs sock_map_{unhash,close,destroy}(). syzbot reported a splat in sock_map_destroy() [0], where psock was NULL even though sk->sk_prot still pointed to tcp_bpf_prots[][]. The stack trace shows how badly the path was exercised, see inet_release() calls tcp_close(), not sock_map_close() yet, but finally reaching sock_map_destroy(). The root cause is a lack of synchronisation. Even if sk_psock_get() fails to bump psock->refcnt, it does not guarantee that sk_psock_drop() has finished, and thus sk->sk_prot might not have been restored to the original one. Commit 4b4647add7d3 ("sock_map: avoid race between sock_map_close and sk_psock_put") attempted to address this, but it was insufficient for two reasons. It did not cover sock_map_unhash() and sock_map_destroy(), and it missed the corner case where sk_psock() is NULL. On non-x86 platforms, sk_psock_restore_proto(sk, psock) and rcu_assign_sk_user_data(sk, NULL) can be reordered because there is no address dependency between sk->sk_prot and sk->sk_user_data. sk_psock_get() returning NULL implies nothing about sk->sk_prot. Let's simply retry sk_psock_get() in the unlikely case. Note that we cannot avoid the loop even if we added memory barrier in sk_psock_drop() and sock_map_psock_get_checked(). Also note that sock_map_destroy() cannot be called from softirq while sock_map_close() has also been running. 
It is because sock_map_destroy() requires SOCK_DEAD, so sock_map_destroy() cannot happen until sock_map_close() has finished the saved_close() (which is tcp_close()). [0]: WARNING: CPU: 1 PID: 8459 at net/core/sock_map.c:1667 sock_map_destroy+0x28b/0x2b0 net/core/sock_map.c:1667 Modules linked in: CPU: 1 UID: 0 PID: 8459 Comm: syz.0.1109 Not tainted syzkaller #0 PREEMPT_{RT,(full)} Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:sock_map_destroy+0x28b/0x2b0 net/core/sock_map.c:1667 Code: 8b 36 49 83 c6 38 4c 89 f0 48 c1 e8 03 42 80 3c 38 00 74 08 4c 89 f7 e8 93 62 22 f9 4d 8b 3e e9 79 ff ff ff e8 a6 2b c3 f8 90 <0f> 0b 90 eb 9c e8 9b 2b c3 f8 4c 89 e7 be 03 00 00 00 e8 0e 4e bc RSP: 0018:ffffc9000d067be8 EFLAGS: 00010293 RAX: ffffffff88fb30aa RBX: ffff888024832000 RCX: ffff888024283b80 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: dffffc0000000000 R11: ffffed100862e946 R12: dffffc0000000000 R13: ffff888024832000 R14: ffffffff995b2208 R15: ffffffff88fb2e20 FS: 0000555579a7d500(0000) GS:ffff8881269c2000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00002000000048c0 CR3: 000000003713a000 CR4: 00000000003526f0 Call Trace: inet_csk_destroy_sock+0x166/0x3a0 net/ipv4/inet_connection_sock.c:1294 __tcp_close+0xcc1/0xfd0 net/ipv4/tcp.c:3262 tcp_close+0x28/0x110 net/ipv4/tcp.c:3274 inet_release+0x144/0x190 net/ipv4/af_inet.c:435 __sock_release net/socket.c:649 [inline] sock_close+0xc0/0x240 net/socket.c:1439 __fput+0x45b/0xa80 fs/file_table.c:468 task_work_run+0x1d4/0x260 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop+0xec/0x110 kernel/entry/common.c:43 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] syscall_exit_to_user_mode 
include/linux/entry-common.h:210 [inline] do_syscall_64+0x2bd/0x3b0 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f265847ebe9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd158dfbd8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b4 RAX: 0000000000000000 RBX: 000000000002ddb0 RCX: 00007f265847ebe9 RDX: 0000000000000000 RSI: 000000000000001e RDI: 0000000000000003 RBP: 00007f26586a7da0 R08: 0000000000000001 R09: 0000000e158dfecf R10: 0000001b30a20000 R11: 0000000000000246 R12: 00007f26586a5fac R13: 00007f26586a5fa0 R14: ffffffffffffffff R15: 00007ffd158dfcf0 Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Fixes: b05545e15e1f ("bpf: sockmap, fix transition through disconnect without close") Fixes: d8616ee2affc ("bpf, sockmap: Fix sk->sk_forward_alloc warn_on in sk_stream_kill_queues") Reported-by: syzbot+b0842d38af58376d1fdc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/69cec5ef.050a0220.2dbe29.0009.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260420194846.1089595-1-kuniyu@google.com --- net/core/sock_map.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 02a68be3002a..99e3789492a0 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1630,18 +1630,23 @@ void sock_map_unhash(struct sock *sk) void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + if (unlikely(saved_unhash == sock_map_unhash)) + goto retry; } else { saved_unhash = psock->saved_unhash; 
sock_map_remove_links(sk, psock); rcu_read_unlock(); + + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; } - if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) - return; + if (saved_unhash) saved_unhash(sk); } @@ -1652,20 +1657,25 @@ void sock_map_destroy(struct sock *sk) void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + if (unlikely(saved_destroy == sock_map_destroy)) + goto retry; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); + + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; } - if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) - return; + if (saved_destroy) saved_destroy(sk); } @@ -1676,32 +1686,33 @@ void sock_map_close(struct sock *sk, long timeout) void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; +retry: lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); - psock = sk_psock_get(sk); - if (unlikely(!psock)) - goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); + + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; } else { saved_close = READ_ONCE(sk->sk_prot)->close; -no_psock: rcu_read_unlock(); release_sock(sk); + + if (unlikely(saved_close == sock_map_close)) + goto retry; } - /* Make sure we do not recurse. This is a bug. - * Leak the socket instead of crashing on a stack overflow. 
- */ - if (WARN_ON_ONCE(saved_close == sock_map_close)) - return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); From 1081de1accb2b224516cca7071122c59532d0b22 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Thu, 23 Apr 2026 11:38:32 -0700 Subject: [PATCH 07/19] bpf: Fix NULL pointer dereference in bpf_skb_fib_lookup() When tot_len is not provided by the user, bpf_skb_fib_lookup() resolves the FIB result's output device via dev_get_by_index_rcu() to check skb forwardability and fill in mtu_result. The returned pointer is dereferenced without a NULL check. If the device is concurrently unregistered, dev_get_by_index_rcu() returns NULL and is_skb_forwardable() crashes at dev->flags: KASAN: null-ptr-deref in range [0x00000000000000b0-0x00000000000000b7] Call Trace: is_skb_forwardable (include/linux/netdevice.h:4365) bpf_skb_fib_lookup (net/core/filter.c:6446) bpf_prog_test_run_skb (net/bpf/test_run.c) __sys_bpf (kernel/bpf/syscall.c) Add the missing NULL check, returning -ENODEV to be consistent with how bpf_ipv4_fib_lookup() and bpf_ipv6_fib_lookup() handle the same condition. 
Fixes: 4f74fede40df ("bpf: Add mtu checking to FIB forwarding helper") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Signed-off-by: Martin KaFai Lau Acked-by: Paul Chaignon Link: https://patch.msgid.link/20260423183831.1325480-2-bestswngs@gmail.com --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 2914f5330310..bc96c18df4e0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6473,6 +6473,8 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; From b5c111f4967ba4fdecdd318923ec7b081e9ef95f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 23 Apr 2026 15:23:55 -0700 Subject: [PATCH 08/19] bpf: Fix sk_local_storage diag dumping uninitialized special fields Call check_and_init_map_value() after the copy_map_value() to zero out special field regions. diag_get() copies sk_local_storage map values into a netlink message using copy_map_value{_locked}(), which intentionally skip special fields. However, the destination buffer from nla_reserve_64bit() is not zeroed and the skipped regions contain uninitialized skb data that can be sent to userspace. 
Fixes: 1ed4d92458a9 ("bpf: INET_DIAG support in bpf_sk_storage") Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260423222356.155387-1-ameryhung@gmail.com --- net/core/bpf_sk_storage.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index dc3e8fce8809..ecd659f79fd4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -557,6 +557,7 @@ static int diag_get(struct bpf_local_storage_map *smap, sdata->data, true); else copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + check_and_init_map_value(&smap->map, nla_data(nla_value)); nla_nest_end(skb, nla_stg); return 0; From 0c7ae130698e70107430254e79fbe996b4d37ab5 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Sat, 2 May 2026 12:12:40 +0200 Subject: [PATCH 09/19] tools/headers: Regenerate stddef.h to fix BPF selftests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With commit dacbfc167808 ("crypto: af_alg - Annotate struct af_alg_iv with __counted_by"), two selftests, test_tag and crypto_sanity, now indirectly rely on the __counted_by macro. On systems with commit dacbfc167808 in the installed UAPI headers, the selftests build fails with: In file included from tools/testing/selftests/bpf/prog_tests/crypto_sanity.c:7: /usr/include/linux/if_alg.h:45:22: error: expected ‘:’, ‘,’, ‘;’, ‘}’ or ‘__attribute__’ before ‘__counted_by’ 45 | __u8 iv[] __counted_by(ivlen); | ^~~~~~~~~~~~ This patch fixes it by regenerating stddef.h in tools/include using the instructions from commit a778f5d46b62 ("tools/headers: Pull in stddef.h to uapi to fix BPF selftests build in CI"). 
Fixes: dacbfc167808 ("crypto: af_alg - Annotate struct af_alg_iv with __counted_by") Signed-off-by: Paul Chaignon Reviewed-by: Alan Maguire Tested-by: Ihor Solodrai Link: https://lore.kernel.org/r/8da8ef16055aa452d940668ed5359ce54adc6b0b.1777715500.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/stddef.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/stddef.h b/tools/include/uapi/linux/stddef.h index c53cde425406..457498259494 100644 --- a/tools/include/uapi/linux/stddef.h +++ b/tools/include/uapi/linux/stddef.h @@ -3,7 +3,6 @@ #define _LINUX_STDDEF_H - #ifndef __always_inline #define __always_inline __inline__ #endif @@ -36,6 +35,11 @@ struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ } ATTRS +#ifdef __cplusplus +/* sizeof(struct{}) is 1 in C++, not 0, can't use C version of the macro. */ +#define __DECLARE_FLEX_ARRAY(T, member) \ + T member[0] +#else /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union * @@ -52,3 +56,23 @@ TYPE NAME[]; \ } #endif + +#ifndef __counted_by +#define __counted_by(m) +#endif + +#ifndef __counted_by_le +#define __counted_by_le(m) +#endif + +#ifndef __counted_by_be +#define __counted_by_be(m) +#endif + +#ifndef __counted_by_ptr +#define __counted_by_ptr(m) +#endif + +#define __kernel_nonstring + +#endif /* _LINUX_STDDEF_H */ From 481c2265286ef302327c93403a8cf7b3fe4506d0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:48 +0000 Subject: [PATCH 10/19] bpf: tcp: Fix type confusion in bpf_tcp_sock(). bpf_tcp_sock() only checks if sk->sk_protocol is IPPROTO_TCP, but RAW socket can bypass it: socket(AF_INET, SOCK_RAW, IPPROTO_TCP) Calling bpf_setsockopt() in SOCKOPT prog triggers out-of-bounds access to another slab object. [0] Let's use sk_is_tcp(). 
[0]: BUG: KASAN: slab-out-of-bounds in sol_tcp_sockopt (net/core/filter.c:5519) Read of size 8 at addr ffff88801083d760 by task test_progs/1259 CPU: 1 UID: 0 PID: 1259 Comm: test_progs Tainted: G OE 7.0.0-11175-gb5c111f4967b #1 PREEMPT(full) Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:94 lib/dump_stack.c:120) print_report (mm/kasan/report.c:378 mm/kasan/report.c:482) kasan_report (mm/kasan/report.c:595) sol_tcp_sockopt (net/core/filter.c:5519) __bpf_getsockopt (net/core/filter.c:5633) bpf_sk_getsockopt (net/core/filter.c:5654) bpf_prog_629ba00a1601e9f2__setsockopt+0x86/0x22c __cgroup_bpf_run_filter_setsockopt (./include/linux/bpf.h:1402 ./include/linux/filter.h:722 ./include/linux/filter.h:729 kernel/bpf/cgroup.c:81 kernel/bpf/cgroup.c:2026) do_sock_setsockopt (net/socket.c:2363) __x64_sys_setsockopt (net/socket.c:2406) do_syscall_64 (arch/x86/entry/syscall_64.c:63) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:121) RIP: 0033:0x7f85f82fe7de Code: 55 48 63 c9 48 63 ff 45 89 c9 48 89 e5 48 83 ec 08 6a 2c e8 34 69 f7 ff c9 c3 66 90 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 e1 RSP: 002b:00007ffe59dcecd8 EFLAGS: 00000202 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f85f82fe7de RDX: 000000000000001c RSI: 0000000000000006 RDI: 000000000000000d RBP: 00007ffe59dcef20 R08: 000000000000003c R09: 0000000000000000 R10: 00007ffe59dcef00 R11: 0000000000000202 R12: 00007ffe59dcf268 R13: 0000000000000003 R14: 00007f85f9da5000 R15: 000055b2f3201400 The buggy address belongs to the object at ffff88801083d280 which belongs to the cache RAW of size 1792 The buggy address is located 1248 bytes inside of allocated 1792-byte region [ffff88801083d280, ffff88801083d980) Fixes: 655a51e536c0 ("bpf: Add struct bpf_tcp_sock and 
BPF_FUNC_tcp_sock") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Link: https://patch.msgid.link/20260504210610.180150-2-kuniyu@google.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index bc96c18df4e0..cd88633f8dc1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7475,7 +7475,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { - if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; From d73549b8bb7fa6147666c579d66f72bf076f719f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:49 +0000 Subject: [PATCH 11/19] selftest: bpf: Add test for bpf_tcp_sock() and RAW socket. Let's extend sockopt_sk.c to cover bpf_tcp_sock() for the wrong socket type. Before: # ./test_progs -t sockopt_sk [ 151.948613] ================================================================== [ 151.951376] BUG: KASAN: slab-out-of-bounds in sol_tcp_sockopt+0xc7/0x8e0 [ 151.954159] Read of size 8 at addr ffff88801083d760 by task test_progs/1259 ... run_test:FAIL:getsetsockopt unexpected error: -1 (errno 0) #427 sockopt_sk:FAIL After: #427 sockopt_sk:OK While at it, missing free() is fixed up. 
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260504210610.180150-3-kuniyu@google.com --- .../selftests/bpf/prog_tests/sockopt_sk.c | 17 ++++++++++++++++- tools/testing/selftests/bpf/progs/sockopt_sk.c | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 53637431ec5d..3a41c517b918 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -190,7 +190,7 @@ static int getsetsockopt(void) fd = socket(AF_NETLINK, SOCK_RAW, 0); if (fd < 0) { log_err("Failed to create AF_NETLINK socket"); - return -1; + goto err; } buf.u32 = 1; @@ -211,6 +211,21 @@ static int getsetsockopt(void) } ASSERT_EQ(optlen, 8, "Unexpected NETLINK_LIST_MEMBERSHIPS value"); + /* Trick bpf_tcp_sock() with IPPROTO_TCP */ + close(fd); + fd = socket(AF_INET, SOCK_RAW, IPPROTO_TCP); + if (!ASSERT_OK_FD(fd, "socket")) + goto err; + + /* The BPF prog intercepts this before the kernel sees it, any + * optlen works. Go with 4 bytes for simplicity. 
+ */ + buf.u32 = 1; + optlen = sizeof(buf.u32); + err = setsockopt(fd, SOL_TCP, TCP_SAVED_SYN, &buf, optlen); + if (!ASSERT_ERR(err, "setsockopt(TCP_SAVED_SYN)")) + goto err; + free(big_buf); close(fd); return 0; diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index cb990a7d3d45..5e0b27e7855c 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -149,6 +149,20 @@ int _setsockopt(struct bpf_sockopt *ctx) if (sk && sk->family == AF_NETLINK) goto out; + if (sk && sk->family == AF_INET && sk->type == SOCK_RAW) { + struct bpf_tcp_sock *tp = bpf_tcp_sock(sk); + + if (tp) { + char saved_syn[60]; + + bpf_getsockopt(sk, SOL_TCP, TCP_SAVED_SYN, + &saved_syn, sizeof(saved_syn)); + goto consumed; + } + + goto out; + } + /* Make sure bpf_get_netns_cookie is callable. */ if (bpf_get_netns_cookie(NULL) == 0) @@ -224,6 +238,8 @@ int _setsockopt(struct bpf_sockopt *ctx) return 0; /* couldn't get sk storage */ storage->val = optval[0]; + +consumed: ctx->optlen = -1; /* BPF has consumed this option, don't call kernel * setsockopt handler. */ From 7995b216a731db657f356f6ae37a42f445b9a0ec Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 4 May 2026 21:04:50 +0000 Subject: [PATCH 12/19] mptcp: bpf: Fix type confusion in bpf_mptcp_sock_from_subflow() bpf_mptcp_sock_from_subflow() only checks if sk->sk_protocol is IPPROTO_TCP, but RAW socket can bypass it: socket(AF_INET, SOCK_RAW, IPPROTO_TCP) In this case, it would NOT be valid to call sk_is_mptcp() which will assume sk is a pointer to a struct tcp_sock, and wrongly checks for: tcp_sk(sk)->is_mptcp. 
Fixes: 3bc253c2e652 ("bpf: Add bpf_skc_to_mptcp_sock_proto") Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260504210610.180150-4-kuniyu@google.com --- net/mptcp/bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c index 8a16672b94e2..4cc16cbeb328 100644 --- a/net/mptcp/bpf.c +++ b/net/mptcp/bpf.c @@ -14,7 +14,7 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { - if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk)) + if (sk && sk_fullsock(sk) && sk_is_tcp(sk) && sk_is_mptcp(sk)) return mptcp_sk(mptcp_subflow_ctx(sk)->conn); return NULL; From decb84b8383ab7acff94db208ef7ed19f9c55e1f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:51 +0000 Subject: [PATCH 13/19] bpf: tcp: Fix type confusion in bpf_skc_to_tcp_sock(). bpf_skc_to_tcp_sock() only checks if sk->sk_protocol is IPPROTO_TCP, but RAW socket can bypass it: socket(AF_INET, SOCK_RAW, IPPROTO_TCP) Let's use sk_is_tcp(). 
Fixes: 478cfbdf5f13 ("bpf: Add bpf_skc_to_{tcp, tcp_timewait, tcp_request}_sock() helpers") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260504210610.180150-5-kuniyu@google.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index cd88633f8dc1..7d945dc2cb92 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11963,7 +11963,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { - if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk && sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; From 843064b0a77eed3d6d63ffc53aeaa359672b4e12 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:52 +0000 Subject: [PATCH 14/19] bpf: tcp: Fix type confusion in bpf_skc_to_tcp6_sock(). bpf_skc_to_tcp6_sock() only checks if sk->sk_protocol is IPPROTO_TCP and sk->sk_family is AF_INET6, but RAW socket can bypass it: socket(AF_INET6, SOCK_RAW, IPPROTO_TCP) Let's check sk->sk_type too. 
Fixes: af7ec1383361 ("bpf: Add bpf_skc_to_tcp6_sock() helper") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260504210610.180150-6-kuniyu@google.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 7d945dc2cb92..684922efd481 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11947,7 +11947,7 @@ BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && - sk->sk_family == AF_INET6) + sk->sk_type == SOCK_STREAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; From 1c2958e4ab1ed4594db16425dbcab33c56ea8330 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:53 +0000 Subject: [PATCH 15/19] bpf: tcp: Fix type confusion in sol_tcp_sockopt(). sol_tcp_sockopt() only checks if sk->sk_protocol is IPPROTO_TCP, but RAW socket can bypass it: socket(AF_INET, SOCK_RAW, IPPROTO_TCP) Let's use sk_is_tcp(). Note that initially sol_tcp_sockopt() checked sk->sk_prot->setsockopt. 
Fixes: 2ab42c7b871f ("bpf: Check the protocol of a sock to agree the calls to bpf_setsockopt().") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260504210610.180150-7-kuniyu@google.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 684922efd481..ef0877eefaa7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5481,7 +5481,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!sk_is_tcp(sk)) return -EINVAL; switch (optname) { From 18fc650ccd7fe3376eca89203668cfb8268f60df Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sun, 26 Apr 2026 01:26:43 +0000 Subject: [PATCH 16/19] bpf: Free reuseport cBPF prog after RCU grace period. Eulgyu Kim reported the splat below with a repro. [0] The repro sets up a UDP reuseport group with a cBPF prog and replaces it with a new one while another thread is sending a UDP packet to the group. The reuseport prog is freed by sk_reuseport_prog_free(). bpf_prog_put() is called for "e"BPF prog to destruct through multiple stages while cBPF prog is freed immediately by bpf_release_orig_filter() and bpf_prog_free(). If a reuseport prog is detached from the setsockopt() path (reuseport_attach_prog() or reuseport_detach_prog()), sk_reuseport_prog_free() is called without waiting for RCU readers to complete, resulting in various bugs. Let's defer freeing the reuseport cBPF prog after one RCU grace period. Note "e"BPF prog is safe as is unless the fast path starts to touch fields destroyed in bpf_prog_put_deferred() and __bpf_prog_put_noref(). 
[0]: BUG: KASAN: vmalloc-out-of-bounds in reuseport_select_sock+0xedc/0x1220 net/core/sock_reuseport.c:596 Read of size 4 at addr ffffc9000051e004 by task slowme/10208 CPU: 6 UID: 1000 PID: 10208 Comm: slowme Not tainted 7.0.0-geb7ac95ff75e #32 PREEMPT(full) Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x240 mm/kasan/report.c:482 kasan_report+0x118/0x150 mm/kasan/report.c:595 reuseport_select_sock+0xedc/0x1220 net/core/sock_reuseport.c:596 udp4_lib_lookup2+0x3bc/0x950 net/ipv4/udp.c:495 __udp4_lib_lookup+0x768/0xe20 net/ipv4/udp.c:723 __udp4_lib_lookup_skb+0x297/0x390 net/ipv4/udp.c:752 __udp4_lib_rcv+0x1312/0x2620 net/ipv4/udp.c:2752 ip_protocol_deliver_rcu+0x282/0x440 net/ipv4/ip_input.c:207 ip_local_deliver_finish+0x3bb/0x6f0 net/ipv4/ip_input.c:241 NF_HOOK+0x30c/0x3a0 include/linux/netfilter.h:318 NF_HOOK+0x30c/0x3a0 include/linux/netfilter.h:318 __netif_receive_skb_one_core net/core/dev.c:6181 [inline] __netif_receive_skb net/core/dev.c:6294 [inline] process_backlog+0xaa4/0x1960 net/core/dev.c:6645 __napi_poll+0xae/0x340 net/core/dev.c:7709 napi_poll net/core/dev.c:7772 [inline] net_rx_action+0x5d7/0xf50 net/core/dev.c:7929 handle_softirqs+0x22b/0x870 kernel/softirq.c:622 do_softirq+0x76/0xd0 kernel/softirq.c:523 __local_bh_enable_ip+0xf8/0x130 kernel/softirq.c:450 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:924 [inline] __dev_queue_xmit+0x1dd7/0x3710 net/core/dev.c:4890 neigh_output include/net/neighbour.h:556 [inline] ip_finish_output2+0xca9/0x1070 net/ipv4/ip_output.c:237 NF_HOOK_COND include/linux/netfilter.h:307 [inline] ip_output+0x29f/0x450 net/ipv4/ip_output.c:438 ip_send_skb+0x45/0xc0 net/ipv4/ip_output.c:1508 udp_send_skb+0xb04/0x1510 net/ipv4/udp.c:1195 udp_sendmsg+0x1a71/0x2350 
net/ipv4/udp.c:1485 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg net/socket.c:742 [inline] __sys_sendto+0x554/0x680 net/socket.c:2206 __do_sys_sendto net/socket.c:2213 [inline] __se_sys_sendto net/socket.c:2209 [inline] __x64_sys_sendto+0xde/0x100 net/socket.c:2209 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x160/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x415a2d Code: b3 66 2e 0f 1f 84 00 00 00 00 00 66 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f6bc31e41e8 EFLAGS: 00000212 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007f6bc31e4cdc RCX: 0000000000415a2d RDX: 0000000000000001 RSI: 00007f6bc31e421f RDI: 0000000000000003 RBP: 00007f6bc31e4240 R08: 00007f6bc31e4220 R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000212 R12: 00007f6bc31e46c0 R13: ffffffffffffffb8 R14: 0000000000000000 R15: 00007ffc9b0d70b0 Fixes: 538950a1b752 ("soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF") Reported-by: Eulgyu Kim Reported-by: Taeyang Lee <0wn@theori.io> Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20260426012647.3233119-1-kuniyu@google.com --- net/core/filter.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ef0877eefaa7..70eef14edb99 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1654,15 +1654,24 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return err; } +static void sk_reuseport_prog_free_rcu(struct rcu_head *rcu) +{ + struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + struct bpf_prog *prog = aux->prog; + + bpf_release_orig_filter(prog); + bpf_prog_free(prog); +} + void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; - 
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - bpf_prog_put(prog); + if (bpf_prog_was_classic(prog)) + call_rcu(&prog->aux->rcu, sk_reuseport_prog_free_rcu); else - bpf_prog_destroy(prog); + bpf_prog_put(prog); } static inline int __bpf_try_make_writable(struct sk_buff *skb, From 512809bb8a370d071f66fc53abe67368e171dec5 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 7 May 2026 20:22:06 +0200 Subject: [PATCH 17/19] bpf: Don't run arg-tracking analysis twice on main subprog Because subprog 0, the main subprog, is considered a global function, we end up running the arg-tracking dataflow analysis twice on it. That results in slightly longer verification but mostly in more verbose verifier logs. This patch fixes it by keeping only the iteration over global subprogs. When running over all of Cilium's programs with BPF_LOG_LEVEL2, this reduces verbosity by ~20% on average. Fixes: bf0c571f7feb6 ("bpf: introduce forward arg-tracking dataflow analysis") Signed-off-by: Paul Chaignon Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/e4d7b53d4963ef520541a782f5fc8108a168877c.1778176504.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/liveness.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 332e6e003f27..58197d73b120 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -1914,26 +1914,15 @@ int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env) return -ENOMEM; } - instance = call_instance(env, NULL, 0, 0); - if (IS_ERR(instance)) { - err = PTR_ERR(instance); - goto out; - } - err = analyze_subprog(env, NULL, info, instance, callsites); - if (err) - goto out; - /* - * Subprogs and callbacks that don't receive FP-derived arguments - * cannot access ancestor stack frames, so they were skipped during - * the recursive walk above. 
Async callbacks (timer, workqueue) are - * also not reachable from the main program's call graph. Analyze - * all unvisited subprogs as independent roots at depth 0. + * Analyze every subprog in reverse topological order (callers + * before callees) so that each subprog is analyzed before its + * callees, allowing the recursive walk inside analyze_subprog() + * to naturally reach callees that receive FP-derived args. * - * Use reverse topological order (callers before callees) so that - * each subprog is analyzed before its callees, allowing the - * recursive walk inside analyze_subprog() to naturally - * reach nested callees that also lack FP-derived args. + * Subprogs and callbacks that don't receive FP-derived arguments + * cannot access ancestor stack frames and are analyzed independently. + * Async callbacks (timer, workqueue) are handled the same way. */ for (k = env->subprog_cnt - 1; k >= 0; k--) { int sub = env->subprog_topo_order[k]; From bf6d507f7e3c65751d52fd8caf1ea4e003922624 Mon Sep 17 00:00:00 2001 From: Linpu Yu Date: Fri, 8 May 2026 22:43:43 +0800 Subject: [PATCH 18/19] xskmap: reject TX-only AF_XDP sockets XSKMAP entries are used as redirect targets for incoming XDP frames. A TX-only AF_XDP socket lacks an Rx ring and cannot handle redirected traffic, but xsk_map_update_elem() currently allows such sockets to be inserted into the map. Redirecting packets to such a socket on the veth generic-XDP path causes a kernel crash in xsk_generic_rcv(). This became possible after xsk_is_setup_for_bpf_map() was removed from the XSKMAP update path, which allowed bound TX-only sockets to be inserted into the map. Reject TX-only sockets during XSKMAP updates to avoid the crash. They remain fully operational for pure Tx purposes outside XSKMAP.
Fixes: 968be23ceaca ("xsk: Fix possible segfault at xskmap entry insertion") Reported-by: Juefei Pu Reported-by: Yuan Tan Reported-by: Xin Liu Signed-off-by: Yifan Wu Signed-off-by: Linpu Yu Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20260508144344.694-1-linpu5433@gmail.com Signed-off-by: Alexei Starovoitov --- net/xdp/xskmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index afa457506274..3bff346308d0 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -184,6 +184,10 @@ static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, } xs = (struct xdp_sock *)sock->sk; + if (!READ_ONCE(xs->rx)) { + sockfd_put(sock); + return -ENOBUFS; + } map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); From 3ac1a467e37683f602221e243fa3c59b0de81165 Mon Sep 17 00:00:00 2001 From: Junyoung Jang Date: Mon, 27 Apr 2026 02:25:05 +0900 Subject: [PATCH 19/19] bpf: Fix off-by-one boundary validation in arena direct-value access BPF_MAP_TYPE_ARENA accepts BPF_PSEUDO_MAP_VALUE offsets at exactly the end of the arena mapping (off == arena_size). The boundary check in arena_map_direct_value_addr() uses `>` instead of `>=`, which incorrectly allows a one-past-end pointer to be accepted. Change the condition to `>=` to correctly reject offsets that fall outside the valid arena user_vm range. 
Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Signed-off-by: Junyoung Jang Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260426172505.1947915-1-graypanda.inzag@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 802656c6fd3c..49a8f7b1beef 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -511,7 +511,7 @@ static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 { struct bpf_arena *arena = container_of(map, struct bpf_arena, map); - if ((u64)off > arena->user_vm_end - arena->user_vm_start) + if ((u64)off >= arena->user_vm_end - arena->user_vm_start) return -ERANGE; *imm = (unsigned long)arena->user_vm_start; return 0;