From 7e2a4f7ca0952820731ef7bdadfc9a9e9d3571b4 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Mon, 4 May 2026 22:27:36 +0800 Subject: [PATCH 01/36] xfrm: route MIGRATE notifications to caller's netns xfrm_send_migrate() in net/xfrm/xfrm_user.c and pfkey_send_migrate() in net/key/af_key.c both hardcode &init_net for the multicast that announces a successful XFRM_MSG_MIGRATE / SADB_X_MIGRATE. XFRM_MSG_MIGRATE arrives on a per-netns NETLINK_XFRM socket, and the rest of the xfrm/af_key netlink path was made netns-aware in 2008. The other 14 multicast paths in xfrm_user.c route their event using xs_net(x), xp_net(xp) or sock_net(skb->sk); only the migrate path was missed. Two consequences of the init_net hardcoding: 1. The notification (selector, old/new endpoint addresses, and the km_address) is delivered to listeners on init_net's XFRMNLGRP_MIGRATE / pfkey BROADCAST_ALL groups rather than on the issuing netns. An IKE daemon running in init_net therefore receives migration notifications originating from any other netns on the host. 2. An IKE daemon running inside a non-init netns and subscribed to its own XFRMNLGRP_MIGRATE / pfkey groups never receives the notification of its own migration. IKEv2 MOBIKE / address-update handling inside a netns is silently broken. Thread struct net through km_migrate() and the xfrm_mgr.migrate function pointer, drop the &init_net override in xfrm_send_migrate() and pfkey_send_migrate(), and pass the caller's net (already in scope in xfrm_migrate() via sock_net(skb->sk)) all the way down. struct xfrm_mgr is in-tree only and not exported as a stable API, so the function-pointer signature change is internal. pfkey_broadcast() is already netns-aware via net_generic(net, pfkey_net_id) since the pernet conversion. The five other pfkey_broadcast() callers in af_key.c already pass xs_net(x), sock_net(sk) or a per-netns net, so this only removes the &init_net outlier. Fixes: 5c79de6e79cd ("[XFRM]: User interface for handling XFRM_MSG_MIGRATE") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Maoyi Xie Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/key/af_key.c | 6 +++--- net/xfrm/xfrm_policy.c | 2 +- net/xfrm/xfrm_state.c | 4 ++-- net/xfrm/xfrm_user.c | 5 ++--- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 10d3edde6b2f..874409127e29 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -715,6 +715,7 @@ struct xfrm_mgr { const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, + struct net *net, const struct xfrm_encap_tmpl *encap); bool (*is_alive)(const struct km_event *c); }; @@ -1891,7 +1892,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap); struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, u32 if_id); diff --git a/net/key/af_key.c b/net/key/af_key.c index a166a88d8788..9cffeef18cd9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3564,7 +3564,7 @@ static int set_ipsecrequest(struct sk_buff *skb, #ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { int i; @@ -3669,7 +3669,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* broadcast migrate message to sockets */ - pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, net); return 0; @@ -3680,7 +3680,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, #else static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c944327ce66c..59968dcbafe1 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4703,7 +4703,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 5 - announce */ - km_migrate(sel, dir, type, m, num_migrate, k, encap); + km_migrate(sel, dir, type, m, num_migrate, k, net, encap); xfrm_pol_put(pol); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 686014d39429..395d82411a87 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2837,7 +2837,7 @@ EXPORT_SYMBOL(km_policy_expired); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { int err = -EINVAL; @@ -2848,7 +2848,7 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->migrate) { ret = km->migrate(sel, dir, type, m, num_migrate, k, - encap); + net, encap); if (!ret) err = ret; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 38a90e5ee3d9..71a4b7278eba 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3271,10 +3271,9 @@ static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m, static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { - struct net *net = &init_net; struct sk_buff *skb; int err; @@ -3292,7 +3291,7 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, #else static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; From 7dbac7680eb629b3b4dc7e98c34f943b8814c0c8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 6 May 2026 21:23:28 +0800 Subject: [PATCH 02/36] xfrm: ipcomp: Free destination pages on acomp errors Move the out_free_req label up by a couple of lines so that the allocated dst SG list gets freed on error as well as success. Fixes: eb2953d26971 ("xfrm: ipcomp: Use crypto_acomp interface") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Reported-by: Yilin Zhu Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_ipcomp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 5f38dff16177..671d48f8c937 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -51,11 +51,15 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen) struct scatterlist *dsg; int len, dlen; - if (unlikely(err)) - goto out_free_req; + if (unlikely(!req)) + return err; extra = acomp_request_extra(req); dsg = extra->sg; + + if (unlikely(err)) + goto out_free_req; + dlen = req->dlen; pskb_trim_unique(skb, 0); @@ -84,10 +88,10 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen) skb_shinfo(skb)->nr_frags++; } while ((dlen -= len)); - for (; dsg; dsg = sg_next(dsg)) +out_free_req: + for (; dsg && sg_page(dsg); dsg = sg_next(dsg)) __free_page(sg_page(dsg)); -out_free_req: acomp_request_free(req); return err; } From 742b04d0550b0ec89dcbc99537ec88653bd1ad90 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 13 May 2026 10:49:14 -0600 Subject: [PATCH 03/36] xfrm: Check for underflow in xfrm_state_mtu Leo Lin reported OOB write issue in esp component: xfrm_state_mtu() returns u32 but performs its arithmetic in unsigned modulo-2^32 space using an attacker-influenced "header_len + authsize + net_adj" subtracted from a small "mtu" argument. A nobody user can install an IPv4 ESP tunnel SA with a large authentication key (XFRMA_ALG_AUTH_TRUNC, e.g. hmac(sha512), 64-byte key, 64-byte trunc), configure a small interface MTU (68 bytes), and set XFRMA_TFCPAD to a large value. When a single UDP datagram is then sent through the tunnel, xfrm_state_mtu() underflows to a near-2^32 value, and esp_output() consumes it as a signed int via: padto = min(x->tfcpad, xfrm_state_mtu(x, mtu_cached)) esp.tfclen = padto - skb->len (assigned to int) esp.tfclen ends up negative (e.g. -207). It is sign-extended to size_t when passed to memset() inside esp_output_fill_trailer(), producing a ~16 EB write of zeroes at skb_tail_pointer(skb). KASAN logs it as "Write of size 18446744073709551537 at addr ffff888...". Check for underflow and return 1. This causes the sendmsg attempt to fail with ENETUNREACH. Fixes: c5c252389374 ("[XFRM]: Optimize MTU calculation") Reported-by: Leo Lin Assisted-by: Codex:26.506.31004 Signed-off-by: David Ahern Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 395d82411a87..589c3b6e4679 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -3114,10 +3114,14 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; u32 blksize, net_adj = 0; + u32 overhead, payload_mtu; if (x->km.state != XFRM_STATE_VALID || - !type || type->proto != IPPROTO_ESP) + !type || type->proto != IPPROTO_ESP) { + if (mtu <= x->props.header_len) + return 1; return mtu - x->props.header_len; + } aead = x->data; blksize = ALIGN(crypto_aead_blocksize(aead), 4); @@ -3140,8 +3144,17 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) break; } - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; + overhead = x->props.header_len + crypto_aead_authsize(aead) + net_adj; + if (mtu <= overhead) + return 1; + + payload_mtu = mtu - overhead; + payload_mtu &= ~(blksize - 1); + if (payload_mtu <= 2) + return 1; + + return payload_mtu + net_adj - 2; + } EXPORT_SYMBOL_GPL(xfrm_state_mtu); From 79d8be262377f7112cfa3088dfc4142d5a2533f3 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 15 May 2026 11:45:31 -0400 Subject: [PATCH 04/36] xfrm: ah: use skb_to_full_sk in async output callbacks When AH output is offloaded to an asynchronous crypto provider (hardware accelerators such as AMD CCP, or a forced-async software shim used for testing), the digest completion fires ah_output_done() / ah6_output_done() on a workqueue. The egress skb at that point may have been originated by a TCP listener sending a SYN-ACK, which sets skb->sk to a request_sock via skb_set_owner_edemux(); it may also have been originated by an inet_timewait_sock retransmit. Neither is a full struct sock, and passing the raw skb->sk to xfrm_output_resume() then forwards a non-full socket through the rest of the xfrm output chain. xfrm_output_resume() and its downstream consumers expect a full sk where they dereference at all. The natural egress path through ah_output_done() does not crash today because the consumers that read past sock_common are either gated by sk_fullsock() or short-circuit on flags that are clear on a fresh request_sock; an exhaustive walk of the 50 most plausible consumers under sch_fq, dev_queue_xmit, netfilter, tc-egress and cgroup-egress BPF found no current unguarded deref. The bug is still a real type confusion that future consumer changes could turn into a memory-corruption primitive. This is the same bug class fixed for ESP in commit 1620c88887b1 ("xfrm: Fix the usage of skb->sk"). Apply the analogous fix to AH: convert skb->sk to a full socket pointer (or NULL) via skb_to_full_sk() before handing it to xfrm_output_resume(). The same async AH callbacks were touched recently for an independent ESN-related ICV layout bug in commit ec54093e6a8f ("xfrm: ah: account for ESN high bits in async callbacks"); the sk type-confusion addressed here is orthogonal. This patch is part of an ongoing audit of the AH callback paths; an ah_output ihl-validation hardening series is also currently under review on netdev. Reproduced under UML + KASAN + lockdep with a forced-async hmac(sha1) shim that registers at priority 9999 and wraps the sync in-tree hmac-sha1-lib. With the shim loaded, ah_output_done runs on every SYN-ACK egress through a transport-mode AH SA and skb->sk arrives as a request_sock (TCP_NEW_SYN_RECV); after this patch, xfrm_output_resume() receives the listener (the result of sk_to_full_sk()) and consumer derefs land on full-sock fields as intended. Fixes: 9ab1265d5231 ("xfrm: Use actual socket sk instead of skb socket for xfrm_output_resume") Cc: stable@vger.kernel.org Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Signed-off-by: Steffen Klassert --- net/ipv4/ah4.c | 2 +- net/ipv6/ah6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 4366cbac3f06..6fd642d2278d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -143,7 +143,7 @@ static void ah_output_done(void *data, int err) } kfree(AH_SKB_CB(skb)->tmp); - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } static int ah_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index de1e68199a01..76f7a2de9108 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -337,7 +337,7 @@ static void ah6_output_done(void *data, int err) ah6_restore_hdrs(top_iph, iph_ext, extlen); kfree(AH_SKB_CB(skb)->tmp); - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) From 2982e599fff6faa21c8df147d96fc7af6c1a2f24 Mon Sep 17 00:00:00 2001 From: e521588 Date: Wed, 20 May 2026 09:27:17 +0200 Subject: [PATCH 05/36] esp: fix page frag reference leak on skb_to_sgvec failure In esp_output_tail(), when esp->inplace is false, the old skb page frags are replaced with a new page from the xfrm page_frag cache. The source scatterlist (sg) is built from the old frags before the replacement, and esp_ssg_unref() is responsible for releasing the old page references after the crypto operation completes. However, if the second skb_to_sgvec() call (which builds the destination scatterlist from the new page) fails, the code jumps to error_free which only calls kfree(tmp). The old page frag references captured in the source scatterlist are never released: 1. sg[] is built from old frags via skb_to_sgvec() (no extra get_page) 2. nr_frags is set to 1 and frag[0] is replaced with the new page 3. Second skb_to_sgvec() fails -> goto error_free 4. kfree(tmp) frees the sg[] memory but old frags are not unref'd 5. kfree_skb() only releases frag[0] (the new page), not the old ones Fix this by adding a bool parameter to esp_ssg_unref() that, when true, unconditionally unrefs the source scatterlist frags without checking req->src and req->dst, since those fields are not yet initialized by aead_request_set_crypt() at the point of the error. Existing callers pass false to preserve the original behavior. The same issue exists in both esp4 and esp6 as the code is identical. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Alessandro Schino <7991aleschino@gmail.com> Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 12 +++++++----- net/ipv6/esp6.c | 12 +++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 6a5febbdbee4..8314d7bddcb7 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -96,7 +96,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, bool already_unref) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -113,7 +113,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) /* Unref skb_frag_pages in the src scatterlist if necessary. * Skip the first sg which comes from skb->data. */ - if (req->src != req->dst) + if (already_unref || req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) skb_page_unref(page_to_netmem(sg_page(sg)), skb->pp_recycle); @@ -220,7 +220,7 @@ static void esp_output_done(void *data, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp, skb); + esp_ssg_unref(x, tmp, skb, false); kfree(tmp); if (xo && (xo->flags & XFRM_DEV_RESUME)) { @@ -569,8 +569,10 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * err = skb_to_sgvec(skb, dsg, (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); - if (unlikely(err < 0)) + if (unlikely(err < 0)) { + esp_ssg_unref(x, tmp, skb, true); goto error_free; + } } if ((x->props.flags & XFRM_STATE_ESN)) @@ -602,7 +604,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } if (sg != dsg) - esp_ssg_unref(x, tmp, skb); + esp_ssg_unref(x, tmp, skb, false); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 9c06c5a1419d..9d0c4957ac62 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -113,7 +113,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, bool already_unref) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -130,7 +130,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) /* Unref skb_frag_pages in the src scatterlist if necessary. * Skip the first sg which comes from skb->data. */ - if (req->src != req->dst) + if (already_unref || req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) skb_page_unref(page_to_netmem(sg_page(sg)), skb->pp_recycle); @@ -254,7 +254,7 @@ static void esp_output_done(void *data, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp, skb); + esp_ssg_unref(x, tmp, skb, false); kfree(tmp); esp_output_encap_csum(skb); @@ -600,8 +600,10 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info err = skb_to_sgvec(skb, dsg, (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); - if (unlikely(err < 0)) + if (unlikely(err < 0)) { + esp_ssg_unref(x, tmp, skb, true); goto error_free; + } } if ((x->props.flags & XFRM_STATE_ESN)) @@ -634,7 +636,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } if (sg != dsg) - esp_ssg_unref(x, tmp, skb); + esp_ssg_unref(x, tmp, skb, false); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); From dfa0d7b0ff1eb6b2c416b8fdb9b4f2cefba57a40 Mon Sep 17 00:00:00 2001 From: Jingguo Tan Date: Mon, 18 May 2026 17:06:48 +0800 Subject: [PATCH 06/36] xfrm: esp: restore combined single-frag length gate The ESP out-of-place fast path appends the trailer in esp_output_head() before esp_output_tail() allocates the destination page frag. The head-side gate currently checks skb->data_len and tailen separately, but the tail code allocates a single destination frag from the combined post-trailer skb->data_len. Reject the page-frag fast path when the combined aligned length exceeds a page. Otherwise skb_page_frag_refill() may fall back to a single page while the destination sg still spans the combined skb->data_len. Restore this combined-length page gate for both IPv4 and IPv6. Fixes: 5bd8baab087d ("esp: limit skb_page_frag_refill use to a single page") Cc: stable@vger.kernel.org Signed-off-by: Lin Ma Signed-off-by: Chenyuan Mi Signed-off-by: Jingguo Tan Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 4 ++-- net/ipv6/esp6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 8314d7bddcb7..5d3a8656687e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -419,8 +419,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || - ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) + if (ALIGN(skb->data_len + tailen, L1_CACHE_BYTES) > + PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 9d0c4957ac62..b963b8e72604 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -448,8 +448,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || - ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) + if (ALIGN(skb->data_len + tailen, L1_CACHE_BYTES) > + PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { From 7f83d174073234839aea176f265e517e0d50a1d2 Mon Sep 17 00:00:00 2001 From: Shaomin Chen Date: Thu, 21 May 2026 02:07:23 +0800 Subject: [PATCH 07/36] xfrm: iptfs: reset runtime state when cloning SAs iptfs_clone_state() clones the IPTFS mode data with kmemdup(). This copies runtime objects which must not be shared with the original SA, including the embedded sk_buff_head, hrtimers, spinlock, and in-flight reassembly/reorder state. If xfrm_state_migrate() fails after clone_state() but before the later init_state() call has reinitialized those fields, the cloned state can be destroyed by xfrm_state_gc_task() with list and timer state copied from the original SA. With queued packets this lets the clone splice and free skbs owned by the original IPTFS queue, leading to use-after-free and double-free reports in iptfs_destroy_state() and skb release paths. Reinitialize the clone's runtime state before publishing it through x->mode_data. Because clone_state() now publishes a destroyable mode_data object before init_state(), take the mode callback module reference there. Avoid taking it again from __iptfs_init_state() for the same object. Fixes: 0e4fbf013fa5 ("xfrm: iptfs: add user packet (tunnel ingress) handling") Cc: stable@vger.kernel.org Signed-off-by: Shaomin Chen Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 97bc979e55ba..6c6bbc040517 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -2650,7 +2650,8 @@ static void __iptfs_init_state(struct xfrm_state *x, x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr); /* Always keep a module reference when x->mode_data is set */ - __module_get(x->mode_cbs->owner); + if (x->mode_data != xtfs) + __module_get(x->mode_cbs->owner); x->mode_data = xtfs; xtfs->x = x; @@ -2658,22 +2659,39 @@ static void __iptfs_init_state(struct xfrm_state *x, static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) { + struct skb_wseq *w_saved = NULL; struct xfrm_iptfs_data *xtfs; xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL); if (!xtfs) return -ENOMEM; - xtfs->ra_newskb = NULL; if (xtfs->cfg.reorder_win_size) { - xtfs->w_saved = kzalloc_objs(*xtfs->w_saved, - xtfs->cfg.reorder_win_size); - if (!xtfs->w_saved) { + w_saved = kzalloc_objs(*w_saved, xtfs->cfg.reorder_win_size); + if (!w_saved) { kfree_sensitive(xtfs); return -ENOMEM; } } + xtfs->w_saved = w_saved; + __skb_queue_head_init(&xtfs->queue); + xtfs->queue_size = 0; + hrtimer_setup(&xtfs->iptfs_timer, iptfs_delay_timer, CLOCK_MONOTONIC, + IPTFS_HRTIMER_MODE); + + spin_lock_init(&xtfs->drop_lock); + hrtimer_setup(&xtfs->drop_timer, iptfs_drop_timer, CLOCK_MONOTONIC, + IPTFS_HRTIMER_MODE); + + xtfs->w_seq_set = false; + xtfs->w_wantseq = 0; + xtfs->w_savedlen = 0; + xtfs->ra_newskb = NULL; + xtfs->ra_wantseq = 0; + xtfs->ra_runtlen = 0; + + __module_get(x->mode_cbs->owner); x->mode_data = xtfs; xtfs->x = x; From 3e52417318473782012b236d0325bf7d2266a597 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 21 May 2026 03:29:26 -0700 Subject: [PATCH 08/36] xfrm: move policy_bydst RCU sync from per-netns .exit to .pre_exit The struct pernet_operations docstring in include/net/net_namespace.h explicitly warns against blocking RCU primitives in .exit handlers: Exit methods using blocking RCU primitives, such as synchronize_rcu(), should be implemented via exit_batch. [...] Please, avoid synchronize_rcu() at all, where it's possible. Note that a combination of pre_exit() and exit() can be used, since a synchronize_rcu() is guaranteed between the calls. xfrm_policy_fini() violates this: it calls synchronize_rcu() before freeing the policy_bydst hash tables (so no RCU reader is mid- traversal at free time), but runs from xfrm_net_ops.exit -- once per namespace -- so a cleanup_net() of N namespaces pays N full RCU grace periods serially. Use the documented pre_exit/exit split. Move the policy flush (and the workqueue drains it depends on) into a new .pre_exit handler; xfrm_policy_fini() then runs in .exit and frees the hash tables after the synchronize_rcu_expedited() that cleanup_net() guarantees between the two phases. Providing O(1) RCU grace periods per batch instead of O(N). Observed on Linux 6.18 with a workload doing unshare(CLONE_NEWNET) at ~13/sec sustained: cleanup_net() and the netns_wq rescuer kthread both stuck in xfrm_policy_fini()'s synchronize_rcu(), >300k struct net accumulated in the cleanup queue, Percpu in /proc/meminfo climbed to 130+ GB on 256-CPU hosts, and memcg OOMs followed. setup_net and __put_net counts were balanced, ruling out a refcount leak. Fixes: 069daad4f2ae ("xfrm: Wait for RCU readers during policy netns exit") Signed-off-by: Usama Arif Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 59968dcbafe1..dd09d2063da2 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4276,21 +4276,21 @@ static int __net_init xfrm_policy_init(struct net *net) return -ENOMEM; } -static void xfrm_policy_fini(struct net *net) +static void __net_exit xfrm_net_pre_exit(struct net *net) { - struct xfrm_pol_inexact_bin *b, *t; - unsigned int sz; - int dir; - disable_work_sync(&net->xfrm.policy_hthresh.work); - flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); +} - synchronize_rcu(); +static void xfrm_policy_fini(struct net *net) +{ + struct xfrm_pol_inexact_bin *b, *t; + unsigned int sz; + int dir; WARN_ON(!list_empty(&net->xfrm.policy_all)); @@ -4368,6 +4368,7 @@ static void __net_exit xfrm_net_exit(struct net *net) static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, + .pre_exit = xfrm_net_pre_exit, .exit = xfrm_net_exit, }; From c16f74dc1d75d0e2e7670076d5375deda110ebeb Mon Sep 17 00:00:00 2001 From: Zhengchuan Liang Date: Fri, 22 May 2026 17:31:55 +0800 Subject: [PATCH 09/36] xfrm: input: hold netns during deferred transport reinjection Transport-mode reinjection stores a struct net pointer in skb->cb and uses it later from xfrm_trans_reinject(). That pointer must stay valid until the deferred callback runs. Take a netns reference when queueing deferred reinjection work and drop it after the callback completes. Use maybe_get_net() so the queueing path does not revive a namespace that is already being torn down. This keeps the existing workqueue design and fixes the netns lifetime handling in one place for all users of xfrm_trans_queue_net(). Fixes: 7b3801927e52 ("xfrm: introduce xfrm_trans_queue_net") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Xin Liu Co-developed-by: Luxing Yin Signed-off-by: Luxing Yin Signed-off-by: Zhengchuan Liang Signed-off-by: Ren Wei Assisted-by: Codex:gpt-5.4 Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index f65291eba1f6..e4c2cd24936d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -797,9 +797,12 @@ static void xfrm_trans_reinject(struct work_struct *work) spin_unlock_bh(&trans->queue_lock); local_bh_disable(); - while ((skb = __skb_dequeue(&queue))) - XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, - NULL, skb); + while ((skb = __skb_dequeue(&queue))) { + struct net *net = XFRM_TRANS_SKB_CB(skb)->net; + + XFRM_TRANS_SKB_CB(skb)->finish(net, NULL, skb); + put_net(net); + } local_bh_enable(); } @@ -808,6 +811,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, struct sk_buff *)) { struct xfrm_trans_tasklet *trans; + struct net *hold_net; trans = this_cpu_ptr(&xfrm_trans_tasklet); @@ -816,8 +820,12 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); + hold_net = maybe_get_net(net); + if (!hold_net) + return -ENODEV; + XFRM_TRANS_SKB_CB(skb)->finish = finish; - XFRM_TRANS_SKB_CB(skb)->net = net; + XFRM_TRANS_SKB_CB(skb)->net = hold_net; spin_lock_bh(&trans->queue_lock); __skb_queue_tail(&trans->queue, skb); spin_unlock_bh(&trans->queue_lock); From bfa9d28960ed677d556bdf097073bc3129686229 Mon Sep 17 00:00:00 2001 From: Pavitra Jha Date: Thu, 21 May 2026 04:04:14 -0400 Subject: [PATCH 10/36] Bluetooth: hci_conn: Fix memory leak in hci_le_big_terminate() hci_le_big_terminate() allocates iso_list_data via kzalloc_obj but returns 0 without freeing it when neither pa_sync_term nor big_sync_term flags are set after evaluating the PA and BIG sync connection state. This early-return path was introduced when hci_le_big_terminate() was refactored to take struct hci_conn instead of raw u8 parameters, adding PA/BIG flag evaluation logic. The existing kfree() on hci_cmd_sync_queue failure does not cover this path. Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Cc: stable@vger.kernel.org Signed-off-by: Pavitra Jha Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 17b46ad6a349..54eabaa46960 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -870,8 +870,10 @@ static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn) d->big_sync_term = true; } - if (!d->pa_sync_term && !d->big_sync_term) + if (!d->pa_sync_term && !d->big_sync_term) { + kfree(d); return 0; + } ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, terminate_big_destroy); From 9dbd84990394c51f5cee1e8871bb5ff8af5ed939 Mon Sep 17 00:00:00 2001 From: Siwei Zhang Date: Wed, 20 May 2026 22:30:36 -0400 Subject: [PATCH 11/36] Bluetooth: L2CAP: fix chan ref leak in l2cap_chan_timeout() on !conn __set_chan_timer() takes a l2cap_chan reference via l2cap_chan_hold() before scheduling the delayed work. The normal path in l2cap_chan_timeout() drops this reference with l2cap_chan_put() at the end, but the early return when chan->conn is NULL skips the put, leaking the reference. Add the missing l2cap_chan_put() before the early return. Fixes: adf0398cee86 ("Bluetooth: l2cap: fix null-ptr-deref in l2cap_chan_timeout") Cc: stable@vger.kernel.org Signed-off-by: Siwei Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fdccd62ccca8..5668c92b3f58 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -411,8 +411,10 @@ static void l2cap_chan_timeout(struct work_struct *work) BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); - if (!conn) + if (!conn) { + l2cap_chan_put(chan); return; + } mutex_lock(&conn->lock); /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling From 8c8e620467a7b51562dbcefbd1f09f288d7d710d Mon Sep 17 00:00:00 2001 From: Siwei Zhang Date: Wed, 20 May 2026 22:12:20 -0400 Subject: [PATCH 12/36] Bluetooth: L2CAP: use chan timer to close channels in cleanup_listen() l2cap_chan_close() removes the channel from conn->chan_l, which must be done under conn->lock. cleanup_listen() runs under the parent sk_lock, so acquiring conn->lock would invert the established conn->lock -> chan->lock -> sk_lock order. Instead of calling l2cap_chan_close() directly, schedule l2cap_chan_timeout with delay 0 to close the channel asynchronously. The timeout handler already acquires conn->lock and chan->lock in the correct order. The timer is only armed when chan->conn is still set: if it is already NULL, l2cap_conn_del() has already processed this channel (l2cap_chan_del + l2cap_sock_teardown_cb + l2cap_sock_close_cb), so there is nothing left to do. If l2cap_conn_del() races in after the timer is armed, __clear_chan_timer() inside l2cap_chan_del() cancels it; if the timer has already fired, the handler returns harmlessly because chan->conn was cleared. Fixes: 3df91ea20e74 ("Bluetooth: Revert to mutexes from RCU list") Cc: # 0b58004: Bluetooth: fix UAF in l2cap_sock_cleanup_listen() vs l2cap_conn_del() Signed-off-by: Siwei Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index b34e7da8d906..c138aa4ae266 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1499,6 +1499,10 @@ static void l2cap_sock_cleanup_listen(struct sock *parent) * pin it (hold_unless_zero() additionally skips a chan already past * its last reference). We then drop the sk lock before taking * chan->lock, so sk and chan locks are never held together. + * + * Since we cannot call l2cap_chan_close() without conn->lock, + * schedule l2cap_chan_timeout to close the channel; it already + * acquires conn->lock -> chan->lock in the correct order. */ while ((sk = bt_accept_dequeue(parent, NULL))) { struct l2cap_chan *chan; @@ -1516,14 +1520,12 @@ static void l2cap_sock_cleanup_listen(struct sock *parent) state_to_string(chan->state)); l2cap_chan_lock(chan); - __clear_chan_timer(chan); - l2cap_chan_close(chan, ECONNRESET); - /* l2cap_conn_del() may already have killed this socket - * (it sets SOCK_DEAD); skip the duplicate to avoid a - * double sock_put()/l2cap_chan_put(). + /* Since we cannot call l2cap_chan_close() without + * conn->lock, schedule its timer to trigger the close + * and cleanup of this channel. */ - if (!sock_flag(sk, SOCK_DEAD)) - l2cap_sock_kill(sk); + if (chan->conn) + __set_chan_timer(chan, 0); l2cap_chan_unlock(chan); l2cap_chan_put(chan); From 2a3ac9ee11dbb9845f3947cef4a79dba658cf6f6 Mon Sep 17 00:00:00 2001 From: Muhammad Bilal Date: Wed, 20 May 2026 18:56:43 -0400 Subject: [PATCH 13/36] Bluetooth: HIDP: fix missing length checks in hidp_input_report() hidp_input_report() reads keyboard and mouse payload data from an skb without first verifying that skb->len contains enough data. hidp_recv_intr_frame() pulls the 1-byte HIDP header before dispatching to hidp_input_report(). If a paired device sends a truncated packet, the handler reads beyond the valid skb data, resulting in an out-of-bounds read of skb data. The OOB bytes may be interpreted as phantom key presses or spurious mouse movement. Replace the open-coded length tracking and pointer arithmetic with skb_pull_data() calls. skb_pull_data() returns NULL if the requested bytes are not present, eliminating the need for a manual size variable and the separate skb->len guard. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Muhammad Bilal Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hidp/core.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 976f91eeb745..70344bd3248a 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -179,12 +179,21 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) { struct input_dev *dev = session->input; unsigned char *keys = session->keys; - unsigned char *udata = skb->data + 1; - signed char *sdata = skb->data + 1; - int i, size = skb->len - 1; + unsigned char *udata; + signed char *sdata; + u8 *hdr; + int i; - switch (skb->data[0]) { + hdr = skb_pull_data(skb, 1); + if (!hdr) + return; + + switch (*hdr) { case 0x01: /* Keyboard report */ + udata = skb_pull_data(skb, 8); + if (!udata) + break; + for (i = 0; i < 8; i++) input_report_key(dev, hidp_keycode[i + 224], (udata[0] >> i) & 1); @@ -213,6 +222,10 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) break; case 0x02: /* Mouse report */ + sdata = skb_pull_data(skb, 3); + if (!sdata) + break; + input_report_key(dev, BTN_LEFT, sdata[0] & 0x01); input_report_key(dev, BTN_RIGHT, sdata[0] & 0x02); input_report_key(dev, BTN_MIDDLE, sdata[0] & 0x04); @@ -222,7 +235,7 @@ static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) input_report_rel(dev, REL_X, sdata[1]); input_report_rel(dev, REL_Y, sdata[2]); - if (size > 3) + if (skb->len > 0) input_report_rel(dev, REL_WHEEL, sdata[3]); break; } From 82855073c1081732656734b74d7d1d5e4cfd0da7 Mon Sep 17 00:00:00 2001 From: Shuai Zhang Date: Thu, 21 May 2026 13:25:47 +0800 Subject: [PATCH 14/36] Bluetooth: btusb: Allow firmware re-download when version matches The Bluetooth host decides whether to download firmware by reading the controller firmware download completion flag and firmware version information. If a USB error occurs during the firmware download process (for example due to a USB disconnect), the download is aborted immediately. An incomplete firmware transfer does not cause the controller to set the download completion flag, but the firmware version information may be updated at an early stage of the download process. In this case, after USB reconnection, the host attempts to re-download the firmware because the download completion flag is not set. However, since the controller reports the same firmware version as the target firmware, the download is skipped. This ultimately results in the firmware not being properly updated on the controller. This change removes the restriction that skips firmware download when the versions are equal. It covers scenarios where the USB connection can be disconnected at any time and ensures that firmware download can be retriggered after USB reconnection, allowing the Bluetooth firmware to be correctly and completely updated. Fixes: 3267c884cefa ("Bluetooth: btusb: Add support for QCA ROME chipset family") Cc: stable@vger.kernel.org Signed-off-by: Shuai Zhang Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 7f5fce93d984..830fefb342c6 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -3540,7 +3540,13 @@ static int btusb_setup_qca_load_rampatch(struct hci_dev *hdev, "firmware rome 0x%x build 0x%x", rver_rom, rver_patch, ver_rom, ver_patch); - if (rver_rom != ver_rom || rver_patch <= ver_patch) { + /* Allow rampatch when the patch version equals the firmware version. + * A firmware download may be aborted by a transient USB error (e.g. + * disconnect) after the controller updates version info but before + * completion. + * Allowing equal versions enables re-flashing during recovery. + */ + if (rver_rom != ver_rom || rver_patch < ver_patch) { bt_dev_err(hdev, "rampatch file version did not match with firmware"); err = -EINVAL; goto done; From 3c40d381ce04f9575a5d8b542898183c3b4b38dc Mon Sep 17 00:00:00 2001 From: Zhao Dongdong Date: Tue, 26 May 2026 11:21:39 +0800 Subject: [PATCH 15/36] Bluetooth: 6lowpan: check skb_clone() return value in send_mcast_pkt() The skb_clone() function can return NULL if memory allocation fails. send_mcast_pkt() calls skb_clone() without checking the return value, which can lead to a NULL pointer dereference in send_pkt() when it dereferences skb->data. Add a NULL check after skb_clone() and skip the peer if the clone fails. Fixes: 18722c247023 ("Bluetooth: Enable 6LoWPAN support for BT LE devices") Signed-off-by: Zhao Dongdong Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/6lowpan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 2f03b780b40d..960a19b3e26d 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -486,6 +486,8 @@ static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) int ret; local_skb = skb_clone(skb, GFP_ATOMIC); + if (!local_skb) + continue; BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p", netdev->name, From bfea6091e0fffb270c20e74384b660910277eb6c Mon Sep 17 00:00:00 2001 From: Doruk Tan Ozturk Date: Mon, 25 May 2026 18:24:38 +0200 Subject: [PATCH 16/36] Bluetooth: hci_sync: fix UAF in hci_le_create_cis_sync hci_le_create_cis_sync() dereferences conn->conn_timeout after releasing both rcu_read_lock() and hci_dev_lock(hdev). The conn pointer was obtained from an RCU-protected iteration over hdev->conn_hash.list and is not valid once these locks are dropped. A concurrent disconnect can free the hci_conn between the unlock and the dereference, causing a use-after-free read. The cancellation mechanism in hci_conn_del() cannot prevent this because hci_le_create_cis_pending() queues hci_create_cis_sync with data=NULL: hci_cmd_sync_queue(hdev, hci_create_cis_sync, NULL, NULL); While hci_conn_del() dequeues with data=conn: hci_cmd_sync_dequeue(hdev, NULL, conn, NULL); Since NULL != conn, the lookup in _hci_cmd_sync_lookup_entry() never matches, and the pending work item is not cancelled. Fix this by saving conn->conn_timeout into a local variable while the locks are still held, so the stale conn pointer is never dereferenced after unlock. This is the same class of bug as the one fixed by commit 035c25007c9e ("Bluetooth: hci_sync: Fix UAF on le_read_features_complete") which addressed the identical pattern in a different function. This vulnerability was identified using 0sec.ai, an open-source automated security auditing platform (https://github.com/0sec-labs). Fixes: c09b80be6ffc ("Bluetooth: hci_conn: Fix not waiting for HCI_EVT_LE_CIS_ESTABLISHED") Cc: stable@vger.kernel.org Reported-by: Doruk Tan Ozturk Signed-off-by: Doruk Tan Ozturk Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index aff8562a8690..1faf8df6d159 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6699,6 +6699,7 @@ int hci_le_create_cis_sync(struct hci_dev *hdev) DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f); size_t aux_num_cis = 0; struct hci_conn *conn; + u16 timeout = 0; u8 cig = BT_ISO_QOS_CIG_UNSET; /* The spec allows only one pending LE Create CIS command at a time. If @@ -6769,6 +6770,7 @@ int hci_le_create_cis_sync(struct hci_dev *hdev) set_bit(HCI_CONN_CREATE_CIS, &conn->flags); cis->acl_handle = cpu_to_le16(conn->parent->handle); cis->cis_handle = cpu_to_le16(conn->handle); + timeout = conn->conn_timeout; aux_num_cis++; if (aux_num_cis >= cmd->num_cis) @@ -6788,7 +6790,7 @@ int hci_le_create_cis_sync(struct hci_dev *hdev) return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS, struct_size(cmd, cis, cmd->num_cis), cmd, HCI_EVT_LE_CIS_ESTABLISHED, - conn->conn_timeout, NULL); + timeout, NULL); } int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle) From fa21e86caba2347e89eb65af926205a36a097c53 Mon Sep 17 00:00:00 2001 From: Shuai Zhang Date: Mon, 25 May 2026 14:51:56 +0800 Subject: [PATCH 17/36] Bluetooth: hci_qca: Use 100 ms SSR delay for rampatch and NVM loading When bt_en is pulled high by hardware, the host does not re-download the firmware after SSR. The controller loads the rampatch and NVM internally. On HMT chip, the rampatch is ~264 KB and the NVM is ~9.4 KB. The loading process takes approximately 70 ms. The previous 50 ms delay is too short, causing the controller to not respond to the reset command sent by the host, which leads to BT initialization failure: Bluetooth: hci0: QCA memdump Done, received 458752, total 458752 Bluetooth: hci0: mem_dump_status: 2 Bluetooth: hci0: Opcode 0x0c03 failed: -110 Increase the delay to 100 ms, which was confirmed as a safe value by the controller, to ensure the controller has finished loading the firmware before the host sends commands. Steps to reproduce: 1. Trigger SSR and wait for SSR to complete: hcitool cmd 0x3f 0c 26 2. Run "bluetoothctl power on" and observe that BT fails to start. Fixes: fce1a9244a0f ("Bluetooth: hci_qca: Fix SSR (SubSystem Restart) fail when BT_EN is pulled up by hw") Cc: stable@vger.kernel.org Reviewed-by: Dmitry Baryshkov Signed-off-by: Shuai Zhang Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index ed280399bf47..34500137df2c 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1680,8 +1680,8 @@ static void qca_hw_error(struct hci_dev *hdev, u8 code) mod_timer(&qca->tx_idle_timer, jiffies + msecs_to_jiffies(qca->tx_idle_delay)); - /* Controller reset completion time is 50ms */ - msleep(50); + /* Wait for the controller to load the rampatch and NVM. */ + msleep(100); clear_bit(QCA_SSR_TRIGGERED, &qca->flags); clear_bit(QCA_IBS_DISABLED, &qca->flags); From 00e1950716c6ed67d74777b2db286b0fa23b4be9 Mon Sep 17 00:00:00 2001 From: Zhenghang Xiao Date: Tue, 26 May 2026 18:51:52 +0800 Subject: [PATCH 18/36] Bluetooth: l2cap: clear chan->ident on ECRED reconfiguration success l2cap_ecred_reconf_rsp() returns early on success without clearing chan->ident. Every other L2CAP response handler (l2cap_ecred_conn_rsp, l2cap_le_connect_rsp, l2cap_config_rsp) clears chan->ident after a successful transaction to prevent the channel from matching subsequent responses with the recycled ident value. A remote attacker that completed a reconfiguration as the peer can replay a failure response with the stale ident, causing the kernel to match and destroy the already-established channel via l2cap_chan_del(chan, ECONNRESET). Clear chan->ident for all matching channels on success, and harden the failure path by using l2cap_chan_hold_unless_zero() consistent with other L2CAP handlers (l2cap_le_command_rej, __l2cap_get_chan_by_ident). Fixes: 15f02b910562 ("Bluetooth: L2CAP: Add initial code for Enhanced Credit Based Mode") Signed-off-by: Zhenghang Xiao Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 5668c92b3f58..ff13c43e3588 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5460,14 +5460,20 @@ static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn, BT_DBG("result 0x%4.4x", result); - if (!result) + if (!result) { + list_for_each_entry(chan, &conn->chan_l, list) { + if (chan->ident == cmd->ident) + chan->ident = 0; + } return 0; + } list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { if (chan->ident != cmd->ident) continue; - l2cap_chan_hold(chan); + if (!l2cap_chan_hold_unless_zero(chan)) + continue; l2cap_chan_lock(chan); l2cap_chan_del(chan, ECONNRESET); From 41c2713b204e6cb6a94587bc6bf6935107df5479 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 11 May 2026 12:09:42 -0400 Subject: [PATCH 19/36] Bluetooth: L2CAP: Fix possible crash on l2cap_ecred_conn_rsp If dcid is received for an already-assigned destination CID the spec requires that both channels to be discarded, but calling l2cap_chan_del may invalidate the tmp cursor created by list_for_each_entry_safe and in fact it is the wrong procedure as the chan->dcid may be assigned previously it really needs to be disconnected. Calling l2cap_chan_clone directly may still lead to l2cap_chan_del so instead schedule l2cap_chan_timeout with delay 0 to close the channel asynchronously. Fixes: 15f02b910562 ("Bluetooth: L2CAP: Add initial code for Enhanced Credit Based Mode") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ff13c43e3588..45b175399e8d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5262,6 +5262,7 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, cmd_len -= sizeof(*rsp); list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { + struct l2cap_chan *orig; u16 dcid; if (chan->ident != cmd->ident || @@ -5283,8 +5284,10 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, BT_DBG("dcid[%d] 0x%4.4x", i, dcid); + orig = __l2cap_get_chan_by_dcid(conn, dcid); + /* Check if dcid is already in use */ - if (dcid && __l2cap_get_chan_by_dcid(conn, dcid)) { + if (dcid && orig) { /* If a device receives a * L2CAP_CREDIT_BASED_CONNECTION_RSP packet with an * already-assigned Destination CID, then both the @@ -5293,10 +5296,24 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, */ l2cap_chan_del(chan, ECONNREFUSED); l2cap_chan_unlock(chan); - chan = __l2cap_get_chan_by_dcid(conn, dcid); - l2cap_chan_lock(chan); - l2cap_chan_del(chan, ECONNRESET); - l2cap_chan_unlock(chan); + + /* Check that the dcid channel mode is + * L2CAP_MODE_EXT_FLOWCTL since this procedure is only + * valid for that mode and shouldn't disconnect a dcid + * in other modes. + */ + if (orig->mode == L2CAP_MODE_EXT_FLOWCTL) { + l2cap_chan_lock(orig); + /* Disconnect the original channel as it may be + * considered connected since dcid has already + * been assigned; don't call l2cap_chan_close + * directly since that could lead to + * l2cap_chan_del and then removing the channel + * from the list while we're iterating over it. + */ + __set_chan_timer(orig, 0); + l2cap_chan_unlock(orig); + } continue; } From 47f23a259517abbdb8032c057a1e8a6bf3734878 Mon Sep 17 00:00:00 2001 From: Muhammad Bilal Date: Wed, 27 May 2026 04:59:17 +0000 Subject: [PATCH 20/36] Bluetooth: ISO: fix UAF in iso_recv_frame iso_recv_frame reads conn->sk under iso_conn_lock but releases the lock before using sk, with no reference held. A concurrent iso_sock_kill() can free sk in that window, causing use-after-free on sk->sk_state and sock_queue_rcv_skb(). Fix by replacing the bare pointer read with iso_sock_hold(conn), which calls sock_hold() while the spinlock is held, atomically elevating the refcount before the lock drops. Add a drop_put label so sock_put() is called on all exit paths where the hold succeeded. Fixes: ccf74f2390d60a2f9a75ef496d2564abb478f46a ("Bluetooth: Add BTPROTO_ISO socket type") Cc: stable@vger.kernel.org Signed-off-by: Muhammad Bilal Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index d7af617cda45..f03b7fa5dccc 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -564,7 +564,7 @@ static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb) struct sock *sk; iso_conn_lock(conn); - sk = conn->sk; + sk = iso_sock_hold(conn); iso_conn_unlock(conn); if (!sk) @@ -573,11 +573,15 @@ static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb) BT_DBG("sk %p len %d", sk, skb->len); if (sk->sk_state != BT_CONNECTED) - goto drop; + goto drop_put; - if (!sock_queue_rcv_skb(sk, skb)) + if (!sock_queue_rcv_skb(sk, skb)) { + sock_put(sk); return; + } +drop_put: + sock_put(sk); drop: kfree_skb(skb); } From 4b5f8e608749b7e8fa386c6e4301cf9272595859 Mon Sep 17 00:00:00 2001 From: Muhammad Bilal Date: Wed, 27 May 2026 04:59:18 +0000 Subject: [PATCH 21/36] Bluetooth: ISO: serialize iso_sock_clear_timer with socket lock iso_sock_close() calls iso_sock_clear_timer() before acquiring lock_sock(sk). iso_sock_clear_timer() reads iso_pi(sk)->conn twice without the socket lock held: if (!iso_pi(sk)->conn) return; cancel_delayed_work(&iso_pi(sk)->conn->timeout_work); Concurrently, iso_conn_del() executes under lock_sock(sk) and calls iso_chan_del(), which sets iso_pi(sk)->conn to NULL and may result in the final reference to the connection being dropped: CPU0 CPU1 ---- ---- iso_sock_clear_timer() if (conn != NULL) ... lock_sock(sk) iso_chan_del() iso_pi(sk)->conn = NULL cancel_delayed_work(conn) /* NULL deref or UAF */ iso_pi(sk)->conn is not stable across the unlock window, causing a NULL pointer dereference or use-after-free. Serialize iso_sock_clear_timer() with the socket lock by moving it inside lock_sock()/release_sock(), matching the pattern used in iso_conn_del() and all other call sites. Fixes: ccf74f2390d60a2f9a75ef496d2564abb478f46a ("Bluetooth: Add BTPROTO_ISO socket type") Cc: stable@vger.kernel.org Signed-off-by: Muhammad Bilal Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index f03b7fa5dccc..876649556d3c 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -864,8 +864,8 @@ static void __iso_sock_close(struct sock *sk) /* Must be called on unlocked socket. */ static void iso_sock_close(struct sock *sk) { - iso_sock_clear_timer(sk); lock_sock(sk); + iso_sock_clear_timer(sk); __iso_sock_close(sk); release_sock(sk); iso_sock_kill(sk); From 40b87657200cfae93e48904fd9c9c8fc3e192cae Mon Sep 17 00:00:00 2001 From: Heitor Alves de Siqueira Date: Tue, 26 May 2026 10:50:57 -0300 Subject: [PATCH 22/36] Bluetooth: hci_core: Rework hci_dev_do_reset() to use hci_sync functions The current HCI reset function in hci_core.c duplicates most of the work done by hci_dev_close_sync(), and doesn't handle LE, advertising or discovery. Instead of porting these to hci_dev_do_reset(), directly call the close/open functions from hci_sync to reset the hdev. MGMT now notifies when a user performs a reset. Suggested-by: Luiz Augusto von Dentz Signed-off-by: Heitor Alves de Siqueira Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 43 +++------------------------------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index c46c1236ebfa..28d7929dc593 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -539,46 +539,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev) hci_req_sync_lock(hdev); - /* Drop queues */ - skb_queue_purge(&hdev->rx_q); - skb_queue_purge(&hdev->cmd_q); - - /* Cancel these to avoid queueing non-chained pending work */ - hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); - /* Wait for - * - * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) - * queue_delayed_work(&hdev->{cmd,ncmd}_timer) - * - * inside RCU section to see the flag or complete scheduling. - */ - synchronize_rcu(); - /* Explicitly cancel works in case scheduled after setting the flag. */ - cancel_delayed_work(&hdev->cmd_timer); - cancel_delayed_work(&hdev->ncmd_timer); - - /* Avoid potential lockdep warnings from the *_flush() calls by - * ensuring the workqueue is empty up front. - */ - drain_workqueue(hdev->workqueue); - - hci_dev_lock(hdev); - hci_inquiry_cache_flush(hdev); - hci_conn_hash_flush(hdev); - hci_dev_unlock(hdev); - - if (hdev->flush) - hdev->flush(hdev); - - hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); - - atomic_set(&hdev->cmd_cnt, 1); - hdev->acl_cnt = 0; - hdev->sco_cnt = 0; - hdev->le_cnt = 0; - hdev->iso_cnt = 0; - - ret = hci_reset_sync(hdev); + ret = hci_dev_close_sync(hdev); + if (!ret) + ret = hci_dev_open_sync(hdev); hci_req_sync_unlock(hdev); return ret; From 525daaea459fc215f432de1b8debbd9144bf97b0 Mon Sep 17 00:00:00 2001 From: Heitor Alves de Siqueira Date: Tue, 26 May 2026 10:50:58 -0300 Subject: [PATCH 23/36] Bluetooth: hci_sync: Set HCI_CMD_DRAIN_WORKQUEUE during device close Since hci_dev_close_sync() can now be called during the reset path, we should also set HCI_CMD_DRAIN_WORKQUEUE. This avoids queuing timeouts while the hdev workqueue is being drained. Fixes: 877afadad2dc ("Bluetooth: When HCI work queue is drained, only queue chained work") Signed-off-by: Heitor Alves de Siqueira Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 1faf8df6d159..0f016d269c62 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5301,6 +5301,12 @@ int hci_dev_close_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); + /* Set HCI_DRAIN_WORKQUEUE flag to prevent queuing work during + * reset/close. See hci_cmd_work() and handle_cmd_cnt_and_timer(). + */ + hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); + synchronize_rcu(); + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { disable_delayed_work(&hdev->power_off); disable_delayed_work(&hdev->ncmd_timer); @@ -5324,6 +5330,7 @@ int hci_dev_close_sync(struct hci_dev *hdev) if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); + hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); return err; } @@ -5423,6 +5430,7 @@ int hci_dev_close_sync(struct hci_dev *hdev) /* Clear flags */ hdev->flags &= BIT(HCI_RAW); hci_dev_clear_volatile_flags(hdev); + hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); From cdf88b35e06f1b385f7f6228060ae541d44fbb72 Mon Sep 17 00:00:00 2001 From: Heitor Alves de Siqueira Date: Tue, 26 May 2026 10:50:59 -0300 Subject: [PATCH 24/36] Bluetooth: hci_sync: Reset device counters in hci_dev_close_sync() Before resetting or closing the device, protocol counters should also be zeroed. Fixes: d0b137062b2d ("Bluetooth: hci_sync: Rework init stages") Signed-off-by: Heitor Alves de Siqueira Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 0f016d269c62..aeccd8084cba 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5393,6 +5393,10 @@ int hci_dev_close_sync(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); + hdev->acl_cnt = 0; + hdev->sco_cnt = 0; + hdev->le_cnt = 0; + hdev->iso_cnt = 0; if (hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE) && !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); From 17bfe0a8c014ee1d542ad352cd6a0a505361664a Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Mon, 25 May 2026 01:08:24 -0700 Subject: [PATCH 25/36] net: mana: Add NULL guards in teardown path to prevent panic on attach failure When queue allocation fails partway through, the error cleanup frees and NULLs apc->tx_qp and apc->rxqs. Multiple teardown paths such as mana_remove(), mana_change_mtu() recovery, and internal error handling in mana_alloc_queues() can subsequently call into functions that dereference these pointers without NULL checks: - mana_chn_setxdp() dereferences apc->rxqs[0], causing a NULL pointer dereference panic (CR2: 0000000000000000 at mana_chn_setxdp+0x26). - mana_destroy_vport() iterates apc->rxqs without a NULL check. - mana_fence_rqs() iterates apc->rxqs without a NULL check. - mana_dealloc_queues() iterates apc->tx_qp without a NULL check. Add NULL guards for apc->rxqs in mana_fence_rqs(), mana_destroy_vport(), and before the mana_chn_setxdp() call. Add a NULL guard for apc->tx_qp in mana_dealloc_queues() to skip TX queue draining when TX queues were never allocated or already freed. Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20260525081129.1230035-2-dipayanroy@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/mana_en.c | 72 +++++++++++-------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 9afc786b297a..9e7e4bf526bf 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1727,6 +1727,9 @@ static void mana_fence_rqs(struct mana_port_context *apc) struct mana_rxq *rxq; int err; + if (!apc->rxqs) + return; + for (rxq_idx = 0; rxq_idx < apc->num_queues; rxq_idx++) { rxq = apc->rxqs[rxq_idx]; err = mana_fence_rq(apc, rxq); @@ -2858,13 +2861,16 @@ static void mana_destroy_vport(struct mana_port_context *apc) struct mana_rxq *rxq; u32 rxq_idx; - for (rxq_idx = 0; rxq_idx < apc->num_queues; rxq_idx++) { - rxq = apc->rxqs[rxq_idx]; - if (!rxq) - continue; + if (apc->rxqs) { - mana_destroy_rxq(apc, rxq, true); - apc->rxqs[rxq_idx] = NULL; + for (rxq_idx = 0; rxq_idx < apc->num_queues; rxq_idx++) { + rxq = apc->rxqs[rxq_idx]; + if (!rxq) + continue; + + mana_destroy_rxq(apc, rxq, true); + apc->rxqs[rxq_idx] = NULL; + } } mana_destroy_txq(apc); @@ -3269,7 +3275,8 @@ static int mana_dealloc_queues(struct net_device *ndev) if (apc->port_is_up) return -EINVAL; - mana_chn_setxdp(apc, NULL); + if (apc->rxqs) + mana_chn_setxdp(apc, NULL); if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) mana_pf_deregister_filter(apc); @@ -3287,33 +3294,38 @@ static int mana_dealloc_queues(struct net_device *ndev) * number of queues. */ - for (i = 0; i < apc->num_queues; i++) { - txq = &apc->tx_qp[i].txq; - tsleep = 1000; - while (atomic_read(&txq->pending_sends) > 0 && - time_before(jiffies, timeout)) { - usleep_range(tsleep, tsleep + 1000); - tsleep <<= 1; - } - if (atomic_read(&txq->pending_sends)) { - err = pcie_flr(to_pci_dev(gd->gdma_context->dev)); - if (err) { - netdev_err(ndev, "flr failed %d with %d pkts pending in txq %u\n", - err, atomic_read(&txq->pending_sends), - txq->gdma_txq_id); + if (apc->tx_qp) { + for (i = 0; i < apc->num_queues; i++) { + txq = &apc->tx_qp[i].txq; + tsleep = 1000; + while (atomic_read(&txq->pending_sends) > 0 && + time_before(jiffies, timeout)) { + usleep_range(tsleep, tsleep + 1000); + tsleep <<= 1; } - break; + if (atomic_read(&txq->pending_sends)) { + err = + pcie_flr(to_pci_dev(gd->gdma_context->dev)); + if (err) { + netdev_err(ndev, "flr failed %d with %d pkts pending in txq %u\n", + err, + atomic_read(&txq->pending_sends), + txq->gdma_txq_id); + } + break; + } + } + + for (i = 0; i < apc->num_queues; i++) { + txq = &apc->tx_qp[i].txq; + while ((skb = skb_dequeue(&txq->pending_skbs))) { + mana_unmap_skb(skb, apc); + dev_kfree_skb_any(skb); + } + atomic_set(&txq->pending_sends, 0); } } - for (i = 0; i < apc->num_queues; i++) { - txq = &apc->tx_qp[i].txq; - while ((skb = skb_dequeue(&txq->pending_skbs))) { - mana_unmap_skb(skb, apc); - dev_kfree_skb_any(skb); - } - atomic_set(&txq->pending_sends, 0); - } /* We're 100% sure the queues can no longer be woken up, because * we're sure now mana_poll_tx_cq() can't be running. */ From 5b05aa36ee24297d7296ca58dfd8c448d0e4cda3 Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Mon, 25 May 2026 01:08:25 -0700 Subject: [PATCH 26/36] net: mana: Skip redundant detach on already-detached port When mana_per_port_queue_reset_work_handler() runs after a previous detach succeeded but attach failed, the port is left in a detached state with apc->tx_qp and apc->rxqs already freed. Calling mana_detach() again unconditionally leads to NULL pointer dereferences during queue teardown. Add an early exit in mana_detach() when the port is already in detached state (!netif_device_present) for non-close callers, making it safe to call idempotently. This allows the queue reset handler and other recovery paths to simply retry mana_attach() without redundant teardown. Fixes: 3b194343c250 ("net: mana: Implement ndo_tx_timeout and serialize queue resets per port.") Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20260525081129.1230035-3-dipayanroy@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/mana_en.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 9e7e4bf526bf..c9b1df1ed109 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -3350,6 +3350,12 @@ int mana_detach(struct net_device *ndev, bool from_close) ASSERT_RTNL(); + /* If already detached (indicates detach succeeded but attach failed + * previously). Now skip mana detach and just retry mana_attach. + */ + if (!from_close && !netif_device_present(ndev)) + return 0; + apc->port_st_save = apc->port_is_up; apc->port_is_up = false; From f14fe6395a8b3d961a61e138ad7b36ba3626dd4e Mon Sep 17 00:00:00 2001 From: Zhenghang Xiao Date: Wed, 27 May 2026 11:24:11 +0800 Subject: [PATCH 27/36] sctp: fix race between sctp_wait_for_connect and peeloff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sctp_wait_for_connect() drops and re-acquires the socket lock while waiting for the association to reach ESTABLISHED state. During this window, another thread can peeloff the association to a new socket via getsockopt(SCTP_SOCKOPT_PEELOFF), changing asoc->base.sk. After re-acquiring the old socket lock, sctp_wait_for_connect() returns success without noticing the migration — the caller then accesses the association under the wrong lock in sctp_datamsg_from_user(). Add the same sk != asoc->base.sk check that sctp_wait_for_sndbuf() already has, returning an error if the association was migrated while we slept. Fixes: 668c9beb9020 ("sctp: implement assign_number for sctp_stream_interleave") Signed-off-by: Zhenghang Xiao Acked-by: Xin Long Link: https://patch.msgid.link/20260527032411.60959-1-kipreyyy@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/socket.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1d2568bb6bc2..66e12fb0c646 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9403,6 +9403,8 @@ static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p) release_sock(sk); current_timeo = schedule_timeout(current_timeo); lock_sock(sk); + if (sk != asoc->base.sk) + goto do_error; *timeo_p = current_timeo; } From 422b5233b607476ac7176bfa2a101b9a103d7653 Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Tue, 26 May 2026 17:32:38 +0200 Subject: [PATCH 28/36] net: pcs: pcs-mtk-lynxi: fix bpi-r3 serdes configuration Commit 8871389da151 introduces common pcs dts properties which writes rx=normal,tx=normal polarity to register SGMSYS_QPHY_WRAP_CTRL of switch. This is initialized with tx-bit set and so change inverts polarity compared to before. It looks like mt7531 has tx polarity inverted in hardware and set tx-bit by default to restore the normal polarity. The MT7531 datasheet quite clearly states: Register 000050EC QPHY_WRAP_CTRL -- QPHY wrapper control Reset value: 0x00000501 BIT 1 RX_BIT_POLARITY -- RX bit polarity control 1'b0: normal 1'b1: inverted BIT 0 TX_BIT_POLARITY -- TX bit polarity control (TX default inversed in MT7531) 1'b0: normal 1'b1: inverted Till this patch the register write was only called when mediatek,pnswap property was set which cannot be done for switch because the fw-node param was always NULL from switch driver in the mtk_pcs_lynxi_create call. Do not configure switch side like it's done before. Fixes: 8871389da151 ("net: pcs: pcs-mtk-lynxi: deprecate "mediatek,pnswap"") Signed-off-by: Frank Wunderlich Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20260526153239.30194-1-linux@fw-web.de Signed-off-by: Jakub Kicinski --- drivers/net/pcs/pcs-mtk-lynxi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/pcs/pcs-mtk-lynxi.c b/drivers/net/pcs/pcs-mtk-lynxi.c index c12f8087af9b..a753bd88cbc2 100644 --- a/drivers/net/pcs/pcs-mtk-lynxi.c +++ b/drivers/net/pcs/pcs-mtk-lynxi.c @@ -129,6 +129,9 @@ static int mtk_pcs_config_polarity(struct mtk_pcs_lynxi *mpcs, unsigned int val = 0; int ret; + if (!fwnode) + return 0; + if (fwnode_property_read_bool(fwnode, "mediatek,pnswap")) default_pol = PHY_POL_INVERT; From 6851161feb01cea41358c9ec304bd2f981fc8505 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 29 May 2026 10:23:25 +0200 Subject: [PATCH 29/36] Revert "esp: fix page frag reference leak on skb_to_sgvec failure" This reverts commit 2982e599fff6faa21c8df147d96fc7af6c1a2f24. The patch does not fully fix the issue and the Author does not match the 'Signed-off-by:' tag, so revert it for now. Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 12 +++++------- net/ipv6/esp6.c | 12 +++++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5d3a8656687e..513c8215c947 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -96,7 +96,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, bool already_unref) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -113,7 +113,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, /* Unref skb_frag_pages in the src scatterlist if necessary. * Skip the first sg which comes from skb->data. */ - if (already_unref || req->src != req->dst) + if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) skb_page_unref(page_to_netmem(sg_page(sg)), skb->pp_recycle); @@ -220,7 +220,7 @@ static void esp_output_done(void *data, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp, skb, false); + esp_ssg_unref(x, tmp, skb); kfree(tmp); if (xo && (xo->flags & XFRM_DEV_RESUME)) { @@ -569,10 +569,8 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * err = skb_to_sgvec(skb, dsg, (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); - if (unlikely(err < 0)) { - esp_ssg_unref(x, tmp, skb, true); + if (unlikely(err < 0)) goto error_free; - } } if ((x->props.flags & XFRM_STATE_ESN)) @@ -604,7 +602,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } if (sg != dsg) - esp_ssg_unref(x, tmp, skb, false); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b963b8e72604..57481e423e59 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -113,7 +113,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, bool already_unref) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -130,7 +130,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, /* Unref skb_frag_pages in the src scatterlist if necessary. * Skip the first sg which comes from skb->data. */ - if (already_unref || req->src != req->dst) + if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) skb_page_unref(page_to_netmem(sg_page(sg)), skb->pp_recycle); @@ -254,7 +254,7 @@ static void esp_output_done(void *data, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp, skb, false); + esp_ssg_unref(x, tmp, skb); kfree(tmp); esp_output_encap_csum(skb); @@ -600,10 +600,8 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info err = skb_to_sgvec(skb, dsg, (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); - if (unlikely(err < 0)) { - esp_ssg_unref(x, tmp, skb, true); + if (unlikely(err < 0)) goto error_free; - } } if ((x->props.flags & XFRM_STATE_ESN)) @@ -636,7 +634,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } if (sg != dsg) - esp_ssg_unref(x, tmp, skb, false); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); From 1e584c304cfb94a759417130b1fc6d30b30c4cce Mon Sep 17 00:00:00 2001 From: Jingguo Tan Date: Wed, 27 May 2026 10:33:01 +0800 Subject: [PATCH 30/36] vsock/virtio: bind uarg before filling zerocopy skb virtio_transport_send_pkt_info() allocates or reuses the zerocopy uarg before entering the send loop, but virtio_transport_alloc_skb() still fills the skb before it inherits that uarg. When fixed-buffer vectored zerocopy hits MAX_SKB_FRAGS, io_sg_from_iter() may partially attach managed frags and return -EMSGSIZE. The rollback path call kfree_skb() to free an skb that carries SKBFL_MANAGED_FRAG_REFS but no uarg, so skb_release_data() falls through to ordinary frag unref. Pass the uarg into virtio_transport_alloc_skb() and bind it immediately before virtio_transport_fill_skb(). This keeps control or no-payload skbs untouched while ensuring success and rollback share one lifetime rule. Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support") Signed-off-by: Lin Ma Signed-off-by: Rongzhen Cui Signed-off-by: Jingguo Tan Acked-by: Arseniy Krasnov Acked-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20260527023301.1075581-1-malin89@huawei.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport_common.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index b143290a311d..b10666937c49 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -205,6 +205,7 @@ static u16 virtio_transport_get_type(struct sock *sk) static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, size_t payload_len, bool zcopy, + struct ubuf_info *uarg, u32 src_cid, u32 src_port, u32 dst_cid, @@ -245,6 +246,12 @@ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info * if (info->msg && payload_len > 0) { int err; + /* Bind the zerocopy lifetime before filling frags so error + * rollback frees managed fixed-buffer pages through + * the uarg-aware path. + */ + skb_zcopy_set(skb, uarg, NULL); + err = virtio_transport_fill_skb(skb, info, payload_len, zcopy); if (err) goto out; @@ -364,6 +371,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, skb_len = min(max_skb_len, rest_len); skb = virtio_transport_alloc_skb(info, skb_len, can_zcopy, + uarg, src_cid, src_port, dst_cid, dst_port); if (!skb) { @@ -371,8 +379,6 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, break; } - skb_zcopy_set(skb, uarg, NULL); - virtio_transport_inc_tx_pkt(vvs, skb); ret = t_ops->send_pkt(skb, info->net); @@ -1183,7 +1189,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, if (!t) return -ENOTCONN; - reply = virtio_transport_alloc_skb(&info, 0, false, + reply = virtio_transport_alloc_skb(&info, 0, false, NULL, le64_to_cpu(hdr->dst_cid), le32_to_cpu(hdr->dst_port), le64_to_cpu(hdr->src_cid), From f72eed9b84fb771019a955908132410a9ba9ea3f Mon Sep 17 00:00:00 2001 From: Yuqi Xu Date: Wed, 27 May 2026 11:48:15 +0800 Subject: [PATCH 31/36] bpf: sockmap: fix tail fragment offset in bpf_msg_push_data When bpf_msg_push_data() inserts data in the middle of a scatterlist entry, it splits the original entry into a left fragment and a right fragment. The right fragment offset is page-local, but the code advances it with `start`, which is the message-global insertion point. For inserts into a non-first SG entry, this over-advances the offset and leaves the split layout inconsistent. Advance the right fragment offset by the fragment-local delta, `start - offset`, which matches the length removed from the front of the original entry. Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Zhengchuan Liang Reported-by: Xin Liu Signed-off-by: Yuqi Xu Signed-off-by: Ren Wei Link: https://patch.msgid.link/8b129d10566aa3eb43f61a8f9757bcf51707d324.1779636774.git.xuyq21@lenovo.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 9590877b0714..80439767e0ee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2869,7 +2869,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, psge->length = start - offset; rsge.length -= psge->length; - rsge.offset += start; + rsge.offset += start - offset; sk_msg_iter_var_next(i); sg_unmark_end(psge); From 9f72412bcf60144f252b0d6205106abf14344abc Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 27 May 2026 13:31:30 +0800 Subject: [PATCH 32/36] ipv6: fix possible infinite loop in rt6_fill_node() Sashiko reported this issue [1]. Apply the same fix as commit f8d8ce1b515a ("ipv6: fix possible infinite loop in fib6_info_uses_dev()"). Writers holding tb6_lock can list_del_rcu(&rt->fib6_siblings) without waiting for RCU readers; rt->fib6_siblings.next then still points into the old ring and this softirq-side walker never reaches &rt->fib6_siblings, causing a CPU stall. fib6_del_route() always WRITE_ONCE()s rt->fib6_nsiblings to 0 before list_del_rcu(), so an inside-loop check is a reliable detach signal. [1] https://sashiko.dev/#/patchset/20260526020227.4857-1-jiayuan.chen%40linux.dev Fixes: d9ccb18f83ea ("ipv6: Fix soft lockups in fib6_select_path under high next hop churn") Signed-off-by: Jiayuan Chen Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260527053133.180695-1-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b106e5fef9cb..dad416fdc585 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5902,6 +5902,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; } + if (!READ_ONCE(rt->fib6_nsiblings)) + break; } rcu_read_unlock(); From 9c7da87c2dc860bb17ca1ece942495d28b1ce3b9 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 27 May 2026 13:31:31 +0800 Subject: [PATCH 33/36] ipv6: fix possible infinite loop in fib6_select_path() Found while auditing the same pattern Sashiko reported in rt6_fill_node() [1]. Apply the same fix as commit f8d8ce1b515a ("ipv6: fix possible infinite loop in fib6_info_uses_dev()"). Writers holding tb6_lock can list_del_rcu(&first->fib6_siblings) without waiting for RCU readers; first->fib6_siblings.next then still points into the old ring and this softirq-side walker never reaches &first->fib6_siblings as its terminator. fib6_purge_rt() always WRITE_ONCE()s first->fib6_nsiblings to 0 before list_del_rcu(), so an inside-loop check is a reliable detach signal. [1] https://sashiko.dev/#/patchset/20260526020227.4857-1-jiayuan.chen%40linux.dev Fixes: d9ccb18f83ea ("ipv6: Fix soft lockups in fib6_select_path under high next hop churn") Signed-off-by: Jiayuan Chen Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260527053133.180695-2-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dad416fdc585..636f0120d7e3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -481,6 +481,9 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; + if (!READ_ONCE(first->fib6_nsiblings)) + break; + nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (hash > nh_upper_bound) continue; From ff6e798c2eac3ebd0501ad7e796f583fab928de8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 28 May 2026 19:43:53 +0100 Subject: [PATCH 34/36] net: skbuff: fix pskb_carve leaking zcopy pages When SKBFL_MANAGED_FRAG_REFS is set, frag pages are not refcounted but their lifetime is controlled by the attached ubuf_info. To make a copy of the skb_shared_info, we either should clear the flag and reference the frags, or keep the flag and have frags unreferenced. pskb_carve_inside_header() and pskb_carve_inside_nonlinear() don't follow the rule and thus can leak page references. Let's clear SKBFL_MANAGED_FRAG_REFS from the original skb to fix it. It's the simplest way to address it, but there are more performant ways to do that if it ever becomes a problem. Link: https://lore.kernel.org/all/20260523085809.26331-1-nvminh232@clc.fitus.edu.vn/ Fixes: 753f1ca4e1e50 ("net: introduce managed frags infrastructure") Reported-by: Minh Nguyen Reported-by: Willem de Bruijn Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/1e2086aa69217d7f9c8da3d38f5be7160f1b4cd1.1779993185.git.asml.silence@gmail.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0d3cc115f2e7..c02f0a507ba8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6823,6 +6823,11 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_copy_from_linear_data_offset(skb, off, data, new_hlen); skb->len -= off; + /* Remove SKBFL_MANAGED_FRAG_REFS instead of trying to honour it + * while refcounting frags below. + */ + skb_zcopy_downgrade_managed(skb); + memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, @@ -6936,6 +6941,11 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, return -ENOMEM; size = SKB_WITH_OVERHEAD(size); + /* Remove SKBFL_MANAGED_FRAG_REFS instead of trying to honour it + * while refcounting frags below. + */ + skb_zcopy_downgrade_managed(skb); + memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); if (skb_orphan_frags(skb, gfp_mask)) { From 072aa0f5c3d8f11f3159037418ec45edce7440b8 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Fri, 29 May 2026 13:23:57 +0200 Subject: [PATCH 35/36] Revert "ipv6: preserve insertion order for same-scope addresses" Chris Adams reported that preserving insertion order for same-scope addresses is causing SSH connections to be dropped after stopping a VM while running NetworkManager. NetworkManager caches the IPv6 address configuration, when a RA arrives, it determines the list of addresses to configure and checks if the addresses are already in the right order in the kernel. If they aren't, NetworkManager removes and re-adds them to achieve the desired order. As the order changes, NetworkManager is confused and reconfigures the addresses on every update. In addition, this would also affect to cloud tooling that relies on IPv6 addresses order to identify primary and secondaries addresses. This reverts commit cb3de96eea66f5e4a580086c6a1be46e765f97f4. Fixes: cb3de96eea66 ("ipv6: preserve insertion order for same-scope addresses") Reported-by: Chris Adams Closes: https://lore.kernel.org/netdev/20260521135310.GC977@cmadams.net/ Signed-off-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20260529112357.5079-1-fmancera@suse.de Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 2 +- tools/testing/selftests/net/ioam6.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5476b6536eb7..bb84a78b80f6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1013,7 +1013,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_for_each(p, &idev->addr_list) { struct inet6_ifaddr *ifa = list_entry(p, struct inet6_ifaddr, if_list); - if (ifp_scope > ipv6_addr_src_scope(&ifa->addr)) + if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) break; } diff --git a/tools/testing/selftests/net/ioam6.sh b/tools/testing/selftests/net/ioam6.sh index b2b99889942f..845c26dd01a9 100755 --- a/tools/testing/selftests/net/ioam6.sh +++ b/tools/testing/selftests/net/ioam6.sh @@ -273,8 +273,8 @@ setup() ip -netns $ioam_node_beta link set ioam-veth-betaR name veth1 &>/dev/null ip -netns $ioam_node_gamma link set ioam-veth-gamma name veth0 &>/dev/null - ip -netns $ioam_node_alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha addr add 2001:db8:1::50/64 dev veth0 &>/dev/null + ip -netns $ioam_node_alpha addr add 2001:db8:1::2/64 dev veth0 &>/dev/null ip -netns $ioam_node_alpha link set veth0 up &>/dev/null ip -netns $ioam_node_alpha link set lo up &>/dev/null ip -netns $ioam_node_alpha route add 2001:db8:2::/64 \ From f75e3eb08fe31d30a9af6ed80cdd22e6772837e2 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 29 May 2026 19:31:34 +0200 Subject: [PATCH 36/36] wireguard: send: append trailer after expanding head With how this is currently written, we add the trailer, zero it out, and then add the header space on. If that header space requires a reallocation + copy, the zeros in the trailer aren't copied, because the skb len hasn't actually been yet expanded to cover that. Instead add the padding at the end of the process rather than at the beginning. Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Cc: stable@vger.kernel.org Signed-off-by: Jason A. Donenfeld Link: https://patch.msgid.link/20260529173134.3080773-2-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/send.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index 26e09c30d596..67d01478eb76 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -177,16 +177,6 @@ static bool encrypt_packet(struct sk_buff *skb, struct noise_keypair *keypair) trailer_len = padding_len + noise_encrypted_len(0); plaintext_len = skb->len + padding_len; - /* Expand data section to have room for padding and auth tag. */ - num_frags = skb_cow_data(skb, trailer_len, &trailer); - if (unlikely(num_frags < 0 || num_frags > ARRAY_SIZE(sg))) - return false; - - /* Set the padding to zeros, and make sure it and the auth tag are part - * of the skb. - */ - memset(skb_tail_pointer(trailer), 0, padding_len); - /* Expand head section to have room for our header and the network * stack's headers. */ @@ -198,6 +188,16 @@ static bool encrypt_packet(struct sk_buff *skb, struct noise_keypair *keypair) skb_checksum_help(skb))) return false; + /* Expand data section to have room for padding and auth tag. */ + num_frags = skb_cow_data(skb, trailer_len, &trailer); + if (unlikely(num_frags < 0 || num_frags > ARRAY_SIZE(sg))) + return false; + + /* Set the padding to zeros, and make sure it and the auth tag are part + * of the skb. + */ + memset(skb_tail_pointer(trailer), 0, padding_len); + /* Only after checksumming can we safely add on the padding at the end * and the header. */