From 7e2a4f7ca0952820731ef7bdadfc9a9e9d3571b4 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Mon, 4 May 2026 22:27:36 +0800 Subject: [PATCH 01/10] xfrm: route MIGRATE notifications to caller's netns xfrm_send_migrate() in net/xfrm/xfrm_user.c and pfkey_send_migrate() in net/key/af_key.c both hardcode &init_net for the multicast that announces a successful XFRM_MSG_MIGRATE / SADB_X_MIGRATE. XFRM_MSG_MIGRATE arrives on a per-netns NETLINK_XFRM socket, and the rest of the xfrm/af_key netlink path was made netns-aware in 2008. The other 14 multicast paths in xfrm_user.c route their event using xs_net(x), xp_net(xp) or sock_net(skb->sk); only the migrate path was missed. Two consequences of the init_net hardcoding: 1. The notification (selector, old/new endpoint addresses, and the km_address) is delivered to listeners on init_net's XFRMNLGRP_MIGRATE / pfkey BROADCAST_ALL groups rather than on the issuing netns. An IKE daemon running in init_net therefore receives migration notifications originating from any other netns on the host. 2. An IKE daemon running inside a non-init netns and subscribed to its own XFRMNLGRP_MIGRATE / pfkey groups never receives the notification of its own migration. IKEv2 MOBIKE / address-update handling inside a netns is silently broken. Thread struct net through km_migrate() and the xfrm_mgr.migrate function pointer, drop the &init_net override in xfrm_send_migrate() and pfkey_send_migrate(), and pass the caller's net (already in scope in xfrm_migrate() via sock_net(skb->sk)) all the way down. struct xfrm_mgr is in-tree only and not exported as a stable API, so the function-pointer signature change is internal. pfkey_broadcast() is already netns-aware via net_generic(net, pfkey_net_id) since the pernet conversion. The five other pfkey_broadcast() callers in af_key.c already pass xs_net(x), sock_net(sk) or a per-netns net, so this only removes the &init_net outlier. Fixes: 5c79de6e79cd ("[XFRM]: User interface for handling XFRM_MSG_MIGRATE") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Maoyi Xie Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/key/af_key.c | 6 +++--- net/xfrm/xfrm_policy.c | 2 +- net/xfrm/xfrm_state.c | 4 ++-- net/xfrm/xfrm_user.c | 5 ++--- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 10d3edde6b2f..874409127e29 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -715,6 +715,7 @@ struct xfrm_mgr { const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, + struct net *net, const struct xfrm_encap_tmpl *encap); bool (*is_alive)(const struct km_event *c); }; @@ -1891,7 +1892,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap); struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, u32 if_id); diff --git a/net/key/af_key.c b/net/key/af_key.c index a166a88d8788..9cffeef18cd9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3564,7 +3564,7 @@ static int set_ipsecrequest(struct sk_buff *skb, #ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { int i; @@ -3669,7 +3669,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* broadcast migrate message to sockets */ - pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, net); return 0; @@ -3680,7 +3680,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, #else static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c944327ce66c..59968dcbafe1 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4703,7 +4703,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 5 - announce */ - km_migrate(sel, dir, type, m, num_migrate, k, encap); + km_migrate(sel, dir, type, m, num_migrate, k, net, encap); xfrm_pol_put(pol); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 686014d39429..395d82411a87 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2837,7 +2837,7 @@ EXPORT_SYMBOL(km_policy_expired); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { int err = -EINVAL; @@ -2848,7 +2848,7 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->migrate) { ret = km->migrate(sel, dir, type, m, num_migrate, k, - encap); + net, encap); if (!ret) err = ret; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 38a90e5ee3d9..71a4b7278eba 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3271,10 +3271,9 @@ static int build_migrate(struct sk_buff *skb, const struct xfrm_migrate *m, static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { - struct net *net = &init_net; struct sk_buff *skb; int err; @@ -3292,7 +3291,7 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, #else static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; From 7dbac7680eb629b3b4dc7e98c34f943b8814c0c8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 6 May 2026 21:23:28 +0800 Subject: [PATCH 02/10] xfrm: ipcomp: Free destination pages on acomp errors Move the out_free_req label up by a couple of lines so that the allocated dst SG list gets freed on error as well as success. Fixes: eb2953d26971 ("xfrm: ipcomp: Use crypto_acomp interface") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Reported-by: Yilin Zhu Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_ipcomp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 5f38dff16177..671d48f8c937 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -51,11 +51,15 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen) struct scatterlist *dsg; int len, dlen; - if (unlikely(err)) - goto out_free_req; + if (unlikely(!req)) + return err; extra = acomp_request_extra(req); dsg = extra->sg; + + if (unlikely(err)) + goto out_free_req; + dlen = req->dlen; pskb_trim_unique(skb, 0); @@ -84,10 +88,10 @@ static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen) skb_shinfo(skb)->nr_frags++; } while ((dlen -= len)); - for (; dsg; dsg = sg_next(dsg)) +out_free_req: + for (; dsg && sg_page(dsg); dsg = sg_next(dsg)) __free_page(sg_page(dsg)); -out_free_req: acomp_request_free(req); return err; } From 742b04d0550b0ec89dcbc99537ec88653bd1ad90 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 13 May 2026 10:49:14 -0600 Subject: [PATCH 03/10] xfrm: Check for underflow in xfrm_state_mtu Leo Lin reported OOB write issue in esp component: xfrm_state_mtu() returns u32 but performs its arithmetic in unsigned modulo-2^32 space using an attacker-influenced "header_len + authsize + net_adj" subtracted from a small "mtu" argument. A nobody user can install an IPv4 ESP tunnel SA with a large authentication key (XFRMA_ALG_AUTH_TRUNC, e.g. hmac(sha512), 64-byte key, 64-byte trunc), configure a small interface MTU (68 bytes), and set XFRMA_TFCPAD to a large value. When a single UDP datagram is then sent through the tunnel, xfrm_state_mtu() underflows to a near-2^32 value, and esp_output() consumes it as a signed int via: padto = min(x->tfcpad, xfrm_state_mtu(x, mtu_cached)) esp.tfclen = padto - skb->len (assigned to int) esp.tfclen ends up negative (e.g. -207). It is sign-extended to size_t when passed to memset() inside esp_output_fill_trailer(), producing a ~16 EB write of zeroes at skb_tail_pointer(skb). KASAN logs it as "Write of size 18446744073709551537 at addr ffff888...". Check for underflow and return 1. This causes the sendmsg attempt to fail with ENETUNREACH. Fixes: c5c252389374 ("[XFRM]: Optimize MTU calculation") Reported-by: Leo Lin Assisted-by: Codex:26.506.31004 Signed-off-by: David Ahern Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 395d82411a87..589c3b6e4679 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -3114,10 +3114,14 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; u32 blksize, net_adj = 0; + u32 overhead, payload_mtu; if (x->km.state != XFRM_STATE_VALID || - !type || type->proto != IPPROTO_ESP) + !type || type->proto != IPPROTO_ESP) { + if (mtu <= x->props.header_len) + return 1; return mtu - x->props.header_len; + } aead = x->data; blksize = ALIGN(crypto_aead_blocksize(aead), 4); @@ -3140,8 +3144,17 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) break; } - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; + overhead = x->props.header_len + crypto_aead_authsize(aead) + net_adj; + if (mtu <= overhead) + return 1; + + payload_mtu = mtu - overhead; + payload_mtu &= ~(blksize - 1); + if (payload_mtu <= 2) + return 1; + + return payload_mtu + net_adj - 2; + } EXPORT_SYMBOL_GPL(xfrm_state_mtu); From 79d8be262377f7112cfa3088dfc4142d5a2533f3 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 15 May 2026 11:45:31 -0400 Subject: [PATCH 04/10] xfrm: ah: use skb_to_full_sk in async output callbacks When AH output is offloaded to an asynchronous crypto provider (hardware accelerators such as AMD CCP, or a forced-async software shim used for testing), the digest completion fires ah_output_done() / ah6_output_done() on a workqueue. The egress skb at that point may have been originated by a TCP listener sending a SYN-ACK, which sets skb->sk to a request_sock via skb_set_owner_edemux(); it may also have been originated by an inet_timewait_sock retransmit. Neither is a full struct sock, and passing the raw skb->sk to xfrm_output_resume() then forwards a non-full socket through the rest of the xfrm output chain. xfrm_output_resume() and its downstream consumers expect a full sk where they dereference at all. The natural egress path through ah_output_done() does not crash today because the consumers that read past sock_common are either gated by sk_fullsock() or short-circuit on flags that are clear on a fresh request_sock; an exhaustive walk of the 50 most plausible consumers under sch_fq, dev_queue_xmit, netfilter, tc-egress and cgroup-egress BPF found no current unguarded deref. The bug is still a real type confusion that future consumer changes could turn into a memory-corruption primitive. This is the same bug class fixed for ESP in commit 1620c88887b1 ("xfrm: Fix the usage of skb->sk"). Apply the analogous fix to AH: convert skb->sk to a full socket pointer (or NULL) via skb_to_full_sk() before handing it to xfrm_output_resume(). The same async AH callbacks were touched recently for an independent ESN-related ICV layout bug in commit ec54093e6a8f ("xfrm: ah: account for ESN high bits in async callbacks"); the sk type-confusion addressed here is orthogonal. This patch is part of an ongoing audit of the AH callback paths; an ah_output ihl-validation hardening series is also currently under review on netdev. Reproduced under UML + KASAN + lockdep with a forced-async hmac(sha1) shim that registers at priority 9999 and wraps the sync in-tree hmac-sha1-lib. With the shim loaded, ah_output_done runs on every SYN-ACK egress through a transport-mode AH SA and skb->sk arrives as a request_sock (TCP_NEW_SYN_RECV); after this patch, xfrm_output_resume() receives the listener (the result of sk_to_full_sk()) and consumer derefs land on full-sock fields as intended. Fixes: 9ab1265d5231 ("xfrm: Use actual socket sk instead of skb socket for xfrm_output_resume") Cc: stable@vger.kernel.org Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Signed-off-by: Steffen Klassert --- net/ipv4/ah4.c | 2 +- net/ipv6/ah6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 4366cbac3f06..6fd642d2278d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -143,7 +143,7 @@ static void ah_output_done(void *data, int err) } kfree(AH_SKB_CB(skb)->tmp); - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } static int ah_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index de1e68199a01..76f7a2de9108 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -337,7 +337,7 @@ static void ah6_output_done(void *data, int err) ah6_restore_hdrs(top_iph, iph_ext, extlen); kfree(AH_SKB_CB(skb)->tmp); - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) From 2982e599fff6faa21c8df147d96fc7af6c1a2f24 Mon Sep 17 00:00:00 2001 From: e521588 Date: Wed, 20 May 2026 09:27:17 +0200 Subject: [PATCH 05/10] esp: fix page frag reference leak on skb_to_sgvec failure In esp_output_tail(), when esp->inplace is false, the old skb page frags are replaced with a new page from the xfrm page_frag cache. The source scatterlist (sg) is built from the old frags before the replacement, and esp_ssg_unref() is responsible for releasing the old page references after the crypto operation completes. However, if the second skb_to_sgvec() call (which builds the destination scatterlist from the new page) fails, the code jumps to error_free which only calls kfree(tmp). The old page frag references captured in the source scatterlist are never released: 1. sg[] is built from old frags via skb_to_sgvec() (no extra get_page) 2. nr_frags is set to 1 and frag[0] is replaced with the new page 3. Second skb_to_sgvec() fails -> goto error_free 4. kfree(tmp) frees the sg[] memory but old frags are not unref'd 5. kfree_skb() only releases frag[0] (the new page), not the old ones Fix this by adding a bool parameter to esp_ssg_unref() that, when true, unconditionally unrefs the source scatterlist frags without checking req->src and req->dst, since those fields are not yet initialized by aead_request_set_crypt() at the point of the error. Existing callers pass false to preserve the original behavior. The same issue exists in both esp4 and esp6 as the code is identical. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Alessandro Schino <7991aleschino@gmail.com> Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 12 +++++++----- net/ipv6/esp6.c | 12 +++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 6a5febbdbee4..8314d7bddcb7 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -96,7 +96,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, bool already_unref) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -113,7 +113,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) /* Unref skb_frag_pages in the src scatterlist if necessary. * Skip the first sg which comes from skb->data. */ - if (req->src != req->dst) + if (already_unref || req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) skb_page_unref(page_to_netmem(sg_page(sg)), skb->pp_recycle); @@ -220,7 +220,7 @@ static void esp_output_done(void *data, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp, skb); + esp_ssg_unref(x, tmp, skb, false); kfree(tmp); if (xo && (xo->flags & XFRM_DEV_RESUME)) { @@ -569,8 +569,10 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * err = skb_to_sgvec(skb, dsg, (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); - if (unlikely(err < 0)) + if (unlikely(err < 0)) { + esp_ssg_unref(x, tmp, skb, true); goto error_free; + } } if ((x->props.flags & XFRM_STATE_ESN)) @@ -602,7 +604,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } if (sg != dsg) - esp_ssg_unref(x, tmp, skb); + esp_ssg_unref(x, tmp, skb, false); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 9c06c5a1419d..9d0c4957ac62 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -113,7 +113,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, bool already_unref) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -130,7 +130,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) /* Unref skb_frag_pages in the src scatterlist if necessary. * Skip the first sg which comes from skb->data. */ - if (req->src != req->dst) + if (already_unref || req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) skb_page_unref(page_to_netmem(sg_page(sg)), skb->pp_recycle); @@ -254,7 +254,7 @@ static void esp_output_done(void *data, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp, skb); + esp_ssg_unref(x, tmp, skb, false); kfree(tmp); esp_output_encap_csum(skb); @@ -600,8 +600,10 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info err = skb_to_sgvec(skb, dsg, (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); - if (unlikely(err < 0)) + if (unlikely(err < 0)) { + esp_ssg_unref(x, tmp, skb, true); goto error_free; + } } if ((x->props.flags & XFRM_STATE_ESN)) @@ -634,7 +636,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } if (sg != dsg) - esp_ssg_unref(x, tmp, skb); + esp_ssg_unref(x, tmp, skb, false); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); From dfa0d7b0ff1eb6b2c416b8fdb9b4f2cefba57a40 Mon Sep 17 00:00:00 2001 From: Jingguo Tan Date: Mon, 18 May 2026 17:06:48 +0800 Subject: [PATCH 06/10] xfrm: esp: restore combined single-frag length gate The ESP out-of-place fast path appends the trailer in esp_output_head() before esp_output_tail() allocates the destination page frag. The head-side gate currently checks skb->data_len and tailen separately, but the tail code allocates a single destination frag from the combined post-trailer skb->data_len. Reject the page-frag fast path when the combined aligned length exceeds a page. Otherwise skb_page_frag_refill() may fall back to a single page while the destination sg still spans the combined skb->data_len. Restore this combined-length page gate for both IPv4 and IPv6. Fixes: 5bd8baab087d ("esp: limit skb_page_frag_refill use to a single page") Cc: stable@vger.kernel.org Signed-off-by: Lin Ma Signed-off-by: Chenyuan Mi Signed-off-by: Jingguo Tan Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 4 ++-- net/ipv6/esp6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 8314d7bddcb7..5d3a8656687e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -419,8 +419,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || - ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) + if (ALIGN(skb->data_len + tailen, L1_CACHE_BYTES) > + PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 9d0c4957ac62..b963b8e72604 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -448,8 +448,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || - ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) + if (ALIGN(skb->data_len + tailen, L1_CACHE_BYTES) > + PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { From 7f83d174073234839aea176f265e517e0d50a1d2 Mon Sep 17 00:00:00 2001 From: Shaomin Chen Date: Thu, 21 May 2026 02:07:23 +0800 Subject: [PATCH 07/10] xfrm: iptfs: reset runtime state when cloning SAs iptfs_clone_state() clones the IPTFS mode data with kmemdup(). This copies runtime objects which must not be shared with the original SA, including the embedded sk_buff_head, hrtimers, spinlock, and in-flight reassembly/reorder state. If xfrm_state_migrate() fails after clone_state() but before the later init_state() call has reinitialized those fields, the cloned state can be destroyed by xfrm_state_gc_task() with list and timer state copied from the original SA. With queued packets this lets the clone splice and free skbs owned by the original IPTFS queue, leading to use-after-free and double-free reports in iptfs_destroy_state() and skb release paths. Reinitialize the clone's runtime state before publishing it through x->mode_data. Because clone_state() now publishes a destroyable mode_data object before init_state(), take the mode callback module reference there. Avoid taking it again from __iptfs_init_state() for the same object. Fixes: 0e4fbf013fa5 ("xfrm: iptfs: add user packet (tunnel ingress) handling") Cc: stable@vger.kernel.org Signed-off-by: Shaomin Chen Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_iptfs.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c index 97bc979e55ba..6c6bbc040517 100644 --- a/net/xfrm/xfrm_iptfs.c +++ b/net/xfrm/xfrm_iptfs.c @@ -2650,7 +2650,8 @@ static void __iptfs_init_state(struct xfrm_state *x, x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr); /* Always keep a module reference when x->mode_data is set */ - __module_get(x->mode_cbs->owner); + if (x->mode_data != xtfs) + __module_get(x->mode_cbs->owner); x->mode_data = xtfs; xtfs->x = x; @@ -2658,22 +2659,39 @@ static void __iptfs_init_state(struct xfrm_state *x, static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) { + struct skb_wseq *w_saved = NULL; struct xfrm_iptfs_data *xtfs; xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL); if (!xtfs) return -ENOMEM; - xtfs->ra_newskb = NULL; if (xtfs->cfg.reorder_win_size) { - xtfs->w_saved = kzalloc_objs(*xtfs->w_saved, - xtfs->cfg.reorder_win_size); - if (!xtfs->w_saved) { + w_saved = kzalloc_objs(*w_saved, xtfs->cfg.reorder_win_size); + if (!w_saved) { kfree_sensitive(xtfs); return -ENOMEM; } } + xtfs->w_saved = w_saved; + __skb_queue_head_init(&xtfs->queue); + xtfs->queue_size = 0; + hrtimer_setup(&xtfs->iptfs_timer, iptfs_delay_timer, CLOCK_MONOTONIC, + IPTFS_HRTIMER_MODE); + + spin_lock_init(&xtfs->drop_lock); + hrtimer_setup(&xtfs->drop_timer, iptfs_drop_timer, CLOCK_MONOTONIC, + IPTFS_HRTIMER_MODE); + + xtfs->w_seq_set = false; + xtfs->w_wantseq = 0; + xtfs->w_savedlen = 0; + xtfs->ra_newskb = NULL; + xtfs->ra_wantseq = 0; + xtfs->ra_runtlen = 0; + + __module_get(x->mode_cbs->owner); x->mode_data = xtfs; xtfs->x = x; From 3e52417318473782012b236d0325bf7d2266a597 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 21 May 2026 03:29:26 -0700 Subject: [PATCH 08/10] xfrm: move policy_bydst RCU sync from per-netns .exit to .pre_exit The struct pernet_operations docstring in include/net/net_namespace.h explicitly warns against blocking RCU primitives in .exit handlers: Exit methods using blocking RCU primitives, such as synchronize_rcu(), should be implemented via exit_batch. [...] Please, avoid synchronize_rcu() at all, where it's possible. Note that a combination of pre_exit() and exit() can be used, since a synchronize_rcu() is guaranteed between the calls. xfrm_policy_fini() violates this: it calls synchronize_rcu() before freeing the policy_bydst hash tables (so no RCU reader is mid- traversal at free time), but runs from xfrm_net_ops.exit -- once per namespace -- so a cleanup_net() of N namespaces pays N full RCU grace periods serially. Use the documented pre_exit/exit split. Move the policy flush (and the workqueue drains it depends on) into a new .pre_exit handler; xfrm_policy_fini() then runs in .exit and frees the hash tables after the synchronize_rcu_expedited() that cleanup_net() guarantees between the two phases. Providing O(1) RCU grace periods per batch instead of O(N). Observed on Linux 6.18 with a workload doing unshare(CLONE_NEWNET) at ~13/sec sustained: cleanup_net() and the netns_wq rescuer kthread both stuck in xfrm_policy_fini()'s synchronize_rcu(), >300k struct net accumulated in the cleanup queue, Percpu in /proc/meminfo climbed to 130+ GB on 256-CPU hosts, and memcg OOMs followed. setup_net and __put_net counts were balanced, ruling out a refcount leak. Fixes: 069daad4f2ae ("xfrm: Wait for RCU readers during policy netns exit") Signed-off-by: Usama Arif Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 59968dcbafe1..dd09d2063da2 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4276,21 +4276,21 @@ static int __net_init xfrm_policy_init(struct net *net) return -ENOMEM; } -static void xfrm_policy_fini(struct net *net) +static void __net_exit xfrm_net_pre_exit(struct net *net) { - struct xfrm_pol_inexact_bin *b, *t; - unsigned int sz; - int dir; - disable_work_sync(&net->xfrm.policy_hthresh.work); - flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); +} - synchronize_rcu(); +static void xfrm_policy_fini(struct net *net) +{ + struct xfrm_pol_inexact_bin *b, *t; + unsigned int sz; + int dir; WARN_ON(!list_empty(&net->xfrm.policy_all)); @@ -4368,6 +4368,7 @@ static void __net_exit xfrm_net_exit(struct net *net) static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, + .pre_exit = xfrm_net_pre_exit, .exit = xfrm_net_exit, }; From c16f74dc1d75d0e2e7670076d5375deda110ebeb Mon Sep 17 00:00:00 2001 From: Zhengchuan Liang Date: Fri, 22 May 2026 17:31:55 +0800 Subject: [PATCH 09/10] xfrm: input: hold netns during deferred transport reinjection Transport-mode reinjection stores a struct net pointer in skb->cb and uses it later from xfrm_trans_reinject(). That pointer must stay valid until the deferred callback runs. Take a netns reference when queueing deferred reinjection work and drop it after the callback completes. Use maybe_get_net() so the queueing path does not revive a namespace that is already being torn down. This keeps the existing workqueue design and fixes the netns lifetime handling in one place for all users of xfrm_trans_queue_net(). Fixes: 7b3801927e52 ("xfrm: introduce xfrm_trans_queue_net") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Xin Liu Co-developed-by: Luxing Yin Signed-off-by: Luxing Yin Signed-off-by: Zhengchuan Liang Signed-off-by: Ren Wei Assisted-by: Codex:gpt-5.4 Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index f65291eba1f6..e4c2cd24936d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -797,9 +797,12 @@ static void xfrm_trans_reinject(struct work_struct *work) spin_unlock_bh(&trans->queue_lock); local_bh_disable(); - while ((skb = __skb_dequeue(&queue))) - XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, - NULL, skb); + while ((skb = __skb_dequeue(&queue))) { + struct net *net = XFRM_TRANS_SKB_CB(skb)->net; + + XFRM_TRANS_SKB_CB(skb)->finish(net, NULL, skb); + put_net(net); + } local_bh_enable(); } @@ -808,6 +811,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, struct sk_buff *)) { struct xfrm_trans_tasklet *trans; + struct net *hold_net; trans = this_cpu_ptr(&xfrm_trans_tasklet); @@ -816,8 +820,12 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); + hold_net = maybe_get_net(net); + if (!hold_net) + return -ENODEV; + XFRM_TRANS_SKB_CB(skb)->finish = finish; - XFRM_TRANS_SKB_CB(skb)->net = net; + XFRM_TRANS_SKB_CB(skb)->net = hold_net; spin_lock_bh(&trans->queue_lock); __skb_queue_tail(&trans->queue, skb); spin_unlock_bh(&trans->queue_lock); From 6851161feb01cea41358c9ec304bd2f981fc8505 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 29 May 2026 10:23:25 +0200 Subject: [PATCH 10/10] Revert "esp: fix page frag reference leak on skb_to_sgvec failure" This reverts commit 2982e599fff6faa21c8df147d96fc7af6c1a2f24. The patch does not fully fix the issue and the Author does not match the 'Signed-off-by:' tag, so revert it for now. Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 12 +++++------- net/ipv6/esp6.c | 12 +++++------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5d3a8656687e..513c8215c947 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -96,7 +96,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, bool already_unref) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -113,7 +113,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, /* Unref skb_frag_pages in the src scatterlist if necessary. * Skip the first sg which comes from skb->data. */ - if (already_unref || req->src != req->dst) + if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) skb_page_unref(page_to_netmem(sg_page(sg)), skb->pp_recycle); @@ -220,7 +220,7 @@ static void esp_output_done(void *data, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp, skb, false); + esp_ssg_unref(x, tmp, skb); kfree(tmp); if (xo && (xo->flags & XFRM_DEV_RESUME)) { @@ -569,10 +569,8 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * err = skb_to_sgvec(skb, dsg, (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); - if (unlikely(err < 0)) { - esp_ssg_unref(x, tmp, skb, true); + if (unlikely(err < 0)) goto error_free; - } } if ((x->props.flags & XFRM_STATE_ESN)) @@ -604,7 +602,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } if (sg != dsg) - esp_ssg_unref(x, tmp, skb, false); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b963b8e72604..57481e423e59 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -113,7 +113,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, bool already_unref) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct crypto_aead *aead = x->data; int extralen = 0; @@ -130,7 +130,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb, /* Unref skb_frag_pages in the src scatterlist if necessary. * Skip the first sg which comes from skb->data. */ - if (already_unref || req->src != req->dst) + if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) skb_page_unref(page_to_netmem(sg_page(sg)), skb->pp_recycle); @@ -254,7 +254,7 @@ static void esp_output_done(void *data, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp, skb, false); + esp_ssg_unref(x, tmp, skb); kfree(tmp); esp_output_encap_csum(skb); @@ -600,10 +600,8 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info err = skb_to_sgvec(skb, dsg, (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); - if (unlikely(err < 0)) { - esp_ssg_unref(x, tmp, skb, true); + if (unlikely(err < 0)) goto error_free; - } } if ((x->props.flags & XFRM_STATE_ESN)) @@ -636,7 +634,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } if (sg != dsg) - esp_ssg_unref(x, tmp, skb, false); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb);