From f4268b466190dae95a7585f69b4f1f8ad097632c Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 29 Apr 2026 13:40:41 +0000 Subject: [PATCH 01/94] nfc: llcp: Fix use-after-free in llcp_sock_release() llcp_sock_release() unconditionally unlinks the socket from the local sockets list. However, if the socket is still in connecting state, it is on the connecting list. Fix this by checking the socket state and unlinking from the correct list. Fixes: b4011239a08e ("NFC: llcp: Fix non blocking sockets connections") Signed-off-by: Lee Jones Link: https://patch.msgid.link/20260429134115.3558604-1-lee@kernel.org Signed-off-by: David Heidelberg --- net/nfc/llcp_sock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index f1be1e84f665..feab29fc62f4 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -633,6 +633,8 @@ static int llcp_sock_release(struct socket *sock) if (sock->type == SOCK_RAW) nfc_llcp_sock_unlink(&local->raw_sockets, sk); + else if (sk->sk_state == LLCP_CONNECTING) + nfc_llcp_sock_unlink(&local->connecting_sockets, sk); else nfc_llcp_sock_unlink(&local->sockets, sk); From b493ea2765cc17cb8aa7e7544a4b6dcb05b6ed77 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 29 Apr 2026 13:40:42 +0000 Subject: [PATCH 02/94] nfc: llcp: Fix use-after-free race in nfc_llcp_recv_cc() A race condition exists in the NFC LLCP connection state machine where the connection acceptance packet (CC) can be processed concurrently with socket release. This can lead to a use-after-free of the socket object. When nfc_llcp_recv_cc() moves the socket from the connecting_sockets list to the sockets list, it does so without holding the socket lock. If llcp_sock_release() is executing concurrently, it might have already unlinked the socket and dropped its references, which can result in nfc_llcp_recv_cc() linking a freed socket into the live list. Fix this by holding lock_sock() during the state transition and list movement in nfc_llcp_recv_cc(). After acquiring the lock, check if the socket is still hashed to ensure it hasn't already been unlinked and marked for destruction by the release path. This aligns the locking pattern with recv_hdlc() and recv_disc(). Fixes: a69f32af86e3 ("NFC: Socket linked list") Signed-off-by: Lee Jones Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260429134115.3558604-2-lee@kernel.org Signed-off-by: David Heidelberg --- net/nfc/llcp_core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index db5bc6a878dd..dc65c719f35f 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -1218,6 +1218,15 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, sk = &llcp_sock->sk; + lock_sock(sk); + + /* Check if socket was destroyed whilst waiting for the lock */ + if (!sk_hashed(sk)) { + release_sock(sk); + nfc_llcp_sock_put(llcp_sock); + return; + } + /* Unlink from connecting and link to the client array */ nfc_llcp_sock_unlink(&local->connecting_sockets, sk); nfc_llcp_sock_link(&local->sockets, sk); @@ -1229,6 +1238,8 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, sk->sk_state = LLCP_CONNECTED; sk->sk_state_change(sk); + release_sock(sk); + nfc_llcp_sock_put(llcp_sock); } From f040e590c035bfd9553fe79ee9585caf1b14d67b Mon Sep 17 00:00:00 2001 From: Ashutosh Desai Date: Tue, 5 May 2026 17:07:12 +0000 Subject: [PATCH 03/94] nfc: hci: fix out-of-bounds read in HCP header parsing Both nfc_hci_recv_from_llc() and nci_hci_data_received_cb() read packet->header from skb->data at function entry without first checking that the buffer holds at least one byte. A malicious NFC peer can send a 0-byte HCP frame that passes through the SHDLC layer and reaches these functions, causing an out-of-bounds heap read of packet->header. The same 0-byte frame, if queued as a non-final fragment, also causes the reassembly loop to underflow msg_len to UINT_MAX, triggering skb_over_panic() when the reassembled skb is written. Fix this by adding a pskb_may_pull() check at the entry of each function before packet->header is first accessed. The existing pskb_may_pull() checks before the reassembled hcp_skb is cast to struct hcp_packet remain in place to guard the 2-byte HCP message header. Fixes: 8b8d2e08bf0d ("NFC: HCI support") Fixes: 11f54f228643 ("NFC: nci: Add HCI over NCI protocol support") Cc: stable@vger.kernel.org Reviewed-by: Simon Horman Signed-off-by: Ashutosh Desai Link: https://patch.msgid.link/20260505170712.96560-1-ashutoshdesai993@gmail.com Signed-off-by: David Heidelberg --- net/nfc/hci/core.c | 10 ++++++++++ net/nfc/nci/hci.c | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 0d33c81a15fe..ba6f0310ffd7 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -861,6 +861,11 @@ static void nfc_hci_recv_from_llc(struct nfc_hci_dev *hdev, struct sk_buff *skb) struct sk_buff *frag_skb; int msg_len; + if (!pskb_may_pull(skb, NFC_HCI_HCP_PACKET_HEADER_LEN)) { + kfree_skb(skb); + return; + } + packet = (struct hcp_packet *)skb->data; if ((packet->header & ~NFC_HCI_FRAGMENT) == 0) { skb_queue_tail(&hdev->rx_hcp_frags, skb); @@ -904,6 +909,11 @@ static void nfc_hci_recv_from_llc(struct nfc_hci_dev *hdev, struct sk_buff *skb) * unblock waiting cmd context. Otherwise, enqueue to dispatch * in separate context where handler can also execute command. */ + if (!pskb_may_pull(hcp_skb, NFC_HCI_HCP_HEADER_LEN)) { + kfree_skb(hcp_skb); + return; + } + packet = (struct hcp_packet *)hcp_skb->data; type = HCP_MSG_GET_TYPE(packet->message.header); if (type == NFC_HCI_HCP_RESPONSE) { diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 40ae8e5a7ec7..c03e8a0bd3bd 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -439,6 +439,11 @@ void nci_hci_data_received_cb(void *context, return; } + if (!pskb_may_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN)) { + kfree_skb(skb); + return; + } + packet = (struct nci_hcp_packet *)skb->data; if ((packet->header & ~NCI_HCI_FRAGMENT) == 0) { skb_queue_tail(&ndev->hci_dev->rx_hcp_frags, skb); @@ -482,6 +487,11 @@ void nci_hci_data_received_cb(void *context, * unblock waiting cmd context. Otherwise, enqueue to dispatch * in separate context where handler can also execute command. */ + if (!pskb_may_pull(hcp_skb, NCI_HCI_HCP_HEADER_LEN)) { + kfree_skb(hcp_skb); + return; + } + packet = (struct nci_hcp_packet *)hcp_skb->data; type = NCI_HCP_MSG_GET_TYPE(packet->message.header); if (type == NCI_HCI_HCP_RESPONSE) { From f23bf992d65a42007c517b060ca35cebdea3525a Mon Sep 17 00:00:00 2001 From: Carl Lee Date: Sat, 16 May 2026 19:55:18 +0800 Subject: [PATCH 04/94] nfc: nxp-nci: i2c: use rising-edge IRQ on ACPI systems Some ACPI-based platforms report incorrect IRQ trigger types (e.g. IRQF_TRIGGER_HIGH), which can lead to interrupt storms. Use the historically working rising-edge trigger on ACPI systems to avoid this regression. Device Tree-based systems continue to use the firmware-provided trigger type. Fixes: 57be33f85e36 ("nfc: nxp-nci: remove interrupt trigger type") Signed-off-by: Carl Lee Tested-by: Bartosz Golaszewski Reviewed-by: Bartosz Golaszewski Reviewed-by: Mark Pearson Tested-by: Mark Pearson Tested-by: Luca Stefani Link: https://patch.msgid.link/20260516-nfc-nxp-nci-i2c-restore-irq-trigger-fallback-v3-1-37ba4b6e9086@amd.com Signed-off-by: David Heidelberg --- drivers/nfc/nxp-nci/i2c.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/nxp-nci/i2c.c b/drivers/nfc/nxp-nci/i2c.c index b3d34433bd14..a6c08175d9dd 100644 --- a/drivers/nfc/nxp-nci/i2c.c +++ b/drivers/nfc/nxp-nci/i2c.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -267,6 +268,7 @@ static int nxp_nci_i2c_probe(struct i2c_client *client) { struct device *dev = &client->dev; struct nxp_nci_i2c_phy *phy; + unsigned long irqflags; int r; if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { @@ -303,9 +305,26 @@ static int nxp_nci_i2c_probe(struct i2c_client *client) if (r < 0) return r; + /* + * ACPI platforms may report incorrect IRQ trigger types + * (e.g. level-high), which can lead to interrupt storms. + * + * Use the historically stable rising-edge trigger for ACPI devices. + * + * On non-ACPI systems (e.g. Device Tree), prefer the firmware- + * provided trigger type, falling back to rising-edge if not set. + */ + if (ACPI_COMPANION(dev)) { + irqflags = IRQF_TRIGGER_RISING; + } else { + irqflags = irq_get_trigger_type(client->irq); + if (!irqflags) + irqflags = IRQF_TRIGGER_RISING; + } + r = request_threaded_irq(client->irq, NULL, nxp_nci_i2c_irq_thread_fn, - IRQF_ONESHOT, + irqflags | IRQF_ONESHOT, NXP_NCI_I2C_DRIVER_NAME, phy); if (r < 0) nfc_err(&client->dev, "Unable to register IRQ handler\n"); From bed6e04be8e6b9133d8b16d5a42d0e0ce674fa9a Mon Sep 17 00:00:00 2001 From: Hamza Mahfooz Date: Mon, 11 May 2026 10:43:14 -0400 Subject: [PATCH 05/94] netfilter: conntrack: tcp: do not force CLOSE on invalid-seq RST without direction check An unintended behavior in the TCP conntrack state machine allows a connection to be forced into the CLOSE state using an RST packet with an invalid sequence number. Specifically, after a SYN packet is observed, an RST with an invalid SEQ can transition the conntrack entry to TCP_CONNTRACK_CLOSE, regardless of whether the RST corresponds to the expected reply direction. The relevant code path assumes the RST is a response to an outgoing SYN, but does not validate packet direction or ensure that a matching SYN was actually sent in the opposite direction. As a result, a crafted packet sequence consisting of a SYN followed by an invalid-sequence RST can prematurely terminate an active NAT entry. This makes connection teardown easier than intended. So, tighten the state transition logic to ensure that RST-triggered CLOSE transitions only occur when the RST is a valid response to a previously observed SYN in the correct direction. Cc: stable@vger.kernel.org Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") Signed-off-by: Hamza Mahfooz Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_proto_tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b67426c2189b..e99ab1e88e9f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1221,7 +1221,8 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, new_state = old_state; } if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) - && ct->proto.tcp.last_index == TCP_SYN_SET) + && ct->proto.tcp.last_index == TCP_SYN_SET + && ct->proto.tcp.last_dir != dir) || (!test_bit(IPS_ASSURED_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { From 92170e6afe927ab2792a3f71902845789c8e31b1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 19 May 2026 12:36:14 -0700 Subject: [PATCH 06/94] netfilter: synproxy: refresh tcphdr after skb_ensure_writable synproxy_tstamp_adjust() rewrites the TCP timestamp option in place and then patches the TCP checksum via inet_proto_csum_replace4() on the caller-supplied tcphdr pointer. Both ipv4_synproxy_hook() and ipv6_synproxy_hook() obtain that pointer with skb_header_pointer() before calling in, so it may either alias skb->head directly or point at the caller's on-stack _tcph buffer. Between obtaining the pointer and using it, the function calls skb_ensure_writable(skb, optend), which on a cloned or non-linear skb invokes pskb_expand_head() and frees the old skb->head. After that point the cached th is stale: caller (ipv[46]_synproxy_hook) th = skb_header_pointer(skb, ..., &_tcph) synproxy_tstamp_adjust(skb, protoff, th, ...) skb_ensure_writable(skb, optend) pskb_expand_head() /* kfree(old skb->head) */ ... inet_proto_csum_replace4(&th->check, ...) /* writes into freed head, or into the caller's stack copy leaving the on-wire checksum stale */ The option bytes are written through skb->data and are fine; only the checksum update goes through th and so lands in the wrong place. The result is either a write into freed slab memory or a packet leaving with a checksum that does not match its payload. Fix by re-deriving th from skb->data + protoff immediately after skb_ensure_writable() succeeds, so the subsequent checksum update targets the linear, writable header. Fixes: 48b1de4c110a ("netfilter: add SYNPROXY core/target") Assisted-by: kres (claude-opus-4-7) Signed-off-by: Chris Mason Reviewed-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- net/netfilter/nf_synproxy_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 57f57e2fc80a..036c8586f49b 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -200,6 +200,8 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, if (skb_ensure_writable(skb, optend)) return 0; + th = (struct tcphdr *)(skb->data + protoff); + while (optoff < optend) { unsigned char *op = skb->data + optoff; From 47980b6dbf83961eec1c1363ea986e9c06ff8054 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 14 May 2026 14:21:57 +0200 Subject: [PATCH 07/94] netfilter: nf_conntrack_gre: fix gre keymap list corruption Quoting reporter: A race between GRE keymap insertion and destruction can corrupt the kernel list or use a freed object. `nf_ct_gre_keymap_add()` publishes a new keymap pointer before the embedded `list_head` is linked, while `nf_ct_gre_keymap_destroy()` can concurrently delete and free that same object. An unprivileged user can reach this through the PPTP conntrack helper by racing PPTP control messages or helper teardown, leading to KASAN-detectable list corruption/UAF in kernel context. ## Root Cause Analysis `exp_gre()` installs GRE expectations for a PPTP control flow and then adds two GRE keymap entries [..] The add path publishes `ct_pptp_info->keymap[dir]` before linking the embedded list node [..] Concurrent teardown deletes that partially initialized object. Make add/destroy symmetric: install both, destroy both while under lock. Furthermore, we should refuse to publish a new mapping in case ct is going away, else we may leak the allocation. The "retrans" detection is strange: existing mapping is checked for key equality with the new mapping, then for "is on the list" via list walk. But I can't see how an existing keymap entry can be NOT on list. Change this to only check if we're asked to map same tuple again -- if so, skip re-install, else signal failure. Last, add a bug trap for the keymap list; it has to be empty when namespace is going away. Reported-by: Leo Lin Signed-off-by: Florian Westphal --- .../linux/netfilter/nf_conntrack_proto_gre.h | 7 +- net/netfilter/nf_conntrack_core.c | 8 ++ net/netfilter/nf_conntrack_pptp.c | 8 +- net/netfilter/nf_conntrack_proto_gre.c | 106 +++++++++++++----- 4 files changed, 95 insertions(+), 34 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h index 9ee7014400e8..ad5563f0f864 100644 --- a/include/linux/netfilter/nf_conntrack_proto_gre.h +++ b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -18,9 +18,10 @@ struct nf_ct_gre_keymap { struct rcu_head rcu; }; -/* add new tuple->key_reply pair to keymap */ -int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, - struct nf_conntrack_tuple *t); +/* add tuple->key_reply pairs to keymap */ +bool nf_ct_gre_keymap_add(struct nf_conn *ct, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl); /* delete keymap entries */ void nf_ct_gre_keymap_destroy(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8ba5b22a1eef..b521b5ebd664 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -568,6 +568,13 @@ static void destroy_gre_conntrack(struct nf_conn *ct) #endif } +static void warn_on_keymap_list_leak(const struct net *net) +{ +#ifdef CONFIG_NF_CT_PROTO_GRE + WARN_ON_ONCE(!list_empty(&net->ct.nf_ct_proto.gre.keymap_list)); +#endif +} + void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; @@ -2510,6 +2517,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) } list_for_each_entry(net, net_exit_list, exit_list) { + warn_on_keymap_list_leak(net); nf_conntrack_ecache_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); free_percpu(net->ct.stat); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 4c679638df06..dc23e4181618 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -225,13 +225,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) if (nf_ct_expect_related(exp_reply, 0) != 0) goto out_unexpect_orig; - /* Add GRE keymap entries */ - if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0) + if (!nf_ct_gre_keymap_add(ct, &exp_orig->tuple, + &exp_reply->tuple)) goto out_unexpect_both; - if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) { - nf_ct_gre_keymap_destroy(ct); - goto out_unexpect_both; - } ret = 0; out_put_both: diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 94c19bc4edc5..35e22082d65a 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -87,41 +87,97 @@ static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) return key; } -/* add a single keymap entry, associate with specified master ct */ -int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, - struct nf_conntrack_tuple *t) +enum nf_ct_gre_km_act { + NF_CT_GRE_KM_NEW, + NF_CT_GRE_KM_BAD, + NF_CT_GRE_KM_DUP +}; + +static enum nf_ct_gre_km_act +nf_ct_gre_km_acceptable(const struct nf_ct_pptp_master *ct_pptp_info, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl) +{ + struct nf_ct_gre_keymap *km_orig, *km_repl; + + lockdep_assert_held(&keymap_lock); + + km_orig = ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL]; + km_repl = ct_pptp_info->keymap[IP_CT_DIR_REPLY]; + + if (km_orig && km_repl) { + if (!gre_key_cmpfn(km_orig, orig)) + return NF_CT_GRE_KM_BAD; + + if (!gre_key_cmpfn(km_repl, repl)) + return NF_CT_GRE_KM_BAD; + + return NF_CT_GRE_KM_DUP; + } + + DEBUG_NET_WARN_ON_ONCE(km_orig); + DEBUG_NET_WARN_ON_ONCE(km_repl); + return NF_CT_GRE_KM_NEW; +} + +/* add keymap entries, associate with specified master ct */ +bool nf_ct_gre_keymap_add(struct nf_conn *ct, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl) { struct net *net = nf_ct_net(ct); struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); - struct nf_ct_gre_keymap **kmp, *km; + struct nf_ct_gre_keymap *km_orig, *km_repl; + bool ret = false; - kmp = &ct_pptp_info->keymap[dir]; - if (*kmp) { - /* check whether it's a retransmission */ - list_for_each_entry_rcu(km, &net_gre->keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *kmp) - return 0; - } - pr_debug("trying to override keymap_%s for ct %p\n", - dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); - return -EEXIST; - } + km_orig = kmalloc_obj(*km_orig, GFP_ATOMIC); + if (!km_orig) + return false; + km_repl = kmalloc_obj(*km_repl, GFP_ATOMIC); + if (!km_repl) + goto km_free; - km = kmalloc_obj(*km, GFP_ATOMIC); - if (!km) - return -ENOMEM; - memcpy(&km->tuple, t, sizeof(*t)); - *kmp = km; - - pr_debug("adding new entry %p: ", km); - nf_ct_dump_tuple(&km->tuple); + memcpy(&km_orig->tuple, orig, sizeof(*orig)); + memcpy(&km_repl->tuple, repl, sizeof(*repl)); spin_lock_bh(&keymap_lock); - list_add_tail(&km->list, &net_gre->keymap_list); + if (nf_ct_is_dying(ct)) + goto unlock_free; + + switch (nf_ct_gre_km_acceptable(ct_pptp_info, orig, repl)) { + case NF_CT_GRE_KM_NEW: + break; + case NF_CT_GRE_KM_DUP: + ret = true; + goto unlock_free; + case NF_CT_GRE_KM_BAD: + pr_debug("trying to override keymap for ct %p\n", ct); + goto unlock_free; + } + + if (ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] || + ct_pptp_info->keymap[IP_CT_DIR_REPLY]) + goto unlock_free; + + pr_debug("adding new entries %p,%p: ", km_orig, km_repl); + nf_ct_dump_tuple(&km_orig->tuple); + nf_ct_dump_tuple(&km_repl->tuple); + + list_add_tail_rcu(&km_orig->list, &net_gre->keymap_list); + list_add_tail_rcu(&km_repl->list, &net_gre->keymap_list); + ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] = km_orig; + ct_pptp_info->keymap[IP_CT_DIR_REPLY] = km_repl; spin_unlock_bh(&keymap_lock); - return 0; + return true; + +unlock_free: + spin_unlock_bh(&keymap_lock); +km_free: + kfree(km_orig); + kfree(km_repl); + return ret; } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); From c376f07e16c02239ed44cabb97145d03f65b4d15 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 19 May 2026 20:10:08 +0200 Subject: [PATCH 08/94] netfilter: xt_cpu: prefer raw_smp_processor_id With PREEMPT_RCU we get splat: BUG: using smp_processor_id() in preemptible [..] caller is cpu_mt+0x53/0xd0 net/netfilter/xt_cpu.c:37 CPU: 1 .. Comm: syz.3.1377 #0 PREEMPT(full) Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 check_preemption_disabled+0xd3/0xe0 lib/smp_processor_id.c:47 cpu_mt+0x53/0xd0 net/netfilter/xt_cpu.c:37 [..] Just use raw version instead. This is similar to 14d14a5d2957 ("netfilter: nft_meta: use raw_smp_processor_id()"). Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Reported-by: syzbot+690d3e3ffa7335ac10eb@syzkaller.appspotmail.com Signed-off-by: Florian Westphal --- net/netfilter/xt_cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c index 3bdc302a0f91..9cb259902a58 100644 --- a/net/netfilter/xt_cpu.c +++ b/net/netfilter/xt_cpu.c @@ -34,7 +34,7 @@ static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cpu_info *info = par->matchinfo; - return (info->cpu == smp_processor_id()) ^ info->invert; + return (info->cpu == raw_smp_processor_id()) ^ info->invert; } static struct xt_match cpu_mt_reg __read_mostly = { From 968cc2c96390f06e56ed6a43f935bfebdefed28f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2026 23:23:21 +0800 Subject: [PATCH 09/94] netfilter: disable payload mangling in userns Several parts of network stack rely on iph->ihl validation done by network stack before PRE_ROUTING. Disable this feature for user namespaces for now. tcp option handling is likely safe even for LOCAL_IN, so this this leaves tcp option mangling via nft_exthdr.c as-is. I don't think these are the only means to alter packets, but these appear to be relatively prominent. This could be relaxed later. Example: - allow userns for ingress hook. - allow userns if base is transport header. Also, we should revalidate or restrict generally: - Don't allow linklayer writes to spill into network header - restrict ipv4 and ipv6 to 'known safe' writes, e.g. saddr/daddr/check/tos Reported-by: Qi Tang Reported-by: Tong Liu Tested-by: Qi Tang Link: https://lore.kernel.org/netfilter-devel/20260515100411.3141-1-fw@strlen.de/ Signed-off-by: Florian Westphal --- net/netfilter/nfnetlink_queue.c | 6 ++++-- net/netfilter/nft_payload.c | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 984a0eb9e149..60ab88d45096 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1141,6 +1141,9 @@ nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int di { struct sk_buff *nskb; + if (e->state.net->user_ns != &init_user_ns) + return -EPERM; + if (diff < 0) { unsigned int min_len = skb_transport_offset(e->skb); @@ -1537,8 +1540,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), payload_len, entry, diff) < 0) verdict = NF_DROP; - - if (ct && diff) + else if (ct && diff) nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff); } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 01e13e5255a9..484a5490832e 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -917,6 +917,9 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, struct nft_payload_set *priv = nft_expr_priv(expr); int err; + if (ctx->net->user_ns != &init_user_ns) + return -EPERM; + priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); From f438d1786d657d57790c5d138d6db3fc9fdac392 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 19 May 2026 22:52:07 +0200 Subject: [PATCH 10/94] netfilter: ebtables: fix OOB read in compat_mtw_from_user Luxiao Xu says: The function compat_mtw_from_user() converts ebtables extensions from 32-bit user structures to kernel native structures. However, it lacks proper validation of the user-supplied match_size/target_size. When certain extensions are processed, the kernel-side translation logic may perform memory accesses based on the extension's expected size. If the user provides a size smaller than what the extension requires, it results in an out-of-bounds read as reported by KASAN. This fix introduces a check to ensure match_size is at least as large as the extension's required compatsize. This covers matches, watchers, and targets, while maintaining compatibility with standard targets. AFAIU this is relevant for matches that need to go though match->compat_from_user() call. Those that use plain memcpy with the user-provided size are ok because the caller checks that size vs the start of the next rule entry offset (which itself is checked vs. total size copied from userspace). The ->compat_from_user() callbacks assume they can read compatsize bytes, so they need this extra check. Based on an earlier patch from Luxiao Xu. Fixes: 81e675c227ec ("netfilter: ebtables: add CONFIG_COMPAT support") Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Luxiao Xu Signed-off-by: Ren Wei Reviewed-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- net/bridge/netfilter/ebtables.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index b9f4daac09af..8a6a069329d2 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1956,6 +1956,25 @@ enum compat_mwt { EBT_COMPAT_TARGET, }; +static bool match_size_ok(const struct xt_match *match, unsigned int match_size) +{ + u16 csize; + + if (match->matchsize == -1) /* cannot validate ebt_among */ + return true; + + csize = match->compatsize ? : match->matchsize; + + return match_size >= csize; +} + +static bool tgt_size_ok(const struct xt_target *tgt, unsigned int tgt_size) +{ + u16 csize = tgt->compatsize ? : tgt->targetsize; + + return tgt_size >= csize; +} + static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, enum compat_mwt compat_mwt, struct ebt_entries_buf_state *state, @@ -1981,6 +2000,11 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, if (IS_ERR(match)) return PTR_ERR(match); + if (!match_size_ok(match, match_size)) { + module_put(match->me); + return -EINVAL; + } + off = ebt_compat_match_offset(match, match_size); if (dst) { if (match->compat_from_user) @@ -2000,6 +2024,12 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt, mwt->u.revision); if (IS_ERR(wt)) return PTR_ERR(wt); + + if (!tgt_size_ok(wt, match_size)) { + module_put(wt->me); + return -EINVAL; + } + off = xt_compat_target_offset(wt); if (dst) { From 1d001b0a6182b0d2f41a8d687f7522b6f1e94280 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 20 May 2026 10:34:09 +0800 Subject: [PATCH 11/94] netfilter: nft_fib_ipv6: walk fib6_siblings under RCU nft_fib6_info_nh_uses_dev() runs from nft_fib6_eval() in softirq under rcu_read_lock(). fib6_siblings is modified by writers that hold tb6_lock but do not wait for RCU readers, so the sibling walk should use list_for_each_entry_rcu(): it adds READ_ONCE() on the ->next pointer and lets CONFIG_PROVE_RCU_LIST validate the locking. No functional change for non-debug builds. Fixes: 1c32b24c234b ("netfilter: nft_fib_ipv6: switch to fib6_lookup") Signed-off-by: Jiayuan Chen Signed-off-by: Florian Westphal --- net/ipv6/netfilter/nft_fib_ipv6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 8b2dba88ee96..5e192a446ec8 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -170,7 +170,7 @@ static bool nft_fib6_info_nh_uses_dev(struct fib6_info *rt, if (nft_fib6_info_nh_dev_match(nh_dev, dev)) return true; - list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { + list_for_each_entry_rcu(iter, &rt->fib6_siblings, fib6_siblings) { nh_dev = fib6_info_nh_dev(iter); if (nft_fib6_info_nh_dev_match(nh_dev, dev)) From f81b0c2d281faa93e4c2b7247047922aaf3e4ba6 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 20 May 2026 10:34:10 +0800 Subject: [PATCH 12/94] netfilter: nft_fib_ipv6: handle routes via external nexthop fib6_info has a union: union { struct list_head fib6_siblings; struct list_head nh_list; }; Old-style multipath (ip -6 route add ... nexthop ... nexthop ...) uses fib6_siblings. External nexthop (ip -6 route add ... nhid N) uses nh_list, linked into &nh->f6i_list. nft_fib6_info_nh_uses_dev() blindly walks &rt->fib6_siblings, causing an OOB read past the struct nexthop slab when rt->nh is set: ================================================================== BUG: KASAN: slab-out-of-bounds in nft_fib6_eval+0x1362/0x16c0 Read of size 8 at addr ffff888103a099d0 by task ping/386 CPU: 2 UID: 0 PID: 386 Comm: ping Not tainted 7.1.0-rc3+ #251 PREEMPT Call Trace: dump_stack_lvl+0x76/0xa0 print_report+0xd1/0x5f0 kasan_report+0xe7/0x130 __asan_report_load8_noabort+0x14/0x30 nft_fib6_eval+0x1362/0x16c0 nft_do_chain+0x279/0x18c0 nft_do_chain_ipv6+0x1a8/0x230 nf_hook_slow+0xad/0x200 ipv6_rcv+0x152/0x380 __netif_receive_skb_one_core+0x118/0x1c0 ================================================================== Branch by route shape: when rt->nh is set, walk via nexthop_for_each_fib6_nh() (also covers nh groups, which the original code missed); otherwise walk fib6_siblings, guarded by READ_ONCE() of rt->fib6_nsiblings as required by commit 31d7d67ba127 ("ipv6: annotate data-races around rt->fib6_nsiblings"). Fixes: 1c32b24c234b ("netfilter: nft_fib_ipv6: switch to fib6_lookup") Signed-off-by: Jiayuan Chen Signed-off-by: Florian Westphal --- net/ipv6/netfilter/nft_fib_ipv6.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 5e192a446ec8..c0a0075e2590 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -160,16 +160,32 @@ static bool nft_fib6_info_nh_dev_match(const struct net_device *nh_dev, l3mdev_master_ifindex_rcu(nh_dev) == dev->ifindex; } +static int nft_fib6_nh_match_dev_cb(struct fib6_nh *nh, void *arg) +{ + const struct net_device *dev = arg; + + return nft_fib6_info_nh_dev_match(nh->fib_nh_dev, dev); +} + static bool nft_fib6_info_nh_uses_dev(struct fib6_info *rt, const struct net_device *dev) { const struct net_device *nh_dev; struct fib6_info *iter; + /* External nexthop: fib6_siblings slot aliases nh_list, walk via nh. */ + if (rt->nh) + return nexthop_for_each_fib6_nh(rt->nh, + nft_fib6_nh_match_dev_cb, + (void *)dev); + nh_dev = fib6_info_nh_dev(rt); if (nft_fib6_info_nh_dev_match(nh_dev, dev)) return true; + if (!READ_ONCE(rt->fib6_nsiblings)) + return false; + list_for_each_entry_rcu(iter, &rt->fib6_siblings, fib6_siblings) { nh_dev = fib6_info_nh_dev(iter); From a40aaaef2f8f5a17a779eeac7032f2f7d5322406 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 20 May 2026 10:34:11 +0800 Subject: [PATCH 13/94] selftests: netfilter: add nft_fib_nexthop test Functional coverage of nft_fib6_eval()'s nexthop enumeration over three route shapes: 1) single external nexthop (nhid) 2) external nexthop group (nhid -> group) 3) old-style multipath (nexthop ... nexthop ...) Each scenario places one nexthop on the input device (veth0). For (2) and (3) the matching nexthop is the second member, so the walk has to traverse beyond the primary nh. Two nft counters on prerouting verify the data path: one increments only when fib reports veth0 as the oif, the other counts "missing" results and must stay at zero. ./nft_fib_nexthop.sh PASS: single external nexthop (nhid -> veth0) PASS: nexthop group (dummy0 + veth0) PASS: old-style multipath (sibling on veth0) Suggested-by: Florian Westphal Signed-off-by: Jiayuan Chen Signed-off-by: Florian Westphal --- .../testing/selftests/net/netfilter/Makefile | 1 + .../net/netfilter/nft_fib_nexthop.sh | 152 ++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100755 tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index ee2d1a5254f8..d953ee218c0f 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -26,6 +26,7 @@ TEST_PROGS := \ nft_concat_range.sh \ nft_conntrack_helper.sh \ nft_fib.sh \ + nft_fib_nexthop.sh \ nft_flowtable.sh \ nft_interface_stress.sh \ nft_meta.sh \ diff --git a/tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh b/tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh new file mode 100755 index 000000000000..c4f203057382 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# shellcheck disable=SC2154 +# +# Exercise nft_fib6_eval()'s sibling/nh enumeration on three route shapes: +# 1) route via a single external nexthop (nhid) +# 2) route via an external nexthop group (nhid -> group, two members) +# 3) route via old-style multipath (nexthop ... nexthop ...) +# +# In each scenario the route's nexthop set contains veth0 (the iif of the +# test packet). nft_fib6_info_nh_uses_dev() must walk the set and report +# veth0 as a valid oif. For (2) and (3) the matching nexthop is the second +# member, so the walk has to traverse beyond the primary nh. +# +# After sending $PKTS ICMPv6 echo requests from ns1, check two counters on +# nsrouter: +# nf_ok -- `fib daddr . iif oif eq "veth0"` must equal $PKTS +# nf_bad -- `fib daddr . iif oif missing` must stay at 0 +# Both rules also match on iif veth0 and ip6 daddr dead:dead::/64 so that +# kernel-generated ND/MLD/RA traffic cannot pollute the counters. +# +# Topology similar to nft_fib.sh, without ns2; two dummy interfaces on +# nsrouter host extra nh devices: +# +# dead:1::99 dead:1::1 +# ns1 <----veth----> nsrouter --- dummy0 dead:2::1 +# \-- dummy1 dead:9::1 + +source lib.sh + +ret=0 +PKTS=3 + +checktool "nft --version" "run test without nft" +checktool "ip -V" "run test without iproute2" + +setup_ns nsrouter ns1 +trap cleanup_all_ns EXIT + +if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" \ + > /dev/null 2>&1; then + echo "SKIP: No virtual ethernet pair device support in kernel" + exit $ksft_skip +fi + +ip -net "$ns1" link set lo up +ip -net "$ns1" link set eth0 up +ip -net "$ns1" -6 addr add dead:1::99/64 dev eth0 nodad +ip -net "$ns1" -6 route add default via dead:1::1 + +ip -net "$nsrouter" link set lo up +ip -net "$nsrouter" link set veth0 up +ip -net "$nsrouter" -6 addr add dead:1::1/64 dev veth0 nodad + +if ! ip -net "$nsrouter" link add dummy0 type dummy 2>/dev/null; then + echo "SKIP: dummy netdev not available" + exit $ksft_skip +fi +ip -net "$nsrouter" link set dummy0 up +ip -net "$nsrouter" -6 addr add dead:2::1/64 dev dummy0 nodad + +ip -net "$nsrouter" link add dummy1 type dummy +ip -net "$nsrouter" link set dummy1 up +ip -net "$nsrouter" -6 addr add dead:9::1/64 dev dummy1 nodad + +ip netns exec "$nsrouter" sysctl -q net.ipv6.conf.all.forwarding=1 + +load_fib_rule() { + # filter on iif + daddr so the counters only see our test packets + ip netns exec "$nsrouter" nft -f /dev/stdin <&2 + ip netns exec "$nsrouter" nft list counter ip6 t "$counter" 1>&2 +} + +run_scenario() { + local what="$1"; shift + # counter output format is "packets PACKET_NUM bytes BYTES_NUM"; + # we only care about the packet count + local expect_ok="packets $PKTS bytes" + local expect_bad="packets 0 bytes" + local lret=0 + + # reset route + nexthop state between scenarios + ip -net "$nsrouter" -6 route del dead:dead::/64 > /dev/null 2>&1 || true + ip -net "$nsrouter" nexthop flush > /dev/null 2>&1 || true + + # run the scenario function passed by the caller + "$@" || echo "WARN ($what): scenario setup returned non-zero" + + load_fib_rule || { echo "FAIL ($what): nft load"; ret=1; return; } + + # ping a daddr inside dead:dead::/64 so fib has to walk the nh set + ip netns exec "$ns1" ping -6 -c "$PKTS" -i 0.1 -W 1 dead:dead::1 \ + > /dev/null 2>&1 || true + + # verify the packets went through the expected fib path + if ! ip netns exec "$nsrouter" nft list counter ip6 t nf_ok | grep -q "$expect_ok"; then + bad_counter nf_ok "$expect_ok" "$what" + lret=1 + fi + if ! ip netns exec "$nsrouter" nft list counter ip6 t nf_bad | grep -q "$expect_bad"; then + bad_counter nf_bad "$expect_bad" "$what" + lret=1 + fi + + if [ $lret -eq 0 ]; then + echo "PASS: $what" + else + ret=1 + fi +} + +scenario_single_nh() { + ip -net "$nsrouter" nexthop add id 1 via dead:1::99 dev veth0 + ip -net "$nsrouter" -6 route add dead:dead::/64 nhid 1 +} +run_scenario "single external nexthop (nhid -> veth0)" scenario_single_nh + +scenario_nh_group() { + ip -net "$nsrouter" nexthop add id 1 via dead:2::2 dev dummy0 + ip -net "$nsrouter" nexthop add id 2 via dead:1::99 dev veth0 + ip -net "$nsrouter" nexthop add id 100 group 1/2 + ip -net "$nsrouter" -6 route add dead:dead::/64 nhid 100 +} +run_scenario "nexthop group (dummy0 + veth0)" scenario_nh_group + +scenario_old_multipath() { + ip -net "$nsrouter" -6 route add dead:dead::/64 \ + nexthop via dead:2::2 dev dummy0 \ + nexthop via dead:1::99 dev veth0 +} +run_scenario "old-style multipath (sibling on veth0)" scenario_old_multipath + +exit $ret From 18014147d3ee7831dce53fe65d7fc8d428b02552 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Mon, 11 May 2026 16:37:56 +0200 Subject: [PATCH 14/94] netfilter: nf_tables: fix dst corruption in same register operation For lshift and rshift, the shift operations are performed in a loop over 32-bit words. The loop calculates the shifted value and write it to dst, and then immediately reads from src to calculate the carry for the next iteration. Because src and dst could point to the same memory location, the carry is incorrectly calculated using the newly modified dst value instead of the original src value. Adding a temporary local variable to cache the original value before writing to dst and using it for the carry calculation solves the problem. In addition, partial overlap is rejected from control plane for all kind of operations including byteorder. This was tested with the following bytecode: table test_table ip flags 0 use 1 handle 1 ip test_table test_chain use 3 type filter hook input prio 0 policy accept packets 0 bytes 0 flags 1 ip test_table test_chain 2 [ immediate reg 1 0x44332211 0x88776655 ] [ bitwise reg 1 = ( reg 1 << 0x08000000 ) ] [ cmp eq reg 1 0x66443322 0x00887766 ] [ counter pkts 0 bytes 0 ] ip test_table test_chain 4 3 [ immediate reg 1 0x44332211 0x88776655 ] [ bitwise reg 1 = ( reg 1 << 0x08000000 ) ] [ cmp eq reg 1 0x55443322 0x00887766 ] [ counter pkts 21794 bytes 1917798 ] Fixes: 567d746b55bc ("netfilter: bitwise: add support for shifts.") Acked-by: Jeremy Sowden Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 7 +++++++ net/netfilter/nft_bitwise.c | 18 ++++++++++++++---- net/netfilter/nft_byteorder.c | 13 ++++++++++--- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index cff7b773e972..9d844354c4d9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -180,6 +180,13 @@ static inline u64 nft_reg_load64(const u32 *sreg) return get_unaligned((u64 *)sreg); } +static inline bool nft_reg_overlap(u8 src, u8 dst, u32 len) +{ + unsigned int n = DIV_ROUND_UP(len, sizeof(u32)); + + return src != dst && src < dst + n && dst < src + n; +} + static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 94dccdcfa06b..785b8e9731d1 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -43,8 +43,10 @@ static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src, u32 carry = 0; for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) { - dst[i - 1] = (src[i - 1] << shift) | carry; - carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift); + u32 tmp_src = src[i - 1]; + + dst[i - 1] = (tmp_src << shift) | carry; + carry = tmp_src >> (BITS_PER_TYPE(u32) - shift); } } @@ -56,8 +58,10 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, u32 carry = 0; for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) { - dst[i] = carry | (src[i] >> shift); - carry = src[i] << (BITS_PER_TYPE(u32) - shift); + u32 tmp_src = src[i]; + + dst[i] = carry | (tmp_src >> shift); + carry = tmp_src << (BITS_PER_TYPE(u32) - shift); } } @@ -235,6 +239,9 @@ static int nft_bitwise_init_bool(const struct nft_ctx *ctx, &priv->sreg2, priv->len); if (err < 0) return err; + + if (nft_reg_overlap(priv->sreg2, priv->dreg, priv->len)) + return -EINVAL; } return 0; @@ -265,6 +272,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (nft_reg_overlap(priv->sreg, priv->dreg, priv->len)) + return -EINVAL; + if (tb[NFTA_BITWISE_OP]) { priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); switch (priv->op) { diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index e00dddfa2fc0..2316c77f4228 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -144,9 +144,16 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, if (err < 0) return err; - return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], - &priv->dreg, NULL, NFT_DATA_VALUE, - priv->len); + err = nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); + if (err < 0) + return err; + + if (nft_reg_overlap(priv->sreg, priv->dreg, priv->len)) + return -EINVAL; + + return 0; } static int nft_byteorder_dump(struct sk_buff *skb, From f4feb1e20058e407cb00f45aff47f5b7e19a6bbf Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Wed, 20 May 2026 09:00:21 -0700 Subject: [PATCH 15/94] tun: free page on short-frame rejection in tun_xdp_one() tun_xdp_one() returns -EINVAL on a frame shorter than ETH_HLEN without freeing the page that vhost_net_build_xdp() allocated for it. tun_sendmsg() discards that -EINVAL and still returns total_len, so vhost_tx_batch() takes the success path and never frees the page; each short frame in a batch leaks one page-frag chunk. A local process that can open /dev/net/tun and /dev/vhost-net can hit this path: it attaches a tun/tap device as the vhost-net backend and feeds TX descriptors whose length minus the virtio-net header is below ETH_HLEN. Each kick leaks the page-frag chunks for that batch, and a tight submission loop exhausts host memory and triggers an OOM panic. Free the page before returning -EINVAL, matching the XDP-program error path in the same function. Fixes: 049584807f1d ("tun: add missing verification for short frame") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Reviewed-by: Dongli Zhang Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260520160020.375349-2-bestswngs@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index b183189f1853..f594360d66d6 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2394,8 +2394,10 @@ static int tun_xdp_one(struct tun_struct *tun, bool skb_xdp = false; struct page *page; - if (unlikely(datasize < ETH_HLEN)) + if (unlikely(datasize < ETH_HLEN)) { + put_page(virt_to_head_page(xdp->data)); return -EINVAL; + } xdp_prog = rcu_dereference(tun->xdp_prog); if (xdp_prog) { From 3bcf7aec6a9d16438f2cec29f5d7c8d5b8edf9b2 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Thu, 21 May 2026 09:32:31 -0700 Subject: [PATCH 16/94] tap: free page on error paths in tap_get_user_xdp() tap_get_user_xdp() rejects a frame shorter than ETH_HLEN with -EINVAL, and returns -ENOMEM when build_skb() fails. Both paths jump to the err label without freeing the page that vhost_net_build_xdp() allocated for the frame. tap_sendmsg() discards the per-buffer return value and always returns 0, so vhost_tx_batch() takes the success path and never frees the page; each rejected frame in a batch leaks one page-frag chunk. Free the page on both error paths, before the skb is built. This is the tap counterpart of the same leak in tun_xdp_one(). Fixes: 0efac27791ee ("tap: accept an array of XDP buffs through sendmsg()") Fixes: ed7f2afdd0e0 ("tap: add missing verification for short frame") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Reviewed-by: Dongli Zhang Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260521163230.1478627-2-bestswngs@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/tap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index a590e07ce0a9..fae115915c8e 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1052,6 +1052,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) int err, depth; if (unlikely(xdp->data_end - xdp->data < ETH_HLEN)) { + put_page(virt_to_head_page(xdp->data)); err = -EINVAL; goto err; } @@ -1061,6 +1062,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { + put_page(virt_to_head_page(xdp->data)); err = -ENOMEM; goto err; } From aa8963fdce667a42fb7f0bdd2909fadcab02f9a8 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Thu, 21 May 2026 09:33:13 -0700 Subject: [PATCH 17/94] tun: free page on build_skb failure in tun_xdp_one() When build_skb() fails in tun_xdp_one(), the function sets ret to -ENOMEM and jumps to the out label, which returns without freeing the page that vhost_net_build_xdp() allocated for the frame. As with the short-frame rejection path, tun_sendmsg() discards the per-buffer error and still returns total_len, so vhost_tx_batch() takes the success path and never frees the page. Each build_skb() failure in a batch leaks one page-frag chunk. Free the page before taking the error path, matching the put_page() the other error exits of tun_xdp_one() already perform. Fixes: 043d222f93ab ("tuntap: accept an array of XDP buffs through sendmsg()") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Reviewed-by: Dongli Zhang Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260521163312.1479805-2-bestswngs@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index f594360d66d6..9e7744eb57a3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2439,6 +2439,7 @@ static int tun_xdp_one(struct tun_struct *tun, build: skb = build_skb(xdp->data_hard_start, buflen); if (!skb) { + put_page(virt_to_head_page(xdp->data)); ret = -ENOMEM; goto out; } From aae9d8a5528b8ee9ff8dc5d3558b8a9f852a724a Mon Sep 17 00:00:00 2001 From: Ziyu Zhang Date: Wed, 20 May 2026 00:56:36 +0800 Subject: [PATCH 18/94] vsock: keep poll shutdown state consistent vsock_poll() reads vsk->peer_shutdown before taking the socket lock to set EPOLLHUP and EPOLLRDHUP, then reads it again after taking the lock to report EOF readability. A shutdown packet can update peer_shutdown while poll is waiting for the lock, so one poll invocation can report EOF readability without the corresponding HUP/RDHUP bits. For connectible sockets, take one peer_shutdown snapshot after lock_sock() and use it for all peer-shutdown-derived poll bits. For datagram sockets, which do not take lock_sock() in poll(), take one lockless READ_ONCE() snapshot and pair it with WRITE_ONCE() on the writer side. This keeps the peer-shutdown-derived bits internally consistent for each poll pass. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Ziyu Zhang Link: https://patch.msgid.link/20260519165636.62542-1-ziyuzhang201@gmail.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 49 ++++++++++++++++--------- net/vmw_vsock/hyperv_transport.c | 9 +++-- net/vmw_vsock/virtio_transport_common.c | 14 ++++--- net/vmw_vsock/vmci_transport.c | 8 ++-- 4 files changed, 52 insertions(+), 28 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 44037b066a5f..2ce1063d4a67 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -642,7 +642,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) */ sock_reset_flag(sk, SOCK_DONE); sk->sk_state = TCP_CLOSE; - vsk->peer_shutdown = 0; + WRITE_ONCE(vsk->peer_shutdown, 0); } if (sk->sk_type == SOCK_SEQPACKET) { @@ -933,7 +933,7 @@ static struct sock *__vsock_create(struct net *net, vsk->rejected = false; vsk->sent_request = false; vsk->ignore_connecting_rst = false; - vsk->peer_shutdown = 0; + WRITE_ONCE(vsk->peer_shutdown, 0); INIT_DELAYED_WORK(&vsk->connect_work, vsock_connect_timeout); INIT_DELAYED_WORK(&vsk->pending_work, vsock_pending_work); @@ -1241,6 +1241,25 @@ static int vsock_shutdown(struct socket *sock, int mode) return err; } +static __poll_t vsock_poll_shutdown(struct sock *sk, u32 peer_shutdown) +{ + __poll_t mask = 0; + + /* INET sockets treat local write shutdown and peer write shutdown as a + * case of EPOLLHUP set. + */ + if (sk->sk_shutdown == SHUTDOWN_MASK || + ((sk->sk_shutdown & SEND_SHUTDOWN) && + (peer_shutdown & SEND_SHUTDOWN))) + mask |= EPOLLHUP; + + if (sk->sk_shutdown & RCV_SHUTDOWN || + peer_shutdown & SEND_SHUTDOWN) + mask |= EPOLLRDHUP; + + return mask; +} + static __poll_t vsock_poll(struct file *file, struct socket *sock, poll_table *wait) { @@ -1258,24 +1277,17 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, /* Signify that there has been an error on this socket. */ mask |= EPOLLERR; - /* INET sockets treat local write shutdown and peer write shutdown as a - * case of EPOLLHUP set. - */ - if ((sk->sk_shutdown == SHUTDOWN_MASK) || - ((sk->sk_shutdown & SEND_SHUTDOWN) && - (vsk->peer_shutdown & SEND_SHUTDOWN))) { - mask |= EPOLLHUP; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN || - vsk->peer_shutdown & SEND_SHUTDOWN) { - mask |= EPOLLRDHUP; - } - if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; if (sock->type == SOCK_DGRAM) { + u32 peer_shutdown = READ_ONCE(vsk->peer_shutdown); + + /* DGRAM sockets do not take lock_sock() in poll(), so use one + * lockless snapshot for all shutdown-derived mask bits. + */ + mask |= vsock_poll_shutdown(sk, peer_shutdown); + /* For datagram sockets we can read if there is something in * the queue and write as long as the socket isn't shutdown for * sending. @@ -1290,6 +1302,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, } else if (sock_type_connectible(sk->sk_type)) { const struct vsock_transport *transport; + u32 peer_shutdown; lock_sock(sk); @@ -1322,8 +1335,10 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock, * terminated should also be considered read, and we check the * shutdown flag for that. */ + peer_shutdown = READ_ONCE(vsk->peer_shutdown); + mask |= vsock_poll_shutdown(sk, peer_shutdown); if (sk->sk_shutdown & RCV_SHUTDOWN || - vsk->peer_shutdown & SEND_SHUTDOWN) { + peer_shutdown & SEND_SHUTDOWN) { mask |= EPOLLIN | EPOLLRDNORM; } diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 7a8963595bf9..b3394946b2ed 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -264,7 +264,7 @@ static void hvs_do_close_lock_held(struct vsock_sock *vsk, struct sock *sk = sk_vsock(vsk); sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK); if (vsock_stream_has_data(vsk) <= 0) sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); @@ -593,7 +593,9 @@ static int hvs_update_recv_data(struct hvsock *hvs) return -EIO; if (payload_len == 0) - hvs->vsk->peer_shutdown |= SEND_SHUTDOWN; + WRITE_ONCE(hvs->vsk->peer_shutdown, + READ_ONCE(hvs->vsk->peer_shutdown) | + SEND_SHUTDOWN); hvs->recv_data_len = payload_len; hvs->recv_data_off = 0; @@ -736,7 +738,8 @@ static s64 hvs_stream_has_data(struct vsock_sock *vsk) return ret; return hvs->recv_data_len; case 0: - vsk->peer_shutdown |= SEND_SHUTDOWN; + WRITE_ONCE(vsk->peer_shutdown, + READ_ONCE(vsk->peer_shutdown) | SEND_SHUTDOWN); ret = 0; break; default: /* -1 */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index df3b418e0392..d4d26fba9e37 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1228,7 +1228,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, struct sock *sk = sk_vsock(vsk); sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK); if (vsock_stream_has_data(vsk) <= 0) sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); @@ -1431,12 +1431,15 @@ virtio_transport_recv_connected(struct sock *sk, case VIRTIO_VSOCK_OP_CREDIT_UPDATE: sk->sk_write_space(sk); break; - case VIRTIO_VSOCK_OP_SHUTDOWN: + case VIRTIO_VSOCK_OP_SHUTDOWN: { + u32 peer_shutdown = READ_ONCE(vsk->peer_shutdown); + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) - vsk->peer_shutdown |= RCV_SHUTDOWN; + peer_shutdown |= RCV_SHUTDOWN; if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) - vsk->peer_shutdown |= SEND_SHUTDOWN; - if (vsk->peer_shutdown == SHUTDOWN_MASK) { + peer_shutdown |= SEND_SHUTDOWN; + WRITE_ONCE(vsk->peer_shutdown, peer_shutdown); + if (peer_shutdown == SHUTDOWN_MASK) { if (vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) { (void)virtio_transport_reset(vsk, NULL); virtio_transport_do_close(vsk, true); @@ -1451,6 +1454,7 @@ virtio_transport_recv_connected(struct sock *sk, if (le32_to_cpu(virtio_vsock_hdr(skb)->flags)) sk->sk_state_change(sk); break; + } case VIRTIO_VSOCK_OP_RST: virtio_transport_do_close(vsk, true); break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index d2579380f51e..5c1ecd5bfdbc 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -819,7 +819,7 @@ static void vmci_transport_handle_detach(struct sock *sk) /* On a detach the peer will not be sending or receiving * anymore. */ - vsk->peer_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK); /* We should not be sending anymore since the peer won't be * there to receive, but we can still receive if there is data @@ -1542,7 +1542,9 @@ static int vmci_transport_recv_connected(struct sock *sk, if (pkt->u.mode) { vsk = vsock_sk(sk); - vsk->peer_shutdown |= pkt->u.mode; + WRITE_ONCE(vsk->peer_shutdown, + READ_ONCE(vsk->peer_shutdown) | + pkt->u.mode); sk->sk_state_change(sk); } break; @@ -1559,7 +1561,7 @@ static int vmci_transport_recv_connected(struct sock *sk, * a clean shutdown. */ sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(vsk->peer_shutdown, SHUTDOWN_MASK); if (vsock_stream_has_data(vsk) <= 0) sk->sk_state = TCP_CLOSING; From 70f8592ee90585272018a725054b6eb2ab7e99ca Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 20 May 2026 19:22:35 +0200 Subject: [PATCH 19/94] net: netlink: fix sending unassigned nsid after assigned one If the current skb is not shared, it is re-used directly for all the sockets subscribed to the notification. If we have remote all-nsid socket receiving a message first, then the 'nsid_is_set' will be set to 'true'. If the nsid is NOT_ASSIGNED for the next socket in the list, the 'nsid_is_set' will remain 'true' and the negative value is be delivered to the user space. All subsequent nsid values will be delivered as well, since there is no code path that sets the flag back to 'false'. Fix that by always dropping the flag to 'false' first. Fixes: 7212462fa6fd ("netlink: don't send unknown nsid") Signed-off-by: Ilya Maximets Acked-by: Nicolas Dichtel Link: https://patch.msgid.link/20260520172317.175168-2-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2aeb0680807d..0742e97f256e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1482,6 +1482,7 @@ static void do_one_broadcast(struct sock *sk, p->skb2 = NULL; goto out; } + NETLINK_CB(p->skb2).nsid_is_set = false; NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) NETLINK_CB(p->skb2).nsid_is_set = true; From 88b126b39f9757e9debc322d4679239e9af089c7 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 20 May 2026 19:22:36 +0200 Subject: [PATCH 20/94] net: netlink: don't set nsid on local notifications In most cases, notifications on sockets with NETLINK_LISTEN_ALL_NSID do not contain NSID in their ancillary data in case the event is local to the listener. However, when a self-referential NSID is allocated for a namespace, every local notification starts sending this ID to the user space. This is problematic, because the listener cannot tell if those notifications are local or not anymore without making extra requests to figure out if the provided NSID is local or not. The listener can also not figure out the local NSID beforehand as it can be allocated at any point in time by other processes, changing the structure of the future notifications for everyone. The value is practically not useful, since it's the namespace's own ID that the application has to obtain from other sources in order to figure out if it's the same or not. So, for the application it's just an extra busy work with no benefits. Moreover, applications that do not know about this quirk may be mishandling notifications with NSID set as notifications from remote namespaces. This is the case for ovs-vswitchd and the iproute2's 'ip monitor' that stops printing 'current' and starts printing the nsid number mid-session. Lack of clear documentation for this behavior is also not helping. A search though open-source projects doesn't reveal any projects that use NETNSA_NSID_NOT_ASSIGNED and rely on metadata to contain self-referential NSIDs (expected, since the value is not useful). Quite the opposite, as already mentioned, there are few applications that rely on NSID to not be present in local events. Since the value is not useful and actively harmful in some cases, let's not report it for local events, making the notifications more consistent. Also adding some blank lines for readability. Fixes: 59324cf35aba ("netlink: allow to listen "all" netns") Reported-by: Matteo Perin Signed-off-by: Ilya Maximets Acked-by: Nicolas Dichtel Link: https://patch.msgid.link/20260520172317.175168-3-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 0742e97f256e..7269e23b578d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1482,10 +1482,14 @@ static void do_one_broadcast(struct sock *sk, p->skb2 = NULL; goto out; } + NETLINK_CB(p->skb2).nsid_is_set = false; - NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); - if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) - NETLINK_CB(p->skb2).nsid_is_set = true; + if (!net_eq(sock_net(sk), p->net)) { + NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); + if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) + NETLINK_CB(p->skb2).nsid_is_set = true; + } + val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); From 2e43b64248909c617281921d6d9ba3bfc0159473 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 20 May 2026 19:22:38 +0200 Subject: [PATCH 21/94] selftests: net: add a test case for nsid in all nsid notifications The test subscribes to link events from all namespaces and makes sure that local events do not carry NSID in their ancillary data (even if there is a self-referential NSID allocated for the local namespace), and remote events do. Signed-off-by: Ilya Maximets Acked-by: Nicolas Dichtel Link: https://patch.msgid.link/20260520172317.175168-5-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/link_netns.py | 61 ++++++++++++++++++++++- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/link_netns.py b/tools/testing/selftests/net/link_netns.py index aab043c59d69..6d1f863b6262 100755 --- a/tools/testing/selftests/net/link_netns.py +++ b/tools/testing/selftests/net/link_netns.py @@ -3,13 +3,14 @@ import time -from lib.py import ksft_run, ksft_exit, ksft_true +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_true from lib.py import ip from lib.py import NetNS, NetNSEnter from lib.py import RtnlFamily LINK_NETNSID = 100 +LINK_NETNSID2 = 200 def test_event() -> None: @@ -32,6 +33,57 @@ def test_event() -> None: "Received unexpected link notification") +def test_event_all_nsid() -> None: + """NETLINK_LISTEN_ALL_NSID notifications: local events must not + carry nsid even with a self-referential mapping. Remote events + must carry the correct nsid.""" + + with NetNS() as ns1, NetNS() as ns2: + net1, net2 = str(ns1), str(ns2) + + with NetNSEnter(net1): + rtnl = RtnlFamily() + rtnl.ntf_listen_all_nsid() + rtnl.ntf_subscribe("rtnlgrp-link") + + # Case 1: no nsid assigned, local event, no nsid expected. + ip("link add dummy-lo type dummy", ns=net1) + + # Case 2: self-referential nsid, local event, still no nsid. + ip(f"netns set {net1} {LINK_NETNSID}", ns=net1) + ip("link add dummy-sr type dummy", ns=net1) + + # Case 3: remote event, nsid present. + ip(f"netns set {net2} {LINK_NETNSID2}", ns=net1) + ip("link add dummy-re type dummy", ns=net2) + + # Collect the three newlink events, ignoring unrelated noise. + events = {} + for msg in rtnl.poll_ntf(duration=1): + if msg['name'] == 'getlink': + ifname = msg['msg'].get('ifname') + if ifname in ('dummy-lo', 'dummy-sr', 'dummy-re'): + events[ifname] = msg + if len(events) == 3: + break + + ksft_true('dummy-lo' in events, "missing local event") + ksft_true(events['dummy-lo'].get('nsid') is None, + "local event without nsid should not carry nsid") + + ksft_true('dummy-sr' in events, "missing self-ref event") + ksft_true(events['dummy-sr'].get('nsid') is None, + "local event with self-ref nsid should not carry nsid") + + ksft_true('dummy-re' in events, "missing remote event") + ksft_eq(events['dummy-re'].get('nsid'), LINK_NETNSID2, + "remote event should carry nsid") + + ip("link del dummy-lo", ns=net1) + ip("link del dummy-sr", ns=net1) + ip("link del dummy-re", ns=net2) + + def validate_link_netns(netns, ifname, link_netnsid) -> bool: link_info = ip(f"-d link show dev {ifname}", ns=netns, json=True) if not link_info: @@ -133,7 +185,12 @@ def test_peer_net() -> None: def main() -> None: - ksft_run([test_event, test_link_net, test_peer_net]) + ksft_run([ + test_event, + test_event_all_nsid, + test_link_net, + test_peer_net, + ]) ksft_exit() From 9e4389b0038781f19f97895186ed941ff8ac1678 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 21 May 2026 16:56:39 +0200 Subject: [PATCH 22/94] net/smc: Do not re-initialize smc hashtables INIT_HLIST_HEAD(&smc_v*_hashinfo.ht) are called after smc_nl_init(), proto_register() and sock_register(). This can lead to smc_v*_hashinfo.ht being reset even though hash entries already exist and are being used, possibly resulting in a corrupted list. Remove unnecessary and dangerous re-initialisation of smc_v*_hashinfo.ht in smc_init(); it is implicitly initialised to zero anyhow. Add HLIST_HEAD_INIT to the definitions for clarity. Fixes: f16a7dd5cf27 ("smc: netlink interface for SMC sockets") Suggested-by: Halil Pasic Signed-off-by: Alexandra Winter Acked-by: Halil Pasic Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20260521145639.10317-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index dffbd529762d..b5db69073e20 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -188,10 +188,12 @@ static bool smc_hs_congested(const struct sock *sk) struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), + .ht = HLIST_HEAD_INIT, }; struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), + .ht = HLIST_HEAD_INIT, }; int smc_hash_sk(struct sock *sk) @@ -3517,8 +3519,6 @@ static int __init smc_init(void) pr_err("%s: sock_register fails with %d\n", __func__, rc); goto out_proto6; } - INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); - INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { From 3589d20a666caf30ad100c960a2de7de390fce88 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 21 May 2026 07:11:45 -0700 Subject: [PATCH 23/94] net/iucv: fix locking in .getsockopt Mirror iucv_sock_setsockopt() and wrap the whole switch in lock_sock()/release_sock(). The pre-existing SO_MSGLIMIT-only lock becomes redundant and is removed. Any AF_IUCV HIPER user can potentially crash the kernel by racing recvmsg() with getsockopt(SO_MSGSIZE): the SO_MSGSIZE arm dereferences iucv->hs_dev->mtu after iucv_sock_close() (called from the racing recvmsg()) has set hs_dev to NULL, producing a NULL pointer dereference oops. Suggested-by: Stanislav Fomichev Fixes: 51363b8751a6 ("af_iucv: allow retrieval of maximum message size") Signed-off-by: Breno Leitao Reviewed-by: Alexandra Winter Tested-by: Alexandra Winter Link: https://patch.msgid.link/20260521-af_iucv_fix2-v1-1-f16b1c510aa9@debian.org Signed-off-by: Jakub Kicinski --- net/iucv/af_iucv.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 72dfccd4e3d5..c2dc3338670e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1540,7 +1540,7 @@ static int iucv_sock_getsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); unsigned int val; - int len; + int len, rc; if (level != SOL_IUCV) return -ENOPROTOOPT; @@ -1553,26 +1553,34 @@ static int iucv_sock_getsockopt(struct socket *sock, int level, int optname, len = min_t(unsigned int, len, sizeof(int)); + rc = 0; + + lock_sock(sk); switch (optname) { case SO_IPRMDATA_MSG: val = (iucv->flags & IUCV_IPRMDATA) ? 1 : 0; break; case SO_MSGLIMIT: - lock_sock(sk); val = (iucv->path != NULL) ? iucv->path->msglim /* connected */ : iucv->msglimit; /* default */ - release_sock(sk); break; case SO_MSGSIZE: - if (sk->sk_state == IUCV_OPEN) - return -EBADFD; + if (sk->sk_state == IUCV_OPEN) { + rc = -EBADFD; + break; + } val = (iucv->hs_dev) ? iucv->hs_dev->mtu - sizeof(struct af_iucv_trans_hdr) - ETH_HLEN : 0x7fffffff; break; default: - return -ENOPROTOOPT; + rc = -ENOPROTOOPT; + break; } + release_sock(sk); + + if (rc) + return rc; if (put_user(len, optlen)) return -EFAULT; From 4157501b9a8ff1bbe32ff5a7d8aece7ab18eff40 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 21 May 2026 14:47:32 +0200 Subject: [PATCH 24/94] vsock/virtio: fix skb overhead overflow on 32-bit builds On 32-bit architectures, both skb_queue_len() and SKB_TRUESIZE(0) evaluate to 32-bit values. The multiplication can overflow before being assigned to the u64 skb_overhead variable, making the skb overhead check ineffective. Cast skb_queue_len() to u64 so the multiplication is always performed in 64-bit arithmetic. This issue was reported by Sashiko while reviewing another patch. Fixes: 059b7dbd20a6 ("vsock/virtio: fix potential unbounded skb queue") Closes: https://sashiko.dev/#/patchset/20260518090656.134588-1-sgarzare%40redhat.com Cc: stable@vger.kernel.org Signed-off-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20260521124732.125771-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index d4d26fba9e37..b143290a311d 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -417,7 +417,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { - u64 skb_overhead = (skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0); + u64 skb_overhead = ((u64)skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0); /* Allow at most buf_alloc * 2 total budget (payload + overhead), * similar to how SO_RCVBUF is doubled to reserve space for sk_buff From 87a1e0fe7776da7ab411be332b4be58ac8840d10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 May 2026 12:21:47 +0000 Subject: [PATCH 25/94] ipv4: free net->ipv4.sysctl_local_reserved_ports after unregister_net_sysctl_table() ipv4_sysctl_exit_net() is currently freeing net->ipv4.sysctl_local_reserved_ports too soon. Only after unregister_net_sysctl_table() we can be sure no threads can possibly use the sysctls, including /proc/sys/net/ipv4/ip_local_reserved_ports. Fixes: 122ff243f5f1 ("ipv4: make ip_local_reserved_ports per netns") Reported-by: Ji'an Zhou Signed-off-by: Eric Dumazet Cc: Cong Wang Reviewed-by: Jason Xing Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260521122147.3584624-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/sysctl_net_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d8bdb1bdbff1..c0e85cc171ae 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1705,10 +1705,10 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) { const struct ctl_table *table; - kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); + kfree(net->ipv4.sysctl_local_reserved_ports); } static __net_initdata struct pernet_operations ipv4_sysctl_ops = { From 86f1d0f063e423a5c1982db1e5e7a8eac511e603 Mon Sep 17 00:00:00 2001 From: Prathamesh Deshpande Date: Wed, 6 May 2026 01:00:31 +0100 Subject: [PATCH 26/94] net/mlx5: HWS: Reject unsupported remove-header action mlx5_cmd_hws_packet_reformat_alloc() handles MLX5_REFORMAT_TYPE_REMOVE_HDR by looking up a matching HWS remove-header action. If mlx5_fs_get_action_remove_header_vlan() returns NULL, the code only logs an error and continues. The function then returns success with a NULL HWS action stored in the packet-reformat object. Return an error when no matching remove-header action is available. Fixes: aecd9d1020e3 ("net/mlx5: fs, add HWS packet reformat API function") Signed-off-by: Prathamesh Deshpande Reviewed-by: Simon Horman Reviewed-by: Yevgeny Kliteynik Acked-by: Tariq Toukan Link: https://patch.msgid.link/20260506000054.51797-1-prathameshdeshpande7@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c index aca77853abb8..5a172c572a68 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c @@ -1320,8 +1320,10 @@ mlx5_cmd_hws_packet_reformat_alloc(struct mlx5_flow_root_namespace *ns, break; case MLX5_REFORMAT_TYPE_REMOVE_HDR: hws_action = mlx5_fs_get_action_remove_header_vlan(fs_ctx, params); - if (!hws_action) + if (!hws_action) { mlx5_core_err(dev, "Only vlan remove header supported\n"); + return -EOPNOTSUPP; + } break; default: mlx5_core_err(ns->dev, "Packet-reformat not supported(%d)\n", From f7b52afe3592eae66e160586b45a3f2242972c63 Mon Sep 17 00:00:00 2001 From: Zhengchuan Liang Date: Fri, 22 May 2026 17:42:26 +0800 Subject: [PATCH 27/94] ipv6: exthdrs: refresh nh after handling HAO option ip6_parse_tlv() caches skb_network_header(skb) in nh while walking IPv6 TLVs. ipv6_dest_hao() may call pskb_expand_head() for a cloned skb, which can move the skb head and invalidate the cached network header pointer. Refresh nh after ipv6_dest_hao() returns so any trailing padding or TLVs are parsed from the current skb head. This matches the existing pattern used in ip6_parse_tlv() after helpers that can modify skb header storage. Fixes: a831f5bbc89a ("[IPV6] MIP6: Add inbound interface of home address option.") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Xin Liu Co-developed-by: Luxing Yin Signed-off-by: Luxing Yin Signed-off-by: Zhengchuan Liang Signed-off-by: Ren Wei Reviewed-by: Justin Iurman Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/7aba1debc2196189172499e5769802b026f8caf8.1779247873.git.zcliangcn@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/exthdrs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index cf90f933ca1a..6d92c02d0e3d 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -201,6 +201,8 @@ static bool ip6_parse_tlv(bool hopbyhop, case IPV6_TLV_HAO: if (!ipv6_dest_hao(skb, off)) return false; + + nh = skb_network_header(skb); break; #endif default: From d47548a36639095939f4747d4c43f2271366f565 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Fri, 22 May 2026 13:20:13 +0200 Subject: [PATCH 28/94] ipv6: exthdrs: refresh nh pointer after ipv6_hop_jumbo() ipv6_hop_jumbo() calls pskb_trim_rcsum(), which can change skb pointers. Let's recompute nh pointer to make sure any change won't mess things up. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Justin Iurman Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260522112013.12342-1-justin.iurman@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/exthdrs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6d92c02d0e3d..aca2a2abd2df 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -184,6 +184,8 @@ static bool ip6_parse_tlv(bool hopbyhop, case IPV6_TLV_JUMBO: if (!ipv6_hop_jumbo(skb, off)) return false; + + nh = skb_network_header(skb); break; case IPV6_TLV_CALIPSO: if (!ipv6_hop_calipso(skb, off)) From e68842b3356471ba56c882209f324613dac47f64 Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Wed, 20 May 2026 11:47:55 +0800 Subject: [PATCH 29/94] macsec: fix replay protection at XPN lower-PN wrap In macsec_post_decrypt(), when pn is U32_MAX, pn + 1 overflows u32 to 0 and the first branch never fires. If next_pn_halves.lower is also in the upper half, pn_same_half(pn, lower) is true and the XPN else-if does not fire either, leaving next_pn_halves unchanged. An attacker that captures the legitimate frame carrying pn == 0xFFFFFFFF on an XPN association can then replay it indefinitely, since lowest_pn never rises above the captured pn and macsec_decrypt() reconstructs the same IV. Extend the XPN else-if to also fire when pn + 1 wraps to 0, so receipt of pn == U32_MAX advances next_pn_halves to (upper + 1, 0). Fixes: a21ecf0e0338 ("macsec: Support XPN frame handling - IEEE 802.1AEbw") Reported-by: Yuhao Jiang Cc: stable@vger.kernel.org Signed-off-by: Junrui Luo Link: https://patch.msgid.link/SYBPR01MB78813FD49E58F253B989F197AF012@SYBPR01MB7881.ausprd01.prod.outlook.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index f904f4d16b45..fb009120a924 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -808,7 +808,8 @@ static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u if (pn + 1 > rx_sa->next_pn_halves.lower) { rx_sa->next_pn_halves.lower = pn + 1; } else if (secy->xpn && - !pn_same_half(pn, rx_sa->next_pn_halves.lower)) { + (pn + 1 == 0 || + !pn_same_half(pn, rx_sa->next_pn_halves.lower))) { rx_sa->next_pn_halves.upper++; rx_sa->next_pn_halves.lower = pn + 1; } From 2156a29aecfffa2eb7c558255690084efbe9f3b0 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Wed, 20 May 2026 11:41:57 -0400 Subject: [PATCH 30/94] octeontx2-af: validate body pcifunc in rvu_mbox_handler_rep_event_notify rvu_mbox_handler_rep_event_notify() in drivers/net/ethernet/marvell/ octeontx2/af/rvu_rep.c queues a sender-controlled REP_EVENT_NOTIFY request body verbatim, and rvu_rep_up_notify() then forwards event->pcifunc (the nested body field, distinct from the AF-normalised header pcifunc) into rvu_get_pfvf(), rvu_get_pf() and the AF->PF mailbox device index without any bounds check. A VF attached to a PF that has been put into switchdev representor mode reaches this path: the VF mailbox handler otx2_pfvf_mbox_handler() forwards every message id including MBOX_MSG_REP_EVENT_NOTIFY to AF without an allowlist, and the AF dispatcher rewrites only msg->pcifunc, leaving struct rep_event::pcifunc attacker-controlled. The sibling rvu_mbox_handler_esw_cfg() refuses requests whose header pcifunc is not rvu->rep_pcifunc; this handler has no equivalent gate. An out-of-range body pcifunc selects an &rvu->pf[]/&rvu->hwvf[] element past the allocated array and, for RVU_EVENT_MAC_ADDR_CHANGE, turns into a six-byte attacker-chosen OOB ether_addr_copy() target inside the queued worker; KASAN reports a slab-out-of-bounds write in rvu_rep_wq_handler. Reject malformed requests at the handler entry by gating on is_pf_func_valid(), which is already the canonical PF/VF range check in this driver; expose it via rvu.h so callers in rvu_rep.c can use it instead of open-coding the same range arithmetic. Fixes: b8fea84a0468 ("octeontx2-pf: Add support to sync link state between representor and VFs") Cc: stable@vger.kernel.org Signed-off-by: Michael Bommarito Link: https://patch.msgid.link/20260520154157.1439319-1-michael.bommarito@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 2 +- drivers/net/ethernet/marvell/octeontx2/af/rvu.h | 1 + drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index e40b79076358..3cf131508ecf 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -436,7 +436,7 @@ struct rvu_pfvf *rvu_get_pfvf(struct rvu *rvu, int pcifunc) return &rvu->pf[rvu_get_pf(rvu->pdev, pcifunc)]; } -static bool is_pf_func_valid(struct rvu *rvu, u16 pcifunc) +bool is_pf_func_valid(struct rvu *rvu, u16 pcifunc) { int pf, vf, nvfs; u64 cfg; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index a466181cf908..de3fbd3d15d6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -917,6 +917,7 @@ u16 rvu_get_rsrc_mapcount(struct rvu_pfvf *pfvf, int blkaddr); struct rvu_pfvf *rvu_get_pfvf(struct rvu *rvu, int pcifunc); void rvu_get_pf_numvfs(struct rvu *rvu, int pf, int *numvfs, int *hwvf); bool is_block_implemented(struct rvu_hwinfo *hw, int blkaddr); +bool is_pf_func_valid(struct rvu *rvu, u16 pcifunc); bool is_pffunc_map_valid(struct rvu *rvu, u16 pcifunc, int blktype); int rvu_get_lf(struct rvu *rvu, struct rvu_block *block, u16 pcifunc, u16 slot); int rvu_lf_reset(struct rvu *rvu, struct rvu_block *block, int lf); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c index 901f6fd40fd4..a2781e0f504e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c @@ -97,6 +97,14 @@ int rvu_mbox_handler_rep_event_notify(struct rvu *rvu, struct rep_event *req, { struct rep_evtq_ent *qentry; + /* The mailbox dispatcher normalises only the header pcifunc; the + * nested struct rep_event::pcifunc body field is sender-controlled + * and is later used by rvu_rep_up_notify() to index rvu->pf[] / + * rvu->hwvf[]. Reject out-of-range body selectors before queueing. + */ + if (!is_pf_func_valid(rvu, req->pcifunc)) + return -EINVAL; + qentry = kmalloc_obj(*qentry, GFP_ATOMIC); if (!qentry) return -ENOMEM; From f229426072fc865654a60978bb7fda790a051ff3 Mon Sep 17 00:00:00 2001 From: Luka Gejak Date: Sat, 23 May 2026 15:03:30 +0200 Subject: [PATCH 31/94] net: hsr: fix potential OOB access in supervision frame handling Ensure the entire TLV header is linearized before access by adding sizeof(struct hsr_sup_tlv) to the pskb_may_pull() calls. Without this, a truncated frame could cause an out-of-bounds access. Fixes: eafaa88b3eb7 ("net: hsr: Add support for redbox supervision frames") Signed-off-by: Luka Gejak Reviewed-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20260523130330.61880-1-luka.gejak@linux.dev Signed-off-by: Jakub Kicinski --- net/hsr/hsr_forward.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 0aca859c88cb..f669a226d728 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -84,7 +84,7 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) /* Get next tlv */ total_length += hsr_sup_tag->tlv.HSR_TLV_length; - if (!pskb_may_pull(skb, total_length)) + if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_tlv))) return false; skb_pull(skb, total_length); hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; @@ -100,7 +100,7 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) /* make sure another tlv follows */ total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tlv->HSR_TLV_length; - if (!pskb_may_pull(skb, total_length)) + if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_tlv))) return false; /* get next tlv */ From 25fe708bbc59289d3d1ea4b126fbc1b460a072a5 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Thu, 21 May 2026 01:12:01 -0700 Subject: [PATCH 32/94] net: team: fix NULL pointer dereference in team_xmit during mode change __team_change_mode() clears team->ops with memset() before restoring safe dummy handlers via team_adjust_ops(). A concurrent team_xmit() running under RCU on another CPU can read team->ops.transmit during this window and call a NULL function pointer, crashing the kernel. The race requires a mode change (CAP_NET_ADMIN) concurrent with transmit on the team device. BUG: kernel NULL pointer dereference, address: 0000000000000000 Oops: 0010 [#1] SMP KASAN NOPTI RIP: 0010:0x0 Call Trace: team_xmit (drivers/net/team/team_core.c:1853) dev_hard_start_xmit (net/core/dev.c:3904) __dev_queue_xmit (net/core/dev.c:4871) packet_sendmsg (net/packet/af_packet.c:3109) __sys_sendto (net/socket.c:2265) The original code assumed that no ports means no traffic, so mode changes could freely memset()/memcpy() the ops. AF_PACKET with forced carrier breaks that assumption. Prevent the race instead of making it safe: replace memset()/memcpy() with per-field updates that never touch transmit or receive. Those two handlers are managed solely by team_adjust_ops(), which already installs dummies when tx_en_port_count == 0 (always true during mode change since no ports are present). WRITE_ONCE/READ_ONCE prevent store/load tearing on the handler pointers. synchronize_net() before exit_op() drains in-flight readers that may still reference old mode state from before port removal switched the handlers to dummies. Fixes: 3d249d4ca7d0 ("net: introduce ethernet teaming device") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260521081159.1491563-3-bestswngs@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/team/team_core.c | 45 +++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index 0c87f9972457..f51388d50307 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -534,21 +534,23 @@ static void team_adjust_ops(struct team *team) if (!team->tx_en_port_count || !team_is_mode_set(team) || !team->mode->ops->transmit) - team->ops.transmit = team_dummy_transmit; + WRITE_ONCE(team->ops.transmit, team_dummy_transmit); else - team->ops.transmit = team->mode->ops->transmit; + WRITE_ONCE(team->ops.transmit, team->mode->ops->transmit); if (!team->rx_en_port_count || !team_is_mode_set(team) || !team->mode->ops->receive) - team->ops.receive = team_dummy_receive; + WRITE_ONCE(team->ops.receive, team_dummy_receive); else - team->ops.receive = team->mode->ops->receive; + WRITE_ONCE(team->ops.receive, team->mode->ops->receive); } /* - * We can benefit from the fact that it's ensured no port is present - * at the time of mode change. Therefore no packets are in fly so there's no - * need to set mode operations in any special way. + * team_change_mode() ensures no ports are present during mode change, + * but lockless readers can still reach team_xmit(). Avoid touching + * transmit/receive -- they are already set to dummies by + * team_adjust_ops() since no ports are enabled. synchronize_net() + * drains in-flight readers before destroying old mode state. */ static int __team_change_mode(struct team *team, const struct team_mode *new_mode) @@ -557,9 +559,21 @@ static int __team_change_mode(struct team *team, if (team_is_mode_set(team)) { void (*exit_op)(struct team *team) = team->ops.exit; - /* Clear ops area so no callback is called any longer */ - memset(&team->ops, 0, sizeof(struct team_mode_ops)); - team_adjust_ops(team); + /* Clear cold-path ops used only under RTNL. transmit and + * receive are already dummies (no ports) so leave them + * alone -- overwriting them is the source of the race. + */ + team->ops.init = NULL; + team->ops.exit = NULL; + team->ops.port_enter = NULL; + team->ops.port_leave = NULL; + team->ops.port_change_dev_addr = NULL; + team->ops.port_tx_disabled = NULL; + + /* Wait for in-flight readers before tearing down mode + * state they may reference. + */ + synchronize_net(); if (exit_op) exit_op(team); @@ -582,7 +596,12 @@ static int __team_change_mode(struct team *team, } team->mode = new_mode; - memcpy(&team->ops, new_mode->ops, sizeof(struct team_mode_ops)); + team->ops.init = new_mode->ops->init; + team->ops.exit = new_mode->ops->exit; + team->ops.port_enter = new_mode->ops->port_enter; + team->ops.port_leave = new_mode->ops->port_leave; + team->ops.port_change_dev_addr = new_mode->ops->port_change_dev_addr; + team->ops.port_tx_disabled = new_mode->ops->port_tx_disabled; team_adjust_ops(team); return 0; @@ -743,7 +762,7 @@ static rx_handler_result_t team_handle_frame(struct sk_buff **pskb) /* allow exact match delivery for disabled ports */ res = RX_HANDLER_EXACT; } else { - res = team->ops.receive(team, port, skb); + res = READ_ONCE(team->ops.receive)(team, port, skb); } if (res == RX_HANDLER_ANOTHER) { struct team_pcpu_stats *pcpu_stats; @@ -1845,7 +1864,7 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) tx_success = team_queue_override_transmit(team, skb); if (!tx_success) - tx_success = team->ops.transmit(team, skb); + tx_success = READ_ONCE(team->ops.transmit)(team, skb); if (tx_success) { struct team_pcpu_stats *pcpu_stats; From 11b326fb0a374f4654f9be22d0f0f7abd9f7d3fe Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 21 May 2026 21:05:54 +0800 Subject: [PATCH 33/94] ip6: vti: Use ip6_tnl.net in vti6_changelink(). ip netns add ns1 ip netns add ns2 ip -n ns1 link add vti6_test type vti6 remote ::1 local ::2 key 7 ip -n ns1 link set vti6_test netns ns2 ip -n ns2 link set vti6_test type vti6 remote ::3 local ::4 key 9 ip netns del ns2 ip netns del ns1 [ 132.495484] ------------[ cut here ]------------ [ 132.497609] kernel BUG at net/core/dev.c:12376! Commit 61220ab34948 ("vti6: Enable namespace changing") dropped NETIF_F_NETNS_LOCAL from vti6 devices. A vti6 tunnel can then move through IFLA_NET_NS_FD. After the move dev_net(dev) points at the new netns while t->net stays at the creation netns. vti6_changelink() and vti6_update() still use dev_net(dev) and dev_net(t->dev). They unlink from one per netns hash and relink into another. The creation netns is left with a stale entry. cleanup_net() of that netns later walks freed memory. Reachable from an unprivileged user namespace (unshare --user --map-root-user --net). Cross tenant scope on container hosts. Fixes: 61220ab34948 ("vti6: Enable namespace changing") Reported-by: Maoyi Xie Reviewed-by: Eric Dumazet Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260521130555.3421684-2-maoyixie.tju@gmail.com Signed-off-by: Paolo Abeni --- net/ipv6/ip6_vti.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ad5290be4dd6..dcb257411d6e 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -722,10 +722,11 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, bool keep_mtu) { - struct net *net = dev_net(t->dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct net *net = t->net; + struct vti6_net *ip6n; int err; + ip6n = net_generic(net, vti6_net_id); vti6_tnl_unlink(ip6n, t); synchronize_net(); err = vti6_tnl_change(t, p, keep_mtu); @@ -1031,11 +1032,12 @@ static int vti6_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip6_tnl *t; + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = t->net; struct __ip6_tnl_parm p; - struct net *net = dev_net(dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n; + ip6n = net_generic(net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) return -EINVAL; From 8b484efd5cb4eeef9021a661e198edc5349dacf6 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Thu, 21 May 2026 21:05:55 +0800 Subject: [PATCH 34/94] ip6: vti: Use ip6_tnl.net in vti6_siocdevprivate(). After patch 1/2 in this series, vti6_update() unlinks and relinks the tunnel through t->net. vti6_siocdevprivate() still uses dev_net(dev) for the collision lookup. For a tunnel moved through IFLA_NET_NS_FD, dev_net(dev) is the new netns, not t->net. SIOCCHGTUNNEL on a migrated tunnel then runs: net = dev_net(dev) /* migrated netns */ t = vti6_locate(net, &p1, false) /* misses target in t->net */ ... t = netdev_priv(dev) vti6_update(t, &p1, false) /* mutates t->net's hash */ A caller in the migrated netns picks params that match a tunnel in the creation netns. The lookup in dev_net(dev) finds nothing. vti6_update() prepends the migrated tunnel at the head of the creation netns hash bucket for those params. Later lookups in the creation netns resolve to the migrated device. xfrm receive delivers the matched packets through a device the caller controls. Reachable from an unprivileged user namespace (unshare --user --map-root-user --net). Cross tenant scope on container hosts. Switch the SIOCCHGTUNNEL path on a non fallback device to use t->net for the lookup. The lookup now matches the netns vti6_update() operates on. Also add ns_capable(self->net->user_ns, CAP_NET_ADMIN) before the lookup. The check at the top of the case is against dev_net(dev)->user_ns, which after migration is the attacker's netns. A caller there can pick params absent from self->net, the lookup returns NULL, t becomes self, and vti6_update() inserts the device into the creation netns hash. The new check requires CAP_NET_ADMIN in the creation netns user_ns too. SIOCADDTUNNEL and SIOCCHGTUNNEL on the fallback device keep dev_net(dev), which equals init_net there. Fixes: 61220ab34948 ("vti6: Enable namespace changing") Suggested-by: Jakub Kicinski Suggested-by: Xiao Liang Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260521130555.3421684-3-maoyixie.tju@gmail.com Signed-off-by: Paolo Abeni --- net/ipv6/ip6_vti.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index dcb257411d6e..df793c8bfffb 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -835,17 +835,24 @@ vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data if (p.proto != IPPROTO_IPV6 && p.proto != 0) break; vti6_parm_from_user(&p1, &p); - t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { + struct ip6_tnl *self = netdev_priv(dev); + + err = -EPERM; + if (!ns_capable(self->net->user_ns, CAP_NET_ADMIN)) + break; + t = vti6_locate(self->net, &p1, false); if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else - t = netdev_priv(dev); + t = self; err = vti6_update(t, &p1, false); + } else { + t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); } if (t) { err = 0; From 2e357f002c61fd76fd8f12468744a06a5ec48eaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 22 May 2026 14:06:40 +0200 Subject: [PATCH 35/94] net: Avoid checksumming unreadable skb tail on trim MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pskb_trim_rcsum_slow() keeps CHECKSUM_COMPLETE valid by subtracting the checksum of the bytes removed from the skb tail. That assumes the removed bytes can be read. io_uring zcrx skbs may contain unreadable net_iov frags. With fbnic header/data split, small TCP/IPv4 packets can carry Ethernet padding in such a frag. ip_rcv_core() trims the skb to iph->tot_len before TCP sees it, and the CHECKSUM_COMPLETE adjustment then calls skb_checksum() on the padding. This is exposed by IPv4 because small TCP/IPv4 frames can be shorter than the Ethernet minimum payload. TCP/IPv6 frames are large enough in the normal zcrx path, so they do not hit the same padding trim. Keep the existing checksum adjustment for readable skbs. If the remaining packet is fully linear, drop CHECKSUM_COMPLETE and let the stack validate the packet after trimming. If unreadable payload would remain, fail the trim; the checksum cannot be adjusted without reading the trimmed tail. Also clear skb->unreadable when trimming removes all frags. Fixes: 65249feb6b3d ("net: add support for skbs with unreadable frags") Signed-off-by: Björn Töpel Reviewed-by: Breno Leitao Link: https://patch.msgid.link/20260522120643.242974-1-bjorn@kernel.org Signed-off-by: Paolo Abeni --- net/core/skbuff.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 44ac121cfccb..d247acd447e4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2787,6 +2787,8 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) skb->data_len = 0; skb_set_tail_pointer(skb, len); } + if (!skb_shinfo(skb)->nr_frags && !skb_has_frag_list(skb)) + skb->unreadable = 0; if (!skb->sk || skb->destructor == sock_edemux) skb_condense(skb); @@ -2794,16 +2796,37 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) } EXPORT_SYMBOL(___pskb_trim); +static int pskb_trim_rcsum_complete(struct sk_buff *skb, unsigned int len) +{ + int delta = skb->len - len; + + if (skb_frags_readable(skb)) { + skb->csum = csum_block_sub(skb->csum, + skb_checksum(skb, len, delta, 0), + len); + return 0; + } + + if (len > skb_headlen(skb)) + return -EFAULT; + + /* The trimmed bytes are unreadable, but the remaining packet can be + * checksummed by software after trimming. + */ + skb->ip_summed = CHECKSUM_NONE; + return 0; +} + /* Note : use pskb_trim_rcsum() instead of calling this directly */ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) { - int delta = skb->len - len; + int err; - skb->csum = csum_block_sub(skb->csum, - skb_checksum(skb, len, delta, 0), - len); + err = pskb_trim_rcsum_complete(skb, len); + if (err) + return err; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; int offset = skb_checksum_start_offset(skb) + skb->csum_offset; From c75b6f6eaacd0b74b832414cc3b9289c3686e941 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:06:42 -0700 Subject: [PATCH 36/94] ethtool: rss: avoid modifying the RSS context response Gemini says that we're modifying the RSS_CREATE response skb. I think it's right, the comment says that unicast() should unshare the skb but I'm not entirely sure what I meant there. netlink_trim() does a copy but only if skb is not well sized (it's at least 2x larger than necessary for the payload). Fixes: a166ab7816c5 ("ethtool: rss: support creating contexts via Netlink") Link: https://patch.msgid.link/20260522230647.1705600-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/rss.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 353110b862ab..8ffec9785efa 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -981,11 +981,17 @@ ethnl_rss_create_validate(struct net_device *dev, struct genl_info *info) } static void -ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev) +ethnl_rss_create_send_ntf(const struct sk_buff *rsp, struct net_device *dev) { - struct nlmsghdr *nlh = (void *)rsp->data; struct genlmsghdr *genl_hdr; + struct nlmsghdr *nlh; + struct sk_buff *ntf; + ntf = skb_copy_expand(rsp, 0, 0, GFP_KERNEL); + if (!ntf) + return; + + nlh = nlmsg_hdr(ntf); /* Convert the reply into a notification */ nlh->nlmsg_pid = 0; nlh->nlmsg_seq = ethnl_bcast_seq_next(); @@ -993,7 +999,7 @@ ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev) genl_hdr = nlmsg_data(nlh); genl_hdr->cmd = ETHTOOL_MSG_RSS_CREATE_NTF; - ethnl_multicast(rsp, dev); + ethnl_multicast(ntf, dev); } int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info) @@ -1104,12 +1110,8 @@ int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info) genlmsg_end(rsp, hdr); - /* Use the same skb for the response and the notification, - * genlmsg_reply() will copy the skb if it has elevated user count. - */ - skb_get(rsp); - ret = genlmsg_reply(rsp, info); ethnl_rss_create_send_ntf(rsp, dev); + ret = genlmsg_reply(rsp, info); rsp = NULL; exit_unlock: From 3e6c6e9782ff8a8d8ded774b07ad4590cd61d04c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:06:43 -0700 Subject: [PATCH 37/94] ethtool: rss: add missing errno on RSS context delete Remember to set ret before jumping out if someone tries to delete a context on a device which doesn't support contexts. Fixes: fbe09277fa63 ("ethtool: rss: support removing contexts via Netlink") Link: https://patch.msgid.link/20260522230647.1705600-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/rss.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 8ffec9785efa..a16ee1e8e640 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -1170,8 +1170,10 @@ int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info) dev = req.dev; ops = dev->ethtool_ops; - if (!ops->create_rxfh_context) + if (!ops->create_rxfh_context) { + ret = -EOPNOTSUPP; goto exit_free_dev; + } rtnl_lock(); netdev_lock_ops(dev); From 8d60141a32875248ef71d49c9920fa5e2aa40b29 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:06:44 -0700 Subject: [PATCH 38/94] ethtool: rss: fix falsely ignoring indir table updates rss_set_prep_indir() compares the new indirection table against the current one to determine whether any update is needed. The memcmp call passes data->indir_size as the length argument, but indir_size is the number of u32 entries, not the byte count. Fixes: c0ae03588bbb ("ethtool: rss: initial RSS_SET (indirection table handling)") Link: https://patch.msgid.link/20260522230647.1705600-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/rss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index a16ee1e8e640..458a4a7907e4 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -686,7 +686,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, ethtool_rxfh_indir_default(i, num_rx_rings); } - *mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size); + *mod |= memcmp(rxfh->indir, data->indir_table, alloc_size); return user_size; From 266297692f97008ca48bc311775c087c59bd7fe3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:06:45 -0700 Subject: [PATCH 39/94] ethtool: rss: fix indir_table and hkey leak on get_rxfh failure rss_prepare_get() allocates the indirection table and hash key buffer via rss_get_data_alloc(), then calls ops->get_rxfh() to populate them. If get_rxfh() fails, the function returns an error without freeing the allocation. Fixes: 4f038a6a02d2 ("net: ethtool: Don't call .cleanup_data when prepare_data fails") Link: https://patch.msgid.link/20260522230647.1705600-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/rss.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 458a4a7907e4..9fb675d29232 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -170,8 +170,10 @@ rss_prepare_get(const struct rss_req_info *request, struct net_device *dev, rxfh.key = data->hkey; ret = ops->get_rxfh(dev, &rxfh); - if (ret) + if (ret) { + rss_get_data_free(data); goto out_unlock; + } data->hfunc = rxfh.hfunc; data->input_xfrm = rxfh.input_xfrm; From 78ccf1a70c6378e1f5073a8c2209b5129067b925 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:06:46 -0700 Subject: [PATCH 40/94] ethtool: rss: fix hkey leak when indir_size is 0 rss_get_data_alloc() allocates a single buffer that backs both the indirection table and the hash key, but only assigned data->indir_table when indir_size was nonzero. The expectation was that no driver implements RSS without supporting indirection table but apparently enic does just that (it's the only such in-tree driver). enic has get_rxfh_key_size but no get_rxfh_indir_size. data->indir_table stays as NULL, hkey gets set but rss_get_data_free() kfree(data->indir_table) is a nop and the allocation leaks. Always store the allocation base in data->indir_table so the free path is unambiguous. No caller treats indir_table as a sentinel; everything keys off indir_size. Fixes: 7112a04664bf ("ethtool: add netlink based get rss support") Link: https://patch.msgid.link/20260522230647.1705600-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/rss.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 9fb675d29232..f5cf214f8f85 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -134,8 +134,7 @@ rss_get_data_alloc(struct net_device *dev, struct rss_reply_data *data) if (!rss_config) return -ENOMEM; - if (data->indir_size) - data->indir_table = (u32 *)rss_config; + data->indir_table = (u32 *)rss_config; if (data->hkey_size) data->hkey = rss_config + indir_bytes; From 32a9ecde62731c9f7412507709192c84dafc38d1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:06:47 -0700 Subject: [PATCH 41/94] ethtool: rss: avoid device context leak on reply-build failure We wait with filling the reply for new RSS context creation until after the driver ->create_rxfh_context call. The driver needs to fill some of the defaults in the context. The failure of rss_fill_reply() is somewhat theoretical, but doesn't take much effort to handle it properly. Call ->remove_rxfh_context(). If the driver's remove callback fails (some implementations like sfc can return real command errors from firmware RPCs) - skip the xa_erase and kfree, leaving the context in the xarray. This matches how ethnl_rss_delete_doit() behaves. Fixes: a166ab7816c5 ("ethtool: rss: support creating contexts via Netlink") Link: https://patch.msgid.link/20260522230647.1705600-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/rss.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index f5cf214f8f85..53792f53f922 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -1106,7 +1106,7 @@ int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info) ntf_fail |= rss_fill_reply(rsp, &req.base, &data.base); if (WARN_ON(!hdr || ntf_fail)) { ret = -EMSGSIZE; - goto exit_unlock; + goto err_remove_ctx; } genlmsg_end(rsp, hdr); @@ -1134,6 +1134,10 @@ int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info) nlmsg_free(rsp); return ret; +err_remove_ctx: + if (ops->remove_rxfh_context(dev, ctx, req.rss_context, NULL)) + /* leave the context on failure, like ethnl_rss_delete_doit() */ + goto exit_unlock; err_ctx_id_free: xa_erase(&dev->ethtool->rss_ctx, req.rss_context); err_unlock_free_ctx: From 84371fb58423f997939aacdcbc02d128d76a54e5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:13:04 -0700 Subject: [PATCH 42/94] ethtool: module: call ethnl_ops_complete() on module flash errors When validate() fails we are skipping over ethnl_ops_complete() even tho we already called ethnl_ops_begin(). Fixes: 32b4c8b53ee7 ("ethtool: Add ability to flash transceiver modules' firmware") Reviewed-by: Maxime Chevallier Reviewed-by: Danielle Ratson Link: https://patch.msgid.link/20260522231312.1710836-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/module.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ethtool/module.c b/net/ethtool/module.c index cad2eb25b5a4..741f6fb25d45 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -427,10 +427,11 @@ int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info) ret = ethnl_module_fw_flash_validate(dev, info->extack); if (ret < 0) - goto out_unlock; + goto out_complete; ret = module_flash_fw(dev, tb, skb, info); +out_complete: ethnl_ops_complete(dev); out_unlock: From fb7f511d62692661846c47f199e0afe25c2982db Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:13:05 -0700 Subject: [PATCH 43/94] ethtool: module: avoid leaking a netdev ref on module flash errors module_flash_fw_schedule() is missing undo for setting the "in_progress" flag and taking the netdev reference. Delay taking these, the device can't disappear while we are holding rtnl_lock. Fixes: 32b4c8b53ee7 ("ethtool: Add ability to flash transceiver modules' firmware") Reviewed-by: Maxime Chevallier Reviewed-by: Danielle Ratson Link: https://patch.msgid.link/20260522231312.1710836-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/module.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ethtool/module.c b/net/ethtool/module.c index 741f6fb25d45..392c03935e5e 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -319,8 +319,6 @@ module_flash_fw_schedule(struct net_device *dev, const char *file_name, if (err < 0) goto err_release_firmware; - dev->ethtool->module_fw_flash_in_progress = true; - netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL); fw_update->dev = dev; fw_update->ntf_params.portid = info->snd_portid; fw_update->ntf_params.seq = info->snd_seq; @@ -335,6 +333,9 @@ module_flash_fw_schedule(struct net_device *dev, const char *file_name, if (err < 0) goto err_release_firmware; + dev->ethtool->module_fw_flash_in_progress = true; + netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL); + schedule_work(&module_fw->work); return 0; From 7a84b965ffc12030af63cd10a8f3a1123ff39b7a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:13:06 -0700 Subject: [PATCH 44/94] ethtool: module: avoid racy updates to dev->ethtool bitfield When reviewing other changes Gemini points out that we currently update module_fw_flash_in_progress without holding any locks. Since module_fw_flash_in_progress is part of a bitfield this is not great, updates to other fields may be lost. We could use a bool and sprinkle some READ_ONCE/WRITE_ONCE here but seems like the issue is rather than the work is an unusual writer. The other writers already hold the right locks. So just very briefly take these locks when the work completes. Note that nothing ever cancels the FW update work, so there's no concern with deadlocks vs cancel. Fixes: 32b4c8b53ee7 ("ethtool: Add ability to flash transceiver modules' firmware") Reviewed-by: Maxime Chevallier Reviewed-by: Danielle Ratson Link: https://patch.msgid.link/20260522231312.1710836-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/module.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/ethtool/module.c b/net/ethtool/module.c index 392c03935e5e..cdb85e19a23b 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -221,14 +221,22 @@ static void module_flash_fw_work_list_del(struct list_head *list) static void module_flash_fw_work(struct work_struct *work) { struct ethtool_module_fw_flash *module_fw; + struct net_device *dev; module_fw = container_of(work, struct ethtool_module_fw_flash, work); + dev = module_fw->fw_update.dev; ethtool_cmis_fw_update(&module_fw->fw_update); module_flash_fw_work_list_del(&module_fw->list); - module_fw->fw_update.dev->ethtool->module_fw_flash_in_progress = false; - netdev_put(module_fw->fw_update.dev, &module_fw->dev_tracker); + + rtnl_lock(); + netdev_lock_ops(dev); + dev->ethtool->module_fw_flash_in_progress = false; + netdev_unlock_ops(dev); + rtnl_unlock(); + + netdev_put(dev, &module_fw->dev_tracker); release_firmware(module_fw->fw_update.fw); kfree(module_fw); } From 504eaefa44c8dec50f7499edcb36d24f3aefab2a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:13:07 -0700 Subject: [PATCH 45/94] ethtool: module: check fw_flash_in_progress under rtnl_lock ethnl_set_module_validate() inspects module_fw_flash_in_progress but validate is meant for _input_ validation, not state validation. rtnl_lock is not held, yet. Move the check into ethnl_set_module(). Fixes: 32b4c8b53ee7 ("ethtool: Add ability to flash transceiver modules' firmware") Reviewed-by: Maxime Chevallier Reviewed-by: Danielle Ratson Link: https://patch.msgid.link/20260522231312.1710836-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/module.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ethtool/module.c b/net/ethtool/module.c index cdb85e19a23b..5b49004ddf60 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -120,12 +120,6 @@ ethnl_set_module_validate(struct ethnl_req_info *req_info, if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) return 0; - if (req_info->dev->ethtool->module_fw_flash_in_progress) { - NL_SET_ERR_MSG(info->extack, - "Module firmware flashing is in progress"); - return -EBUSY; - } - if (!ops->get_module_power_mode || !ops->set_module_power_mode) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], @@ -148,6 +142,12 @@ ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info) ops = dev->ethtool_ops; + if (dev->ethtool->module_fw_flash_in_progress) { + NL_SET_ERR_MSG(info->extack, + "Module firmware flashing is in progress"); + return -EBUSY; + } + power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); ret = ops->get_module_power_mode(dev, &power, info->extack); if (ret < 0) From 760d04ebad5c4304f22c0d2251c9623b87a117c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:13:08 -0700 Subject: [PATCH 46/94] ethtool: module: fix cleanup if socket used for flashing multiple devices When a single Netlink socket issues MODULE_FW_FLASH_ACT against multiple devices, ethnl_sock_priv_set() overwrites sk_priv->dev on each call, retaining only the last one. The socket priv is used on socket close, to walk the global work list and mark the uncompleted flashing work as "orphaned". Otherwise if another socket reuses the PID it will unexpectedly receive the flashing notifications. Don't record the device, record net pointer instead. The purpose of the dev is to scope the work to a netns, anyway. If we store netns the overrides are safe/a nop since all flashed devices must be in the same netns as the socket. Fixes: 32b4c8b53ee7 ("ethtool: Add ability to flash transceiver modules' firmware") Reviewed-by: Danielle Ratson Link: https://patch.msgid.link/20260522231312.1710836-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/module.c | 9 ++++----- net/ethtool/netlink.c | 4 ++-- net/ethtool/netlink.h | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/net/ethtool/module.c b/net/ethtool/module.c index 5b49004ddf60..ea4fb2a76650 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -291,11 +291,9 @@ void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv) spin_lock(&module_fw_flash_work_list_lock); list_for_each_entry(work, &module_fw_flash_work_list, list) { - if (work->fw_update.dev == sk_priv->dev && - work->fw_update.ntf_params.portid == sk_priv->portid) { + if (work->fw_update.ntf_params.portid == sk_priv->portid && + dev_net(work->fw_update.dev) == sk_priv->net) work->fw_update.ntf_params.closed_sock = true; - break; - } } spin_unlock(&module_fw_flash_work_list_lock); } @@ -332,7 +330,8 @@ module_flash_fw_schedule(struct net_device *dev, const char *file_name, fw_update->ntf_params.seq = info->snd_seq; fw_update->ntf_params.closed_sock = false; - err = ethnl_sock_priv_set(skb, dev, fw_update->ntf_params.portid, + err = ethnl_sock_priv_set(skb, dev_net(dev), + fw_update->ntf_params.portid, ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH); if (err < 0) goto err_release_firmware; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 5046023a30b1..7d45f9a884e5 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -53,7 +53,7 @@ const struct nla_policy ethnl_header_policy_phy_stats[] = { [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; -int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, +int ethnl_sock_priv_set(struct sk_buff *skb, struct net *net, u32 portid, enum ethnl_sock_type type) { struct ethnl_sock_priv *sk_priv; @@ -62,7 +62,7 @@ int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, if (IS_ERR(sk_priv)) return PTR_ERR(sk_priv); - sk_priv->dev = dev; + sk_priv->net = net; sk_priv->portid = portid; sk_priv->type = type; diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index aaf6f2468768..fd2198e45d2b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -318,12 +318,12 @@ enum ethnl_sock_type { }; struct ethnl_sock_priv { - struct net_device *dev; + struct net *net; u32 portid; enum ethnl_sock_type type; }; -int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, +int ethnl_sock_priv_set(struct sk_buff *skb, struct net *net, u32 portid, enum ethnl_sock_type type); /** From 6c3f999a9d1338c6c89a9ff4549eafe72bc2e7b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:13:09 -0700 Subject: [PATCH 47/94] ethtool: cmis: require exact CDB reply length Malicious SFP module could respond with rpl_len longer than what cmis_cdb_process_reply() expected, leading to OOB writes. Malicious HW is a bit theoretical but some modules may just be buggy and/or the reads may occasionally get corrupted, so let's protect the kernel. The existing check protects from short replies. We need to protect from long ones, too. All callers that pass a non-zero rpl_exp_len cast the reply payload to a fixed-layout struct and read fields at fixed offsets, with no version negotiation or short-reply handling: - cmis_cdb_validate_password() - cmis_cdb_module_features_get() - cmis_fw_update_fw_mng_features_get() so let's assume that responses longer than expected do not have to be handled gracefully here. Add a warning message to make the debug easier in case my understanding is wrong... Note that page_data->length (argument of kmalloc) comes from last arg to ethtool_cmis_page_init() which is rpl_exp_len. Note2 that AIs also like to point out overflows in args->req.payload itself (which is a fixed-size 120 B buffer, on the stack), but callers should be reading structs defined by the standard, so protecting from requests for more data than max seem like defensive programming. Fixes: a39c84d79625 ("ethtool: cmis_cdb: Add a layer for supporting CDB commands") Reviewed-by: Danielle Ratson Link: https://patch.msgid.link/20260522231312.1710836-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/cmis_cdb.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c index 3670ca42dd40..f3a53a984460 100644 --- a/net/ethtool/cmis_cdb.c +++ b/net/ethtool/cmis_cdb.c @@ -513,8 +513,13 @@ static int cmis_cdb_process_reply(struct net_device *dev, } rpl = (struct ethtool_cmis_cdb_rpl *)page_data->data; - if ((args->rpl_exp_len > rpl->hdr.rpl_len + rpl_hdr_len) || - !rpl->hdr.rpl_chk_code) { + if (rpl->hdr.rpl_len != args->rpl_exp_len) { + netdev_warn(dev, "CDB reply length mismatch, expected %u got %u\n", + args->rpl_exp_len, rpl->hdr.rpl_len); + err = -EIO; + goto out; + } + if (!rpl->hdr.rpl_chk_code) { err = -EIO; goto out; } From 3e8c3d464c36bb342fe377b026577c7ec27fdbb4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:13:10 -0700 Subject: [PATCH 48/94] ethtool: cmis: fix u16-to-u8 truncation of msleep_pre_rpl ethtool_cmis_cdb_compose_args() accepts msleep_pre_rpl as u16 but stores it into the u8 field ethtool_cmis_cdb_cmd_args::msleep_pre_rpl, silently truncating values >= 256. Seven of the nine call sites pass 1000 ms (it's the third argument from the end). Fixes: a39c84d79625 ("ethtool: cmis_cdb: Add a layer for supporting CDB commands") Reviewed-by: Maxime Chevallier Reviewed-by: Danielle Ratson Link: https://patch.msgid.link/20260522231312.1710836-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/cmis.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h index 4a9a946cabf0..778783a0f23c 100644 --- a/net/ethtool/cmis.h +++ b/net/ethtool/cmis.h @@ -63,9 +63,9 @@ struct ethtool_cmis_cdb_request { * struct ethtool_cmis_cdb_cmd_args - CDB commands execution arguments * @req: CDB command fields as described in the CMIS standard. * @max_duration: Maximum duration time for command completion in msec. + * @msleep_pre_rpl: Waiting time before checking reply in msec. * @read_write_len_ext: Allowable additional number of byte octets to the LPL * in a READ or a WRITE commands. - * @msleep_pre_rpl: Waiting time before checking reply in msec. * @rpl_exp_len: Expected reply length in bytes. * @flags: Validation flags for CDB commands. * @err_msg: Error message to be sent to user space. @@ -73,8 +73,8 @@ struct ethtool_cmis_cdb_request { struct ethtool_cmis_cdb_cmd_args { struct ethtool_cmis_cdb_request req; u16 max_duration; + u16 msleep_pre_rpl; u8 read_write_len_ext; - u8 msleep_pre_rpl; u8 rpl_exp_len; u8 flags; char *err_msg; From 12c2496a71f82f63617971ca9b730dffa05cf58b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:13:11 -0700 Subject: [PATCH 49/94] ethtool: cmis: validate start_cmd_payload_size from module The CMIS firmware update code reads start_cmd_payload_size from the module's FW Management Features CDB reply and uses it directly as the byte count for memcpy. The destination buffer is 112 bytes (ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH - 8). So a malicious module (or corrupted response) can cause a OOB write later on in cmis_fw_update_start_download(). Let's error out. If modules that expect longer LPL writes actually exist we should revisit. struct cmis_cdb_start_fw_download_pl's definition has to move, no change there. Fixes: c4f78134d45c ("ethtool: cmis_fw_update: add a layer for supporting firmware update using CDB") Reviewed-by: Maxime Chevallier Reviewed-by: Danielle Ratson Link: https://patch.msgid.link/20260522231312.1710836-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/cmis_fw_update.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c index df5f344209c4..16190c97e1f7 100644 --- a/net/ethtool/cmis_fw_update.c +++ b/net/ethtool/cmis_fw_update.c @@ -44,6 +44,20 @@ enum cmis_cdb_fw_write_mechanism { CMIS_CDB_FW_WRITE_MECHANISM_BOTH = 0x11, }; +/* See section 9.7.2 "CMD 0101h: Start Firmware Download" in CMIS standard + * revision 5.2. + * struct cmis_cdb_start_fw_download_pl is a structured layout of the + * flat array, ethtool_cmis_cdb_request::payload. + */ +struct cmis_cdb_start_fw_download_pl { + __struct_group(cmis_cdb_start_fw_download_pl_h, head, /* no attrs */, + __be32 image_size; + __be32 resv1; + ); + u8 vendor_data[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH - + sizeof(struct cmis_cdb_start_fw_download_pl_h)]; +}; + static int cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, struct net_device *dev, @@ -86,6 +100,14 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, */ cdb->read_write_len_ext = rpl->read_write_len_ext; fw_mng->start_cmd_payload_size = rpl->start_cmd_payload_size; + if (fw_mng->start_cmd_payload_size > + sizeof_field(struct cmis_cdb_start_fw_download_pl, vendor_data)) { + ethnl_module_fw_flash_ntf_err(dev, ntf_params, + "Start cmd payload size exceeds max LPL payload", + NULL); + return -EINVAL; + } + fw_mng->write_mechanism = rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL ? CMIS_CDB_FW_WRITE_MECHANISM_LPL : @@ -97,20 +119,6 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, return 0; } -/* See section 9.7.2 "CMD 0101h: Start Firmware Download" in CMIS standard - * revision 5.2. - * struct cmis_cdb_start_fw_download_pl is a structured layout of the - * flat array, ethtool_cmis_cdb_request::payload. - */ -struct cmis_cdb_start_fw_download_pl { - __struct_group(cmis_cdb_start_fw_download_pl_h, head, /* no attrs */, - __be32 image_size; - __be32 resv1; - ); - u8 vendor_data[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH - - sizeof(struct cmis_cdb_start_fw_download_pl_h)]; -}; - static int cmis_fw_update_start_download(struct ethtool_cmis_cdb *cdb, struct ethtool_cmis_fw_update_params *fw_update, From d5551f4c1800dc714cec86647bdd651ae0de923e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 22 May 2026 16:13:12 -0700 Subject: [PATCH 50/94] ethtool: cmis: validate fw->size against start_cmd_payload_size cmis_fw_update_start_download() copies start_cmd_payload_size bytes from the firmware blob into the CDB LPL vendor_data[] payload without validating that the FW has enough data. Since the start_cmd_payload_size can only be ~120B an image too short is most likely corrupted, so reject it. Fixes: c4f78134d45c ("ethtool: cmis_fw_update: add a layer for supporting firmware update using CDB") Reviewed-by: Maxime Chevallier Reviewed-by: Danielle Ratson Link: https://patch.msgid.link/20260522231312.1710836-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/cmis_fw_update.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c index 16190c97e1f7..291d04d2776a 100644 --- a/net/ethtool/cmis_fw_update.c +++ b/net/ethtool/cmis_fw_update.c @@ -130,6 +130,14 @@ cmis_fw_update_start_download(struct ethtool_cmis_cdb *cdb, u8 lpl_len; int err; + if (fw_update->fw->size < vendor_data_size) { + ethnl_module_fw_flash_ntf_err(fw_update->dev, + &fw_update->ntf_params, + "Firmware image too small for module's start payload", + NULL); + return -EINVAL; + } + pl.image_size = cpu_to_be32(fw_update->fw->size); memcpy(pl.vendor_data, fw_update->fw->data, vendor_data_size); From 05f95729ca844704d15e49ce14868af4b403b32b Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Fri, 22 May 2026 22:34:23 -0400 Subject: [PATCH 51/94] l2tp: use refcount_inc_not_zero in l2tp_session_get_by_ifname A reader in l2tp_session_get_by_ifname() can return a pointer to a session whose refcount has reached zero. The getter takes its reference with plain refcount_inc(), but every other session getter in the same file (l2tp_v2_session_get, l2tp_v3_session_get, and the corresponding _get_next variants) uses refcount_inc_not_zero() because the IDR/RCU lookup can race with refcount_dec_and_test() -> l2tp_session_free() -> kfree_rcu(). The ifname getter is the only outlier; the inconsistency was raised on-list after 979c017803c4 ("l2tp: use list_del_rcu in l2tp_session_unhash"). A reader inside rcu_read_lock_bh() that matches session->ifname can be preempted between the strcmp() and the refcount_inc(). If the last reference drops on another CPU in that window, the reader's refcount_inc() runs on a counter that has reached zero. refcount_t catches the addition-on-zero, prints "refcount_t: addition on 0; use-after-free", saturates the counter, and returns the saturated pointer to the caller. Session memory is held live by the in-flight RCU read section, but the kfree_rcu() callback queued from l2tp_session_free() will free it once the grace period closes; a caller that dereferences the returned session past that point hits a slab-use-after-free. On PREEMPT_RT local_bh_disable() is a per-CPU sleeping lock and the preemption window is real; on stock PREEMPT kernels local_bh_disable() is a preempt_count increment that closes the cross-CPU race in practice (see below). Use refcount_inc_not_zero() and continue the list walk on failure, matching the other session getters in the file. The ifname getter is the only session getter in net/l2tp/ that still uses the bare refcount_inc() pattern; this change restores file-internal consistency. The success path is unchanged. Fixes: abe7a1a7d0b6 ("l2tp: improve tunnel/session refcount helpers") Cc: stable@vger.kernel.org Signed-off-by: Michael Bommarito Reviewed-by: James Chapman Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260523023423.2568972-1-michael.bommarito@gmail.com Signed-off-by: Jakub Kicinski --- net/l2tp/l2tp_core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 1455f67e01dd..9419c8555d22 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -441,12 +441,13 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { if (tunnel) { list_for_each_entry_rcu(session, &tunnel->session_list, list) { - if (!strcmp(session->ifname, ifname)) { - refcount_inc(&session->ref_count); - rcu_read_unlock_bh(); + if (strcmp(session->ifname, ifname)) + continue; + if (!refcount_inc_not_zero(&session->ref_count)) + continue; + rcu_read_unlock_bh(); - return session; - } + return session; } } } From d895767c337814cf4b97d5ad5375e5ed7e12018d Mon Sep 17 00:00:00 2001 From: "Lucien.Jheng" Date: Sun, 24 May 2026 14:39:15 +0800 Subject: [PATCH 52/94] net: phy: air_en8811h: add AN8811HB MCU assert/deassert support AN8811HB needs a MCU soft-reset cycle before firmware loading begins. Assert the MCU (hold it in reset) and immediately deassert (release) via a dedicated PBUS register pair (0x5cf9f8 / 0x5cf9fc), accessed through a registered mdio_device at PHY-addr+8. Add __air_pbus_reg_write() as a low-level helper taking a struct mdio_device *, create and register the PBUS mdio_device in an8811hb_probe() and store it in priv->pbusdev, then implement an8811hb_mcu_assert() / _deassert() on top of it. Add an8811hb_remove() to unregister the PBUS device on teardown. Wire both calls into an8811hb_load_firmware() and en8811h_restart_mcu() so every firmware load or MCU restart on AN8811HB correctly sequences the reset control registers. Fixes: 5afda1d734ed ("net: phy: air_en8811h: add Airoha AN8811HB support") Signed-off-by: Lucien Jheng Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20260524063915.47961-1-lucienzx159@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/air_en8811h.c | 153 +++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/air_en8811h.c b/drivers/net/phy/air_en8811h.c index 29ae73e65caa..a86129ce693c 100644 --- a/drivers/net/phy/air_en8811h.c +++ b/drivers/net/phy/air_en8811h.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -170,9 +171,23 @@ #define AN8811HB_CLK_DRV_CKO_LDPWD BIT(13) #define AN8811HB_CLK_DRV_CKO_LPPWD BIT(14) +#define AN8811HB_MCU_SW_RST 0x5cf9f8 +#define AN8811HB_MCU_SW_RST_HOLD BIT(16) +#define AN8811HB_MCU_SW_RST_RUN (BIT(16) | BIT(0)) +#define AN8811HB_MCU_SW_START 0x5cf9fc +#define AN8811HB_MCU_SW_START_EN BIT(16) + +/* MII register constants for PBUS access (PHY addr + 8) */ +#define AIR_PBUS_ADDR_HIGH 0x1c +#define AIR_PBUS_DATA_HIGH 0x10 +#define AIR_PBUS_REG_ADDR_HIGH_MASK GENMASK(15, 6) +#define AIR_PBUS_REG_ADDR_LOW_MASK GENMASK(5, 2) + /* Led definitions */ #define EN8811H_LED_COUNT 3 +#define EN8811H_PBUS_ADDR_OFFS 8 + /* Default LED setup: * GPIO5 <-> LED0 On: Link detected, blink Rx/Tx * GPIO4 <-> LED1 On: Link detected at 2500 or 1000 Mbps @@ -201,6 +216,7 @@ struct en8811h_priv { struct clk_hw hw; struct phy_device *phydev; unsigned int cko_is_enabled; + struct mdio_device *pbusdev; }; enum { @@ -254,6 +270,31 @@ static int air_phy_write_page(struct phy_device *phydev, int page) return __phy_write(phydev, AIR_EXT_PAGE_ACCESS, page); } +static int __air_pbus_reg_write(struct mdio_device *mdiodev, + u32 pbus_reg, u32 pbus_data) +{ + int ret; + + ret = __mdiobus_write(mdiodev->bus, mdiodev->addr, AIR_EXT_PAGE_ACCESS, + upper_16_bits(pbus_reg)); + if (ret < 0) + return ret; + + ret = __mdiobus_write(mdiodev->bus, mdiodev->addr, AIR_PBUS_ADDR_HIGH, + FIELD_GET(AIR_PBUS_REG_ADDR_HIGH_MASK, pbus_reg)); + if (ret < 0) + return ret; + + ret = __mdiobus_write(mdiodev->bus, mdiodev->addr, + FIELD_GET(AIR_PBUS_REG_ADDR_LOW_MASK, pbus_reg), + lower_16_bits(pbus_data)); + if (ret < 0) + return ret; + + return __mdiobus_write(mdiodev->bus, mdiodev->addr, AIR_PBUS_DATA_HIGH, + upper_16_bits(pbus_data)); +} + static int __air_buckpbus_reg_write(struct phy_device *phydev, u32 pbus_address, u32 pbus_data) { @@ -570,10 +611,67 @@ static int an8811hb_load_file(struct phy_device *phydev, const char *name, return ret; } +static int an8811hb_mcu_assert(struct phy_device *phydev) +{ + struct en8811h_priv *priv = phydev->priv; + int ret; + + phy_lock_mdio_bus(phydev); + + ret = __air_pbus_reg_write(priv->pbusdev, AN8811HB_MCU_SW_RST, + AN8811HB_MCU_SW_RST_HOLD); + if (ret < 0) + goto unlock; + + ret = __air_pbus_reg_write(priv->pbusdev, AN8811HB_MCU_SW_START, 0); + if (ret < 0) + goto unlock; + + msleep(50); + phydev_dbg(phydev, "MCU asserted\n"); + +unlock: + phy_unlock_mdio_bus(phydev); + return ret; +} + +static int an8811hb_mcu_deassert(struct phy_device *phydev) +{ + struct en8811h_priv *priv = phydev->priv; + int ret; + + phy_lock_mdio_bus(phydev); + + ret = __air_pbus_reg_write(priv->pbusdev, AN8811HB_MCU_SW_START, + AN8811HB_MCU_SW_START_EN); + if (ret < 0) + goto unlock; + + ret = __air_pbus_reg_write(priv->pbusdev, AN8811HB_MCU_SW_RST, + AN8811HB_MCU_SW_RST_RUN); + if (ret < 0) + goto unlock; + + msleep(50); + phydev_dbg(phydev, "MCU deasserted\n"); + +unlock: + phy_unlock_mdio_bus(phydev); + return ret; +} + static int an8811hb_load_firmware(struct phy_device *phydev) { int ret; + ret = an8811hb_mcu_assert(phydev); + if (ret < 0) + return ret; + + ret = an8811hb_mcu_deassert(phydev); + if (ret < 0) + return ret; + ret = air_buckpbus_reg_write(phydev, EN8811H_FW_CTRL_1, EN8811H_FW_CTRL_1_START); if (ret < 0) @@ -662,6 +760,16 @@ static int en8811h_restart_mcu(struct phy_device *phydev) { int ret; + if (phy_id_compare_model(phydev->phy_id, AN8811HB_PHY_ID)) { + ret = an8811hb_mcu_assert(phydev); + if (ret < 0) + return ret; + + ret = an8811hb_mcu_deassert(phydev); + if (ret < 0) + return ret; + } + ret = air_buckpbus_reg_write(phydev, EN8811H_FW_CTRL_1, EN8811H_FW_CTRL_1_START); if (ret < 0) @@ -1166,6 +1274,7 @@ static int en8811h_leds_setup(struct phy_device *phydev) static int an8811hb_probe(struct phy_device *phydev) { + struct mdio_device *mdiodev; struct en8811h_priv *priv; int ret; @@ -1175,10 +1284,28 @@ static int an8811hb_probe(struct phy_device *phydev) return -ENOMEM; phydev->priv = priv; + /* + * The AN8811HB PHY address is restricted to 8-15 (decimal), + * depending on the board hardware strapping. + * This means the PBUS address is only in the range 16-21 (decimal), + * so we do not need to handle the case + * where the PBUS address exceeds 31 (decimal). + */ + mdiodev = mdio_device_create(phydev->mdio.bus, + phydev->mdio.addr + EN8811H_PBUS_ADDR_OFFS); + if (IS_ERR(mdiodev)) + return PTR_ERR(mdiodev); + + ret = mdio_device_register(mdiodev); + if (ret) + goto err_dev_free; + + priv->pbusdev = mdiodev; + ret = an8811hb_load_firmware(phydev); if (ret < 0) { phydev_err(phydev, "Load firmware failed: %d\n", ret); - return ret; + goto err_dev_create; } en8811h_print_fw_version(phydev); @@ -1191,22 +1318,29 @@ static int an8811hb_probe(struct phy_device *phydev) ret = en8811h_leds_setup(phydev); if (ret < 0) - return ret; + goto err_dev_create; priv->phydev = phydev; /* Co-Clock Output */ ret = an8811hb_clk_provider_setup(&phydev->mdio.dev, &priv->hw); if (ret) - return ret; + goto err_dev_create; /* Configure led gpio pins as output */ ret = air_buckpbus_reg_modify(phydev, AN8811HB_GPIO_OUTPUT, AN8811HB_GPIO_OUTPUT_345, AN8811HB_GPIO_OUTPUT_345); if (ret < 0) - return ret; + goto err_dev_create; return 0; + +err_dev_create: + mdio_device_remove(mdiodev); + +err_dev_free: + mdio_device_free(mdiodev); + return ret; } static int en8811h_probe(struct phy_device *phydev) @@ -1561,6 +1695,16 @@ static int en8811h_suspend(struct phy_device *phydev) return genphy_suspend(phydev); } +static void an8811hb_remove(struct phy_device *phydev) +{ + struct en8811h_priv *priv = phydev->priv; + + if (priv->pbusdev) { + mdio_device_remove(priv->pbusdev); + mdio_device_free(priv->pbusdev); + } +} + static struct phy_driver en8811h_driver[] = { { PHY_ID_MATCH_MODEL(EN8811H_PHY_ID), @@ -1587,6 +1731,7 @@ static struct phy_driver en8811h_driver[] = { PHY_ID_MATCH_MODEL(AN8811HB_PHY_ID), .name = "Airoha AN8811HB", .probe = an8811hb_probe, + .remove = an8811hb_remove, .get_features = en8811h_get_features, .config_init = an8811hb_config_init, .get_rate_matching = en8811h_get_rate_matching, From b4bc94353050b1fa7b702bd4c6600710dd926cff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2026 20:13:35 +0000 Subject: [PATCH 53/94] tunnels: load network headers after skb_cow() in iptunnel_pmtud_build_icmp[v6]() Sashiko found that iptunnel_pmtud_build_icmp() and iptunnel_pmtud_build_icmpv6() were caching ip_hdr() and ipv6_hdr() before an skb_cow() call which can reallocate skb->head. Fix this possible UAF by initializing the local variables after the skb_cow() call. Remove skb_reset_network_header() calls which were not needed. Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Signed-off-by: Eric Dumazet Reviewed-by: Stefano Brivio Link: https://patch.msgid.link/20260525201335.2361845-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_tunnel_core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 2667f53482bd..c77a4c3fbe75 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -212,7 +212,7 @@ EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); */ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) { - const struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph; struct icmphdr *icmph; struct iphdr *niph; struct ethhdr eh; @@ -226,7 +226,6 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); - skb_reset_network_header(skb); err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph)); if (err) @@ -236,7 +235,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN); if (err) return err; - + iph = ip_hdr(skb); icmph = skb_push(skb, sizeof(*icmph)); *icmph = (struct icmphdr) { .type = ICMP_DEST_UNREACH, @@ -308,7 +307,7 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) */ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) { - const struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct ipv6hdr *ip6h; struct icmp6hdr *icmp6h; struct ipv6hdr *nip6h; struct ethhdr eh; @@ -323,7 +322,6 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); - skb_reset_network_header(skb); err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h)); if (err) @@ -334,6 +332,7 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) if (err) return err; + ip6h = ipv6_hdr(skb); icmp6h = skb_push(skb, sizeof(*icmp6h)); *icmp6h = (struct icmp6hdr) { .icmp6_type = ICMPV6_PKT_TOOBIG, From 7d9ef0cb271555d8cf39fefe6c981e1493b25ecf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2026 20:36:42 +0000 Subject: [PATCH 54/94] vxlan: do not reuse cached ip_hdr() value after skb_tunnel_check_pmtu() skb_tunnel_check_pmtu() can change skb->head. Reusing old_iph afer skb_tunnel_check_pmtu() can cause an UAF. Use instead ip_hdr(skb) as done in drivers/net/bareudp.c and drivers/net/geneve.c. Found by Sashiko. Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Signed-off-by: Eric Dumazet Reviewed-by: Stefano Brivio Link: https://patch.msgid.link/20260525203642.2389723-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index e88798497503..b5b1253ac08b 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2531,7 +2531,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto out_unlock; } - tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + tos = ip_tunnel_ecn_encap(tos, ip_hdr(skb), skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), vni, md, flags, udp_sum); @@ -2605,7 +2605,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto out_unlock; } - tos = ip_tunnel_ecn_encap(tos, old_iph, skb); + tos = ip_tunnel_ecn_encap(tos, ip_hdr(skb), skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); skb_scrub_packet(skb, xnet); err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), From 509323077ef79a26ba0c60bb556e45c12c398b2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 May 2026 11:55:12 +0000 Subject: [PATCH 55/94] tunnels: do not assume transport header in iptunnel_pmtud_check_icmp() In some cases, iptunnel_pmtud_check_icmp() can be called while skb transport header is not set. This triggers an out-of-bound access, because (typeof(skb->transport_header))~0U is 65535. Access the icmp header based on IPv4 network header, after making sure icmp->type is present in skb linear part. Note that iptunnel_pmtud_check_icmpv6()) is fine. Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Reported-by: Damiano Melotti Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260522115512.1519110-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_tunnel_core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index c77a4c3fbe75..d3c677e9bff2 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -280,7 +280,6 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) */ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) { - const struct icmphdr *icmph = icmp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); if (mtu < 576 || iph->frag_off != htons(IP_DF)) @@ -291,9 +290,17 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr)) return 0; - if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type)) - return 0; + if (iph->protocol == IPPROTO_ICMP) { + const struct icmphdr *icmph; + if (!pskb_network_may_pull(skb, iph->ihl * 4 + + offsetofend(struct icmphdr, type))) + return 0; + iph = ip_hdr(skb); + icmph = (void *)iph + iph->ihl * 4; + if (icmp_is_err(icmph->type)) + return 0; + } return iptunnel_pmtud_build_icmp(skb, mtu); } From dd433671fef381fdaf7b530c631e6b782d66e224 Mon Sep 17 00:00:00 2001 From: Qi Tang Date: Sat, 23 May 2026 22:32:45 +0800 Subject: [PATCH 56/94] ipv6: validate extension header length before copying to cmsg ip6_datagram_recv_specific_ctl() builds IPV6_{HOPOPTS,DSTOPTS,RTHDR} cmsgs (and their IPV6_2292* legacy counterparts) by trusting the on-wire hdrlen byte (ptr[1]) when computing the put_cmsg() length. The length was validated only at parse time (ipv6_parse_hopopts(), etc.). An nftables payload-write expression can rewrite hdrlen after parsing and before the skb reaches recvmsg; the write itself is in-bounds but put_cmsg() then reads up to ((hdrlen+1) << 3) = 2040 bytes from an 8-byte header. nftables is reachable from an unprivileged user namespace, so this is an unprivileged slab-out-of-bounds read: BUG: KASAN: slab-out-of-bounds in put_cmsg+0x3ac/0x540 put_cmsg+0x3ac/0x540 udpv6_recvmsg+0xca0/0x1250 sock_recvmsg+0xdf/0x190 ____sys_recvmsg+0x1b1/0x620 Add ipv6_get_exthdr_len() which validates that at least two bytes are accessible before reading the hdrlen field, then checks the computed length against skb_tail_pointer(skb), returning 0 on failure. Extension headers are kept in the linear skb area by pskb_may_pull() during input, so skb_tail_pointer() is the correct bound. Use ipv6_get_exthdr_len() at all non-AH call sites: the five standalone cmsg blocks (HbH, 2292HbH, 2292DSTOPTS x2, 2292RTHDR) and the three standard cases in the extension-header walk loop (DSTOPTS, ROUTING, default). AH retains an inline bounds check because its length formula differs ((ptr[1]+2)<<2). The walk loop also gets a pre-read bounds check at the top to validate ptr before any case accesses ptr[0] or ptr[1]. When the walk loop detects a corrupted header, return from the function instead of continuing to process later socket options. Cc: stable@vger.kernel.org Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Qi Tang Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260523143245.2281415-1-tpluszz77@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/datagram.c | 54 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index ca3605acb433..38d7b4845281 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -617,6 +617,18 @@ void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, } } +static u16 ipv6_get_exthdr_len(const struct sk_buff *skb, const u8 *ptr) +{ + u16 len; + + if (ptr + 2 > skb_tail_pointer(skb)) + return 0; + + len = (ptr[1] + 1) << 3; + + return (len <= skb_tail_pointer(skb) - ptr) ? len : 0; +} + void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { @@ -643,7 +655,10 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, /* HbH is allowed only once */ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); - put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); + u16 len = ipv6_get_exthdr_len(skb, ptr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, len, ptr); } if (opt->lastopt && @@ -664,26 +679,37 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, unsigned int len; u8 *ptr = nh + off; + if (ptr + 2 > skb_tail_pointer(skb)) + return; + switch (nexthdr) { case IPPROTO_DSTOPTS: nexthdr = ptr[0]; - len = (ptr[1] + 1) << 3; + len = ipv6_get_exthdr_len(skb, ptr); + if (!len) + return; if (np->rxopt.bits.dstopts) put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr); break; case IPPROTO_ROUTING: nexthdr = ptr[0]; - len = (ptr[1] + 1) << 3; + len = ipv6_get_exthdr_len(skb, ptr); + if (!len) + return; if (np->rxopt.bits.srcrt) put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr); break; case IPPROTO_AH: nexthdr = ptr[0]; len = (ptr[1] + 2) << 2; + if (ptr + len > skb_tail_pointer(skb)) + return; break; default: nexthdr = ptr[0]; - len = (ptr[1] + 1) << 3; + len = ipv6_get_exthdr_len(skb, ptr); + if (!len) + return; break; } @@ -705,19 +731,31 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, } if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); - put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); + u16 len = ipv6_get_exthdr_len(skb, ptr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, len, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { u8 *ptr = nh + opt->dst0; - put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); + u16 len = ipv6_get_exthdr_len(skb, ptr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, len, ptr); } if (np->rxopt.bits.osrcrt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt); - put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); + u16 len = ipv6_get_exthdr_len(skb, (u8 *)rthdr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, len, rthdr); } if (np->rxopt.bits.odstopts && opt->dst1) { u8 *ptr = nh + opt->dst1; - put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); + u16 len = ipv6_get_exthdr_len(skb, ptr); + + if (len) + put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, len, ptr); } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; From 8ba68464e4787b6a7ec938826e16124df20fd23d Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 26 May 2026 21:33:19 +0200 Subject: [PATCH 57/94] bonding: refuse to enslave CAN devices syzbot reported a kernel paging request crash in can_rx_unregister() inside net/can/af_can.c. The crash occurs because a virtual CAN device (vxcan) is being enslaved to a bonding master. During the enslavement process, the bonding driver mutates and modifies the network device states to fit an Ethernet-like aggregation model. However, CAN devices operate on a completely different Layer 2 architecture, relying on the CAN mid-layer private data structure (can_ml_priv) instead of standard Ethernet structures. Since bonding does not initialize or maintain these CAN structures, subsequent operations on the half-enslaved interface (such as closing associated sockets via isotp_release) lead to a null-pointer dereference when accessing the CAN receiver lists. Bonding CAN interfaces is architecturally invalid as CAN lacks MAC addresses, ARP capabilities, and standard Ethernet link-layer mechanisms. While generic loopback devices are blocked globally in net/core/dev.c, virtual CAN devices bypass this check because they do not carry the IFF_LOOPBACK flag, despite acting as local software-loopbacks. Fix this by explicitly blocking network devices of type ARPHRD_CAN from being enslaved at the very beginning of bond_enslave(). This prevents illegal state mutations, eliminates the resulting KASAN crashes, and avoids potential memory leaks from incomplete socket cleanups. As the CAN support has been added a long time after bonding the Fixes-tag points to the introduction of ARPHRD_CAN that would have needed a specific handling in bonding_main.c. Fixes: cd05acfe65ed ("[CAN]: Allocate protocol numbers for PF_CAN") Reported-by: syzbot+8ed98cbd0161632bce95@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8ed98cbd0161632bce95 Signed-off-by: Oliver Hartkopp Acked-by: Jay Vosburgh Link: https://patch.msgid.link/20260526-bonding-candev-v1-1-ba1df400918a@hartkopp.net Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index af82a3df2c5d..82e779f7916b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1890,6 +1890,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct sockaddr_storage ss; int res = 0, i; + if (slave_dev->type == ARPHRD_CAN) { + BOND_NL_ERR(bond_dev, extack, + "CAN devices cannot be enslaved"); + return -EPERM; + } + if (slave_dev->flags & IFF_MASTER && !netif_is_bond_master(slave_dev)) { BOND_NL_ERR(bond_dev, extack, From 5eec4427b89c2fb2beac54920101e55a2f1c0c21 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 26 May 2026 09:48:16 +0300 Subject: [PATCH 58/94] bridge: Fix sleep in atomic context in netlink path Since the introduction of the netlink configuration path for bridge ports in commit 25c71c75ac87 ("bridge: bridge port parameters over netlink"), br_setport() was always called with the bridge lock held around it. Back then this decision made sense: The bridge lock protects the STP state of the bridge and its ports and at that time the function only processed three STP related netlink attributes (cost, priority and state). Nowadays, br_setport() processes a lot more attributes and most of them do not need the bridge lock: * Bridge flags: Only require RTNL. Read locklessly by the data path. Annotations can be added in net-next. * FDB port flushing: Only requires the FDB lock. * Multicast attributes: Only require the multicast lock. * Group forward mask: Only requires RTNL. Read locklessly by the data path. Annotations can be added in net-next. * Backup port and NHID: Only require RTNL. Read locklessly by the data path. This is a problem as the bridge calls dev_set_promiscuity() when certain bridge port flags change and this function can sleep since the commit cited below, resulting in a splat such as [1]. Fix this by reducing the scope of the bridge lock and only take it when processing the three STP related attributes that require it. This is consistent with the multicast attributes where each attribute acquires the multicast lock instead of having one critical section for all relevant attributes. [1] BUG: sleeping function called from invalid context at net/core/dev_addr_lists.c:1262 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 356, name: bridge preempt_count: 201, expected: 0 RCU nest depth: 0, expected: 0 2 locks held by bridge/356: #0: ffffffff919473a0 (rtnl_mutex){+.+.}-{4:4}, at: rtnetlink_rcv_msg (net/core/rtnetlink.c:80 net/core/rtnetlink.c:7002) #1: ffff888115072d58 (&br->lock){+...}-{3:3}, at: br_setlink (./include/linux/spinlock.h:348 net/bridge/br_netlink.c:1117) Preemption disabled at: 0x0 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack_lvl (lib/dump_stack.c:94 lib/dump_stack.c:120) __might_resched.cold (kernel/sched/core.c:9163) netif_rx_mode_run (net/core/dev_addr_lists.c:1262) netif_rx_mode_sync (net/core/dev_addr_lists.c:1428) dev_set_promiscuity (net/core/dev_api.c:289) br_manage_promisc (net/bridge/br_if.c:135 net/bridge/br_if.c:172) br_port_flags_change (net/bridge/br_if.c:242 net/bridge/br_if.c:747) br_setport (net/bridge/br_netlink.c:1000) br_setlink (net/bridge/br_netlink.c:1118) rtnl_bridge_setlink (net/core/rtnetlink.c:5572) rtnetlink_rcv_msg (net/core/rtnetlink.c:7005) netlink_rcv_skb (net/netlink/af_netlink.c:2550) netlink_unicast (net/netlink/af_netlink.c:1318 net/netlink/af_netlink.c:1344) netlink_sendmsg (net/netlink/af_netlink.c:1894) __sock_sendmsg (net/socket.c:787 (discriminator 4) net/socket.c:802 (discriminator 4)) ____sys_sendmsg (net/socket.c:2698) ___sys_sendmsg (net/socket.c:2752) __sys_sendmsg (net/socket.c:2784) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:121) Fixes: 78cd408356fe ("net: add missing instance lock to dev_set_promiscuity") Reviewed-by: Nikolay Aleksandrov Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20260526064818.272516-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_netlink.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c04a4d0889ae..b9591dd755f9 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1000,19 +1000,25 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], br_port_flags_change(p, changed_mask); if (tb[IFLA_BRPORT_COST]) { + spin_lock_bh(&p->br->lock); err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); + spin_unlock_bh(&p->br->lock); if (err) return err; } if (tb[IFLA_BRPORT_PRIORITY]) { + spin_lock_bh(&p->br->lock); err = br_stp_set_port_priority(p, nla_get_u16(tb[IFLA_BRPORT_PRIORITY])); + spin_unlock_bh(&p->br->lock); if (err) return err; } if (tb[IFLA_BRPORT_STATE]) { + spin_lock_bh(&p->br->lock); err = br_set_port_state(p, nla_get_u8(tb[IFLA_BRPORT_STATE])); + spin_unlock_bh(&p->br->lock); if (err) return err; } @@ -1114,9 +1120,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, if (err) return err; - spin_lock_bh(&p->br->lock); err = br_setport(p, tb, extack); - spin_unlock_bh(&p->br->lock); } else { /* Binary compatibility with old RSTP */ if (nla_len(protinfo) < sizeof(u8)) @@ -1203,17 +1207,10 @@ static int br_port_slave_changelink(struct net_device *brdev, struct nlattr *data[], struct netlink_ext_ack *extack) { - struct net_bridge *br = netdev_priv(brdev); - int ret; - if (!data) return 0; - spin_lock_bh(&br->lock); - ret = br_setport(br_port_get_rtnl(dev), data, extack); - spin_unlock_bh(&br->lock); - - return ret; + return br_setport(br_port_get_rtnl(dev), data, extack); } static int br_port_fill_slave_info(struct sk_buff *skb, From 6d34594cc619d0d4b07d5afcad8b5984f3526dcf Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 26 May 2026 09:48:17 +0300 Subject: [PATCH 59/94] bridge: Fix sleep in atomic context in sysfs path Since the start of the git history, brport_store() always acquired the bridge lock. Back then this decision made sense: The bridge lock protects the STP state of the bridge and its ports and at that time the function was only used by two STP related attributes (cost and priority). Nowadays, brport_store() processes a lot more attributes and most of them do not need the bridge lock: * Bridge flags: Only require RTNL. Read locklessly by the data path. Annotations can be added in net-next. * FDB port flushing: Only requires the FDB lock. * Multicast attributes: Only require the multicast lock. * Group forward mask: Only requires RTNL. Read locklessly by the data path. Annotations can be added in net-next. * Backup port: Only requires RTNL. Read locklessly by the data path. This is a problem as the bridge calls dev_set_promiscuity() when certain bridge port flags change and this function can sleep since the commit cited below, resulting in a splat such as [1]. Fix this by reducing the scope of the bridge lock and only take it when processing the two STP related attributes that require it. Remove the now stale comment from br_switchdev_set_port_flag(). The SWITCHDEV_F_DEFER flag can be removed in net-next. [1] BUG: sleeping function called from invalid context at net/core/dev_addr_lists.c:1262 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 372, name: bash preempt_count: 201, expected: 0 RCU nest depth: 0, expected: 0 5 locks held by bash/372: #0: ffff88810c51c3f0 (sb_writers#7){.+.+}-{0:0}, at: ksys_write (fs/read_write.c:740) #1: ffff888115ce9480 (&of->mutex){+.+.}-{4:4}, at: kernfs_fop_write_iter (fs/kernfs/file.c:343) #2: ffff88810b9fd330 (kn->active#37){.+.+}-{0:0}, at: kernfs_fop_write_iter (fs/kernfs/file.c:80 fs/kernfs/file.c:344) #3: ffffffffa59473a0 (rtnl_mutex){+.+.}-{4:4}, at: brport_store (net/bridge/br_sysfs_if.c:326) #4: ffff8881099d2d58 (&br->lock){+...}-{3:3}, at: brport_store (./include/linux/spinlock.h:348 net/bridge/br_sysfs_if.c:345) Preemption disabled at: 0x0 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack_lvl (lib/dump_stack.c:94 lib/dump_stack.c:120) __might_resched.cold (kernel/sched/core.c:9163) netif_rx_mode_run (net/core/dev_addr_lists.c:1262) netif_rx_mode_sync (net/core/dev_addr_lists.c:1428) dev_set_promiscuity (net/core/dev_api.c:289) br_manage_promisc (net/bridge/br_if.c:135 net/bridge/br_if.c:172) br_port_flags_change (net/bridge/br_if.c:242 net/bridge/br_if.c:747) store_learning (net/bridge/br_sysfs_if.c:79 net/bridge/br_sysfs_if.c:235) brport_store (net/bridge/br_sysfs_if.c:346) kernfs_fop_write_iter (fs/kernfs/file.c:352) new_sync_write (fs/read_write.c:595) vfs_write (fs/read_write.c:688) ksys_write (fs/read_write.c:740) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:121) Fixes: 78cd408356fe ("net: add missing instance lock to dev_set_promiscuity") Reviewed-by: Nikolay Aleksandrov Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20260526064818.272516-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_switchdev.c | 1 - net/bridge/br_sysfs_if.c | 30 ++++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 18b558a931ad..ee3ad9dfbab9 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -99,7 +99,6 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, attr.u.brport_flags.val = flags; attr.u.brport_flags.mask = mask; - /* We run from atomic context here */ err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev, &info.info, extack); err = notifier_to_errno(err); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 1f57c36a7fc0..d6df81fa0d13 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -86,16 +86,34 @@ static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) return sysfs_emit(buf, "%d\n", p->path_cost); } -static BRPORT_ATTR(path_cost, 0644, - show_path_cost, br_stp_set_path_cost); +static int store_path_cost(struct net_bridge_port *p, unsigned long v) +{ + int ret; + + spin_lock_bh(&p->br->lock); + ret = br_stp_set_path_cost(p, v); + spin_unlock_bh(&p->br->lock); + return ret; +} + +static BRPORT_ATTR(path_cost, 0644, show_path_cost, store_path_cost); static ssize_t show_priority(struct net_bridge_port *p, char *buf) { return sysfs_emit(buf, "%d\n", p->priority); } -static BRPORT_ATTR(priority, 0644, - show_priority, br_stp_set_port_priority); +static int store_priority(struct net_bridge_port *p, unsigned long v) +{ + int ret; + + spin_lock_bh(&p->br->lock); + ret = br_stp_set_port_priority(p, v); + spin_unlock_bh(&p->br->lock); + return ret; +} + +static BRPORT_ATTR(priority, 0644, show_priority, store_priority); static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) { @@ -334,17 +352,13 @@ static ssize_t brport_store(struct kobject *kobj, ret = -ENOMEM; goto out_unlock; } - spin_lock_bh(&p->br->lock); ret = brport_attr->store_raw(p, buf_copy); - spin_unlock_bh(&p->br->lock); kfree(buf_copy); } else if (brport_attr->store) { val = simple_strtoul(buf, &endp, 0); if (endp == buf) goto out_unlock; - spin_lock_bh(&p->br->lock); ret = brport_attr->store(p, val); - spin_unlock_bh(&p->br->lock); } if (!ret) { From 147f3b1f23cbd74f1022cc5689570a06f6bc47c8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 26 May 2026 09:48:18 +0300 Subject: [PATCH 60/94] selftests: rtnetlink: Add bridge promiscuity tests Add two test cases that always pass, but trigger sleeping in atomic context BUGs without "bridge: Fix sleep in atomic context in netlink path" and "bridge: Fix sleep in atomic context in sysfs path". Reviewed-by: Nikolay Aleksandrov Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20260526064818.272516-4-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/rtnetlink.sh | 63 ++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index c499953d4885..ace3a99023ed 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -24,6 +24,8 @@ ALL_TESTS=" kci_test_macsec kci_test_macsec_vlan kci_test_team_bridge_macvlan + kci_test_bridge_promisc_netlink + kci_test_bridge_promisc_sysfs kci_test_ipsec kci_test_ipsec_offload kci_test_fdb_get @@ -61,6 +63,14 @@ check_fail() fi } +sysfs_write() +{ + local val="$1" + local path="$2" + + echo "$val" > "$path" +} + run_cmd_common() { local cmd="$*" @@ -680,6 +690,59 @@ kci_test_team_bridge_macvlan() end_test "PASS: team_bridge_macvlan" } +# Test that changing bridge port flags via the netlink path does not sleep with +# the bridge spin lock held. +kci_test_bridge_promisc_netlink() +{ + local dummy="test_dummy1" + local bridge="test_br1" + local team="test_team1" + local ret=0 + + run_cmd ip link add $team up type team + run_cmd ip link add $bridge up type bridge vlan_filtering 1 + run_cmd ip link add $dummy up type dummy + run_cmd ip link set $dummy master $bridge + run_cmd ip link set $team master $bridge + + # This causes the bridge driver to sync all the static FDB entries to + # the team device (which supports unicast filtering) and remove it from + # promiscuous mode. The call to dev_set_promiscuity() can sleep due to + # Rx mode inlining, which is a problem if the bridge spin lock is held. + run_cmd bridge link set dev $dummy flood off learning off + + run_cmd ip link del $dummy + run_cmd ip link del $bridge + run_cmd ip link del $team + + end_test "PASS: bridge_promisc_netlink" +} + +# Same as kci_test_bridge_promisc_netlink(), but the flags are changed via the +# sysfs path. +kci_test_bridge_promisc_sysfs() +{ + local dummy="test_dummy1" + local bridge="test_br1" + local team="test_team1" + local ret=0 + + run_cmd ip link add $team up type team + run_cmd ip link add $bridge up type bridge vlan_filtering 1 + run_cmd ip link add $dummy up type dummy + run_cmd ip link set $dummy master $bridge + run_cmd ip link set $team master $bridge + + run_cmd sysfs_write 0 /sys/class/net/$dummy/brport/unicast_flood + run_cmd sysfs_write 0 /sys/class/net/$dummy/brport/learning + + run_cmd ip link del $dummy + run_cmd ip link del $bridge + run_cmd ip link del $team + + end_test "PASS: bridge_promisc_sysfs" +} + #------------------------------------------------------------------- # Example commands # ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \ From 7281b096b072f6c6e30420e3467d738f2e4c4b57 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:24 -0700 Subject: [PATCH 61/94] ethtool: coalesce: cap profile updates at NET_DIM_PARAMS_NUM_PROFILES ethnl_update_profile() walks the ETHTOOL_A_PROFILE_IRQ_MODERATION nest list with an index 'i' and writes new_profile[i++] without bounding i. The destination is kmemdup()'d at NET_DIM_PARAMS_NUM_PROFILES entries (5), but the Netlink nest count is entirely user-controlled. Netlink policies do not have support for constraining the number of nested entries (or number of multi-attr entries). Fixes: f750dfe825b9 ("ethtool: provide customized dim profile management") Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260526153533.2779187-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/coalesce.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 1e2c5c7048a8..e73fc3e5a02b 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -472,6 +472,12 @@ static int ethnl_update_profile(struct net_device *dev, nla_for_each_nested_type(nest, ETHTOOL_A_PROFILE_IRQ_MODERATION, nests, rem) { + if (i >= NET_DIM_PARAMS_NUM_PROFILES) { + NL_SET_BAD_ATTR(extack, nest); + ret = -E2BIG; + goto err_out; + } + ret = nla_parse_nested(tb, len_irq_moder - 1, nest, coalesce_irq_moderation_policy, extack); From a888bbd43940cada72f7686337741ce86d1cf869 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:25 -0700 Subject: [PATCH 62/94] ethtool: tsconfig: fix reply error handling A couple of trivial bugs in error handling in tsconfig_send_reply(). If we failed to allocate rskb we need to set the error. If we did allocate it but failed to send it - we need to remember to free it. Fixes: 6e9e2eed4f39 ("net: ethtool: Add support for tsconfig command to get/set hwtstamp config") Reviewed-by: Vadim Fedorenko Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20260526153533.2779187-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/tsconfig.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index e4f518e49d4c..e9db4ee2299d 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -224,16 +224,21 @@ static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info) reply_len = ret + ethnl_reply_header_size(); rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY, ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload); - if (!rskb) + if (!rskb) { + ret = -ENOMEM; goto err_cleanup; + } ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base); if (ret < 0) - goto err_cleanup; + goto err_free_msg; genlmsg_end(rskb, reply_payload); ret = genlmsg_reply(rskb, info); + rskb = NULL; +err_free_msg: + nlmsg_free(rskb); err_cleanup: kfree(reply_data); kfree(req_info); From 596c51ed9e125b12c4d85b4530dfd4c7847634b7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:26 -0700 Subject: [PATCH 63/94] ethtool: linkstate: fix unbalanced ethnl_ops_complete() on PHY lookup error linkstate_prepare_data() calls ethnl_req_get_phydev() before ethnl_ops_begin(), but routes its error path through "goto out" which calls ethnl_ops_complete(). Fixes: fe55b1d401c6 ("ethtool: linkstate: migrate linkstate functions to support multi-PHY setups") Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260526153533.2779187-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/linkstate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 8a5985fd7712..24569e92942c 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -106,10 +106,8 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_LINKSTATE_HEADER, info->extack); - if (IS_ERR(phydev)) { - ret = PTR_ERR(phydev); - goto out; - } + if (IS_ERR(phydev)) + return PTR_ERR(phydev); ret = ethnl_ops_begin(dev); if (ret < 0) From ab5bf428fb6bd361163c7247b92750d1d24ca2ed Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:27 -0700 Subject: [PATCH 64/94] ethtool: pse-pd: fix missing ethnl_ops_complete() pse_prepare_data() is missing ethnl_ops_complete() if ethnl_req_get_phydev() returned an error. Move getting phydev up so that we don't have to worry about this (similar order to linkstate_prepare_data()). Note that phydev may still be NULL (this is checked in pse_get_pse_attributes()), the goal isn't really to avoid the _begin() / _complete() calls, only to simplify the error handling. While at it propagate the original error. Why this code overrides the error with -ENODEV but !phydev generates -EOPNOTSUPP is unclear to me... Fixes: 31748765bed3 ("net: ethtool: pse-pd: Target the command to the requested PHY") Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260526153533.2779187-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/pse-pd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 2eb9bdc2dcb9..757c9e0cc856 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -62,14 +62,14 @@ static int pse_prepare_data(const struct ethnl_req_info *req_base, struct phy_device *phydev; int ret; - ret = ethnl_ops_begin(dev); - if (ret < 0) - return ret; - phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PSE_HEADER, info->extack); if (IS_ERR(phydev)) - return -ENODEV; + return PTR_ERR(phydev); + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; ret = pse_get_pse_attributes(phydev, info->extack, data); From 6386bd772de64e6760306eb91c7e86163af6c22f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:28 -0700 Subject: [PATCH 65/94] ethtool: tsconfig: fix missing ethnl_ops_complete() tsconfig_prepare_data() calls ethnl_ops_begin(), we need to call ethnl_ops_complete() before returning the error. Fixes: 6e9e2eed4f39 ("net: ethtool: Add support for tsconfig command to get/set hwtstamp config") Reviewed-by: Vadim Fedorenko Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20260526153533.2779187-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/tsconfig.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index e9db4ee2299d..fc4f93cfa459 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -69,8 +69,10 @@ static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, if (ret) goto out; - if (ts_info.phc_index == -1) - return -ENODEV; + if (ts_info.phc_index == -1) { + ret = -ENODEV; + goto out; + } data->hwprov_desc.index = ts_info.phc_index; data->hwprov_desc.qualifier = ts_info.phc_qualifier; From 1de405699c62c3a9544bcdcfb9eff8a01cfc7582 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:29 -0700 Subject: [PATCH 66/94] ethtool: tsinfo: fix uninitialized stats on the by-PHC path tsinfo_prepare_data() has two code paths: a "by-PHC" path for user-specified hardware timestamping providers, and the old path. Commit 89e281ebff72 ("ethtool: init tsinfo stats if requested") added ethtool_stats_init() to mark stat slots as ETHTOOL_STAT_NOT_SET before the driver callback populates them, but placed the call inside the old-path block. When commit b9e3f7dc9ed9 ("net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology") added the by-PHC early return, it landed above the stats initialization. On that path the stats array retains the zero-fill from ethnl_init_reply_data()'s zalloc. This leads to the reply including a stats nest with four zero-valued attributes that should have been absent. Reject GET requests for stats with HWTSTAMP_PROVIDER or dump. Fixes: b9e3f7dc9ed9 ("net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology") Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260526153533.2779187-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/tsinfo.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index a865f0fdd26b..f54fe6b662b2 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -83,6 +83,11 @@ tsinfo_parse_request(struct ethnl_req_info *req_base, if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER]) return 0; + if (req_base->flags & ETHTOOL_FLAG_STATS) { + NL_SET_ERR_MSG(extack, "can't query statistics for a provider"); + return -EOPNOTSUPP; + } + return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER], &req->hwprov_desc, extack, &mod); } @@ -523,6 +528,12 @@ int ethnl_tsinfo_start(struct netlink_callback *cb) if (ret < 0) goto free_reply_data; + if (req_info->base.flags & ETHTOOL_FLAG_STATS) { + NL_SET_ERR_MSG(cb->extack, "stats not supported in dump"); + ret = -EOPNOTSUPP; + goto err_dev_put; + } + ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; @@ -532,6 +543,8 @@ int ethnl_tsinfo_start(struct netlink_callback *cb) return 0; +err_dev_put: + ethnl_parse_header_dev_put(&req_info->base); free_reply_data: kfree(reply_data); free_req_info: From c3fc9976f686f9a95baf87db9d387f218fd65394 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:30 -0700 Subject: [PATCH 67/94] ethtool: tsinfo: don't pass ERR_PTR to genlmsg_cancel on prepare failure The goto err label leads to: genlmsg_cancel(skb, ehdr); return ret; If ethnl_tsinfo_prepare_dump() failed, it has not started a genlmsg. There's nothing to cancel, and passing an error pointer to genlmsg_cancel() would cause a crash. Fixes: b9e3f7dc9ed9 ("net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology") Reviewed-by: Maxime Chevallier Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20260526153533.2779187-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/tsinfo.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index f54fe6b662b2..14bf01e3b55c 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -407,10 +407,8 @@ static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb, continue; ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); - if (IS_ERR(ehdr)) { - ret = PTR_ERR(ehdr); - goto err; - } + if (IS_ERR(ehdr)) + return PTR_ERR(ehdr); reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier; ret = ops->get_ts_info(dev, &reply_data->ts_info); From a8d8bef6b45bf7cc0b1f6110c5cd8d0160a9bad7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:31 -0700 Subject: [PATCH 68/94] ethtool: strset: fix header attribute index in ethnl_req_get_phydev() strset_prepare_data() passes ETHTOOL_A_HEADER_FLAGS (3) as the header attribute to ethnl_req_get_phydev(). This is incorrect, in the main attr space 3 is ETHTOOL_A_STRSET_COUNTS_ONLY, not the request header attr. The correct constant is ETHTOOL_A_STRSET_HEADER (1). ethnl_req_get_phydev() only uses this value for the extack, so this is not a "functionally visible"(?) bug. Fixes: e96c93aa4be9 ("net: ethtool: strset: Allow querying phy stats by index") Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260526153533.2779187-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/strset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index bb1e829ba099..94c4718d31ae 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -311,7 +311,7 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base, return 0; } - phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_HEADER_FLAGS, + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_STRSET_HEADER, info->extack); /* phydev can be NULL, check for errors only */ From 2376586f85f972fefe701f095bb37dcfe7405d21 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:32 -0700 Subject: [PATCH 69/94] ethtool: eeprom: add missing ethnl_ops_begin() / _complete() during fallback All ethtool driver op calls should be sandwiched between ethnl_ops_begin() / ethnl_ops_complete(). In Netlink eeprom code, if the paged access failed we fall back to old API, but we first call _complete() and the fallback never does its own ethnl_ops_begin(). Move the fallback into the _begin() / _complete() section. Fixes: 96d971e307cc ("ethtool: Add fallback to get_module_eeprom from netlink command") Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260526153533.2779187-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/eeprom.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index a557e3996c85..836316df3092 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -141,12 +141,11 @@ static int eeprom_prepare_data(const struct ethnl_req_info *req_base, return 0; err_ops: + if (ret == -EOPNOTSUPP) + ret = eeprom_fallback(request, reply); ethnl_ops_complete(dev); err_free: kfree(page_data.data); - - if (ret == -EOPNOTSUPP) - return eeprom_fallback(request, reply); return ret; } From 67cfdd9210b99f260b3e0afeb9525e0acc7be31e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 26 May 2026 08:35:33 -0700 Subject: [PATCH 70/94] ethtool: eeprom: add more safeties to EEPROM Netlink fallback The Netlink fallback path for reading module EEPROM (fallback_set_params()) validates that offset < eeprom_len, but does not check that offset + length stays within eeprom_len. The ioctl equivalent (ethtool_get_any_eeprom() in ioctl.c) has always enforced both bounds: if (eeprom.offset + eeprom.len > total_len) return -EINVAL; This could lead to surprises in both drivers and device FW. Add the missing offset + length validation to fallback_set_params(), mirroring the ioctl. Similarly - ethtool core in general, and ethtool_get_any_eeprom() in particular tries to zero-init all buffers passed to the drivers to avoid any extra work of zeroing things out. eeprom_fallback() uses a plain kmalloc(), change it to zalloc. Fixes: 96d971e307cc ("ethtool: Add fallback to get_module_eeprom from netlink command") Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260526153533.2779187-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/eeprom.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 836316df3092..0b8cfeddb014 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -44,6 +44,9 @@ static int fallback_set_params(struct eeprom_req_info *request, if (offset >= modinfo->eeprom_len) return -EINVAL; + if (length > modinfo->eeprom_len - offset) + return -EINVAL; + eeprom->cmd = ETHTOOL_GMODULEEEPROM; eeprom->len = length; eeprom->offset = offset; @@ -69,7 +72,7 @@ static int eeprom_fallback(struct eeprom_req_info *request, if (err < 0) return err; - data = kmalloc(eeprom.len, GFP_KERNEL); + data = kzalloc(eeprom.len, GFP_KERNEL); if (!data) return -ENOMEM; err = ethtool_get_module_eeprom_call(dev, &eeprom, data); From 9d5e7a46a9f6d8f503b41bfefef70659845f1679 Mon Sep 17 00:00:00 2001 From: Rahul Chandelkar Date: Mon, 25 May 2026 21:10:31 +0530 Subject: [PATCH 71/94] ipv6: rpl: fix hdrlen overflow in ipv6_rpl_srh_decompress() ipv6_rpl_srh_decompress() computes: outhdr->hdrlen = (((n + 1) * sizeof(struct in6_addr)) >> 3); hdrlen is __u8. For n >= 127 the result exceeds 255 and silently truncates. With n=127 (cmpri=15, cmpre=15, pad=0, hdrlen=16): (128 * 16) >> 3 = 256, truncated to 0 as __u8 The caller in ipv6_rpl_srh_rcv() then places the compressed header at buf + ((ohdr->hdrlen + 1) << 3). With hdrlen=0 this is buf + 8, but the decompressed region occupies buf[0..2055] (8-byte header plus 128 full addresses). The compressed header overlaps the decompressed data, and ipv6_rpl_srh_compress() writes into this overlap, corrupting the routing header of the forwarded packet. The existing guard at exthdrs.c:546 checks (n + 1) > 255, which prevents n+1 from overflowing unsigned char (the segments_left field), but does not prevent the computed hdrlen from overflowing __u8. n=127 passes because 128 <= 255, yet hdrlen=256 does not fit. Tighten the bound to (n + 1) > 127. This caps n at 126, giving hdrlen = (127 * 16) >> 3 = 254, which fits in __u8. The compressed header then lands at buf + ((254 + 1) << 3) = buf + 2040, exactly past the decompressed region (buf[0..2039]). No overlap. 127 segments is well beyond any realistic RPL deployment. Fixes: 8610c7c6e3bd ("net: ipv6: add support for rpl sr exthdr") Signed-off-by: Rahul Chandelkar Link: https://patch.msgid.link/20260525154031.2290876-1-rc@rexion.ai Signed-off-by: Jakub Kicinski --- net/ipv6/exthdrs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index aca2a2abd2df..43f46ef9c53b 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -548,7 +548,7 @@ static int ipv6_rpl_srh_rcv(struct sk_buff *skb) * unsigned char which is segments_left field. Should not be * higher than that. */ - if (r || (n + 1) > 255) { + if (r || (n + 1) > 127) { kfree_skb(skb); return -1; } From 98b34f3e8c3492cfc89ff943c9d92b4d52863d1d Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 25 May 2026 08:25:48 -0400 Subject: [PATCH 72/94] net: Introduce skb tc depth field to track packet loops Add a 2-bit per-skb tc depth field to track packet loops across the stack. The previous per-CPU loop counters like MIRRED_NEST_LIMIT assume a single call stack and lose state in two cases: 1) When a packet is queued and reprocessed later (e.g., egress->ingress via backlog), the per-cpu state is gone by the time it is dequeued. 2) With XPS/RPS a packet may arrive on one CPU and be processed on another. A per-skb field solves both by travelling with the packet itself. The field fits in existing padding, using 2 bits that were previously a hole: pahole before(-) and after (+) diff looks like: __u8 slow_gro:1; /* 132: 3 1 */ __u8 csum_not_inet:1; /* 132: 4 1 */ __u8 unreadable:1; /* 132: 5 1 */ + __u8 tc_depth:2; /* 132: 6 1 */ - /* XXX 2 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ __u16 tc_index; /* 134 2 */ There used to be a ttl field which was removed as part of tc_verd in commit aec745e2c520 ("net-tc: remove unused tc_verd fields"). It was already unused by that time, due to remove earlier in commit c19ae86a510c ("tc: remove unused redirect ttl"). The first user of this field is netem, which increments tc_depth on duplicated packets before re-enqueueing them at the root qdisc. On re-entry, netem skips duplication for any skb with tc_depth already set, bounding recursion to a single level regardless of tree topology. The other user is mirred which increments it on each pass and limits to depth to MIRRED_DEFER_LIMIT (3). The new field was called ttl in earlier versions of this patch but renamed to tc_depth to avoid confusion with IP ttl. Note (looking at you Sashiko! Dont ignore me and continue bringing this up): 1. Since both mirred and netem utilize the same 2-bit tc_depth field it is possible when netem and mirred are used together that netem qdisc to skip the duplication step. This is a known trade-off, as a 2-bit field cannot independently track both features' recursion depths and it is not considered sane to have a setup that addresses both features on at the same time. 2. skb_scrub_packet does not clear tc_depth. This means a packet's loop history is preserved even across namespaces. While this might be restrictive for some topologies, it is also design intent to provide robustness against loops across namespaces. Reviewed-by: Stephen Hemminger Signed-off-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260525122556.973584-2-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2bcf78a4de7b..3f06254ab1b7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -821,6 +821,7 @@ enum skb_tstamp_type { * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @skb_iif: ifindex of device we arrived on + * @tc_depth: counter for packet duplication * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices @@ -1030,6 +1031,7 @@ struct sk_buff { __u8 csum_not_inet:1; #endif __u8 unreadable:1; + __u8 tc_depth:2; #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif From eda0b7f203bb166c98d1418b204135bd566ac83b Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 25 May 2026 08:25:49 -0400 Subject: [PATCH 73/94] net/sched: Revert "net/sched: Restrict conditions for adding duplicating netems to qdisc tree" This reverts commit ec8e0e3d7adef940cdf9475e2352c0680189d14e. The original patch rejects any tree containing two netems when either has duplication set, even when they sit on unrelated classes of the same classful parent. That broke configurations that have worked since netem was introduced. The re-entrancy problem the original commit was trying to solve is handled by later patch using tc_depth flag. Doing this revert will (re)expose the original bug with multiple netem duplication. When this patch is backported make sure and get the full series. Fixes: ec8e0e3d7ade ("net/sched: Restrict conditions for adding duplicating netems to qdisc tree") Reported-by: Ji-Soo Chung Reported-by: Gerlinde Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220774 Reported-by: zyc zyc Closes: https://lore.kernel.org/all/19adda5a1e2.12410b78222774.9191120410578703463@zohomail.cn/ Reported-by: Manas Ghandat Closes: https://lore.kernel.org/netdev/f69b2c8f-8325-4c2e-a011-6dbc089f30e4@gmail.com/ Reviewed-by: Stephen Hemminger Signed-off-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260525122556.973584-3-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- net/sched/sch_netem.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bc18e1976b6e..d97acd2f3923 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1007,41 +1007,6 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } -static const struct Qdisc_class_ops netem_class_ops; - -static int check_netem_in_tree(struct Qdisc *sch, bool duplicates, - struct netlink_ext_ack *extack) -{ - struct Qdisc *root, *q; - unsigned int i; - - root = qdisc_root_sleeping(sch); - - if (sch != root && root->ops->cl_ops == &netem_class_ops) { - if (duplicates || - ((struct netem_sched_data *)qdisc_priv(root))->duplicate) - goto err; - } - - if (!qdisc_dev(root)) - return 0; - - hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) { - if (sch != q && q->ops->cl_ops == &netem_class_ops) { - if (duplicates || - ((struct netem_sched_data *)qdisc_priv(q))->duplicate) - goto err; - } - } - - return 0; - -err: - NL_SET_ERR_MSG(extack, - "netem: cannot mix duplicating netems with other netems in tree"); - return -EINVAL; -} - /* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) @@ -1118,11 +1083,6 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, q->gap = qopt->gap; q->counter = 0; q->loss = qopt->loss; - - ret = check_netem_in_tree(sch, qopt->duplicate, extack); - if (ret) - goto unlock; - q->duplicate = qopt->duplicate; /* for compatibility with earlier versions. From b213a4c6074fc4ee4f1cdef9a73b34732606b637 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 25 May 2026 08:25:50 -0400 Subject: [PATCH 74/94] Revert "selftests/tc-testing: Add tests for restrictions on netem duplication" This reverts commit ecdec65ec78d67d3ebd17edc88b88312054abe0d. The tests added were related to check_netem_in_tree() which was just reverted in the previous patch. Reviewed-by: Stephen Hemminger Signed-off-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260525122556.973584-4-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/infra/qdiscs.json | 5 +- .../tc-testing/tc-tests/qdiscs/netem.json | 81 ------------------- 2 files changed, 3 insertions(+), 83 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 848696c373fc..82c38a13dfbf 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -702,6 +702,7 @@ "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 netem duplicate 100%", "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1", "$TC class add dev $DUMMY parent 1:0 classid 1:2 hfsc ls m2 10Mbit", + "$TC qdisc add dev $DUMMY parent 1:2 handle 3:0 netem duplicate 100%", "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2", "ping -c 1 10.10.10.1 -I$DUMMY > /dev/null || true", "$TC filter del dev $DUMMY parent 1:0 protocol ip prio 1", @@ -714,8 +715,8 @@ { "kind": "hfsc", "handle": "1:", - "bytes": 294, - "packets": 3 + "bytes": 392, + "packets": 4 } ], "matchCount": "1", diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json index 718d2df2aafa..3c4444961488 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json @@ -336,86 +336,5 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] - }, - { - "id": "d34d", - "name": "NETEM test qdisc duplication restriction in qdisc tree in netem_change root", - "category": ["qdisc", "netem"], - "plugins": { - "requires": "nsPlugin" - }, - "setup": [ - "$TC qdisc add dev $DUMMY root handle 1: netem limit 1", - "$TC qdisc add dev $DUMMY parent 1: handle 2: netem limit 1" - ], - "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 1: netem duplicate 50%", - "expExitCode": "2", - "verifyCmd": "$TC -s qdisc show dev $DUMMY", - "matchPattern": "qdisc netem", - "matchCount": "2", - "teardown": [ - "$TC qdisc del dev $DUMMY handle 1:0 root" - ] - }, - { - "id": "b33f", - "name": "NETEM test qdisc duplication restriction in qdisc tree in netem_change non-root", - "category": ["qdisc", "netem"], - "plugins": { - "requires": "nsPlugin" - }, - "setup": [ - "$TC qdisc add dev $DUMMY root handle 1: netem limit 1", - "$TC qdisc add dev $DUMMY parent 1: handle 2: netem limit 1" - ], - "cmdUnderTest": "$TC qdisc change dev $DUMMY handle 2: netem duplicate 50%", - "expExitCode": "2", - "verifyCmd": "$TC -s qdisc show dev $DUMMY", - "matchPattern": "qdisc netem", - "matchCount": "2", - "teardown": [ - "$TC qdisc del dev $DUMMY handle 1:0 root" - ] - }, - { - "id": "cafe", - "name": "NETEM test qdisc duplication restriction in qdisc tree", - "category": ["qdisc", "netem"], - "plugins": { - "requires": "nsPlugin" - }, - "setup": [ - "$TC qdisc add dev $DUMMY root handle 1: netem limit 1 duplicate 100%" - ], - "cmdUnderTest": "$TC qdisc add dev $DUMMY parent 1: handle 2: netem duplicate 100%", - "expExitCode": "2", - "verifyCmd": "$TC -s qdisc show dev $DUMMY", - "matchPattern": "qdisc netem", - "matchCount": "1", - "teardown": [ - "$TC qdisc del dev $DUMMY handle 1:0 root" - ] - }, - { - "id": "1337", - "name": "NETEM test qdisc duplication restriction in qdisc tree across branches", - "category": ["qdisc", "netem"], - "plugins": { - "requires": "nsPlugin" - }, - "setup": [ - "$TC qdisc add dev $DUMMY parent root handle 1:0 hfsc", - "$TC class add dev $DUMMY parent 1:0 classid 1:1 hfsc rt m2 10Mbit", - "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 netem", - "$TC class add dev $DUMMY parent 1:0 classid 1:2 hfsc rt m2 10Mbit" - ], - "cmdUnderTest": "$TC qdisc add dev $DUMMY parent 1:2 handle 3:0 netem duplicate 100%", - "expExitCode": "2", - "verifyCmd": "$TC -s qdisc show dev $DUMMY", - "matchPattern": "qdisc netem", - "matchCount": "1", - "teardown": [ - "$TC qdisc del dev $DUMMY handle 1:0 root" - ] } ] From 9552b11e3edabc97cfcd9f29103d5afbce7ae183 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 25 May 2026 08:25:51 -0400 Subject: [PATCH 75/94] net/sched: fix packet loop on netem when duplicate is on When netem duplicates a packet it re-enqueues the copy at the root qdisc. If another netem sits in the tree the copy can be duplicated again, recursing until the stack or memory is exhausted. The original duplication guard temporarily zeroed q->duplicate around the re-enqueue, but that does not cover all cases because it is per-qdisc state shared across all concurrent enqueue paths and is not safe without additional locking. Use the skb tc_depth field introduced in an earlier patch: - increment it on the duplicate before re-enqueue - skip duplication for any skb whose tc_depth is already non-zero. This marks the packet itself rather than mutating qdisc state, therefore it is safe regardless of tree topology or concurrency. Fixes: 0afb51e72855 ("[PKT_SCHED]: netem: reinsert for duplication") Reported-by: William Liu Reported-by: Savino Dicanosa Closes: https://lore.kernel.org/netdev/8DuRWwfqjoRDLDmBMlIfbrsZg9Gx50DHJc1ilxsEBNe2D6NMoigR_eIRIG0LOjMc3r10nUUZtArXx4oZBIdUfZQrwjcQhdinnMis_0G7VEk=@willsroot.io/ Co-developed-by: Victor Nogueira Signed-off-by: Victor Nogueira Reviewed-by: William Liu Reviewed-by: Stephen Hemminger Signed-off-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260525122556.973584-5-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- net/sched/sch_netem.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index d97acd2f3923..17a79fe2f091 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -461,7 +461,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb->prev = NULL; /* Random duplication */ - if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) + if (q->duplicate && skb->tc_depth == 0 && + q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) ++count; /* Drop packet? */ @@ -540,11 +541,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, */ if (skb2) { struct Qdisc *rootq = qdisc_root_bh(sch); - u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ - q->duplicate = 0; + skb2->tc_depth++; /* prevent duplicating a dup... */ rootq->enqueue(skb2, rootq, to_free); - q->duplicate = dupsave; skb2 = NULL; } From db875221ab08d213a83bf30196ae8b64d55a3403 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 25 May 2026 08:25:52 -0400 Subject: [PATCH 76/94] net/sched: Fix ethx:ingress -> ethy:egress -> ethx:ingress mirred loop When mirred redirects to ingress (from either ingress or egress) the loop state from sched_mirred_dev array dev is lost because of 1) the packet deferral into the backlog and 2) the fact the sched_mirred_dev array is cleared. In such cases, if there was a loop we won't discover it. Here's a simple test to reproduce: ip a add dev port0 10.10.10.11/24 tc qdisc add dev port0 clsact tc filter add dev port0 egress protocol ip \ prio 10 matchall action mirred ingress redirect dev port1 tc qdisc add dev port1 clsact tc filter add dev port1 ingress protocol ip \ prio 10 matchall action mirred egress redirect dev port0 ping -c 1 -W0.01 10.10.10.10 Fixes: fe946a751d9b ("net/sched: act_mirred: add loop detection") Tested-by: Victor Nogueira Reviewed-by: Stephen Hemminger Signed-off-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260525122556.973584-6-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- net/sched/act_mirred.c | 47 +++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 2c5a7a321a94..dd5e7ea7ef26 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -26,6 +26,10 @@ #include #include +#define MIRRED_DEFER_LIMIT 3 +_Static_assert(MIRRED_DEFER_LIMIT <= 3, + "MIRRED_DEFER_LIMIT exceeds tc_depth bitfield width"); + static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); @@ -234,12 +238,15 @@ tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb) { int err; - if (!want_ingress) + if (!want_ingress) { err = tcf_dev_queue_xmit(skb, dev_queue_xmit); - else if (!at_ingress) - err = netif_rx(skb); - else - err = netif_receive_skb(skb); + } else { + skb->tc_depth++; + if (!at_ingress) + err = netif_rx(skb); + else + err = netif_receive_skb(skb); + } return err; } @@ -426,6 +433,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, struct netdev_xmit *xmit; bool m_mac_header_xmit; struct net_device *dev; + bool want_ingress; int i, m_eaction; u32 blockid; @@ -434,7 +442,8 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, #else xmit = this_cpu_ptr(&softnet_data.xmit); #endif - if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) { + if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT || + skb->tc_depth >= MIRRED_DEFER_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); return TC_ACT_SHOT; @@ -453,23 +462,27 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, tcf_action_inc_overlimit_qstats(&m->common); return retval; } - for (i = 0; i < xmit->sched_mirred_nest; i++) { - if (xmit->sched_mirred_dev[i] != dev) - continue; - pr_notice_once("tc mirred: loop on device %s\n", - netdev_name(dev)); - tcf_action_inc_overlimit_qstats(&m->common); - return retval; + + m_eaction = READ_ONCE(m->tcfm_eaction); + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + if (!want_ingress) { + for (i = 0; i < xmit->sched_mirred_nest; i++) { + if (xmit->sched_mirred_dev[i] != dev) + continue; + pr_notice_once("tc mirred: loop on device %s\n", + netdev_name(dev)); + tcf_action_inc_overlimit_qstats(&m->common); + return retval; + } + xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; } - xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; - m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); - m_eaction = READ_ONCE(m->tcfm_eaction); retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, retval); - xmit->sched_mirred_nest--; + if (!want_ingress) + xmit->sched_mirred_nest--; return retval; } From a005fa5d7502eefec7ee6e1c01adadc06de2f9ad Mon Sep 17 00:00:00 2001 From: "Kito Xu (veritas501)" Date: Mon, 25 May 2026 08:25:53 -0400 Subject: [PATCH 77/94] net/sched: act_mirred: Fix blockcast recursion bypass leading to stack overflow tcf_mirred_act() checks sched_mirred_nest against MIRRED_NEST_LIMIT (4) to prevent deep recursion. However, when the action uses blockcast (tcfm_blockid != 0), the function returns at the tcf_blockcast() call BEFORE reaching the counter increment. As a result, the recursion counter never advances and the limit check is entirely bypassed. When two devices share a TC egress block with a mirred blockcast rule, a packet egressing on device A is mirrored to device B via blockcast; device B's egress TC re-enters tcf_mirred_act() via blockcast and mirrors back to A, creating an unbounded recursion loop: tcf_mirred_act -> tcf_blockcast -> tcf_mirred_to_dev -> dev_queue_xmit -> sch_handle_egress -> tcf_classify -> tcf_mirred_act -> (repeat) This recursion continues until the kernel stack overflows. The bug is reachable from an unprivileged user via unshare(CLONE_NEWUSER | CLONE_NEWNET): user namespaces grant CAP_NET_ADMIN in the new network namespace, which is sufficient to create dummy devices, attach clsact qdiscs with shared blocks, and install mirred blockcast filters. BUG: TASK stack guard page was hit at ffffc90000b7fff8 Oops: stack guard page: 0000 [#1] SMP KASAN NOPTI CPU: 2 UID: 1000 PID: 169 Comm: poc Not tainted 7.0.0-rc7-next-20260410 RIP: 0010:xas_find+0x17/0x480 Call Trace: xa_find+0x17b/0x1d0 tcf_mirred_act+0x640/0x1060 tcf_action_exec+0x400/0x530 basic_classify+0x128/0x1d0 tcf_classify+0xd83/0x1150 tc_run+0x328/0x620 __dev_queue_xmit+0x797/0x3100 tcf_mirred_to_dev+0x7b1/0xf70 tcf_mirred_act+0x68a/0x1060 [repeating ~30+ times until stack overflow] Kernel panic - not syncing: Fatal exception in interrupt Fix this by incrementing sched_mirred_nest before calling tcf_blockcast() and decrementing it on return, mirroring the non-blockcast path. This ensures subsequent recursive entries see the updated counter and are correctly limited by MIRRED_NEST_LIMIT. Fixes: fe946a751d9b ("net/sched: act_mirred: add loop detection") Signed-off-by: Kito Xu (veritas501) Link: https://patch.msgid.link/20260525122556.973584-7-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- net/sched/act_mirred.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index dd5e7ea7ef26..dbe4a4ff3e08 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -396,14 +396,12 @@ static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m, static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m, const u32 blockid, struct tcf_result *res, - int retval) + int m_eaction, int retval) { const u32 exception_ifindex = skb->dev->ifindex; struct tcf_block *block; bool is_redirect; - int m_eaction; - m_eaction = READ_ONCE(m->tcfm_eaction); is_redirect = tcf_mirred_is_act_redirect(m_eaction); /* we are already under rcu protection, so can call block lookup @@ -453,8 +451,16 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, tcf_action_update_bstats(&m->common, skb); blockid = READ_ONCE(m->tcfm_blockid); - if (blockid) - return tcf_blockcast(skb, m, blockid, res, retval); + m_eaction = READ_ONCE(m->tcfm_eaction); + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + if (blockid) { + if (!want_ingress) + xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = NULL; + retval = tcf_blockcast(skb, m, blockid, res, m_eaction, retval); + if (!want_ingress) + xmit->sched_mirred_nest--; + return retval; + } dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { @@ -463,8 +469,6 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, return retval; } - m_eaction = READ_ONCE(m->tcfm_eaction); - want_ingress = tcf_mirred_act_wants_ingress(m_eaction); if (!want_ingress) { for (i = 0; i < xmit->sched_mirred_nest; i++) { if (xmit->sched_mirred_dev[i] != dev) From e80ad525fc7e8c933ad78478c5dda286cfd55c60 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Mon, 25 May 2026 08:25:54 -0400 Subject: [PATCH 78/94] net/sched: act_mirred: Fix return code in early mirred redirect error paths Since retval is set as TC_ACT_STOLEN in the mirred redirect case, returning retval in cases where redirect failed will make the callers not register the skb as being dropped. Fix this by returning TC_ACT_SHOT instead in such scenarios. Fixes: 16085e48cb48 ("net/sched: act_mirred: Create function tcf_mirred_to_dev and improve readability") Reported-by: Sashiko Closes: https://sashiko.dev/#/patchset/20260413082027.2244884-1-hxzene%40gmail.com Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20260525122556.973584-8-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- net/sched/act_mirred.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index dbe4a4ff3e08..553342c55cf7 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -372,7 +372,8 @@ static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m, dev_is_mac_header_xmit(dev_prev), m_eaction, retval); - return retval; + /* If the packet wasn't redirected, we have to register as a drop */ + return TC_ACT_SHOT; } static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m, @@ -410,7 +411,7 @@ static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m, block = tcf_block_lookup(dev_net(skb->dev), blockid); if (!block || xa_empty(&block->ports)) { tcf_action_inc_overlimit_qstats(&m->common); - return retval; + return is_redirect ? TC_ACT_SHOT : retval; } if (is_redirect) @@ -428,8 +429,8 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, { struct tcf_mirred *m = to_mirred(a); int retval = READ_ONCE(m->tcf_action); + bool m_mac_header_xmit, is_redirect; struct netdev_xmit *xmit; - bool m_mac_header_xmit; struct net_device *dev; bool want_ingress; int i, m_eaction; @@ -462,11 +463,13 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, return retval; } + is_redirect = tcf_mirred_is_act_redirect(m_eaction); + dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); tcf_action_inc_overlimit_qstats(&m->common); - return retval; + goto err_out; } if (!want_ingress) { @@ -476,7 +479,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, pr_notice_once("tc mirred: loop on device %s\n", netdev_name(dev)); tcf_action_inc_overlimit_qstats(&m->common); - return retval; + goto err_out; } xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; } @@ -489,6 +492,11 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, xmit->sched_mirred_nest--; return retval; + +err_out: + if (is_redirect) + retval = TC_ACT_SHOT; + return retval; } static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, From d38dc56a0225664e494221b5b251931b35d125ef Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Mon, 25 May 2026 08:25:55 -0400 Subject: [PATCH 79/94] selftests/tc-testing: Add mirred test cases exercising loops Add mirred loop test cases to validate that those will be caught and other test cases that were previously misinterpreted as loops by mirred. This commit adds 12 test cases: - Redirect multiport: dummy egress -> dev1 ingress -> dummy egress (Loop) - Redirect singleport: dev1 ingress -> dev1 egress -> dev1 ingress (Loop) - Redirect multiport: dev1 ingress -> dummy ingress -> dev1 egress (No Loop) - Redirect multiport: dev1 ingress -> dummy ingress -> dev1 ingress (Loop) - Redirect multiport: dev1 ingress -> dummy egress -> dev1 ingress (Loop) - Redirect multiport: dummy egress -> dev1 ingress -> dummy egress, different prios (Loop) - Redirect multiport: dev1 ingress -> dummy ingress -> dummy egress -> dev1 egress (No Loop) - Redirect multiport: dev1 ingress -> dummy egress -> dev1 egress (No Loop) - Redirect multiport: dev1 ingress -> dummy egress -> dummy ingress (No Loop) - Redirect singleport: dev1 ingress -> dev1 ingress (Loop) - Redirect singleport: dummy egress -> dummy ingress (No Loop) - Redirect multiport: dev1 ingress -> dummy ingress -> dummy egress (No Loop) Acked-by: Jamal Hadi Salim Acked-by: Stephen Hemminger Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20260525122556.973584-9-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/actions/mirred.json | 616 +++++++++++++++++- 1 file changed, 615 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json index b056eb966871..d0cad6571691 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json @@ -1144,6 +1144,620 @@ "teardown": [ "$TC qdisc del dev $DUMMY clsact" ] + }, + { + "id": "531c", + "name": "Redirect multiport: dummy egress -> dev1 ingress -> dummy egress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY clsact", + "$TC filter add dev $DUMMY egress protocol ip prio 10 matchall action mirred ingress redirect dev $DEV1 index 1", + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DUMMY index 2" + ], + "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 3 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY clsact", + "$TC qdisc del dev $DEV1 clsact" + ] + }, + { + "id": "b1d7", + "name": "Redirect singleport: dev1 ingress -> dev1 egress -> dev1 ingress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DEV1 index 1" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 egress protocol ip prio 11 matchall action mirred ingress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "egress", + "index": 1, + "stats": { + "packets": 3 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact" + ] + }, + { + "id": "c66d", + "name": "Redirect multiport: dev1 ingress -> dummy ingress -> dev1 egress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY ingress protocol ip prio 11 matchall action mirred egress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "aa99", + "name": "Redirect multiport: dev1 ingress -> dummy ingress -> dev1 ingress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY ingress protocol ip prio 11 matchall action mirred ingress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 2, + "overlimits": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "37d7", + "name": "Redirect multiport: dev1 ingress -> dummy egress -> dev1 ingress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY egress protocol ip prio 11 matchall action mirred ingress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "egress", + "index": 1, + "stats": { + "packets": 3 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "6d02", + "name": "Redirect multiport: dummy egress -> dev1 ingress -> dummy egress, different prios (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY clsact", + "$TC filter add dev $DUMMY egress protocol ip prio 10 matchall action mirred ingress redirect dev $DEV1 index 1", + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 11 matchall action mirred egress redirect dev $DUMMY index 2" + ], + "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 3 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY clsact", + "$TC qdisc del dev $DEV1 clsact" + ] + }, + { + "id": "8115", + "name": "Redirect multiport: dev1 ingress -> dummy ingress -> dummy egress -> dev1 egress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact", + "$TC filter add dev $DUMMY ingress protocol ip prio 11 matchall action mirred egress redirect dev $DUMMY index 2" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY egress protocol ip prio 12 matchall action mirred egress redirect dev $DEV1 index 3", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "9eb3", + "name": "Redirect multiport: dev1 ingress -> dummy egress -> dev1 egress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY egress protocol ip prio 11 matchall action mirred egress redirect dev $DEV1 index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "egress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "d837", + "name": "Redirect multiport: dev1 ingress -> dummy egress -> dummy ingress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred egress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY egress protocol ip prio 11 matchall action mirred ingress redirect dev $DUMMY index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "egress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "2071", + "name": "Redirect singleport: dev1 ingress -> dev1 ingress (Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact" + ], + "cmdUnderTest": "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DEV1 index 1", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1, + "overlimits": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact" + ] + }, + { + "id": "0101", + "name": "Redirect singleport: dummy egress -> dummy ingress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY clsact", + "$TC filter add dev $DUMMY egress protocol ip prio 11 matchall action mirred ingress redirect dev $DUMMY index 1" + ], + "cmdUnderTest": "ping -c1 -W0.01 -I $DUMMY 10.10.10.1", + "expExitCode": "1", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY clsact" + ] + }, + { + "id": "cf97", + "name": "Redirect multiport: dev1 ingress -> dummy ingress -> dummy egress (No Loop)", + "category": [ + "filter", + "mirred" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 clsact", + "$TC filter add dev $DEV1 ingress protocol ip prio 10 matchall action mirred ingress redirect dev $DUMMY index 1", + "$TC qdisc add dev $DUMMY clsact" + ], + "cmdUnderTest": "$TC filter add dev $DUMMY ingress protocol ip prio 11 matchall action mirred egress redirect dev $DUMMY index 2", + "scapy": [ + { + "iface": "$DEV0", + "count": 1, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/ICMP()" + } + ], + "expExitCode": "0", + "verifyCmd": "$TC -j -s actions get action mirred index 1", + "matchJSON": [ + { + "total acts": 0 + }, + { + "actions": [ + { + "order": 1, + "kind": "mirred", + "mirred_action": "redirect", + "direction": "ingress", + "index": 1, + "stats": { + "packets": 1 + }, + "not_in_hw": true + } + ] + } + ], + "teardown": [ + "$TC qdisc del dev $DEV1 clsact", + "$TC qdisc del dev $DUMMY clsact" + ] } - ] From 0f6e00aa5f652f5653e0039b9c9a8835f4b4174b Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Mon, 25 May 2026 08:25:56 -0400 Subject: [PATCH 80/94] selftests/tc-testing: Add netem test case exercising loops Add a netem nested duplicate test case to validate that it won't cause an infinite loop Acked-by: Jamal Hadi Salim Acked-by: Stephen Hemminger Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20260525122556.973584-10-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/qdiscs/netem.json | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json index 3c4444961488..472b672a600d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/netem.json @@ -336,5 +336,36 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] - } + }, + { + "id": "8c17", + "name": "Test netem's recursive duplicate", + "category": [ + "qdisc", + "netem" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: netem limit 1000 duplicate 100%", + "$TC qdisc add dev $DUMMY parent 1: handle 2: netem limit 1000 duplicate 100%" + ], + "cmdUnderTest": "ping -c 1 10.10.11.11 -W 0.01", + "expExitCode": "1", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY root", + "matchJSON": [ + { + "kind": "netem", + "handle": "1:", + "bytes": 294, + "packets": 3 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + } ] From 463a1271aa26eac992851b9d98cc75bc3cd4a1ed Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Mon, 25 May 2026 22:45:24 +0800 Subject: [PATCH 81/94] net: hibmcge: disable Relaxed Ordering to fix RX packet corruption When SMMU is disabled, the hibmcge driver may receive corrupted packets. The hardware writes packet data and descriptors to the same page, but with Relaxed Ordering enabled, PCI write transactions may not be strictly ordered. This can cause the driver to observe a valid descriptor before the corresponding packet data is fully written. Fix this by clearing PCI_EXP_DEVCTL_RELAX_EN in the PCI bridge control register to ensure strict write ordering between packet data and descriptors. Fixes: f72e25594061 ("net: hibmcge: Implement rx_poll function to receive packets") Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20260525144525.94884-2-shaojijie@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c index 068da2fd1fea..f721e9893804 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_main.c @@ -420,6 +420,9 @@ static int hbg_pci_init(struct pci_dev *pdev) return -ENOMEM; pci_set_master(pdev); + pcie_capability_clear_word(pdev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_RELAX_EN); + pci_save_state(pdev); return 0; } From b545b6ea1802b32436fa97f1d2918718212cc831 Mon Sep 17 00:00:00 2001 From: Jijie Shao Date: Mon, 25 May 2026 22:45:25 +0800 Subject: [PATCH 82/94] net: hibmcge: move dma_rmb() after dma_sync_single_for_cpu() in RX path The dma_rmb() barrier was placed before dma_sync_single_for_cpu(), which is incorrect. DMA sync must complete first to make the buffer accessible to the CPU, then the rmb barrier ensures subsequent descriptor reads observe the latest data written by the hardware. Reorder the operations so dma_sync_single_for_cpu() is called before dma_rmb() to guarantee the driver reads consistent data from the DMA buffer. Fixes: f72e25594061 ("net: hibmcge: Implement rx_poll function to receive packets") Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20260525144525.94884-3-shaojijie@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.c b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.c index a4ea92c31c2f..0ae314994676 100644 --- a/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.c +++ b/drivers/net/ethernet/hisilicon/hibmcge/hbg_txrx.c @@ -452,12 +452,12 @@ static bool hbg_sync_data_from_hw(struct hbg_priv *priv, { struct hbg_rx_desc *rx_desc; - /* make sure HW write desc complete */ - dma_rmb(); - dma_sync_single_for_cpu(&priv->pdev->dev, buffer->page_dma, buffer->page_size, DMA_FROM_DEVICE); + /* make sure HW write desc complete */ + dma_rmb(); + rx_desc = (struct hbg_rx_desc *)buffer->page_addr; return FIELD_GET(HBG_RX_DESC_W2_PKT_LEN_M, rx_desc->word2) != 0; } From 98d0912e9f841e5529a5b89a972805f34cb1c69d Mon Sep 17 00:00:00 2001 From: Minh Nguyen Date: Tue, 26 May 2026 11:12:39 +0700 Subject: [PATCH 83/94] net: skbuff: fix missing zerocopy reference in pskb_carve helpers pskb_carve_inside_header() and pskb_carve_inside_nonlinear() both copy the old skb_shared_info header into a new buffer via memcpy(), which includes the destructor_arg pointer (uarg) for MSG_ZEROCOPY skbs. Neither function calls net_zcopy_get() for the new shinfo, creating an unaccounted holder: every skb_shared_info with destructor_arg set will call skb_zcopy_clear() once when freed, but the corresponding net_zcopy_get() was never called for the new copy. Repeated calls drive uarg->refcnt to zero prematurely, freeing ubuf_info_msgzc while TX skbs still hold live destructor_arg pointers. KASAN reports use-after-free on a freed ubuf_info_msgzc: BUG: KASAN: slab-use-after-free in skb_release_data+0x77b/0x810 Read of size 8 at addr ffff88801574d3e8 by task poc/220 Call Trace: skb_release_data+0x77b/0x810 kfree_skb_list_reason+0x13e/0x610 skb_release_data+0x4cd/0x810 sk_skb_reason_drop+0xf3/0x340 skb_queue_purge_reason+0x282/0x440 rds_tcp_inc_free+0x1e/0x30 rds_recvmsg+0x354/0x1780 __sys_recvmsg+0xdf/0x180 Allocated by task 219: msg_zerocopy_realloc+0x157/0x7b0 tcp_sendmsg_locked+0x2892/0x3ba0 Freed by task 219: ip_recv_error+0x74a/0xb10 tcp_recvmsg+0x475/0x530 The skb consuming the late access still referenced the same uarg via shinfo->destructor_arg copied by pskb_carve_inside_nonlinear() without a refcount bump. This has been verified to be reliably exploitable: a working proof-of-concept achieves full root privilege escalation from an unprivileged local user on a default kernel configuration. The fix follows the pattern of pskb_expand_head() which has the same memcpy/cloned structure. For pskb_carve_inside_header(), net_zcopy_get() is placed after skb_orphan_frags() succeeds, so the orphan error path needs no cleanup. For pskb_carve_inside_nonlinear(), net_zcopy_get() is placed after all failure points and just before skb_release_data(), so no error path needs cleanup at all -- matching pskb_expand_head() more closely and avoiding the need for a balancing net_zcopy_put(). Fixes: 6fa01ccd8830 ("skbuff: Add pskb_extract() helper function") Cc: stable@vger.kernel.org Assisted-by: Claude:claude-sonnet-4-6 Signed-off-by: Minh Nguyen Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260526041240.329462-1-minhnguyen.080505@gmail.com Signed-off-by: Paolo Abeni --- net/core/skbuff.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d247acd447e4..0d3cc115f2e7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6833,6 +6833,8 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_kfree_head(data); return -ENOMEM; } + if (skb_zcopy(skb)) + net_zcopy_get(skb_zcopy(skb)); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) @@ -6976,6 +6978,8 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data); return -ENOMEM; } + if (skb_zcopy(skb)) + net_zcopy_get(skb_zcopy(skb)); skb_release_data(skb, SKB_CONSUMED); skb->head = data; From cc993e0927ec8bd98ea33377ada03295fcda0f24 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 25 May 2026 12:51:15 -0400 Subject: [PATCH 84/94] net/handshake: Use spin_lock_bh for hn_lock nvmet_tcp_state_change(), a socket callback that runs in BH context, can reach handshake_req_cancel() via nvmet_tcp_schedule_release_queue() and tls_handshake_cancel(). handshake_req_cancel() acquires hn->hn_lock with plain spin_lock(). If a process-context thread on the same CPU holds hn->hn_lock when a softirq invokes the cancel path, the lock attempt deadlocks. This is the only caller that invokes tls_handshake_cancel() from BH context; every other consumer calls it from process context. Deferring the cancel to process context in the NVMe target is not straightforward: nvmet_tcp_schedule_release_queue() must call tls_handshake_cancel() atomically with its state transition to DISCONNECTING. If the cancel were deferred, the handshake completion callback could fire in the window before the cancel runs, observe the unexpected state, and return without dropping its kref on the queue. Reworking that interlock is considerably more invasive than hardening the handshake lock. Convert all hn->hn_lock acquisitions from spin_lock/spin_unlock to spin_lock_bh/spin_unlock_bh so the lock is never taken with softirqs enabled. Fixes: 675b453e0241 ("nvmet-tcp: enable TLS handshake upcall") Signed-off-by: Chuck Lever Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260525-handshake-file-pin-v3-1-66c616906ead@oracle.com Signed-off-by: Paolo Abeni --- net/handshake/netlink.c | 4 ++-- net/handshake/request.c | 14 +++++++------- net/handshake/tlshd.c | 2 ++ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index b989456fc4c5..97114ec8027a 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -202,10 +202,10 @@ static void __net_exit handshake_net_exit(struct net *net) * accepted and are in progress will be destroyed when * the socket is closed. */ - spin_lock(&hn->hn_lock); + spin_lock_bh(&hn->hn_lock); set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags); list_splice_init(&requests, &hn->hn_requests); - spin_unlock(&hn->hn_lock); + spin_unlock_bh(&hn->hn_lock); while (!list_empty(&requests)) { req = list_first_entry(&requests, struct handshake_req, hr_list); diff --git a/net/handshake/request.c b/net/handshake/request.c index 2829adbeb149..5d4a17f902d2 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -167,12 +167,12 @@ static bool remove_pending(struct handshake_net *hn, struct handshake_req *req) { bool ret = false; - spin_lock(&hn->hn_lock); + spin_lock_bh(&hn->hn_lock); if (!list_empty(&req->hr_list)) { __remove_pending_locked(hn, req); ret = true; } - spin_unlock(&hn->hn_lock); + spin_unlock_bh(&hn->hn_lock); return ret; } @@ -182,7 +182,7 @@ struct handshake_req *handshake_req_next(struct handshake_net *hn, int class) struct handshake_req *req, *pos; req = NULL; - spin_lock(&hn->hn_lock); + spin_lock_bh(&hn->hn_lock); list_for_each_entry(pos, &hn->hn_requests, hr_list) { if (pos->hr_proto->hp_handler_class != class) continue; @@ -190,7 +190,7 @@ struct handshake_req *handshake_req_next(struct handshake_net *hn, int class) req = pos; break; } - spin_unlock(&hn->hn_lock); + spin_unlock_bh(&hn->hn_lock); return req; } @@ -249,7 +249,7 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max) goto out_err; - spin_lock(&hn->hn_lock); + spin_lock_bh(&hn->hn_lock); ret = -EOPNOTSUPP; if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags)) goto out_unlock; @@ -258,7 +258,7 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, goto out_unlock; if (!__add_pending_locked(hn, req)) goto out_unlock; - spin_unlock(&hn->hn_lock); + spin_unlock_bh(&hn->hn_lock); ret = handshake_genl_notify(net, req->hr_proto, flags); if (ret) { @@ -274,7 +274,7 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, return 0; out_unlock: - spin_unlock(&hn->hn_lock); + spin_unlock_bh(&hn->hn_lock); out_err: /* Restore original destructor so socket teardown still runs on failure */ req->hr_sk->sk_destruct = req->hr_odestruct; diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index 8f9532a15f43..af294c6cc717 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -425,6 +425,8 @@ EXPORT_SYMBOL(tls_server_hello_psk); * Request cancellation races with request completion. To determine * who won, callers examine the return value from this function. * + * Context: May be called from process or softirq context. + * * Return values: * %true - Uncompleted handshake request was canceled * %false - Handshake request already completed or not found From 9015985b5eb1a90eb86caf5bce1dfcf1aa38f8ad Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 25 May 2026 12:51:16 -0400 Subject: [PATCH 85/94] nvme-tcp: store negative errno in queue->tls_err nvme_tcp_tls_done() assigns queue->tls_err in three branches. The ENOKEY lookup failure and the EOPNOTSUPP initializer both store negative errnos. The third branch, reached when the handshake layer reports a non-zero status, stores -status. The handshake layer delivers status to the consumer callback as a negative errno; the other in-tree consumers -- xs_tls_handshake_done() and the nvmet target callback -- treat their status argument that way. The extra negation in nvme_tcp_tls_done() flips the sign, leaving tls_err as a positive value (for instance, +EIO), which nvme_tcp_start_tls() then returns to its caller. Drop the extra negation so queue->tls_err uniformly carries a negative errno on failure. Fixes: be8e82caa685 ("nvme-tcp: enable TLS handshake upcall") Signed-off-by: Chuck Lever Reviewed-by: Hannes Reinecke Reviewed-by: Alistair Francis Link: https://patch.msgid.link/20260525-handshake-file-pin-v3-2-66c616906ead@oracle.com Signed-off-by: Paolo Abeni --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 15d36d6a728e..68a1d7640494 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1702,7 +1702,7 @@ static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) qid, pskid, status); if (status) { - queue->tls_err = -status; + queue->tls_err = status; goto out_complete; } From 6b22d433aa13f68e3cd9534ca9a5f4277bfa01c2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 25 May 2026 12:51:17 -0400 Subject: [PATCH 86/94] net/handshake: Pass negative errno through handshake_complete() handshake_complete() declares status as unsigned int and tls_handshake_done() negates that value (-status) before handing it to the TLS consumer. Consumers match on negative errno constants -- xs_tls_handshake_done() has switch (status) { case 0: case -EACCES: case -ETIMEDOUT: lower_transport->xprt_err = status; break; default: lower_transport->xprt_err = -EACCES; } so the API as designed expects callers to pass positive errno values that the tlshd shim then negates. Three internal callers in handshake_nl_accept_doit(), the net-exit drain, and a kunit test follow kernel convention and pass negative errnos -- -EIO, -ETIMEDOUT, -ETIMEDOUT. The implicit conversion to unsigned int turns -ETIMEDOUT into 0xFFFFFF92; the subsequent -status in tls_handshake_done() wraps back to 110, the consumer's switch falls through, and the xprt reports -EACCES on what should be -ETIMEDOUT or -EIO. Fix the API rather than the call sites. The natural kernel convention is negative errno in, negative errno out. Change handshake_complete() and hp_done to take int status, drop the negation in tls_handshake_done(), and negate once in handshake_nl_done_doit() where status arrives from the wire as an unsigned netlink attribute. The three internal callers were already correct under that convention and need no change. At the same wire boundary, declare MAX_ERRNO as the netlink policy upper bound for HANDSHAKE_A_DONE_STATUS. Attribute validation rejects out-of-range values before handshake_nl_done_doit() runs, and negating a bounded u32 there stays within int range -- closing the UBSAN-visible signed- integer overflow that an unconstrained u32 would invoke. Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests") Signed-off-by: Chuck Lever Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260525-handshake-file-pin-v3-3-66c616906ead@oracle.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/handshake.yaml | 8 ++++++++ net/handshake/genl.c | 3 ++- net/handshake/genl.h | 1 + net/handshake/handshake-test.c | 2 +- net/handshake/handshake.h | 4 ++-- net/handshake/netlink.c | 2 +- net/handshake/request.c | 2 +- net/handshake/tlshd.c | 4 ++-- 8 files changed, 18 insertions(+), 8 deletions(-) diff --git a/Documentation/netlink/specs/handshake.yaml b/Documentation/netlink/specs/handshake.yaml index 95c3fade7a8d..1024297b3851 100644 --- a/Documentation/netlink/specs/handshake.yaml +++ b/Documentation/netlink/specs/handshake.yaml @@ -12,6 +12,12 @@ protocol: genetlink doc: Netlink protocol to request a transport layer security handshake. definitions: + - + type: const + name: max-errno + value: 4095 + header: linux/err.h + scope: kernel - type: enum name: handler-class @@ -80,6 +86,8 @@ attribute-sets: - name: status type: u32 + checks: + max: max-errno - name: sockfd type: s32 diff --git a/net/handshake/genl.c b/net/handshake/genl.c index 870612609491..4b20cd9cdd0e 100644 --- a/net/handshake/genl.c +++ b/net/handshake/genl.c @@ -10,6 +10,7 @@ #include "genl.h" #include +#include /* HANDSHAKE_CMD_ACCEPT - do */ static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = { @@ -18,7 +19,7 @@ static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HAN /* HANDSHAKE_CMD_DONE - do */ static const struct nla_policy handshake_done_nl_policy[HANDSHAKE_A_DONE_REMOTE_AUTH + 1] = { - [HANDSHAKE_A_DONE_STATUS] = { .type = NLA_U32, }, + [HANDSHAKE_A_DONE_STATUS] = NLA_POLICY_MAX(NLA_U32, MAX_ERRNO), [HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_S32, }, [HANDSHAKE_A_DONE_REMOTE_AUTH] = { .type = NLA_U32, }, }; diff --git a/net/handshake/genl.h b/net/handshake/genl.h index 8d3e18672daf..46b65f131669 100644 --- a/net/handshake/genl.h +++ b/net/handshake/genl.h @@ -11,6 +11,7 @@ #include #include +#include int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info); int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info); diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c index 55442b2f518a..df3948e807a0 100644 --- a/net/handshake/handshake-test.c +++ b/net/handshake/handshake-test.c @@ -25,7 +25,7 @@ static int test_accept_func(struct handshake_req *req, struct genl_info *info, return 0; } -static void test_done_func(struct handshake_req *req, unsigned int status, +static void test_done_func(struct handshake_req *req, int status, struct genl_info *info) { } diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h index a48163765a7a..2289b0e274f4 100644 --- a/net/handshake/handshake.h +++ b/net/handshake/handshake.h @@ -57,7 +57,7 @@ struct handshake_proto { int (*hp_accept)(struct handshake_req *req, struct genl_info *info, int fd); void (*hp_done)(struct handshake_req *req, - unsigned int status, + int status, struct genl_info *info); void (*hp_destroy)(struct handshake_req *req); }; @@ -86,7 +86,7 @@ struct handshake_req *handshake_req_hash_lookup(struct sock *sk); struct handshake_req *handshake_req_next(struct handshake_net *hn, int class); int handshake_req_submit(struct socket *sock, struct handshake_req *req, gfp_t flags); -void handshake_complete(struct handshake_req *req, unsigned int status, +void handshake_complete(struct handshake_req *req, int status, struct genl_info *info); bool handshake_req_cancel(struct sock *sk); diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 97114ec8027a..039344979de9 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -160,7 +160,7 @@ int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info) status = -EIO; if (info->attrs[HANDSHAKE_A_DONE_STATUS]) - status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]); + status = -(int)nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]); handshake_complete(req, status, info); sockfd_put(sock); diff --git a/net/handshake/request.c b/net/handshake/request.c index 5d4a17f902d2..97f9f8239949 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -284,7 +284,7 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, } EXPORT_SYMBOL(handshake_req_submit); -void handshake_complete(struct handshake_req *req, unsigned int status, +void handshake_complete(struct handshake_req *req, int status, struct genl_info *info) { struct sock *sk = req->hr_sk; diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index af294c6cc717..7567150c2a4f 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -93,7 +93,7 @@ static void tls_handshake_remote_peerids(struct tls_handshake_req *treq, * */ static void tls_handshake_done(struct handshake_req *req, - unsigned int status, struct genl_info *info) + int status, struct genl_info *info) { struct tls_handshake_req *treq = handshake_req_private(req); @@ -104,7 +104,7 @@ static void tls_handshake_done(struct handshake_req *req, if (!status) set_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags); - treq->th_consumer_done(treq->th_consumer_data, -status, + treq->th_consumer_done(treq->th_consumer_data, status, treq->th_peerid[0]); } From 09dba37eee70d0596e26645015f1aa95a9848e9d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 25 May 2026 12:51:18 -0400 Subject: [PATCH 87/94] net/handshake: Take a long-lived file reference at submit handshake_nl_accept_doit() needs the file pointer backing req->hr_sk->sk_socket to survive the window between handshake_req_next() and the subsequent FD_PREPARE() and get_file(). The submit-side sock_hold() does not provide that. sk_refcnt keeps struct sock alive, but struct socket is owned by sock->file: when the consumer fputs the last file reference, sock_release() tears the socket down regardless of any sock_hold. Add an hr_file pointer to struct handshake_req and acquire an explicit reference on sock->file during handshake_req_submit(). handshake_complete() and handshake_req_cancel() release the reference on the completion-bit-winning path. The submit error path must also release the file reference, but after rhashtable insertion a concurrent handshake_req_cancel() can discover the request and race the error path. Gate the error-path cleanup -- sk_destruct restoration, fput, and request destruction -- with test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED), the same serialization handshake_complete() and handshake_req_cancel() already use. When cancel has already claimed ownership, the submit error path returns without touching the request; socket teardown handles final destruction. The accept-side dereferences are not yet retargeted; that change comes in the next patch. Signed-off-by: Chuck Lever Link: https://patch.msgid.link/20260525-handshake-file-pin-v3-4-66c616906ead@oracle.com Signed-off-by: Paolo Abeni --- net/handshake/handshake.h | 2 ++ net/handshake/netlink.c | 6 ------ net/handshake/request.c | 42 ++++++++++++++++++++++++++++++++------- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h index 2289b0e274f4..da61cadd1ad3 100644 --- a/net/handshake/handshake.h +++ b/net/handshake/handshake.h @@ -24,6 +24,7 @@ enum hn_flags_bits { HANDSHAKE_F_NET_DRAINING, }; +struct file; struct handshake_proto; /* One handshake request */ @@ -32,6 +33,7 @@ struct handshake_req { struct rhash_head hr_rhash; unsigned long hr_flags; const struct handshake_proto *hr_proto; + struct file *hr_file; struct sock *hr_sk; void (*hr_odestruct)(struct sock *sk); diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 039344979de9..1a5821eb7184 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -210,12 +210,6 @@ static void __net_exit handshake_net_exit(struct net *net) while (!list_empty(&requests)) { req = list_first_entry(&requests, struct handshake_req, hr_list); list_del(&req->hr_list); - - /* - * Requests on this list have not yet been - * accepted, so they do not have an fd to put. - */ - handshake_complete(req, -ETIMEDOUT, NULL); } } diff --git a/net/handshake/request.c b/net/handshake/request.c index 97f9f8239949..da064511ab86 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -215,9 +216,16 @@ EXPORT_SYMBOL_IF_KUNIT(handshake_req_next); * A zero return value from handshake_req_submit() means that * exactly one subsequent completion callback is guaranteed. * - * A negative return value from handshake_req_submit() means that - * no completion callback will be done and that @req has been - * destroyed. + * A negative return value from handshake_req_submit() guarantees that + * no completion callback will occur and that @req is no longer owned by + * the caller. If cancellation wins the completion race after the request + * has been published, final destruction is deferred until socket teardown. + * + * The caller must hold a reference on @sock->file for the duration + * of this call. Once the request is published to the accept side, a + * concurrent completion or cancellation may release the request's pin on + * @sock->file; the caller's reference is what keeps @sock->sk valid until + * handshake_req_submit() returns. */ int handshake_req_submit(struct socket *sock, struct handshake_req *req, gfp_t flags) @@ -236,6 +244,14 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, kfree(req); return -EINVAL; } + + /* + * Pin sock->file for the lifetime of the request so the + * accept side does not race a consumer that releases the + * socket while a handshake is pending. + */ + req->hr_file = get_file(sock->file); + req->hr_odestruct = req->hr_sk->sk_destruct; req->hr_sk->sk_destruct = handshake_sk_destruct; @@ -267,7 +283,11 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, goto out_err; } - /* Prevent socket release while a handshake request is pending */ + /* + * Pin struct sock so sk_destruct does not run until the + * handshake completion path releases it; struct socket is + * held separately via hr_file above. + */ sock_hold(req->hr_sk); trace_handshake_submit(net, req, req->hr_sk); @@ -276,10 +296,13 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, out_unlock: spin_unlock_bh(&hn->hn_lock); out_err: - /* Restore original destructor so socket teardown still runs on failure */ - req->hr_sk->sk_destruct = req->hr_odestruct; trace_handshake_submit_err(net, req, req->hr_sk, ret); - handshake_req_destroy(req); + if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { + /* Restore original destructor so socket teardown still runs. */ + req->hr_sk->sk_destruct = req->hr_odestruct; + fput(req->hr_file); + handshake_req_destroy(req); + } return ret; } EXPORT_SYMBOL(handshake_req_submit); @@ -291,11 +314,15 @@ void handshake_complete(struct handshake_req *req, int status, struct net *net = sock_net(sk); if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { + struct file *file = req->hr_file; + trace_handshake_complete(net, req, sk, status); req->hr_proto->hp_done(req, status, info); /* Handshake request is no longer pending */ sock_put(sk); + + fput(file); } } EXPORT_SYMBOL_IF_KUNIT(handshake_complete); @@ -344,6 +371,7 @@ bool handshake_req_cancel(struct sock *sk) /* Handshake request is no longer pending */ sock_put(sk); + fput(req->hr_file); return true; } EXPORT_SYMBOL(handshake_req_cancel); From f4251190e58b209999c1ba9e6d2976136a1be055 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 25 May 2026 12:51:19 -0400 Subject: [PATCH 88/94] net/handshake: hand off the pinned file reference to accept_doit handshake_req_next() removes the request from the per-net pending list and drops hn_lock before handshake_nl_accept_doit() reads req->hr_sk->sk_socket and dereferences sock->file (once in FD_PREPARE() and again in get_file()). In that window a consumer running tls_handshake_cancel() followed by sockfd_put() (svc_sock_free) or __fput_sync() (xs_reset_transport) releases sock->file. sock_release() then runs sock_orphan(), zeroing sk_socket, and frees the struct socket. The accept-side code either reads NULL through sk_socket or chases freed memory. The submit-side sock_hold() does not prevent this. sk_refcnt protects struct sock, but struct socket and sock->file are independently refcounted via the file descriptor the consumer owns. Pinning sk leaves sock and sock->file unprotected. Retarget the accept-side dereferences at req->hr_file, which was pinned at submit time, instead of req->hr_sk->sk_socket->file. Pinning on its own is not sufficient: a consumer that cancels between handshake_req_next() returning and accept_doit reaching FD_PREPARE() takes the !remove_pending() branch in handshake_req_cancel() and drops hr_file before the accept side takes its own reference. Hand off an additional file reference inside handshake_req_next(), under hn_lock, so the accept side operates on a reference that no concurrent handshake_req_cancel() can revoke. FD_PREPARE() consumes that handed-off reference, either by transferring it to the new fd in fd_publish() or by dropping it in the cleanup destructor on error; the explicit get_file() that previously balanced FD_PREPARE() is therefore redundant and goes away. Update handshake_req_cancel_test2 and _test3 to simulate the FD_PREPARE() consumption with an fput() so the kunit file-count assertions stay balanced. Reported-by: Chris Mason Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests") Signed-off-by: Chuck Lever Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260525-handshake-file-pin-v3-5-66c616906ead@oracle.com Signed-off-by: Paolo Abeni --- net/handshake/handshake-test.c | 8 ++++++++ net/handshake/netlink.c | 7 ++----- net/handshake/request.c | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c index df3948e807a0..9cc7a95f4120 100644 --- a/net/handshake/handshake-test.c +++ b/net/handshake/handshake-test.c @@ -375,6 +375,10 @@ static void handshake_req_cancel_test2(struct kunit *test) /* Pretend to accept this request */ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD); KUNIT_ASSERT_PTR_EQ(test, req, next); + /* Simulate FD_PREPARE() consuming the file reference handed + * off by handshake_req_next(); see handshake_nl_accept_doit(). + */ + fput(filp); /* Act */ result = handshake_req_cancel(sock->sk); @@ -417,6 +421,10 @@ static void handshake_req_cancel_test3(struct kunit *test) /* Pretend to accept this request */ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD); KUNIT_ASSERT_PTR_EQ(test, req, next); + /* Simulate FD_PREPARE() consuming the file reference handed + * off by handshake_req_next(); see handshake_nl_accept_doit(). + */ + fput(filp); /* Pretend to complete this request */ handshake_complete(next, -ETIMEDOUT, NULL); diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 1a5821eb7184..21d6cbd52fcd 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -92,7 +92,6 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) struct net *net = sock_net(skb->sk); struct handshake_net *hn = handshake_pernet(net); struct handshake_req *req = NULL; - struct socket *sock; int class, err; err = -EOPNOTSUPP; @@ -107,15 +106,13 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) err = -EAGAIN; req = handshake_req_next(hn, class); if (req) { - sock = req->hr_sk->sk_socket; - - FD_PREPARE(fdf, O_CLOEXEC, sock->file); + FD_PREPARE(fdf, O_CLOEXEC, req->hr_file); if (fdf.err) { + fput(req->hr_file); /* drop ref from handshake_req_next() */ err = fdf.err; goto out_complete; } - get_file(sock->file); /* FD_PREPARE() consumes a reference. */ err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf)); if (err) goto out_complete; /* Automatic cleanup handles fput */ diff --git a/net/handshake/request.c b/net/handshake/request.c index da064511ab86..e2d7ee7ce6e0 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -178,6 +178,17 @@ static bool remove_pending(struct handshake_net *hn, struct handshake_req *req) return ret; } +/** + * handshake_req_next - Return the next queued handshake request + * @hn: per-net handshake state + * @class: handler class to match + * + * On a non-NULL return, the caller owns an extra reference + * on @req->hr_file. FD_PREPARE() consumes it on success; on + * the FD_PREPARE() failure path the caller must fput() it. + * + * Return: pointer to a removed handshake_req, or NULL. + */ struct handshake_req *handshake_req_next(struct handshake_net *hn, int class) { struct handshake_req *req, *pos; @@ -188,6 +199,13 @@ struct handshake_req *handshake_req_next(struct handshake_net *hn, int class) if (pos->hr_proto->hp_handler_class != class) continue; __remove_pending_locked(hn, pos); + /* Hand off a file reference to the accept side under + * hn_lock. A concurrent handshake_req_cancel() can drop + * hr_file before accept reaches FD_PREPARE(); this extra + * reference keeps the file alive until FD_PREPARE() takes + * ownership. + */ + get_file(pos->hr_file); req = pos; break; } From 5da98f55b13173c08f003011b76531b25c821c07 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 25 May 2026 12:51:20 -0400 Subject: [PATCH 89/94] net/handshake: Close the submit-side sock_hold race handshake_req_submit() publishes the request via handshake_req_hash_add() and __add_pending_locked(), drops hn_lock, and calls handshake_genl_notify() (which can sleep) before taking sock_hold() on req->hr_sk. A fast tlshd ACCEPT followed by DONE can drive handshake_complete()'s sock_put() into the window between the spin_unlock and the late sock_hold(); on a system where the consumer's fd held the only sk reference, the late sock_hold() then operates on an sk whose refcount has reached zero. The preceding two patches install an explicit file reference on struct handshake_req. That file pins sock->file, which pins the embedded struct socket, which defers inet_release()'s sock_put(). As long as hr_file is held, sk cannot reach refcount zero from the consumer side, and the submit-side sock_hold() with its matching sock_put() calls in handshake_complete() and handshake_req_cancel() is now redundant. Drop all three. The file reference already keeps each request's socket alive, and the lifetime story is contained in a single get_file()/fput() pair. Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests") Signed-off-by: Chuck Lever Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260525-handshake-file-pin-v3-6-66c616906ead@oracle.com Signed-off-by: Paolo Abeni --- net/handshake/request.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/net/handshake/request.c b/net/handshake/request.c index e2d7ee7ce6e0..bd3d9467ab91 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -301,13 +301,6 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, goto out_err; } - /* - * Pin struct sock so sk_destruct does not run until the - * handshake completion path releases it; struct socket is - * held separately via hr_file above. - */ - sock_hold(req->hr_sk); - trace_handshake_submit(net, req, req->hr_sk); return 0; @@ -337,9 +330,6 @@ void handshake_complete(struct handshake_req *req, int status, trace_handshake_complete(net, req, sk, status); req->hr_proto->hp_done(req, status, info); - /* Handshake request is no longer pending */ - sock_put(sk); - fput(file); } } @@ -387,8 +377,6 @@ bool handshake_req_cancel(struct sock *sk) out_true: trace_handshake_cancel(net, req, sk); - /* Handshake request is no longer pending */ - sock_put(sk); fput(req->hr_file); return true; } From 204a5efde5ed52932840ee1d15d3b581cfda48e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 25 May 2026 12:51:21 -0400 Subject: [PATCH 90/94] net/handshake: Verify file-reference balance in submit paths The new file-reference contract on struct handshake_req is silently breakable: a missing get_file() at submit or a missing fput() on an error path leaves the file leaked but does not crash the test, so the existing absence-of-crash checks pass either way. Snapshot file_count(filp) before each handshake_req_submit() in the submit-success, EAGAIN, EBUSY, and cancel tests, and assert the expected balance after submit and again after cancel. The already-completed cancel test also asserts the post-complete balance, which pins down that handshake_complete() drops the reference and that the subsequent cancel does not double-fput. The destroy test gets the same treatment before __fput_sync(), which double-checks that cancel's fput() ran and the only remaining reference is the one sock_alloc_file() established. Signed-off-by: Chuck Lever Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260525-handshake-file-pin-v3-7-66c616906ead@oracle.com Signed-off-by: Paolo Abeni --- net/handshake/handshake-test.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c index 9cc7a95f4120..3dd507470d5f 100644 --- a/net/handshake/handshake-test.c +++ b/net/handshake/handshake-test.c @@ -208,6 +208,7 @@ static void handshake_req_submit_test3(struct kunit *test) static void handshake_req_submit_test4(struct kunit *test) { struct handshake_req *req, *result; + unsigned long fcount_before; struct socket *sock; struct file *filp; int err; @@ -224,8 +225,10 @@ static void handshake_req_submit_test4(struct kunit *test) KUNIT_ASSERT_NOT_NULL(test, sock->sk); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); /* Act */ result = handshake_req_hash_lookup(sock->sk); @@ -235,11 +238,13 @@ static void handshake_req_submit_test4(struct kunit *test) KUNIT_EXPECT_PTR_EQ(test, req, result); handshake_req_cancel(sock->sk); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } static void handshake_req_submit_test5(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req; struct handshake_net *hn; struct socket *sock; @@ -265,12 +270,14 @@ static void handshake_req_submit_test5(struct kunit *test) saved = hn->hn_pending; hn->hn_pending = hn->hn_pending_max + 1; + fcount_before = file_count(filp); /* Act */ err = handshake_req_submit(sock, req, GFP_KERNEL); /* Assert */ KUNIT_EXPECT_EQ(test, err, -EAGAIN); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); hn->hn_pending = saved; @@ -279,6 +286,7 @@ static void handshake_req_submit_test5(struct kunit *test) static void handshake_req_submit_test6(struct kunit *test) { struct handshake_req *req1, *req2; + unsigned long fcount_before; struct socket *sock; struct file *filp; int err; @@ -296,21 +304,26 @@ static void handshake_req_submit_test6(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); KUNIT_ASSERT_NOT_NULL(test, sock->sk); sock->file = filp; + fcount_before = file_count(filp); /* Act */ err = handshake_req_submit(sock, req1, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); err = handshake_req_submit(sock, req2, GFP_KERNEL); /* Assert */ KUNIT_EXPECT_EQ(test, err, -EBUSY); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); handshake_req_cancel(sock->sk); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } static void handshake_req_cancel_test1(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req; struct socket *sock; struct file *filp; @@ -329,8 +342,10 @@ static void handshake_req_cancel_test1(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); /* NB: handshake_req hasn't been accepted */ @@ -339,12 +354,14 @@ static void handshake_req_cancel_test1(struct kunit *test) /* Assert */ KUNIT_EXPECT_TRUE(test, result); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } static void handshake_req_cancel_test2(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req, *next; struct handshake_net *hn; struct socket *sock; @@ -365,8 +382,10 @@ static void handshake_req_cancel_test2(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); net = sock_net(sock->sk); hn = handshake_pernet(net); @@ -385,12 +404,14 @@ static void handshake_req_cancel_test2(struct kunit *test) /* Assert */ KUNIT_EXPECT_TRUE(test, result); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } static void handshake_req_cancel_test3(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req, *next; struct handshake_net *hn; struct socket *sock; @@ -411,8 +432,10 @@ static void handshake_req_cancel_test3(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before + 1); net = sock_net(sock->sk); hn = handshake_pernet(net); @@ -428,12 +451,14 @@ static void handshake_req_cancel_test3(struct kunit *test) /* Pretend to complete this request */ handshake_complete(next, -ETIMEDOUT, NULL); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); /* Act */ result = handshake_req_cancel(sock->sk); /* Assert */ KUNIT_EXPECT_FALSE(test, result); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); fput(filp); } @@ -454,6 +479,7 @@ static struct handshake_proto handshake_req_alloc_proto_destroy = { static void handshake_req_destroy_test1(struct kunit *test) { + unsigned long fcount_before; struct handshake_req *req; struct socket *sock; struct file *filp; @@ -473,10 +499,12 @@ static void handshake_req_destroy_test1(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); sock->file = filp; + fcount_before = file_count(filp); err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); handshake_req_cancel(sock->sk); + KUNIT_EXPECT_EQ(test, file_count(filp), fcount_before); /* Act */ /* Ensure the close/release/put process has run to From ea5fe6a73ca57e5150b8a38b341aef2636eb72f0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 25 May 2026 12:51:22 -0400 Subject: [PATCH 91/94] net/handshake: Drain pending requests at net namespace exit The arguments to list_splice_init() in handshake_net_exit() are reversed. The call moves the local empty "requests" list onto hn->hn_requests, leaving the local list empty, so the subsequent drain loop runs zero iterations. Pending handshake requests that had not yet been accepted are not torn down when the net namespace is destroyed; each one keeps a reference on a socket file and on the handshake_req allocation. Pass the source and destination in the documented order (list_splice_init(list, head) moves list onto head) so the pending list is transferred to the local scratch list and drained through handshake_complete(). Fixing the splice direction exposes a list-corruption race. After the splice each req->hr_list still has non-empty link pointers, threading the stack-local scratch list rather than hn_requests. A concurrent handshake_req_cancel() -- for example, from sunrpc's TLS timeout on a kernel socket whose netns reference was not taken -- finds the request through the rhashtable, calls remove_pending(), and sees !list_empty(&req->hr_list). __remove_pending_locked() then list_del_init()s an entry off the scratch list while the drain iterates, corrupting it. The same call arriving after the drain loop has run list_del() on an entry hits LIST_POISON instead. Have remove_pending() check HANDSHAKE_F_NET_DRAINING under hn_lock and report not-found when drain is in progress. The drain has already taken ownership; handshake_complete()'s existing test_and_set on HANDSHAKE_F_REQ_COMPLETED still arbitrates between drain and cancel for who calls the consumer's hp_done. Use list_del_init() rather than list_del() in the drain so req->hr_list does not carry LIST_POISON after drain releases the entry. The DRAINING guard in remove_pending() makes cancel return false, but cancel still falls through to test_and_set_bit on HANDSHAKE_F_REQ_COMPLETED and drops the request's hr_file reference. Without another pin, if that is the last reference, sk_destruct frees the request while it is still linked on the drain loop's local list. Pin each request's hr_file under hn_lock before releasing the list, and drop that drain pin after the loop finishes with the request. Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests") Signed-off-by: Chuck Lever Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260525-handshake-file-pin-v3-8-66c616906ead@oracle.com Signed-off-by: Paolo Abeni --- net/handshake/netlink.c | 10 ++++++++-- net/handshake/request.c | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 21d6cbd52fcd..3fd4fef9bab1 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -201,13 +201,19 @@ static void __net_exit handshake_net_exit(struct net *net) */ spin_lock_bh(&hn->hn_lock); set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags); - list_splice_init(&requests, &hn->hn_requests); + list_splice_init(&hn->hn_requests, &requests); + list_for_each_entry(req, &requests, hr_list) + get_file(req->hr_file); spin_unlock_bh(&hn->hn_lock); while (!list_empty(&requests)) { + struct file *file; + req = list_first_entry(&requests, struct handshake_req, hr_list); - list_del(&req->hr_list); + file = req->hr_file; + list_del_init(&req->hr_list); handshake_complete(req, -ETIMEDOUT, NULL); + fput(file); } } diff --git a/net/handshake/request.c b/net/handshake/request.c index bd3d9467ab91..cd30d54d0501 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -163,13 +163,16 @@ static void __remove_pending_locked(struct handshake_net *hn, * otherwise %false. * * If @req was on a pending list, it has not yet been accepted. + * Returns %false when the net namespace is draining; the drain + * loop has taken ownership of the pending list. */ static bool remove_pending(struct handshake_net *hn, struct handshake_req *req) { bool ret = false; spin_lock_bh(&hn->hn_lock); - if (!list_empty(&req->hr_list)) { + if (!test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags) && + !list_empty(&req->hr_list)) { __remove_pending_locked(hn, req); ret = true; } From 20040b2a3cb992f84d3db4c086b909eb9b906b31 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 26 May 2026 09:45:23 +0200 Subject: [PATCH 92/94] dpll: export __dpll_device_change_ntf() for use under dpll_lock Export __dpll_device_change_ntf() so that drivers can send device change notifications from within device callbacks, which are already called under dpll_lock. Using dpll_device_change_ntf() in that context would deadlock. Add lockdep_assert_held() to catch misuse without the lock held. Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20260526074525.1451008-2-ivecera@redhat.com Signed-off-by: Paolo Abeni --- drivers/dpll/dpll_netlink.c | 13 +++++++++++-- include/linux/dpll.h | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 0ff1658c2dc1..75e3ae0c16d0 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -829,12 +829,21 @@ int dpll_device_delete_ntf(struct dpll_device *dpll) return dpll_device_event_send(DPLL_CMD_DEVICE_DELETE_NTF, dpll); } -static int -__dpll_device_change_ntf(struct dpll_device *dpll) +/** + * __dpll_device_change_ntf - notify that the dpll device has been changed + * @dpll: registered dpll pointer + * + * Context: caller must hold dpll_lock. Suitable for use inside device + * callbacks which are already invoked under dpll_lock. + * Return: 0 if succeeds, error code otherwise. + */ +int __dpll_device_change_ntf(struct dpll_device *dpll) { + lockdep_assert_held(&dpll_lock); dpll_device_notify(dpll, DPLL_DEVICE_CHANGED); return dpll_device_event_send(DPLL_CMD_DEVICE_CHANGE_NTF, dpll); } +EXPORT_SYMBOL_GPL(__dpll_device_change_ntf); /** * dpll_device_change_ntf - notify that the dpll device has been changed diff --git a/include/linux/dpll.h b/include/linux/dpll.h index f8037f1ab20b..2dbe8567eafc 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -284,6 +284,7 @@ void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin, int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin, struct dpll_pin *ref_sync_pin); +int __dpll_device_change_ntf(struct dpll_device *dpll); int dpll_device_change_ntf(struct dpll_device *dpll); int __dpll_pin_change_ntf(struct dpll_pin *pin); From d733f519f6443540f8359461a34e3b0042099bbe Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 26 May 2026 09:45:24 +0200 Subject: [PATCH 93/94] dpll: zl3073x: use __dpll_device_change_ntf() and remove change_work The change_work was introduced to send device change notifications from DPLL device callbacks without deadlocking on dpll_lock, since the callbacks are already invoked under that lock. Now that __dpll_device_change_ntf() is exported for callers that already hold dpll_lock, use it directly and remove the change_work infrastructure entirely. This eliminates a race condition where change_work could be re-scheduled after cancel_work_sync() during device teardown, potentially causing the handler to dereference a freed or NULL dpll_dev pointer. Fixes: 9363b4837659 ("dpll: zl3073x: Allow to configure phase offset averaging factor") Signed-off-by: Ivan Vecera Link: https://patch.msgid.link/20260526074525.1451008-3-ivecera@redhat.com Signed-off-by: Paolo Abeni --- drivers/dpll/zl3073x/dpll.c | 26 +++++++++----------------- drivers/dpll/zl3073x/dpll.h | 2 -- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/drivers/dpll/zl3073x/dpll.c b/drivers/dpll/zl3073x/dpll.c index 64b4e9e3e8fe..0770bd895de9 100644 --- a/drivers/dpll/zl3073x/dpll.c +++ b/drivers/dpll/zl3073x/dpll.c @@ -1079,15 +1079,6 @@ zl3073x_dpll_phase_offset_avg_factor_get(const struct dpll_device *dpll, return 0; } -static void -zl3073x_dpll_change_work(struct work_struct *work) -{ - struct zl3073x_dpll *zldpll; - - zldpll = container_of(work, struct zl3073x_dpll, change_work); - dpll_device_change_ntf(zldpll->dpll_dev); -} - static int zl3073x_dpll_phase_offset_avg_factor_set(const struct dpll_device *dpll, void *dpll_priv, u32 factor, @@ -1113,8 +1104,10 @@ zl3073x_dpll_phase_offset_avg_factor_set(const struct dpll_device *dpll, * we have to send a notification for other DPLL devices. */ list_for_each_entry(item, &zldpll->dev->dplls, list) { - if (item != zldpll) - schedule_work(&item->change_work); + struct dpll_device *dpll_dev = READ_ONCE(item->dpll_dev); + + if (item != zldpll && dpll_dev) + __dpll_device_change_ntf(dpll_dev); } return 0; @@ -1627,13 +1620,13 @@ zl3073x_dpll_device_register(struct zl3073x_dpll *zldpll) static void zl3073x_dpll_device_unregister(struct zl3073x_dpll *zldpll) { - WARN(!zldpll->dpll_dev, "DPLL device is not registered\n"); + struct dpll_device *dpll_dev = READ_ONCE(zldpll->dpll_dev); - cancel_work_sync(&zldpll->change_work); + WARN(!dpll_dev, "DPLL device is not registered\n"); - dpll_device_unregister(zldpll->dpll_dev, &zldpll->ops, zldpll); - dpll_device_put(zldpll->dpll_dev, &zldpll->tracker); - zldpll->dpll_dev = NULL; + WRITE_ONCE(zldpll->dpll_dev, NULL); + dpll_device_unregister(dpll_dev, &zldpll->ops, zldpll); + dpll_device_put(dpll_dev, &zldpll->tracker); } /** @@ -1926,7 +1919,6 @@ zl3073x_dpll_alloc(struct zl3073x_dev *zldev, u8 ch) zldpll->dev = zldev; zldpll->id = ch; INIT_LIST_HEAD(&zldpll->pins); - INIT_WORK(&zldpll->change_work, zl3073x_dpll_change_work); return zldpll; } diff --git a/drivers/dpll/zl3073x/dpll.h b/drivers/dpll/zl3073x/dpll.h index 434c32a7db12..c8bc8437a709 100644 --- a/drivers/dpll/zl3073x/dpll.h +++ b/drivers/dpll/zl3073x/dpll.h @@ -21,7 +21,6 @@ * @tracker: tracking object for the acquired reference * @lock_status: last saved DPLL lock status * @pins: list of pins - * @change_work: device change notification work */ struct zl3073x_dpll { struct list_head list; @@ -35,7 +34,6 @@ struct zl3073x_dpll { dpll_tracker tracker; enum dpll_lock_status lock_status; struct list_head pins; - struct work_struct change_work; }; struct zl3073x_dpll *zl3073x_dpll_alloc(struct zl3073x_dev *zldev, u8 ch); From c1224569cef038b040db0459510cd7948ecd467b Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 26 May 2026 09:45:25 +0200 Subject: [PATCH 94/94] dpll: zl3073x: make frequency monitor a per-device attribute The frequency monitoring feature uses shared hardware registers that measure input reference frequencies independently of individual DPLL channels. However, the freq_monitor flag was incorrectly placed in the per-DPLL structure, causing each channel to track its own enable/disable state independently. Since the DPLL core calls measured_freq_get() only for the first pin registration, the measured_freq_check() in the periodic worker was gated by the per-DPLL freq_monitor flag of whichever channel happens to be checked. If the first DPLL channel had frequency monitoring disabled while another had it enabled, measurements were never reported. Move freq_monitor from struct zl3073x_dpll to struct zl3073x_dev so all DPLL channels share a single flag, matching the hardware behavior. Update freq_monitor_set() to notify other DPLL devices about the change (like phase_offset_avg_factor_set() already does) and remove the mode-dependent guard in zl3073x_dpll_changes_check() since all input pin monitoring (pin state, phase offset, FFO, and measured frequency) works correctly in all DPLL modes. Fixes: bfc923b642874 ("dpll: zl3073x: implement frequency monitoring") Signed-off-by: Ivan Vecera Link: https://patch.msgid.link/20260526074525.1451008-4-ivecera@redhat.com Signed-off-by: Paolo Abeni --- drivers/dpll/zl3073x/core.c | 19 ++++++++----------- drivers/dpll/zl3073x/core.h | 4 +++- drivers/dpll/zl3073x/dpll.c | 29 ++++++++++++++--------------- drivers/dpll/zl3073x/dpll.h | 2 -- 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/drivers/dpll/zl3073x/core.c b/drivers/dpll/zl3073x/core.c index 5f1e70f3e40a..0a133b0f2d97 100644 --- a/drivers/dpll/zl3073x/core.c +++ b/drivers/dpll/zl3073x/core.c @@ -762,18 +762,15 @@ zl3073x_dev_periodic_work(struct kthread_work *work) dev_warn(zldev->dev, "Failed to update phase offsets: %pe\n", ERR_PTR(rc)); - /* Update measured input reference frequencies if any DPLL has - * frequency monitoring enabled. + /* Update measured input reference frequencies if frequency + * monitoring is enabled. */ - list_for_each_entry(zldpll, &zldev->dplls, list) { - if (zldpll->freq_monitor) { - rc = zl3073x_ref_freq_meas_update(zldev); - if (rc) - dev_warn(zldev->dev, - "Failed to update measured frequencies: %pe\n", - ERR_PTR(rc)); - break; - } + if (zldev->freq_monitor) { + rc = zl3073x_ref_freq_meas_update(zldev); + if (rc) + dev_warn(zldev->dev, + "Failed to update measured frequencies: %pe\n", + ERR_PTR(rc)); } /* Update references' fractional frequency offsets */ diff --git a/drivers/dpll/zl3073x/core.h b/drivers/dpll/zl3073x/core.h index 99440620407d..addba378b0df 100644 --- a/drivers/dpll/zl3073x/core.h +++ b/drivers/dpll/zl3073x/core.h @@ -57,6 +57,7 @@ struct zl3073x_chip_info { * @work: periodic work * @clock_id: clock id of the device * @phase_avg_factor: phase offset measurement averaging factor + * @freq_monitor: is frequency monitor enabled */ struct zl3073x_dev { struct device *dev; @@ -77,9 +78,10 @@ struct zl3073x_dev { struct kthread_worker *kworker; struct kthread_delayed_work work; - /* Devlink parameters */ + /* Per-chip parameters */ u64 clock_id; u8 phase_avg_factor; + bool freq_monitor; }; extern const struct regmap_config zl3073x_regmap_config; diff --git a/drivers/dpll/zl3073x/dpll.c b/drivers/dpll/zl3073x/dpll.c index 0770bd895de9..0bfcbae2109f 100644 --- a/drivers/dpll/zl3073x/dpll.c +++ b/drivers/dpll/zl3073x/dpll.c @@ -1212,7 +1212,7 @@ zl3073x_dpll_freq_monitor_get(const struct dpll_device *dpll, { struct zl3073x_dpll *zldpll = dpll_priv; - if (zldpll->freq_monitor) + if (zldpll->dev->freq_monitor) *state = DPLL_FEATURE_STATE_ENABLE; else *state = DPLL_FEATURE_STATE_DISABLE; @@ -1226,9 +1226,19 @@ zl3073x_dpll_freq_monitor_set(const struct dpll_device *dpll, enum dpll_feature_state state, struct netlink_ext_ack *extack) { - struct zl3073x_dpll *zldpll = dpll_priv; + struct zl3073x_dpll *item, *zldpll = dpll_priv; - zldpll->freq_monitor = (state == DPLL_FEATURE_STATE_ENABLE); + zldpll->dev->freq_monitor = (state == DPLL_FEATURE_STATE_ENABLE); + + /* The frequency monitoring is common for all DPLL channels so after + * change we have to send a notification for other DPLL devices. + */ + list_for_each_entry(item, &zldpll->dev->dplls, list) { + struct dpll_device *dpll_dev = READ_ONCE(item->dpll_dev); + + if (item != zldpll && dpll_dev) + __dpll_device_change_ntf(dpll_dev); + } return 0; } @@ -1745,7 +1755,7 @@ zl3073x_dpll_pin_measured_freq_check(struct zl3073x_dpll_pin *pin) u8 ref_id; u32 freq; - if (!zldpll->freq_monitor) + if (!zldpll->dev->freq_monitor) return false; ref_id = zl3073x_input_pin_ref_get(pin->id); @@ -1778,10 +1788,8 @@ zl3073x_dpll_changes_check(struct zl3073x_dpll *zldpll) struct zl3073x_dev *zldev = zldpll->dev; enum dpll_lock_status lock_status; struct device *dev = zldev->dev; - const struct zl3073x_chan *chan; struct zl3073x_dpll_pin *pin; int rc; - u8 mode; zldpll->check_count++; @@ -1800,15 +1808,6 @@ zl3073x_dpll_changes_check(struct zl3073x_dpll *zldpll) dpll_device_change_ntf(zldpll->dpll_dev); } - /* Input pin monitoring does make sense only in automatic - * or forced reference modes. - */ - chan = zl3073x_chan_state_get(zldev, zldpll->id); - mode = zl3073x_chan_mode_get(chan); - if (mode != ZL_DPLL_MODE_REFSEL_MODE_AUTO && - mode != ZL_DPLL_MODE_REFSEL_MODE_REFLOCK) - return; - /* Update phase offset latch registers for this DPLL if the phase * offset monitor feature is enabled. */ diff --git a/drivers/dpll/zl3073x/dpll.h b/drivers/dpll/zl3073x/dpll.h index c8bc8437a709..21adcc18e45e 100644 --- a/drivers/dpll/zl3073x/dpll.h +++ b/drivers/dpll/zl3073x/dpll.h @@ -15,7 +15,6 @@ * @id: DPLL index * @check_count: periodic check counter * @phase_monitor: is phase offset monitor enabled - * @freq_monitor: is frequency monitor enabled * @ops: DPLL device operations for this instance * @dpll_dev: pointer to registered DPLL device * @tracker: tracking object for the acquired reference @@ -28,7 +27,6 @@ struct zl3073x_dpll { u8 id; u8 check_count; bool phase_monitor; - bool freq_monitor; struct dpll_device_ops ops; struct dpll_device *dpll_dev; dpll_tracker tracker;