From 1548549e17e374a126e9a4e9edab8bb041fbd67e Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Wed, 13 Aug 2025 13:16:33 +0200 Subject: [PATCH 01/81] MAINTAINERS: update s390/net Remove Thorsten Winkler as maintainer and add Aswin Karuvally as reviewer. Thank you Thorsten for your support, welcome Aswin! Signed-off-by: Alexandra Winter Acked-by: Thorsten Winkler Acked-by: Aswin Karuvally Link: https://patch.msgid.link/20250813111633.241111-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf..2720544cd91f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22174,7 +22174,7 @@ F: arch/s390/mm S390 NETWORK DRIVERS M: Alexandra Winter -M: Thorsten Winkler +R: Aswin Karuvally L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported From fd980bf6e9cdae885105685259421164f843ca55 Mon Sep 17 00:00:00 2001 From: Suraj Gupta Date: Wed, 13 Aug 2025 19:25:59 +0530 Subject: [PATCH 02/81] net: xilinx: axienet: Fix RX skb ring management in DMAengine mode Submit multiple descriptors in axienet_rx_cb() to fill Rx skb ring. This ensures the ring "catches up" on previously missed allocations. Increment Rx skb ring head pointer after BD is successfully allocated. Previously, head pointer was incremented before verifying if descriptor is successfully allocated and has valid entries, which could lead to ring state inconsistency if descriptor setup failed. These changes improve reliability by maintaining adequate descriptor availability and ensuring proper ring buffer state management. Fixes: 6a91b846af85 ("net: axienet: Introduce dmaengine support") Signed-off-by: Suraj Gupta Link: https://patch.msgid.link/20250813135559.1555652-1-suraj.gupta2@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 6011d7eae0c7..0d8a05fe541a 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1160,6 +1160,7 @@ static void axienet_dma_rx_cb(void *data, const struct dmaengine_result *result) struct axienet_local *lp = data; struct sk_buff *skb; u32 *app_metadata; + int i; skbuf_dma = axienet_get_rx_desc(lp, lp->rx_ring_tail++); skb = skbuf_dma->skb; @@ -1178,7 +1179,10 @@ static void axienet_dma_rx_cb(void *data, const struct dmaengine_result *result) u64_stats_add(&lp->rx_packets, 1); u64_stats_add(&lp->rx_bytes, rx_len); u64_stats_update_end(&lp->rx_stat_sync); - axienet_rx_submit_desc(lp->ndev); + + for (i = 0; i < CIRC_SPACE(lp->rx_ring_head, lp->rx_ring_tail, + RX_BUF_NUM_DEFAULT); i++) + axienet_rx_submit_desc(lp->ndev); dma_async_issue_pending(lp->rx_chan); } @@ -1457,7 +1461,6 @@ static void axienet_rx_submit_desc(struct net_device *ndev) if (!skbuf_dma) return; - lp->rx_ring_head++; skb = netdev_alloc_skb(ndev, lp->max_frm_size); if (!skb) return; @@ -1482,6 +1485,7 @@ static void axienet_rx_submit_desc(struct net_device *ndev) skbuf_dma->desc = dma_rx_desc; dma_rx_desc->callback_param = lp; dma_rx_desc->callback_result = axienet_dma_rx_cb; + lp->rx_ring_head++; dmaengine_submit(dma_rx_desc); return; From d1547bf460baec718b3398365f8de33d25c5f36f Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Wed, 13 Aug 2025 10:10:54 +0800 Subject: [PATCH 03/81] net: bridge: fix soft lockup in br_multicast_query_expired() When set multicast_query_interval to a large value, the local variable 'time' in br_multicast_send_query() may overflow. If the time is smaller than jiffies, the timer will expire immediately, and then call mod_timer() again, which creates a loop and may trigger the following soft lockup issue. watchdog: BUG: soft lockup - CPU#1 stuck for 221s! [rb_consumer:66] CPU: 1 UID: 0 PID: 66 Comm: rb_consumer Not tainted 6.16.0+ #259 PREEMPT(none) Call Trace: __netdev_alloc_skb+0x2e/0x3a0 br_ip6_multicast_alloc_query+0x212/0x1b70 __br_multicast_send_query+0x376/0xac0 br_multicast_send_query+0x299/0x510 br_multicast_query_expired.constprop.0+0x16d/0x1b0 call_timer_fn+0x3b/0x2a0 __run_timers+0x619/0x950 run_timer_softirq+0x11c/0x220 handle_softirqs+0x18e/0x560 __irq_exit_rcu+0x158/0x1a0 sysvec_apic_timer_interrupt+0x76/0x90 This issue can be reproduced with: ip link add br0 type bridge echo 1 > /sys/class/net/br0/bridge/multicast_querier echo 0xffffffffffffffff > /sys/class/net/br0/bridge/multicast_query_interval ip link set dev br0 up The multicast_startup_query_interval can also cause this issue. Similar to the commit 99b40610956a ("net: bridge: mcast: add and enforce query interval minimum"), add check for the query interval maximum to fix this issue. Link: https://lore.kernel.org/netdev/20250806094941.1285944-1-wangliang74@huawei.com/ Link: https://lore.kernel.org/netdev/20250812091818.542238-1-wangliang74@huawei.com/ Fixes: d902eee43f19 ("bridge: Add multicast count/interval sysfs entries") Suggested-by: Nikolay Aleksandrov Signed-off-by: Wang Liang Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250813021054.1643649-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 16 ++++++++++++++++ net/bridge/br_private.h | 2 ++ 2 files changed, 18 insertions(+) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 1377f31b719c..8ce145938b02 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4818,6 +4818,14 @@ void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx, intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MIN; } + if (intvl_jiffies > BR_MULTICAST_QUERY_INTVL_MAX) { + br_info(brmctx->br, + "trying to set multicast query interval above maximum, setting to %lu (%ums)\n", + jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MAX), + jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MAX)); + intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MAX; + } + brmctx->multicast_query_interval = intvl_jiffies; } @@ -4834,6 +4842,14 @@ void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx, intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN; } + if (intvl_jiffies > BR_MULTICAST_STARTUP_QUERY_INTVL_MAX) { + br_info(brmctx->br, + "trying to set multicast startup query interval above maximum, setting to %lu (%ums)\n", + jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX), + jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX)); + intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MAX; + } + brmctx->multicast_startup_query_interval = intvl_jiffies; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b159aae594c0..8de0904b9627 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -31,6 +31,8 @@ #define BR_MULTICAST_DEFAULT_HASH_MAX 4096 #define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000) #define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN +#define BR_MULTICAST_QUERY_INTVL_MAX msecs_to_jiffies(86400000) /* 24 hours */ +#define BR_MULTICAST_STARTUP_QUERY_INTVL_MAX BR_MULTICAST_QUERY_INTVL_MAX #define BR_HWDOM_MAX BITS_PER_LONG From 52bf272636bda69587952b35ae97690b8dc89941 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 12 Aug 2025 23:57:57 +0000 Subject: [PATCH 04/81] net/sched: Fix backlog accounting in qdisc_dequeue_internal This issue applies for the following qdiscs: hhf, fq, fq_codel, and fq_pie, and occurs in their change handlers when adjusting to the new limit. The problem is the following in the values passed to the subsequent qdisc_tree_reduce_backlog call given a tbf parent: When the tbf parent runs out of tokens, skbs of these qdiscs will be placed in gso_skb. Their peek handlers are qdisc_peek_dequeued, which accounts for both qlen and backlog. However, in the case of qdisc_dequeue_internal, ONLY qlen is accounted for when pulling from gso_skb. This means that these qdiscs are missing a qdisc_qstats_backlog_dec when dropping packets to satisfy the new limit in their change handlers. One can observe this issue with the following (with tc patched to support a limit of 0): export TARGET=fq tc qdisc del dev lo root tc qdisc add dev lo root handle 1: tbf rate 8bit burst 100b latency 1ms tc qdisc replace dev lo handle 3: parent 1:1 $TARGET limit 1000 echo ''; echo 'add child'; tc -s -d qdisc show dev lo ping -I lo -f -c2 -s32 -W0.001 127.0.0.1 2>&1 >/dev/null echo ''; echo 'after ping'; tc -s -d qdisc show dev lo tc qdisc change dev lo handle 3: parent 1:1 $TARGET limit 0 echo ''; echo 'after limit drop'; tc -s -d qdisc show dev lo tc qdisc replace dev lo handle 2: parent 1:1 sfq echo ''; echo 'post graft'; tc -s -d qdisc show dev lo The second to last show command shows 0 packets but a positive number (74) of backlog bytes. The problem becomes clearer in the last show command, where qdisc_purge_queue triggers qdisc_tree_reduce_backlog with the positive backlog and causes an underflow in the tbf parent's backlog (4096 Mb instead of 0). To fix this issue, the codepath for all clients of qdisc_dequeue_internal has been simplified: codel, pie, hhf, fq, fq_pie, and fq_codel. qdisc_dequeue_internal handles the backlog adjustments for all cases that do not directly use the dequeue handler. The old fq_codel_change limit adjustment loop accumulated the arguments to the subsequent qdisc_tree_reduce_backlog call through the cstats field. However, this is confusing and error prone as fq_codel_dequeue could also potentially mutate this field (which qdisc_dequeue_internal calls in the non gso_skb case), so we have unified the code here with other qdiscs. Fixes: 2d3cbfd6d54a ("net_sched: Flush gso_skb list too during ->change()") Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Fixes: 10239edf86f1 ("net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Link: https://patch.msgid.link/20250812235725.45243-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 11 ++++++++--- net/sched/sch_codel.c | 12 +++++++----- net/sched/sch_fq.c | 12 +++++++----- net/sched/sch_fq_codel.c | 12 +++++++----- net/sched/sch_fq_pie.c | 12 +++++++----- net/sched/sch_hhf.c | 12 +++++++----- net/sched/sch_pie.c | 12 +++++++----- 7 files changed, 50 insertions(+), 33 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 638948be4c50..738cd5b13c62 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1038,12 +1038,17 @@ static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool dir skb = __skb_dequeue(&sch->gso_skb); if (skb) { sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); return skb; } - if (direct) - return __qdisc_dequeue_head(&sch->q); - else + if (direct) { + skb = __qdisc_dequeue_head(&sch->q); + if (skb) + qdisc_qstats_backlog_dec(sch, skb); + return skb; + } else { return sch->dequeue(sch); + } } static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index c93761040c6e..fa0314679e43 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -101,9 +101,9 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { static int codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CODEL_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt, @@ -142,15 +142,17 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->params.ecn, !!nla_get_u32(tb[TCA_CODEL_ECN])); - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 902ff5470607..fee922da2f99 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -1013,11 +1013,11 @@ static int fq_load_priomap(struct fq_sched_data *q, static int fq_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_MAX + 1]; - int err, drop_count = 0; - unsigned drop_len = 0; u32 fq_log; + int err; err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, NULL); @@ -1135,16 +1135,18 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = fq_resize(sch, fq_log); sch_tree_lock(sch); } + while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; - drop_len += qdisc_pkt_len(skb); + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - drop_count++; } - qdisc_tree_reduce_backlog(sch, drop_count, drop_len); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 2a0f3a513bfa..a14142392939 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -366,6 +366,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; u32 quantum = 0; @@ -443,13 +444,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->memory_usage > q->memory_limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); - q->cstats.drop_len += qdisc_pkt_len(skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - q->cstats.drop_count++; } - qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); - q->cstats.drop_count = 0; - q->cstats.drop_len = 0; + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index b0e34daf1f75..7b96bc3ff891 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -287,10 +287,9 @@ static struct sk_buff *fq_pie_qdisc_dequeue(struct Qdisc *sch) static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; - unsigned int len_dropped = 0; - unsigned int num_dropped = 0; int err; err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); @@ -368,11 +367,14 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); - len_dropped += qdisc_pkt_len(skb); - num_dropped += 1; + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 5aa434b46707..2d4855e28a28 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -508,9 +508,9 @@ static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; - unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; @@ -561,15 +561,17 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, usecs_to_jiffies(us)); } - qlen = sch->q.qlen; - prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, - prev_backlog - sch->qstats.backlog); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index ad46ee3ed5a9..0a377313b6a9 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -141,9 +141,9 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { static int pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy, @@ -193,15 +193,17 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; From 8c06cbdcbaea34d7b96d76df4d6669275c1d291a Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 12 Aug 2025 23:58:26 +0000 Subject: [PATCH 05/81] selftests/tc-testing: Check backlog stats in gso_skb case Add tests to ensure proper backlog accounting in hhf, codel, pie, fq, fq_pie, and fq_codel qdiscs. We check for the bug pattern originally found in fq, fq_pie, and fq_codel, which was an underflow in the tbf parent backlog stats upon child qdisc removal. Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Link: https://patch.msgid.link/20250812235808.45281-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 23a61e5b99d0..998e5a2f4579 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -185,6 +185,204 @@ "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] }, + { + "id": "34c0", + "name": "Test TBF with HHF Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "hhf" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 hhf limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 hhf limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "fd68", + "name": "Test TBF with CODEL Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "codel" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 codel limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 codel limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "514e", + "name": "Test TBF with PIE Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "pie" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 pie limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 pie limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "6c97", + "name": "Test TBF with FQ Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "fq" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "5d0b", + "name": "Test TBF with FQ_CODEL Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "fq_codel" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_codel limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_codel limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "21c3", + "name": "Test TBF with FQ_PIE Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "fq_pie" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_pie limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_pie limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, { "id": "a4bb", "name": "Test FQ_CODEL with HTB parent - force packet drop with empty queue", From 065c31f2c6915b38f45b1c817b31f41f62eaa774 Mon Sep 17 00:00:00 2001 From: Justin Lai Date: Wed, 13 Aug 2025 15:16:31 +0800 Subject: [PATCH 06/81] rtase: Fix Rx descriptor CRC error bit definition The CRC error bit is located at bit 17 in the Rx descriptor, but the driver was incorrectly using bit 16. Fix it. Fixes: a36e9f5cfe9e ("rtase: Add support for a pci table in this module") Signed-off-by: Justin Lai Link: https://patch.msgid.link/20250813071631.7566-1-justinlai0215@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/rtase/rtase.h b/drivers/net/ethernet/realtek/rtase/rtase.h index 20decdeb9fdb..b9209eb6ea73 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase.h +++ b/drivers/net/ethernet/realtek/rtase/rtase.h @@ -241,7 +241,7 @@ union rtase_rx_desc { #define RTASE_RX_RES BIT(20) #define RTASE_RX_RUNT BIT(19) #define RTASE_RX_RWT BIT(18) -#define RTASE_RX_CRC BIT(16) +#define RTASE_RX_CRC BIT(17) #define RTASE_RX_V6F BIT(31) #define RTASE_RX_V4F BIT(30) #define RTASE_RX_UDPT BIT(29) From de5d7d3f27ddd4046736f558a40e252ddda82013 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 28 Jul 2025 17:08:44 +0800 Subject: [PATCH 07/81] Bluetooth: hci_sync: Avoid adding default advertising on startup list_empty(&hdev->adv_instances) is always true during startup, so an advertising instance is added by default. Call trace: dump_backtrace+0x94/0xec show_stack+0x18/0x24 dump_stack_lvl+0x48/0x60 dump_stack+0x18/0x24 hci_setup_ext_adv_instance_sync+0x17c/0x328 hci_powered_update_adv_sync+0xb4/0x12c hci_powered_update_sync+0x54/0x70 hci_power_on_sync+0xe4/0x278 hci_set_powered_sync+0x28/0x34 set_powered_sync+0x40/0x58 hci_cmd_sync_work+0x94/0x100 process_one_work+0x168/0x444 worker_thread+0x378/0x3f4 kthread+0x108/0x10c ret_from_fork+0x10/0x20 Link: https://github.com/bluez/bluez/issues/1442 Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 2b4f21fbf9c1..7397b6b50ccb 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3344,7 +3344,7 @@ static int hci_powered_update_adv_sync(struct hci_dev *hdev) * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) { err = hci_setup_ext_adv_instance_sync(hdev, 0x00); From ca88be1a2725a42f8dbad579181611d9dcca8e88 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 24 Jul 2025 16:43:18 -0400 Subject: [PATCH 08/81] Bluetooth: hci_sync: Fix scan state after PA Sync has been established Passive scanning is used to program the address of the peer to be synchronized, so once HCI_EV_LE_PA_SYNC_ESTABLISHED is received it needs to be updated after clearing HCI_PA_SYNC then call hci_update_passive_scan_sync to return it to its original state. Fixes: 6d0417e4e1cf ("Bluetooth: hci_conn: Fix not setting conn_timeout for Broadcast Receiver") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 7397b6b50ccb..387c128f2ba0 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6985,8 +6985,6 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err) hci_dev_lock(hdev); - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - if (!hci_conn_valid(hdev, conn)) clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); @@ -7080,6 +7078,11 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + + /* Update passive scan since HCI_PA_SYNC flag has been cleared */ + hci_update_passive_scan_sync(hdev); + return err; } From aee29c18a38d479c2f058c9b6a39b0527cf81d10 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 24 Jul 2025 16:36:27 -0400 Subject: [PATCH 09/81] Bluetooth: ISO: Fix getname not returning broadcast fields getname shall return iso_bc fields for both BIS_LINK and PA_LINK since the likes of bluetoothd do use the getpeername to retrieve the SID both when enumerating the broadcasters and when synchronizing. Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 7bd3aa0a6db9..eaffd25570e3 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1347,7 +1347,7 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; - if (hcon && hcon->type == BIS_LINK) { + if (hcon && (hcon->type == BIS_LINK || hcon->type == PA_LINK)) { sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid; sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis; memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis, From d36349ea73d805bb72cbc24ab90cb1da4ad5c379 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 28 Jul 2025 13:51:01 -0400 Subject: [PATCH 10/81] Bluetooth: hci_conn: Fix running bis_cleanup for hci_conn->type PA_LINK Connections with type of PA_LINK shall be considered temporary just to track the lifetime of PA Sync setup, once the BIG Sync is established and connection are created with BIS_LINK the existing PA_LINK connection shall not longer use bis_cleanup otherwise it terminates the PA Sync when that shall be left to BIS_LINK connection to do it. Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 12 +++++++++++- net/bluetooth/hci_event.c | 7 ++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 7d1e79f69cd1..f8b20b609a03 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -830,7 +830,17 @@ static void bis_cleanup(struct hci_conn *conn) /* Check if ISO connection is a BIS and terminate advertising * set and BIG if there are no other connections using it. */ - bis = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big); + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_CONNECTED, + HCI_ROLE_MASTER); + if (bis) + return; + + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_CONNECT, + HCI_ROLE_MASTER); if (bis) return; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 8aa5039b975a..4f0a6116291e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6957,9 +6957,14 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, continue; } - if (ev->status != 0x42) + if (ev->status != 0x42) { /* Mark PA sync as established */ set_bit(HCI_CONN_PA_SYNC, &bis->flags); + /* Reset cleanup callback of PA Sync so it doesn't + * terminate the sync when deleting the connection. + */ + conn->cleanup = NULL; + } bis->sync_handle = conn->sync_handle; bis->iso_qos.bcast.big = ev->handle; From 3ba486c5f3ce2c22ffd29c0103404cdbe21912b3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 29 Jul 2025 12:11:09 -0400 Subject: [PATCH 11/81] Bluetooth: hci_conn: Fix not cleaning up Broadcaster/Broadcast Source This fixes Broadcaster/Broadcast Source not sending HCI_OP_LE_TERM_BIG because HCI_CONN_PER_ADV where not being set. Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index f8b20b609a03..ab6fe5b0cc0f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2259,7 +2259,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, * the start periodic advertising and create BIG commands have * been queued */ - hci_conn_hash_list_state(hdev, bis_mark_per_adv, PA_LINK, + hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ From 099799fa9b76c5c02b49e07005a85117a25b01ea Mon Sep 17 00:00:00 2001 From: Jiande Lu Date: Thu, 24 Jul 2025 16:51:17 +0800 Subject: [PATCH 12/81] Bluetooth: btmtk: Fix wait_on_bit_timeout interruption during shutdown During the shutdown process, an interrupt occurs that prematurely terminates the wait for the expected event. This change replaces TASK_INTERRUPTIBLE with TASK_UNINTERRUPTIBLE in the wait_on_bit_timeout call to ensure the shutdown process completes as intended without being interrupted by signals. Fixes: d019930b0049 ("Bluetooth: btmtk: move btusb_mtk_hci_wmt_sync to btmtk.c") Signed-off-by: Jiande Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 4390fd571dbd..a8c520dc09e1 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -642,12 +642,7 @@ static int btmtk_usb_hci_wmt_sync(struct hci_dev *hdev, * WMT command. */ err = wait_on_bit_timeout(&data->flags, BTMTK_TX_WAIT_VND_EVT, - TASK_INTERRUPTIBLE, HCI_INIT_TIMEOUT); - if (err == -EINTR) { - bt_dev_err(hdev, "Execution of wmt command interrupted"); - clear_bit(BTMTK_TX_WAIT_VND_EVT, &data->flags); - goto err_free_wc; - } + TASK_UNINTERRUPTIBLE, HCI_INIT_TIMEOUT); if (err) { bt_dev_err(hdev, "Execution of wmt command timed out"); From 709788b154caf042874d765628ffa860f0bb0d1e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 4 Aug 2025 09:54:05 -0400 Subject: [PATCH 13/81] Bluetooth: hci_core: Fix using {cis,bis}_capable for current settings {cis,bis}_capable only indicates the controller supports the feature since it doesn't check that LE is enabled so it shall not be used for current setting, instead this introduces {cis,bis}_enabled macros that can be used to indicate that these features are currently enabled. Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Fixes: ae7533613133 ("Bluetooth: Check for ISO support in controller") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 4 ++-- include/net/bluetooth/hci_core.h | 13 ++++++++++++- net/bluetooth/hci_sync.c | 4 ++-- net/bluetooth/iso.c | 14 +++++++------- net/bluetooth/mgmt.c | 10 +++++----- 5 files changed, 28 insertions(+), 17 deletions(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index ada5b56a4413..e5751f3070b8 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -647,7 +647,7 @@ static inline void sco_exit(void) #if IS_ENABLED(CONFIG_BT_LE) int iso_init(void); int iso_exit(void); -bool iso_enabled(void); +bool iso_inited(void); #else static inline int iso_init(void) { @@ -659,7 +659,7 @@ static inline int iso_exit(void) return 0; } -static inline bool iso_enabled(void) +static inline bool iso_inited(void) { return false; } diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 4dc11c66f7b8..bc29f2e2e16f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1915,6 +1915,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); !hci_dev_test_flag(dev, HCI_RPA_EXPIRED)) #define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \ !adv->rpa_expired) +#define le_enabled(dev) (lmp_le_capable(dev) && \ + hci_dev_test_flag(dev, HCI_LE_ENABLED)) #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) @@ -1981,14 +1983,23 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* CIS Master/Slave and BIS support */ #define iso_capable(dev) (cis_capable(dev) || bis_capable(dev)) +#define iso_enabled(dev) (le_enabled(dev) && iso_capable(dev)) #define cis_capable(dev) \ (cis_central_capable(dev) || cis_peripheral_capable(dev)) +#define cis_enabled(dev) (le_enabled(dev) && cis_capable(dev)) #define cis_central_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_CENTRAL) +#define cis_central_enabled(dev) \ + (le_enabled(dev) && cis_central_capable(dev)) #define cis_peripheral_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) +#define cis_peripheral_enabled(dev) \ + (le_enabled(dev) && cis_peripheral_capable(dev)) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) -#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define bis_enabled(dev) (le_enabled(dev) && bis_capable(dev)) +#define sync_recv_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev)) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 387c128f2ba0..aa7d7a8ec3ee 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4531,14 +4531,14 @@ static int hci_le_set_host_feature_sync(struct hci_dev *hdev) { struct hci_cp_le_set_host_feature cp; - if (!cis_capable(hdev)) + if (!iso_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ cp.bit_number = 32; - cp.bit_value = 1; + cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index eaffd25570e3..5ce823ca3aaf 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2483,11 +2483,11 @@ static const struct net_proto_family iso_sock_family_ops = { .create = iso_sock_create, }; -static bool iso_inited; +static bool inited; -bool iso_enabled(void) +bool iso_inited(void) { - return iso_inited; + return inited; } int iso_init(void) @@ -2496,7 +2496,7 @@ int iso_init(void) BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr)); - if (iso_inited) + if (inited) return -EALREADY; err = proto_register(&iso_proto, 0); @@ -2524,7 +2524,7 @@ int iso_init(void) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); - iso_inited = true; + inited = true; return 0; @@ -2535,7 +2535,7 @@ int iso_init(void) int iso_exit(void) { - if (!iso_inited) + if (!inited) return -EALREADY; bt_procfs_cleanup(&init_net, "iso"); @@ -2549,7 +2549,7 @@ int iso_exit(void) proto_unregister(&iso_proto); - iso_inited = false; + inited = false; return 0; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1ce682038b51..c42dffe77daf 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -922,16 +922,16 @@ static u32 get_current_settings(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) settings |= MGMT_SETTING_WIDEBAND_SPEECH; - if (cis_central_capable(hdev)) + if (cis_central_enabled(hdev)) settings |= MGMT_SETTING_CIS_CENTRAL; - if (cis_peripheral_capable(hdev)) + if (cis_peripheral_enabled(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; - if (bis_capable(hdev)) + if (bis_enabled(hdev)) settings |= MGMT_SETTING_ISO_BROADCASTER; - if (sync_recv_capable(hdev)) + if (sync_recv_enabled(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; if (ll_privacy_capable(hdev)) @@ -4513,7 +4513,7 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, } if (IS_ENABLED(CONFIG_BT_LE)) { - flags = iso_enabled() ? BIT(0) : 0; + flags = iso_inited() ? BIT(0) : 0; memcpy(rp->features[idx].uuid, iso_socket_uuid, 16); rp->features[idx].flags = cpu_to_le32(flags); idx++; From 3dcf7175f2c04bd3a7d50db3fa42a0bd933b6e23 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 4 Aug 2025 14:05:03 -0400 Subject: [PATCH 14/81] Bluetooth: hci_core: Fix using ll_privacy_capable for current settings ll_privacy_capable only indicates that the controller supports the feature but it doesnt' check that LE is enabled so it end up being marked as active in the current settings when it shouldn't. Fixes: ad383c2c65a5 ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/mgmt.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index bc29f2e2e16f..bb30bde6f0e8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1934,6 +1934,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) +#define ll_privacy_enabled(dev) (le_enabled(dev) && ll_privacy_capable(dev)) #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ ((dev)->commands[39] & 0x04)) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c42dffe77daf..3166f5fb876b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -934,7 +934,7 @@ static u32 get_current_settings(struct hci_dev *hdev) if (sync_recv_enabled(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; - if (ll_privacy_capable(hdev)) + if (ll_privacy_enabled(hdev)) settings |= MGMT_SETTING_LL_PRIVACY; return settings; From 4d19cd228bbe8ff84a63fe7b11bc756b4b4370c7 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 7 Aug 2025 15:56:03 +0800 Subject: [PATCH 15/81] Bluetooth: hci_sync: Prevent unintended PA sync when SID is 0xFF After LE Extended Scan times out, conn->sid remains 0xFF, so the PA sync creation process should be aborted. Btmon snippet from PA sync with SID=0xFF: < HCI Command: LE Set Extended.. (0x08|0x0042) plen 6 #74726 [hci0] 863.107927 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 #74727 [hci0] 863.109389 LE Set Extended Scan Enable (0x08|0x0042) ncmd 1 Status: Success (0x00) < HCI Command: LE Periodic Ad.. (0x08|0x0044) plen 14 #74728 [hci0] 865.141168 Options: 0x0000 Use advertising SID, Advertiser Address Type and address Reporting initially enabled SID: 0xff Adv address type: Random (0x01) Adv address: 0D:D7:2C:E7:42:46 (Non-Resolvable) Skip: 0x0000 Sync timeout: 20000 msec (0x07d0) Sync CTE type: 0x0000 > HCI Event: Command Status (0x0f) plen 4 #74729 [hci0] 865.143223 LE Periodic Advertising Create Sync (0x08|0x0044) ncmd 1 Status: Success (0x00) Fixes: e2d471b7806b ("Bluetooth: ISO: Fix not using SID from adv report") Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index aa7d7a8ec3ee..31d72b9683ef 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -7045,10 +7045,13 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) /* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update * it. */ - if (conn->sid == HCI_SID_INVALID) - __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, - HCI_EV_LE_EXT_ADV_REPORT, - conn->conn_timeout, NULL); + if (conn->sid == HCI_SID_INVALID) { + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, + HCI_EV_LE_EXT_ADV_REPORT, + conn->conn_timeout, NULL); + if (err == -ETIMEDOUT) + goto done; + } memset(&cp, 0, sizeof(cp)); cp.options = qos->bcast.options; @@ -7078,6 +7081,7 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); +done: hci_dev_clear_flag(hdev, HCI_PA_SYNC); /* Update passive scan since HCI_PA_SYNC flag has been cleared */ From 0b3725dbf61b51e7c663834811b3691157ae17d6 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 9 Aug 2025 11:36:20 +0300 Subject: [PATCH 16/81] Bluetooth: hci_event: fix MTU for BN == 0 in CIS Established BN == 0x00 in CIS Established means no isochronous data for the corresponding direction (Core v6.1 pp. 2394). In this case SDU MTU should be 0. However, the specification does not say the Max_PDU_C_To_P or P_To_C are then zero. Intel AX210 in Framed CIS mode sets nonzero Max_PDU for direction with zero BN. This causes failure later when we try to LE Setup ISO Data Path for disabled direction, which is disallowed (Core v6.1 pp. 2750). Fix by setting SDU MTU to 0 if BN == 0. Fixes: 2be22f1941d5f ("Bluetooth: hci_event: Fix parsing of CIS Established Event") Signed-off-by: Pauli Virtanen Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 4f0a6116291e..fe7cdd67ad2a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6745,8 +6745,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, qos->ucast.out.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency), 1000); - qos->ucast.in.sdu = le16_to_cpu(ev->c_mtu); - qos->ucast.out.sdu = le16_to_cpu(ev->p_mtu); + qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; + qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; qos->ucast.in.phy = ev->c_phy; qos->ucast.out.phy = ev->p_phy; break; @@ -6760,8 +6760,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, qos->ucast.in.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency), 1000); - qos->ucast.out.sdu = le16_to_cpu(ev->c_mtu); - qos->ucast.in.sdu = le16_to_cpu(ev->p_mtu); + qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; + qos->ucast.in.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; qos->ucast.out.phy = ev->c_phy; qos->ucast.in.phy = ev->p_phy; break; From 0eaf7c7e85da7495c0e03a99375707fc954f5e7b Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Tue, 5 Aug 2025 22:14:51 +0300 Subject: [PATCH 17/81] Bluetooth: hci_conn: do return error from hci_enhanced_setup_sync() The commit e07a06b4eb41 ("Bluetooth: Convert SCO configure_datapath to hci_sync") missed to update the *return* statement under the *case* of BT_CODEC_TRANSPARENT in hci_enhanced_setup_sync(), which led to returning success (0) instead of the negative error code (-EINVAL). However, the result of hci_enhanced_setup_sync() seems to be ignored anyway, since NULL gets passed to hci_cmd_sync_queue() as the last argument in that case and the only function interested in that result is specified by that argument. Fixes: e07a06b4eb41 ("Bluetooth: Convert SCO configure_datapath to hci_sync") Signed-off-by: Sergey Shtylyov Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ab6fe5b0cc0f..7a879290dd28 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -339,7 +339,8 @@ static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data) case BT_CODEC_TRANSPARENT: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) - return false; + return -EINVAL; + param = &esco_param_msbc[conn->attempt - 1]; cp.tx_coding_format.id = 0x03; cp.rx_coding_format.id = 0x03; From e489317d2fd9a51a81bdcbe15a73ddde8246e6d6 Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Mon, 4 Aug 2025 16:00:15 +0530 Subject: [PATCH 18/81] Bluetooth: btnxpuart: Uses threaded IRQ for host wakeup handling This replaces devm_request_irq() with devm_request_threaded_irq(). On iMX93 11x11 EVK platform, the BT chip's BT_WAKE_OUT pin is connected to an I2C GPIO expander instead of directly been connected to iMX GPIO. When I2C GPIO expander's (PCAL6524) host driver receives an interrupt on it's INTR line, the driver's interrupt handler needs to query the interrupt source with PCAL6524 first, and then call the actual interrupt handler, in this case the IRQ handler in BTNXPUART. In order to handle interrupts when such I2C GPIO expanders are between the host and interrupt source, devm_request_threaded_irq() is needed. This commit also removes the IRQF_TRIGGER_FALLING flag, to allow setting the IRQ trigger type from the device tree setting instead of hardcoding in the driver. Signed-off-by: Neeraj Sanjay Kale Reviewed-by: Sherry Sun Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btnxpuart.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 73a4a325c867..76e7f857fb7d 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -543,10 +543,10 @@ static int ps_setup(struct hci_dev *hdev) } if (psdata->wakeup_source) { - ret = devm_request_irq(&serdev->dev, psdata->irq_handler, - ps_host_wakeup_irq_handler, - IRQF_ONESHOT | IRQF_TRIGGER_FALLING, - dev_name(&serdev->dev), nxpdev); + ret = devm_request_threaded_irq(&serdev->dev, psdata->irq_handler, + NULL, ps_host_wakeup_irq_handler, + IRQF_ONESHOT, + dev_name(&serdev->dev), nxpdev); if (ret) bt_dev_info(hdev, "error setting wakeup IRQ handler, ignoring\n"); disable_irq(psdata->irq_handler); From 9d4b01a0bf8d2163ae129c9c537cb0753ad5a2aa Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 14 Aug 2025 11:57:19 -0400 Subject: [PATCH 19/81] Bluetooth: hci_core: Fix not accounting for BIS/CIS/PA links separately This fixes the likes of hci_conn_num(CIS_LINK) returning the total of ISO connection which includes BIS_LINK as well, so this splits the iso_num into each link type and introduces hci_iso_num that can be used in places where the total number of ISO connection still needs to be used. Fixes: 23205562ffc8 ("Bluetooth: separate CIS_LINK and BIS_LINK link types") Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index bb30bde6f0e8..6906af7a8f24 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -129,7 +129,9 @@ struct hci_conn_hash { struct list_head list; unsigned int acl_num; unsigned int sco_num; - unsigned int iso_num; + unsigned int cis_num; + unsigned int bis_num; + unsigned int pa_num; unsigned int le_num; unsigned int le_num_peripheral; }; @@ -1014,9 +1016,13 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) h->sco_num++; break; case CIS_LINK: + h->cis_num++; + break; case BIS_LINK: + h->bis_num++; + break; case PA_LINK: - h->iso_num++; + h->pa_num++; break; } } @@ -1042,9 +1048,13 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) h->sco_num--; break; case CIS_LINK: + h->cis_num--; + break; case BIS_LINK: + h->bis_num--; + break; case PA_LINK: - h->iso_num--; + h->pa_num--; break; } } @@ -1061,9 +1071,11 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) case ESCO_LINK: return h->sco_num; case CIS_LINK: + return h->cis_num; case BIS_LINK: + return h->bis_num; case PA_LINK: - return h->iso_num; + return h->pa_num; default: return 0; } @@ -1073,7 +1085,15 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev) { struct hci_conn_hash *c = &hdev->conn_hash; - return c->acl_num + c->sco_num + c->le_num + c->iso_num; + return c->acl_num + c->sco_num + c->le_num + c->cis_num + c->bis_num + + c->pa_num; +} + +static inline unsigned int hci_iso_count(struct hci_dev *hdev) +{ + struct hci_conn_hash *c = &hdev->conn_hash; + + return c->cis_num + c->bis_num; } static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn) From f604d3aaf64ff0d90cc875295474d3abf4155629 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 14 Aug 2025 15:06:40 +0200 Subject: [PATCH 20/81] mlxsw: spectrum: Forward packets with an IPv4 link-local source IP By default, the device does not forward IPv4 packets with a link-local source IP (i.e., 169.254.0.0/16). This behavior does not align with the kernel which does forward them. Fix by instructing the device to forward such packets instead of dropping them. Fixes: ca360db4b825 ("mlxsw: spectrum: Disable DIP_LINK_LOCAL check in hardware pipeline") Reported-by: Zoey Mertes Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Link: https://patch.msgid.link/6721e6b2c96feb80269e72ce8d0b426e2f32d99c.1755174341.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 ++ drivers/net/ethernet/mellanox/mlxsw/trap.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 618957d65663..9a2d64a0a858 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2375,6 +2375,8 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { ROUTER_EXP, false), MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_DIP_LINK_LOCAL, FORWARD, ROUTER_EXP, false), + MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_SIP_LINK_LOCAL, FORWARD, + ROUTER_EXP, false), /* Multicast Router Traps */ MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h index 80ee5c4825dc..9962dc157901 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/trap.h +++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h @@ -94,6 +94,7 @@ enum { MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_SIP_BC = 0x16A, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_DIP_LOCAL_NET = 0x16B, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_DIP_LINK_LOCAL = 0x16C, + MLXSW_TRAP_ID_DISCARD_ING_ROUTER_SIP_LINK_LOCAL = 0x16D, MLXSW_TRAP_ID_DISCARD_ROUTER_IRIF_EN = 0x178, MLXSW_TRAP_ID_DISCARD_ROUTER_ERIF_EN = 0x179, MLXSW_TRAP_ID_DISCARD_ROUTER_LPM4 = 0x17B, From 5e0b2177bdba99c2487480e9864825f742b684ee Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 14 Aug 2025 15:06:41 +0200 Subject: [PATCH 21/81] selftest: forwarding: router: Add a test case for IPv4 link-local source IP Add a test case which checks that packets with an IPv4 link-local source IP are forwarded and not dropped. Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Link: https://patch.msgid.link/3c2e0b17d99530f57bef5ddff9af284fa0c9b667.1755174341.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/router.sh | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/router.sh b/tools/testing/selftests/net/forwarding/router.sh index b98ea9449b8b..dfb6646cb97b 100755 --- a/tools/testing/selftests/net/forwarding/router.sh +++ b/tools/testing/selftests/net/forwarding/router.sh @@ -18,6 +18,8 @@ # | 2001:db8:1::1/64 2001:db8:2::1/64 | # | | # +-----------------------------------------------------------------+ +# +#shellcheck disable=SC2034 # SC doesn't see our uses of global variables ALL_TESTS=" ping_ipv4 @@ -27,6 +29,7 @@ ALL_TESTS=" ipv4_sip_equal_dip ipv6_sip_equal_dip ipv4_dip_link_local + ipv4_sip_link_local " NUM_NETIFS=4 @@ -330,6 +333,32 @@ ipv4_dip_link_local() tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower } +ipv4_sip_link_local() +{ + local sip=169.254.1.1 + + RET=0 + + # Disable rpfilter to prevent packets to be dropped because of it. + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf."$rp1".rp_filter 0 + + tc filter add dev "$rp2" egress protocol ip pref 1 handle 101 \ + flower src_ip "$sip" action pass + + $MZ "$h1" -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b "$rp1mac" \ + -A "$sip" -B 198.51.100.2 -q + + tc_check_packets "dev $rp2 egress" 101 5 + check_err $? "Packets were dropped" + + log_test "IPv4 source IP is link-local" + + tc filter del dev "$rp2" egress protocol ip pref 1 handle 101 flower + sysctl_restore net.ipv4.conf."$rp1".rp_filter + sysctl_restore net.ipv4.conf.all.rp_filter +} + trap cleanup EXIT setup_prepare From 12da2b92ad50e6602b4c5e9073d71f2368b70b63 Mon Sep 17 00:00:00 2001 From: Chandra Mohan Sundar Date: Thu, 14 Aug 2025 22:00:10 +0530 Subject: [PATCH 22/81] net: libwx: Fix the size in RSS hash key population While trying to fill a random RSS key, the size of the pointer is being used rather than the actual size of the RSS key. Fix by passing an appropriate value of the RSS key. This issue was reported by static coverity analyser. Fixes: eb4898fde1de8 ("net: libwx: add wangxun vf common api") Signed-off-by: Chandra Mohan Sundar Link: https://patch.msgid.link/20250814163014.613004-1-chandramohan.explore@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c index 5d48df7a849f..3023ea2732ef 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c @@ -192,7 +192,7 @@ void wx_setup_vfmrqc_vf(struct wx *wx) u8 i, j; /* Fill out hash function seeds */ - netdev_rss_key_fill(wx->rss_key, sizeof(wx->rss_key)); + netdev_rss_key_fill(wx->rss_key, WX_RSS_KEY_SIZE); for (i = 0; i < WX_RSS_KEY_SIZE / 4; i++) wr32(wx, WX_VXRSSRK(i), wx->rss_key[i]); From 715c7a36d59f54162a26fac1d1ed8dc087a24cf1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Aug 2025 12:43:23 -0700 Subject: [PATCH 23/81] selftests: tls: make the new data_steal test less flaky The CI has hit a couple of cases of: RUN global.data_steal ... tls.c:2762:data_steal:Expected recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT) (20000) == -1 (-1) data_steal: Test terminated by timeout FAIL global.data_steal Looks like the 2msec sleep is not long enough. Make the sleep longer, and then instead of second sleep wait for the thieving process to exit. That way we can be sure it called recv() before us. While at it also avoid trying to steal more than a record, this seems to be causing issues in manual testing as well. Fixes: d7e82594a45c ("selftests: tls: test TCP stealing data from under the TLS socket") Link: https://patch.msgid.link/20250814194323.2014650-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index d8cfcf9bb825..2b8387a83bc7 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -2748,17 +2748,18 @@ TEST(data_steal) { pid = fork(); ASSERT_GE(pid, 0); if (!pid) { - EXPECT_EQ(recv(cfd, buf, sizeof(buf), MSG_WAITALL), - sizeof(buf)); + EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2, MSG_WAITALL), + sizeof(buf) / 2); exit(!__test_passed(_metadata)); } - usleep(2000); + usleep(10000); ASSERT_EQ(setsockopt(fd, SOL_TLS, TLS_TX, &tls, tls.len), 0); ASSERT_EQ(setsockopt(cfd, SOL_TLS, TLS_RX, &tls, tls.len), 0); EXPECT_EQ(send(fd, buf, sizeof(buf), 0), sizeof(buf)); - usleep(2000); + EXPECT_EQ(wait(&status), pid); + EXPECT_EQ(status, 0); EXPECT_EQ(recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT), -1); /* Don't check errno, the error will be different depending * on what random bytes TLS interpreted as the record length. @@ -2766,9 +2767,6 @@ TEST(data_steal) { close(fd); close(cfd); - - EXPECT_EQ(wait(&status), pid); - EXPECT_EQ(status, 0); } static void __attribute__((constructor)) fips_check(void) { From 864e3396976ef41de6cc7bc366276bf4e084fff2 Mon Sep 17 00:00:00 2001 From: Jakub Ramaseuski Date: Thu, 14 Aug 2025 12:51:19 +0200 Subject: [PATCH 24/81] net: gso: Forbid IPv6 TSO with extensions on devices with only IPV6_CSUM When performing Generic Segmentation Offload (GSO) on an IPv6 packet that contains extension headers, the kernel incorrectly requests checksum offload if the egress device only advertises NETIF_F_IPV6_CSUM feature, which has a strict contract: it supports checksum offload only for plain TCP or UDP over IPv6 and explicitly does not support packets with extension headers. The current GSO logic violates this contract by failing to disable the feature for packets with extension headers, such as those used in GREoIPv6 tunnels. This violation results in the device being asked to perform an operation it cannot support, leading to a `skb_warn_bad_offload` warning and a collapse of network throughput. While device TSO/USO is correctly bypassed in favor of software GSO for these packets, the GSO stack must be explicitly told not to request checksum offload. Mask NETIF_F_IPV6_CSUM, NETIF_F_TSO6 and NETIF_F_GSO_UDP_L4 in gso_features_check if the IPv6 header contains extension headers to compute checksum in software. The exception is a BIG TCP extension, which, as stated in commit 68e068cabd2c6c53 ("net: reenable NETIF_F_IPV6_CSUM offload for BIG TCP packets"): "The feature is only enabled on devices that support BIG TCP TSO. The header is only present for PF_PACKET taps like tcpdump, and not transmitted by physical devices." kernel log output (truncated): WARNING: CPU: 1 PID: 5273 at net/core/dev.c:3535 skb_warn_bad_offload+0x81/0x140 ... Call Trace: skb_checksum_help+0x12a/0x1f0 validate_xmit_skb+0x1a3/0x2d0 validate_xmit_skb_list+0x4f/0x80 sch_direct_xmit+0x1a2/0x380 __dev_xmit_skb+0x242/0x670 __dev_queue_xmit+0x3fc/0x7f0 ip6_finish_output2+0x25e/0x5d0 ip6_finish_output+0x1fc/0x3f0 ip6_tnl_xmit+0x608/0xc00 [ip6_tunnel] ip6gre_tunnel_xmit+0x1c0/0x390 [ip6_gre] dev_hard_start_xmit+0x63/0x1c0 __dev_queue_xmit+0x6d0/0x7f0 ip6_finish_output2+0x214/0x5d0 ip6_finish_output+0x1fc/0x3f0 ip6_xmit+0x2ca/0x6f0 ip6_finish_output+0x1fc/0x3f0 ip6_xmit+0x2ca/0x6f0 inet6_csk_xmit+0xeb/0x150 __tcp_transmit_skb+0x555/0xa80 tcp_write_xmit+0x32a/0xe90 tcp_sendmsg_locked+0x437/0x1110 tcp_sendmsg+0x2f/0x50 ... skb linear: 00000000: e4 3d 1a 7d ec 30 e4 3d 1a 7e 5d 90 86 dd 60 0e skb linear: 00000010: 00 0a 1b 34 3c 40 20 11 00 00 00 00 00 00 00 00 skb linear: 00000020: 00 00 00 00 00 12 20 11 00 00 00 00 00 00 00 00 skb linear: 00000030: 00 00 00 00 00 11 2f 00 04 01 04 01 01 00 00 00 skb linear: 00000040: 86 dd 60 0e 00 0a 1b 00 06 40 20 23 00 00 00 00 skb linear: 00000050: 00 00 00 00 00 00 00 00 00 12 20 23 00 00 00 00 skb linear: 00000060: 00 00 00 00 00 00 00 00 00 11 bf 96 14 51 13 f9 skb linear: 00000070: ae 27 a0 a8 2b e3 80 18 00 40 5b 6f 00 00 01 01 skb linear: 00000080: 08 0a 42 d4 50 d5 4b 70 f8 1a Fixes: 04c20a9356f283da ("net: skip offload for NETIF_F_IPV6_CSUM if ipv6 header contains extension") Reported-by: Tianhao Zhao Suggested-by: Michal Schmidt Suggested-by: Willem de Bruijn Signed-off-by: Jakub Ramaseuski Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250814105119.1525687-1-jramaseu@redhat.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 5a3c0f40a93f..93a25d87b86b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3779,6 +3779,18 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, features &= ~NETIF_F_TSO_MANGLEID; } + /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, + * so neither does TSO that depends on it. + */ + if (features & NETIF_F_IPV6_CSUM && + (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && + skb_transport_header_was_set(skb) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); + return features; } From 84967deee9d9870b15bc4c3acb50f1d401807902 Mon Sep 17 00:00:00 2001 From: Minhong He Date: Fri, 15 Aug 2025 14:38:45 +0800 Subject: [PATCH 25/81] ipv6: sr: validate HMAC algorithm ID in seg6_hmac_info_add The seg6_genl_sethmac() directly uses the algorithm ID provided by the userspace without verifying whether it is an HMAC algorithm supported by the system. If an unsupported HMAC algorithm ID is configured, packets using SRv6 HMAC will be dropped during encapsulation or decapsulation. Fixes: 4f4853dc1c9c ("ipv6: sr: implement API to control SR HMAC structure") Signed-off-by: Minhong He Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250815063845.85426-1-heminhong@kylinos.cn Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_hmac.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index f78ecb6ad838..d77b52523b6a 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -304,6 +304,9 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) struct seg6_pernet_data *sdata = seg6_pernet(net); int err; + if (!__hmac_get_algo(hinfo->alg_id)) + return -EINVAL; + err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); From ccab044697980c6c01ab51f43f48f13b8a3e5c33 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Fri, 15 Aug 2025 19:28:19 +0200 Subject: [PATCH 26/81] mptcp: drop skb if MPTCP skb extension allocation fails When skb_ext_add(skb, SKB_EXT_MPTCP) fails in mptcp_incoming_options(), we used to return true, letting the segment proceed through the TCP receive path without a DSS mapping. Such segments can leave inconsistent mapping state and trigger a mid-stream fallback to TCP, which in testing collapsed (by artificially forcing failures in skb_ext_add) throughput to zero. Return false instead so the TCP input path drops the skb (see tcp_data_queue() and step-7 processing). This is the safer choice under memory pressure: it preserves MPTCP correctness and provides backpressure to the sender. Control packets remain unaffected: ACK updates and DATA_FIN handling happen before attempting the extension allocation, and tcp_reset() continues to ignore the return value. With this change, MPTCP continues to work at high throughput if we artificially inject failures into skb_ext_add. Fixes: 6787b7e350d3 ("mptcp: avoid processing packet if a subflow reset") Cc: stable@vger.kernel.org Signed-off-by: Christoph Paasch Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-1-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 70c0ab0ecf90..2a8ea28442b2 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1118,7 +1118,9 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, return hmac == mp_opt->ahmac; } -/* Return false if a subflow has been reset, else return true */ +/* Return false in case of error (or subflow has been reset), + * else return true. + */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -1222,7 +1224,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) - return true; + return false; memset(mpext, 0, sizeof(*mpext)); From 68fc0f4b0d25692940cdc85c68e366cae63e1757 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Aug 2025 19:28:20 +0200 Subject: [PATCH 27/81] mptcp: pm: kernel: flush: do not reset ADD_ADDR limit A flush of the MPTCP endpoints should not affect the MPTCP limits. In other words, 'ip mptcp endpoint flush' should not change 'ip mptcp limits'. But it was the case: the MPTCP_PM_ATTR_RCV_ADD_ADDRS (add_addr_accepted) limit was reset by accident. Removing the reset of this counter during a flush fixes this issue. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Reported-by: Thomas Dreibholz Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/579 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-2-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index d39e7c178460..667803d72b64 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -1085,7 +1085,6 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->add_addr_signal_max, 0); - WRITE_ONCE(pernet->add_addr_accept_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } From 452690be7de2f91cc0de68cb9e95252875b33503 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Aug 2025 19:28:21 +0200 Subject: [PATCH 28/81] selftests: mptcp: pm: check flush doesn't reset limits This modification is linked to the parent commit where the received ADD_ADDR limit was accidentally reset when the endpoints were flushed. To validate that, the test is now flushing endpoints after having set new limits, and before checking them. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-3-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 2e6648a2b2c0..ac7ec6f94023 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -198,6 +198,7 @@ set_limits 1 9 2>/dev/null check "get_limits" "${default_limits}" "subflows above hard limit" set_limits 8 8 +flush_endpoint ## to make sure it doesn't affect the limits check "get_limits" "$(format_limits 8 8)" "set limits" flush_endpoint From 5d13349472ac8abcbcb94407969aa0fdc2e1f1be Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Aug 2025 19:28:22 +0200 Subject: [PATCH 29/81] mptcp: remove duplicate sk_reset_timer call sk_reset_timer() was called twice in mptcp_pm_alloc_anno_list. Simplify the code by using a 'goto' statement to eliminate the duplication. Note that this is not a fix, but it will help backporting the following patch. The same "Fixes" tag has been added for this reason. Fixes: 93f323b9cccc ("mptcp: add a new sysctl add_addr_timeout") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-4-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 420d416e2603..c5f6a53ce5f1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -353,9 +353,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) return false; - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - return true; + goto reset_timer; } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); @@ -369,6 +367,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); +reset_timer: sk_reset_timer(sk, &add_entry->add_timer, jiffies + mptcp_get_add_addr_timeout(net)); From f5ce0714623cffd00bf2a83e890d09c609b7f50a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Aug 2025 19:28:23 +0200 Subject: [PATCH 30/81] mptcp: disable add_addr retransmission when timeout is 0 When add_addr_timeout was set to 0, this caused the ADD_ADDR to be retransmitted immediately, which looks like a buggy behaviour. Instead, interpret 0 as "no retransmissions needed". The documentation is updated to explicitly state that setting the timeout to 0 disables retransmission. Fixes: 93f323b9cccc ("mptcp: add a new sysctl add_addr_timeout") Cc: stable@vger.kernel.org Suggested-by: Matthieu Baerts Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-5-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp-sysctl.rst | 2 ++ net/mptcp/pm.c | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 5bfab01eff5a..1683c139821e 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -12,6 +12,8 @@ add_addr_timeout - INTEGER (seconds) resent to an MPTCP peer that has not acknowledged a previous ADD_ADDR message. + Do not retransmit if set to 0. + The default value matches TCP_RTO_MAX. This is a per-namespace sysctl. diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index c5f6a53ce5f1..136a380602ca 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -274,6 +274,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; + unsigned int timeout; pr_debug("msk=%p\n", msk); @@ -291,6 +292,10 @@ static void mptcp_pm_add_timer(struct timer_list *timer) goto out; } + timeout = mptcp_get_add_addr_timeout(sock_net(sk)); + if (!timeout) + goto out; + spin_lock_bh(&msk->pm.lock); if (!mptcp_pm_should_add_signal_addr(msk)) { @@ -302,7 +307,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, - jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); + jiffies + timeout); spin_unlock_bh(&msk->pm.lock); @@ -344,6 +349,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; struct net *net = sock_net(sk); + unsigned int timeout; lockdep_assert_held(&msk->pm.lock); @@ -368,8 +374,9 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); reset_timer: - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); + timeout = mptcp_get_add_addr_timeout(net); + if (timeout) + sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); return true; } From f92199f551e617fae028c5c5905ddd63e3616e18 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Aug 2025 19:28:24 +0200 Subject: [PATCH 31/81] selftests: mptcp: disable add_addr retrans in endpoint_tests To prevent test instability in the "delete re-add signal" test caused by ADD_ADDR retransmissions, disable retransmissions for this test by setting net.mptcp.add_addr_timeout to 0. Suggested-by: Matthieu Baerts Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-6-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b8af65373b3a..82cae37d9c20 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3842,6 +3842,7 @@ endpoint_tests() # remove and re-add if reset_with_events "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=0 pm_nl_set_limits $ns1 0 3 pm_nl_set_limits $ns2 3 3 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal From 2eefbed30d46d5e68593baf6b52923e00e7678af Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Aug 2025 19:28:25 +0200 Subject: [PATCH 32/81] selftests: mptcp: connect: fix C23 extension warning GCC was complaining about the new label: mptcp_connect.c:187:2: warning: label followed by a declaration is a C23 extension [-Wc23-extensions] 187 | int err = getaddrinfo(node, service, hints, res); | ^ Simply declare 'err' before the label to avoid this warning. Fixes: a862771d1aa4 ("selftests: mptcp: use IPPROTO_MPTCP for getaddrinfo") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-7-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index ac1349c4b9e5..4f07ac9fa207 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -183,9 +183,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; From 3259889fd3c0cc165b7e9ee375c789875dd32326 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Aug 2025 19:28:26 +0200 Subject: [PATCH 33/81] selftests: mptcp: sockopt: fix C23 extension warning GCC was complaining about the new label: mptcp_inq.c:79:2: warning: label followed by a declaration is a C23 extension [-Wc23-extensions] 79 | int err = getaddrinfo(node, service, hints, res); | ^ mptcp_sockopt.c:166:2: warning: label followed by a declaration is a C23 extension [-Wc23-extensions] 166 | int err = getaddrinfo(node, service, hints, res); | ^ Simply declare 'err' before the label to avoid this warning. Fixes: dd367e81b79a ("selftests: mptcp: sockopt: use IPPROTO_MPTCP for getaddrinfo") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-8-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_inq.c | 5 +++-- tools/testing/selftests/net/mptcp/mptcp_sockopt.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_inq.c b/tools/testing/selftests/net/mptcp/mptcp_inq.c index 3cf1e2a612ce..f3bcaa48df8f 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_inq.c +++ b/tools/testing/selftests/net/mptcp/mptcp_inq.c @@ -75,9 +75,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index 9934a68df237..e934dd26a59d 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -162,9 +162,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; From 89eb9a62aed77b409663ba1eac152e8f758815b7 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 15 Aug 2025 22:18:09 +0200 Subject: [PATCH 34/81] net: dsa: b53: fix reserved register access in b53_fdb_dump() When BCM5325 support was added in c45655386e53 ("net: dsa: b53: add support for FDB operations on 5325/5365"), the register used for ARL access was made conditional on the chip. But in b53_fdb_dump(), instead of the register argument the page argument was replaced, causing it to write to a reserved page 0x50 on !BCM5325*. Writing to this page seems to completely lock the switch up: [ 89.680000] b53-switch spi0.1 lan2: Link is Down [ 89.680000] WARNING: CPU: 1 PID: 26 at drivers/net/phy/phy.c:1350 _phy_state_machine+0x1bc/0x454 [ 89.720000] phy_check_link_status+0x0/0x114: returned: -5 [ 89.730000] Modules linked in: nft_fib_inet nf_flow_table_inet nft_reject_ipv6 nft_reject_ipv4 nft_reject_inet nft_reject nft_redir nft_quota nft_numgen nft_nat nft_masq nft_log nft_limit nft_hash nft_flow_offload nft_fib_ipv6 nft_fib_ipv4 nft_fib nft_ct nft_chain_nat nf_tables nf_nat nf_flow_table nf_conntrack nfnetlink nf_reject_ipv6 nf_reject_ipv4 nf_log_syslog nf_defrag_ipv6 nf_defrag_ipv4 cls_flower sch_tbf sch_ingress sch_htb sch_hfsc em_u32 cls_u32 cls_route cls_matchall cls_fw cls_flow cls_basic act_skbedit act_mirred act_gact vrf md5 crc32c_cryptoapi [ 89.780000] CPU: 1 UID: 0 PID: 26 Comm: kworker/u10:0 Tainted: G W 6.16.0-rc1+ #0 NONE [ 89.780000] Tainted: [W]=WARN [ 89.780000] Hardware name: Netgear DGND3700 v1 [ 89.780000] Workqueue: events_power_efficient phy_state_machine [ 89.780000] Stack : 809c762c 8006b050 00000001 820a9ce3 0000114c 000affff 805d22d0 8200ba00 [ 89.780000] 82005000 6576656e 74735f70 6f776572 5f656666 10008b00 820a9cb8 82088700 [ 89.780000] 00000000 00000000 809c762c 820a9a98 00000000 00000000 ffffefff 80a7a76c [ 89.780000] 80a70000 820a9af8 80a70000 80a70000 80a70000 00000000 809c762c 820a9dd4 [ 89.780000] 00000000 805d1494 80a029e4 80a70000 00000003 00000000 00000004 81a60004 [ 89.780000] ... [ 89.780000] Call Trace: [ 89.780000] [<800228b8>] show_stack+0x38/0x118 [ 89.780000] [<8001afc4>] dump_stack_lvl+0x6c/0xac [ 89.780000] [<80046b90>] __warn+0x9c/0x114 [ 89.780000] [<80046da8>] warn_slowpath_fmt+0x1a0/0x1b0 [ 89.780000] [<805d1494>] _phy_state_machine+0x1bc/0x454 [ 89.780000] [<805d22fc>] phy_state_machine+0x2c/0x70 [ 89.780000] [<80066b08>] process_one_work+0x1e8/0x3e0 [ 89.780000] [<80067a1c>] worker_thread+0x354/0x4e4 [ 89.780000] [<800706cc>] kthread+0x130/0x274 [ 89.780000] [<8001d808>] ret_from_kernel_thread+0x14/0x1c And any further accesses fail: [ 120.790000] b53-switch spi0.1: timeout waiting for ARL to finish: 0x81 [ 120.800000] b53-switch spi0.1: port 2 failed to add 2c:b0:5d:27:9a:bd vid 3 to fdb: -145 [ 121.010000] b53-switch spi0.1: timeout waiting for ARL to finish: 0xbf [ 121.020000] b53-switch spi0.1: port 3 failed to add 2c:b0:5d:27:9a:bd vid 3 to fdb: -145 Restore the correct page B53_ARLIO_PAGE again, and move the offset argument to the correct place. *On BCM5325, this became a write to the MIB page of Port 1. Still a reserved offset, but likely less brokenness from that write. Fixes: c45655386e53 ("net: dsa: b53: add support for FDB operations on 5325/5365") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250815201809.549195-1-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 9942fb6f7f4b..829b1f087e9e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2078,7 +2078,7 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, /* Start search operation */ reg = ARL_SRCH_STDN; - b53_write8(priv, offset, B53_ARL_SRCH_CTL, reg); + b53_write8(priv, B53_ARLIO_PAGE, offset, reg); do { ret = b53_arl_search_wait(priv); From 4611d88a37cfc18cbabc6978aaf7325d1ae3f53a Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 16 Aug 2025 11:38:50 -0700 Subject: [PATCH 35/81] bnxt_en: Fix lockdep warning during rmmod The commit under the Fixes tag added a netdev_assert_locked() in bnxt_free_ntp_fltrs(). The lock should be held during normal run-time but the assert will be triggered (see below) during bnxt_remove_one() which should not need the lock. The netdev is already unregistered by then. Fix it by calling netdev_assert_locked_or_invisible() which will not assert if the netdev is unregistered. WARNING: CPU: 5 PID: 2241 at ./include/net/netdev_lock.h:17 bnxt_free_ntp_fltrs+0xf8/0x100 [bnxt_en] Modules linked in: rpcrdma rdma_cm iw_cm ib_cm configfs ib_core bnxt_en(-) bridge stp llc x86_pkg_temp_thermal xfs tg3 [last unloaded: bnxt_re] CPU: 5 UID: 0 PID: 2241 Comm: rmmod Tainted: G S W 6.16.0 #2 PREEMPT(voluntary) Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/17/2017 RIP: 0010:bnxt_free_ntp_fltrs+0xf8/0x100 [bnxt_en] Code: 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 48 8b 47 60 be ff ff ff ff 48 8d b8 28 0c 00 00 e8 d0 cf 41 c3 85 c0 0f 85 2e ff ff ff <0f> 0b e9 27 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffa92082387da0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff9e5b593d8000 RCX: 0000000000000001 RDX: 0000000000000001 RSI: ffffffff83dc9a70 RDI: ffffffff83e1a1cf RBP: ffff9e5b593d8c80 R08: 0000000000000000 R09: ffffffff8373a2b3 R10: 000000008100009f R11: 0000000000000001 R12: 0000000000000001 R13: ffffffffc01c4478 R14: dead000000000122 R15: dead000000000100 FS: 00007f3a8a52c740(0000) GS:ffff9e631ad1c000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bb289419c8 CR3: 000000011274e001 CR4: 00000000003706f0 Call Trace: bnxt_remove_one+0x57/0x180 [bnxt_en] pci_device_remove+0x39/0xc0 device_release_driver_internal+0xa5/0x130 driver_detach+0x42/0x90 bus_remove_driver+0x61/0xc0 pci_unregister_driver+0x38/0x90 bnxt_exit+0xc/0x7d0 [bnxt_en] Fixes: 004b5008016a ("eth: bnxt: remove most dependencies on RTNL") Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250816183850.4125033-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2800a90fba1f..207a8bb36ae5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5332,7 +5332,7 @@ static void bnxt_free_ntp_fltrs(struct bnxt *bp, bool all) { int i; - netdev_assert_locked(bp->dev); + netdev_assert_locked_or_invisible(bp->dev); /* Under netdev instance lock and all our NAPIs have been disabled. * It's safe to delete the hash table. From 62c30c544359aa18b8fb2734166467a07d435c2d Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Thu, 14 Aug 2025 09:25:57 +0800 Subject: [PATCH 36/81] net: ethernet: mtk_ppe: add RCU lock around dev_fill_forward_path Ensure ndo_fill_forward_path() is called with RCU lock held. Fixes: 2830e314778d ("net: ethernet: mtk-ppe: fix traffic offload with bridged wlan") Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20250814012559.3705-1-dqfext@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_ppe_offload.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index c855fb799ce1..e9bd32741983 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -101,7 +101,9 @@ mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_i if (!IS_ENABLED(CONFIG_NET_MEDIATEK_SOC_WED)) return -1; + rcu_read_lock(); err = dev_fill_forward_path(dev, addr, &stack); + rcu_read_unlock(); if (err) return err; From 0417adf367a0af11adf7ace849af4638cfb573f7 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Thu, 14 Aug 2025 09:25:58 +0800 Subject: [PATCH 37/81] ppp: fix race conditions in ppp_fill_forward_path ppp_fill_forward_path() has two race conditions: 1. The ppp->channels list can change between list_empty() and list_first_entry(), as ppp_lock() is not held. If the only channel is deleted in ppp_disconnect_channel(), list_first_entry() may access an empty head or a freed entry, and trigger a panic. 2. pch->chan can be NULL. When ppp_unregister_channel() is called, pch->chan is set to NULL before pch is removed from ppp->channels. Fix these by using a lockless RCU approach: - Use list_first_or_null_rcu() to safely test and access the first list entry. - Convert list modifications on ppp->channels to their RCU variants and add synchronize_net() after removal. - Check for a NULL pch->chan before dereferencing it. Fixes: f6efc675c9dd ("net: ppp: resolve forwarding path for bridge pppoe devices") Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20250814012559.3705-2-dqfext@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ppp/ppp_generic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 8c98cbd4b06d..824c8dc4120b 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1598,11 +1599,14 @@ static int ppp_fill_forward_path(struct net_device_path_ctx *ctx, if (ppp->flags & SC_MULTILINK) return -EOPNOTSUPP; - if (list_empty(&ppp->channels)) + pch = list_first_or_null_rcu(&ppp->channels, struct channel, clist); + if (!pch) + return -ENODEV; + + chan = READ_ONCE(pch->chan); + if (!chan) return -ENODEV; - pch = list_first_entry(&ppp->channels, struct channel, clist); - chan = pch->chan; if (!chan->ops->fill_forward_path) return -EOPNOTSUPP; @@ -2994,7 +2998,7 @@ ppp_unregister_channel(struct ppp_channel *chan) */ down_write(&pch->chan_sem); spin_lock_bh(&pch->downl); - pch->chan = NULL; + WRITE_ONCE(pch->chan, NULL); spin_unlock_bh(&pch->downl); up_write(&pch->chan_sem); ppp_disconnect_channel(pch); @@ -3515,7 +3519,7 @@ ppp_connect_channel(struct channel *pch, int unit) hdrlen = pch->file.hdrlen + 2; /* for protocol bytes */ if (hdrlen > ppp->dev->hard_header_len) ppp->dev->hard_header_len = hdrlen; - list_add_tail(&pch->clist, &ppp->channels); + list_add_tail_rcu(&pch->clist, &ppp->channels); ++ppp->n_channels; pch->ppp = ppp; refcount_inc(&ppp->file.refcnt); @@ -3545,10 +3549,11 @@ ppp_disconnect_channel(struct channel *pch) if (ppp) { /* remove it from the ppp unit's list */ ppp_lock(ppp); - list_del(&pch->clist); + list_del_rcu(&pch->clist); if (--ppp->n_channels == 0) wake_up_interruptible(&ppp->file.rwait); ppp_unlock(ppp); + synchronize_net(); if (refcount_dec_and_test(&ppp->file.refcnt)) ppp_destroy_interface(ppp); err = 0; From 01792bc3e5bdafa171dd83c7073f00e7de93a653 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Thu, 14 Aug 2025 16:21:06 +0530 Subject: [PATCH 38/81] net: ti: icssg-prueth: Fix HSR and switch offload Enablement during firwmare reload. To enable HSR / Switch offload, certain configurations are needed. Currently they are done inside icssg_change_mode(). This function only gets called if we move from one mode to another without bringing the links up / down. Once in HSR / Switch mode, if we bring the links down and bring it back up again. The callback sequence is, - emac_ndo_stop() Firmwares are stopped - emac_ndo_open() Firmwares are loaded In this path icssg_change_mode() doesn't get called and as a result the configurations needed for HSR / Switch is not done. To fix this, put all these configurations in a separate function icssg_enable_fw_offload() and call this from both icssg_change_mode() and emac_ndo_open() Fixes: 56375086d093 ("net: ti: icssg-prueth: Enable HSR Tx duplication, Tx Tag and Rx Tag offload") Signed-off-by: MD Danish Anwar Link: https://patch.msgid.link/20250814105106.1491871-1-danishanwar@ti.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 72 +++++++++++--------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 6c7d776ae4ee..dadce6009791 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -203,6 +203,44 @@ static void prueth_emac_stop(struct prueth *prueth) } } +static void icssg_enable_fw_offload(struct prueth *prueth) +{ + struct prueth_emac *emac; + int mac; + + for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { + emac = prueth->emac[mac]; + if (prueth->is_hsr_offload_mode) { + if (emac->ndev->features & NETIF_F_HW_HSR_TAG_RM) + icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_ENABLE); + else + icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_DISABLE); + } + + if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) { + if (netif_running(emac->ndev)) { + icssg_fdb_add_del(emac, eth_stp_addr, prueth->default_vlan, + ICSSG_FDB_ENTRY_P0_MEMBERSHIP | + ICSSG_FDB_ENTRY_P1_MEMBERSHIP | + ICSSG_FDB_ENTRY_P2_MEMBERSHIP | + ICSSG_FDB_ENTRY_BLOCK, + true); + icssg_vtbl_modify(emac, emac->port_vlan | DEFAULT_VID, + BIT(emac->port_id) | DEFAULT_PORT_MASK, + BIT(emac->port_id) | DEFAULT_UNTAG_MASK, + true); + if (prueth->is_hsr_offload_mode) + icssg_vtbl_modify(emac, DEFAULT_VID, + DEFAULT_PORT_MASK, + DEFAULT_UNTAG_MASK, true); + icssg_set_pvid(prueth, emac->port_vlan, emac->port_id); + if (prueth->is_switch_mode) + icssg_set_port_state(emac, ICSSG_EMAC_PORT_VLAN_AWARE_ENABLE); + } + } + } +} + static int prueth_emac_common_start(struct prueth *prueth) { struct prueth_emac *emac; @@ -753,6 +791,7 @@ static int emac_ndo_open(struct net_device *ndev) ret = prueth_emac_common_start(prueth); if (ret) goto free_rx_irq; + icssg_enable_fw_offload(prueth); } flow_cfg = emac->dram.va + ICSSG_CONFIG_OFFSET + PSI_L_REGULAR_FLOW_ID_BASE_OFFSET; @@ -1360,8 +1399,7 @@ static int prueth_emac_restart(struct prueth *prueth) static void icssg_change_mode(struct prueth *prueth) { - struct prueth_emac *emac; - int mac, ret; + int ret; ret = prueth_emac_restart(prueth); if (ret) { @@ -1369,35 +1407,7 @@ static void icssg_change_mode(struct prueth *prueth) return; } - for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { - emac = prueth->emac[mac]; - if (prueth->is_hsr_offload_mode) { - if (emac->ndev->features & NETIF_F_HW_HSR_TAG_RM) - icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_ENABLE); - else - icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_DISABLE); - } - - if (netif_running(emac->ndev)) { - icssg_fdb_add_del(emac, eth_stp_addr, prueth->default_vlan, - ICSSG_FDB_ENTRY_P0_MEMBERSHIP | - ICSSG_FDB_ENTRY_P1_MEMBERSHIP | - ICSSG_FDB_ENTRY_P2_MEMBERSHIP | - ICSSG_FDB_ENTRY_BLOCK, - true); - icssg_vtbl_modify(emac, emac->port_vlan | DEFAULT_VID, - BIT(emac->port_id) | DEFAULT_PORT_MASK, - BIT(emac->port_id) | DEFAULT_UNTAG_MASK, - true); - if (prueth->is_hsr_offload_mode) - icssg_vtbl_modify(emac, DEFAULT_VID, - DEFAULT_PORT_MASK, - DEFAULT_UNTAG_MASK, true); - icssg_set_pvid(prueth, emac->port_vlan, emac->port_id); - if (prueth->is_switch_mode) - icssg_set_port_state(emac, ICSSG_EMAC_PORT_VLAN_AWARE_ENABLE); - } - } + icssg_enable_fw_offload(prueth); } static int prueth_netdevice_port_link(struct net_device *ndev, From 4a73a36cb704813f588af13d9842d0ba5a185758 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 14 Aug 2025 17:42:14 +0200 Subject: [PATCH 39/81] cdc_ncm: Flag Intel OEM version of Fibocom L850-GL as WWAN This lets NetworkManager/ModemManager know that this is a modem and needs to be connected first. Signed-off-by: Lubomir Rintel Link: https://patch.msgid.link/20250814154214.250103-1-lkundrak@v3.sk Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index ea0e5e276cd6..5d123df0a866 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -2087,6 +2087,13 @@ static const struct usb_device_id cdc_devs[] = { .driver_info = (unsigned long)&wwan_info, }, + /* Intel modem (label from OEM reads Fibocom L850-GL) */ + { USB_DEVICE_AND_INTERFACE_INFO(0x8087, 0x095a, + USB_CLASS_COMM, + USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, + }, + /* DisplayLink docking stations */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, From f179f5bc158f07693b74c264f8933c8b0f07503f Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 15 Aug 2025 10:53:17 -0300 Subject: [PATCH 40/81] net/sched: sch_dualpi2: Run prob update timer in softirq to avoid deadlock When a user creates a dualpi2 qdisc it automatically sets a timer. This timer will run constantly and update the qdisc's probability field. The issue is that the timer acquires the qdisc root lock and runs in hardirq. The qdisc root lock is also acquired in dev.c whenever a packet arrives for this qdisc. Since the dualpi2 timer callback runs in hardirq, it may interrupt the packet processing running in softirq. If that happens and it runs on the same CPU, it will acquire the same lock and cause a deadlock. The following splat shows up when running a kernel compiled with lock debugging: [ +0.000224] WARNING: inconsistent lock state [ +0.000224] 6.16.0+ #10 Not tainted [ +0.000169] -------------------------------- [ +0.000029] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. [ +0.000000] ping/156 [HC0[0]:SC0[2]:HE1:SE0] takes: [ +0.000000] ffff897841242110 (&sch->root_lock_key){?.-.}-{3:3}, at: __dev_queue_xmit+0x86d/0x1140 [ +0.000000] {IN-HARDIRQ-W} state was registered at: [ +0.000000] lock_acquire.part.0+0xb6/0x220 [ +0.000000] _raw_spin_lock+0x31/0x80 [ +0.000000] dualpi2_timer+0x6f/0x270 [ +0.000000] __hrtimer_run_queues+0x1c5/0x360 [ +0.000000] hrtimer_interrupt+0x115/0x260 [ +0.000000] __sysvec_apic_timer_interrupt+0x6d/0x1a0 [ +0.000000] sysvec_apic_timer_interrupt+0x6e/0x80 [ +0.000000] asm_sysvec_apic_timer_interrupt+0x1a/0x20 [ +0.000000] pv_native_safe_halt+0xf/0x20 [ +0.000000] default_idle+0x9/0x10 [ +0.000000] default_idle_call+0x7e/0x1e0 [ +0.000000] do_idle+0x1e8/0x250 [ +0.000000] cpu_startup_entry+0x29/0x30 [ +0.000000] rest_init+0x151/0x160 [ +0.000000] start_kernel+0x6f3/0x700 [ +0.000000] x86_64_start_reservations+0x24/0x30 [ +0.000000] x86_64_start_kernel+0xc8/0xd0 [ +0.000000] common_startup_64+0x13e/0x148 [ +0.000000] irq event stamp: 6884 [ +0.000000] hardirqs last enabled at (6883): [] neigh_resolve_output+0x223/0x270 [ +0.000000] hardirqs last disabled at (6882): [] neigh_resolve_output+0x1e8/0x270 [ +0.000000] softirqs last enabled at (6880): [] neigh_resolve_output+0x1db/0x270 [ +0.000000] softirqs last disabled at (6884): [] __dev_queue_xmit+0x73/0x1140 [ +0.000000] other info that might help us debug this: [ +0.000000] Possible unsafe locking scenario: [ +0.000000] CPU0 [ +0.000000] ---- [ +0.000000] lock(&sch->root_lock_key); [ +0.000000] [ +0.000000] lock(&sch->root_lock_key); [ +0.000000] *** DEADLOCK *** [ +0.000000] 4 locks held by ping/156: [ +0.000000] #0: ffff897842332e08 (sk_lock-AF_INET){+.+.}-{0:0}, at: raw_sendmsg+0x41e/0xf40 [ +0.000000] #1: ffffffffa816f880 (rcu_read_lock){....}-{1:3}, at: ip_output+0x2c/0x190 [ +0.000000] #2: ffffffffa816f880 (rcu_read_lock){....}-{1:3}, at: ip_finish_output2+0xad/0x950 [ +0.000000] #3: ffffffffa816f840 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x73/0x1140 I am able to reproduce it consistently when running the following: tc qdisc add dev lo handle 1: root dualpi2 ping -f 127.0.0.1 To fix it, make the timer run in softirq. Fixes: 320d031ad6e4 ("sched: Struct definition and parsing of dualpi2 qdisc") Reviewed-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20250815135317.664993-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_dualpi2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 845375ebd4ea..4b975feb52b1 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -927,7 +927,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, q->sch = sch; dualpi2_reset_default(sch); - hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); if (opt && nla_len(opt)) { err = dualpi2_change(sch, opt, extack); @@ -937,7 +938,7 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, } hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS_PINNED_SOFT); return 0; } From bc1a59cff9f797bfbf8f3104507584d89e9ecf2e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 18 Aug 2025 10:10:29 +0200 Subject: [PATCH 41/81] phy: mscc: Fix timestamping for vsc8584 There was a problem when we received frames and the frames were timestamped. The driver is configured to store the nanosecond part of the timestmap in the ptp reserved bits and it would take the second part by reading the LTC. The problem is that when reading the LTC we are in atomic context and to read the second part will go over mdio bus which might sleep, so we get an error. The fix consists in actually put all the frames in a queue and start the aux work and in that work to read the LTC and then calculate the full received time. Fixes: 7d272e63e0979d ("net: phy: mscc: timestamping and PHC support") Signed-off-by: Horatiu Vultur Reviewed-by: Vadim Fedorenko Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250818081029.1300780-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mscc/mscc.h | 12 ++++++++ drivers/net/phy/mscc/mscc_main.c | 12 ++++++++ drivers/net/phy/mscc/mscc_ptp.c | 49 ++++++++++++++++++++++++-------- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h index 6a3d8a754eb8..58c6d47fbe04 100644 --- a/drivers/net/phy/mscc/mscc.h +++ b/drivers/net/phy/mscc/mscc.h @@ -362,6 +362,13 @@ struct vsc85xx_hw_stat { u16 mask; }; +struct vsc8531_skb_cb { + u32 ns; +}; + +#define VSC8531_SKB_CB(skb) \ + ((struct vsc8531_skb_cb *)((skb)->cb)) + struct vsc8531_private { int rate_magic; u16 supp_led_modes; @@ -410,6 +417,11 @@ struct vsc8531_private { */ struct mutex ts_lock; struct mutex phc_lock; + + /* list of skbs that were received and need timestamp information but it + * didn't received it yet + */ + struct sk_buff_head rx_skbs_list; }; /* Shared structure between the PHYs of the same package. diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index 7ed6522fb0ef..f1c9ce351ab4 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -2335,6 +2335,13 @@ static int vsc85xx_probe(struct phy_device *phydev) return vsc85xx_dt_led_modes_get(phydev, default_mode); } +static void vsc85xx_remove(struct phy_device *phydev) +{ + struct vsc8531_private *priv = phydev->priv; + + skb_queue_purge(&priv->rx_skbs_list); +} + /* Microsemi VSC85xx PHYs */ static struct phy_driver vsc85xx_driver[] = { { @@ -2589,6 +2596,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8574_probe, .set_wol = &vsc85xx_wol_set, .get_wol = &vsc85xx_wol_get, @@ -2614,6 +2622,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8574_probe, .set_wol = &vsc85xx_wol_set, .get_wol = &vsc85xx_wol_get, @@ -2639,6 +2648,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8584_probe, .get_tunable = &vsc85xx_get_tunable, .set_tunable = &vsc85xx_set_tunable, @@ -2662,6 +2672,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8584_probe, .get_tunable = &vsc85xx_get_tunable, .set_tunable = &vsc85xx_set_tunable, @@ -2685,6 +2696,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8584_probe, .get_tunable = &vsc85xx_get_tunable, .set_tunable = &vsc85xx_set_tunable, diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c index 275706de5847..de6c7312e8f2 100644 --- a/drivers/net/phy/mscc/mscc_ptp.c +++ b/drivers/net/phy/mscc/mscc_ptp.c @@ -1194,9 +1194,7 @@ static bool vsc85xx_rxtstamp(struct mii_timestamper *mii_ts, { struct vsc8531_private *vsc8531 = container_of(mii_ts, struct vsc8531_private, mii_ts); - struct skb_shared_hwtstamps *shhwtstamps = NULL; struct vsc85xx_ptphdr *ptphdr; - struct timespec64 ts; unsigned long ns; if (!vsc8531->ptp->configured) @@ -1206,27 +1204,52 @@ static bool vsc85xx_rxtstamp(struct mii_timestamper *mii_ts, type == PTP_CLASS_NONE) return false; - vsc85xx_gettime(&vsc8531->ptp->caps, &ts); - ptphdr = get_ptp_header_rx(skb, vsc8531->ptp->rx_filter); if (!ptphdr) return false; - shhwtstamps = skb_hwtstamps(skb); - memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); - ns = ntohl(ptphdr->rsrvd2); - /* nsec is in reserved field */ - if (ts.tv_nsec < ns) - ts.tv_sec--; + VSC8531_SKB_CB(skb)->ns = ns; + skb_queue_tail(&vsc8531->rx_skbs_list, skb); - shhwtstamps->hwtstamp = ktime_set(ts.tv_sec, ns); - netif_rx(skb); + ptp_schedule_worker(vsc8531->ptp->ptp_clock, 0); return true; } +static long vsc85xx_do_aux_work(struct ptp_clock_info *info) +{ + struct vsc85xx_ptp *ptp = container_of(info, struct vsc85xx_ptp, caps); + struct skb_shared_hwtstamps *shhwtstamps = NULL; + struct phy_device *phydev = ptp->phydev; + struct vsc8531_private *priv = phydev->priv; + struct sk_buff_head received; + struct sk_buff *rx_skb; + struct timespec64 ts; + unsigned long flags; + + __skb_queue_head_init(&received); + spin_lock_irqsave(&priv->rx_skbs_list.lock, flags); + skb_queue_splice_tail_init(&priv->rx_skbs_list, &received); + spin_unlock_irqrestore(&priv->rx_skbs_list.lock, flags); + + vsc85xx_gettime(info, &ts); + while ((rx_skb = __skb_dequeue(&received)) != NULL) { + shhwtstamps = skb_hwtstamps(rx_skb); + memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); + + if (ts.tv_nsec < VSC8531_SKB_CB(rx_skb)->ns) + ts.tv_sec--; + + shhwtstamps->hwtstamp = ktime_set(ts.tv_sec, + VSC8531_SKB_CB(rx_skb)->ns); + netif_rx(rx_skb); + } + + return -1; +} + static const struct ptp_clock_info vsc85xx_clk_caps = { .owner = THIS_MODULE, .name = "VSC85xx timer", @@ -1240,6 +1263,7 @@ static const struct ptp_clock_info vsc85xx_clk_caps = { .adjfine = &vsc85xx_adjfine, .gettime64 = &vsc85xx_gettime, .settime64 = &vsc85xx_settime, + .do_aux_work = &vsc85xx_do_aux_work, }; static struct vsc8531_private *vsc8584_base_priv(struct phy_device *phydev) @@ -1567,6 +1591,7 @@ int vsc8584_ptp_probe(struct phy_device *phydev) mutex_init(&vsc8531->phc_lock); mutex_init(&vsc8531->ts_lock); + skb_queue_head_init(&vsc8531->rx_skbs_list); /* Retrieve the shared load/save GPIO. Request it as non exclusive as * the same GPIO can be requested by all the PHYs of the same package. From 24ef2f53c07f273bad99173e27ee88d44d135b1c Mon Sep 17 00:00:00 2001 From: Yuichiro Tsuji Date: Mon, 18 Aug 2025 17:45:07 +0900 Subject: [PATCH 42/81] net: usb: asix_devices: Fix PHY address mask in MDIO bus initialization Syzbot reported shift-out-of-bounds exception on MDIO bus initialization. The PHY address should be masked to 5 bits (0-31). Without this mask, invalid PHY addresses could be used, potentially causing issues with MDIO bus operations. Fix this by masking the PHY address with 0x1f (31 decimal) to ensure it stays within the valid range. Fixes: 4faff70959d5 ("net: usb: asix_devices: add phy_mask for ax88772 mdio bus") Reported-by: syzbot+20537064367a0f98d597@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=20537064367a0f98d597 Tested-by: syzbot+20537064367a0f98d597@syzkaller.appspotmail.com Signed-off-by: Yuichiro Tsuji Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250818084541.1958-1-yuichtsu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix_devices.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index d9f5942ccc44..792ddda1ad49 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -676,7 +676,7 @@ static int ax88772_init_mdio(struct usbnet *dev) priv->mdio->read = &asix_mdio_bus_read; priv->mdio->write = &asix_mdio_bus_write; priv->mdio->name = "Asix MDIO Bus"; - priv->mdio->phy_mask = ~(BIT(priv->phy_addr) | BIT(AX_EMBD_PHY_ADDR)); + priv->mdio->phy_mask = ~(BIT(priv->phy_addr & 0x1f) | BIT(AX_EMBD_PHY_ADDR)); /* mii bus name is usb-- */ snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "usb-%03d:%03d", dev->udev->bus->busnum, dev->udev->devnum); From 75a9a46d67f46d608205888f9b34e315c1786345 Mon Sep 17 00:00:00 2001 From: Jordan Rhee Date: Mon, 18 Aug 2025 14:12:45 -0700 Subject: [PATCH 43/81] gve: prevent ethtool ops after shutdown A crash can occur if an ethtool operation is invoked after shutdown() is called. shutdown() is invoked during system shutdown to stop DMA operations without performing expensive deallocations. It is discouraged to unregister the netdev in this path, so the device may still be visible to userspace and kernel helpers. In gve, shutdown() tears down most internal data structures. If an ethtool operation is dispatched after shutdown(), it will dereference freed or NULL pointers, leading to a kernel panic. While graceful shutdown normally quiesces userspace before invoking the reboot syscall, forced shutdowns (as observed on GCP VMs) can still trigger this path. Fix by calling netif_device_detach() in shutdown(). This marks the device as detached so the ethtool ioctl handler will skip dispatching operations to the driver. Fixes: 974365e51861 ("gve: Implement suspend/resume/shutdown") Signed-off-by: Jordan Rhee Signed-off-by: Jeroen de Borst Link: https://patch.msgid.link/20250818211245.1156919-1-jeroendb@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 1f411d7c4373..1be1b1ef31ee 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2870,6 +2870,8 @@ static void gve_shutdown(struct pci_dev *pdev) struct gve_priv *priv = netdev_priv(netdev); bool was_up = netif_running(priv->dev); + netif_device_detach(netdev); + rtnl_lock(); netdev_lock(netdev); if (was_up && gve_close(priv->dev)) { From 6d6714bf0c4e8eb2274081b4b023dfa01581c123 Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Fri, 15 Aug 2025 10:48:03 +0000 Subject: [PATCH 44/81] net: stmmac: thead: Enable TX clock before MAC initialization The clk_tx_i clock must be supplied to the MAC for successful initialization. On TH1520 SoC, the clock is provided by an internal divider configured through GMAC_PLLCLK_DIV register when using RGMII interface. However, currently we don't setup the divider before initialization of the MAC, resulting in DMA reset failures if the bootloader/firmware doesn't enable the divider, [ 7.839601] thead-dwmac ffe7060000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0 [ 7.938338] thead-dwmac ffe7060000.ethernet eth0: PHY [stmmac-0:02] driver [RTL8211F Gigabit Ethernet] (irq=POLL) [ 8.160746] thead-dwmac ffe7060000.ethernet eth0: Failed to reset the dma [ 8.170118] thead-dwmac ffe7060000.ethernet eth0: stmmac_hw_setup: DMA engine initialization failed [ 8.179384] thead-dwmac ffe7060000.ethernet eth0: __stmmac_open: Hw setup failed Let's simply write GMAC_PLLCLK_DIV_EN to GMAC_PLLCLK_DIV to enable the divider before MAC initialization. Note that for reconfiguring the divisor, the divider must be disabled first and re-enabled later to make sure the new divisor take effect. The exact clock rate doesn't affect MAC's initialization according to my test. It's set to the speed required by RGMII when the linkspeed is 1Gbps and could be reclocked later after link is up if necessary. Fixes: 33a1a01e3afa ("net: stmmac: Add glue layer for T-HEAD TH1520 SoC") Signed-off-by: Yao Zi Reviewed-by: Drew Fustini Link: https://patch.msgid.link/20250815104803.55294-1-ziyao@disroot.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c index f2946bea0bc2..6c6c49e4b66f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c @@ -152,7 +152,7 @@ static int thead_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, static int thead_dwmac_enable_clk(struct plat_stmmacenet_data *plat) { struct thead_dwmac *dwmac = plat->bsp_priv; - u32 reg; + u32 reg, div; switch (plat->mac_interface) { case PHY_INTERFACE_MODE_MII: @@ -164,6 +164,13 @@ static int thead_dwmac_enable_clk(struct plat_stmmacenet_data *plat) case PHY_INTERFACE_MODE_RGMII_RXID: case PHY_INTERFACE_MODE_RGMII_TXID: /* use pll */ + div = clk_get_rate(plat->stmmac_clk) / rgmii_clock(SPEED_1000); + reg = FIELD_PREP(GMAC_PLLCLK_DIV_EN, 1) | + FIELD_PREP(GMAC_PLLCLK_DIV_NUM, div); + + writel(0, dwmac->apb_base + GMAC_PLLCLK_DIV); + writel(reg, dwmac->apb_base + GMAC_PLLCLK_DIV); + writel(GMAC_GTXCLK_SEL_PLL, dwmac->apb_base + GMAC_GTXCLK_SEL); reg = GMAC_TX_CLK_EN | GMAC_TX_CLK_N_EN | GMAC_TX_CLK_OUT_EN | GMAC_RX_CLK_EN | GMAC_RX_CLK_N_EN; From d9cef55ed49117bd63695446fb84b4b91815c0b4 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 18 Aug 2025 13:46:18 +0800 Subject: [PATCH 45/81] net/smc: fix UAF on smcsk after smc_listen_out() BPF CI testing report a UAF issue: [ 16.446633] BUG: kernel NULL pointer dereference, address: 000000000000003 0 [ 16.447134] #PF: supervisor read access in kernel mod e [ 16.447516] #PF: error_code(0x0000) - not-present pag e [ 16.447878] PGD 0 P4D 0 [ 16.448063] Oops: Oops: 0000 [#1] PREEMPT SMP NOPT I [ 16.448409] CPU: 0 UID: 0 PID: 9 Comm: kworker/0:1 Tainted: G OE 6.13.0-rc3-g89e8a75fda73-dirty #4 2 [ 16.449124] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODUL E [ 16.449502] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/201 4 [ 16.450201] Workqueue: smc_hs_wq smc_listen_wor k [ 16.450531] RIP: 0010:smc_listen_work+0xc02/0x159 0 [ 16.452158] RSP: 0018:ffffb5ab40053d98 EFLAGS: 0001024 6 [ 16.452526] RAX: 0000000000000001 RBX: 0000000000000002 RCX: 000000000000030 0 [ 16.452994] RDX: 0000000000000280 RSI: 00003513840053f0 RDI: 000000000000000 0 [ 16.453492] RBP: ffffa097808e3800 R08: ffffa09782dba1e0 R09: 000000000000000 5 [ 16.453987] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0978274640 0 [ 16.454497] R13: 0000000000000000 R14: 0000000000000000 R15: ffffa09782d4092 0 [ 16.454996] FS: 0000000000000000(0000) GS:ffffa097bbc00000(0000) knlGS:000000000000000 0 [ 16.455557] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003 3 [ 16.455961] CR2: 0000000000000030 CR3: 0000000102788004 CR4: 0000000000770ef 0 [ 16.456459] PKRU: 5555555 4 [ 16.456654] Call Trace : [ 16.456832] [ 16.456989] ? __die+0x23/0x7 0 [ 16.457215] ? page_fault_oops+0x180/0x4c 0 [ 16.457508] ? __lock_acquire+0x3e6/0x249 0 [ 16.457801] ? exc_page_fault+0x68/0x20 0 [ 16.458080] ? asm_exc_page_fault+0x26/0x3 0 [ 16.458389] ? smc_listen_work+0xc02/0x159 0 [ 16.458689] ? smc_listen_work+0xc02/0x159 0 [ 16.458987] ? lock_is_held_type+0x8f/0x10 0 [ 16.459284] process_one_work+0x1ea/0x6d 0 [ 16.459570] worker_thread+0x1c3/0x38 0 [ 16.459839] ? __pfx_worker_thread+0x10/0x1 0 [ 16.460144] kthread+0xe0/0x11 0 [ 16.460372] ? __pfx_kthread+0x10/0x1 0 [ 16.460640] ret_from_fork+0x31/0x5 0 [ 16.460896] ? __pfx_kthread+0x10/0x1 0 [ 16.461166] ret_from_fork_asm+0x1a/0x3 0 [ 16.461453] [ 16.461616] Modules linked in: bpf_testmod(OE) [last unloaded: bpf_testmod(OE) ] [ 16.462134] CR2: 000000000000003 0 [ 16.462380] ---[ end trace 0000000000000000 ]--- [ 16.462710] RIP: 0010:smc_listen_work+0xc02/0x1590 The direct cause of this issue is that after smc_listen_out_connected(), newclcsock->sk may be NULL since it will releases the smcsk. Therefore, if the application closes the socket immediately after accept, newclcsock->sk can be NULL. A possible execution order could be as follows: smc_listen_work | userspace ----------------------------------------------------------------- lock_sock(sk) | smc_listen_out_connected() | | \- smc_listen_out | | | \- release_sock | | |- sk->sk_data_ready() | | fd = accept(); | close(fd); | \- socket->sk = NULL; /* newclcsock->sk is NULL now */ SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk)) Since smc_listen_out_connected() will not fail, simply swapping the order of the code can easily fix this issue. Fixes: 3b2dec2603d5 ("net/smc: restructure client and server code in af_smc") Signed-off-by: D. Wythe Reviewed-by: Guangguan Wang Reviewed-by: Alexandra Winter Reviewed-by: Dust Li Link: https://patch.msgid.link/20250818054618.41615-1-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9311c38f7abe..e0e48f24cd61 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2568,8 +2568,9 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; } - smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); + /* smc_listen_out() will release smcsk */ + smc_listen_out_connected(new_smc); goto out_free; out_unlock: From 2462c1b9217246a889ec318b3894d84e4dd709c6 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:17 +0300 Subject: [PATCH 46/81] net/mlx5: HWS, fix bad parameter in CQ creation 'cqe_sz' valid value should be 0 for 64-byte CQE. Fixes: 2ca62599aa0b ("net/mlx5: HWS, added send engine and context handling") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index c4b22be19a9b..b0595c9b09e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -964,7 +964,6 @@ static int hws_send_ring_open_cq(struct mlx5_core_dev *mdev, return -ENOMEM; MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); - MLX5_SET(cqc, cqc_data, cqe_sz, queue->num_entries); MLX5_SET(cqc, cqc_data, log_cq_size, ilog2(queue->num_entries)); err = hws_send_ring_alloc_cq(mdev, numa_node, queue, cqc_data, cq); From 615b690612b7785ab8632f6a5a941550622e4e36 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:18 +0300 Subject: [PATCH 47/81] net/mlx5: HWS, fix simple rules rehash error flow Moving rules from matcher to matcher should not fail. However, if it does fail due to various reasons, the error flow should allow the kernel to continue functioning (albeit with broken steering rules) instead of going into series of soft lock-ups or some other problematic behaviour. This patch fixes the error flow for moving simple rules: - If new rule creation fails before it was even enqeued, do not poll for completion - If TIMEOUT happened while moving the rule, no point trying to poll for completions for other rules. Something is broken, completion won't come, just abort the rehash sequence. - If some other completion with error received, don't give up. Continue handling rest of the rules to minimize the damage. - Make sure that the first error code that was received will be actually returned to the caller instead of replacing it with the generic error code. All the aforementioned issues stem from the same bad error flow, so no point fixing them one by one and leaving partially broken code - fixing them in one patch. Fixes: ef94799a8741 ("net/mlx5: HWS, rework rehash loop") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/bwc.c | 61 +++++++++++++------ 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 92de4b761a83..0219a49b2326 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -74,9 +74,9 @@ static void hws_bwc_matcher_init_attr(struct mlx5hws_bwc_matcher *bwc_matcher, static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) { - bool move_error = false, poll_error = false, drain_error = false; struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_matcher *matcher = bwc_matcher->matcher; + int drain_error = 0, move_error = 0, poll_error = 0; u16 bwc_queues = mlx5hws_bwc_queues(ctx); struct mlx5hws_rule_attr rule_attr; struct mlx5hws_bwc_rule *bwc_rule; @@ -99,11 +99,15 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) ret = mlx5hws_matcher_resize_rule_move(matcher, bwc_rule->rule, &rule_attr); - if (unlikely(ret && !move_error)) { - mlx5hws_err(ctx, - "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n", - ret); - move_error = true; + if (unlikely(ret)) { + if (!move_error) { + mlx5hws_err(ctx, + "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n", + ret); + move_error = ret; + } + /* Rule wasn't queued, no need to poll */ + continue; } pending_rules++; @@ -111,11 +115,19 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) rule_attr.queue_id, &pending_rules, false); - if (unlikely(ret && !poll_error)) { - mlx5hws_err(ctx, - "Moving BWC rule: poll failed (%d), attempting to move rest of the rules\n", - ret); - poll_error = true; + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving BWC rule: timeout polling for completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!poll_error) { + mlx5hws_err(ctx, + "Moving BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n", + ret); + poll_error = ret; + } } } @@ -126,17 +138,30 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) rule_attr.queue_id, &pending_rules, true); - if (unlikely(ret && !drain_error)) { - mlx5hws_err(ctx, - "Moving BWC rule: drain failed (%d), attempting to move rest of the rules\n", - ret); - drain_error = true; + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving bwc rule: timeout draining completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!drain_error) { + mlx5hws_err(ctx, + "Moving bwc rule: drain failed (%d), attempting to move rest of the rules\n", + ret); + drain_error = ret; + } } } } - if (move_error || poll_error || drain_error) - ret = -EINVAL; + /* Return the first error that happened */ + if (unlikely(move_error)) + return move_error; + if (unlikely(poll_error)) + return poll_error; + if (unlikely(drain_error)) + return drain_error; return ret; } From 4a842b1bf18a32ee0c25dd6dd98728b786a76fe4 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:19 +0300 Subject: [PATCH 48/81] net/mlx5: HWS, fix complex rules rehash error flow Moving rules from matcher to matcher should not fail. However, if it does fail due to various reasons, the error flow should allow the kernel to continue functioning (albeit with broken steering rules) instead of going into series of soft lock-ups or some other problematic behaviour. Similar to the simple rules, complex rules rehash logic suffers from the same problems. This patch fixes the error flow for moving complex rules: - If new rule creation fails before it was even enqeued, do not poll for completion - If TIMEOUT happened while moving the rule, no point trying to poll for completions for other rules. Something is broken, completion won't come, just abort the rehash sequence. - If some other completion with error received, don't give up. Continue handling rest of the rules to minimize the damage. - Make sure that the first error code that was received will be actually returned to the caller instead of replacing it with the generic error code. All the aforementioned issues stem from the same bad error flow, so no point fixing them one by one and leaving partially broken code - fixing them in one patch. Fixes: 17e0accac577 ("net/mlx5: HWS, support complex matchers") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mlx5/core/steering/hws/bwc_complex.c | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c index ca7501c57468..14e79579c719 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c @@ -1328,11 +1328,11 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) { struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_matcher *matcher = bwc_matcher->matcher; - bool move_error = false, poll_error = false; u16 bwc_queues = mlx5hws_bwc_queues(ctx); struct mlx5hws_bwc_rule *tmp_bwc_rule; struct mlx5hws_rule_attr rule_attr; struct mlx5hws_table *isolated_tbl; + int move_error = 0, poll_error = 0; struct mlx5hws_rule *tmp_rule; struct list_head *rules_list; u32 expected_completions = 1; @@ -1391,11 +1391,15 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) ret = mlx5hws_matcher_resize_rule_move(matcher, tmp_rule, &rule_attr); - if (unlikely(ret && !move_error)) { - mlx5hws_err(ctx, - "Moving complex BWC rule failed (%d), attempting to move rest of the rules\n", - ret); - move_error = true; + if (unlikely(ret)) { + if (!move_error) { + mlx5hws_err(ctx, + "Moving complex BWC rule: move failed (%d), attempting to move rest of the rules\n", + ret); + move_error = ret; + } + /* Rule wasn't queued, no need to poll */ + continue; } expected_completions = 1; @@ -1403,11 +1407,19 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) rule_attr.queue_id, &expected_completions, true); - if (unlikely(ret && !poll_error)) { - mlx5hws_err(ctx, - "Moving complex BWC rule: poll failed (%d), attempting to move rest of the rules\n", - ret); - poll_error = true; + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving complex BWC rule: timeout polling for completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!poll_error) { + mlx5hws_err(ctx, + "Moving complex BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n", + ret); + poll_error = ret; + } } /* Done moving the rule to the new matcher, @@ -1422,8 +1434,11 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) } } - if (move_error || poll_error) - ret = -EINVAL; + /* Return the first error that happened */ + if (unlikely(move_error)) + return move_error; + if (unlikely(poll_error)) + return poll_error; return ret; } From 1a72298d27ce4d41b3fd405f6921e8711815767a Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:20 +0300 Subject: [PATCH 49/81] net/mlx5: HWS, prevent rehash from filling up the queues While moving the rules during rehash, CQ is not drained. The flush and drain happens only when all the rules of a certain queue have been moved. This behaviour can lead to accumulating large quantity of rules that haven't got their completion yet, and eventually will fill up the queue and will cause the rehash to fail. Fix this problem by requiring drain once the number of outstanding completions reaches a certain threshold. Fixes: ef94799a8741 ("net/mlx5: HWS, rework rehash loop") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 0219a49b2326..2a59be11fe55 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -84,6 +84,7 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) struct list_head *rules_list; u32 pending_rules; int i, ret = 0; + bool drain; mlx5hws_bwc_rule_fill_attr(bwc_matcher, 0, 0, &rule_attr); @@ -111,10 +112,12 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) } pending_rules++; + drain = pending_rules >= + hws_bwc_get_burst_th(ctx, rule_attr.queue_id); ret = mlx5hws_bwc_queue_poll(ctx, rule_attr.queue_id, &pending_rules, - false); + drain); if (unlikely(ret)) { if (ret == -ETIMEDOUT) { mlx5hws_err(ctx, From 7c60952f83584bc4950057cfed2cc3c87343b5db Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:21 +0300 Subject: [PATCH 50/81] net/mlx5: HWS, don't rehash on every kind of insertion failure If rule creation failed due to a full queue, due to timeout in polling for completion, or due to matcher being in resize, don't try to initiate rehash sequence - rehash would have failed anyway. Fixes: 2111bb970c78 ("net/mlx5: HWS, added backward-compatible API handling") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-6-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/bwc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 2a59be11fe55..adeccc588e5d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -1063,6 +1063,21 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, return 0; /* rule inserted successfully */ } + /* Rule insertion could fail due to queue being full, timeout, or + * matcher in resize. In such cases, no point in trying to rehash. + */ + if (ret == -EBUSY || ret == -ETIMEDOUT || ret == -EAGAIN) { + mutex_unlock(queue_lock); + mlx5hws_err(ctx, + "BWC rule insertion failed - %s (%d)\n", + ret == -EBUSY ? "queue is full" : + ret == -ETIMEDOUT ? "timeout" : + ret == -EAGAIN ? "matcher in resize" : "N/A", + ret); + hws_bwc_rule_cnt_dec(bwc_rule); + return ret; + } + /* At this point the rule wasn't added. * It could be because there was collision, or some other problem. * Try rehash by size and insert rule again - last chance. From 8a51507320ebddaab32610199774f69cd7d53e78 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Sun, 17 Aug 2025 23:23:22 +0300 Subject: [PATCH 51/81] net/mlx5: HWS, Fix table creation UID During table creation, caller passes a UID using ft_attr. The UID value was ignored, which leads to problems when the caller sets the UID to a non-zero value, such as SHARED_RESOURCE_UID (0xffff) - the internal FT objects will be created with UID=0. Fixes: 0869701cba3d ("net/mlx5: HWS, added FW commands handling") Signed-off-by: Alex Vesker Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-7-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/steering/hws/cmd.c | 1 + .../ethernet/mellanox/mlx5/core/steering/hws/cmd.h | 1 + .../mellanox/mlx5/core/steering/hws/fs_hws.c | 1 + .../mellanox/mlx5/core/steering/hws/matcher.c | 5 ++++- .../mellanox/mlx5/core/steering/hws/mlx5hws.h | 1 + .../mellanox/mlx5/core/steering/hws/table.c | 13 ++++++++++--- .../mellanox/mlx5/core/steering/hws/table.h | 3 ++- 7 files changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index 9c83753e4592..0bdcab2e5cf3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -55,6 +55,7 @@ int mlx5hws_cmd_flow_table_create(struct mlx5_core_dev *mdev, MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); MLX5_SET(create_flow_table_in, in, table_type, ft_attr->type); + MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid); ft_ctx = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context); MLX5_SET(flow_table_context, ft_ctx, level, ft_attr->level); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h index fa6bff210266..122ccc671628 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h @@ -36,6 +36,7 @@ struct mlx5hws_cmd_set_fte_attr { struct mlx5hws_cmd_ft_create_attr { u8 type; u8 level; + u16 uid; bool rtc_valid; bool decap_en; bool reformat_en; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c index 57592b92e24b..131e74b2b774 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c @@ -267,6 +267,7 @@ static int mlx5_cmd_hws_create_flow_table(struct mlx5_flow_root_namespace *ns, tbl_attr.type = MLX5HWS_TABLE_TYPE_FDB; tbl_attr.level = ft_attr->level; + tbl_attr.uid = ft_attr->uid; tbl = mlx5hws_table_create(ctx, &tbl_attr); if (!tbl) { mlx5_core_err(ns->dev, "Failed creating hws flow_table\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index f3ea09caba2b..32f87fdf3213 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -85,6 +85,7 @@ static int hws_matcher_create_end_ft_isolated(struct mlx5hws_matcher *matcher) ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, + 0, &matcher->end_ft_id); if (ret) { mlx5hws_err(tbl->ctx, "Isolated matcher: failed to create end flow table\n"); @@ -112,7 +113,9 @@ static int hws_matcher_create_end_ft(struct mlx5hws_matcher *matcher) if (mlx5hws_matcher_is_isolated(matcher)) ret = hws_matcher_create_end_ft_isolated(matcher); else - ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, + ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, + tbl, + 0, &matcher->end_ft_id); if (ret) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h index 59c14745ed0c..2498ceff2060 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h @@ -75,6 +75,7 @@ struct mlx5hws_context_attr { struct mlx5hws_table_attr { enum mlx5hws_table_type type; u32 level; + u16 uid; }; enum mlx5hws_matcher_flow_src { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c index 568f691733f3..6113383ae47b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c @@ -9,6 +9,7 @@ u32 mlx5hws_table_get_id(struct mlx5hws_table *tbl) } static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl, + u16 uid, struct mlx5hws_cmd_ft_create_attr *ft_attr) { ft_attr->type = tbl->fw_ft_type; @@ -16,7 +17,9 @@ static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl, ft_attr->level = tbl->ctx->caps->fdb_ft.max_level - 1; else ft_attr->level = tbl->ctx->caps->nic_ft.max_level - 1; + ft_attr->rtc_valid = true; + ft_attr->uid = uid; } static void hws_table_set_cap_attr(struct mlx5hws_table *tbl, @@ -119,12 +122,12 @@ static int hws_table_connect_to_default_miss_tbl(struct mlx5hws_table *tbl, u32 int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev, struct mlx5hws_table *tbl, - u32 *ft_id) + u16 uid, u32 *ft_id) { struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; int ret; - hws_table_init_next_ft_attr(tbl, &ft_attr); + hws_table_init_next_ft_attr(tbl, uid, &ft_attr); hws_table_set_cap_attr(tbl, &ft_attr); ret = mlx5hws_cmd_flow_table_create(mdev, &ft_attr, ft_id); @@ -189,7 +192,10 @@ static int hws_table_init(struct mlx5hws_table *tbl) } mutex_lock(&ctx->ctrl_lock); - ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, &tbl->ft_id); + ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, + tbl, + tbl->uid, + &tbl->ft_id); if (ret) { mlx5hws_err(tbl->ctx, "Failed to create flow table object\n"); mutex_unlock(&ctx->ctrl_lock); @@ -239,6 +245,7 @@ struct mlx5hws_table *mlx5hws_table_create(struct mlx5hws_context *ctx, tbl->ctx = ctx; tbl->type = attr->type; tbl->level = attr->level; + tbl->uid = attr->uid; ret = hws_table_init(tbl); if (ret) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h index 0400cce0c317..1246f9bd8422 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h @@ -18,6 +18,7 @@ struct mlx5hws_table { enum mlx5hws_table_type type; u32 fw_ft_type; u32 level; + u16 uid; struct list_head matchers_list; struct list_head tbl_list_node; struct mlx5hws_default_miss default_miss; @@ -47,7 +48,7 @@ u32 mlx5hws_table_get_res_fw_ft_type(enum mlx5hws_table_type tbl_type, int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev, struct mlx5hws_table *tbl, - u32 *ft_id); + u16 uid, u32 *ft_id); void mlx5hws_table_destroy_default_ft(struct mlx5hws_table *tbl, u32 ft_id); From d2d6f950cb43be6845a41cac5956cb2a10e657e5 Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Sun, 17 Aug 2025 23:23:23 +0300 Subject: [PATCH 52/81] net/mlx5: CT: Use the correct counter offset Specifying the counter action is not enough, as it is used by multiple counters that were allocated in a bulk. By omitting the offset, rules will be associated with a different counter from the same bulk. Subsequently, the CT subsystem checks the correct counter, assumes that no traffic has triggered the rule, and ages out the rule. The end result is intermittent offloading of long lived connections, as rules are aged out then promptly re-added. Fix this by specifying the correct offset along with the counter rule. Fixes: 34eea5b12a10 ("net/mlx5e: CT: Add initial support for Hardware Steering") Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-8-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c index a4263137fef5..01d522b02947 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c @@ -173,6 +173,8 @@ static void mlx5_ct_fs_hmfs_fill_rule_actions(struct mlx5_ct_fs_hmfs *fs_hmfs, memset(rule_actions, 0, NUM_CT_HMFS_RULES * sizeof(*rule_actions)); rule_actions[0].action = mlx5_fc_get_hws_action(fs_hmfs->ctx, attr->counter); + rule_actions[0].counter.offset = + attr->counter->id - attr->counter->bulk->base_id; /* Modify header is special, it may require extra arguments outside the action itself. */ if (mh_action->mh_data) { rule_actions[1].modify_header.offset = mh_action->mh_data->offset; From 1683fd1b2fa79864d3c7a951d9cea0a9ba1a1923 Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 18 Aug 2025 11:35:13 +0530 Subject: [PATCH 53/81] microchip: lan865x: fix missing netif_start_queue() call on device open This fixes an issue where the transmit queue is started implicitly only the very first time the device is registered. When the device is taken down and brought back up again (using `ip` or `ifconfig`), the transmit queue is not restarted, causing packet transmission to hang. Adding an explicit call to netif_start_queue() in lan865x_net_open() ensures the transmit queue is properly started every time the device is reopened. Fixes: 5cd2340cb6a3 ("microchip: lan865x: add driver support for Microchip's LAN865X MAC-PHY") Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20250818060514.52795-2-parthiban.veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan865x/lan865x.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microchip/lan865x/lan865x.c b/drivers/net/ethernet/microchip/lan865x/lan865x.c index dd436bdff0f8..d03f5a8de58d 100644 --- a/drivers/net/ethernet/microchip/lan865x/lan865x.c +++ b/drivers/net/ethernet/microchip/lan865x/lan865x.c @@ -311,6 +311,8 @@ static int lan865x_net_open(struct net_device *netdev) phy_start(netdev->phydev); + netif_start_queue(netdev); + return 0; } From 2cd58fec912acec273cb155911ab8f06ddbb131a Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 18 Aug 2025 11:35:14 +0530 Subject: [PATCH 54/81] microchip: lan865x: fix missing Timer Increment config for Rev.B0/B1 Fix missing configuration for LAN865x silicon revisions B0 and B1 as per Microchip Application Note AN1760 (Rev F, June 2024). The Timer Increment register was not being set, which is required for accurate timestamping. As per the application note, configure the MAC to set timestamping at the end of the Start of Frame Delimiter (SFD), and set the Timer Increment register to 40 ns (corresponding to a 25 MHz internal clock). Link: https://www.microchip.com/en-us/application-notes/an1760 Fixes: 5cd2340cb6a3 ("microchip: lan865x: add driver support for Microchip's LAN865X MAC-PHY") Signed-off-by: Parthiban Veerasooran Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250818060514.52795-3-parthiban.veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/microchip/lan865x/lan865x.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/net/ethernet/microchip/lan865x/lan865x.c b/drivers/net/ethernet/microchip/lan865x/lan865x.c index d03f5a8de58d..84c41f193561 100644 --- a/drivers/net/ethernet/microchip/lan865x/lan865x.c +++ b/drivers/net/ethernet/microchip/lan865x/lan865x.c @@ -32,6 +32,10 @@ /* MAC Specific Addr 1 Top Reg */ #define LAN865X_REG_MAC_H_SADDR1 0x00010023 +/* MAC TSU Timer Increment Register */ +#define LAN865X_REG_MAC_TSU_TIMER_INCR 0x00010077 +#define MAC_TSU_TIMER_INCR_COUNT_NANOSECONDS 0x0028 + struct lan865x_priv { struct work_struct multicast_work; struct net_device *netdev; @@ -346,6 +350,21 @@ static int lan865x_probe(struct spi_device *spi) goto free_netdev; } + /* LAN865x Rev.B0/B1 configuration parameters from AN1760 + * As per the Configuration Application Note AN1760 published in the + * link, https://www.microchip.com/en-us/application-notes/an1760 + * Revision F (DS60001760G - June 2024), configure the MAC to set time + * stamping at the end of the Start of Frame Delimiter (SFD) and set the + * Timer Increment reg to 40 ns to be used as a 25 MHz internal clock. + */ + ret = oa_tc6_write_register(priv->tc6, LAN865X_REG_MAC_TSU_TIMER_INCR, + MAC_TSU_TIMER_INCR_COUNT_NANOSECONDS); + if (ret) { + dev_err(&spi->dev, "Failed to config TSU Timer Incr reg: %d\n", + ret); + goto oa_tc6_exit; + } + /* As per the point s3 in the below errata, SPI receive Ethernet frame * transfer may halt when starting the next frame in the same data block * (chunk) as the end of a previous frame. The RFA field should be From 4d4d9ef9dfee877d494e5418f68a1016ef08cad6 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 19 Aug 2025 15:19:57 -0700 Subject: [PATCH 55/81] ixgbe: xsk: resolve the negative overflow of budget in ixgbe_xmit_zc Resolve the budget negative overflow which leads to returning true in ixgbe_xmit_zc even when the budget of descs are thoroughly consumed. Before this patch, when the budget is decreased to zero and finishes sending the last allowed desc in ixgbe_xmit_zc, it will always turn back and enter into the while() statement to see if it should keep processing packets, but in the meantime it unexpectedly decreases the value again to 'unsigned int (0--)', namely, UINT_MAX. Finally, the ixgbe_xmit_zc returns true, showing 'we complete cleaning the budget'. That also means 'clean_complete = true' in ixgbe_poll. The true theory behind this is if that budget number of descs are consumed, it implies that we might have more descs to be done. So we should return false in ixgbe_xmit_zc to tell napi poll to find another chance to start polling to handle the rest of descs. On the contrary, returning true here means job done and we know we finish all the possible descs this time and we don't intend to start a new napi poll. It is apparently against our expectations. Please also see how ixgbe_clean_tx_irq() handles the problem: it uses do..while() statement to make sure the budget can be decreased to zero at most and the negative overflow never happens. The patch adds 'likely' because we rarely would not hit the loop condition since the standard budget is 256. Fixes: 8221c5eba8c1 ("ixgbe: add AF_XDP zero-copy Tx support") Signed-off-by: Jason Xing Reviewed-by: Larysa Zaremba Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Tested-by: Priya Singh Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250819222000.3504873-4-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index ac58964b2f08..7b941505a9d0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -398,7 +398,7 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) dma_addr_t dma; u32 cmd_type; - while (budget-- > 0) { + while (likely(budget)) { if (unlikely(!ixgbe_desc_unused(xdp_ring))) { work_done = false; break; @@ -433,6 +433,8 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) xdp_ring->next_to_use++; if (xdp_ring->next_to_use == xdp_ring->count) xdp_ring->next_to_use = 0; + + budget--; } if (tx_desc) { From f3d9f7fa7f5dbfd4fdb1e69c25fc5627700d19dd Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 19 Aug 2025 15:19:58 -0700 Subject: [PATCH 56/81] ixgbe: fix ndo_xdp_xmit() workloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently ixgbe driver checks periodically in its watchdog subtask if there is anything to be transmitted (considering both Tx and XDP rings) under state of carrier not being 'ok'. Such event is interpreted as Tx hang and therefore results in interface reset. This is currently problematic for ndo_xdp_xmit() as it is allowed to produce descriptors when interface is going through reset or its carrier is turned off. Furthermore, XDP rings should not really be objects of Tx hang detection. This mechanism is rather a matter of ndo_tx_timeout() being called from dev_watchdog against Tx rings exposed to networking stack. Taking into account issues described above, let us have a two fold fix - do not respect XDP rings in local ixgbe watchdog and do not produce Tx descriptors in ndo_xdp_xmit callback when there is some problem with carrier currently. For now, keep the Tx hang checks in clean Tx irq routine, but adjust it to not execute for XDP rings. Cc: Tobias Böhm Reported-by: Marcus Wichelmann Closes: https://lore.kernel.org/netdev/eca1880f-253a-4955-afe6-732d7c6926ee@hetzner-cloud.de/ Fixes: 6453073987ba ("ixgbe: add initial support for xdp redirect") Fixes: 33fdc82f0883 ("ixgbe: add support for XDP_TX action") Reviewed-by: Aleksandr Loktionov Tested-by: Marcus Wichelmann Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250819222000.3504873-5-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6122a0abb41f..80e6a2ef1350 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -968,10 +968,6 @@ static void ixgbe_update_xoff_rx_lfc(struct ixgbe_adapter *adapter) for (i = 0; i < adapter->num_tx_queues; i++) clear_bit(__IXGBE_HANG_CHECK_ARMED, &adapter->tx_ring[i]->state); - - for (i = 0; i < adapter->num_xdp_queues; i++) - clear_bit(__IXGBE_HANG_CHECK_ARMED, - &adapter->xdp_ring[i]->state); } static void ixgbe_update_xoff_received(struct ixgbe_adapter *adapter) @@ -1214,7 +1210,7 @@ static void ixgbe_pf_handle_tx_hang(struct ixgbe_ring *tx_ring, struct ixgbe_adapter *adapter = netdev_priv(tx_ring->netdev); struct ixgbe_hw *hw = &adapter->hw; - e_err(drv, "Detected Tx Unit Hang%s\n" + e_err(drv, "Detected Tx Unit Hang\n" " Tx Queue <%d>\n" " TDH, TDT <%x>, <%x>\n" " next_to_use <%x>\n" @@ -1222,16 +1218,14 @@ static void ixgbe_pf_handle_tx_hang(struct ixgbe_ring *tx_ring, "tx_buffer_info[next_to_clean]\n" " time_stamp <%lx>\n" " jiffies <%lx>\n", - ring_is_xdp(tx_ring) ? " (XDP)" : "", tx_ring->queue_index, IXGBE_READ_REG(hw, IXGBE_TDH(tx_ring->reg_idx)), IXGBE_READ_REG(hw, IXGBE_TDT(tx_ring->reg_idx)), tx_ring->next_to_use, next, tx_ring->tx_buffer_info[next].time_stamp, jiffies); - if (!ring_is_xdp(tx_ring)) - netif_stop_subqueue(tx_ring->netdev, - tx_ring->queue_index); + netif_stop_subqueue(tx_ring->netdev, + tx_ring->queue_index); } /** @@ -1451,6 +1445,9 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, total_bytes); adapter->tx_ipsec += total_ipsec; + if (ring_is_xdp(tx_ring)) + return !!budget; + if (check_for_tx_hang(tx_ring) && ixgbe_check_tx_hang(tx_ring)) { if (adapter->hw.mac.type == ixgbe_mac_e610) ixgbe_handle_mdd_event(adapter, tx_ring); @@ -1468,9 +1465,6 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, return true; } - if (ring_is_xdp(tx_ring)) - return !!budget; - #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2) txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index); if (!__netif_txq_completed_wake(txq, total_packets, total_bytes, @@ -7974,12 +7968,9 @@ static void ixgbe_check_hang_subtask(struct ixgbe_adapter *adapter) return; /* Force detection of hung controller */ - if (netif_carrier_ok(adapter->netdev)) { + if (netif_carrier_ok(adapter->netdev)) for (i = 0; i < adapter->num_tx_queues; i++) set_check_for_tx_hang(adapter->tx_ring[i]); - for (i = 0; i < adapter->num_xdp_queues; i++) - set_check_for_tx_hang(adapter->xdp_ring[i]); - } if (!(adapter->flags & IXGBE_FLAG_MSIX_ENABLED)) { /* @@ -8199,13 +8190,6 @@ static bool ixgbe_ring_tx_pending(struct ixgbe_adapter *adapter) return true; } - for (i = 0; i < adapter->num_xdp_queues; i++) { - struct ixgbe_ring *ring = adapter->xdp_ring[i]; - - if (ring->next_to_use != ring->next_to_clean) - return true; - } - return false; } @@ -11005,6 +10989,10 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n, if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state))) return -ENETDOWN; + if (!netif_carrier_ok(adapter->netdev) || + !netif_running(adapter->netdev)) + return -ENETDOWN; + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; From 1468c1f97cf32418e34dbb40b784ed9333b9e123 Mon Sep 17 00:00:00 2001 From: ValdikSS Date: Tue, 19 Aug 2025 15:19:59 -0700 Subject: [PATCH 57/81] igc: fix disabling L1.2 PCI-E link substate on I226 on init Device ID comparison in igc_is_device_id_i226 is performed before the ID is set, resulting in always failing check on init. Before the patch: * L1.2 is not disabled on init * L1.2 is properly disabled after suspend-resume cycle With the patch: * L1.2 is properly disabled both on init and after suspend-resume How to test: Connect to the 1G link with 300+ mbit/s Internet speed, and run the download speed test, such as: curl -o /dev/null http://speedtest.selectel.ru/1GB Without L1.2 disabled, the speed would be no more than ~200 mbit/s. With L1.2 disabled, the speed would reach 1 gbit/s. Note: it's required that the latency between your host and the remote be around 3-5 ms, the test inside LAN (<1 ms latency) won't trigger the issue. Link: https://lore.kernel.org/intel-wired-lan/15248b4f-3271-42dd-8e35-02bfc92b25e1@intel.com Fixes: 0325143b59c6 ("igc: disable L1.2 PCI-E link substate to avoid performance issue") Signed-off-by: ValdikSS Reviewed-by: Vitaly Lifshits Reviewed-by: Paul Menzel Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250819222000.3504873-6-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 458e5eaa92e5..e79b14d50b24 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7149,6 +7149,13 @@ static int igc_probe(struct pci_dev *pdev, adapter->port_num = hw->bus.func; adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); + /* PCI config space info */ + hw->vendor_id = pdev->vendor; + hw->device_id = pdev->device; + hw->revision_id = pdev->revision; + hw->subsystem_vendor_id = pdev->subsystem_vendor; + hw->subsystem_device_id = pdev->subsystem_device; + /* Disable ASPM L1.2 on I226 devices to avoid packet loss */ if (igc_is_device_id_i226(hw)) pci_disable_link_state(pdev, PCIE_LINK_STATE_L1_2); @@ -7175,13 +7182,6 @@ static int igc_probe(struct pci_dev *pdev, netdev->mem_start = pci_resource_start(pdev, 0); netdev->mem_end = pci_resource_end(pdev, 0); - /* PCI config space info */ - hw->vendor_id = pdev->vendor; - hw->device_id = pdev->device; - hw->revision_id = pdev->revision; - hw->subsystem_vendor_id = pdev->subsystem_vendor; - hw->subsystem_device_id = pdev->subsystem_device; - /* Copy the default MAC and PHY function pointers */ memcpy(&hw->mac.ops, ei->mac_ops, sizeof(hw->mac.ops)); memcpy(&hw->phy.ops, ei->phy_ops, sizeof(hw->phy.ops)); From e318cd6714592fb762fcab59c5684a442243a12f Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Mon, 18 Aug 2025 18:04:57 -0700 Subject: [PATCH 58/81] net: dsa: microchip: Fix KSZ9477 HSR port setup issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ksz9477_hsr_join() is called once to setup the HSR port membership, but the port can be enabled later, or disabled and enabled back and the port membership is not set correctly inside ksz_update_port_member(). The added code always use the correct HSR port membership for HSR port that is enabled. Fixes: 2d61298fdd7b ("net: dsa: microchip: Enable HSR offloading for KSZ9477") Reported-by: Frieder Schrempf Signed-off-by: Tristram Ha Reviewed-by: Łukasz Majewski Tested-by: Frieder Schrempf Reviewed-by: Frieder Schrempf Link: https://patch.msgid.link/20250819010457.563286-1-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 4cb14288ff0f..9568cc391fe3 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2457,6 +2457,12 @@ static void ksz_update_port_member(struct ksz_device *dev, int port) dev->dev_ops->cfg_port_member(dev, i, val | cpu_port); } + /* HSR ports are setup once so need to use the assigned membership + * when the port is enabled. + */ + if (!port_member && p->stp_state == BR_STATE_FORWARDING && + (dev->hsr_ports & BIT(port))) + port_member = dev->hsr_ports; dev->dev_ops->cfg_port_member(dev, port, port_member | cpu_port); } From 15de71d06a400f7fdc15bf377a2552b0ec437cf5 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 19 Aug 2025 03:36:28 +0000 Subject: [PATCH 59/81] net/sched: Make cake_enqueue return NET_XMIT_CN when past buffer_limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following setup can trigger a WARNING in htb_activate due to the condition: !cl->leaf.q->q.qlen tc qdisc del dev lo root tc qdisc add dev lo root handle 1: htb default 1 tc class add dev lo parent 1: classid 1:1 \ htb rate 64bit tc qdisc add dev lo parent 1:1 handle f: \ cake memlimit 1b ping -I lo -f -c1 -s64 -W0.001 127.0.0.1 This is because the low memlimit leads to a low buffer_limit, which causes packet dropping. However, cake_enqueue still returns NET_XMIT_SUCCESS, causing htb_enqueue to call htb_activate with an empty child qdisc. We should return NET_XMIT_CN when packets are dropped from the same tin and flow. I do not believe return value of NET_XMIT_CN is necessary for packet drops in the case of ack filtering, as that is meant to optimize performance, not to signal congestion. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Acked-by: Toke Høiland-Jørgensen Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250819033601.579821-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index dbcfb948c867..32bacfc314c2 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1750,7 +1750,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; - u32 idx; + u32 idx, tin; /* choose flow to insert into */ idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); @@ -1760,6 +1760,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, __qdisc_drop(skb, to_free); return ret; } + tin = (u32)(b - q->tins); idx--; flow = &b->flows[idx]; @@ -1927,13 +1928,22 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->buffer_max_used = q->buffer_used; if (q->buffer_used > q->buffer_limit) { + bool same_flow = false; u32 dropped = 0; + u32 drop_id; while (q->buffer_used > q->buffer_limit) { dropped++; - cake_drop(sch, to_free); + drop_id = cake_drop(sch, to_free); + + if ((drop_id >> 16) == tin && + (drop_id & 0xFFFF) == idx) + same_flow = true; } b->drop_overlimit += dropped; + + if (same_flow) + return NET_XMIT_CN; } return NET_XMIT_SUCCESS; } From 2c2192e5f9c7c2892fe2363244d1387f62710d83 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 19 Aug 2025 03:36:59 +0000 Subject: [PATCH 60/81] net/sched: Remove unnecessary WARNING condition for empty child qdisc in htb_activate The WARN_ON trigger based on !cl->leaf.q->q.qlen is unnecessary in htb_activate. htb_dequeue_tree already accounts for that scenario. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Link: https://patch.msgid.link/20250819033632.579854-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- net/sched/sch_htb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c968ea763774..b5e40c51655a 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -592,7 +592,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) */ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen); + WARN_ON(cl->level || !cl->leaf.q); if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; From 7af76e9d18a9fd6f8611b3313c86c190f9b6a5a7 Mon Sep 17 00:00:00 2001 From: Jakub Acs Date: Tue, 19 Aug 2025 08:28:42 +0000 Subject: [PATCH 61/81] net, hsr: reject HSR frame if skb can't hold tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Receiving HSR frame with insufficient space to hold HSR tag in the skb can result in a crash (kernel BUG): [ 45.390915] skbuff: skb_under_panic: text:ffffffff86f32cac len:26 put:14 head:ffff888042418000 data:ffff888042417ff4 tail:0xe end:0x180 dev:bridge_slave_1 [ 45.392559] ------------[ cut here ]------------ [ 45.392912] kernel BUG at net/core/skbuff.c:211! [ 45.393276] Oops: invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN NOPTI [ 45.393809] CPU: 1 UID: 0 PID: 2496 Comm: reproducer Not tainted 6.15.0 #12 PREEMPT(undef) [ 45.394433] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 45.395273] RIP: 0010:skb_panic+0x15b/0x1d0 [ 45.402911] Call Trace: [ 45.403105] [ 45.404470] skb_push+0xcd/0xf0 [ 45.404726] br_dev_queue_push_xmit+0x7c/0x6c0 [ 45.406513] br_forward_finish+0x128/0x260 [ 45.408483] __br_forward+0x42d/0x590 [ 45.409464] maybe_deliver+0x2eb/0x420 [ 45.409763] br_flood+0x174/0x4a0 [ 45.410030] br_handle_frame_finish+0xc7c/0x1bc0 [ 45.411618] br_handle_frame+0xac3/0x1230 [ 45.413674] __netif_receive_skb_core.constprop.0+0x808/0x3df0 [ 45.422966] __netif_receive_skb_one_core+0xb4/0x1f0 [ 45.424478] __netif_receive_skb+0x22/0x170 [ 45.424806] process_backlog+0x242/0x6d0 [ 45.425116] __napi_poll+0xbb/0x630 [ 45.425394] net_rx_action+0x4d1/0xcc0 [ 45.427613] handle_softirqs+0x1a4/0x580 [ 45.427926] do_softirq+0x74/0x90 [ 45.428196] This issue was found by syzkaller. The panic happens in br_dev_queue_push_xmit() once it receives a corrupted skb with ETH header already pushed in linear data. When it attempts the skb_push() call, there's not enough headroom and skb_push() panics. The corrupted skb is put on the queue by HSR layer, which makes a sequence of unintended transformations when it receives a specific corrupted HSR frame (with incomplete TAG). Fix it by dropping and consuming frames that are not long enough to contain both ethernet and hsr headers. Alternative fix would be to check for enough headroom before skb_push() in br_dev_queue_push_xmit(). In the reproducer, this is injected via AF_PACKET, but I don't easily see why it couldn't be sent over the wire from adjacent network. Further Details: In the reproducer, the following network interface chain is set up: ┌────────────────┐ ┌────────────────┐ │ veth0_to_hsr ├───┤ hsr_slave0 ┼───┐ └────────────────┘ └────────────────┘ │ │ ┌──────┐ ├─┤ hsr0 ├───┐ │ └──────┘ │ ┌────────────────┐ ┌────────────────┐ │ │┌────────┐ │ veth1_to_hsr ┼───┤ hsr_slave1 ├───┘ └┤ │ └────────────────┘ └────────────────┘ ┌┼ bridge │ ││ │ │└────────┘ │ ┌───────┐ │ │ ... ├──────┘ └───────┘ To trigger the events leading up to crash, reproducer sends a corrupted HSR frame with incomplete TAG, via AF_PACKET socket on 'veth0_to_hsr'. The first HSR-layer function to process this frame is hsr_handle_frame(). It and then checks if the protocol is ETH_P_PRP or ETH_P_HSR. If it is, it calls skb_set_network_header(skb, ETH_HLEN + HSR_HLEN), without checking that the skb is long enough. For the crashing frame it is not, and hence the skb->network_header and skb->mac_len fields are set incorrectly, pointing after the end of the linear buffer. I will call this a BUG#1 and it is what is addressed by this patch. In the crashing scenario before the fix, the skb continues to go down the hsr path as follows. hsr_handle_frame() then calls this sequence hsr_forward_skb() fill_frame_info() hsr->proto_ops->fill_frame_info() hsr_fill_frame_info() hsr_fill_frame_info() contains a check that intends to check whether the skb actually contains the HSR header. But the check relies on the skb->mac_len field which was erroneously setup due to BUG#1, so the check passes and the execution continues back in the hsr_forward_skb(): hsr_forward_skb() hsr_forward_do() hsr->proto_ops->get_untagged_frame() hsr_get_untagged_frame() create_stripped_skb_hsr() In create_stripped_skb_hsr(), a copy of the skb is created and is further corrupted by operation that attempts to strip the HSR tag in a call to __pskb_copy(). The skb enters create_stripped_skb_hsr() with ethernet header pushed in linear buffer. The skb_pull(skb_in, HSR_HLEN) thus pulls 6 bytes of ethernet header into the headroom, creating skb_in with a headroom of size 8. The subsequent __pskb_copy() then creates an skb with headroom of just 2 and skb->len of just 12, this is how it looks after the copy: gdb) p skb->len $10 = 12 (gdb) p skb->data $11 = (unsigned char *) 0xffff888041e45382 "\252\252\252\252\252!\210\373", (gdb) p skb->head $12 = (unsigned char *) 0xffff888041e45380 "" It seems create_stripped_skb_hsr() assumes that ETH header is pulled in the headroom when it's entered, because it just pulls HSR header on top. But that is not the case in our code-path and we end up with the corrupted skb instead. I will call this BUG#2 *I got confused here because it seems that under no conditions can create_stripped_skb_hsr() work well, the assumption it makes is not true during the processing of hsr frames - since the skb_push() in hsr_handle_frame to skb_pull in hsr_deliver_master(). I wonder whether I missed something here.* Next, the execution arrives in hsr_deliver_master(). It calls skb_pull(ETH_HLEN), which just returns NULL - the SKB does not have enough space for the pull (as it only has 12 bytes in total at this point). *The skb_pull() here further suggests that ethernet header is meant to be pushed through the whole hsr processing and create_stripped_skb_hsr() should pull it before doing the HSR header pull.* hsr_deliver_master() then puts the corrupted skb on the queue, it is then picked up from there by bridge frame handling layer and finally lands in br_dev_queue_push_xmit where it panics. Cc: stable@kernel.org Fixes: 48b491a5cc74 ("net: hsr: fix mac_len checks") Reported-by: syzbot+a81f2759d022496b40ab@syzkaller.appspotmail.com Signed-off-by: Jakub Acs Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250819082842.94378-1-acsjakub@amazon.de Signed-off-by: Jakub Kicinski --- net/hsr/hsr_slave.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index b87b6a6fe070..102eccf5ead7 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -63,8 +63,14 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || - protocol == htons(ETH_P_HSR)) + protocol == htons(ETH_P_HSR)) { + if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) { + kfree_skb(skb); + goto finish_consume; + } + skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); + } skb_reset_mac_len(skb); /* Only the frames received over the interlink port will assign a From a458b2902115b26a25d67393b12ddd57d1216aaa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Aug 2025 13:27:24 -0700 Subject: [PATCH 62/81] ipv6: sr: Fix MAC comparison to be constant-time To prevent timing attacks, MACs need to be compared in constant time. Use the appropriate helper function for this. Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Reviewed-by: Andrea Mayer Link: https://patch.msgid.link/20250818202724.15713-1-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_hmac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index d77b52523b6a..fd58426f222b 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -280,7 +281,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output)) return false; - if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0) + if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN)) return false; return true; From c42be534547d6e45c155c347dd792b6ad9c24def Mon Sep 17 00:00:00 2001 From: Ryan Wanner Date: Tue, 19 Aug 2025 09:32:30 -0700 Subject: [PATCH 63/81] Revert "net: cadence: macb: sama7g5_emac: Remove USARIO CLKEN flag" This reverts commit db400061b5e7cc55f9b4dd15443e9838964119ea. This commit can cause a Devicetree ABI break for older DTS files that rely this flag for RMII configuration. Adding this back in ensures that the older DTBs will not break. Fixes: db400061b5e7 ("net: cadence: macb: sama7g5_emac: Remove USARIO CLKEN flag") Signed-off-by: Ryan Wanner Link: https://patch.msgid.link/20250819163236.100680-1-Ryan.Wanner@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index ce95fad8cedd..9693f0289435 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -5113,7 +5113,8 @@ static const struct macb_config sama7g5_gem_config = { static const struct macb_config sama7g5_emac_config = { .caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII | - MACB_CAPS_MIIONRGMII | MACB_CAPS_GEM_HAS_PTP, + MACB_CAPS_USRIO_HAS_CLKEN | MACB_CAPS_MIIONRGMII | + MACB_CAPS_GEM_HAS_PTP, .dma_burst_length = 16, .clk_init = macb_clk_init, .init = macb_init, From b64d035f77b1f02ab449393342264b44950a75ae Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Aug 2025 06:19:58 +0000 Subject: [PATCH 64/81] bonding: update LACP activity flag after setting lacp_active The port's actor_oper_port_state activity flag should be updated immediately after changing the lacp_active option to reflect the current mode correctly. Fixes: 3a755cd8b7c6 ("bonding: add new option lacp_active") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250815062000.22220-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_3ad.c | 25 +++++++++++++++++++++++++ drivers/net/bonding/bond_options.c | 1 + include/net/bond_3ad.h | 1 + 3 files changed, 27 insertions(+) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 2fca8e84ab10..414fecfd2a0e 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2883,6 +2883,31 @@ void bond_3ad_update_lacp_rate(struct bonding *bond) spin_unlock_bh(&bond->mode_lock); } +/** + * bond_3ad_update_lacp_active - change the lacp active + * @bond: bonding struct + * + * Update actor_oper_port_state when lacp_active is modified. + */ +void bond_3ad_update_lacp_active(struct bonding *bond) +{ + struct port *port = NULL; + struct list_head *iter; + struct slave *slave; + int lacp_active; + + lacp_active = bond->params.lacp_active; + spin_lock_bh(&bond->mode_lock); + bond_for_each_slave(bond, slave, iter) { + port = &(SLAVE_AD_INFO(slave)->port); + if (lacp_active) + port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY; + else + port->actor_oper_port_state &= ~LACP_STATE_LACP_ACTIVITY; + } + spin_unlock_bh(&bond->mode_lock); +} + size_t bond_3ad_stats_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 1d639a3be6ba..3b6f815c55ff 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1660,6 +1660,7 @@ static int bond_option_lacp_active_set(struct bonding *bond, netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n", newval->string, newval->value); bond->params.lacp_active = newval->value; + bond_3ad_update_lacp_active(bond); return 0; } diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index 2053cd8e788a..dba369a2cf27 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -307,6 +307,7 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); +void bond_3ad_update_lacp_active(struct bonding *bond); void bond_3ad_update_ad_actor_settings(struct bonding *bond); int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats); size_t bond_3ad_stats_size(void); From 0599640a21e98f0d6a3e9ff85c0a687c90a8103b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Aug 2025 06:19:59 +0000 Subject: [PATCH 65/81] bonding: send LACPDUs periodically in passive mode after receiving partner's LACPDU When `lacp_active` is set to `off`, the bond operates in passive mode, meaning it only "speaks when spoken to." However, the current kernel implementation only sends an LACPDU in response when the partner's state changes. As a result, once LACP negotiation succeeds, the actor stops sending LACPDUs until the partner times out and sends an "expired" LACPDU. This causes continuous LACP state flapping. According to IEEE 802.1AX-2014, 6.4.13 Periodic Transmission machine. The values of Partner_Oper_Port_State.LACP_Activity and Actor_Oper_Port_State.LACP_Activity determine whether periodic transmissions take place. If either or both parameters are set to Active LACP, then periodic transmissions occur; if both are set to Passive LACP, then periodic transmissions do not occur. To comply with this, we remove the `!bond->params.lacp_active` check in `ad_periodic_machine()`. Instead, we initialize the actor's port's `LACP_STATE_LACP_ACTIVITY` state based on `lacp_active` setting. Additionally, we avoid setting the partner's state to `LACP_STATE_LACP_ACTIVITY` in the EXPIRED state, since we should not assume the partner is active by default. This ensures that in passive mode, the bond starts sending periodic LACPDUs after receiving one from the partner, and avoids flapping due to inactivity. Fixes: 3a755cd8b7c6 ("bonding: add new option lacp_active") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250815062000.22220-3-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_3ad.c | 42 +++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 414fecfd2a0e..4edc8e6b6b64 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -95,13 +95,13 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); -static void ad_periodic_machine(struct port *port, struct bond_params *bond_params); +static void ad_periodic_machine(struct port *port); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); static void ad_clear_agg(struct aggregator *aggregator); static void ad_initialize_agg(struct aggregator *aggregator); -static void ad_initialize_port(struct port *port, int lacp_fast); +static void ad_initialize_port(struct port *port, const struct bond_params *bond_params); static void ad_enable_collecting(struct port *port); static void ad_disable_distributing(struct port *port, bool *update_slave_arr); @@ -1307,10 +1307,16 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) * case of EXPIRED even if LINK_DOWN didn't arrive for * the port. */ - port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->sm_vars &= ~AD_PORT_MATCHED; + /* Based on IEEE 8021AX-2014, Figure 6-18 - Receive + * machine state diagram, the statue should be + * Partner_Oper_Port_State.Synchronization = FALSE; + * Partner_Oper_Port_State.LACP_Timeout = Short Timeout; + * start current_while_timer(Short Timeout); + * Actor_Oper_Port_State.Expired = TRUE; + */ + port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->partner_oper.port_state |= LACP_STATE_LACP_TIMEOUT; - port->partner_oper.port_state |= LACP_STATE_LACP_ACTIVITY; port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT)); port->actor_oper_port_state |= LACP_STATE_EXPIRED; port->sm_vars |= AD_PORT_CHURNED; @@ -1417,11 +1423,10 @@ static void ad_tx_machine(struct port *port) /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at - * @bond_params: bond parameters we will use * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. */ -static void ad_periodic_machine(struct port *port, struct bond_params *bond_params) +static void ad_periodic_machine(struct port *port) { periodic_states_t last_state; @@ -1430,8 +1435,7 @@ static void ad_periodic_machine(struct port *port, struct bond_params *bond_para /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || - (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || - !bond_params->lacp_active) { + (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY))) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ @@ -1955,16 +1959,16 @@ static void ad_initialize_agg(struct aggregator *aggregator) /** * ad_initialize_port - initialize a given port's parameters * @port: the port we're looking at - * @lacp_fast: boolean. whether fast periodic should be used + * @bond_params: bond parameters we will use */ -static void ad_initialize_port(struct port *port, int lacp_fast) +static void ad_initialize_port(struct port *port, const struct bond_params *bond_params) { static const struct port_params tmpl = { .system_priority = 0xffff, .key = 1, .port_number = 1, .port_priority = 0xff, - .port_state = 1, + .port_state = 0, }; static const struct lacpdu lacpdu = { .subtype = 0x01, @@ -1982,12 +1986,14 @@ static void ad_initialize_port(struct port *port, int lacp_fast) port->actor_port_priority = 0xff; port->actor_port_aggregator_identifier = 0; port->ntt = false; - port->actor_admin_port_state = LACP_STATE_AGGREGATION | - LACP_STATE_LACP_ACTIVITY; - port->actor_oper_port_state = LACP_STATE_AGGREGATION | - LACP_STATE_LACP_ACTIVITY; + port->actor_admin_port_state = LACP_STATE_AGGREGATION; + port->actor_oper_port_state = LACP_STATE_AGGREGATION; + if (bond_params->lacp_active) { + port->actor_admin_port_state |= LACP_STATE_LACP_ACTIVITY; + port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY; + } - if (lacp_fast) + if (bond_params->lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; memcpy(&port->partner_admin, &tmpl, sizeof(tmpl)); @@ -2201,7 +2207,7 @@ void bond_3ad_bind_slave(struct slave *slave) /* port initialization */ port = &(SLAVE_AD_INFO(slave)->port); - ad_initialize_port(port, bond->params.lacp_fast); + ad_initialize_port(port, &bond->params); port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; @@ -2513,7 +2519,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) } ad_rx_machine(NULL, port); - ad_periodic_machine(port, &bond->params); + ad_periodic_machine(port); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); From 87951b566446da04eed1fe8100f99a512ef02756 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Aug 2025 06:20:00 +0000 Subject: [PATCH 66/81] selftests: bonding: add test for passive LACP mode Add a selftest to verify bonding behavior when `lacp_active` is set to `off`. The test checks the following: - The passive LACP bond should not send LACPDUs before receiving a partner's LACPDU. - The transmitted LACPDUs must not include the active flag. - After transitioning to EXPIRED and DEFAULTED states, the passive side should still not initiate LACPDUs. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250815062000.22220-4-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- .../selftests/drivers/net/bonding/Makefile | 3 +- .../drivers/net/bonding/bond_passive_lacp.sh | 105 ++++++++++++++++++ .../selftests/drivers/net/bonding/config | 1 + 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 2b10854e4b1e..44b98f17f8ff 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -10,7 +10,8 @@ TEST_PROGS := \ mode-2-recovery-updelay.sh \ bond_options.sh \ bond-eth-type-change.sh \ - bond_macvlan_ipvlan.sh + bond_macvlan_ipvlan.sh \ + bond_passive_lacp.sh TEST_FILES := \ lag_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh new file mode 100755 index 000000000000..9c3b089813df --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test if a bond interface works with lacp_active=off. + +# shellcheck disable=SC2034 +REQUIRE_MZ=no +NUM_NETIFS=0 +lib_dir=$(dirname "$0") +# shellcheck disable=SC1091 +source "$lib_dir"/../../../net/forwarding/lib.sh + +# shellcheck disable=SC2317 +check_port_state() +{ + local netns=$1 + local port=$2 + local state=$3 + + ip -n "${netns}" -d -j link show "$port" | \ + jq -e ".[].linkinfo.info_slave_data.ad_actor_oper_port_state_str | index(\"${state}\") != null" > /dev/null +} + +check_pkt_count() +{ + RET=0 + local ns="$1" + local iface="$2" + + # wait 65s, one per 30s + slowwait_for_counter 65 2 tc_rule_handle_stats_get \ + "dev ${iface} egress" 101 ".packets" "-n ${ns}" &> /dev/null +} + +setup() { + setup_ns c_ns s_ns + + # shellcheck disable=SC2154 + ip -n "${c_ns}" link add eth0 type veth peer name eth0 netns "${s_ns}" + ip -n "${c_ns}" link add eth1 type veth peer name eth1 netns "${s_ns}" + + # Add tc filter to count the pkts + tc -n "${c_ns}" qdisc add dev eth0 clsact + tc -n "${c_ns}" filter add dev eth0 egress handle 101 protocol 0x8809 matchall action pass + tc -n "${s_ns}" qdisc add dev eth1 clsact + tc -n "${s_ns}" filter add dev eth1 egress handle 101 protocol 0x8809 matchall action pass + + ip -n "${s_ns}" link add bond0 type bond mode 802.3ad lacp_active on lacp_rate fast + ip -n "${s_ns}" link set eth0 master bond0 + ip -n "${s_ns}" link set eth1 master bond0 + + ip -n "${c_ns}" link add bond0 type bond mode 802.3ad lacp_active off lacp_rate fast + ip -n "${c_ns}" link set eth0 master bond0 + ip -n "${c_ns}" link set eth1 master bond0 + +} + +trap cleanup_all_ns EXIT +setup + +# The bond will send 2 lacpdu pkts during init time, let's wait at least 2s +# after interface up +ip -n "${c_ns}" link set bond0 up +sleep 2 + +# 1. The passive side shouldn't send LACPDU. +check_pkt_count "${c_ns}" "eth0" && RET=1 +log_test "802.3ad lacp_active off" "init port" + +ip -n "${s_ns}" link set bond0 up +# 2. The passive side should not have the 'active' flag. +RET=0 +slowwait 2 check_port_state "${c_ns}" "eth0" "active" && RET=1 +log_test "802.3ad lacp_active off" "port state active" + +# 3. The active side should have the 'active' flag. +RET=0 +slowwait 2 check_port_state "${s_ns}" "eth0" "active" || RET=1 +log_test "802.3ad lacp_active on" "port state active" + +# 4. Make sure the connection is not expired. +RET=0 +slowwait 5 check_port_state "${s_ns}" "eth0" "distributing" +slowwait 10 check_port_state "${s_ns}" "eth0" "expired" && RET=1 +log_test "bond 802.3ad lacp_active off" "port connection" + +# After testing, disconnect one port on each side to check the state. +ip -n "${s_ns}" link set eth0 nomaster +ip -n "${s_ns}" link set eth0 up +ip -n "${c_ns}" link set eth1 nomaster +ip -n "${c_ns}" link set eth1 up +# Due to Periodic Machine and Rx Machine state change, the bond will still +# send lacpdu pkts in a few seconds. sleep at lease 5s to make sure +# negotiation finished +sleep 5 + +# 5. The active side should keep sending LACPDU. +check_pkt_count "${s_ns}" "eth1" || RET=1 +log_test "bond 802.3ad lacp_active on" "port pkt after disconnect" + +# 6. The passive side shouldn't send LACPDU anymore. +check_pkt_count "${c_ns}" "eth0" && RET=1 +log_test "bond 802.3ad lacp_active off" "port pkt after disconnect" + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index dad4e5fda4db..4d16a69ffc65 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -6,6 +6,7 @@ CONFIG_MACVLAN=y CONFIG_IPVLAN=y CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y +CONFIG_NET_CLS_MATCHALL=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y From 9f6b606b6b37e61427412708411e8e04b1a858e8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 18 Aug 2025 11:58:25 +0200 Subject: [PATCH 67/81] net: airoha: ppe: Do not invalid PPE entries in case of SW hash collision SW hash computed by airoha_ppe_foe_get_entry_hash routine (used for foe_flow hlist) can theoretically produce collisions between two different HW PPE entries. In airoha_ppe_foe_insert_entry() if the collision occurs we will mark the second PPE entry in the list as stale (setting the hw hash to 0xffff). Stale entries are no more updated in airoha_ppe_foe_flow_entry_update routine and so they are removed by Netfilter. Fix the problem not marking the second entry as stale in airoha_ppe_foe_insert_entry routine if we have already inserted the brand new entry in the PPE table and let Netfilter remove real stale entries according to their timestamp. Please note this is just a theoretical issue spotted reviewing the code and not faced running the system. Fixes: cd53f622611f9 ("net: airoha: Add L2 hw acceleration support") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250818-airoha-en7581-hash-collision-fix-v1-1-d190c4b53d1c@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/airoha/airoha_ppe.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 47411d2cbd28..88694b08afa1 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -736,10 +736,8 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, continue; } - if (commit_done || !airoha_ppe_foe_compare_entry(e, hwe)) { - e->hash = 0xffff; + if (!airoha_ppe_foe_compare_entry(e, hwe)) continue; - } airoha_ppe_foe_commit_entry(ppe, &e->data, hash); commit_done = true; From 62708b9452f8eb77513115b17c4f8d1a22ebf843 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Aug 2025 19:19:51 -0700 Subject: [PATCH 68/81] tls: fix handling of zero-length records on the rx_list Each recvmsg() call must process either - only contiguous DATA records (any number of them) - one non-DATA record If the next record has different type than what has already been processed we break out of the main processing loop. If the record has already been decrypted (which may be the case for TLS 1.3 where we don't know type until decryption) we queue the pending record to the rx_list. Next recvmsg() will pick it up from there. Queuing the skb to rx_list after zero-copy decrypt is not possible, since in that case we decrypted directly to the user space buffer, and we don't have an skb to queue (darg.skb points to the ciphertext skb for access to metadata like length). Only data records are allowed zero-copy, and we break the processing loop after each non-data record. So we should never zero-copy and then find out that the record type has changed. The corner case we missed is when the initial record comes from rx_list, and it's zero length. Reported-by: Muhammad Alifa Ramdhan Reported-by: Billy Jheng Bing-Jhong Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250820021952.143068-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 51c98a007dda..bac65d0d4e3e 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1808,6 +1808,9 @@ int decrypt_skb(struct sock *sk, struct scatterlist *sgout) return tls_decrypt_sg(sk, NULL, sgout, &darg); } +/* All records returned from a recvmsg() call must have the same type. + * 0 is not a valid content type. Use it as "no type reported, yet". + */ static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm, u8 *control) { @@ -2051,8 +2054,10 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto end; + /* process_rx_list() will set @control if it processed any records */ copied = err; - if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA) || rx_more) + if (len <= copied || rx_more || + (control && control != TLS_RECORD_TYPE_DATA)) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); From a61a3e961baff65b0a49f862fe21ce304f279b24 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Aug 2025 19:19:52 -0700 Subject: [PATCH 69/81] selftests: tls: add tests for zero-length records Test various combinations of zero-length records. Unfortunately, kernel cannot be coerced into producing those, so hardcode the ciphertext messages in the test. Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250820021952.143068-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 300 +++++++++++++++++++++++++++++- 1 file changed, 295 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 2b8387a83bc7..0f5640d8dc7f 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -181,13 +181,12 @@ static int tls_send_cmsg(int fd, unsigned char record_type, return sendmsg(fd, &msg, flags); } -static int tls_recv_cmsg(struct __test_metadata *_metadata, - int fd, unsigned char record_type, - void *data, size_t len, int flags) +static int __tls_recv_cmsg(struct __test_metadata *_metadata, + int fd, unsigned char *ctype, + void *data, size_t len, int flags) { char cbuf[CMSG_SPACE(sizeof(char))]; struct cmsghdr *cmsg; - unsigned char ctype; struct msghdr msg; struct iovec vec; int n; @@ -206,7 +205,20 @@ static int tls_recv_cmsg(struct __test_metadata *_metadata, EXPECT_NE(cmsg, NULL); EXPECT_EQ(cmsg->cmsg_level, SOL_TLS); EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE); - ctype = *((unsigned char *)CMSG_DATA(cmsg)); + if (ctype) + *ctype = *((unsigned char *)CMSG_DATA(cmsg)); + + return n; +} + +static int tls_recv_cmsg(struct __test_metadata *_metadata, + int fd, unsigned char record_type, + void *data, size_t len, int flags) +{ + unsigned char ctype; + int n; + + n = __tls_recv_cmsg(_metadata, fd, &ctype, data, len, flags); EXPECT_EQ(ctype, record_type); return n; @@ -2164,6 +2176,284 @@ TEST_F(tls, rekey_poll_delay) } } +struct raw_rec { + unsigned int plain_len; + unsigned char plain_data[100]; + unsigned int cipher_len; + unsigned char cipher_data[128]; +}; + +/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: 'Hello world' */ +static const struct raw_rec id0_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0xa2, 0x33, + 0xde, 0x8d, 0x94, 0xf0, 0x29, 0x6c, 0xb1, 0xaf, + 0x6a, 0x75, 0xb2, 0x93, 0xad, 0x45, 0xd5, 0xfd, + 0x03, 0x51, 0x57, 0x8f, 0xf9, 0xcc, 0x3b, 0x42, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:0, plaintext: '' */ +static const struct raw_rec id0_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x38, 0x7b, + 0xa6, 0x1c, 0xdd, 0xa7, 0x19, 0x33, 0xab, 0xae, + 0x88, 0xe1, 0xd2, 0x08, 0x4f, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: '' */ +static const struct raw_rec id0_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xc5, 0x37, 0x90, + 0x70, 0x45, 0x89, 0xfb, 0x5c, 0xc7, 0x89, 0x03, + 0x68, 0x80, 0xd3, 0xd8, 0xcc, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: 'Hello world' */ +static const struct raw_rec id1_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x3a, 0x1a, 0x9c, + 0xd0, 0xa8, 0x9a, 0xd6, 0x69, 0xd6, 0x1a, 0xe3, + 0xb5, 0x1f, 0x0d, 0x2c, 0xe2, 0x97, 0x46, 0xff, + 0x2b, 0xcc, 0x5a, 0xc4, 0xa3, 0xb9, 0xef, 0xba, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:1, plaintext: '' */ +static const struct raw_rec id1_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x3e, 0xf0, 0xfe, + 0xee, 0xd9, 0xe2, 0x5d, 0xc7, 0x11, 0x4c, 0xe6, + 0xb4, 0x7e, 0xef, 0x40, 0x2b, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: '' */ +static const struct raw_rec id1_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0xce, 0xfc, 0x86, + 0xc8, 0xf0, 0x55, 0xf9, 0x47, 0x3f, 0x74, 0xdc, + 0xc9, 0xbf, 0xfe, 0x5b, 0xb1, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: 'Hello world' */ +static const struct raw_rec id2_ctrl_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19, + 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87, + 0x2a, 0x04, 0x11, 0x3d, 0xf8, 0x64, 0x5f, 0x36, + 0x8b, 0xa8, 0xee, 0x4c, 0x6d, 0x62, 0xa5, 0x00, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: 'Hello world' */ +static const struct raw_rec id2_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19, + 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87, + 0x8e, 0xa1, 0xd0, 0xcd, 0x33, 0xb5, 0x86, 0x2b, + 0x17, 0xf1, 0x52, 0x2a, 0x55, 0x62, 0x65, 0x11, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: '' */ +static const struct raw_rec id2_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xdc, 0x5c, 0x0e, + 0x41, 0xdd, 0xba, 0xd3, 0xcc, 0xcf, 0x6d, 0xd9, + 0x06, 0xdb, 0x79, 0xe5, 0x5d, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: '' */ +static const struct raw_rec id2_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xc3, 0xca, 0x26, + 0x22, 0xe4, 0x25, 0xfb, 0x5f, 0x6d, 0xbf, 0x83, + 0x30, 0x48, 0x69, 0x1a, 0x47, + }, +}; + +FIXTURE(zero_len) +{ + int fd, cfd; + bool notls; +}; + +FIXTURE_VARIANT(zero_len) +{ + const struct raw_rec *recs[4]; + ssize_t recv_ret[4]; +}; + +FIXTURE_VARIANT_ADD(zero_len, data_data_data) +{ + .recs = { &id0_data_l11, &id1_data_l11, &id2_data_l11, }, + .recv_ret = { 33, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, data_0ctrl_data) +{ + .recs = { &id0_data_l11, &id1_ctrl_l0, &id2_data_l11, }, + .recv_ret = { 11, 0, 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0data) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l0, }, + .recv_ret = { -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_ctrl) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l11, }, + .recv_ret = { 0, 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0ctrl) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l0, }, + .recv_ret = { 0, 0, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0ctrl_0ctrl_0ctrl) +{ + .recs = { &id0_ctrl_l0, &id1_ctrl_l0, &id2_ctrl_l0, }, + .recv_ret = { 0, 0, 0, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_data) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l11, }, + .recv_ret = { 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, data_0data_0data) +{ + .recs = { &id0_data_l11, &id1_data_l0, &id2_data_l0, }, + .recv_ret = { 11, -EAGAIN, }, +}; + +FIXTURE_SETUP(zero_len) +{ + struct tls_crypto_info_keys tls12; + int ret; + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128, + &tls12, 0); + + ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); + if (self->notls) + return; + + /* Don't install keys on fd, we'll send raw records */ + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len); + ASSERT_EQ(ret, 0); +} + +FIXTURE_TEARDOWN(zero_len) +{ + close(self->fd); + close(self->cfd); +} + +TEST_F(zero_len, test) +{ + const struct raw_rec *const *rec; + unsigned char buf[128]; + int rec_off; + int i; + + for (i = 0; i < 4 && variant->recs[i]; i++) + EXPECT_EQ(send(self->fd, variant->recs[i]->cipher_data, + variant->recs[i]->cipher_len, 0), + variant->recs[i]->cipher_len); + + rec = &variant->recs[0]; + rec_off = 0; + for (i = 0; i < 4; i++) { + int j, ret; + + ret = variant->recv_ret[i] >= 0 ? variant->recv_ret[i] : -1; + EXPECT_EQ(__tls_recv_cmsg(_metadata, self->cfd, NULL, + buf, sizeof(buf), MSG_DONTWAIT), ret); + if (ret == -1) + EXPECT_EQ(errno, -variant->recv_ret[i]); + if (variant->recv_ret[i] == -EAGAIN) + break; + + for (j = 0; j < ret; j++) { + while (rec_off == (*rec)->plain_len) { + rec++; + rec_off = 0; + } + EXPECT_EQ(buf[j], (*rec)->plain_data[rec_off]); + rec_off++; + } + } +}; + FIXTURE(tls_err) { int fd, cfd; From 8c5d95988c34f0aeba1f34cd5e4ba69494c90c5f Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Wed, 20 Aug 2025 12:09:18 +0530 Subject: [PATCH 70/81] Octeontx2-af: Skip overlap check for SPI field Octeontx2/CN10K silicon supports generating a 256-bit key per packet. The specific fields to be extracted from a packet for key generation are configurable via a Key Extraction (MKEX) Profile. The AF driver scans the configured extraction profile to ensure that fields from upper layers do not overwrite fields from lower layers in the key. Example Packet Field Layout: LA: DMAC + SMAC LB: VLAN LC: IPv4/IPv6 LD: TCP/UDP Valid MKEX Profile Configuration: LA -> DMAC -> key_offset[0-5] LC -> SIP -> key_offset[20-23] LD -> SPORT -> key_offset[30-31] Invalid MKEX profile configuration: LA -> DMAC -> key_offset[0-5] LC -> SIP -> key_offset[20-23] LD -> SPORT -> key_offset[2-3] // Overlaps with DMAC field In another scenario, if the MKEX profile is configured to extract the SPI field from both AH and ESP headers at the same key offset, the driver rejecting this configuration. In a regular traffic, ipsec packet will be having either AH(LD) or ESP (LE). This patch relaxes the check for the same. Fixes: 12aa0a3b93f3 ("octeontx2-af: Harden rule validation.") Signed-off-by: Hariprasad Kelam Link: https://patch.msgid.link/20250820063919.1463518-1-hkelam@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c index 1b765045aa63..b56395ac5a74 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c @@ -606,8 +606,8 @@ static void npc_set_features(struct rvu *rvu, int blkaddr, u8 intf) if (!npc_check_field(rvu, blkaddr, NPC_LB, intf)) *features &= ~BIT_ULL(NPC_OUTER_VID); - /* Set SPI flag only if AH/ESP and IPSEC_SPI are in the key */ - if (npc_check_field(rvu, blkaddr, NPC_IPSEC_SPI, intf) && + /* Allow extracting SPI field from AH and ESP headers at same offset */ + if (npc_is_field_present(rvu, NPC_IPSEC_SPI, intf) && (*features & (BIT_ULL(NPC_IPPROTO_ESP) | BIT_ULL(NPC_IPPROTO_AH)))) *features |= BIT_ULL(NPC_IPSEC_SPI); From 1c67f9c54cdc70627e3f6472b89cd3d895df974c Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 20 Aug 2025 15:27:07 +0200 Subject: [PATCH 71/81] net: pse-pd: pd692x0: Fix power budget leak in manager setup error path Fix a resource leak where manager power budgets were freed on both success and error paths during manager setup. Power budgets should only be freed on error paths after regulator registration or during driver removal. Refactor cleanup logic by extracting OF node cleanup and power budget freeing into separate helper functions for better maintainability. Fixes: 359754013e6a ("net: pse-pd: pd692x0: Add support for PSE PI priority feature") Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250820132708.837255-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/pd692x0.c | 59 +++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/drivers/net/pse-pd/pd692x0.c b/drivers/net/pse-pd/pd692x0.c index 399ce9febda4..395f6c662175 100644 --- a/drivers/net/pse-pd/pd692x0.c +++ b/drivers/net/pse-pd/pd692x0.c @@ -1162,12 +1162,44 @@ pd692x0_write_ports_matrix(struct pd692x0_priv *priv, return 0; } +static void pd692x0_of_put_managers(struct pd692x0_priv *priv, + struct pd692x0_manager *manager, + int nmanagers) +{ + int i, j; + + for (i = 0; i < nmanagers; i++) { + for (j = 0; j < manager[i].nports; j++) + of_node_put(manager[i].port_node[j]); + of_node_put(manager[i].node); + } +} + +static void pd692x0_managers_free_pw_budget(struct pd692x0_priv *priv) +{ + int i; + + for (i = 0; i < PD692X0_MAX_MANAGERS; i++) { + struct regulator *supply; + + if (!priv->manager_reg[i] || !priv->manager_pw_budget[i]) + continue; + + supply = priv->manager_reg[i]->supply; + if (!supply) + continue; + + regulator_free_power_budget(supply, + priv->manager_pw_budget[i]); + } +} + static int pd692x0_setup_pi_matrix(struct pse_controller_dev *pcdev) { struct pd692x0_manager *manager __free(kfree) = NULL; struct pd692x0_priv *priv = to_pd692x0_priv(pcdev); struct pd692x0_matrix port_matrix[PD692X0_MAX_PIS]; - int ret, i, j, nmanagers; + int ret, nmanagers; /* Should we flash the port matrix */ if (priv->fw_state != PD692X0_FW_OK && @@ -1185,31 +1217,27 @@ static int pd692x0_setup_pi_matrix(struct pse_controller_dev *pcdev) nmanagers = ret; ret = pd692x0_register_managers_regulator(priv, manager, nmanagers); if (ret) - goto out; + goto err_of_managers; ret = pd692x0_configure_managers(priv, nmanagers); if (ret) - goto out; + goto err_of_managers; ret = pd692x0_set_ports_matrix(priv, manager, nmanagers, port_matrix); if (ret) - goto out; + goto err_managers_req_pw; ret = pd692x0_write_ports_matrix(priv, port_matrix); if (ret) - goto out; + goto err_managers_req_pw; -out: - for (i = 0; i < nmanagers; i++) { - struct regulator *supply = priv->manager_reg[i]->supply; + pd692x0_of_put_managers(priv, manager, nmanagers); + return 0; - regulator_free_power_budget(supply, - priv->manager_pw_budget[i]); - - for (j = 0; j < manager[i].nports; j++) - of_node_put(manager[i].port_node[j]); - of_node_put(manager[i].node); - } +err_managers_req_pw: + pd692x0_managers_free_pw_budget(priv); +err_of_managers: + pd692x0_of_put_managers(priv, manager, nmanagers); return ret; } @@ -1748,6 +1776,7 @@ static void pd692x0_i2c_remove(struct i2c_client *client) { struct pd692x0_priv *priv = i2c_get_clientdata(client); + pd692x0_managers_free_pw_budget(priv); firmware_upload_unregister(priv->fwl); } From 7ef353879f714602b43f98662069f4fb86536761 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 20 Aug 2025 15:33:21 +0200 Subject: [PATCH 72/81] net: pse-pd: pd692x0: Skip power budget configuration when undefined If the power supply's power budget is not defined in the device tree, the current code still requests power and configures the PSE manager with a 0W power limit, which is undesirable behavior. Skip power budget configuration entirely when the budget is zero, avoiding unnecessary power requests and preventing invalid 0W limits from being set on the PSE manager. Fixes: 359754013e6a ("net: pse-pd: pd692x0: Add support for PSE PI priority feature") Signed-off-by: Kory Maincent Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20250820133321.841054-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/pd692x0.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/pse-pd/pd692x0.c b/drivers/net/pse-pd/pd692x0.c index 395f6c662175..f4e91ba64a66 100644 --- a/drivers/net/pse-pd/pd692x0.c +++ b/drivers/net/pse-pd/pd692x0.c @@ -1041,6 +1041,10 @@ pd692x0_configure_managers(struct pd692x0_priv *priv, int nmanagers) int pw_budget; pw_budget = regulator_get_unclaimed_power_budget(supply); + if (!pw_budget) + /* Do nothing if no power budget */ + continue; + /* Max power budget per manager */ if (pw_budget > 6000000) pw_budget = 6000000; From bc17455bc843b2f4b206e0bb8139013eb3d3c08b Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Wed, 20 Aug 2025 16:32:02 +0300 Subject: [PATCH 73/81] net/mlx5: Base ECVF devlink port attrs from 0 Adjust the vport number by the base ECVF vport number so the port attributes start at 0. Previously the port attributes would start 1 after the maximum number of host VFs. Fixes: dc13180824b7 ("net/mlx5: Enable devlink port for embedded cpu VF vports") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index b7102e14d23d..c33accadae0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -47,10 +47,12 @@ static void mlx5_esw_offloads_pf_vf_devlink_port_attrs_set(struct mlx5_eswitch * devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum, vport_num - 1, external); } else if (mlx5_core_is_ec_vf_vport(esw->dev, vport_num)) { + u16 base_vport = mlx5_core_ec_vf_vport_base(dev); + memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; devlink_port_attrs_pci_vf_set(dl_port, 0, pfnum, - vport_num - 1, false); + vport_num - base_vport, false); } } From 330f0f6713a39581936decac72331e6ab7f13529 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:03 +0300 Subject: [PATCH 74/81] net/mlx5: Remove default QoS group and attach vports directly to root TSAR Currently, the driver creates a default group (`node0`) and attaches all vports to it unless the user explicitly sets a parent group. As a result, when a user configures tx_share on a group and tx_share on a VF, the expectation is for the group and the VF to share bandwidth relatively. However, since the VF is not connected to the same parent (but to the default node), the proportional share logic is not applied correctly. To fix this, remove the default group (`node0`) and instead connect vports directly to the root TSAR when no parent is specified. This ensures that vports and groups share the same root scheduler and their tx_share values are compared directly under the same hierarchy. Fixes: 0fe132eac38c ("net/mlx5: E-switch, Allow to add vports to rate groups") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 97 +++++++------------ .../net/ethernet/mellanox/mlx5/core/eswitch.h | 5 - 2 files changed, 33 insertions(+), 69 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 91d863c8c152..cd58d3934596 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -462,6 +462,7 @@ static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, struct netlink_ext_ack *extack) { + struct mlx5_esw_sched_node *parent = vport_node->parent; u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; struct mlx5_core_dev *dev = vport_node->esw->dev; void *attr; @@ -477,7 +478,7 @@ esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport); MLX5_SET(scheduling_context, sched_ctx, parent_element_id, - vport_node->parent->ix); + parent ? parent->ix : vport_node->esw->qos.root_tsar_ix); MLX5_SET(scheduling_context, sched_ctx, max_average_bw, vport_node->max_rate); @@ -786,48 +787,15 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta return err; } - if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) { - esw->qos.node0 = __esw_qos_create_vports_sched_node(esw, NULL, extack); - } else { - /* The eswitch doesn't support scheduling nodes. - * Create a software-only node0 using the root TSAR to attach vport QoS to. - */ - if (!__esw_qos_alloc_node(esw, - esw->qos.root_tsar_ix, - SCHED_NODE_TYPE_VPORTS_TSAR, - NULL)) - esw->qos.node0 = ERR_PTR(-ENOMEM); - else - list_add_tail(&esw->qos.node0->entry, - &esw->qos.domain->nodes); - } - if (IS_ERR(esw->qos.node0)) { - err = PTR_ERR(esw->qos.node0); - esw_warn(dev, "E-Switch create rate node 0 failed (%d)\n", err); - goto err_node0; - } refcount_set(&esw->qos.refcnt, 1); return 0; - -err_node0: - if (mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, - esw->qos.root_tsar_ix)) - esw_warn(esw->dev, "E-Switch destroy root TSAR failed.\n"); - - return err; } static void esw_qos_destroy(struct mlx5_eswitch *esw) { int err; - if (esw->qos.node0->ix != esw->qos.root_tsar_ix) - __esw_qos_destroy_node(esw->qos.node0, NULL); - else - __esw_qos_free_node(esw->qos.node0); - esw->qos.node0 = NULL; - err = mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, esw->qos.root_tsar_ix); @@ -990,13 +958,16 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type, struct netlink_ext_ack *extack) { struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; - int err, new_level, max_level; + struct mlx5_esw_sched_node *parent = vport_node->parent; + int err; if (type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + int new_level, max_level; + /* Increase the parent's level by 2 to account for both the * TC arbiter and the vports TC scheduling element. */ - new_level = vport_node->parent->level + 2; + new_level = (parent ? parent->level : 2) + 2; max_level = 1 << MLX5_CAP_QOS(vport_node->esw->dev, log_esw_max_sched_depth); if (new_level > max_level) { @@ -1033,9 +1004,7 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type, err_sched_nodes: if (type == SCHED_NODE_TYPE_RATE_LIMITER) { esw_qos_node_destroy_sched_element(vport_node, NULL); - list_add_tail(&vport_node->entry, - &vport_node->parent->children); - vport_node->level = vport_node->parent->level + 1; + esw_qos_node_attach_to_parent(vport_node); } else { esw_qos_tc_arbiter_scheduling_teardown(vport_node, NULL); } @@ -1083,7 +1052,6 @@ static int esw_qos_set_vport_tcs_min_rate(struct mlx5_vport *vport, static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_ack *extack) { struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; - struct mlx5_esw_sched_node *parent = vport_node->parent; enum sched_node_type curr_type = vport_node->type; if (curr_type == SCHED_NODE_TYPE_VPORT) @@ -1093,7 +1061,7 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a vport_node->bw_share = 0; list_del_init(&vport_node->entry); - esw_qos_normalize_min_rate(parent->esw, parent, extack); + esw_qos_normalize_min_rate(vport_node->esw, vport_node->parent, extack); trace_mlx5_esw_vport_qos_destroy(vport_node->esw->dev, vport); } @@ -1103,25 +1071,23 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) { + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; int err; esw_assert_qos_lock_held(vport->dev->priv.eswitch); - esw_qos_node_set_parent(vport->qos.sched_node, parent); - if (type == SCHED_NODE_TYPE_VPORT) { - err = esw_qos_vport_create_sched_element(vport->qos.sched_node, - extack); - } else { + esw_qos_node_set_parent(vport_node, parent); + if (type == SCHED_NODE_TYPE_VPORT) + err = esw_qos_vport_create_sched_element(vport_node, extack); + else err = esw_qos_vport_tc_enable(vport, type, extack); - } if (err) return err; - vport->qos.sched_node->type = type; - esw_qos_normalize_min_rate(parent->esw, parent, extack); - trace_mlx5_esw_vport_qos_create(vport->dev, vport, - vport->qos.sched_node->max_rate, - vport->qos.sched_node->bw_share); + vport_node->type = type; + esw_qos_normalize_min_rate(vport_node->esw, parent, extack); + trace_mlx5_esw_vport_qos_create(vport->dev, vport, vport_node->max_rate, + vport_node->bw_share); return 0; } @@ -1132,6 +1098,7 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t { struct mlx5_eswitch *esw = vport->dev->priv.eswitch; struct mlx5_esw_sched_node *sched_node; + struct mlx5_eswitch *parent_esw; int err; esw_assert_qos_lock_held(esw); @@ -1139,10 +1106,12 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t if (err) return err; - parent = parent ?: esw->qos.node0; - sched_node = __esw_qos_alloc_node(parent->esw, 0, type, parent); + parent_esw = parent ? parent->esw : esw; + sched_node = __esw_qos_alloc_node(parent_esw, 0, type, parent); if (!sched_node) return -ENOMEM; + if (!parent) + list_add_tail(&sched_node->entry, &esw->qos.domain->nodes); sched_node->max_rate = max_rate; sched_node->min_rate = min_rate; @@ -1168,7 +1137,7 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) goto unlock; parent = vport->qos.sched_node->parent; - WARN(parent != esw->qos.node0, "Disabling QoS on port before detaching it from node"); + WARN(parent, "Disabling QoS on port before detaching it from node"); esw_qos_vport_disable(vport, NULL); mlx5_esw_qos_vport_qos_free(vport); @@ -1268,7 +1237,6 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, int err; esw_assert_qos_lock_held(vport->dev->priv.eswitch); - parent = parent ?: curr_parent; if (curr_type == type && curr_parent == parent) return 0; @@ -1306,16 +1274,16 @@ static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw esw_assert_qos_lock_held(esw); curr_parent = vport->qos.sched_node->parent; - parent = parent ?: esw->qos.node0; if (curr_parent == parent) return 0; /* Set vport QoS type based on parent node type if different from * default QoS; otherwise, use the vport's current QoS type. */ - if (parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) type = SCHED_NODE_TYPE_RATE_LIMITER; - else if (curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + else if (curr_parent && + curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) type = SCHED_NODE_TYPE_VPORT; else type = vport->qos.sched_node->type; @@ -1654,9 +1622,10 @@ static bool esw_qos_validate_unsupported_tc_bw(struct mlx5_eswitch *esw, static bool esw_qos_vport_validate_unsupported_tc_bw(struct mlx5_vport *vport, u32 *tc_bw) { - struct mlx5_eswitch *esw = vport->qos.sched_node ? - vport->qos.sched_node->parent->esw : - vport->dev->priv.eswitch; + struct mlx5_esw_sched_node *node = vport->qos.sched_node; + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + esw = (node && node->parent) ? node->parent->esw : esw; return esw_qos_validate_unsupported_tc_bw(esw, tc_bw); } @@ -1763,7 +1732,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, if (disable) { if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT, - NULL, extack); + vport_node->parent, extack); goto unlock; } @@ -1775,7 +1744,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, } else { err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_TC_ARBITER_TSAR, - NULL, extack); + vport_node->parent, extack); } if (!err) esw_qos_set_tc_arbiter_bw_shares(vport_node, tc_bw, extack); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index b0b8ef3ec3c4..45506ad56847 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -373,11 +373,6 @@ struct mlx5_eswitch { refcount_t refcnt; u32 root_tsar_ix; struct mlx5_qos_domain *domain; - /* Contains all vports with QoS enabled but no explicit node. - * Cannot be NULL if QoS is enabled, but may be a fake node - * referencing the root TSAR if the esw doesn't support nodes. - */ - struct mlx5_esw_sched_node *node0; } qos; struct mlx5_esw_bridge_offloads *br_offloads; From e8f973576ca5387ffd2917b8ae661d3f9acde526 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:04 +0300 Subject: [PATCH 75/81] net/mlx5e: Preserve tc-bw during parent changes When changing parent of a node/leaf with tc-bw configured, the code saves and restores tc-bw values. However, it was reading the converted hardware bw_share values (where 0 becomes 1) instead of the original user values, causing incorrect tc-bw calculations after parent change. Store original tc-bw values in the node structure and use them directly for save/restore operations. Fixes: cf7e73770d1b ("net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index cd58d3934596..4ed5968f1638 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -102,6 +102,8 @@ struct mlx5_esw_sched_node { u8 level; /* Valid only when this node represents a traffic class. */ u8 tc; + /* Valid only for a TC arbiter node or vport TC arbiter. */ + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node) @@ -609,10 +611,7 @@ static void esw_qos_tc_arbiter_get_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, u32 *tc_bw) { - struct mlx5_esw_sched_node *vports_tc_node; - - list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) - tc_bw[vports_tc_node->tc] = vports_tc_node->bw_share; + memcpy(tc_bw, tc_arbiter_node->tc_bw, sizeof(tc_arbiter_node->tc_bw)); } static void @@ -629,6 +628,7 @@ esw_qos_set_tc_arbiter_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, u8 tc = vports_tc_node->tc; u32 bw_share; + tc_arbiter_node->tc_bw[tc] = tc_bw[tc]; bw_share = tc_bw[tc] * fw_max_bw_share; bw_share = esw_qos_calc_bw_share(bw_share, divider, fw_max_bw_share); @@ -1060,6 +1060,7 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a esw_qos_vport_tc_disable(vport, extack); vport_node->bw_share = 0; + memset(vport_node->tc_bw, 0, sizeof(vport_node->tc_bw)); list_del_init(&vport_node->entry); esw_qos_normalize_min_rate(vport_node->esw, vport_node->parent, extack); @@ -1231,8 +1232,9 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) { - struct mlx5_esw_sched_node *curr_parent = vport->qos.sched_node->parent; - enum sched_node_type curr_type = vport->qos.sched_node->type; + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + struct mlx5_esw_sched_node *curr_parent = vport_node->parent; + enum sched_node_type curr_type = vport_node->type; u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0}; int err; @@ -1244,10 +1246,8 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, if (err) return err; - if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) { - esw_qos_tc_arbiter_get_bw_shares(vport->qos.sched_node, - curr_tc_bw); - } + if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) + esw_qos_tc_arbiter_get_bw_shares(vport_node, curr_tc_bw); esw_qos_vport_disable(vport, extack); @@ -1258,8 +1258,8 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, } if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) { - esw_qos_set_tc_arbiter_bw_shares(vport->qos.sched_node, - curr_tc_bw, extack); + esw_qos_set_tc_arbiter_bw_shares(vport_node, curr_tc_bw, + extack); } return err; From b697ef4d1d136948d282384e6cc3d1af469ea123 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:05 +0300 Subject: [PATCH 76/81] net/mlx5: Destroy vport QoS element when no configuration remains If a VF has been configured and the user later clears all QoS settings, the vport element remains in the firmware QoS tree. This leads to inconsistent behavior compared to VFs that were never configured, since the FW assumes that unconfigured VFs are outside the QoS hierarchy. As a result, the bandwidth share across VFs may differ, even though none of them appear to have any configuration. Align the driver behavior with the FW expectation by destroying the vport QoS element when all configurations are removed. Fixes: c9497c98901c ("net/mlx5: Add support for setting VF min rate") Fixes: cf7e73770d1b ("net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20250820133209.389065-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 57 ++++++++++++++++--- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 4ed5968f1638..452a948a3e6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1127,6 +1127,19 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t return err; } +static void mlx5_esw_qos_vport_disable_locked(struct mlx5_vport *vport) +{ + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + esw_assert_qos_lock_held(esw); + if (!vport->qos.sched_node) + return; + + esw_qos_vport_disable(vport, NULL); + mlx5_esw_qos_vport_qos_free(vport); + esw_qos_put(esw); +} + void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) { struct mlx5_eswitch *esw = vport->dev->priv.eswitch; @@ -1140,9 +1153,7 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) parent = vport->qos.sched_node->parent; WARN(parent, "Disabling QoS on port before detaching it from node"); - esw_qos_vport_disable(vport, NULL); - mlx5_esw_qos_vport_qos_free(vport); - esw_qos_put(esw); + mlx5_esw_qos_vport_disable_locked(vport); unlock: esw_qos_unlock(esw); } @@ -1642,6 +1653,21 @@ static bool esw_qos_tc_bw_disabled(u32 *tc_bw) return true; } +static void esw_vport_qos_prune_empty(struct mlx5_vport *vport) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + + esw_assert_qos_lock_held(vport->dev->priv.eswitch); + if (!vport_node) + return; + + if (vport_node->parent || vport_node->max_rate || + vport_node->min_rate || !esw_qos_tc_bw_disabled(vport_node->tc_bw)) + return; + + mlx5_esw_qos_vport_disable_locked(vport); +} + int mlx5_esw_qos_init(struct mlx5_eswitch *esw) { if (esw->qos.domain) @@ -1675,6 +1701,10 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void esw_qos_lock(esw); err = mlx5_esw_qos_set_vport_min_rate(vport, tx_share, extack); + if (err) + goto out; + esw_vport_qos_prune_empty(vport); +out: esw_qos_unlock(esw); return err; } @@ -1696,6 +1726,10 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void * esw_qos_lock(esw); err = mlx5_esw_qos_set_vport_max_rate(vport, tx_max, extack); + if (err) + goto out; + esw_vport_qos_prune_empty(vport); +out: esw_qos_unlock(esw); return err; } @@ -1733,6 +1767,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT, vport_node->parent, extack); + esw_vport_qos_prune_empty(vport); goto unlock; } @@ -1893,14 +1928,20 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate, void *priv, void *parent_priv, struct netlink_ext_ack *extack) { - struct mlx5_esw_sched_node *node; + struct mlx5_esw_sched_node *node = parent ? parent_priv : NULL; struct mlx5_vport *vport = priv; + int err; - if (!parent) - return mlx5_esw_qos_vport_update_parent(vport, NULL, extack); + err = mlx5_esw_qos_vport_update_parent(vport, node, extack); + if (!err) { + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; - node = parent_priv; - return mlx5_esw_qos_vport_update_parent(vport, node, extack); + esw_qos_lock(esw); + esw_vport_qos_prune_empty(vport); + esw_qos_unlock(esw); + } + + return err; } static bool esw_qos_is_node_empty(struct mlx5_esw_sched_node *node) From 3c114fb2afe493066df5b9e560ef37216b153c90 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:06 +0300 Subject: [PATCH 77/81] net/mlx5: Fix QoS reference leak in vport enable error path Add missing esw_qos_put() call when __esw_qos_alloc_node() fails in mlx5_esw_qos_vport_enable(). Fixes: be034baba83e ("net/mlx5: Make vport QoS enablement more flexible for future extensions") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-6-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 452a948a3e6d..41aec07bb6c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1109,8 +1109,10 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t parent_esw = parent ? parent->esw : esw; sched_node = __esw_qos_alloc_node(parent_esw, 0, type, parent); - if (!sched_node) + if (!sched_node) { + esw_qos_put(esw); return -ENOMEM; + } if (!parent) list_add_tail(&sched_node->entry, &esw->qos.domain->nodes); From 51b17c98e3dbb2093a81b0490050a0eaa919ebee Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:07 +0300 Subject: [PATCH 78/81] net/mlx5: Restore missing scheduling node cleanup on vport enable failure Restore the __esw_qos_free_node() call removed by the offending commit. Fixes: 97733d1e00a0 ("net/mlx5: Add traffic class scheduling support for vport QoS") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-7-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 41aec07bb6c2..8b4977650183 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1122,6 +1122,7 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t vport->qos.sched_node = sched_node; err = esw_qos_vport_enable(vport, type, parent, extack); if (err) { + __esw_qos_free_node(sched_node); esw_qos_put(esw); vport->qos.sched_node = NULL; } From 451d2849ea66659040b59ae3cb7e50cc97404733 Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Wed, 20 Aug 2025 16:32:08 +0300 Subject: [PATCH 79/81] net/mlx5e: Query FW for buffer ownership The SW currently saves local buffer ownership when setting the buffer. This means that the SW assumes it has ownership of the buffer after the command is set. If setting the buffer fails and we remain in FW ownership, the local buffer ownership state incorrectly remains as SW-owned. This leads to incorrect behavior in subsequent PFC commands, causing failures. Instead of saving local buffer ownership in SW, query the FW for buffer ownership when setting the buffer. This ensures that the buffer ownership state is accurately reflected, avoiding the issues caused by incorrect ownership states. Fixes: ecdf2dadee8e ("net/mlx5e: Receive buffer support for DCBX") Signed-off-by: Alexei Lazar Reviewed-by: Shahar Shitrit Reviewed-by: Dragos Tatulea Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-8-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/en/dcbnl.h | 1 - .../ethernet/mellanox/mlx5/core/en_dcbnl.c | 12 ++++++++--- .../ethernet/mellanox/mlx5/core/mlx5_core.h | 2 ++ .../net/ethernet/mellanox/mlx5/core/port.c | 20 +++++++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h index b59aee75de94..2c98a5299df3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h @@ -26,7 +26,6 @@ struct mlx5e_dcbx { u8 cap; /* Buffer configuration */ - bool manual_buffer; u32 cable_len; u32 xoff; u16 port_buff_cell_sz; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 5fe016e477b3..d166c0d5189e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -362,6 +362,7 @@ static int mlx5e_dcbnl_ieee_getpfc(struct net_device *dev, static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) { + u8 buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN; struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; u32 old_cable_len = priv->dcbx.cable_len; @@ -389,7 +390,14 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, if (MLX5_BUFFER_SUPPORTED(mdev)) { pfc_new.pfc_en = (changed & MLX5E_PORT_BUFFER_PFC) ? pfc->pfc_en : curr_pfc_en; - if (priv->dcbx.manual_buffer) + ret = mlx5_query_port_buffer_ownership(mdev, + &buffer_ownership); + if (ret) + netdev_err(dev, + "%s, Failed to get buffer ownership: %d\n", + __func__, ret); + + if (buffer_ownership == MLX5_BUF_OWNERSHIP_SW_OWNED) ret = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, &pfc_new, NULL, NULL); @@ -982,7 +990,6 @@ static int mlx5e_dcbnl_setbuffer(struct net_device *dev, if (!changed) return 0; - priv->dcbx.manual_buffer = true; err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL, buffer_size, prio2buffer); return err; @@ -1252,7 +1259,6 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) priv->dcbx.cap |= DCB_CAP_DCBX_HOST; priv->dcbx.port_buff_cell_sz = mlx5e_query_port_buffers_cell_size(priv); - priv->dcbx.manual_buffer = false; priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN; mlx5e_ets_init(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index b6d53db27cd5..9d3504f5abfa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -367,6 +367,8 @@ int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); +int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev, + u8 *buffer_ownership); int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 549f1066d2a5..2d7adf7444ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -968,6 +968,26 @@ int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state) return err; } +int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev, + u8 *buffer_ownership) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)] = {}; + int err; + + if (!MLX5_CAP_PCAM_FEATURE(mdev, buffer_ownership)) { + *buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN; + return 0; + } + + err = mlx5_query_pfcc_reg(mdev, out, sizeof(out)); + if (err) + return err; + + *buffer_ownership = MLX5_GET(pfcc_reg, out, buf_ownership); + + return 0; +} + int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio) { int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); From 8b0587a885fdb34fd6090a3f8625cb7ac1444826 Mon Sep 17 00:00:00 2001 From: Armen Ratner Date: Wed, 20 Aug 2025 16:32:09 +0300 Subject: [PATCH 80/81] net/mlx5e: Preserve shared buffer capacity during headroom updates When port buffer headroom changes, port_update_shared_buffer() recalculates the shared buffer size and splits it in a 3:1 ratio (lossy:lossless) - Currently, the calculation is: lossless = shared / 4; lossy = (shared / 4) * 3; Meaning, the calculation dropped the remainder of shared % 4 due to integer division, unintentionally reducing the total shared buffer by up to three cells on each update. Over time, this could shrink the buffer below usable size. Fix it by changing the calculation to: lossless = shared / 4; lossy = shared - lossless; This retains all buffer cells while still approximating the intended 3:1 split, preventing capacity loss over time. While at it, perform headroom calculations in units of cells rather than in bytes for more accurate calculations avoiding extra divisions. Fixes: a440030d8946 ("net/mlx5e: Update shared buffer along with device buffer changes") Signed-off-by: Armen Ratner Signed-off-by: Maher Sanalla Reviewed-by: Tariq Toukan Signed-off-by: Alexei Lazar Signed-off-by: Mark Bloch Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20250820133209.389065-9-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/en/port_buffer.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index 5ae787656a7c..3efa8bf1d14e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -272,8 +272,8 @@ static int port_update_shared_buffer(struct mlx5_core_dev *mdev, /* Total shared buffer size is split in a ratio of 3:1 between * lossy and lossless pools respectively. */ - lossy_epool_size = (shared_buffer_size / 4) * 3; lossless_ipool_size = shared_buffer_size / 4; + lossy_epool_size = shared_buffer_size - lossless_ipool_size; mlx5e_port_set_sbpr(mdev, 0, MLX5_EGRESS_DIR, MLX5_LOSSY_POOL, 0, lossy_epool_size); @@ -288,14 +288,12 @@ static int port_set_buffer(struct mlx5e_priv *priv, u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); - u32 new_headroom_size = 0; - u32 current_headroom_size; + u32 current_headroom_cells = 0; + u32 new_headroom_cells = 0; void *in; int err; int i; - current_headroom_size = port_buffer->headroom_size; - in = kzalloc(sz, GFP_KERNEL); if (!in) return -ENOMEM; @@ -306,12 +304,14 @@ static int port_set_buffer(struct mlx5e_priv *priv, for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); + current_headroom_cells += MLX5_GET(bufferx_reg, buffer, size); + u64 size = port_buffer->buffer[i].size; u64 xoff = port_buffer->buffer[i].xoff; u64 xon = port_buffer->buffer[i].xon; - new_headroom_size += size; do_div(size, port_buff_cell_sz); + new_headroom_cells += size; do_div(xoff, port_buff_cell_sz); do_div(xon, port_buff_cell_sz); MLX5_SET(bufferx_reg, buffer, size, size); @@ -320,10 +320,8 @@ static int port_set_buffer(struct mlx5e_priv *priv, MLX5_SET(bufferx_reg, buffer, xon_threshold, xon); } - new_headroom_size /= port_buff_cell_sz; - current_headroom_size /= port_buff_cell_sz; - err = port_update_shared_buffer(priv->mdev, current_headroom_size, - new_headroom_size); + err = port_update_shared_buffer(priv->mdev, current_headroom_cells, + new_headroom_cells); if (err) goto out; From 91a79b792204313153e1bdbbe5acbfc28903b3a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Aug 2025 14:37:07 +0200 Subject: [PATCH 81/81] netfilter: nf_reject: don't leak dst refcount for loopback packets recent patches to add a WARN() when replacing skb dst entry found an old bug: WARNING: include/linux/skbuff.h:1165 skb_dst_check_unset include/linux/skbuff.h:1164 [inline] WARNING: include/linux/skbuff.h:1165 skb_dst_set include/linux/skbuff.h:1210 [inline] WARNING: include/linux/skbuff.h:1165 nf_reject_fill_skb_dst+0x2a4/0x330 net/ipv4/netfilter/nf_reject_ipv4.c:234 [..] Call Trace: nf_send_unreach+0x17b/0x6e0 net/ipv4/netfilter/nf_reject_ipv4.c:325 nft_reject_inet_eval+0x4bc/0x690 net/netfilter/nft_reject_inet.c:27 expr_call_ops_eval net/netfilter/nf_tables_core.c:237 [inline] .. This is because blamed commit forgot about loopback packets. Such packets already have a dst_entry attached, even at PRE_ROUTING stage. Instead of checking hook just check if the skb already has a route attached to it. Fixes: f53b9b0bdc59 ("netfilter: introduce support for reject at prerouting stage") Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20250820123707.10671-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- net/ipv4/netfilter/nf_reject_ipv4.c | 6 ++---- net/ipv6/netfilter/nf_reject_ipv6.c | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 87fd945a0d27..0d3cb2ba6fc8 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -247,8 +247,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, if (!oth) return; - if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(oldskb) < 0) + if (!skb_dst(oldskb) && nf_reject_fill_skb_dst(oldskb) < 0) return; if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -321,8 +320,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; - if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(skb_in) < 0) + if (!skb_dst(skb_in) && nf_reject_fill_skb_dst(skb_in) < 0) return; if (skb_csum_unnecessary(skb_in) || diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 838295fa32e3..cb2d38e80de9 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -293,7 +293,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; - if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) { + if (!skb_dst(oldskb)) { nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (!dst) return; @@ -397,8 +397,7 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; - if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) && - nf_reject6_fill_skb_dst(skb_in) < 0) + if (!skb_dst(skb_in) && nf_reject6_fill_skb_dst(skb_in) < 0) return; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);