From b2a38f6df9dab0b05858746edcbe2403f8f4e4ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:43 +0000 Subject: [PATCH 01/14] net_sched: make room for (struct qdisc_skb_cb)->pkt_segs Add a new u16 field, next to pkt_len : pkt_segs This will cache shinfo->gso_segs to speed up qdisc deqeue(). Move slave_dev_queue_mapping at the end of qdisc_skb_cb, and move three bits from tc_skb_cb : - post_ct - post_ct_snat - post_ct_dnat Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 18 +++++++++--------- net/core/dev.c | 2 +- net/sched/act_ct.c | 8 ++++---- net/sched/cls_api.c | 6 +++--- net/sched/cls_flower.c | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 94966692ccdf..9cd8b5d4b236 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -429,13 +429,16 @@ struct tcf_proto { }; struct qdisc_skb_cb { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; + unsigned int pkt_len; + u16 pkt_segs; + u16 tc_classid; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; + + u16 slave_dev_queue_mapping; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); @@ -1064,11 +1067,8 @@ struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; u32 drop_reason; - u16 zone; /* Only valid if post_ct = true */ + u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */ u16 mru; - u8 post_ct:1; - u8 post_ct_snat:1; - u8 post_ct_dnat:1; }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index 69515edd17bc..46ce6c610780 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4355,7 +4355,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, return ret; tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6749a4a9a9cd..2b6ac7069dc1 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -948,9 +948,9 @@ static int tcf_ct_act_nat(struct sk_buff *skb, return err & NF_VERDICT_MASK; if (action & BIT(NF_NAT_MANIP_SRC)) - tc_skb_cb(skb)->post_ct_snat = 1; + qdisc_skb_cb(skb)->post_ct_snat = 1; if (action & BIT(NF_NAT_MANIP_DST)) - tc_skb_cb(skb)->post_ct_dnat = 1; + qdisc_skb_cb(skb)->post_ct_dnat = 1; return err; #else @@ -986,7 +986,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tcf_action_update_bstats(&c->common, skb); if (clear) { - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_ct_put(ct); @@ -1097,7 +1097,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, out_push: skb_push_rcsum(skb, nh_ofs); - tc_skb_cb(skb)->post_ct = true; + qdisc_skb_cb(skb)->post_ct = true; tc_skb_cb(skb)->zone = p->zone; out_clear: if (defrag) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f751cd5eeac8..ebca4b926dcf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1872,9 +1872,9 @@ int tcf_classify(struct sk_buff *skb, } ext->chain = last_executed_chain; ext->mru = cb->mru; - ext->post_ct = cb->post_ct; - ext->post_ct_snat = cb->post_ct_snat; - ext->post_ct_dnat = cb->post_ct_dnat; + ext->post_ct = qdisc_skb_cb(skb)->post_ct; + ext->post_ct_snat = qdisc_skb_cb(skb)->post_ct_snat; + ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat; ext->zone = cb->zone; } } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 099ff6a3e1f5..7669371c1354 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -326,7 +326,7 @@ TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - bool post_ct = tc_skb_cb(skb)->post_ct; + bool post_ct = qdisc_skb_cb(skb)->post_ct; u16 zone = tc_skb_cb(skb)->zone; struct fl_flow_key skb_key; struct fl_flow_mask *mask; From be1b70ab21cbe8f8bb8ecf39eb34880882252773 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:44 +0000 Subject: [PATCH 02/14] net: init shinfo->gso_segs from qdisc_pkt_len_init() Qdisc use shinfo->gso_segs for their pkts stats in bstats_update(), but this field needs to be initialized for SKB_GSO_DODGY users. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-3-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 46ce6c610780..dba9eef8bd83 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4071,7 +4071,7 @@ EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); + struct skb_shared_info *shinfo = skb_shinfo(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; @@ -4112,6 +4112,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) if (payload <= 0) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); + shinfo->gso_segs = gso_segs; } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } From 874c1928d37297a02452a404c8b496aad9fee146 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:45 +0000 Subject: [PATCH 03/14] net_sched: initialize qdisc_skb_cb(skb)->pkt_segs in qdisc_pkt_len_init() qdisc_pkt_len_init() is currently initalizing qdisc_skb_cb(skb)->pkt_len. Add qdisc_skb_cb(skb)->pkt_segs initialization and rename this function to qdisc_pkt_len_segs_init(). Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-4-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 15 +++++++++++---- net/sched/sch_cake.c | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index dba9eef8bd83..895c3e37e686 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4069,17 +4069,23 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); -static void qdisc_pkt_len_init(struct sk_buff *skb) +static void qdisc_pkt_len_segs_init(struct sk_buff *skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); + u16 gso_segs; qdisc_skb_cb(skb)->pkt_len = skb->len; + if (!shinfo->gso_size) { + qdisc_skb_cb(skb)->pkt_segs = 1; + return; + } + + qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ - if (shinfo->gso_size && skb_transport_header_was_set(skb)) { - u16 gso_segs = shinfo->gso_segs; + if (skb_transport_header_was_set(skb)) { unsigned int hdr_len; /* mac layer + network layer */ @@ -4113,6 +4119,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); shinfo->gso_segs = gso_segs; + qdisc_skb_cb(skb)->pkt_segs = gso_segs; } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } @@ -4738,7 +4745,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); - qdisc_pkt_len_init(skb); + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 32bacfc314c2..9213129f0de1 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1406,7 +1406,7 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) if (!shinfo->gso_size) return cake_calc_overhead(q, len, off); - /* borrowed from qdisc_pkt_len_init() */ + /* borrowed from qdisc_pkt_len_segs_init() */ if (!skb->encapsulation) hdr_len = skb_transport_offset(skb); else From f9e00e51e391d08de31ca98d9f8609a1bceec2d2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:46 +0000 Subject: [PATCH 04/14] net: use qdisc_pkt_len_segs_init() in sch_handle_ingress() sch_handle_ingress() sets qdisc_skb_cb(skb)->pkt_len. We also need to initialize qdisc_skb_cb(skb)->pkt_segs. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-5-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 895c3e37e686..e19eb4e9d77c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4434,7 +4434,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, *pt_prev = NULL; } - qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { From 2773cb0b3120eb5c4b66d949eb99853d5bae1221 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:47 +0000 Subject: [PATCH 05/14] net_sched: use qdisc_skb_cb(skb)->pkt_segs in bstats_update() Avoid up to two cache line misses in qdisc dequeue() to fetch skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held. This gives a 5 % improvement in a TX intensive workload. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-6-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 13 ++++++++++--- net/sched/sch_cake.c | 1 + net/sched/sch_dualpi2.c | 1 + net/sched/sch_netem.c | 1 + net/sched/sch_qfq.c | 2 +- net/sched/sch_taprio.c | 1 + net/sched/sch_tbf.c | 1 + 7 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9cd8b5d4b236..cdf7a58ebcf5 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -829,6 +829,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) return qdisc_skb_cb(skb)->pkt_len; } +static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb) +{ + u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs; + + DEBUG_NET_WARN_ON_ONCE(pkt_segs != + (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1)); + return pkt_segs; +} + /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, @@ -870,9 +879,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { - _bstats_update(bstats, - qdisc_pkt_len(skb), - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); + _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb)); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 9213129f0de1..a20880034aa5 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1800,6 +1800,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, segs); diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 4b975feb52b1..6d7e6389758d 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -475,6 +475,7 @@ static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb */ qdisc_skb_cb(nskb)->pkt_len = nskb->len; + qdisc_skb_cb(nskb)->pkt_segs = 1; dualpi2_skb_cb(nskb)->classified = dualpi2_skb_cb(skb)->classified; dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index eafc316ae319..32a5f3304046 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -429,6 +429,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); + qdisc_skb_cb(skb)->pkt_segs = 1; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 2255355e51d3..d920f57dc6d7 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1250,7 +1250,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } } - gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; + gso_segs = qdisc_pkt_segs(skb); err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 39b735386996..300d577b3286 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -595,6 +595,7 @@ static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; slen += segs->len; /* FIXME: we should be segmenting to a smaller size diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 4c977f049670..f2340164f579 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -221,6 +221,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, skb_mark_not_on_list(segs); seg_len = segs->len; qdisc_skb_cb(segs)->pkt_len = seg_len; + qdisc_skb_cb(segs)->pkt_segs = 1; ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) From c5d34f4583ea883b8d3441ded83e3a7207e0182d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:48 +0000 Subject: [PATCH 06/14] net_sched: cake: use qdisc_pkt_segs() Use new qdisc_pkt_segs() to avoid a cache line miss in cake_enqueue() for non GSO packets. cake_overhead() does not have to recompute it. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-7-edumazet@google.com Signed-off-by: Paolo Abeni --- net/sched/sch_cake.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index a20880034aa5..5948a149129c 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1398,12 +1398,12 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int hdr_len, last_len = 0; u32 off = skb_network_offset(skb); + u16 segs = qdisc_pkt_segs(skb); u32 len = qdisc_pkt_len(skb); - u16 segs = 1; q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); - if (!shinfo->gso_size) + if (segs == 1) return cake_calc_overhead(q, len, off); /* borrowed from qdisc_pkt_len_segs_init() */ @@ -1430,12 +1430,6 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) hdr_len += sizeof(struct udphdr); } - if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) - segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); - else - segs = shinfo->gso_segs; - len = shinfo->gso_size + hdr_len; last_len = skb->len - shinfo->gso_size * (segs - 1); @@ -1788,7 +1782,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(len > b->max_skblen)) b->max_skblen = len; - if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int slen = 0, numsegs = 0; From ad50d5a3fc20327e133e2db849c6e67fc49650e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:49 +0000 Subject: [PATCH 07/14] net_sched: add Qdisc_read_mostly and Qdisc_write groups It is possible to reorg Qdisc to avoid always dirtying 2 cache lines in fast path by reducing this to a single dirtied cache line. In current layout, we change only four/six fields in the first cache line: - q.spinlock - q.qlen - bstats.bytes - bstats.packets - some Qdisc also change q.next/q.prev In the second cache line we change in the fast path: - running - state - qstats.backlog /* --- cacheline 2 boundary (128 bytes) --- */ struct sk_buff_head gso_skb __attribute__((__aligned__(64))); /* 0x80 0x18 */ struct qdisc_skb_head q; /* 0x98 0x18 */ struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xb0 0x10 */ /* --- cacheline 3 boundary (192 bytes) --- */ struct gnet_stats_queue qstats; /* 0xc0 0x14 */ bool running; /* 0xd4 0x1 */ /* XXX 3 bytes hole, try to pack */ unsigned long state; /* 0xd8 0x8 */ struct Qdisc * next_sched; /* 0xe0 0x8 */ struct sk_buff_head skb_bad_txq; /* 0xe8 0x18 */ /* --- cacheline 4 boundary (256 bytes) --- */ Reorganize things to have a first cache line mostly read, then a mostly written one. This gives a ~3% increase of performance under tx stress. Note that there is an additional hole because @qstats now spans over a third cache line. /* --- cacheline 2 boundary (128 bytes) --- */ __u8 __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /* 0x80 0 */ struct sk_buff_head gso_skb; /* 0x80 0x18 */ struct Qdisc * next_sched; /* 0x98 0x8 */ struct sk_buff_head skb_bad_txq; /* 0xa0 0x18 */ __u8 __cacheline_group_end__Qdisc_read_mostly[0]; /* 0xb8 0 */ /* XXX 8 bytes hole, try to pack */ /* --- cacheline 3 boundary (192 bytes) --- */ __u8 __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /* 0xc0 0 */ struct qdisc_skb_head q; /* 0xc0 0x18 */ unsigned long state; /* 0xd8 0x8 */ struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xe0 0x10 */ bool running; /* 0xf0 0x1 */ /* XXX 3 bytes hole, try to pack */ struct gnet_stats_queue qstats; /* 0xf4 0x14 */ /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ __u8 __cacheline_group_end__Qdisc_write[0]; /* 0x108 0 */ /* XXX 56 bytes hole, try to pack */ Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-8-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cdf7a58ebcf5..79501499dafb 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -103,17 +103,24 @@ struct Qdisc { int pad; refcount_t refcnt; - /* - * For performance sake on SMP, we put highly modified fields at the end - */ - struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; - struct qdisc_skb_head q; - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - bool running; /* must be written under qdisc spinlock */ - unsigned long state; - struct Qdisc *next_sched; - struct sk_buff_head skb_bad_txq; + /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */ + __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned; + struct sk_buff_head gso_skb; + struct Qdisc *next_sched; + struct sk_buff_head skb_bad_txq; + __cacheline_group_end(Qdisc_read_mostly); + + /* Fields dirtied in dequeue() fast path. */ + __cacheline_group_begin(Qdisc_write) ____cacheline_aligned; + struct qdisc_skb_head q; + unsigned long state; + struct gnet_stats_basic_sync bstats; + bool running; /* must be written under qdisc spinlock */ + + /* Note : we only change qstats.backlog in fast path. */ + struct gnet_stats_queue qstats; + __cacheline_group_end(Qdisc_write); + atomic_long_t defer_count ____cacheline_aligned_in_smp; struct llist_head defer_list; From 3c1100f042c006cae6c241028cc4c69e1a70483f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:50 +0000 Subject: [PATCH 08/14] net_sched: sch_fq: move qdisc_bstats_update() to fq_dequeue_skb() Group together changes to qdisc fields to reduce chances of false sharing if another cpu attempts to acquire the qdisc spinlock. qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; qdisc_bstats_update(sch, skb); Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-9-edumazet@google.com Signed-off-by: Paolo Abeni --- net/sched/sch_fq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index fee922da2f99..0b0ca1aa9251 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -497,6 +497,7 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, skb_mark_not_on_list(skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; + qdisc_bstats_update(sch, skb); } static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) @@ -776,7 +777,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) f->time_next_packet = now + len; } out: - qdisc_bstats_update(sch, skb); return skb; } From 2f9babc04d74cbf984f0cb5b6e20bd78fdf32997 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:51 +0000 Subject: [PATCH 09/14] net_sched: sch_fq: prefetch one skb ahead in dequeue() prefetch the skb that we are likely to dequeue at the next dequeue(). Also call fq_dequeue_skb() a bit sooner in fq_dequeue(). This reduces the window between read of q.qlen and changes of fields in the cache line that could be dirtied by another cpu trying to queue a packet. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-10-edumazet@google.com Signed-off-by: Paolo Abeni --- net/sched/sch_fq.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 0b0ca1aa9251..6e5f2f4f2415 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -480,7 +480,10 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, struct sk_buff *skb) { if (skb == flow->head) { - flow->head = skb->next; + struct sk_buff *next = skb->next; + + prefetch(next); + flow->head = next; } else { rb_erase(&skb->rbnode, &flow->t_root); skb->dev = qdisc_dev(sch); @@ -712,6 +715,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) goto begin; } prefetch(&skb->end); + fq_dequeue_skb(sch, f, skb); if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; @@ -719,7 +723,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) if (--f->qlen == 0) q->inactive_flows++; q->band_pkt_count[fq_skb_cb(skb)->band]--; - fq_dequeue_skb(sch, f, skb); } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ From b2e9821cff6c3c9ac107fce5327070f4462bf8a7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:52 +0000 Subject: [PATCH 10/14] net: prefech skb->priority in __dev_xmit_skb() Most qdiscs need to read skb->priority at enqueue time(). In commit 100dfa74cad9 ("net: dev_queue_xmit() llist adoption") I added a prefetch(next), lets add another one for the second half of skb. Note that skb->priority and skb->hash share a common cache line, so this patch helps qdiscs needing both fields. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-11-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index e19eb4e9d77c..53e2496dc429 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4246,6 +4246,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, llist_for_each_entry_safe(skb, next, ll_list, ll_node) { prefetch(next); + prefetch(&next->priority); skb_mark_not_on_list(skb); rc = dev_qdisc_enqueue(skb, q, &to_free, txq); count++; From 4792c3a4c1470202b0d5bf44b6058cb0fb050ba8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:53 +0000 Subject: [PATCH 11/14] net: annotate a data-race in __dev_xmit_skb() q->limit is read locklessly, add a READ_ONCE(). Fixes: 100dfa74cad9 ("net: dev_queue_xmit() llist adoption") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-12-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 53e2496dc429..10042139dbb0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4194,7 +4194,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, do { if (first_n && !defer_count) { defer_count = atomic_long_inc_return(&q->defer_count); - if (unlikely(defer_count > q->limit)) { + if (unlikely(defer_count > READ_ONCE(q->limit))) { kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); return NET_XMIT_DROP; } From 0170d7f47c8bb0311bc802bad52245c045f151fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:54 +0000 Subject: [PATCH 12/14] net_sched: add tcf_kfree_skb_list() helper Using kfree_skb_list_reason() to free list of skbs from qdisc operations seems wrong as each skb might have a different drop reason. Cleanup __dev_xmit_skb() to call tcf_kfree_skb_list() once in preparation of the following patch. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-13-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 11 +++++++++++ net/core/dev.c | 15 +++++---------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 79501499dafb..b8092d0378a0 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1105,6 +1105,17 @@ static inline void tcf_set_drop_reason(const struct sk_buff *skb, tc_skb_cb(skb)->drop_reason = reason; } +static inline void tcf_kfree_skb_list(struct sk_buff *skb) +{ + while (unlikely(skb)) { + struct sk_buff *next = skb->next; + + prefetch(next); + kfree_skb_reason(skb, tcf_get_drop_reason(skb)); + skb = next; + } +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/net/core/dev.c b/net/core/dev.c index 10042139dbb0..e865cdb9b696 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4162,7 +4162,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, __qdisc_run(q); qdisc_run_end(q); - goto no_lock_out; + goto free_skbs; } qdisc_bstats_cpu_update(q, skb); @@ -4176,12 +4176,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = dev_qdisc_enqueue(skb, q, &to_free, txq); qdisc_run(q); - -no_lock_out: - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); - return rc; + goto free_skbs; } /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. @@ -4257,9 +4252,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, } unlock: spin_unlock(root_lock); - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); + +free_skbs: + tcf_kfree_skb_list(to_free); return rc; } From 191ff13e42a7b7824fec5b2ed84fd6481356754d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:55 +0000 Subject: [PATCH 13/14] net_sched: add qdisc_dequeue_drop() helper Some qdisc like cake, codel, fq_codel might drop packets in their dequeue() method. This is currently problematic because dequeue() runs with the qdisc spinlock held. Freeing skbs can be extremely expensive. Add qdisc_dequeue_drop() method and a new TCQ_F_DEQUEUE_DROPS so that these qdiscs can opt-in to defer the skb frees after the socket spinlock is released. TCQ_F_DEQUEUE_DROPS is an attempt to not penalize other qdiscs with an extra cache line miss. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-14-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/pkt_sched.h | 5 +++-- include/net/sch_generic.h | 30 +++++++++++++++++++++++++++--- net/core/dev.c | 22 +++++++++++++--------- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 4678db45832a..e703c507d0da 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, void __qdisc_run(struct Qdisc *q); -static inline void qdisc_run(struct Qdisc *q) +static inline struct sk_buff *qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); - qdisc_run_end(q); + return qdisc_run_end(q); } + return NULL; } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index b8092d0378a0..c3a7268b567e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -88,6 +88,8 @@ struct Qdisc { #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ +#define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */ + u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; @@ -119,6 +121,8 @@ struct Qdisc { /* Note : we only change qstats.backlog in fast path. */ struct gnet_stats_queue qstats; + + struct sk_buff *to_free; __cacheline_group_end(Qdisc_write); @@ -218,8 +222,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) return true; } -static inline void qdisc_run_end(struct Qdisc *qdisc) +static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc) { + struct sk_buff *to_free = NULL; + if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); @@ -232,9 +238,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); - } else { - WRITE_ONCE(qdisc->running, false); + return NULL; } + + if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) { + to_free = qdisc->to_free; + if (to_free) + qdisc->to_free = NULL; + } + WRITE_ONCE(qdisc->running, false); + return to_free; } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) @@ -1116,6 +1129,17 @@ static inline void tcf_kfree_skb_list(struct sk_buff *skb) } } +static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, + enum skb_drop_reason reason) +{ + DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); + DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); + + tcf_set_drop_reason(skb, reason); + skb->next = q->to_free; + q->to_free = skb; +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ diff --git a/net/core/dev.c b/net/core/dev.c index e865cdb9b696..9094c0fb8c68 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4141,7 +4141,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { - struct sk_buff *next, *to_free = NULL; + struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; spinlock_t *root_lock = qdisc_lock(q); struct llist_node *ll_list, *first_n; unsigned long defer_count = 0; @@ -4160,7 +4160,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); goto free_skbs; } @@ -4170,12 +4170,13 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, !nolock_qdisc_is_empty(q)) __qdisc_run(q); - qdisc_run_end(q); - return NET_XMIT_SUCCESS; + to_free2 = qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; + goto free_skbs; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - qdisc_run(q); + to_free2 = qdisc_run(q); goto free_skbs; } @@ -4234,7 +4235,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { int count = 0; @@ -4246,7 +4247,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = dev_qdisc_enqueue(skb, q, &to_free, txq); count++; } - qdisc_run(q); + to_free2 = qdisc_run(q); if (count != 1) rc = NET_XMIT_SUCCESS; } @@ -4255,6 +4256,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, free_skbs: tcf_kfree_skb_list(to_free); + tcf_kfree_skb_list(to_free2); return rc; } @@ -5747,8 +5749,9 @@ static __latent_entropy void net_tx_action(void) rcu_read_lock(); while (head) { - struct Qdisc *q = head; spinlock_t *root_lock = NULL; + struct sk_buff *to_free; + struct Qdisc *q = head; head = head->next_sched; @@ -5775,9 +5778,10 @@ static __latent_entropy void net_tx_action(void) } clear_bit(__QDISC_STATE_SCHED, &q->state); - qdisc_run(q); + to_free = qdisc_run(q); if (root_lock) spin_unlock(root_lock); + tcf_kfree_skb_list(to_free); } rcu_read_unlock(); From a6efc273ab8245722eee2150fa12cf75781dc410 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:56 +0000 Subject: [PATCH 14/14] net_sched: use qdisc_dequeue_drop() in cake, codel, fq_codel cake, codel and fq_codel can drop many packets from dequeue(). Use qdisc_dequeue_drop() so that the freeing can happen outside of the qdisc spinlock scope. Add TCQ_F_DEQUEUE_DROPS to sch->flags. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-15-edumazet@google.com Signed-off-by: Paolo Abeni --- net/sched/sch_cake.c | 4 +++- net/sched/sch_codel.c | 4 +++- net/sched/sch_fq_codel.c | 5 ++++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 5948a149129c..0ea9440f68c6 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -2183,7 +2183,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb_reason(skb, reason); + qdisc_dequeue_drop(sch, skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } @@ -2724,6 +2724,8 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, int i, j, err; sch->limit = 10240; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index fa0314679e43..c6551578f1cf 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -182,6 +182,8 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt, else sch->flags &= ~TCQ_F_CAN_BYPASS; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a14142392939..dc187c7f06b1 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -275,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -519,6 +519,9 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; + + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; alloc_failure: