From f86f42ed2c471da5b061492bb8ab1d3d73c19c58 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:27 +0000 Subject: [PATCH 1/5] net: add sk_drops_read(), sk_drops_inc() and sk_drops_reset() helpers We want to split sk->sk_drops in the future to reduce potential contention on this field. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 17 ++++++++++++++++- include/net/tcp.h | 2 +- net/core/datagram.c | 2 +- net/core/sock.c | 14 +++++++------- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 6 +++--- net/ipv4/udp.c | 14 +++++++------- net/ipv6/datagram.c | 2 +- net/ipv6/raw.c | 8 ++++---- net/ipv6/udp.c | 6 +++--- net/iucv/af_iucv.c | 4 ++-- net/netlink/af_netlink.c | 4 ++-- net/packet/af_packet.c | 2 +- net/phonet/pep.c | 6 +++--- net/phonet/socket.c | 2 +- net/sctp/diag.c | 2 +- net/tipc/socket.c | 6 +++--- 17 files changed, 57 insertions(+), 42 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 63a6a48afb48..34d7029eb622 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2682,11 +2682,26 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_inc(struct sock *sk) +{ + atomic_inc(&sk->sk_drops); +} + +static inline int sk_drops_read(const struct sock *sk) +{ + return atomic_read(&sk->sk_drops); +} + +static inline void sk_drops_reset(struct sock *sk) +{ + atomic_set(&sk->sk_drops, 0); +} + static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? - atomic_read(&sk->sk_drops) : 0; + sk_drops_read(sk) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) diff --git a/include/net/tcp.h b/include/net/tcp.h index 2936b8175950..16dc9cebb9d2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2612,7 +2612,7 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) */ static inline void tcp_listendrop(const struct sock *sk) { - atomic_inc(&((struct sock *)sk)->sk_drops); + sk_drops_inc((struct sock *)sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } diff --git a/net/core/datagram.c b/net/core/datagram.c index 94cc4705e91d..ba8253aa6e07 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -345,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, spin_unlock_bh(&sk_queue->lock); } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); diff --git a/net/core/sock.c b/net/core/sock.c index 8002ac6293dc..75368823969a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -491,13 +491,13 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return -ENOBUFS; } @@ -562,7 +562,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } @@ -585,7 +585,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, reason = SKB_DROP_REASON_PFMEMALLOC; if (err == -ENOBUFS) reason = SKB_DROP_REASON_SOCKET_BACKLOG; - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); goto discard_and_relse; } @@ -2505,7 +2505,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; - atomic_set(&newsk->sk_drops, 0); + sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); @@ -3713,7 +3713,7 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); - atomic_set(&sk->sk_drops, 0); + sk_drops_reset(sk); } EXPORT_SYMBOL(sock_init_data_uid); @@ -3973,7 +3973,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); } #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 031df4c19fcc..f119da68fc30 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1119,7 +1119,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1d2c89d63cc7..0f9f02f6146e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -178,7 +178,7 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb, if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -311,7 +311,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -1045,7 +1045,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), - refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); + refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int raw_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cc3ce0f762ec..732bdad43626 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1787,7 +1787,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); busylock_release(busy); return err; } @@ -1852,7 +1852,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2008,7 +2008,7 @@ int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2078,7 +2078,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2449,7 +2449,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2534,7 +2534,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, @@ -3386,7 +3386,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } int udp4_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 972bf0426d59..33ebe93d80e3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1068,5 +1068,5 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4c3f8245c40f..4026192143ec 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -163,7 +163,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -361,7 +361,7 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } @@ -389,7 +389,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -414,7 +414,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6a68f77da44b..a35ee6d693a8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -524,7 +524,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +908,7 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1013,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cc2b3c44bc05..6c717a7ef292 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1187,7 +1187,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return; } @@ -2011,7 +2011,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb) skb_reset_network_header(skb); IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return NET_RX_SUCCESS; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e2f7080dd5d7..2b46c0cd752a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -356,7 +356,7 @@ static void netlink_overrun(struct sock *sk) sk_error_report(sk); } } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); } static void netlink_rcv_wake(struct sock *sk) @@ -2711,7 +2711,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), - atomic_read(&s->sk_drops), + sk_drops_read(s), sock_i_ino(s) ); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a7017d7f0927..9d42c4bd6e39 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2265,7 +2265,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: atomic_inc(&po->tp_drops); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 62527e1ebb88..4db564d9d522 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -376,7 +376,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) case PNS_PEP_CTRL_REQ: if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); break; } __skb_pull(skb, 4); @@ -397,7 +397,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = -ENOBUFS; break; } @@ -567,7 +567,7 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = NET_RX_DROP; break; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 2b61a40b568e..db2d552e9b32 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -587,7 +587,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, - atomic_read(&sk->sk_drops)); + sk_drops_read(sk)); } seq_pad(seq, '\n'); return 0; diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 23359e522273..996c2018f0e6 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -173,7 +173,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) goto errout; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e028bf658499..1574a83384f8 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2366,7 +2366,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload2!"); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = TIPC_ERR_OVERLOAD; } @@ -2458,7 +2458,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!"); /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) { trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL, "@sk_enqueue!"); @@ -3657,7 +3657,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, skb_queue_len(&sk->sk_write_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, - atomic_read(&sk->sk_drops))) + sk_drops_read(sk))) goto stat_msg_cancel; if (tsk->cong_link_cnt && From cb4d5a6eb600a43c2e3ec7f54e06d07aa33d8062 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:28 +0000 Subject: [PATCH 2/5] net: add sk_drops_skbadd() helper Existing sk_drops_add() helper is renamed to sk_drops_skbadd(). Add sk_drops_add() and convert sk_drops_inc() to use it. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-3-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/skmsg.h | 2 +- include/net/sock.h | 11 ++++++++--- include/net/udp.h | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- net/mptcp/protocol.c | 2 +- 7 files changed, 16 insertions(+), 11 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 0b9095a281b8..49847888c287 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -315,7 +315,7 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock, static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); kfree_skb(skb); } diff --git a/include/net/sock.h b/include/net/sock.h index 34d7029eb622..9edb42ff0622 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2682,9 +2682,14 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_add(struct sock *sk, int segs) +{ + atomic_add(segs, &sk->sk_drops); +} + static inline void sk_drops_inc(struct sock *sk) { - atomic_inc(&sk->sk_drops); + sk_drops_add(sk, 1); } static inline int sk_drops_read(const struct sock *sk) @@ -2704,11 +2709,11 @@ sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) sk_drops_read(sk) : 0; } -static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - atomic_add(segs, &sk->sk_drops); + sk_drops_add(sk, segs); } static inline ktime_t sock_read_timestamp(struct sock *sk) diff --git a/include/net/udp.h b/include/net/udp.h index e2af3bda90c9..7b26d4c50f33 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -627,7 +627,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; drop: - atomic_add(drop_count, &sk->sk_drops); + sk_drops_add(sk, drop_count); SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count); kfree_skb(skb); return NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a52a747d8a55..f1be65af1a77 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4830,7 +4830,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); sk_skb_reason_drop(sk, skb, reason); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a0c93b24c6e0..7c1d612afca1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2254,7 +2254,7 @@ int tcp_v4_rcv(struct sk_buff *skb) &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -2399,7 +2399,7 @@ int tcp_v4_rcv(struct sk_buff *skb) return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8b2e7b7afbd8..b4e56b877273 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1809,7 +1809,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -1948,7 +1948,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f2e728239480..ad41c48126e4 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -137,7 +137,7 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); __kfree_skb(skb); } From c51613fa276f038bdd18656a57a90ccc5d4e5200 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:29 +0000 Subject: [PATCH 3/5] net: add sk->sk_drop_counters Some sockets suffer from heavy false sharing on sk->sk_drops, and fields in the same cache line. Add sk->sk_drop_counters to: - move the drop counter(s) to dedicated cache lines. - Add basic NUMA awareness to these drop counter(s). Following patches will use this infrastructure for UDP and RAW sockets. sk_clone_lock() is not yet ready, it would need to properly set newsk->sk_drop_counters if we plan to use this for TCP sockets. v2: used Paolo suggestion from https://lore.kernel.org/netdev/8f09830a-d83d-43c9-b36b-88ba0a23e9b2@redhat.com/ Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 32 +++++++++++++++++++++++++++++++- net/core/sock.c | 2 ++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/net/sock.h b/include/net/sock.h index 9edb42ff0622..73cd3316e288 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -102,6 +102,11 @@ struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; +struct socket_drop_counters { + atomic_t drops0 ____cacheline_aligned_in_smp; + atomic_t drops1 ____cacheline_aligned_in_smp; +}; + /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr @@ -282,6 +287,7 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter + * @sk_drop_counters: optional pointer to socket_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner @@ -449,6 +455,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif + struct socket_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -2684,7 +2691,18 @@ struct sock_skb_cb { static inline void sk_drops_add(struct sock *sk, int segs) { - atomic_add(segs, &sk->sk_drops); + struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + int n = numa_node_id() % 2; + + if (n) + atomic_add(segs, &sdc->drops1); + else + atomic_add(segs, &sdc->drops0); + } else { + atomic_add(segs, &sk->sk_drops); + } } static inline void sk_drops_inc(struct sock *sk) @@ -2694,11 +2712,23 @@ static inline void sk_drops_inc(struct sock *sk) static inline int sk_drops_read(const struct sock *sk) { + const struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); + return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1); + } return atomic_read(&sk->sk_drops); } static inline void sk_drops_reset(struct sock *sk) { + struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + atomic_set(&sdc->drops0, 0); + atomic_set(&sdc->drops1, 0); + } atomic_set(&sk->sk_drops, 0); } diff --git a/net/core/sock.c b/net/core/sock.c index 75368823969a..e66ad1ec3a2d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2505,6 +2505,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; + DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; @@ -4457,6 +4458,7 @@ static int __init sock_struct_check(void) #ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); #endif + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); From 51132b99f01ce05f8008f0fb189d83eed484bd53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:30 +0000 Subject: [PATCH 4/5] udp: add drop_counters to udp socket When a packet flood hits one or more UDP sockets, many cpus have to update sk->sk_drops. This slows down other cpus, because currently sk_drops is in sock_write_rx group. Add a socket_drop_counters structure to udp sockets. Using dedicated cache lines to hold drop counters makes sure that consumers no longer suffer from false sharing if/when producers only change sk->sk_drops. This adds 128 bytes per UDP socket. Tested with the following stress test, sending about 11 Mpps to a dual socket AMD EPYC 7B13 64-Core. super_netperf 20 -t UDP_STREAM -H DUT -l10 -- -n -P,1000 -m 120 Note: due to socket lookup, only one UDP socket is receiving packets on DUT. Then measure receiver (DUT) behavior. We can see both consumer and BH handlers can process more packets per second. Before: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 615091 0.0 Udp6InErrors 3904277 0.0 Udp6RcvbufErrors 3904277 0.0 After: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 816281 0.0 Udp6InErrors 7497093 0.0 Udp6RcvbufErrors 7497093 0.0 Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-5-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/udp.h | 1 + include/net/udp.h | 1 + tools/testing/selftests/bpf/progs/bpf_iter_udp4.c | 3 ++- tools/testing/selftests/bpf/progs/bpf_iter_udp6.c | 4 ++-- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 4e1a672af4c5..981506be1e15 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,6 +108,7 @@ struct udp_sock { * the last UDP socket cacheline. */ struct hlist_node tunnel_list; + struct socket_drop_counters drop_counters; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index 7b26d4c50f33..93b159f30e88 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -288,6 +288,7 @@ static inline void udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); + sk->sk_drop_counters = &up->drop_counters; skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c index ffbd4b116d17..23b2aa2604de 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c @@ -64,7 +64,8 @@ int dump_udp4(struct bpf_iter__udp *ctx) 0, 0L, 0, ctx->uid, 0, sock_i_ino(&inet->sk), inet->sk.sk_refcnt.refs.counter, udp_sk, - inet->sk.sk_drops.counter); + udp_sk->drop_counters.drops0.counter + + udp_sk->drop_counters.drops1.counter); return 0; } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c index 47ff7754f4fd..c48b05aa2a4b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c @@ -72,7 +72,7 @@ int dump_udp6(struct bpf_iter__udp *ctx) 0, 0L, 0, ctx->uid, 0, sock_i_ino(&inet->sk), inet->sk.sk_refcnt.refs.counter, udp_sk, - inet->sk.sk_drops.counter); - + udp_sk->drop_counters.drops0.counter + + udp_sk->drop_counters.drops1.counter); return 0; } From b81aa23234d94d99951761d9864061d774633ba9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:31 +0000 Subject: [PATCH 5/5] inet: raw: add drop_counters to raw sockets When a packet flood hits one or more RAW sockets, many cpus have to update sk->sk_drops. This slows down other cpus, because currently sk_drops is in sock_write_rx group. Add a socket_drop_counters structure to raw sockets. Using dedicated cache lines to hold drop counters makes sure that consumers no longer suffer from false sharing if/when producers only change sk->sk_drops. This adds 128 bytes per RAW socket. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-6-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- include/net/raw.h | 1 + net/ipv4/raw.c | 1 + net/ipv6/raw.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index bc6ec2959173..261d02efb615 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -295,7 +295,7 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - + struct socket_drop_counters drop_counters; struct ipv6_pinfo inet6; }; diff --git a/include/net/raw.h b/include/net/raw.h index 32a61481a253..d52709139060 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,6 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; + struct socket_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0f9f02f6146e..d54ebb7df966 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -793,6 +793,7 @@ static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4026192143ec..4ae07a67b4d4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1175,6 +1175,7 @@ static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1;