Merge branch 'gro-inline-tcp6_gro_-receive-complete'

Eric Dumazet says:

====================
gro: inline tcp6_gro_{receive,complete}

On some platforms, GRO stack is too deep and causes cpu stalls.

Decreasing call depths by one shows a 1.5 % gain on Zen2 cpus.
(32 RX queues, 100Gbit NIC, RFS enabled, tcp_rr with 128 threads and 10,000 flows)

We can go further by inlining ipv6_gro_{receive,complete}
and take care of IPv4 if there is interest.

Note: two temporary __always_inline will be replaced with
      inline_for_performance when/if available.

Cumulative size increase for this series (of 3):

$ scripts/bloat-o-meter -t vmlinux.0 vmlinux.3
add/remove: 2/2 grow/shrink: 5/1 up/down: 1572/-471 (1101)
Function                                     old     new   delta
ipv6_gro_receive                            1069    1846    +777
ipv6_gro_complete                            433     733    +300
tcp6_check_fraglist_gro                        -     272    +272
tcp6_gro_complete                            227     306     +79
tcp4_gro_complete                            325     397     +72
ipv6_offload_init                            218     274     +56
__pfx_tcp6_check_fraglist_gro                  -      16     +16
__pfx___skb_incr_checksum_unnecessary         32       -     -32
__skb_incr_checksum_unnecessary              186       -    -186
tcp6_gro_receive                             959     706    -253
Total: Before=22592724, After=22593825, chg +0.00%
====================

Link: https://patch.msgid.link/20260120164903.1912995-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2026-01-21 19:28:34 -08:00
commit 9de76f55b9
7 changed files with 32 additions and 37 deletions

View File

@ -4763,7 +4763,7 @@ static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb)
}
}
static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
static __always_inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
{
if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
if (skb->csum_level < SKB_MAX_CSUM_LEVEL)

View File

@ -405,9 +405,8 @@ INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *,
struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int));
struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *);
int udp6_gro_complete(struct sk_buff *, int);
#define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \
({ \

View File

@ -2324,8 +2324,6 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct tcphdr *th);
INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
#ifdef CONFIG_INET
void tcp_gro_complete(struct sk_buff *skb);
#else

View File

@ -45,7 +45,7 @@ obj-$(CONFIG_IPV6_FOU) += fou6.o
obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
obj-$(CONFIG_INET) += output_core.o protocol.o \
ip6_offload.o tcpv6_offload.o exthdrs_offload.o
ip6_offload.o exthdrs_offload.o
obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o

View File

@ -19,23 +19,7 @@
#include <net/gso.h>
#include "ip6_offload.h"
/* All GRO functions are always builtin, except UDP over ipv6, which lays in
* ipv6 module, as it depends on UDPv6 lookup function, so we need special care
* when ipv6 is built as a module
*/
#if IS_BUILTIN(CONFIG_IPV6)
#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__)
#else
#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
#endif
#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \
({ \
unlikely(gro_recursion_inc_test(skb)) ? \
NAPI_GRO_CB(skb)->flush |= 1, NULL : \
INDIRECT_CALL_L4(cb, f2, f1, head, skb); \
})
#include "tcpv6_offload.c"
static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
{
@ -298,9 +282,19 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
skb_gro_postpull_rcsum(skb, iph, nlen);
pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive,
ops->callbacks.gro_receive, head, skb);
if (unlikely(gro_recursion_inc_test(skb))) {
flush = 1;
goto out;
}
if (likely(proto == IPPROTO_TCP))
pp = tcp6_gro_receive(head, skb);
#if IS_BUILTIN(CONFIG_IPV6)
else if (likely(proto == IPPROTO_UDP))
pp = udp6_gro_receive(head, skb);
#endif
else
pp = ops->callbacks.gro_receive(head, skb);
out:
skb_gro_flush_final(skb, pp, flush);
@ -379,11 +373,18 @@ INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
}
nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
if (likely(ops == &net_hotdata.tcpv6_offload))
return tcp6_gro_complete(skb, nhoff);
#if IS_BUILTIN(CONFIG_IPV6)
if (ops == &net_hotdata.udpv6_offload)
return udp6_gro_complete(skb, nhoff);
#endif
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out;
err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete,
udp6_gro_complete, skb, nhoff);
err = ops->callbacks.gro_complete(skb, nhoff);
out:
return err;

View File

@ -24,9 +24,6 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
struct net *net;
int iif, sdif;
if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
return;
p = tcp_gro_lookup(head, th);
if (p) {
NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
@ -45,8 +42,8 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
#endif /* IS_ENABLED(CONFIG_IPV6) */
}
INDIRECT_CALLABLE_SCOPE
struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
static __always_inline struct sk_buff *tcp6_gro_receive(struct list_head *head,
struct sk_buff *skb)
{
struct tcphdr *th;
@ -60,7 +57,8 @@ struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
if (!th)
goto flush;
tcp6_check_fraglist_gro(head, skb, th);
if (unlikely(skb->dev->features & NETIF_F_GRO_FRAGLIST))
tcp6_check_fraglist_gro(head, skb, th);
return tcp_gro_receive(head, skb, th);
@ -69,7 +67,7 @@ struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
return NULL;
}
INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
static __always_inline int tcp6_gro_complete(struct sk_buff *skb, int thoff)
{
const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);

View File

@ -132,7 +132,6 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
sdif, net->ipv4.udp_table, NULL);
}
INDIRECT_CALLABLE_SCOPE
struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
@ -165,7 +164,7 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
return NULL;
}
INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
int udp6_gro_complete(struct sk_buff *skb, int nhoff)
{
const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset);