Merge branch 'geneve-introduce-double-tunnel-gso-gro-support'

Paolo Abeni says:

====================
geneve: introduce double tunnel GSO/GRO support

This is the [belated] incarnation of the topic discussed at the last Netconf
[1].

In container orchestration in virtual environments there is a consistent
usage of double UDP tunneling - specifically geneve. Such setups lack
support for GRO and GSO for inter-VM traffic.

After commit b430f6c38d ("Merge branch 'virtio_udp_tunnel_08_07_2025'
of https://github.com/pabeni/linux-devel") and the qemu counterpart, VMs
are able to send/receive GSO over UDP aggregated packets.

This series introduces the missing bit for full end-to-end aggregation
in the above mentioned scenario. Specifically:

- introduces a new netdev feature set to generalize the existing per-device
driver GSO admission check.
- adds GSO partial support for the geneve and vxlan drivers
- introduces and uses a geneve option to assist double tunnel GRO
- adds some simple functional tests for the above.

The new device features set is not strictly needed for the following
work, but avoids the introduction of trivial `ndo_features_check` to
support GSO partial and thus possible performance regression due to the
additional indirect call. Such feature set could be leveraged by a
number of existing drivers (intel, meta and possibly wangxun) to avoid
duplicate code/tests. Such part has been omitted here to keep the series
small.

Both GSO partial support and double GRO support have some downsides.
With the first in place, GSO partial packets will traverse the network
stack 'downstream' the outer geneve UDP tunnel and will be visible by
the udp/IP/IPv6 and by netfilter. Currently only H/W NICs implement GSO
partial support and such packets are visible only via software taps.

Double UDP tunnel GRO will cook 'GSO partial' like aggregate packets,
i.e. the inner UDP encapsulation headers set will still carry the
wire-level lengths and csum, so that segmentation considering such
headers parts of a giant, constant encapsulation header will yield the
correct result.

The correct GSO packet layout is applied when the packet traverses the
outermost geneve encapsulation.

Both GSO partial and double UDP encap are disabled by default and must
be explicitly enabled via, respectively ethtool and geneve device
configuration.

Finally note that the GSO partial feature could potentially be applied
to all the other UDP tunnels, but this series limits its usage to geneve
and vxlan devices.

Link: https://netdev.bots.linux.dev/netconf/2024/paolo.pdf [1]
====================

Link: https://patch.msgid.link/cover.1769011015.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2026-01-23 11:31:15 -08:00
commit 35527de54f
10 changed files with 975 additions and 39 deletions

View File

@ -1914,6 +1914,9 @@ attribute-sets:
name: port-range
type: binary
struct: ifla-geneve-port-range
-
name: gro-hint
type: flag
-
name: linkinfo-hsr-attrs
name-prefix: ifla-hsr-

View File

@ -38,6 +38,26 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
#define GENEVE_IPV4_HLEN (ETH_HLEN + sizeof(struct iphdr) + GENEVE_BASE_HLEN)
#define GENEVE_IPV6_HLEN (ETH_HLEN + sizeof(struct ipv6hdr) + GENEVE_BASE_HLEN)
#define GENEVE_OPT_NETDEV_CLASS 0x100
#define GENEVE_OPT_GRO_HINT_SIZE 8
#define GENEVE_OPT_GRO_HINT_TYPE 1
#define GENEVE_OPT_GRO_HINT_LEN 1
/*
 * Payload of the GRO hint geneve option: describes the nested (inner) tunnel
 * headers that sit between the end of the geneve header + options and the
 * innermost packet. All offsets and lengths are in bytes and are relative to
 * the first byte following the geneve header and its options.
 */
struct geneve_opt_gro_hint {
/*
 * inner_proto_id: index into proto_id_map[] selecting the innermost
 * protocol; nested_is_v6: set when the nested network header is IPv6,
 * clear when it is IPv4. The remaining bits of this byte are unused.
 */
u8 inner_proto_id:2,
nested_is_v6:1;
/* Offset of the nested network (IPv4/IPv6) header. */
u8 nested_nh_offset;
/* Offset of the nested transport (UDP) header. */
u8 nested_tp_offset;
/* Overall length of the nested headers described by this hint. */
u8 nested_hdr_len;
};
/*
 * Per-skb scratch area stored in skb->cb (accessed via GENEVE_SKB_CB()).
 * It is filled by geneve_build_gro_hint_opt() and consumed by
 * geneve_build_skb() on the transmit path.
 */
struct geneve_skb_cb {
/* 0 when no hint must be emitted, GENEVE_OPT_GRO_HINT_SIZE otherwise. */
unsigned int gro_hint_len;
/* Hint contents to be copied into the geneve option; valid only when gro_hint_len != 0. */
struct geneve_opt_gro_hint gro_hint;
};
#define GENEVE_SKB_CB(__skb) ((struct geneve_skb_cb *)&((__skb)->cb[0]))
/* per-network namespace private data for this module */
struct geneve_net {
struct list_head geneve_list;
@ -56,6 +76,7 @@ struct geneve_config {
bool collect_md;
bool use_udp6_rx_checksums;
bool ttl_inherit;
bool gro_hint;
enum ifla_geneve_df df;
bool inner_proto_inherit;
u16 port_min;
@ -84,6 +105,7 @@ struct geneve_dev {
struct geneve_sock {
bool collect_md;
bool gro_hint;
struct list_head list;
struct socket *sock;
struct rcu_head rcu;
@ -91,6 +113,21 @@ struct geneve_sock {
struct hlist_head vni_list[VNI_HASH_SIZE];
};
static const __be16 proto_id_map[] = { htons(ETH_P_TEB),
htons(ETH_P_IPV6),
htons(ETH_P_IP) };
/*
 * Translate an ethertype (network byte order) into its position inside
 * proto_id_map[]; return -1 when the protocol is not listed there.
 */
static int proto_to_id(__be16 proto)
{
	int idx = 0;

	while (idx < ARRAY_SIZE(proto_id_map)) {
		if (proto_id_map[idx] == proto)
			return idx;
		idx++;
	}

	return -1;
}
static inline __u32 geneve_net_vni_hash(u8 vni[3])
{
__u32 vnid;
@ -222,9 +259,8 @@ static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs,
/* geneve receive/decap routine */
static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
struct sk_buff *skb)
struct sk_buff *skb, const struct genevehdr *gnvh)
{
struct genevehdr *gnvh = geneve_hdr(skb);
struct metadata_dst *tun_dst = NULL;
unsigned int len;
int nh, err = 0;
@ -325,8 +361,12 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
}
}
/* Skip the additional GRO stage when hints are in use. */
len = skb->len;
err = gro_cells_receive(&geneve->gro_cells, skb);
if (skb->encapsulation)
err = netif_rx(skb);
else
err = gro_cells_receive(&geneve->gro_cells, skb);
if (likely(err == NET_RX_SUCCESS))
dev_dstats_rx_add(geneve->dev, len);
@ -363,6 +403,250 @@ static void geneve_uninit(struct net_device *dev)
gro_cells_destroy(&geneve->gro_cells);
}
/* Full geneve header length in bytes: fixed header plus options. */
static int geneve_hlen(const struct genevehdr *gh)
{
	/* opt_len is expressed in 4-byte units. */
	return sizeof(*gh) + (gh->opt_len << 2);
}
/*
* Look for GRO hint in the genenve options; if not found or does not pass basic
* sanitization return 0, otherwise the offset WRT the geneve hdr start.
*/
static unsigned int
geneve_opt_gro_hint_off(const struct genevehdr *gh, __be16 *type,
unsigned int *gh_len)
{
struct geneve_opt *opt = (void *)(gh + 1);
unsigned int id, opt_len = gh->opt_len;
struct geneve_opt_gro_hint *gro_hint;
while (opt_len >= (GENEVE_OPT_GRO_HINT_SIZE >> 2)) {
if (opt->opt_class == htons(GENEVE_OPT_NETDEV_CLASS) &&
opt->type == GENEVE_OPT_GRO_HINT_TYPE &&
opt->length == GENEVE_OPT_GRO_HINT_LEN)
goto found;
/* check for bad opt len */
if (opt->length + 1 >= opt_len)
return 0;
/* next opt */
opt_len -= opt->length + 1;
opt = ((void *)opt) + ((opt->length + 1) << 2);
}
return 0;
found:
gro_hint = (struct geneve_opt_gro_hint *)opt->opt_data;
/*
* Sanitize the hinted hdrs: the nested transport is UDP and must fit
* the overall hinted hdr size.
*/
if (gro_hint->nested_tp_offset + sizeof(struct udphdr) >
gro_hint->nested_hdr_len)
return 0;
if (gro_hint->nested_nh_offset +
(gro_hint->nested_is_v6 ? sizeof(struct ipv6hdr) :
sizeof(struct iphdr)) >
gro_hint->nested_tp_offset)
return 0;
/* Allow only supported L2. */
id = gro_hint->inner_proto_id;
if (id >= ARRAY_SIZE(proto_id_map))
return 0;
*type = proto_id_map[id];
*gh_len += gro_hint->nested_hdr_len;
return (void *)gro_hint - (void *)gh;
}
static const struct geneve_opt_gro_hint *
geneve_opt_gro_hint(const struct genevehdr *gh, unsigned int hint_off)
{
return (const struct geneve_opt_gro_hint *)((void *)gh + hint_off);
}
/*
 * Socket-aware flavour of geneve_opt_gro_hint_off(): the options are parsed
 * only when the geneve socket attached to @sk has GRO hints enabled,
 * otherwise 0 is returned and @type / @gh_len are left untouched.
 */
static unsigned int
geneve_sk_gro_hint_off(const struct sock *sk, const struct genevehdr *gh,
		       __be16 *type, unsigned int *gh_len)
{
	const struct geneve_sock *gs = rcu_dereference_sk_user_data(sk);

	if (gs && gs->gro_hint)
		return geneve_opt_gro_hint_off(gh, type, gh_len);

	return 0;
}
/*
 * IPv4 flavour of the nested header check: accept only an option-less
 * (first byte 0x45), non fragmented header carrying UDP and with a correct
 * header checksum.
 */
static bool geneve_gro_hint_nested_v4_ok(struct iphdr *iph)
{
	return *(u8 *)iph == 0x45 && !ip_is_fragment(iph) &&
	       iph->protocol == IPPROTO_UDP && !ip_fast_csum((u8 *)iph, 5);
}

/*
 * IPv6 flavour of the nested header check: UDP must be announced either by
 * the fixed header or by one of the headers found between the fixed header
 * and the hinted transport offset; none of them may cross that offset.
 */
static bool
geneve_gro_hint_nested_v6_ok(void *data,
			     const struct geneve_opt_gro_hint *gro_hint)
{
	struct ipv6hdr *ipv6h = data + gro_hint->nested_nh_offset;
	const int tp_off = gro_hint->nested_tp_offset;
	struct ipv6_opt_hdr *ext;
	int off, ext_len;

	if (ipv6h->nexthdr == IPPROTO_UDP)
		return true;

	for (off = sizeof(*ipv6h) + gro_hint->nested_nh_offset;
	     off + sizeof(*ext) <= tp_off; off += ext_len) {
		ext = data + off;
		ext_len = ipv6_optlen(ext);

		if (ext_len + off > tp_off)
			return false;
		if (ext->nexthdr == IPPROTO_UDP)
			return true;
	}

	return false;
}

/*
 * Validate the packet headers starting at @data against the provided hint.
 * Return true when the nested network header is consistent with the hint
 * and carries UDP.
 */
static bool
geneve_opt_gro_hint_validate(void *data,
			     const struct geneve_opt_gro_hint *gro_hint)
{
	if (gro_hint->nested_is_v6)
		return geneve_gro_hint_nested_v6_ok(data, gro_hint);

	return geneve_gro_hint_nested_v4_ok(data + gro_hint->nested_nh_offset);
}
/*
 * Validate the skb headers following the specified geneve hdr vs the
 * provided hint, including the nested L4 checksum.
 * The caller already ensured that the relevant amount of data is available
 * in the linear part.
 *
 * @skb: packet under GRO processing
 * @gh: geneve header of @skb
 * @gro_hint: GRO hint found in the options of @gh
 *
 * Return true when the nested headers match the hint and the nested UDP
 * checksum is either not in use, left to a later stage (CHECKSUM_PARTIAL)
 * or verified here; false otherwise.
 */
static bool
geneve_opt_gro_hint_validate_csum(const struct sk_buff *skb,
const struct genevehdr *gh,
const struct geneve_opt_gro_hint *gro_hint)
{
unsigned int plen, gh_len = geneve_hlen(gh);
/* The nested headers start right after the geneve header and options. */
void *nested = (void *)gh + gh_len;
struct udphdr *nested_uh;
unsigned int nested_len;
struct ipv6hdr *ipv6h;
struct iphdr *iph;
__wsum csum, psum;
if (!geneve_opt_gro_hint_validate(nested, gro_hint))
return false;
/* Use GRO hints with nested csum only if the outer header has csum. */
nested_uh = nested + gro_hint->nested_tp_offset;
/* Nothing to verify for a zero nested UDP checksum or a CHECKSUM_PARTIAL skb. */
if (!nested_uh->check || skb->ip_summed == CHECKSUM_PARTIAL)
return true;
/* Without a validated checksum on the GRO side, the nested one can't be derived. */
if (!NAPI_GRO_CB(skb)->csum_valid)
return false;
/*
 * Compute the complete checksum up to the nested transport: strip the
 * contribution of the plen bytes between the geneve header start and the
 * nested UDP header from the GRO checksum.
 */
plen = gh_len + gro_hint->nested_tp_offset;
csum = csum_sub(NAPI_GRO_CB(skb)->csum, csum_partial(gh, plen, 0));
/* Length of the nested UDP header plus payload. */
nested_len = skb_gro_len(skb) - plen;
/* Compute the nested pseudo header csum. */
ipv6h = nested + gro_hint->nested_nh_offset;
/* Same location viewed as IPv4; only the view matching nested_is_v6 is used. */
iph = (struct iphdr *)ipv6h;
psum = gro_hint->nested_is_v6 ?
~csum_unfold(csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
nested_len, IPPROTO_UDP, 0)) :
csum_tcpudp_nofold(iph->saddr, iph->daddr,
nested_len, IPPROTO_UDP, 0);
/* The nested checksum is good when pseudo header + data fold to zero. */
return !csum_fold(csum_add(psum, csum));
}
/*
 * Post-decapsulation fixup for GSO packets carrying a GRO hint.
 *
 * When the socket has GRO hints enabled, the geneve header carries a valid
 * hint and the skb is a GSO one, mark the skb as still encapsulated (by the
 * nested tunnel) and rewrite the nested IP/IPv6 and UDP headers so that they
 * describe the whole aggregate.
 *
 * @sk: geneve UDP socket the packet was received on
 * @skb: decapsulated packet; skb->data points to the nested headers
 * @gh_len: length of the geneve header, options included
 * @geneveh: in/out pointer to the geneve header; refreshed after the skb
 *	     head is possibly reallocated
 *
 * Return 0 on success or when there is nothing to do, -ENOMEM if the nested
 * headers can't be pulled into the linear area, -EINVAL if an untrusted
 * packet fails the hint validation, -E2BIG if the aggregate is too big.
 */
static int geneve_post_decap_hint(const struct sock *sk, struct sk_buff *skb,
				  unsigned int gh_len,
				  struct genevehdr **geneveh)
{
	const struct geneve_opt_gro_hint *gro_hint;
	unsigned int total_len, hint_off;
	struct ipv6hdr *ipv6h;
	struct iphdr *iph;
	struct udphdr *uh;
	__be16 p;

	/*
	 * The lookup accumulates the nested headers length into its last
	 * argument: hand it the (initialized) by-value geneve header length
	 * instead of an uninitialized scratch variable. The accumulated value
	 * and the hinted protocol are not needed here.
	 */
	hint_off = geneve_sk_gro_hint_off(sk, *geneveh, &p, &gh_len);
	if (!hint_off)
		return 0;

	if (!skb_is_gso(skb))
		return 0;

	gro_hint = geneve_opt_gro_hint(*geneveh, hint_off);
	if (unlikely(!pskb_may_pull(skb, gro_hint->nested_hdr_len)))
		return -ENOMEM;

	/* The pull above may have moved the headers: reload the pointers. */
	*geneveh = geneve_hdr(skb);
	gro_hint = geneve_opt_gro_hint(*geneveh, hint_off);

	/*
	 * Validate hints from untrusted source before accessing
	 * the headers; csum will be checked later by the nested
	 * protocol rx path.
	 */
	if (unlikely(skb_shinfo(skb)->gso_type & SKB_GSO_DODGY &&
		     !geneve_opt_gro_hint_validate(skb->data, gro_hint)))
		return -EINVAL;

	ipv6h = (void *)skb->data + gro_hint->nested_nh_offset;
	iph = (struct iphdr *)ipv6h;
	total_len = skb->len - gro_hint->nested_nh_offset;
	if (total_len > GRO_LEGACY_MAX_SIZE)
		return -E2BIG;

	/*
	 * After stripping the outer encap, the packet still carries a
	 * tunnel encapsulation: the nested one.
	 */
	skb->encapsulation = 1;

	/* GSO expects a valid transport header, move it to the current one. */
	skb_set_transport_header(skb, gro_hint->nested_tp_offset);

	/* Adjust the nested IP{6} hdr to actual GSO len. */
	if (gro_hint->nested_is_v6) {
		ipv6h->payload_len = htons(total_len - sizeof(*ipv6h));
	} else {
		iph->tot_len = htons(total_len);

		/*
		 * For IPv4 additionally refresh the header checksum;
		 * ip_send_check() recomputes it from scratch, so no
		 * incremental update is needed beforehand.
		 */
		ip_send_check(iph);
	}

	/* Adjust the nested UDP header len and checksum. */
	uh = udp_hdr(skb);
	uh->len = htons(skb->len - gro_hint->nested_tp_offset);
	if (uh->check) {
		/* Leave the pseudo header csum in place for later completion. */
		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		if (gro_hint->nested_is_v6)
			uh->check = ~udp_v6_check(total_len, &ipv6h->saddr,
						  &ipv6h->daddr, 0);
		else
			uh->check = ~udp_v4_check(total_len, iph->saddr,
						  iph->daddr, 0);
	} else {
		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
	}
	return 0;
}
/* Callback from net/ipv4/udp.c to receive packets */
static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
@ -404,7 +688,18 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
goto drop;
}
geneve_rx(geneve, gs, skb);
/*
 * After hint processing, the transport header points to the inner one
 * and we can no longer rely on geneve_hdr().
 */
geneveh = geneve_hdr(skb);
if (geneve_post_decap_hint(sk, skb, sizeof(struct genevehdr) +
opts_len, &geneveh)) {
DEV_STATS_INC(geneve->dev, rx_errors);
goto drop;
}
geneve_rx(geneve, gs, skb, geneveh);
return 0;
drop:
@ -495,22 +790,93 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6,
return sock;
}
static int geneve_hlen(struct genevehdr *gh)
static bool geneve_hdr_match(struct sk_buff *skb,
const struct genevehdr *gh,
const struct genevehdr *gh2,
unsigned int hint_off)
{
return sizeof(*gh) + gh->opt_len * 4;
const struct geneve_opt_gro_hint *gro_hint;
void *nested, *nested2, *nh, *nh2;
struct udphdr *udp, *udp2;
unsigned int gh_len;
/* Match the geneve hdr and options */
if (gh->opt_len != gh2->opt_len)
return false;
gh_len = geneve_hlen(gh);
if (memcmp(gh, gh2, gh_len))
return false;
if (!hint_off)
return true;
/*
* When gro is present consider the nested headers as part
* of the geneve options
*/
nested = (void *)gh + gh_len;
nested2 = (void *)gh2 + gh_len;
gro_hint = geneve_opt_gro_hint(gh, hint_off);
if (!memcmp(nested, nested2, gro_hint->nested_hdr_len))
return true;
/*
* The nested headers differ; the packets can still belong to
* the same flow when IPs/proto/ports match; if so flushing is
* required.
*/
nh = nested + gro_hint->nested_nh_offset;
nh2 = nested2 + gro_hint->nested_nh_offset;
if (gro_hint->nested_is_v6) {
struct ipv6hdr *iph = nh, *iph2 = nh2;
unsigned int nested_nlen;
__be32 first_word;
first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
if ((first_word & htonl(0xF00FFFFF)) ||
!ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
!ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
iph->nexthdr != iph2->nexthdr)
return false;
nested_nlen = gro_hint->nested_tp_offset -
gro_hint->nested_nh_offset;
if (nested_nlen > sizeof(struct ipv6hdr) &&
(memcmp(iph + 1, iph2 + 1,
nested_nlen - sizeof(struct ipv6hdr))))
return false;
} else {
struct iphdr *iph = nh, *iph2 = nh2;
if ((iph->protocol ^ iph2->protocol) |
((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
((__force u32)iph->daddr ^ (__force u32)iph2->daddr))
return false;
}
udp = nested + gro_hint->nested_tp_offset;
udp2 = nested2 + gro_hint->nested_tp_offset;
if (udp->source != udp2->source || udp->dest != udp2->dest ||
udp->check != udp2->check)
return false;
NAPI_GRO_CB(skb)->flush = 1;
return true;
}
static struct sk_buff *geneve_gro_receive(struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
unsigned int hlen, gh_len, off_gnv, hint_off;
const struct geneve_opt_gro_hint *gro_hint;
const struct packet_offload *ptype;
struct genevehdr *gh, *gh2;
struct sk_buff *pp = NULL;
struct sk_buff *p;
struct genevehdr *gh, *gh2;
unsigned int hlen, gh_len, off_gnv;
const struct packet_offload *ptype;
__be16 type;
int flush = 1;
__be16 type;
off_gnv = skb_gro_offset(skb);
hlen = off_gnv + sizeof(*gh);
@ -521,6 +887,7 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk,
if (gh->ver != GENEVE_VER || gh->oam)
goto out;
gh_len = geneve_hlen(gh);
type = gh->proto_type;
hlen = off_gnv + gh_len;
if (!skb_gro_may_pull(skb, hlen)) {
@ -529,13 +896,30 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk,
goto out;
}
/* The GRO hint/nested hdr could use a different ethernet type. */
hint_off = geneve_sk_gro_hint_off(sk, gh, &type, &gh_len);
if (hint_off) {
/*
* If the hint is present, and nested hdr validation fails, do
* not attempt plain GRO: it will ignore inner hdrs and cause
* OoO.
*/
gh = skb_gro_header(skb, off_gnv + gh_len, off_gnv);
if (unlikely(!gh))
goto out;
gro_hint = geneve_opt_gro_hint(gh, hint_off);
if (!geneve_opt_gro_hint_validate_csum(skb, gh, gro_hint))
goto out;
}
list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
gh2 = (struct genevehdr *)(p->data + off_gnv);
if (gh->opt_len != gh2->opt_len ||
memcmp(gh, gh2, gh_len)) {
if (!geneve_hdr_match(skb, gh, gh2, hint_off)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
@ -543,7 +927,6 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk,
skb_gro_pull(skb, gh_len);
skb_gro_postpull_rcsum(skb, gh, gh_len);
type = gh->proto_type;
if (likely(type == htons(ETH_P_TEB)))
return call_gro_receive(eth_gro_receive, head, skb);
@ -572,6 +955,7 @@ static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
gh = (struct genevehdr *)(skb->data + nhoff);
gh_len = geneve_hlen(gh);
type = gh->proto_type;
geneve_opt_gro_hint_off(gh, &type, &gh_len);
/* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */
if (likely(type == htons(ETH_P_TEB)))
@ -659,13 +1043,15 @@ static void geneve_sock_release(struct geneve_dev *geneve)
static struct geneve_sock *geneve_find_sock(struct geneve_net *gn,
sa_family_t family,
__be16 dst_port)
__be16 dst_port,
bool gro_hint)
{
struct geneve_sock *gs;
list_for_each_entry(gs, &gn->sock_list, list) {
if (inet_sk(gs->sock->sk)->inet_sport == dst_port &&
geneve_get_sk_family(gs) == family) {
geneve_get_sk_family(gs) == family &&
gs->gro_hint == gro_hint) {
return gs;
}
}
@ -676,12 +1062,14 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
{
struct net *net = geneve->net;
struct geneve_net *gn = net_generic(net, geneve_net_id);
bool gro_hint = geneve->cfg.gro_hint;
struct geneve_dev_node *node;
struct geneve_sock *gs;
__u8 vni[3];
__u32 hash;
gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->cfg.info.key.tp_dst);
gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET,
geneve->cfg.info.key.tp_dst, gro_hint);
if (gs) {
gs->refcnt++;
goto out;
@ -694,6 +1082,7 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
out:
gs->collect_md = geneve->cfg.collect_md;
gs->gro_hint = gro_hint;
#if IS_ENABLED(CONFIG_IPV6)
if (ipv6) {
rcu_assign_pointer(geneve->sock6, gs);
@ -766,34 +1155,116 @@ static void geneve_build_header(struct genevehdr *geneveh,
ip_tunnel_info_opts_get(geneveh->options, info);
}
static int geneve_build_gro_hint_opt(const struct geneve_dev *geneve,
struct sk_buff *skb)
{
struct geneve_skb_cb *cb = GENEVE_SKB_CB(skb);
struct geneve_opt_gro_hint *hint;
unsigned int nhlen;
bool nested_is_v6;
int id;
BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct geneve_skb_cb));
cb->gro_hint_len = 0;
/* Try to add the GRO hint only in case of double encap. */
if (!geneve->cfg.gro_hint || !skb->encapsulation)
return 0;
/*
* The nested headers must fit the geneve opt len fields and the
* nested encap must carry a nested transport (UDP) header.
*/
nhlen = skb_inner_mac_header(skb) - skb->data;
if (nhlen > 255 || !skb_transport_header_was_set(skb) ||
skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
(skb_transport_offset(skb) + sizeof(struct udphdr) > nhlen))
return 0;
id = proto_to_id(skb->inner_protocol);
if (id < 0)
return 0;
nested_is_v6 = skb->protocol == htons(ETH_P_IPV6);
if (nested_is_v6) {
int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
u8 proto = ipv6_hdr(skb)->nexthdr;
__be16 foff;
if (ipv6_skip_exthdr(skb, start, &proto, &foff) < 0 ||
proto != IPPROTO_UDP)
return 0;
} else {
if (ip_hdr(skb)->protocol != IPPROTO_UDP)
return 0;
}
hint = &cb->gro_hint;
memset(hint, 0, sizeof(*hint));
hint->inner_proto_id = id;
hint->nested_is_v6 = skb->protocol == htons(ETH_P_IPV6);
hint->nested_nh_offset = skb_network_offset(skb);
hint->nested_tp_offset = skb_transport_offset(skb);
hint->nested_hdr_len = nhlen;
cb->gro_hint_len = GENEVE_OPT_GRO_HINT_SIZE;
return GENEVE_OPT_GRO_HINT_SIZE;
}
/*
 * Append the GRO hint option after the @opt_size bytes of options already
 * present in @gnvh and update the header option length accordingly.
 */
static void geneve_put_gro_hint_opt(struct genevehdr *gnvh, int opt_size,
				    const struct geneve_opt_gro_hint *hint)
{
	struct geneve_opt *hint_opt = (void *)(gnvh + 1) + opt_size;
	int full_opt_size = opt_size + GENEVE_OPT_GRO_HINT_SIZE;

	/* geneve_build_header() did not take the GRO hint into account. */
	gnvh->opt_len = full_opt_size >> 2;

	/* Option header first ... */
	memset(hint_opt, 0, sizeof(*hint_opt));
	hint_opt->length = GENEVE_OPT_GRO_HINT_LEN;
	hint_opt->type = GENEVE_OPT_GRO_HINT_TYPE;
	hint_opt->opt_class = htons(GENEVE_OPT_NETDEV_CLASS);

	/* ... followed by the hint itself. */
	memcpy(hint_opt + 1, hint, sizeof(*hint));
}
static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
const struct ip_tunnel_info *info,
bool xnet, int ip_hdr_len,
bool inner_proto_inherit)
const struct geneve_dev *geneve, int ip_hdr_len)
{
bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
bool inner_proto_inherit = geneve->cfg.inner_proto_inherit;
bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
struct geneve_skb_cb *cb = GENEVE_SKB_CB(skb);
struct genevehdr *gnvh;
__be16 inner_proto;
bool double_encap;
int min_headroom;
int opt_size;
int err;
skb_reset_mac_header(skb);
skb_scrub_packet(skb, xnet);
opt_size = info->options_len + cb->gro_hint_len;
min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len +
GENEVE_BASE_HLEN + info->options_len + ip_hdr_len;
GENEVE_BASE_HLEN + opt_size + ip_hdr_len;
err = skb_cow_head(skb, min_headroom);
if (unlikely(err))
goto free_dst;
double_encap = udp_tunnel_handle_partial(skb);
err = udp_tunnel_handle_offloads(skb, udp_sum);
if (err)
goto free_dst;
gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len);
gnvh = __skb_push(skb, sizeof(*gnvh) + opt_size);
inner_proto = inner_proto_inherit ? skb->protocol : htons(ETH_P_TEB);
geneve_build_header(gnvh, info, inner_proto);
skb_set_inner_protocol(skb, inner_proto);
if (cb->gro_hint_len)
geneve_put_gro_hint_opt(gnvh, info->options_len, &cb->gro_hint);
udp_tunnel_set_inner_protocol(skb, double_encap, inner_proto);
return 0;
free_dst:
@ -821,8 +1292,6 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
struct geneve_dev *geneve,
const struct ip_tunnel_info *info)
{
bool inner_proto_inherit = geneve->cfg.inner_proto_inherit;
bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
const struct ip_tunnel_key *key = &info->key;
struct rtable *rt;
@ -833,7 +1302,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
__be16 sport;
int err;
if (skb_vlan_inet_prepare(skb, inner_proto_inherit))
if (skb_vlan_inet_prepare(skb, geneve->cfg.inner_proto_inherit))
return -EINVAL;
if (!gs4)
@ -854,7 +1323,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
return PTR_ERR(rt);
err = skb_tunnel_check_pmtu(skb, &rt->dst,
GENEVE_IPV4_HLEN + info->options_len,
GENEVE_IPV4_HLEN + info->options_len +
geneve_build_gro_hint_opt(geneve, skb),
netif_is_any_bridge_port(dev));
if (err < 0) {
dst_release(&rt->dst);
@ -916,8 +1386,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
}
}
err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr),
inner_proto_inherit);
err = geneve_build_skb(&rt->dst, skb, info, geneve,
sizeof(struct iphdr));
if (unlikely(err))
return err;
@ -934,8 +1404,6 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
struct geneve_dev *geneve,
const struct ip_tunnel_info *info)
{
bool inner_proto_inherit = geneve->cfg.inner_proto_inherit;
bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
const struct ip_tunnel_key *key = &info->key;
struct dst_entry *dst = NULL;
@ -945,7 +1413,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
__be16 sport;
int err;
if (skb_vlan_inet_prepare(skb, inner_proto_inherit))
if (skb_vlan_inet_prepare(skb, geneve->cfg.inner_proto_inherit))
return -EINVAL;
if (!gs6)
@ -966,7 +1434,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
return PTR_ERR(dst);
err = skb_tunnel_check_pmtu(skb, dst,
GENEVE_IPV6_HLEN + info->options_len,
GENEVE_IPV6_HLEN + info->options_len +
geneve_build_gro_hint_opt(geneve, skb),
netif_is_any_bridge_port(dev));
if (err < 0) {
dst_release(dst);
@ -1008,8 +1477,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
ttl = key->ttl;
ttl = ttl ? : ip6_dst_hoplimit(dst);
}
err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr),
inner_proto_inherit);
err = geneve_build_skb(dst, skb, info, geneve, sizeof(struct ipv6hdr));
if (unlikely(err))
return err;
@ -1211,9 +1679,16 @@ static void geneve_setup(struct net_device *dev)
dev->features |= NETIF_F_RXCSUM;
dev->features |= NETIF_F_GSO_SOFTWARE;
/* Partial features are disabled by default. */
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
dev->hw_features |= NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
dev->hw_features |= UDP_TUNNEL_PARTIAL_FEATURES;
dev->hw_features |= NETIF_F_GSO_PARTIAL;
dev->hw_enc_features = dev->hw_features;
dev->gso_partial_features = UDP_TUNNEL_PARTIAL_FEATURES;
dev->mangleid_features = NETIF_F_GSO_PARTIAL;
dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
/* MTU range: 68 - (something less than 65535) */
@ -1248,6 +1723,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
[IFLA_GENEVE_DF] = { .type = NLA_U8 },
[IFLA_GENEVE_INNER_PROTO_INHERIT] = { .type = NLA_FLAG },
[IFLA_GENEVE_PORT_RANGE] = NLA_POLICY_EXACT_LEN(sizeof(struct ifla_geneve_port_range)),
[IFLA_GENEVE_GRO_HINT] = { .type = NLA_FLAG },
};
static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
@ -1598,10 +2074,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
cfg->inner_proto_inherit = true;
}
if (data[IFLA_GENEVE_GRO_HINT]) {
if (changelink) {
attrtype = IFLA_GENEVE_GRO_HINT;
goto change_notsup;
}
cfg->gro_hint = true;
}
return 0;
change_notsup:
NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
"Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported");
"Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, gro_hint and UDP checksum attributes are not supported");
return -EOPNOTSUPP;
}
@ -1784,6 +2268,7 @@ static size_t geneve_get_size(const struct net_device *dev)
nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */
nla_total_size(0) + /* IFLA_GENEVE_INNER_PROTO_INHERIT */
nla_total_size(sizeof(struct ifla_geneve_port_range)) + /* IFLA_GENEVE_PORT_RANGE */
nla_total_size(0) + /* IFLA_GENEVE_GRO_HINT */
0;
}
@ -1856,6 +2341,10 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
if (nla_put(skb, IFLA_GENEVE_PORT_RANGE, sizeof(ports), &ports))
goto nla_put_failure;
if (geneve->cfg.gro_hint &&
nla_put_flag(skb, IFLA_GENEVE_GRO_HINT))
goto nla_put_failure;
return 0;
nla_put_failure:

View File

@ -2183,11 +2183,12 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
struct vxlan_metadata *md, u32 vxflags,
bool udp_sum)
{
struct vxlanhdr *vxh;
int min_headroom;
int err;
int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
__be16 inner_protocol = htons(ETH_P_TEB);
struct vxlanhdr *vxh;
bool double_encap;
int min_headroom;
int err;
if ((vxflags & VXLAN_F_REMCSUM_TX) &&
skb->ip_summed == CHECKSUM_PARTIAL) {
@ -2208,6 +2209,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
if (unlikely(err))
return err;
double_encap = udp_tunnel_handle_partial(skb);
err = iptunnel_handle_offloads(skb, type);
if (err)
return err;
@ -2238,7 +2240,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
inner_protocol = skb->protocol;
}
skb_set_inner_protocol(skb, inner_protocol);
udp_tunnel_set_inner_protocol(skb, double_encap, inner_protocol);
return 0;
}
@ -3348,10 +3350,18 @@ static void vxlan_setup(struct net_device *dev)
dev->features |= NETIF_F_RXCSUM;
dev->features |= NETIF_F_GSO_SOFTWARE;
/* Partial features are disabled by default. */
dev->vlan_features = dev->features;
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
dev->hw_features |= NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
dev->hw_features |= UDP_TUNNEL_PARTIAL_FEATURES;
dev->hw_features |= NETIF_F_GSO_PARTIAL;
dev->hw_enc_features = dev->hw_features;
dev->gso_partial_features = UDP_TUNNEL_PARTIAL_FEATURES;
dev->mangleid_features = NETIF_F_GSO_PARTIAL;
netif_keep_dst(dev);
dev->priv_flags |= IFF_NO_QUEUE;
dev->change_proto_down = true;

View File

@ -1831,6 +1831,8 @@ enum netdev_reg_state {
*
* @mpls_features: Mask of features inheritable by MPLS
* @gso_partial_features: value(s) from NETIF_F_GSO\*
* @mangleid_features: Mask of features requiring MANGLEID, will be
* disabled together with the latter.
*
* @ifindex: interface index
* @group: The group the device belongs to
@ -2219,6 +2221,7 @@ struct net_device {
netdev_features_t vlan_features;
netdev_features_t hw_enc_features;
netdev_features_t mpls_features;
netdev_features_t mangleid_features;
unsigned int min_mtu;
unsigned int max_mtu;

View File

@ -10,6 +10,11 @@
#include <net/ipv6_stubs.h>
#endif
#define UDP_TUNNEL_PARTIAL_FEATURES NETIF_F_GSO_ENCAP_ALL
#define UDP_TUNNEL_STRIPPED_GSO_TYPES ((UDP_TUNNEL_PARTIAL_FEATURES | \
NETIF_F_GSO_PARTIAL) >> \
NETIF_F_GSO_SHIFT)
struct udp_port_cfg {
u8 family;
@ -145,6 +150,33 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
__be16 src_port, __be16 dst_port, bool nocheck,
u16 ip6cb_flags);
static inline bool udp_tunnel_handle_partial(struct sk_buff *skb)
{
bool double_encap = !!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL);
/*
* If the skb went through partial segmentation, lower devices
* will not need to offload the related features - except for
* UDP_TUNNEL, that will be re-added by the later
* udp_tunnel_handle_offloads().
*/
if (double_encap)
skb_shinfo(skb)->gso_type &= ~UDP_TUNNEL_STRIPPED_GSO_TYPES;
return double_encap;
}
/*
 * Record @inner_proto as the skb inner protocol, unless the packet is double
 * encapsulated.
 */
static inline void udp_tunnel_set_inner_protocol(struct sk_buff *skb,
						 bool double_encap,
						 __be16 inner_proto)
{
	/*
	 * With double encapsulation the inner protocol has already been
	 * set by the nested tunnel: don't override it.
	 */
	if (double_encap)
		return;

	skb_set_inner_protocol(skb, inner_proto);
}
void udp_tunnel_sock_release(struct socket *sock);
struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,

View File

@ -1443,6 +1443,7 @@ enum {
IFLA_GENEVE_DF,
IFLA_GENEVE_INNER_PROTO_INHERIT,
IFLA_GENEVE_PORT_RANGE,
IFLA_GENEVE_GRO_HINT,
__IFLA_GENEVE_MAX
};
#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1)

View File

@ -3802,7 +3802,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
inner_ip_hdr(skb) : ip_hdr(skb);
if (!(iph->frag_off & htons(IP_DF)))
features &= ~NETIF_F_TSO_MANGLEID;
features &= ~dev->mangleid_features;
}
/* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
@ -11402,6 +11402,9 @@ int register_netdevice(struct net_device *dev)
if (dev->hw_enc_features & NETIF_F_TSO)
dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
/* TSO_MANGLEID belongs in mangleid_features by definition */
dev->mangleid_features |= NETIF_F_TSO_MANGLEID;
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
dev->vlan_features |= NETIF_F_HIGHDMA;

View File

@ -22,6 +22,7 @@ TEST_PROGS := \
cmsg_so_mark.sh \
cmsg_so_priority.sh \
cmsg_time.sh \
double_udp_encap.sh \
drop_monitor_tests.sh \
fcnal-ipv4.sh \
fcnal-ipv6.sh \

View File

@ -77,6 +77,7 @@ CONFIG_NET_DROP_MONITOR=m
CONFIG_NETFILTER=y
CONFIG_NETFILTER_ADVANCED=y
CONFIG_NETFILTER_XTABLES_LEGACY=y
CONFIG_NETFILTER_XT_MATCH_BPF=m
CONFIG_NETFILTER_XT_MATCH_LENGTH=m
CONFIG_NETFILTER_XT_MATCH_POLICY=m
CONFIG_NETFILTER_XT_NAT=m

View File

@ -0,0 +1,393 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Helpers such as setup_ns, cleanup_all_ns, check_err and log_test are used
# below but not defined in this file; presumably they come from lib.sh.
source lib.sh
# Path of the ynl command line tool, resolved relative to this script.
# shellcheck disable=SC2155 # prefer RO variable over return value from cmd
readonly CLI="$(dirname "$(readlink -f "$0")")/../../../net/ynl/pyynl/cli.py"
# Numeric ids of the two endpoints: used as device name suffix (veth1/veth2,
# ...) and as the host part of every address.
readonly SRC=1
readonly DST=2
# Address prefixes assigned to the veth pair.
readonly NET_V4=192.168.1.
readonly NET_V6=2001:db8::
# Address prefixes assigned to the first level tunnel devices.
readonly OL1_NET_V4=172.16.1.
readonly OL1_NET_V6=2001:db8:1::
# Address prefixes assigned to the nested tunnel devices.
readonly OL2_NET_V4=172.16.2.
readonly OL2_NET_V6=2001:db8:2::
# Run the namespace cleanup helper whenever the script exits.
trap cleanup_all_ns EXIT
# Succeed (status 0) when the address passed as $1 contains a ':' and is thus
# treated as IPv6; fail (status 1) otherwise.
# shellcheck disable=SC2329 # can't figure out usage through a variable
is_ipv6() {
	case "$1" in
	*:*)
		return 0
		;;
	esac
	return 1
}
# Create a geneve device via the ynl rt-link CLI and bring it up.
#   $1: network namespace hosting the device
#   $2: remote endpoint address; the family selects the "remote" or
#       "remote6" attribute
#   $3: name of the geneve device
#   $4: geneve id
#   $5: optional extra attributes, comma separated and already in the JSON
#       syntax expected by ynl (e.g. '"port":6082')
# shellcheck disable=SC2329 # can't figure out usage through a variable
create_gnv_endpoint() {
	local -r netns=$1
	local -r bm_rem_addr=$2
	local -r gnv_dev=$3
	local -r gnv_id=$4
	local extra_attrs=$5
	local -r nl=$'\n'
	local rem_attr=remote
	local gnv_data
	local request

	if is_ipv6 "$bm_rem_addr"; then
		rem_attr=remote6
	fi

	# add ynl opt separator, if needed
	extra_attrs=${extra_attrs:+, $extra_attrs}

	gnv_data="{ \"id\": $gnv_id, \"$rem_attr\": \"$bm_rem_addr\"$extra_attrs }"

	# the request spans three lines, joined by plain newlines
	request="{\"ifname\": \"$gnv_dev\",$nl"
	request+="\"linkinfo\": {\"kind\":\"geneve\",$nl"
	request+="\"data\": $gnv_data } }"

	ip netns exec "$netns" "$CLI" --family rt-link --create --excl \
		--do newlink --json "$request" > /dev/null
	ip -n "$netns" link set dev "$gnv_dev" up
}
# Create a vxlan device with iproute2 and bring it up.
#   $1: network namespace hosting the device
#   $2: remote endpoint address
#   $3: name of the vxlan device
#   $4: vxlan id
#   $5: optional, comma separated options in the '"key":value' syntax also
#       accepted by create_gnv_endpoint
# Only '"port":<n>' is translated, into 'dstport <n>'; any other entry is
# appended untouched, twice. With no options at all 'dstport 4789' is used.
# shellcheck disable=SC2329 # can't figure out usage through a variable
create_vxlan_endpoint() {
	local -r netns=$1
	local -r bm_rem_addr=$2
	local -r vxlan_dev=$3
	local -r vxlan_id=$4
	local -r opts_str=$5
	local -r port_key='"port":'
	local -a link_args
	local saved_ifs
	local item

	# split the option list on commas and convert each entry to the
	# iproute2 syntax
	saved_ifs=$IFS
	IFS=','
	for item in $opts_str; do
		if [ -z "$item" ]; then
			continue
		fi

		# first word: the whole '"port":<n>' becomes 'dstport';
		# second word: the key is stripped, leaving only <n>
		link_args+=("${item/$port_key*/dstport}" "${item/$port_key/}")
	done
	IFS=$saved_ifs

	if [ ${#link_args[@]} -eq 0 ]; then
		link_args+=("dstport" "4789")
	fi

	ip -n "$netns" link add "$vxlan_dev" type vxlan id "$vxlan_id" \
		remote "$bm_rem_addr" "${link_args[@]}"
	ip -n "$netns" link set dev "$vxlan_dev" up
}
# Build the test topology: two namespaces connected by a veth pair, each one
# hosting a first level tunnel device on top of the veth and a nested tunnel
# device on top of the first level one.
#   $1: optional extra options for the first level tunnel devices
#   $2: optional extra options for the nested tunnel devices, appended to
#       the default '"port":6082'
# Reads the globals ENCAP (vxlan or geneve) and FAMILY (4 selects IPv4, any
# other value IPv6) and resets RET. Returns 1, after printing a diagnostic,
# when ENCAP holds an unsupported value.
create_ns() {
	local nested_opt='"port":6082'
	local create_endpoint
	local options="$1"
	local -a addr_flags=()
	local ul_net ol1_net ol2_net
	local tun_id nst_id nst_mtu
	local feature
	local rem_id
	local plen
	local dev
	local id
	local ns

	RET=0

	# +-------------+        +-------------+
	# | NS_SRC      |        | NS_DST      |
	# |             |        |             |
	# |   gnv_nst1  |        |  gnv_nst2   |
	# |   +         |        |         +   |
	# |   |         |        |         |   |
	# |   +         |        |         +   |
	# |  gnv1       |        |        gnv2 |
	# |   +         |        |         +   |
	# |   |         |        |         |   |
	# |   + veth1   +--------+   veth2 +   |
	# |             |        |             |
	# +-------------+        +-------------+

	setup_ns NS_SRC NS_DST

	# concatenate caller provided options and default one
	if [ -n "$2" ]; then
		nested_opt="$nested_opt,$2"
	fi

	ip link add name "veth$SRC" netns "$NS_SRC" type veth \
		peer name "veth$DST" netns "$NS_DST"

	case "$ENCAP" in
	vxlan)
		create_endpoint=create_vxlan_endpoint
		dev=vx
		;;
	geneve)
		create_endpoint=create_gnv_endpoint
		dev=gnv
		;;
	*)
		# with an empty $create_endpoint the calls below would try
		# to execute the namespace name as a command
		echo "unsupported encap type '$ENCAP'" >&2
		return 1
		;;
	esac

	# per family settings: address prefixes and prefix length, tunnel ids,
	# device name infix and mtu of the nested device
	if [ "$FAMILY" = "4" ]; then
		ul_net=$NET_V4
		ol1_net=$OL1_NET_V4
		ol2_net=$OL2_NET_V4
		plen=24
		tun_id=4
		nst_id=40
		nst_mtu=1392
	else
		ul_net=$NET_V6
		ol1_net=$OL1_NET_V6
		ol2_net=$OL2_NET_V6
		plen=64
		tun_id=6
		nst_id=60
		nst_mtu=1352
		dev="${dev}6"
		addr_flags=(nodad)
	fi

	# ids are assigned following the NS_LIST order; presumably NS_SRC comes
	# first there, so that it gets $SRC - verify against setup_ns
	id=1
	for ns in "${NS_LIST[@]}"; do
		ip -n "$ns" link set dev "veth$id" up

		# ensure the sender can do large write just after 3whs
		ip netns exec "$ns" \
			sysctl -qw net.ipv4.tcp_wmem="4096 4194304 4194304"

		# note that 3 - $SRC == $DST and 3 - $DST == $SRC
		rem_id=$((3 - id))

		ip -n "$ns" addr add dev "veth$id" "$ul_net$id/$plen" \
			"${addr_flags[@]}"
		$create_endpoint "$ns" "$ul_net$rem_id" \
			"$dev$id" "$tun_id" "$options"
		ip -n "$ns" addr add dev "$dev$id" "$ol1_net$id/$plen" \
			"${addr_flags[@]}"

		# nested tunnel devices
		# pmtu can't be propagated to upper layer devices;
		# need manual adjust
		$create_endpoint "$ns" "$ol1_net$rem_id" \
			"${dev}_nst$id" "$nst_id" "$nested_opt"
		ip -n "$ns" addr add dev "${dev}_nst$id" \
			"$ol2_net$id/$plen" "${addr_flags[@]}"
		ip -n "$ns" link set dev "${dev}_nst$id" mtu "$nst_mtu"

		id=$((id+1))
	done

	# turn off TSO and UDP tunnel segmentation offload on the sender veth,
	# to ensure UDP L4 over tunnel is actually segmented; ethtool errors
	# are deliberately discarded
	for feature in tso tx-udp_tnl-segmentation; do
		ip netns exec "$NS_SRC" ethtool -K "veth$SRC" \
			"$feature" off 2>/dev/null
	done
}
# Same as create_ns (all the arguments are forwarded), then enable GSO
# partial and UDP tunnel segmentation offloads on the first level tunnel
# device of the source namespace.
create_ns_gso() {
	local tun

	create_ns "$@"

	# rebuild the first level tunnel device name picked by create_ns
	case "$ENCAP" in
	geneve)
		tun=gnv
		;;
	*)
		tun=vx
		;;
	esac
	if [ "$FAMILY" = "6" ]; then
		tun="${tun}6"
	fi

	ip netns exec "$NS_SRC" ethtool -K "$tun$SRC" \
		tx-gso-partial on \
		tx-udp_tnl-segmentation on \
		tx-udp_tnl-csum-segmentation on
}
# Same as create_ns_gso (all the arguments are forwarded), then turn GRO on
# for the destination veth and the 'tx' offload off for the source veth;
# the output and errors of the latter command are discarded.
create_ns_gso_gro() {
	create_ns_gso "$@"

	ip netns exec "$NS_DST" ethtool -K "veth$DST" gro on
	ip netns exec "$NS_SRC" ethtool -K "veth$SRC" tx off &> /dev/null
}
# Run one TCP transfer from NS_SRC to NS_DST across the nested tunnel and
# check how many double encapsulated packets are counted on each side.
#   $1: test description, passed to log_test
#   $2: number of bytes to transfer
#   $3: expected number of packets counted by the NS_DST INPUT rule
#   $4: optional, expected number of packets counted by the NS_SRC OUTPUT
#       rule; when omitted it is 1 if $2 is a multiple of GSO_SIZE and 2
#       otherwise
# Reads the globals FAMILY, ENCAP, NET, GSO_SIZE, USE_HINT and INHERIT.
run_test() {
	local -r dst=$NET$DST
	local -r msg=$1
	local -r total_size=$2
	local -r encappkts=$3
	local inner_proto_offset=0
	local inner_maclen=14
	local rx_family="-4"
	local ipt=iptables
	local bpf_filter
	local -a rx_args
	local wire_pkts
	local rcvpkts
	local encl=8
	local dport
	local pkts
	local snd

	if [ "$FAMILY" = "6" ]; then
		ipt=ip6tables
	else
		# rx program does not support '-6' and implies ipv6 usage by
		# default
		rx_args=("$rx_family")
	fi

	# The receiver can only check fixed size packets
	pkts=$((total_size / GSO_SIZE))
	if [ -n "$4" ]; then
		wire_pkts=$4
	elif [ $((total_size % GSO_SIZE)) -eq 0 ]; then
		wire_pkts=1
		rx_args+=("-l" "$GSO_SIZE")
	else
		wire_pkts=2
		pkts=$((pkts + 1))
	fi

	if [ "$ENCAP" = "geneve" ]; then
		dport=6081
	else
		dport=4789
	fi

	# Either:
	# - IPv4, nested tunnel carries UDP over IPv4, with dport 6082,
	#   innermost is TCP over IPv4 on port 8000
	# - IPv6, nested tunnel carries UDP over IPv6, with dport 6082,
	#   innermost is TCP over IPv6 on port 8000
	# The nested tunnel port is 6082 and the nested encap len is 8
	# regardless of the encap type (no geneve opts).
	# In inherit protocol mode there is no nested mac hdr and the nested
	# l3 protocol type field belongs to the geneve hdr.
	#
	# In the filter below 0x0800/0x86dd are the IPv4/IPv6 ethertypes,
	# 0x11 and 0x6 the UDP and TCP protocol numbers, 0x17c2 is port 6082
	# and 0x1f40 is port 8000. All the offsets after the first encap
	# header are shifted by its length ($encl, 16 bytes when USE_HINT is
	# set) and the innermost ones also by the nested mac header length.
	[ "$USE_HINT" = true ] && encl=16
	[ "$INHERIT" = true ] && inner_maclen=0
	[ "$INHERIT" = true ] && inner_proto_offset=-4
	local inner=$((inner_maclen+encl))
	local proto=$((inner_maclen+encl+inner_proto_offset))
	bpf_filter=$(nfbpf_compile "(ip &&
ip[$((40+encl))] == 0x08 && ip[$((41+encl))] == 0x00 &&
ip[$((51+encl))] == 0x11 &&
ip[$((64+encl))] == 0x17 && ip[$((65+encl))] == 0xc2 &&
ip[$((76+proto))] == 0x08 && ip[$((77+proto))] == 0x00 &&
ip[$((87+inner))] == 0x6 &&
ip[$((100+inner))] == 0x1f && ip[$((101+inner))] == 0x40) ||
(ip6 &&
ip6[$((60+encl))] == 0x86 && ip6[$((61+encl))] == 0xdd &&
ip6[$((68+encl))] == 0x11 &&
ip6[$((104+encl))] == 0x17 && ip6[$((105+encl))] == 0xc2 &&
ip6[$((116+proto))] == 0x86 && ip6[$((117+proto))] == 0xdd &&
ip6[$((124+inner))] == 0x6 &&
ip6[$((160+inner))] == 0x1f && ip6[$((161+inner))] == 0x40)")

	# ignore short packets, to avoid arp/mld induced noise; the rules have
	# no target and are used only for their packet counters
	ip netns exec "$NS_SRC" "$ipt" -A OUTPUT -p udp --dport "$dport" \
		-m length --length 600:65535 -m bpf --bytecode "$bpf_filter"
	ip netns exec "$NS_DST" "$ipt" -A INPUT -p udp --dport "$dport" \
		-m length --length 600:65535 -m bpf --bytecode "$bpf_filter"

	# start the receiver in background and wait for it to be listening
	# before firing the sender
	ip netns exec "$NS_DST" ./udpgso_bench_rx -C 2000 -t -R 100 \
		-n "$pkts" "${rx_args[@]}" &
	local pid=$!

	wait_local_port_listen "$NS_DST" 8000 tcp
	ip netns exec "$NS_SRC" ./udpgso_bench_tx -"$FAMILY" -t -M 1 \
		-s "$total_size" -D "$dst"
	local ret=$?
	check_err "$ret" "client failure exit code $ret"

	wait "$pid"
	ret=$?
	check_err "$ret" "server failure exit code $ret"

	# the packet counter is the first number inside the leading
	# '[pkts:bytes]' of the matching iptables-save line
	snd=$(ip netns exec "$NS_SRC" "$ipt"-save -c |
		grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//')
	[ "$snd" = "$wire_pkts" ]
	# shellcheck disable=SC2319 # known false positive
	check_err $? "send $snd packets on the lowest link, expected $wire_pkts"

	rcvpkts=$(ip netns exec "$NS_DST" "$ipt"-save -c | \
		grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//')
	[ "$rcvpkts" = "$encappkts" ]
	check_err $? "received $rcvpkts $ENCAP packets, expected $encappkts"

	log_test "$msg"
}
# Run the whole test matrix, first over IPv4 and then over IPv6: plain, GSO
# and (IPv4 only) "no fixedid" cases for both vxlan and geneve, followed by
# the double tunnel GRO cases, which run on geneve only.
# Sets the globals FAMILY, NET, GSO_SIZE, ENCAP, USE_HINT and INHERIT, which
# are read by the setup and test helpers.
run_tests() {
	local -r zero_csum6='"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1'
	local -r gro_hint='"gro-hint":1'
	local four_segs

	for FAMILY in 4 6; do
		case "$FAMILY" in
		6)
			NET=$OL2_NET_V6
			GSO_SIZE=1280 # 1352 - 40 - 32
			;;
		*)
			NET=$OL2_NET_V4
			GSO_SIZE=1340 # 1392 - 20 - 32
			;;
		esac
		four_segs=$((GSO_SIZE * 4))

		echo "IPv$FAMILY"

		unset USE_HINT
		unset INHERIT

		# "geneve" must be last encap in list, so that later
		# test cases will run on it
		for ENCAP in "vxlan" "geneve"; do
			create_ns
			run_test "No GSO - $ENCAP" "$four_segs" 4 4
			cleanup_all_ns

			create_ns_gso
			run_test "GSO without GRO - $ENCAP" "$four_segs" 4 1
			cleanup_all_ns

			# IPv4 only test
			if [ "$FAMILY" != "4" ]; then
				continue
			fi

			create_ns_gso
			ip netns exec "$NS_SRC" \
				sysctl -qw net.ipv4.ip_no_pmtu_disc=1
			run_test "GSO disable due to no fixedid - $ENCAP" \
				"$four_segs" 4 4
			cleanup_all_ns
		done

		# GRO tests imply/require geneve encap, the only one providing
		# GRO hints
		create_ns_gso_gro
		run_test "double tunnel GRO, no hints" "$four_segs" 4
		cleanup_all_ns

		# hint option is expected for all the following tests in the RX
		# path
		USE_HINT=true
		create_ns_gso_gro "$gro_hint,$zero_csum6" "$zero_csum6"
		run_test "double tunnel GRO" "$four_segs" 1
		cleanup_all_ns

		create_ns_gso_gro "$gro_hint"',"udp-csum":1' '"udp-csum":1'
		run_test "double tunnel GRO - csum complete" "$four_segs" 1
		cleanup_all_ns

		create_ns_gso_gro "$gro_hint" '"udp-csum":0,'"$zero_csum6"
		run_test "double tunnel GRO - no nested csum" "$four_segs" 1
		cleanup_all_ns

		create_ns_gso_gro "$gro_hint,$zero_csum6" '"udp-csum":1'
		run_test "double tunnel GRO - nested csum, outer 0-csum, skip" \
			"$four_segs" 4
		cleanup_all_ns

		INHERIT=true
		create_ns_gso_gro "$gro_hint"',"udp-csum":1' \
			'"udp-csum":1,"inner-proto-inherit":1'
		run_test "double tunnel GRO - nested inherit proto" \
			"$four_segs" 1
		cleanup_all_ns
		unset INHERIT

		create_ns_gso_gro "$gro_hint"
		run_test "double tunnel GRO - short last pkt" \
			$((GSO_SIZE * 4 + GSO_SIZE / 2)) 2
		cleanup_all_ns
	done
}
# nfbpf_compile is used by run_test to build the iptables bpf match.
require_command nfbpf_compile
# NOTE(review): jq is not invoked directly in the visible script; presumably a
# helper needs it - confirm.
require_command jq
# tcp retransmissions will break the accounting
xfail_on_slow run_tests
# EXIT_STATUS is not set in this file; presumably maintained by the lib.sh
# helpers - verify.
exit "$EXIT_STATUS"