diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 6beeb6ee5adf..df4b56beb818 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -1914,6 +1914,9 @@ attribute-sets: name: port-range type: binary struct: ifla-geneve-port-range + - + name: gro-hint + type: flag - name: linkinfo-hsr-attrs name-prefix: ifla-hsr- diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 77b0c3d52041..0949d4579171 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -38,6 +38,26 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); #define GENEVE_IPV4_HLEN (ETH_HLEN + sizeof(struct iphdr) + GENEVE_BASE_HLEN) #define GENEVE_IPV6_HLEN (ETH_HLEN + sizeof(struct ipv6hdr) + GENEVE_BASE_HLEN) +#define GENEVE_OPT_NETDEV_CLASS 0x100 +#define GENEVE_OPT_GRO_HINT_SIZE 8 +#define GENEVE_OPT_GRO_HINT_TYPE 1 +#define GENEVE_OPT_GRO_HINT_LEN 1 + +struct geneve_opt_gro_hint { + u8 inner_proto_id:2, + nested_is_v6:1; + u8 nested_nh_offset; + u8 nested_tp_offset; + u8 nested_hdr_len; +}; + +struct geneve_skb_cb { + unsigned int gro_hint_len; + struct geneve_opt_gro_hint gro_hint; +}; + +#define GENEVE_SKB_CB(__skb) ((struct geneve_skb_cb *)&((__skb)->cb[0])) + /* per-network namespace private data for this module */ struct geneve_net { struct list_head geneve_list; @@ -56,6 +76,7 @@ struct geneve_config { bool collect_md; bool use_udp6_rx_checksums; bool ttl_inherit; + bool gro_hint; enum ifla_geneve_df df; bool inner_proto_inherit; u16 port_min; @@ -84,6 +105,7 @@ struct geneve_dev { struct geneve_sock { bool collect_md; + bool gro_hint; struct list_head list; struct socket *sock; struct rcu_head rcu; @@ -91,6 +113,21 @@ struct geneve_sock { struct hlist_head vni_list[VNI_HASH_SIZE]; }; +static const __be16 proto_id_map[] = { htons(ETH_P_TEB), + htons(ETH_P_IPV6), + htons(ETH_P_IP) }; + +static int proto_to_id(__be16 proto) +{ + int i; + + for (i = 0; i < 
ARRAY_SIZE(proto_id_map); i++) + if (proto_id_map[i] == proto) + return i; + + return -1; +} + static inline __u32 geneve_net_vni_hash(u8 vni[3]) { __u32 vnid; @@ -222,9 +259,8 @@ static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs, /* geneve receive/decap routine */ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, - struct sk_buff *skb) + struct sk_buff *skb, const struct genevehdr *gnvh) { - struct genevehdr *gnvh = geneve_hdr(skb); struct metadata_dst *tun_dst = NULL; unsigned int len; int nh, err = 0; @@ -325,8 +361,12 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, } } + /* Skip the additional GRO stage when hints are in use. */ len = skb->len; - err = gro_cells_receive(&geneve->gro_cells, skb); + if (skb->encapsulation) + err = netif_rx(skb); + else + err = gro_cells_receive(&geneve->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) dev_dstats_rx_add(geneve->dev, len); @@ -363,6 +403,250 @@ static void geneve_uninit(struct net_device *dev) gro_cells_destroy(&geneve->gro_cells); } +static int geneve_hlen(const struct genevehdr *gh) +{ + return sizeof(*gh) + gh->opt_len * 4; +} + +/* + * Look for GRO hint in the geneve options; if not found or does not pass basic + * sanitization return 0, otherwise the offset WRT the geneve hdr start. 
+ */ +static unsigned int +geneve_opt_gro_hint_off(const struct genevehdr *gh, __be16 *type, + unsigned int *gh_len) +{ + struct geneve_opt *opt = (void *)(gh + 1); + unsigned int id, opt_len = gh->opt_len; + struct geneve_opt_gro_hint *gro_hint; + + while (opt_len >= (GENEVE_OPT_GRO_HINT_SIZE >> 2)) { + if (opt->opt_class == htons(GENEVE_OPT_NETDEV_CLASS) && + opt->type == GENEVE_OPT_GRO_HINT_TYPE && + opt->length == GENEVE_OPT_GRO_HINT_LEN) + goto found; + + /* check for bad opt len */ + if (opt->length + 1 >= opt_len) + return 0; + + /* next opt */ + opt_len -= opt->length + 1; + opt = ((void *)opt) + ((opt->length + 1) << 2); + } + return 0; + +found: + gro_hint = (struct geneve_opt_gro_hint *)opt->opt_data; + + /* + * Sanitize the hinted hdrs: the nested transport is UDP and must fit + * the overall hinted hdr size. + */ + if (gro_hint->nested_tp_offset + sizeof(struct udphdr) > + gro_hint->nested_hdr_len) + return 0; + + if (gro_hint->nested_nh_offset + + (gro_hint->nested_is_v6 ? sizeof(struct ipv6hdr) : + sizeof(struct iphdr)) > + gro_hint->nested_tp_offset) + return 0; + + /* Allow only supported L2. 
*/ + id = gro_hint->inner_proto_id; + if (id >= ARRAY_SIZE(proto_id_map)) + return 0; + + *type = proto_id_map[id]; + *gh_len += gro_hint->nested_hdr_len; + + return (void *)gro_hint - (void *)gh; +} + +static const struct geneve_opt_gro_hint * +geneve_opt_gro_hint(const struct genevehdr *gh, unsigned int hint_off) +{ + return (const struct geneve_opt_gro_hint *)((void *)gh + hint_off); +} + +static unsigned int +geneve_sk_gro_hint_off(const struct sock *sk, const struct genevehdr *gh, + __be16 *type, unsigned int *gh_len) +{ + const struct geneve_sock *gs = rcu_dereference_sk_user_data(sk); + + if (!gs || !gs->gro_hint) + return 0; + return geneve_opt_gro_hint_off(gh, type, gh_len); +} + +/* Validate the packet headers pointed by data WRT the provided hint */ +static bool +geneve_opt_gro_hint_validate(void *data, + const struct geneve_opt_gro_hint *gro_hint) +{ + void *nested_nh = data + gro_hint->nested_nh_offset; + struct iphdr *iph; + + if (gro_hint->nested_is_v6) { + struct ipv6hdr *ipv6h = nested_nh; + struct ipv6_opt_hdr *opth; + int offset, len; + + if (ipv6h->nexthdr == IPPROTO_UDP) + return true; + + offset = sizeof(*ipv6h) + gro_hint->nested_nh_offset; + while (offset + sizeof(*opth) <= gro_hint->nested_tp_offset) { + opth = data + offset; + + len = ipv6_optlen(opth); + if (len + offset > gro_hint->nested_tp_offset) + return false; + if (opth->nexthdr == IPPROTO_UDP) + return true; + + offset += len; + } + return false; + } + + iph = nested_nh; + if (*(u8 *)iph != 0x45 || ip_is_fragment(iph) || + iph->protocol != IPPROTO_UDP || ip_fast_csum((u8 *)iph, 5)) + return false; + + return true; +} + +/* + * Validate the skb headers following the specified geneve hdr vs the + * provided hint, including nested L4 checksum. + * The caller already ensured that the relevant amount of data is available + * in the linear part. 
+ */ +static bool +geneve_opt_gro_hint_validate_csum(const struct sk_buff *skb, + const struct genevehdr *gh, + const struct geneve_opt_gro_hint *gro_hint) +{ + unsigned int plen, gh_len = geneve_hlen(gh); + void *nested = (void *)gh + gh_len; + struct udphdr *nested_uh; + unsigned int nested_len; + struct ipv6hdr *ipv6h; + struct iphdr *iph; + __wsum csum, psum; + + if (!geneve_opt_gro_hint_validate(nested, gro_hint)) + return false; + + /* Use GRO hints with nested csum only if the outer header has csum. */ + nested_uh = nested + gro_hint->nested_tp_offset; + if (!nested_uh->check || skb->ip_summed == CHECKSUM_PARTIAL) + return true; + + if (!NAPI_GRO_CB(skb)->csum_valid) + return false; + + /* Compute the complete checksum up to the nested transport. */ + plen = gh_len + gro_hint->nested_tp_offset; + csum = csum_sub(NAPI_GRO_CB(skb)->csum, csum_partial(gh, plen, 0)); + nested_len = skb_gro_len(skb) - plen; + + /* Compute the nested pseudo header csum. */ + ipv6h = nested + gro_hint->nested_nh_offset; + iph = (struct iphdr *)ipv6h; + psum = gro_hint->nested_is_v6 ? 
+ ~csum_unfold(csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + nested_len, IPPROTO_UDP, 0)) : + csum_tcpudp_nofold(iph->saddr, iph->daddr, + nested_len, IPPROTO_UDP, 0); + + return !csum_fold(csum_add(psum, csum)); +} + +static int geneve_post_decap_hint(const struct sock *sk, struct sk_buff *skb, + unsigned int gh_len, + struct genevehdr **geneveh) +{ + const struct geneve_opt_gro_hint *gro_hint; + unsigned int len = 0, total_len, hint_off; /* hint lookup does += on len */ + struct ipv6hdr *ipv6h; + struct iphdr *iph; + struct udphdr *uh; + __be16 p; + + hint_off = geneve_sk_gro_hint_off(sk, *geneveh, &p, &len); + if (!hint_off) + return 0; + + if (!skb_is_gso(skb)) + return 0; + + gro_hint = geneve_opt_gro_hint(*geneveh, hint_off); + if (unlikely(!pskb_may_pull(skb, gro_hint->nested_hdr_len))) + return -ENOMEM; + + *geneveh = geneve_hdr(skb); + gro_hint = geneve_opt_gro_hint(*geneveh, hint_off); + + /* + * Validate hints from untrusted source before accessing + * the headers; csum will be checked later by the nested + * protocol rx path. + */ + if (unlikely(skb_shinfo(skb)->gso_type & SKB_GSO_DODGY && + !geneve_opt_gro_hint_validate(skb->data, gro_hint))) + return -EINVAL; + + ipv6h = (void *)skb->data + gro_hint->nested_nh_offset; + iph = (struct iphdr *)ipv6h; + total_len = skb->len - gro_hint->nested_nh_offset; + if (total_len > GRO_LEGACY_MAX_SIZE) + return -E2BIG; + + /* + * After stripping the outer encap, the packet still carries a + * tunnel encapsulation: the nested one. + */ + skb->encapsulation = 1; + + /* GSO expects a valid transport header, move it to the current one. */ + skb_set_transport_header(skb, gro_hint->nested_tp_offset); + + /* Adjust the nested IP{6} hdr to actual GSO len. */ + if (gro_hint->nested_is_v6) { + ipv6h->payload_len = htons(total_len - sizeof(*ipv6h)); + } else { + __be16 old_len = iph->tot_len; + + iph->tot_len = htons(total_len); + + /* For IPv4 additionally adjust the nested csum. 
*/ + csum_replace2(&iph->check, old_len, iph->tot_len); + ip_send_check(iph); + } + + /* Adjust the nested UDP header len and checksum. */ + uh = udp_hdr(skb); + uh->len = htons(skb->len - gro_hint->nested_tp_offset); + if (uh->check) { + len = skb->len - gro_hint->nested_nh_offset; + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; + if (gro_hint->nested_is_v6) + uh->check = ~udp_v6_check(len, &ipv6h->saddr, + &ipv6h->daddr, 0); + else + uh->check = ~udp_v4_check(len, iph->saddr, + iph->daddr, 0); + } else { + skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; + } + return 0; +} + /* Callback from net/ipv4/udp.c to receive packets */ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { @@ -404,7 +688,18 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto drop; } - geneve_rx(geneve, gs, skb); + /* + * After hint processing, the transport header points to the inner one + * and we can no longer rely on geneve_hdr(). + */ + geneveh = geneve_hdr(skb); + if (geneve_post_decap_hint(sk, skb, sizeof(struct genevehdr) + + opts_len, &geneveh)) { + DEV_STATS_INC(geneve->dev, rx_errors); + goto drop; + } + + geneve_rx(geneve, gs, skb, geneveh); return 0; drop: @@ -495,22 +790,93 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6, return sock; } -static int geneve_hlen(struct genevehdr *gh) +static bool geneve_hdr_match(struct sk_buff *skb, + const struct genevehdr *gh, + const struct genevehdr *gh2, + unsigned int hint_off) { - return sizeof(*gh) + gh->opt_len * 4; + const struct geneve_opt_gro_hint *gro_hint; + void *nested, *nested2, *nh, *nh2; + struct udphdr *udp, *udp2; + unsigned int gh_len; + + /* Match the geneve hdr and options */ + if (gh->opt_len != gh2->opt_len) + return false; + + gh_len = geneve_hlen(gh); + if (memcmp(gh, gh2, gh_len)) + return false; + + if (!hint_off) + return true; + + /* + * When gro is present consider the nested headers as part + * of the geneve options + */ + nested = 
(void *)gh + gh_len; + nested2 = (void *)gh2 + gh_len; + gro_hint = geneve_opt_gro_hint(gh, hint_off); + if (!memcmp(nested, nested2, gro_hint->nested_hdr_len)) + return true; + + /* + * The nested headers differ; the packets can still belong to + * the same flow when IPs/proto/ports match; if so flushing is + * required. + */ + nh = nested + gro_hint->nested_nh_offset; + nh2 = nested2 + gro_hint->nested_nh_offset; + if (gro_hint->nested_is_v6) { + struct ipv6hdr *iph = nh, *iph2 = nh2; + unsigned int nested_nlen; + __be32 first_word; + + first_word = *(__be32 *)iph ^ *(__be32 *)iph2; + if ((first_word & htonl(0xF00FFFFF)) || + !ipv6_addr_equal(&iph->saddr, &iph2->saddr) || + !ipv6_addr_equal(&iph->daddr, &iph2->daddr) || + iph->nexthdr != iph2->nexthdr) + return false; + + nested_nlen = gro_hint->nested_tp_offset - + gro_hint->nested_nh_offset; + if (nested_nlen > sizeof(struct ipv6hdr) && + (memcmp(iph + 1, iph2 + 1, + nested_nlen - sizeof(struct ipv6hdr)))) + return false; + } else { + struct iphdr *iph = nh, *iph2 = nh2; + + if ((iph->protocol ^ iph2->protocol) | + ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | + ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) + return false; + } + + udp = nested + gro_hint->nested_tp_offset; + udp2 = nested2 + gro_hint->nested_tp_offset; + if (udp->source != udp2->source || udp->dest != udp2->dest || + udp->check != udp2->check) + return false; + + NAPI_GRO_CB(skb)->flush = 1; + return true; } static struct sk_buff *geneve_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { + unsigned int hlen, gh_len, off_gnv, hint_off; + const struct geneve_opt_gro_hint *gro_hint; + const struct packet_offload *ptype; + struct genevehdr *gh, *gh2; struct sk_buff *pp = NULL; struct sk_buff *p; - struct genevehdr *gh, *gh2; - unsigned int hlen, gh_len, off_gnv; - const struct packet_offload *ptype; - __be16 type; int flush = 1; + __be16 type; off_gnv = skb_gro_offset(skb); hlen = off_gnv + sizeof(*gh); @@ 
-521,6 +887,7 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk, if (gh->ver != GENEVE_VER || gh->oam) goto out; gh_len = geneve_hlen(gh); + type = gh->proto_type; hlen = off_gnv + gh_len; if (!skb_gro_may_pull(skb, hlen)) { @@ -529,13 +896,30 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk, goto out; } + /* The GRO hint/nested hdr could use a different ethernet type. */ + hint_off = geneve_sk_gro_hint_off(sk, gh, &type, &gh_len); + if (hint_off) { + + /* + * If the hint is present, and nested hdr validation fails, do + * not attempt plain GRO: it will ignore inner hdrs and cause + * OoO. + */ + gh = skb_gro_header(skb, off_gnv + gh_len, off_gnv); + if (unlikely(!gh)) + goto out; + + gro_hint = geneve_opt_gro_hint(gh, hint_off); + if (!geneve_opt_gro_hint_validate_csum(skb, gh, gro_hint)) + goto out; + } + list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; gh2 = (struct genevehdr *)(p->data + off_gnv); - if (gh->opt_len != gh2->opt_len || - memcmp(gh, gh2, gh_len)) { + if (!geneve_hdr_match(skb, gh, gh2, hint_off)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } @@ -543,7 +927,6 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk, skb_gro_pull(skb, gh_len); skb_gro_postpull_rcsum(skb, gh, gh_len); - type = gh->proto_type; if (likely(type == htons(ETH_P_TEB))) return call_gro_receive(eth_gro_receive, head, skb); @@ -572,6 +955,7 @@ static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb, gh = (struct genevehdr *)(skb->data + nhoff); gh_len = geneve_hlen(gh); type = gh->proto_type; + geneve_sk_gro_hint_off(sk, gh, &type, &gh_len); /* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */ if (likely(type == htons(ETH_P_TEB))) @@ -659,13 +1043,15 @@ static void geneve_sock_release(struct geneve_dev *geneve) static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, sa_family_t family, - __be16 dst_port) + __be16 dst_port, + bool gro_hint) { struct geneve_sock *gs; 
list_for_each_entry(gs, &gn->sock_list, list) { if (inet_sk(gs->sock->sk)->inet_sport == dst_port && - geneve_get_sk_family(gs) == family) { + geneve_get_sk_family(gs) == family && + gs->gro_hint == gro_hint) { return gs; } } @@ -676,12 +1062,14 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) { struct net *net = geneve->net; struct geneve_net *gn = net_generic(net, geneve_net_id); + bool gro_hint = geneve->cfg.gro_hint; struct geneve_dev_node *node; struct geneve_sock *gs; __u8 vni[3]; __u32 hash; - gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->cfg.info.key.tp_dst); + gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, + geneve->cfg.info.key.tp_dst, gro_hint); if (gs) { gs->refcnt++; goto out; @@ -694,6 +1082,7 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) out: gs->collect_md = geneve->cfg.collect_md; + gs->gro_hint = gro_hint; #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { rcu_assign_pointer(geneve->sock6, gs); @@ -766,34 +1155,116 @@ static void geneve_build_header(struct genevehdr *geneveh, ip_tunnel_info_opts_get(geneveh->options, info); } +static int geneve_build_gro_hint_opt(const struct geneve_dev *geneve, + struct sk_buff *skb) +{ + struct geneve_skb_cb *cb = GENEVE_SKB_CB(skb); + struct geneve_opt_gro_hint *hint; + unsigned int nhlen; + bool nested_is_v6; + int id; + + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct geneve_skb_cb)); + cb->gro_hint_len = 0; + + /* Try to add the GRO hint only in case of double encap. */ + if (!geneve->cfg.gro_hint || !skb->encapsulation) + return 0; + + /* + * The nested headers must fit the geneve opt len fields and the + * nested encap must carry a nested transport (UDP) header. 
+ */ + nhlen = skb_inner_mac_header(skb) - skb->data; + if (nhlen > 255 || !skb_transport_header_was_set(skb) || + skb->inner_protocol_type != ENCAP_TYPE_ETHER || + (skb_transport_offset(skb) + sizeof(struct udphdr) > nhlen)) + return 0; + + id = proto_to_id(skb->inner_protocol); + if (id < 0) + return 0; + + nested_is_v6 = skb->protocol == htons(ETH_P_IPV6); + if (nested_is_v6) { + int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + u8 proto = ipv6_hdr(skb)->nexthdr; + __be16 foff; + + if (ipv6_skip_exthdr(skb, start, &proto, &foff) < 0 || + proto != IPPROTO_UDP) + return 0; + } else { + if (ip_hdr(skb)->protocol != IPPROTO_UDP) + return 0; + } + + hint = &cb->gro_hint; + memset(hint, 0, sizeof(*hint)); + hint->inner_proto_id = id; + hint->nested_is_v6 = skb->protocol == htons(ETH_P_IPV6); + hint->nested_nh_offset = skb_network_offset(skb); + hint->nested_tp_offset = skb_transport_offset(skb); + hint->nested_hdr_len = nhlen; + cb->gro_hint_len = GENEVE_OPT_GRO_HINT_SIZE; + return GENEVE_OPT_GRO_HINT_SIZE; +} + +static void geneve_put_gro_hint_opt(struct genevehdr *gnvh, int opt_size, + const struct geneve_opt_gro_hint *hint) +{ + struct geneve_opt *gro_opt; + + /* geneve_build_header() did not take into account the GRO hint. 
*/ + gnvh->opt_len = (opt_size + GENEVE_OPT_GRO_HINT_SIZE) >> 2; + + gro_opt = (void *)(gnvh + 1) + opt_size; + memset(gro_opt, 0, sizeof(*gro_opt)); + + gro_opt->opt_class = htons(GENEVE_OPT_NETDEV_CLASS); + gro_opt->type = GENEVE_OPT_GRO_HINT_TYPE; + gro_opt->length = GENEVE_OPT_GRO_HINT_LEN; + memcpy(gro_opt + 1, hint, sizeof(*hint)); +} + static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb, const struct ip_tunnel_info *info, - bool xnet, int ip_hdr_len, - bool inner_proto_inherit) + const struct geneve_dev *geneve, int ip_hdr_len) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); + bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; + bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); + struct geneve_skb_cb *cb = GENEVE_SKB_CB(skb); struct genevehdr *gnvh; __be16 inner_proto; + bool double_encap; int min_headroom; + int opt_size; int err; skb_reset_mac_header(skb); skb_scrub_packet(skb, xnet); + opt_size = info->options_len + cb->gro_hint_len; min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + - GENEVE_BASE_HLEN + info->options_len + ip_hdr_len; + GENEVE_BASE_HLEN + opt_size + ip_hdr_len; err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; + double_encap = udp_tunnel_handle_partial(skb); err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; - gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len); + gnvh = __skb_push(skb, sizeof(*gnvh) + opt_size); inner_proto = inner_proto_inherit ? 
skb->protocol : htons(ETH_P_TEB); geneve_build_header(gnvh, info, inner_proto); - skb_set_inner_protocol(skb, inner_proto); + + if (cb->gro_hint_len) + geneve_put_gro_hint_opt(gnvh, info->options_len, &cb->gro_hint); + + udp_tunnel_set_inner_protocol(skb, double_encap, inner_proto); return 0; free_dst: @@ -821,8 +1292,6 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { - bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; - bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs4 = rcu_dereference(geneve->sock4); const struct ip_tunnel_key *key = &info->key; struct rtable *rt; @@ -833,7 +1302,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) + if (skb_vlan_inet_prepare(skb, geneve->cfg.inner_proto_inherit)) return -EINVAL; if (!gs4) @@ -854,7 +1323,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, return PTR_ERR(rt); err = skb_tunnel_check_pmtu(skb, &rt->dst, - GENEVE_IPV4_HLEN + info->options_len, + GENEVE_IPV4_HLEN + info->options_len + + geneve_build_gro_hint_opt(geneve, skb), netif_is_any_bridge_port(dev)); if (err < 0) { dst_release(&rt->dst); @@ -916,8 +1386,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, } } - err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr), - inner_proto_inherit); + err = geneve_build_skb(&rt->dst, skb, info, geneve, + sizeof(struct iphdr)); if (unlikely(err)) return err; @@ -934,8 +1404,6 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct geneve_dev *geneve, const struct ip_tunnel_info *info) { - bool inner_proto_inherit = geneve->cfg.inner_proto_inherit; - bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); struct geneve_sock *gs6 = rcu_dereference(geneve->sock6); const struct ip_tunnel_key *key = 
&info->key; struct dst_entry *dst = NULL; @@ -945,7 +1413,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) + if (skb_vlan_inet_prepare(skb, geneve->cfg.inner_proto_inherit)) return -EINVAL; if (!gs6) @@ -966,7 +1434,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, return PTR_ERR(dst); err = skb_tunnel_check_pmtu(skb, dst, - GENEVE_IPV6_HLEN + info->options_len, + GENEVE_IPV6_HLEN + info->options_len + + geneve_build_gro_hint_opt(geneve, skb), netif_is_any_bridge_port(dev)); if (err < 0) { dst_release(dst); @@ -1008,8 +1477,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, ttl = key->ttl; ttl = ttl ? : ip6_dst_hoplimit(dst); } - err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr), - inner_proto_inherit); + err = geneve_build_skb(dst, skb, info, geneve, sizeof(struct ipv6hdr)); if (unlikely(err)) return err; @@ -1211,9 +1679,16 @@ static void geneve_setup(struct net_device *dev) dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; + /* Partial features are disabled by default. 
*/ dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= UDP_TUNNEL_PARTIAL_FEATURES; + dev->hw_features |= NETIF_F_GSO_PARTIAL; + + dev->hw_enc_features = dev->hw_features; + dev->gso_partial_features = UDP_TUNNEL_PARTIAL_FEATURES; + dev->mangleid_features = NETIF_F_GSO_PARTIAL; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; /* MTU range: 68 - (something less than 65535) */ @@ -1248,6 +1723,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_DF] = { .type = NLA_U8 }, [IFLA_GENEVE_INNER_PROTO_INHERIT] = { .type = NLA_FLAG }, [IFLA_GENEVE_PORT_RANGE] = NLA_POLICY_EXACT_LEN(sizeof(struct ifla_geneve_port_range)), + [IFLA_GENEVE_GRO_HINT] = { .type = NLA_FLAG }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1598,10 +2074,18 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], cfg->inner_proto_inherit = true; } + if (data[IFLA_GENEVE_GRO_HINT]) { + if (changelink) { + attrtype = IFLA_GENEVE_GRO_HINT; + goto change_notsup; + } + cfg->gro_hint = true; + } + return 0; change_notsup: NL_SET_ERR_MSG_ATTR(extack, data[attrtype], - "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported"); + "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, gro_hint and UDP checksum attributes are not supported"); return -EOPNOTSUPP; } @@ -1784,6 +2268,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ nla_total_size(0) + /* IFLA_GENEVE_INNER_PROTO_INHERIT */ nla_total_size(sizeof(struct ifla_geneve_port_range)) + /* IFLA_GENEVE_PORT_RANGE */ + nla_total_size(0) + /* IFLA_GENEVE_GRO_HINT */ 0; } @@ -1856,6 +2341,10 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put(skb, 
IFLA_GENEVE_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; + if (geneve->cfg.gro_hint && + nla_put_flag(skb, IFLA_GENEVE_GRO_HINT)) + goto nla_put_failure; + return 0; nla_put_failure: diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index e957aa12a8a4..7bd0ae0a6a33 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2183,11 +2183,12 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, struct vxlan_metadata *md, u32 vxflags, bool udp_sum) { - struct vxlanhdr *vxh; - int min_headroom; - int err; int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 inner_protocol = htons(ETH_P_TEB); + struct vxlanhdr *vxh; + bool double_encap; + int min_headroom; + int err; if ((vxflags & VXLAN_F_REMCSUM_TX) && skb->ip_summed == CHECKSUM_PARTIAL) { @@ -2208,6 +2209,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, if (unlikely(err)) return err; + double_encap = udp_tunnel_handle_partial(skb); err = iptunnel_handle_offloads(skb, type); if (err) return err; @@ -2238,7 +2240,7 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, inner_protocol = skb->protocol; } - skb_set_inner_protocol(skb, inner_protocol); + udp_tunnel_set_inner_protocol(skb, double_encap, inner_protocol); return 0; } @@ -3348,10 +3350,18 @@ static void vxlan_setup(struct net_device *dev) dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; + /* Partial features are disabled by default. 
*/ dev->vlan_features = dev->features; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= UDP_TUNNEL_PARTIAL_FEATURES; + dev->hw_features |= NETIF_F_GSO_PARTIAL; + + dev->hw_enc_features = dev->hw_features; + dev->gso_partial_features = UDP_TUNNEL_PARTIAL_FEATURES; + dev->mangleid_features = NETIF_F_GSO_PARTIAL; + netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->change_proto_down = true; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d99b0fbc1942..179b50f98cee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1831,6 +1831,8 @@ enum netdev_reg_state { * * @mpls_features: Mask of features inheritable by MPLS * @gso_partial_features: value(s) from NETIF_F_GSO\* + * @mangleid_features: Mask of features requiring MANGLEID, will be + * disabled together with the latter. * * @ifindex: interface index * @group: The group the device belongs to @@ -2219,6 +2221,7 @@ struct net_device { netdev_features_t vlan_features; netdev_features_t hw_enc_features; netdev_features_t mpls_features; + netdev_features_t mangleid_features; unsigned int min_mtu; unsigned int max_mtu; diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 9acef2fbd2fd..d9c6d04bb3b5 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -10,6 +10,11 @@ #include #endif +#define UDP_TUNNEL_PARTIAL_FEATURES NETIF_F_GSO_ENCAP_ALL +#define UDP_TUNNEL_STRIPPED_GSO_TYPES ((UDP_TUNNEL_PARTIAL_FEATURES | \ + NETIF_F_GSO_PARTIAL) >> \ + NETIF_F_GSO_SHIFT) + struct udp_port_cfg { u8 family; @@ -145,6 +150,33 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, __be16 src_port, __be16 dst_port, bool nocheck, u16 ip6cb_flags); +static inline bool udp_tunnel_handle_partial(struct sk_buff *skb) +{ + bool double_encap = !!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL); + + /* + * If the skb went through 
partial segmentation, lower devices + * will not need to offload the related features - except for + * UDP_TUNNEL, that will be re-added by the later + * udp_tunnel_handle_offloads(). + */ + if (double_encap) + skb_shinfo(skb)->gso_type &= ~UDP_TUNNEL_STRIPPED_GSO_TYPES; + return double_encap; +} + +static inline void udp_tunnel_set_inner_protocol(struct sk_buff *skb, + bool double_encap, + __be16 inner_proto) +{ + /* + * The inner protocol has been set by the nested tunnel, don't + * override it. + */ + if (!double_encap) + skb_set_inner_protocol(skb, inner_proto); +} + void udp_tunnel_sock_release(struct socket *sock); struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3b491d96e52e..e9b5f79e1ee1 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1443,6 +1443,7 @@ enum { IFLA_GENEVE_DF, IFLA_GENEVE_INNER_PROTO_INHERIT, IFLA_GENEVE_PORT_RANGE, + IFLA_GENEVE_GRO_HINT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 048ab4409a2c..ec0d0cdfc078 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3802,7 +3802,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) - features &= ~NETIF_F_TSO_MANGLEID; + features &= ~dev->mangleid_features; } /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, @@ -11402,6 +11402,9 @@ int register_netdevice(struct net_device *dev) if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; + /* TSO_MANGLEID belongs in mangleid_features by definition */ + dev->mangleid_features |= NETIF_F_TSO_MANGLEID; + /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 
*/ dev->vlan_features |= NETIF_F_HIGHDMA; diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index ce9699092f50..33f56fcbde09 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -22,6 +22,7 @@ TEST_PROGS := \ cmsg_so_mark.sh \ cmsg_so_priority.sh \ cmsg_time.sh \ + double_udp_encap.sh \ drop_monitor_tests.sh \ fcnal-ipv4.sh \ fcnal-ipv6.sh \ diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index b84362b9b508..cd49b7dfe216 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -77,6 +77,7 @@ CONFIG_NET_DROP_MONITOR=m CONFIG_NETFILTER=y CONFIG_NETFILTER_ADVANCED=y CONFIG_NETFILTER_XTABLES_LEGACY=y +CONFIG_NETFILTER_XT_MATCH_BPF=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m CONFIG_NETFILTER_XT_MATCH_POLICY=m CONFIG_NETFILTER_XT_NAT=m diff --git a/tools/testing/selftests/net/double_udp_encap.sh b/tools/testing/selftests/net/double_udp_encap.sh new file mode 100755 index 000000000000..9aaf97cdf141 --- /dev/null +++ b/tools/testing/selftests/net/double_udp_encap.sh @@ -0,0 +1,393 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +# shellcheck disable=SC2155 # prefer RO variable over return value from cmd +readonly CLI="$(dirname "$(readlink -f "$0")")/../../../net/ynl/pyynl/cli.py" + +readonly SRC=1 +readonly DST=2 + +readonly NET_V4=192.168.1. +readonly NET_V6=2001:db8:: +readonly OL1_NET_V4=172.16.1. +readonly OL1_NET_V6=2001:db8:1:: +readonly OL2_NET_V4=172.16.2. 
+readonly OL2_NET_V6=2001:db8:2:: + +trap cleanup_all_ns EXIT + +# shellcheck disable=SC2329 # can't figure out usage through a variable +is_ipv6() { + if [[ $1 =~ .*:.* ]]; then + return 0 + fi + return 1 +} + +# shellcheck disable=SC2329 # can't figure out usage through a variable +create_gnv_endpoint() { + local -r netns=$1 + local -r bm_rem_addr=$2 + local -r gnv_dev=$3 + local -r gnv_id=$4 + local opts=$5 + local gnv_json + local rem + + if is_ipv6 "$bm_rem_addr"; then + rem=remote6 + else + rem=remote + fi + + # add ynl opt separator, if needed + [ -n "$opts" ] && opts=", $opts" + + gnv_json="{ \"id\": $gnv_id, \"$rem\": \"$bm_rem_addr\"$opts }" + ip netns exec "$netns" "$CLI" --family rt-link --create --excl \ + --do newlink --json "{\"ifname\": \"$gnv_dev\", + \"linkinfo\": {\"kind\":\"geneve\", + \"data\": $gnv_json } }" > /dev/null + ip -n "$netns" link set dev "$gnv_dev" up +} + +# shellcheck disable=SC2329 # can't figure out usage through a variable +create_vxlan_endpoint() { + local -r netns=$1 + local -r bm_rem_addr=$2 + local -r vxlan_dev=$3 + local -r vxlan_id=$4 + local -r opts_str=$5 + local oldifs + local -a opts + local opt + + # convert the arguments from yaml format + oldifs=$IFS + IFS=',' + for opt in $opts_str; do + local pattern='"port":' + + [ -n "$opt" ] || continue + + opts+=("${opt/$pattern*/dstport}" "${opt/$pattern/}") + done + IFS=$oldifs + [ ${#opts[@]} -gt 0 ] || opts+=("dstport" "4789") + + ip -n "$netns" link add "$vxlan_dev" type vxlan id "$vxlan_id" \ + remote "$bm_rem_addr" "${opts[@]}" + ip -n "$netns" link set dev "$vxlan_dev" up +} + +create_ns() { + local nested_opt='"port":6082' + local create_endpoint + local options="$1" + local feature + local dev + local id + local ns + + RET=0 + + # +-------------+ +-------------+ + # | NS_SRC | | NS_NST_DST | + # | | | | + # | gnv_nst1 | | gnv_nst2 | + # | + | | + | + # | | | | | | + # | + | | + | + # | gnv1 | | gnv2 | + # | + | | + | + # | | | | | | + # | + veth1 +--------+ veth2 + | 
+ # | | | | + # +-------------+ +-------------+ + + setup_ns NS_SRC NS_DST + + # concatenate caller provided options and default one + [ -n "$2" ] && nested_opt="$nested_opt,$2" + + ip link add name "veth$SRC" netns "$NS_SRC" type veth \ + peer name "veth$DST" netns "$NS_DST" + case "$ENCAP" in + vxlan) + create_endpoint=create_vxlan_endpoint + dev=vx + ;; + geneve) + create_endpoint=create_gnv_endpoint + dev=gnv + ;; + esac + + id=1 + for ns in "${NS_LIST[@]}"; do + ip -n "$ns" link set dev "veth$id" up + + # ensure the sender can do large write just after 3whs + ip netns exec "$ns" \ + sysctl -qw net.ipv4.tcp_wmem="4096 4194304 4194304" + + # note that 3 - $SRC == $DST and 3 - $DST == $SRC + if [ $FAMILY = "4" ]; then + ip -n "$ns" addr add dev "veth$id" "$NET_V4$id/24" + $create_endpoint "$ns" "$NET_V4$((3 - id))" \ + "$dev$id" 4 "$options" + ip -n "$ns" addr add dev "$dev$id" "$OL1_NET_V4$id/24" + + # nested tunnel devices + # pmtu can't be propagated to upper layer devices; + # need manual adjust + $create_endpoint "$ns" "$OL1_NET_V4$((3 - id))" \ + "$dev"_nst"$id" 40 "$nested_opt" + ip -n "$ns" addr add dev "$dev"_nst"$id" \ + "$OL2_NET_V4$id/24" + ip -n "$ns" link set dev "$dev"_nst"$id" mtu 1392 + else + ip -n "$ns" addr add dev "veth$id" "$NET_V6$id/64" \ + nodad + $create_endpoint "$ns" "$NET_V6$((3 - id))" \ + "$dev"6"$id" 6 "$options" + ip -n "$ns" addr add dev "$dev"6"$id" \ + "$OL1_NET_V6$id/64" nodad + + $create_endpoint "$ns" "$OL1_NET_V6$((3 - id))" \ + "$dev"6_nst"$id" 60 "$nested_opt" + ip -n "$ns" addr add dev "$dev"6_nst"$id" \ + "$OL2_NET_V6$id/64" nodad + ip -n "$ns" link set dev "$dev"6_nst"$id" mtu 1352 + fi + id=$((id+1)) + done + + # enable GRO heuristic on the veth peer and ensure UDP L4 over tunnel is + # actually segmented + for feature in tso tx-udp_tnl-segmentation; do + ip netns exec "$NS_SRC" ethtool -K "veth$SRC" \ + "$feature" off 2>/dev/null + done +} + +create_ns_gso() { + local dev + + create_ns "$@" + if [ "$ENCAP" = "geneve" 
]; then + dev=gnv + else + dev=vx + fi + [ "$FAMILY" = "6" ] && dev="$dev"6 + ip netns exec "$NS_SRC" ethtool -K "$dev$SRC" \ + tx-gso-partial on \ + tx-udp_tnl-segmentation on \ + tx-udp_tnl-csum-segmentation on +} + +create_ns_gso_gro() { + create_ns_gso "$@" + ip netns exec "$NS_DST" ethtool -K "veth$DST" gro on + ip netns exec "$NS_SRC" ethtool -K "veth$SRC" tx off >/dev/null 2>&1 +} + +run_test() { + local -r dst=$NET$DST + local -r msg=$1 + local -r total_size=$2 + local -r encappkts=$3 + local inner_proto_offset=0 + local inner_maclen=14 + local rx_family="-4" + local ipt=iptables + local bpf_filter + local -a rx_args + local wire_pkts + local rcvpkts + local encl=8 + local dport + local pkts + local snd + + if [ $FAMILY = "6" ]; then + ipt=ip6tables + else + # rx program does not support '-6' and implies ipv6 usage by + # default + rx_args=("$rx_family") + fi + + # The received can only check fixed size packet + pkts=$((total_size / GSO_SIZE)) + if [ -n "$4" ]; then + wire_pkts=$4 + elif [ $((total_size % GSO_SIZE)) -eq 0 ]; then + wire_pkts=1 + rx_args+=("-l" "$GSO_SIZE") + else + wire_pkts=2 + pkts=$((pkts + 1)) + fi + + if [ "$ENCAP" = "geneve" ]; then + dport=6081 + else + dport=4789 + fi + + # Either: + # - IPv4, nested tunnel carries UDP over IPv4, with dport 6082, + # innermost is TCP over IPv4 on port 8000 + # - IPv6, nested tunnel carries UDP over IPv6, with dport 6082, + # innermost is TCP over IPv6 on port 8000 + # The nested tunnel port is 6082 and the nested encap len is 8 + # regardless of the encap type (no geneve opts). + # In inherit protocol mode there is no nested mac hdr and the nested + # l3 protocol type field belongs to the geneve hdr. 
+ [ "$USE_HINT" = true ] && encl=16 + [ "$INHERIT" = true ] && inner_maclen=0 + [ "$INHERIT" = true ] && inner_proto_offset=-4 + local inner=$((inner_maclen+encl)) + local proto=$((inner_maclen+encl+inner_proto_offset)) + bpf_filter=$(nfbpf_compile "(ip && + ip[$((40+encl))] == 0x08 && ip[$((41+encl))] == 0x00 && + ip[$((51+encl))] == 0x11 && + ip[$((64+encl))] == 0x17 && ip[$((65+encl))] == 0xc2 && + ip[$((76+proto))] == 0x08 && ip[$((77+proto))] == 0x00 && + ip[$((87+inner))] == 0x6 && + ip[$((100+inner))] == 0x1f && ip[$((101+inner))] == 0x40) || + (ip6 && + ip6[$((60+encl))] == 0x86 && ip6[$((61+encl))] == 0xdd && + ip6[$((68+encl))] == 0x11 && + ip6[$((104+encl))] == 0x17 && ip6[$((105+encl))] == 0xc2 && + ip6[$((116+proto))] == 0x86 && ip6[$((117+proto))] == 0xdd && + ip6[$((124+inner))] == 0x6 && + ip6[$((160+inner))] == 0x1f && ip6[$((161+inner))] == 0x40)") + + # ignore shorts packet, to avoid arp/mld induced noise + ip netns exec "$NS_SRC" "$ipt" -A OUTPUT -p udp --dport "$dport" \ + -m length --length 600:65535 -m bpf --bytecode "$bpf_filter" + ip netns exec "$NS_DST" "$ipt" -A INPUT -p udp --dport "$dport" \ + -m length --length 600:65535 -m bpf --bytecode "$bpf_filter" + ip netns exec "$NS_DST" ./udpgso_bench_rx -C 2000 -t -R 100 \ + -n "$pkts" "${rx_args[@]}" & + local pid=$! + wait_local_port_listen "$NS_DST" 8000 tcp + ip netns exec "$NS_SRC" ./udpgso_bench_tx -"$FAMILY" -t -M 1 \ + -s "$total_size" -D "$dst" + local ret=$? + check_err "$ret" "client failure exit code $ret" + wait "$pid" + ret=$? + check_err "$ret" "sever failure exit code $ret" + + snd=$(ip netns exec "$NS_SRC" "$ipt"-save -c | + grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//') + + [ "$snd" = "$wire_pkts" ] + # shellcheck disable=SC2319 # known false positive + check_err $? 
"send $snd packets on the lowest link, expected $wire_pkts" + + rcvpkts=$(ip netns exec "$NS_DST" "$ipt"-save -c | \ + grep "dport $dport" | sed -e 's/\[//' -e 's/:.*//') + + [ "$rcvpkts" = "$encappkts" ] + check_err $? "received $rcvpkts $ENCAP packets, expected $encappkts" + log_test "$msg" +} + +run_tests() { + for FAMILY in 4 6; do + NET=$OL2_NET_V4 + GSO_SIZE=1340 # 1392 - 20 - 32 + + if [ $FAMILY = 6 ]; then + NET=$OL2_NET_V6 + GSO_SIZE=1280 # 1352 - 40 - 32 + fi + + echo "IPv$FAMILY" + + unset USE_HINT + unset INHERIT + + # "geneve" must be last encap in list, so that later + # test cases will run on it + for ENCAP in "vxlan" "geneve"; do + create_ns + run_test "No GSO - $ENCAP" $((GSO_SIZE * 4)) 4 4 + cleanup_all_ns + + create_ns_gso + run_test "GSO without GRO - $ENCAP" $((GSO_SIZE * 4)) \ + 4 1 + cleanup_all_ns + + # IPv4 only test + [ $FAMILY = "4" ] || continue + create_ns_gso + ip netns exec "$NS_SRC" \ + sysctl -qw net.ipv4.ip_no_pmtu_disc=1 + run_test "GSO disable due to no fixedid - $ENCAP" \ + $((GSO_SIZE * 4)) 4 4 + cleanup_all_ns + done + + # GRO tests imply/require geneve encap, the only one providing + # GRO hints + create_ns_gso_gro + run_test "double tunnel GRO, no hints" $((GSO_SIZE * 4)) 4 + cleanup_all_ns + + # hint option is expected for all the following tests in the RX + # path + USE_HINT=true + create_ns_gso_gro \ + '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \ + '"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' + run_test "double tunnel GRO" $((GSO_SIZE * 4)) 1 + cleanup_all_ns + + create_ns_gso_gro '"gro-hint":1,"udp-csum":1' '"udp-csum":1' + run_test "double tunnel GRO - csum complete" $((GSO_SIZE * 4))\ + 1 + cleanup_all_ns + + create_ns_gso_gro '"gro-hint":1' \ + '"udp-csum":0,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' + run_test "double tunnel GRO - no nested csum" \ + $((GSO_SIZE * 4)) 1 + cleanup_all_ns + + create_ns_gso_gro \ + '"gro-hint":1,"udp-zero-csum6-tx":1,"udp-zero-csum6-rx":1' \ + '"udp-csum":1' + 
run_test "double tunnel GRO - nested csum, outer 0-csum, skip"\ + $((GSO_SIZE * 4)) 4 + cleanup_all_ns + + INHERIT=true + create_ns_gso_gro '"gro-hint":1,"udp-csum":1' \ + '"udp-csum":1,"inner-proto-inherit":1' + run_test "double tunnel GRO - nested inherit proto" \ + $((GSO_SIZE * 4)) 1 + cleanup_all_ns + unset INHERIT + + create_ns_gso_gro '"gro-hint":1' + run_test "double tunnel GRO - short last pkt" \ + $((GSO_SIZE * 4 + GSO_SIZE / 2)) 2 + cleanup_all_ns + done +} + +require_command nfbpf_compile +require_command jq + +# tcp retransmisions will break the accounting +xfail_on_slow run_tests +exit "$EXIT_STATUS"