diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ff3e00ff84d2..2cc58fc0f413 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -552,6 +552,56 @@ _out: \ _ret; \ }) +/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs + * so BPF programs can request cwr for TCP packets. + * + * Current cgroup skb programs can only return 0 or 1 (0 to drop the + * packet. This macro changes the behavior so the low order bit + * indicates whether the packet should be dropped (0) or not (1) + * and the next bit is a congestion notification bit. This could be + * used by TCP to call tcp_enter_cwr() + * + * Hence, new allowed return values of CGROUP EGRESS BPF programs are: + * 0: drop packet + * 1: keep packet + * 2: drop packet and cn + * 3: keep packet and cn + * + * This macro then converts it to one of the NET_XMIT or an error + * code that is then interpreted as drop packet (and no cn): + * 0: NET_XMIT_SUCCESS skb should be transmitted + * 1: NET_XMIT_DROP skb should be dropped and cn + * 2: NET_XMIT_CN skb should be transmitted and cn + * 3: -EPERM skb should be dropped + */ +#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ + ({ \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ + struct bpf_prog_array *_array; \ + u32 ret; \ + u32 _ret = 1; \ + u32 _cn = 0; \ + preempt_disable(); \ + rcu_read_lock(); \ + _array = rcu_dereference(array); \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + ret = func(_prog, ctx); \ + _ret &= (ret & 1); \ + _cn |= (ret & 2); \ + _item++; \ + } \ + rcu_read_unlock(); \ + preempt_enable(); \ + if (_ret) \ + _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ + else \ + _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ + _ret; \ + }) + #define BPF_PROG_RUN_ARRAY(array, ctx, func) \ __BPF_PROG_RUN_ARRAY(array, ctx, func, false) diff --git a/include/linux/filter.h b/include/linux/filter.h index ba8b65270e0d..43b45d6db36d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -526,7 +526,8 @@ struct bpf_prog { blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ - has_callchain_buf:1; /* callchain buffer allocated? */ + has_callchain_buf:1, /* callchain buffer allocated? */ + enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ff594eb86fd7..1b65ab0df457 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -587,8 +587,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * - * This function will return %-EPERM if any if an attached program was found - * and if it returned != 1 during execution. In all other cases, 0 is returned. + * For egress packets, this function can return: + * NET_XMIT_SUCCESS (0) - continue with packet output + * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr + * NET_XMIT_CN (2) - continue with packet output and notify TCP + * to call cwr + * -EPERM - drop packet + * + * For ingress packets, this function will return -EPERM if any + * attached program was found and if it returned != 1 during execution. + * Otherwise 0 is returned. */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -614,12 +622,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + if (type == BPF_CGROUP_INET_EGRESS) { + ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( + cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + } else { + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); + ret = (ret == 1 ? 0 : -EPERM); + } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; - return ret == 1 ? 0 : -EPERM; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3d546b6f4646..1539774d78c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1585,6 +1585,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, default: return -EINVAL; } + case BPF_PROG_TYPE_CGROUP_SKB: + switch (expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + return 0; + default: + return -EINVAL; + } default: return 0; } @@ -1836,6 +1844,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? + -EINVAL : 0; default: return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2778417e6e0c..5c2cb5bd84ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5508,11 +5508,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_return_code(struct bpf_verifier_env *env) { + struct tnum enforce_attach_type_range = tnum_unknown; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { + range = tnum_range(0, 3); + enforce_attach_type_range = tnum_range(2, 3); + } case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: @@ -5531,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been %s\n", tn_buf); return -EINVAL; } + + if (!tnum_is_unknown(enforce_attach_type_range) && + tnum_in(enforce_attach_type_range, reg->var_off)) + env->prog->enforce_expected_attach_type = 1; return 0; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index bfd0ca554977..1217a53381c2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, return ret; } -static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ @@ -315,18 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return ip_finish_output2(net, sk, skb); } +static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { + switch (ret) { + case NET_XMIT_SUCCESS: + return dev_loopback_xmit(net, sk, skb); + case NET_XMIT_CN: + return dev_loopback_xmit(net, sk, skb) ? : ret; + default: kfree_skb(skb); return ret; } - - return dev_loopback_xmit(net, sk, skb); } int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index adef2236abe2..a75bc21d8c88 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -128,16 +128,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } -static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } - #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -154,6 +146,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ip6_finish_output2(net, sk, skb); } +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip6_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip6_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh index 56c8b4115c95..e48b047d4646 100755 --- a/samples/bpf/do_hbm_test.sh +++ b/samples/bpf/do_hbm_test.sh @@ -13,10 +13,10 @@ Usage() { echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create" echo "loads. The output is the goodput in Mbps (unless -D was used)." echo "" - echo "USAGE: $name [out] [-b=|--bpf=] [-c=|--cc=] [-D]" - echo " [-d=|--delay=] [--debug] [-E]" + echo "USAGE: $name [out] [-b=|--bpf=] [-c=|--cc=]" + echo " [-D] [-d=|--delay=] [--debug] [-E]" echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=|--id=]" - echo " [-l] [-N] [-p=|--port=] [-P]" + echo " [-l] [-N] [--no_cn] [-p=|--port=] [-P]" echo " [-q=] [-R] [-s=|--server=|--time=