From 743df8b7749fb5a289fc0c7ac94ec15533596839 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:09:07 -0700 Subject: [PATCH 1/4] bpf, sockmap: Check skb_verdict and skb_parser programs explicitly We are about to allow skb_verdict to run without skb_parser programs as a first step change code to check each program type specifically. This should be a mechanical change without any impact to actual result. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239294756.8495.5796595770890272219.stgit@john-Precision-5820-Tower --- net/core/sock_map.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e83a80e8f13b..a2ed5b6223b9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -230,16 +230,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, { struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; struct sk_psock *psock; - bool skb_progs; int ret; skb_verdict = READ_ONCE(progs->skb_verdict); skb_parser = READ_ONCE(progs->skb_parser); - skb_progs = skb_parser && skb_verdict; - if (skb_progs) { + if (skb_verdict) { skb_verdict = bpf_prog_inc_not_zero(skb_verdict); if (IS_ERR(skb_verdict)) return PTR_ERR(skb_verdict); + } + if (skb_parser) { skb_parser = bpf_prog_inc_not_zero(skb_parser); if (IS_ERR(skb_parser)) { bpf_prog_put(skb_verdict); @@ -264,7 +264,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_progs && READ_ONCE(psock->progs.skb_parser))) { + (skb_parser && READ_ONCE(psock->progs.skb_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -285,7 +286,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_progs && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock); if (ret) { write_unlock_bh(&sk->sk_callback_lock); @@ -303,10 +304,10 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (msg_parser) bpf_prog_put(msg_parser); out: - if (skb_progs) { + if (skb_verdict) bpf_prog_put(skb_verdict); + if (skb_parser) bpf_prog_put(skb_parser); - } return ret; } From ef5659280eb13e8ac31c296f58cfdfa1684ac06b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:09:38 -0700 Subject: [PATCH 2/4] bpf, sockmap: Allow skipping sk_skb parser program Currently, we often run with a nop parser namely one that just does this, 'return skb->len'. This happens when either our verdict program can handle streaming data or it is only looking at socket data such as IP addresses and other metadata associated with the flow. The second case is common for a L3/L4 proxy for instance. So lets allow loading programs without the parser then we can skip the stream parser logic and avoid having to add a BPF program that is effectively a nop. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239297866.8495.13345662302749219672.stgit@john-Precision-5820-Tower --- include/linux/skmsg.h | 2 ++ net/core/skmsg.c | 78 +++++++++++++++++++++++++++++++++++++++++++ net/core/sock_map.c | 22 ++++++++---- 3 files changed, 95 insertions(+), 7 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 3119928fc103..fec0c5ac1c4f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -308,6 +308,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node); int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 881a5b290946..654182ecf87b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -627,6 +627,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.skb_parser) sk_psock_stop_strp(sk, psock); + else if (psock->progs.skb_verdict) + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -871,6 +873,57 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t orig_len) +{ + struct sock *sk = (struct sock *)desc->arg.data; + struct sk_psock *psock; + struct bpf_prog *prog; + int ret = __SK_DROP; + int len = skb->len; + + /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */ + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) { + desc->error = -ENOMEM; + return 0; + } + + rcu_read_lock(); + psock = sk_psock(sk); + if (unlikely(!psock)) { + len = 0; + kfree_skb(skb); + goto out; + } + skb_set_owner_r(skb, sk); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + sk_psock_verdict_apply(psock, skb, ret); +out: + rcu_read_unlock(); + return len; +} + +static void sk_psock_verdict_data_ready(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + read_descriptor_t desc; + + if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + return; + + desc.arg.data = sk; + desc.error = 0; + desc.count = 1; + + sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); +} + static void sk_psock_write_space(struct sock *sk) { struct sk_psock *psock; @@ -900,6 +953,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) return strp_init(&psock->parser.strp, sk, &cb); } +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_verdict_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -925,3 +991,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) strp_stop(&parser->strp); parser->enabled = false; } + +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + parser->enabled = false; +} diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a2ed5b6223b9..df09c39a4dd2 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,8 +148,8 @@ static void sock_map_add_link(struct sk_psock *psock, static void sock_map_del_link(struct sock *sk, struct sk_psock *psock, void *link_raw) { + bool strp_stop = false, verdict_stop = false; struct sk_psock_link *link, *tmp; - bool strp_stop = false; spin_lock_bh(&psock->link_lock); list_for_each_entry_safe(link, tmp, &psock->link, list) { @@ -159,14 +159,19 @@ static void sock_map_del_link(struct sock *sk, map); if (psock->parser.enabled && stab->progs.skb_parser) strp_stop = true; + if (psock->parser.enabled && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } } spin_unlock_bh(&psock->link_lock); - if (strp_stop) { + if (strp_stop || verdict_stop) { write_lock_bh(&sk->sk_callback_lock); - sk_psock_stop_strp(sk, psock); + if (strp_stop) + sk_psock_stop_strp(sk, psock); + else + sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); } } @@ -288,16 +293,19 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, write_lock_bh(&sk->sk_callback_lock); if (skb_parser && skb_verdict && !psock->parser.enabled) { ret = sk_psock_init_strp(sk, psock); - if (ret) { - write_unlock_bh(&sk->sk_callback_lock); - goto out_drop; - } + if (ret) + goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); + } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; +out_unlock_drop: + write_unlock_bh(&sk->sk_callback_lock); out_drop: sk_psock_put(sk, psock); out_progs: From cdf43c4bfa1a465ba7996165b75c49b5c0e46b50 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:10:04 -0700 Subject: [PATCH 3/4] bpf, selftests: Add option to test_sockmap to omit adding parser program Add option to allow running without a parser program in place. To test with ping/pong program use, # test_sockmap -t ping --txmsg_omit_skb_parser this will send packets between two socket bouncing through a proxy socket that does not use a parser program. (ping) (pong) sender proxy_recv proxy_send recv | | | | verdict -----+ | | | | | +----------------+ +------------+ Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239300387.8495.11908295143121563076.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 35 +++++++++++++--------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 5cf45455de42..419c0b010d14 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -86,6 +86,7 @@ int txmsg_ktls_skb_redir; int ktls; int peek_flag; int skb_use_parser; +int txmsg_omit_skb_parser; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -111,6 +112,7 @@ static const struct option long_options[] = { {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, {"peek", no_argument, &peek_flag, 1 }, + {"txmsg_omit_skb_parser", no_argument, &txmsg_omit_skb_parser, 1}, {"whitelist", required_argument, NULL, 'n' }, {"blacklist", required_argument, NULL, 'b' }, {0, 0, NULL, 0 } @@ -175,6 +177,7 @@ static void test_reset(void) txmsg_apply = txmsg_cork = 0; txmsg_ingress = txmsg_redir_skb = 0; txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0; + txmsg_omit_skb_parser = 0; skb_use_parser = 0; } @@ -912,13 +915,15 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) goto run; /* Attach programs to sockmap */ - err = bpf_prog_attach(prog_fd[0], map_fd[0], - BPF_SK_SKB_STREAM_PARSER, 0); - if (err) { - fprintf(stderr, - "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", - prog_fd[0], map_fd[0], err, strerror(errno)); - return err; + if (!txmsg_omit_skb_parser) { + err = bpf_prog_attach(prog_fd[0], map_fd[0], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[0], err, strerror(errno)); + return err; + } } err = bpf_prog_attach(prog_fd[1], map_fd[0], @@ -931,13 +936,15 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) /* Attach programs to TLS sockmap */ if (txmsg_ktls_skb) { - err = bpf_prog_attach(prog_fd[0], map_fd[8], - BPF_SK_SKB_STREAM_PARSER, 0); - if (err) { - fprintf(stderr, - "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", - prog_fd[0], map_fd[8], err, strerror(errno)); - return err; + if (!txmsg_omit_skb_parser) { + err = bpf_prog_attach(prog_fd[0], map_fd[8], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", + prog_fd[0], map_fd[8], err, strerror(errno)); + return err; + } } err = bpf_prog_attach(prog_fd[2], map_fd[8], From a24fb420a5775581050026eba2264c87927debf7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:10:26 -0700 Subject: [PATCH 4/4] bpf, selftests: Add three new sockmap tests for verdict only programs Here we add three new tests for sockmap to test having a verdict program without setting the parser program. The first test covers the most simply case, sender proxy_recv proxy_send recv | | | | verdict -----+ | | | | | +----------------+ +------------+ We load the verdict program on the proxy_recv socket without a parser program. It then does a redirect into the send path of the proxy_send socket using sendpage_locked(). Next we test the drop case to ensure if we kfree_skb as a result of the verdict program everything behaves as expected. Next we test the same configuration above, but with ktls and a redirect into socket ingress queue. Shown here tls tls sender proxy_recv proxy_send recv | | | | verdict ------------------+ | | redirect_ingress +----------------+ Also to set up ping/pong test Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239302638.8495.17125996694402793471.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/test_sockmap.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 419c0b010d14..0fa1e421c3d7 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1472,12 +1472,29 @@ static void test_txmsg_skb(int cgrp, struct sockmap_options *opt) txmsg_ktls_skb_drop = 0; txmsg_ktls_skb_redir = 1; test_exec(cgrp, opt); + txmsg_ktls_skb_redir = 0; + + /* Tests that omit skb_parser */ + txmsg_omit_skb_parser = 1; + ktls = 0; + txmsg_ktls_skb = 0; + test_exec(cgrp, opt); + + txmsg_ktls_skb_drop = 1; + test_exec(cgrp, opt); + txmsg_ktls_skb_drop = 0; + + txmsg_ktls_skb_redir = 1; + test_exec(cgrp, opt); + + ktls = 1; + test_exec(cgrp, opt); + txmsg_omit_skb_parser = 0; opt->data_test = data; ktls = k; } - /* Test cork with hung data. This tests poor usage patterns where * cork can leave data on the ring if user program is buggy and * doesn't flush them somehow. They do take some time however