diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 72e69a0e1e8c..0748fd87969e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,8 +23,8 @@ struct ctl_table_header; #ifdef CONFIG_CGROUP_BPF -extern struct static_key_false cgroup_bpf_enabled_key; -#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) DECLARE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); @@ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int __user *optlen, int max_optlen, int retval); +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -185,7 +189,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ BPF_CGROUP_INET_INGRESS); \ \ @@ -195,7 +199,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ @@ -207,7 +211,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ @@ -228,7 +232,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(type)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ NULL); \ __ret; \ @@ -237,7 +241,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ t_ctx); \ @@ -252,8 +256,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) -#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ - sk->sk_prot->pre_connect) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ + ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) @@ -297,7 +303,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ BPF_CGROUP_SOCK_OPS); \ @@ -307,7 +313,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ @@ -320,7 +326,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ access, \ BPF_CGROUP_DEVICE); \ @@ -332,7 +338,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ BPF_CGROUP_SYSCTL); \ @@ -343,7 +349,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -354,7 +360,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -363,11 +369,24 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ - optname, optval, \ - optlen, max_optlen, \ - retval); \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ + !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ + tcp_bpf_bypass_getsockopt, \ + level, optname)) \ + __ret = __cgroup_bpf_run_filter_getsockopt( \ + sock, level, optname, optval, optlen, \ + max_optlen, retval); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ + optlen, retval) \ +({ \ + int __ret = retval; \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ + __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ + sock, level, optname, optval, optlen, retval); \ __ret; \ }) @@ -427,7 +446,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; } -#define cgroup_bpf_enabled (0) +#define cgroup_bpf_enabled(type) (0) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) @@ -452,6 +471,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ + optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) ({ 0; }) diff --git a/include/linux/filter.h b/include/linux/filter.h index 7fdce5407214..5b3137d7b690 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1298,6 +1298,11 @@ struct bpf_sysctl_kern { u64 tmp_reg; }; +#define BPF_SOCKOPT_KERN_BUF_SIZE 32 +struct bpf_sockopt_buf { + u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; +}; + struct bpf_sockopt_kern { struct sock *sk; u8 *optval; diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index 54c02c84906a..cfcfef37b2f1 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -60,4 +60,10 @@ #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__) #endif +#if IS_ENABLED(CONFIG_INET) +#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__) +#endif + #endif diff --git a/include/net/sock.h b/include/net/sock.h index 129d200bccb4..7644ea64a376 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1174,6 +1174,8 @@ struct proto { int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); + bool (*bpf_bypass_getsockopt)(int level, + int optname); void (*release_cb)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 78d13c88720f..4bb42fb19711 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); +bool tcp_bpf_bypass_getsockopt(int level, int optname); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 96555a8a2c54..da649f20d6b2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], @@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key); + static_branch_inc(&cgroup_bpf_enabled_key[type]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); return 0; cleanup: @@ -1298,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, return empty; } -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, + struct bpf_sockopt_buf *buf) { if (unlikely(max_optlen < 0)) return -EINVAL; @@ -1310,6 +1311,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) max_optlen = PAGE_SIZE; } + if (max_optlen <= sizeof(buf->data)) { + /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE + * bytes avoid the cost of kzalloc. + */ + ctx->optval = buf->data; + ctx->optval_end = ctx->optval + max_optlen; + return max_optlen; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; @@ -1319,16 +1329,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return max_optlen; } -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) { + if (ctx->optval == buf->data) + return; kfree(ctx->optval); } +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) +{ + return ctx->optval != buf->data; +} + int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, char __user *optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = *level, @@ -1340,8 +1360,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1350,7 +1369,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1390,14 +1409,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ if (ctx.optlen != 0) { *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + /* We've used bpf_sockopt_kern->buf as an intermediary + * storage, but the BPF program indicates that we need + * to pass this data to the kernel setsockopt handler. + * No way to export on-stack buf, have to allocate a + * new buffer. + */ + if (!sockopt_buf_allocated(&ctx, &buf)) { + void *p = kmalloc(ctx.optlen, GFP_USER); + + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(p, ctx.optval, ctx.optlen); + *kernel_optval = p; + } else { + *kernel_optval = ctx.optval; + } /* export and don't free sockopt buf */ return 0; } } out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } @@ -1407,6 +1443,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, @@ -1419,13 +1456,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1483,9 +1519,55 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = ctx.retval; out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } + +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + .optlen = *optlen, + .optval = optval, + .optval_end = optval + *optlen, + }; + int ret; + + /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy + * user data back into BPF buffer when reval != 0. This is + * done as an optimization to avoid extra copy, assuming + * kernel won't populate the data in case of an error. + * Here we always pass the data and memset() should + * be called if that data shouldn't be "exported". + */ + + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + if (!ret) + return -EPERM; + + if (ctx.optlen > *optlen) + return -EFAULT; + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) + return -EFAULT; + + /* BPF programs can shrink the buffer, export the modifications. + */ + if (ctx.optlen != 0) + *optlen = ctx.optlen; + + return ctx.retval; +} #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b94fa8eb831b..6ba2930ff49b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -777,18 +777,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET4_GETPEERNAME, + NULL); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET4_GETPEERNAME : - BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET4_GETSOCKNAME, NULL); + } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 856ae516ac18..26aa923cf522 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4099,6 +4099,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc); + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, + &zc, &len, err); release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, err)) goto zerocopy_rcv_sk_err; @@ -4133,6 +4135,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; } +bool tcp_bpf_bypass_getsockopt(int level, int optname) +{ + /* TCP do_tcp_getsockopt has optimized getsockopt implementation + * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. + */ + if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) + return true; + + return false; +} +EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); + int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 777306b5bc22..62b6fd385a47 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2793,6 +2793,7 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 69ea76578abb..c67e483fce41 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1124,7 +1124,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1858,9 +1858,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, - (struct sockaddr *)sin); + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8e9c3e9ea36e..b9c654836b72 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -527,18 +527,19 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET6_GETPEERNAME, + NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET6_GETPEERNAME : - BPF_CGROUP_INET6_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, NULL); + } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0e1509b02cb3..8539715ff035 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b9f3dfdd2383..a02ac875a923 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -409,9 +409,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } *addr_len = sizeof(*sin6); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, - (struct sockaddr *)sin6); + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, + (struct sockaddr *)sin6); } if (udp_sk(sk)->gro_enabled) @@ -1462,7 +1461,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/net/socket.c b/net/socket.c index 33e8b6c4e1d3..7f0617ab5437 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, return __sys_setsockopt(fd, level, optname, optval, optlen); } +INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, + int optname)); + /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. diff --git a/tools/include/uapi/linux/tcp.h b/tools/include/uapi/linux/tcp.h new file mode 100644 index 000000000000..13ceeb395eb8 --- /dev/null +++ b/tools/include/uapi/linux/tcp.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the TCP protocol. + * + * Version: @(#)tcp.h 1.0.2 04/28/93 + * + * Author: Fred N. van Kempen, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _UAPI_LINUX_TCP_H +#define _UAPI_LINUX_TCP_H + +#include +#include +#include + +struct tcphdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +}; + +/* + * The union cast uses a gcc extension to avoid aliasing problems + * (union is compatible to any of its members) + * This means this part of the code is -fstrict-aliasing safe now. + */ +union tcp_word_hdr { + struct tcphdr hdr; + __be32 words[5]; +}; + +#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) + +enum { + TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), + TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), + TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), + TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000), + TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000), + TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), + TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), + TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), + TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), + TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) +}; + +/* + * TCP general constants + */ +#define TCP_MSS_DEFAULT 536U /* IPv4 (RFC1122, RFC2581) */ +#define TCP_MSS_DESIRED 1220U /* IPv6 (tunneled), EDNS0 (RFC3226) */ + +/* TCP socket options */ +#define TCP_NODELAY 1 /* Turn off Nagle's algorithm. */ +#define TCP_MAXSEG 2 /* Limit MSS */ +#define TCP_CORK 3 /* Never send partially complete segments */ +#define TCP_KEEPIDLE 4 /* Start keeplives after this period */ +#define TCP_KEEPINTVL 5 /* Interval between keepalives */ +#define TCP_KEEPCNT 6 /* Number of keepalives before death */ +#define TCP_SYNCNT 7 /* Number of SYN retransmits */ +#define TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */ +#define TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ +#define TCP_WINDOW_CLAMP 10 /* Bound advertised window */ +#define TCP_INFO 11 /* Information about this connection. */ +#define TCP_QUICKACK 12 /* Block/reenable quick acks */ +#define TCP_CONGESTION 13 /* Congestion control algorithm */ +#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ +#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ +#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ +#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ +#define TCP_REPAIR 19 /* TCP sock is under repair right now */ +#define TCP_REPAIR_QUEUE 20 +#define TCP_QUEUE_SEQ 21 +#define TCP_REPAIR_OPTIONS 22 +#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ +#define TCP_TIMESTAMP 24 +#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ +#define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ +#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ +#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ +#define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */ +#define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ +#define TCP_ULP 31 /* Attach a ULP to a TCP connection */ +#define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ +#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ +#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ +#define TCP_ZEROCOPY_RECEIVE 35 +#define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */ + +#define TCP_CM_INQ TCP_INQ + +#define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */ + + +#define TCP_REPAIR_ON 1 +#define TCP_REPAIR_OFF 0 +#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ + +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; + +struct tcp_repair_window { + __u32 snd_wl1; + __u32 snd_wnd; + __u32 max_window; + + __u32 rcv_wnd; + __u32 rcv_wup; +}; + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; + +/* why fastopen failed from client perspective */ +enum tcp_fastopen_client_fail { + TFO_STATUS_UNSPEC, /* catch-all */ + TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ + TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ + TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ +}; + +/* for TCP_INFO socket option */ +#define TCPI_OPT_TIMESTAMPS 1 +#define TCPI_OPT_SACK 2 +#define TCPI_OPT_WSCALE 4 +#define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ +#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ +#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + +/* + * Sender's congestion state indicating normal or abnormal situations + * in the last round of packets sent. The state is driven by the ACK + * information and timer events. + */ +enum tcp_ca_state { + /* + * Nothing bad has been observed recently. + * No apparent reordering, packet loss, or ECN marks. + */ + TCP_CA_Open = 0, +#define TCPF_CA_Open (1< +#include #include #include "bpf_dctcp.skel.h" #include "bpf_cubic.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 9781d85cb223..e075d03ab630 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -7,6 +7,7 @@ #include #include +#include #include diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 85f73261fab0..b8b48cac2ac3 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare #include +#include #include "test_progs.h" #include "test_skmsg_load_helpers.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index b25c9c45c148..d5b44b135c00 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -2,6 +2,12 @@ #include #include "cgroup_helpers.h" +#include + +#ifndef SOL_TCP +#define SOL_TCP IPPROTO_TCP +#endif + #define SOL_CUSTOM 0xdeadbeef static int getsetsockopt(void) @@ -11,6 +17,7 @@ static int getsetsockopt(void) char u8[4]; __u32 u32; char cc[16]; /* TCP_CA_NAME_MAX */ + struct tcp_zerocopy_receive zc; } buf = {}; socklen_t optlen; char *big_buf = NULL; @@ -154,6 +161,27 @@ static int getsetsockopt(void) goto err; } + /* TCP_ZEROCOPY_RECEIVE triggers */ + memset(&buf, 0, sizeof(buf)); + optlen = sizeof(buf.zc); + err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); + if (err) { + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", + err, errno); + goto err; + } + + memset(&buf, 0, sizeof(buf)); + buf.zc.address = 12345; /* rejected by BPF */ + optlen = sizeof(buf.zc); + errno = 0; + err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); + if (errno != EPERM) { + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", + err, errno); + goto err; + } + free(big_buf); close(fd); return 0; diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index 712df7b49cb1..d3597f81e6e9 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include -#include +#include #include +#include #include char _license[] SEC("license") = "GPL"; @@ -12,6 +12,10 @@ __u32 _version SEC("version") = 1; #define PAGE_SIZE 4096 #endif +#ifndef SOL_TCP +#define SOL_TCP IPPROTO_TCP +#endif + #define SOL_CUSTOM 0xdeadbeef struct sockopt_sk { @@ -57,6 +61,21 @@ int _getsockopt(struct bpf_sockopt *ctx) return 1; } + if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. + * It has a custom implementation for performance + * reasons. + */ + + if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end) + return 0; /* EPERM, bounds check */ + + if (((struct tcp_zerocopy_receive *)optval)->address != 0) + return 0; /* EPERM, unexpected data */ + + return 1; + } + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { if (optval + 1 > optval_end) return 0; /* EPERM, bounds check */ diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index e49e2fdde942..f7c2fd89d01a 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -16,7 +16,6 @@ typedef __u16 __sum16; #include #include #include -#include #include #include #include