bpf: Reject TCP_NODELAY in bpf-tcp-cc

A BPF TCP congestion control program can call bpf_setsockopt() from
its callbacks. In current kernels, if it calls
bpf_setsockopt(TCP_NODELAY) from cwnd_event_tx_start(), the call can
re-enter the TCP transmit path before the outer tcp_transmit_skb()
has completed and advanced the send head.

This can re-trigger CA_EVENT_TX_START and lead to unbounded recursion:

  tcp_transmit_skb()
    -> tcp_event_data_sent()
      -> tcp_ca_event(sk, CA_EVENT_TX_START)
        -> cwnd_event_tx_start()
          -> bpf_setsockopt(TCP_NODELAY)
            -> tcp_push_pending_frames()
              -> tcp_write_xmit()
                -> tcp_transmit_skb()

This leads to unbounded recursion and can overflow the kernel stack.

Reject TCP_NODELAY with -EOPNOTSUPP for bpf-tcp-cc by introducing
a dedicated setsockopt proto for BPF_PROG_TYPE_STRUCT_OPS TCP
congestion control programs. To keep it simple, all tcp-cc ops is
rejected for TCP_NODELAY.

Fixes: 7e41df5dbb ("bpf: Add a few optnames to bpf_setsockopt")
Suggested-by: Martin KaFai Lau <martin.lau@linux.dev>
Signed-off-by: KaFai Wan <kafai.wan@linux.dev>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Link: https://patch.msgid.link/20260421155804.135786-3-kafai.wan@linux.dev
This commit is contained in:
KaFai Wan 2026-04-21 23:58:02 +08:00 committed by Martin KaFai Lau
parent 846c76ecc0
commit 54377fcab5
3 changed files with 26 additions and 1 deletions

View File

@ -3725,6 +3725,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
extern const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto;
extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto;
extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto;
extern const struct bpf_func_proto bpf_find_vma_proto;

View File

@ -5688,6 +5688,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = {
.arg5_type = ARG_CONST_SIZE,
};
BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level,
int, optname, char *, optval, int, optlen)
{
/*
* TCP_NODELAY triggers tcp_push_pending_frames() and re-enters
* CA_EVENT_TX_START in bpf_tcp_cc.
*/
if (level == SOL_TCP && optname == TCP_NODELAY)
return -EOPNOTSUPP;
return _bpf_setsockopt(sk, level, optname, optval, optlen);
}
const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = {
.func = bpf_sk_setsockopt_nodelay,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE,
};
BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
int, optname, char *, optval, int, optlen)
{

View File

@ -168,7 +168,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
*/
if (prog_ops_moff(prog) !=
offsetof(struct tcp_congestion_ops, release))
return &bpf_sk_setsockopt_proto;
return &bpf_sk_setsockopt_nodelay_proto;
return NULL;
case BPF_FUNC_getsockopt:
/* Since get/setsockopt is usually expected to