mirror of
https://github.com/torvalds/linux.git
synced 2026-06-02 03:24:19 +02:00
This change implements Accurate ECN without negotiation and AccECN Option (that will be added by later changes). Based on AccECN specifications: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN allows feeding back the number of CE (congestion experienced) marks accurately to the sender in contrast to RFC3168 ECN that can only signal one marks-seen-yes/no per RTT. Congestion control algorithms can take advantage of the accurate ECN information to fine-tune their congestion response to avoid drastic rate reduction when only mild congestion is encountered. With Accurate ECN, tp->received_ce (r.cep in AccECN spec) keeps track of how many segments have arrived with a CE mark. Accurate ECN uses ACE field (ECE, CWR, AE) to communicate the value back to the sender which updates tp->delivered_ce (s.cep) based on the feedback. This signalling channel is lossy when ACE field overflow occurs. Conservative strategy is selected here to deal with the ACE overflow, however, some strategies using the AccECN option later in the overall patchset mitigate against false overflows detected. The ACE field values on the wire are offset by TCP_ACCECN_CEP_INIT_OFFSET. Delivered_ce/received_ce count the real CE marks rather than forcing all downstream users to adapt to the wire offset. This patch uses the first 1-byte hole and the last 4-byte hole of the tcp_sock_write_txrx for 'received_ce_pending' and 'received_ce'. Also, the group size of tcp_sock_write_txrx is increased from 91 + 4 to 95 + 4 due to the new u32 received_ce member. Below are the trimmed pahole outcomes before and after this patch. [BEFORE THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* XXX 2 bytes hole, try to pack */ [...] 
u32 delivered_ce; /* 2576 4 */ u32 app_limited; /* 2580 4 */ u32 rcv_wnd; /* 2584 4 */ struct tcp_options_received rx_opt; /* 2588 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 161 */ } [AFTER THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] /* size: 3200, cachelines: 50, members: 164 */ } Signed-off-by: Ilpo Järvinen <ij@kernel.org> Co-developed-by: Olivier Tilmans <olivier.tilmans@nokia.com> Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia.com> Co-developed-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Link: https://patch.msgid.link/20250916082434.100722-2-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni <pabeni@redhat.com>
166 lines
4.3 KiB
C
166 lines
4.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
#ifndef _TCP_ECN_H
|
|
#define _TCP_ECN_H
|
|
|
|
#include <linux/tcp.h>
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <net/inet_connection_sock.h>
|
|
#include <net/sock.h>
|
|
#include <net/tcp.h>
|
|
#include <net/inet_ecn.h>
|
|
|
|
static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
|
|
{
|
|
/* Do not set CWR if in AccECN mode! */
|
|
if (tcp_ecn_mode_rfc3168(tp))
|
|
tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
|
|
}
|
|
|
|
static inline void tcp_ecn_accept_cwr(struct sock *sk,
|
|
const struct sk_buff *skb)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
|
|
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
|
|
|
|
/* If the sender is telling us it has entered CWR, then its
|
|
* cwnd may be very low (even just 1 packet), so we should ACK
|
|
* immediately.
|
|
*/
|
|
if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
|
|
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
|
|
}
|
|
}
|
|
|
|
static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
|
|
{
|
|
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
|
|
}
|
|
|
|
static inline u8 tcp_accecn_ace(const struct tcphdr *th)
|
|
{
|
|
return (th->ae << 2) | (th->cwr << 1) | th->ece;
|
|
}
|
|
|
|
static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
|
|
{
|
|
tp->received_ce = 0;
|
|
tp->received_ce_pending = 0;
|
|
}
|
|
|
|
/* Updates Accurate ECN received counters from the received IP ECN field */
|
|
static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb)
|
|
{
|
|
u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
|
|
u8 is_ce = INET_ECN_is_ce(ecnfield);
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
if (!INET_ECN_is_not_ect(ecnfield)) {
|
|
u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
|
|
|
|
/* As for accurate ECN, the TCP_ECN_SEEN flag is set by
|
|
* tcp_ecn_received_counters() when the ECN codepoint of
|
|
* received TCP data or ACK contains ECT(0), ECT(1), or CE.
|
|
*/
|
|
if (!tcp_ecn_mode_rfc3168(tp))
|
|
tp->ecn_flags |= TCP_ECN_SEEN;
|
|
|
|
/* ACE counter tracks *all* segments including pure ACKs */
|
|
tp->received_ce += pcount;
|
|
tp->received_ce_pending = min(tp->received_ce_pending + pcount,
|
|
0xfU);
|
|
}
|
|
}
|
|
|
|
static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp)
|
|
{
|
|
u32 wire_ace;
|
|
|
|
wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
|
|
th->ece = !!(wire_ace & 0x1);
|
|
th->cwr = !!(wire_ace & 0x2);
|
|
th->ae = !!(wire_ace & 0x4);
|
|
tp->received_ce_pending = 0;
|
|
}
|
|
|
|
static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp,
|
|
const struct tcphdr *th)
|
|
{
|
|
if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr))
|
|
tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
|
|
}
|
|
|
|
static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp,
|
|
const struct tcphdr *th)
|
|
{
|
|
if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
|
|
tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
|
|
}
|
|
|
|
static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp,
|
|
const struct tcphdr *th)
|
|
{
|
|
if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
/* Packet ECN state for a SYN-ACK */
|
|
static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
const struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
|
|
if (tcp_ecn_disabled(tp))
|
|
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
|
|
else if (tcp_ca_needs_ecn(sk) ||
|
|
tcp_bpf_ca_needs_ecn(sk))
|
|
INET_ECN_xmit(sk);
|
|
}
|
|
|
|
/* Packet ECN state for a SYN. */
|
|
static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
|
|
bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
|
|
tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
|
|
|
|
if (!use_ecn) {
|
|
const struct dst_entry *dst = __sk_dst_get(sk);
|
|
|
|
if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
|
|
use_ecn = true;
|
|
}
|
|
|
|
tp->ecn_flags = 0;
|
|
|
|
if (use_ecn) {
|
|
if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
|
|
INET_ECN_xmit(sk);
|
|
|
|
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
|
|
tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
|
|
}
|
|
}
|
|
|
|
static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
|
|
/* tp->ecn_flags are cleared at a later point in time when
|
|
* SYN ACK is ultimatively being received.
|
|
*/
|
|
TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
|
|
}
|
|
|
|
static inline void
|
|
tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
|
|
{
|
|
if (inet_rsk(req)->ecn_ok)
|
|
th->ece = 1;
|
|
}
|
|
|
|
#endif /* _TCP_ECN_H */
|