Merge branch 'make-time-wait-reuse-delay-deterministic-and-configurable'

Jakub Sitnicki says:

====================
Make TIME-WAIT reuse delay deterministic and configurable

This patch set is an effort to enable faster reuse of TIME-WAIT sockets.
We have recently talked about the motivation and the idea at Plumbers [1].

Experiment in production
------------------------

We are restarting our experiment on a small set of production nodes as the
code has slightly changed since v1 [2], and there are still a few weeks of
development window to soak the changes. We will report back if we observe
any regressions.

Packetdrill tests
-----------------

The packetdrill tests for TIME-WAIT reuse [3] did not change since v1.
Although we are not touching PAWS code any more, I would still like to add
tests to cover PAWS reject after TW reuse. This, however, requires patching
packetdrill as I mentioned in the last cover letter [2].

[1] https://lpc.events/event/18/contributions/1962/
[2] https://lore.kernel.org/r/20241113-jakub-krn-909-poc-msec-tw-tstamp-v2-0-b0a335247304@cloudflare.com
[3] https://github.com/google/packetdrill/pull/90

v1: https://lore.kernel.org/20241204-jakub-krn-909-poc-msec-tw-tstamp-v1-0-8b54467a0f34@cloudflare.com
RFCv2: https://lore.kernel.org/20241113-jakub-krn-909-poc-msec-tw-tstamp-v2-0-b0a335247304@cloudflare.com
RFCv1: https://lore.kernel.org/20240819-jakub-krn-909-poc-msec-tw-tstamp-v1-1-6567b5006fbe@cloudflare.com
====================

Link: https://patch.msgid.link/20241209-jakub-krn-909-poc-msec-tw-tstamp-v2-0-66aca0eed03e@cloudflare.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2024-12-11 20:17:38 -08:00
commit 154dee7c32
7 changed files with 41 additions and 3 deletions

View File

@ -1000,6 +1000,20 @@ tcp_tw_reuse - INTEGER
Default: 2
tcp_tw_reuse_delay - UNSIGNED INTEGER
The delay in milliseconds before a TIME-WAIT socket can be reused by a
new connection, if TIME-WAIT socket reuse is enabled. The actual reuse
threshold is within [N, N+1] range, where N is the requested delay in
milliseconds, to ensure the delay interval is never shorter than the
configured value.
This setting contains an assumption about the other TCP timestamp clock
tick interval. It should not be set to a value lower than the peer's
clock tick for PAWS (Protection Against Wrapped Sequence numbers)
mechanism work correctly for the reused connection.
Default: 1000 (milliseconds)
tcp_window_scaling - BOOLEAN
Enable window scaling as defined in RFC1323.

View File

@ -79,6 +79,7 @@ u8 sysctl_tcp_retries1
u8 sysctl_tcp_retries2
u8 sysctl_tcp_orphan_retries
u8 sysctl_tcp_tw_reuse timewait_sock_ops
unsigned_int sysctl_tcp_tw_reuse_delay timewait_sock_ops
int sysctl_tcp_fin_timeout TCP_LAST_ACK/tcp_rcv_state_process
unsigned_int sysctl_tcp_notsent_lowat read_mostly tcp_notsent_lowat/tcp_stream_memory_free
u8 sysctl_tcp_sack tcp_syn_options

View File

@ -74,6 +74,10 @@ struct inet_timewait_sock {
tw_tos : 8;
u32 tw_txhash;
u32 tw_priority;
/**
* @tw_reuse_stamp: Time of entry into %TCP_TIME_WAIT state in msec.
*/
u32 tw_entry_stamp;
struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb;
struct inet_bind2_bucket *tw_tb2;

View File

@ -175,6 +175,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_retries2;
u8 sysctl_tcp_orphan_retries;
u8 sysctl_tcp_tw_reuse;
unsigned int sysctl_tcp_tw_reuse_delay;
int sysctl_tcp_fin_timeout;
u8 sysctl_tcp_sack;
u8 sysctl_tcp_window_scaling;

View File

@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@ -1065,6 +1066,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
{
.procname = "tcp_tw_reuse_delay",
.data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
.extra1 = SYSCTL_ONE,
.extra2 = &tcp_tw_reuse_delay_max,
},
{
.procname = "tcp_max_syn_backlog",
.data = &init_net.ipv4.sysctl_max_syn_backlog,

View File

@ -120,6 +120,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
int ts_recent_stamp;
u32 reuse_thresh;
if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
reuse = 0;
@ -162,9 +163,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
if (ts_recent_stamp &&
(!twp || (reuse && time_after32(ktime_get_seconds(),
ts_recent_stamp)))) {
(!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
* and releasing the bucket lock.
*/
@ -3457,6 +3459,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 2;
net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);

View File

@ -157,8 +157,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
rcv_nxt);
if (tmp_opt.saw_tstamp) {
u64 ts = tcp_clock_ms();
WRITE_ONCE(tw->tw_entry_stamp, ts);
WRITE_ONCE(tcptw->tw_ts_recent_stamp,
ktime_get_seconds());
div_u64(ts, MSEC_PER_SEC));
WRITE_ONCE(tcptw->tw_ts_recent,
tmp_opt.rcv_tsval);
}
@ -316,6 +319,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_mark = sk->sk_mark;
tw->tw_priority = READ_ONCE(sk->sk_priority);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
/* refreshed when we enter true TIME-WAIT state */
tw->tw_entry_stamp = tcp_time_stamp_ms(tp);
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
tcptw->tw_rcv_wnd = tcp_receive_window(tp);