mirror of
https://github.com/torvalds/linux.git
synced 2026-05-23 14:42:08 +02:00
Merge branch 'make-time-wait-reuse-delay-deterministic-and-configurable'
Jakub Sitnicki says: ==================== Make TIME-WAIT reuse delay deterministic and configurable This patch set is an effort to enable faster reuse of TIME-WAIT sockets. We have recently talked about the motivation and the idea at Plumbers [1]. Experiment in production ------------------------ We are restarting our experiment on a small set of production nodes as the code has slightly changed since v1 [2], and there are still a few weeks of development window to soak the changes. We will report back if we observe any regressions. Packetdrill tests ----------------- The packetdrill tests for TIME-WAIT reuse [3] did not change since v1. Although we are not touching PAWS code any more, I would still like to add tests to cover PAWS reject after TW reuse. This, however, requires patching packetdrill as I mentioned in the last cover letter [2]. [1] https://lpc.events/event/18/contributions/1962/ [2] https://lore.kernel.org/r/20241113-jakub-krn-909-poc-msec-tw-tstamp-v2-0-b0a335247304@cloudflare.com [3] https://github.com/google/packetdrill/pull/90 v1: https://lore.kernel.org/20241204-jakub-krn-909-poc-msec-tw-tstamp-v1-0-8b54467a0f34@cloudflare.com RFCv2: https://lore.kernel.org/20241113-jakub-krn-909-poc-msec-tw-tstamp-v2-0-b0a335247304@cloudflare.com RFCv1: https://lore.kernel.org/20240819-jakub-krn-909-poc-msec-tw-tstamp-v1-1-6567b5006fbe@cloudflare.com ==================== Link: https://patch.msgid.link/20241209-jakub-krn-909-poc-msec-tw-tstamp-v2-0-66aca0eed03e@cloudflare.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
154dee7c32
|
|
@ -1000,6 +1000,20 @@ tcp_tw_reuse - INTEGER
|
|||
|
||||
Default: 2
|
||||
|
||||
tcp_tw_reuse_delay - UNSIGNED INTEGER
|
||||
The delay in milliseconds before a TIME-WAIT socket can be reused by a
|
||||
new connection, if TIME-WAIT socket reuse is enabled. The actual reuse
|
||||
threshold is within [N, N+1] range, where N is the requested delay in
|
||||
milliseconds, to ensure the delay interval is never shorter than the
|
||||
configured value.
|
||||
|
||||
This setting contains an assumption about the peer's TCP timestamp clock
|
||||
tick interval. It should not be set to a value lower than the peer's
|
||||
clock tick for PAWS (Protection Against Wrapped Sequence numbers)
|
||||
mechanism to work correctly for the reused connection.
|
||||
|
||||
Default: 1000 (milliseconds)
|
||||
|
||||
tcp_window_scaling - BOOLEAN
|
||||
Enable window scaling as defined in RFC1323.
|
||||
|
||||
|
|
|
|||
|
|
@ -79,6 +79,7 @@ u8 sysctl_tcp_retries1
|
|||
u8 sysctl_tcp_retries2
|
||||
u8 sysctl_tcp_orphan_retries
|
||||
u8 sysctl_tcp_tw_reuse timewait_sock_ops
|
||||
unsigned_int sysctl_tcp_tw_reuse_delay timewait_sock_ops
|
||||
int sysctl_tcp_fin_timeout TCP_LAST_ACK/tcp_rcv_state_process
|
||||
unsigned_int sysctl_tcp_notsent_lowat read_mostly tcp_notsent_lowat/tcp_stream_memory_free
|
||||
u8 sysctl_tcp_sack tcp_syn_options
|
||||
|
|
|
|||
|
|
@ -74,6 +74,10 @@ struct inet_timewait_sock {
|
|||
tw_tos : 8;
|
||||
u32 tw_txhash;
|
||||
u32 tw_priority;
|
||||
/**
|
||||
* @tw_entry_stamp: Time of entry into %TCP_TIME_WAIT state in msec.
|
||||
*/
|
||||
u32 tw_entry_stamp;
|
||||
struct timer_list tw_timer;
|
||||
struct inet_bind_bucket *tw_tb;
|
||||
struct inet_bind2_bucket *tw_tb2;
|
||||
|
|
|
|||
|
|
@ -175,6 +175,7 @@ struct netns_ipv4 {
|
|||
u8 sysctl_tcp_retries2;
|
||||
u8 sysctl_tcp_orphan_retries;
|
||||
u8 sysctl_tcp_tw_reuse;
|
||||
unsigned int sysctl_tcp_tw_reuse_delay;
|
||||
int sysctl_tcp_fin_timeout;
|
||||
u8 sysctl_tcp_sack;
|
||||
u8 sysctl_tcp_window_scaling;
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
|
|||
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
|
||||
static int tcp_plb_max_rounds = 31;
|
||||
static int tcp_plb_max_cong_thresh = 256;
|
||||
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
|
||||
|
||||
/* obsolete */
|
||||
static int sysctl_tcp_low_latency __read_mostly;
|
||||
|
|
@ -1065,6 +1066,15 @@ static struct ctl_table ipv4_net_table[] = {
|
|||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_TWO,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_tw_reuse_delay",
|
||||
.data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_douintvec_minmax,
|
||||
.extra1 = SYSCTL_ONE,
|
||||
.extra2 = &tcp_tw_reuse_delay_max,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_max_syn_backlog",
|
||||
.data = &init_net.ipv4.sysctl_max_syn_backlog,
|
||||
|
|
|
|||
|
|
@ -120,6 +120,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
|
|||
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
int ts_recent_stamp;
|
||||
u32 reuse_thresh;
|
||||
|
||||
if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
|
||||
reuse = 0;
|
||||
|
|
@ -162,9 +163,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
|
|||
and use initial timestamp retrieved from peer table.
|
||||
*/
|
||||
ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
|
||||
reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
|
||||
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
|
||||
if (ts_recent_stamp &&
|
||||
(!twp || (reuse && time_after32(ktime_get_seconds(),
|
||||
ts_recent_stamp)))) {
|
||||
(!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
|
||||
/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
|
||||
* and releasing the bucket lock.
|
||||
*/
|
||||
|
|
@ -3457,6 +3459,7 @@ static int __net_init tcp_sk_init(struct net *net)
|
|||
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
|
||||
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
|
||||
net->ipv4.sysctl_tcp_tw_reuse = 2;
|
||||
net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
|
||||
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
|
||||
|
||||
refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
|
||||
|
|
|
|||
|
|
@ -157,8 +157,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
|
|||
rcv_nxt);
|
||||
|
||||
if (tmp_opt.saw_tstamp) {
|
||||
u64 ts = tcp_clock_ms();
|
||||
|
||||
WRITE_ONCE(tw->tw_entry_stamp, ts);
|
||||
WRITE_ONCE(tcptw->tw_ts_recent_stamp,
|
||||
ktime_get_seconds());
|
||||
div_u64(ts, MSEC_PER_SEC));
|
||||
WRITE_ONCE(tcptw->tw_ts_recent,
|
||||
tmp_opt.rcv_tsval);
|
||||
}
|
||||
|
|
@ -316,6 +319,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
|
|||
tw->tw_mark = sk->sk_mark;
|
||||
tw->tw_priority = READ_ONCE(sk->sk_priority);
|
||||
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
|
||||
/* refreshed when we enter true TIME-WAIT state */
|
||||
tw->tw_entry_stamp = tcp_time_stamp_ms(tp);
|
||||
tcptw->tw_rcv_nxt = tp->rcv_nxt;
|
||||
tcptw->tw_snd_nxt = tp->snd_nxt;
|
||||
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user