mirror of
https://github.com/torvalds/linux.git
synced 2026-05-30 18:13:41 +02:00
Merge branch 'mptcp-autotune-related-improvement'
Matthieu Baerts says:
====================
mptcp: autotune related improvement
Here are two patches from Paolo that were crafted a couple of
months ago, but needed more validation because they were indirectly
causing instabilities in the selftests. The root cause has been fixed in
'net' recently in commit 8c09412e58 ("selftests: mptcp: more stable
simult_flows tests").
These patches refactor the receive space and RTT estimator, overall
making DRS more correct while avoiding receive buffer drifting to
tcp_rmem[2], which in turn makes the throughput more stable and less
bursty, especially with high bandwidth and low delay environments.
Note that the first patch addresses a very old issue. 'net-next' is
targeted because the change is quite invasive and based on a recent
backlog refactor. The 'Fixes' tag is then there more as an FYI, because
backporting this patch will quickly be blocked due to large conflicts.
====================
Link: https://patch.msgid.link/20260407-net-next-mptcp-reduce-rbuf-v2-0-0d1d135bf6f6@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
3723c3b656
|
|
@ -219,7 +219,7 @@ TRACE_EVENT(mptcp_rcvbuf_grow,
|
|||
__be32 *p32;
|
||||
|
||||
__entry->time = time;
|
||||
__entry->rtt_us = msk->rcvq_space.rtt_us >> 3;
|
||||
__entry->rtt_us = mptcp_rtt_us_est(msk) >> 3;
|
||||
__entry->copied = msk->rcvq_space.copied;
|
||||
__entry->inq = mptcp_inq_hint(sk);
|
||||
__entry->space = msk->rcvq_space.space;
|
||||
|
|
|
|||
|
|
@ -879,6 +879,32 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
|
|||
return moved;
|
||||
}
|
||||
|
||||
static void mptcp_rcv_rtt_update(struct mptcp_sock *msk,
|
||||
struct mptcp_subflow_context *subflow)
|
||||
{
|
||||
const struct tcp_sock *tp = tcp_sk(subflow->tcp_sock);
|
||||
u32 rtt_us = tp->rcv_rtt_est.rtt_us;
|
||||
int id;
|
||||
|
||||
/* Update once per subflow per rcvwnd to avoid touching the msk
|
||||
* too often.
|
||||
*/
|
||||
if (!rtt_us || tp->rcv_rtt_est.seq == subflow->prev_rtt_seq)
|
||||
return;
|
||||
|
||||
subflow->prev_rtt_seq = tp->rcv_rtt_est.seq;
|
||||
|
||||
/* Pairs with READ_ONCE() in mptcp_rtt_us_est(). */
|
||||
id = msk->rcv_rtt_est.next_sample;
|
||||
WRITE_ONCE(msk->rcv_rtt_est.samples[id], rtt_us);
|
||||
if (++msk->rcv_rtt_est.next_sample == MPTCP_RTT_SAMPLES)
|
||||
msk->rcv_rtt_est.next_sample = 0;
|
||||
|
||||
/* EWMA among the incoming subflows */
|
||||
msk->scaling_ratio = ((msk->scaling_ratio << 3) - msk->scaling_ratio +
|
||||
tp->scaling_ratio) >> 3;
|
||||
}
|
||||
|
||||
void mptcp_data_ready(struct sock *sk, struct sock *ssk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
|
||||
|
|
@ -892,6 +918,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
|
|||
return;
|
||||
|
||||
mptcp_data_lock(sk);
|
||||
mptcp_rcv_rtt_update(msk, subflow);
|
||||
if (!sock_owned_by_user(sk)) {
|
||||
/* Wake-up the reader only for in-sequence data */
|
||||
if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
|
||||
|
|
@ -2095,7 +2122,6 @@ static void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
|
|||
|
||||
msk->rcvspace_init = 1;
|
||||
msk->rcvq_space.copied = 0;
|
||||
msk->rcvq_space.rtt_us = 0;
|
||||
|
||||
/* initial rcv_space offering made to peer */
|
||||
msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
|
||||
|
|
@ -2106,15 +2132,15 @@ static void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
|
|||
|
||||
/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
|
||||
*
|
||||
* Only difference: Use highest rtt estimate of the subflows in use.
|
||||
* Only difference: Use lowest rtt estimate of the subflows in use, see
|
||||
* mptcp_rcv_rtt_update() and mptcp_rtt_us_est().
|
||||
*/
|
||||
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow;
|
||||
struct sock *sk = (struct sock *)msk;
|
||||
u8 scaling_ratio = U8_MAX;
|
||||
u32 time, advmss = 1;
|
||||
u64 rtt_us, mstamp;
|
||||
u32 time, rtt_us;
|
||||
u64 mstamp;
|
||||
|
||||
msk_owned_by_me(msk);
|
||||
|
||||
|
|
@ -2129,36 +2155,17 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
|
|||
mstamp = mptcp_stamp();
|
||||
time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time));
|
||||
|
||||
rtt_us = msk->rcvq_space.rtt_us;
|
||||
if (rtt_us && time < (rtt_us >> 3))
|
||||
rtt_us = mptcp_rtt_us_est(msk);
|
||||
if (rtt_us == U32_MAX || time < (rtt_us >> 3))
|
||||
return;
|
||||
|
||||
rtt_us = 0;
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
const struct tcp_sock *tp;
|
||||
u64 sf_rtt_us;
|
||||
u32 sf_advmss;
|
||||
|
||||
tp = tcp_sk(mptcp_subflow_tcp_sock(subflow));
|
||||
|
||||
sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us);
|
||||
sf_advmss = READ_ONCE(tp->advmss);
|
||||
|
||||
rtt_us = max(sf_rtt_us, rtt_us);
|
||||
advmss = max(sf_advmss, advmss);
|
||||
scaling_ratio = min(tp->scaling_ratio, scaling_ratio);
|
||||
}
|
||||
|
||||
msk->rcvq_space.rtt_us = rtt_us;
|
||||
msk->scaling_ratio = scaling_ratio;
|
||||
if (time < (rtt_us >> 3) || rtt_us == 0)
|
||||
return;
|
||||
|
||||
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
|
||||
copied = msk->rcvq_space.copied;
|
||||
copied -= mptcp_inq_hint(sk);
|
||||
if (copied <= msk->rcvq_space.space)
|
||||
goto new_measure;
|
||||
|
||||
trace_mptcp_rcvbuf_grow(sk, time);
|
||||
if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
|
||||
if (mptcp_rcvbuf_grow(sk, copied)) {
|
||||
/* Make subflows follow along. If we do not do this, we
|
||||
* get drops at subflow level if skbs can't be moved to
|
||||
* the mptcp rx queue fast enough (announced rcv_win can
|
||||
|
|
@ -2172,7 +2179,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
|
|||
slow = lock_sock_fast(ssk);
|
||||
/* subflows can be added before tcp_init_transfer() */
|
||||
if (tcp_sk(ssk)->rcvq_space.space)
|
||||
tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied);
|
||||
tcp_rcvbuf_grow(ssk, copied);
|
||||
unlock_sock_fast(ssk, slow);
|
||||
}
|
||||
}
|
||||
|
|
@ -3015,6 +3022,7 @@ static void __mptcp_init_sock(struct sock *sk)
|
|||
msk->timer_ival = TCP_RTO_MIN;
|
||||
msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
|
||||
msk->backlog_len = 0;
|
||||
mptcp_init_rtt_est(msk);
|
||||
|
||||
WRITE_ONCE(msk->first, NULL);
|
||||
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
|
||||
|
|
@ -3460,6 +3468,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
|
|||
msk->bytes_retrans = 0;
|
||||
msk->rcvspace_init = 0;
|
||||
msk->fastclosing = 0;
|
||||
mptcp_init_rtt_est(msk);
|
||||
|
||||
/* for fallback's sake */
|
||||
WRITE_ONCE(msk->ack_seq, 0);
|
||||
|
|
|
|||
|
|
@ -269,6 +269,13 @@ struct mptcp_data_frag {
|
|||
struct page *page;
|
||||
};
|
||||
|
||||
/* Arbitrary compromise between as low as possible to react timely to subflow
|
||||
* close event and as big as possible to avoid being fooled by biased large
|
||||
* samples due to peer sending data on a different subflow WRT the incoming
|
||||
* ack.
|
||||
*/
|
||||
#define MPTCP_RTT_SAMPLES 5
|
||||
|
||||
/* MPTCP connection sock */
|
||||
struct mptcp_sock {
|
||||
/* inet_connection_sock must be the first member */
|
||||
|
|
@ -341,11 +348,17 @@ struct mptcp_sock {
|
|||
*/
|
||||
struct mptcp_pm_data pm;
|
||||
struct mptcp_sched_ops *sched;
|
||||
|
||||
/* Most recent rtt_us observed by in use incoming subflows. */
|
||||
struct {
|
||||
u32 samples[MPTCP_RTT_SAMPLES];
|
||||
u32 next_sample;
|
||||
} rcv_rtt_est;
|
||||
|
||||
struct {
|
||||
int space; /* bytes copied in last measurement window */
|
||||
int copied; /* bytes copied in this measurement window */
|
||||
u64 time; /* start time of measurement window */
|
||||
u64 rtt_us; /* last maximum rtt of subflows */
|
||||
} rcvq_space;
|
||||
u8 scaling_ratio;
|
||||
bool allow_subflows;
|
||||
|
|
@ -423,6 +436,27 @@ static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
|
|||
return msk->first_pending;
|
||||
}
|
||||
|
||||
static inline void mptcp_init_rtt_est(struct mptcp_sock *msk)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < MPTCP_RTT_SAMPLES; ++i)
|
||||
msk->rcv_rtt_est.samples[i] = U32_MAX;
|
||||
msk->rcv_rtt_est.next_sample = 0;
|
||||
msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
|
||||
}
|
||||
|
||||
static inline u32 mptcp_rtt_us_est(const struct mptcp_sock *msk)
|
||||
{
|
||||
u32 rtt_us = READ_ONCE(msk->rcv_rtt_est.samples[0]);
|
||||
int i;
|
||||
|
||||
/* Lockless access of collected samples. */
|
||||
for (i = 1; i < MPTCP_RTT_SAMPLES; ++i)
|
||||
rtt_us = min(rtt_us, READ_ONCE(msk->rcv_rtt_est.samples[i]));
|
||||
return rtt_us;
|
||||
}
|
||||
|
||||
static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)
|
||||
{
|
||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
|
|
@ -524,6 +558,7 @@ struct mptcp_subflow_context {
|
|||
u32 map_data_len;
|
||||
__wsum map_data_csum;
|
||||
u32 map_csum_len;
|
||||
u32 prev_rtt_seq;
|
||||
u32 request_mptcp : 1, /* send MP_CAPABLE */
|
||||
request_join : 1, /* send MP_JOIN */
|
||||
request_bkup : 1,
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user