From 27b0e701d3872ba59c5b579a9e8a02ea49ad3d3b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Oct 2025 09:16:52 +0100 Subject: [PATCH 1/4] mptcp: drop bogus optimization in __mptcp_check_push() Accessing the transmit queue without owning the msk socket lock is inherently racy, hence __mptcp_check_push() could actually quit early even when there is pending data. That in turn could cause unexpected tx lock and timeout. Dropping the early check avoids the race, implicitly relaying on later tests under the relevant lock. With such change, all the other mptcp_send_head() call sites are now under the msk socket lock and we can additionally drop the now unneeded annotation on the transmit head pointer accesses. Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Geliang Tang Tested-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-1-38ffff5a9ec8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 11 ++++------- net/mptcp/protocol.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 875027b9319c..655a2a45224f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1007,7 +1007,7 @@ static void __mptcp_clean_una(struct sock *sk) if (WARN_ON_ONCE(!msk->recovery)) break; - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + msk->first_pending = mptcp_send_next(sk); } dfrag_clear(sk, dfrag); @@ -1552,7 +1552,7 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk, mptcp_update_post_push(msk, dfrag, ret); } - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + msk->first_pending = mptcp_send_next(sk); if (msk->snd_burst <= 0 || !sk_stream_memory_free(ssk) || @@ -1912,7 +1912,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) get_page(dfrag->page); list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending) - WRITE_ONCE(msk->first_pending, dfrag); + msk->first_pending = dfrag; } pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, @@ -2882,7 +2882,7 @@ static void __mptcp_clear_xmit(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - WRITE_ONCE(msk->first_pending, NULL); + msk->first_pending = NULL; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); } @@ -3422,9 +3422,6 @@ void __mptcp_data_acked(struct sock *sk) void __mptcp_check_push(struct sock *sk, struct sock *ssk) { - if (!mptcp_send_head(sk)) - return; - if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, false); else diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 52f9cfa4ce95..379a88e14e8d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -414,7 +414,7 @@ static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); - return READ_ONCE(msk->first_pending); + return msk->first_pending; } static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) From 8e04ce45a8db7a080220e86e249198fa676b83dc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Oct 2025 09:16:53 +0100 Subject: [PATCH 2/4] mptcp: fix MSG_PEEK stream corruption If a MSG_PEEK | MSG_WAITALL read operation consumes all the bytes in the receive queue and recvmsg() need to waits for more data - i.e. it's a blocking one - upon arrival of the next packet the MPTCP protocol will start again copying the oldest data present in the receive queue, corrupting the data stream. Address the issue explicitly tracking the peeked sequence number, restarting from the last peeked byte. Fixes: ca4fb892579f ("mptcp: add MSG_PEEK support") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Geliang Tang Tested-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-2-38ffff5a9ec8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 655a2a45224f..2535788569ab 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1945,22 +1945,36 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); -static int __mptcp_recvmsg_mskq(struct sock *sk, - struct msghdr *msg, - size_t len, int flags, +static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, + size_t len, int flags, int copied_total, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb, *tmp; + int total_data_len = 0; int copied = 0; skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { - u32 offset = MPTCP_SKB_CB(skb)->offset; + u32 delta, offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; - u32 count = min_t(size_t, len - copied, data_len); + u32 count; int err; + if (flags & MSG_PEEK) { + /* skip already peeked skbs */ + if (total_data_len + data_len <= copied_total) { + total_data_len += data_len; + continue; + } + + /* skip the already peeked data in the current skb */ + delta = copied_total - total_data_len; + offset += delta; + data_len -= delta; + } + + count = min_t(size_t, len - copied, data_len); if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, count); if (unlikely(err < 0)) { @@ -1977,16 +1991,14 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, copied += count; - if (count < data_len) { - if (!(flags & MSG_PEEK)) { + if (!(flags & MSG_PEEK)) { + msk->bytes_consumed += count; + if (count < data_len) { MPTCP_SKB_CB(skb)->offset += count; MPTCP_SKB_CB(skb)->map_seq += count; - msk->bytes_consumed += count; + break; } - break; - } - if (!(flags & MSG_PEEK)) { /* avoid the indirect call, we know the destructor is sock_rfree */ skb->destructor = NULL; skb->sk = NULL; @@ -1994,7 +2006,6 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, sk_mem_uncharge(sk, skb->truesize); __skb_unlink(skb, &sk->sk_receive_queue); skb_attempt_defer_free(skb); - msk->bytes_consumed += count; } if (copied >= len) @@ -2191,7 +2202,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int err, bytes_read; - bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, + copied, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; From a824084b98d8a1dbd6e85d0842a8eb5e73467f59 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Oct 2025 09:16:54 +0100 Subject: [PATCH 3/4] mptcp: restore window probe Since commit 72377ab2d671 ("mptcp: more conservative check for zero probes") the MPTCP-level zero window probe check is always disabled, as the TCP-level write queue always contains at least the newly allocated skb. Refine the relevant check tacking in account that the above condition and that such skb can have zero length. Fixes: 72377ab2d671 ("mptcp: more conservative check for zero probes") Cc: stable@vger.kernel.org Reported-by: Geliang Tang Closes: https://lore.kernel.org/d0a814c364e744ca6b836ccd5b6e9146882e8d42.camel@kernel.org Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Tested-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-3-38ffff5a9ec8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2535788569ab..5d8714adae6c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1299,7 +1299,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); - if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) { + /* No need for zero probe if there are any data pending + * either at the msk or ssk level; skb is the current write + * queue tail and can be empty at this point. + */ + if (snd_una != msk->snd_nxt || skb->len || + skb != tcp_send_head(ssk)) { tcp_remove_empty_skb(ssk); return 0; } From fe11dfa10919ce594682c76f5f648a0840d80a2b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Oct 2025 09:16:55 +0100 Subject: [PATCH 4/4] mptcp: zero window probe mib Explicitly account for MPTCP-level zero windows probe, to catch hopefully earlier issues alike the one addressed by the previous patch. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Tested-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-4-38ffff5a9ec8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 1 + net/mptcp/mib.h | 1 + net/mptcp/protocol.c | 1 + 3 files changed, 3 insertions(+) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 6003e47c770a..171643815076 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -85,6 +85,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), + SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE), }; /* mptcp_mib_alloc - allocate percpu mib counters diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 309bac6fea32..a1d3e9369fbb 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -88,6 +88,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */ MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */ MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk status */ + MPTCP_MIB_WINPROBE, /* MPTCP-level zero window probe */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5d8714adae6c..2d6b8de35c44 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1355,6 +1355,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->dsn64); if (zero_window_probe) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_WINPROBE); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled))