mirror of
https://github.com/torvalds/linux.git
synced 2026-05-29 17:43:52 +02:00
Merge branch 'mptcp-lowat-sockopt'
Matthieu Baerts says: ==================== mptcp: add TCP_NOTSENT_LOWAT sockopt support Patch 3 does the magic of adding TCP_NOTSENT_LOWAT support, all the other ones are minor cleanups seen along the way when working on the new feature. Note that this feature relies on the existing accounting for snd_nxt. Such accounting is not 110% accurate as it tracks the most recent sequence number queued to any subflow, and not the actual sequence number sent on the wire. Paolo experimented a lot, trying to implement the latter, and in the end it proved to be both "too complex" and "not necessary". The complexity arises from the need for an additional lock and a lot of refactoring to introduce such protections without adding significant overhead. Additionally, snd_nxt is currently used and exposed with the current semantic by the internal packet scheduling. Introducing a different tracking will still require us to keep the old one. More interestingly, a more accurate tracking may not be strictly necessary: as the MPTCP socket enqueues data to the subflows only up to the available send window, any enqueued data is sent on the wire instantly, without any blocking operation short of a drop in the tx path at the nft or TC layer. ==================== Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
This commit is contained in:
commit
b78fcd0a36
|
|
@ -1692,15 +1692,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
|
|||
}
|
||||
}
|
||||
|
||||
/* Flag @sk as out of send space: sets SOCK_NOSPACE on the owning
 * struct socket and MPTCP_NOSPACE on the MPTCP-level socket flags.
 * NOTE(review): the enclosing hunk shrinks from 15 to 6 lines, so this
 * helper appears to be on the removed side of the diff - confirm.
 */
static void mptcp_set_nospace(struct sock *sk)
|
||||
{
|
||||
/* enable autotune */
|
||||
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
|
||||
|
||||
/* will be cleared on avail space */
|
||||
set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
|
||||
}
|
||||
|
||||
static int mptcp_disconnect(struct sock *sk, int flags);
|
||||
|
||||
static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
|
||||
|
|
@ -1771,6 +1762,30 @@ static int do_copy_data_nocache(struct sock *sk, int copy,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* open-code sk_stream_memory_free() plus sent limit computation to
|
||||
* avoid indirect calls in fast-path.
|
||||
* Called under the msk socket lock, so we can avoid a bunch of ONCE
|
||||
* annotations.
|
||||
*/
|
||||
/* Returns how many more bytes may be queued on @sk right now:
 *   0         - send buffer full, or the not-yet-sent backlog already
 *               reached the notsent low-water mark;
 *   UINT_MAX  - the low-water mark is UINT_MAX, i.e. no such limit;
 *   otherwise - the room left below the low-water mark.
 */
static u32 mptcp_send_limit(const struct sock *sk)
|
||||
{
|
||||
const struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
u32 limit, not_sent;
|
||||
|
||||
/* queued write memory already at/over sk_sndbuf: no room at all */
if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf))
|
||||
return 0;
|
||||
|
||||
limit = mptcp_notsent_lowat(sk);
|
||||
/* UINT_MAX is passed through unchanged so callers see "unlimited" */
if (limit == UINT_MAX)
|
||||
return UINT_MAX;
|
||||
|
||||
/* bytes written to the msk but not yet accounted by snd_nxt */
not_sent = msk->write_seq - msk->snd_nxt;
|
||||
/* clamp to 0 instead of letting the subtraction below underflow */
if (not_sent >= limit)
|
||||
return 0;
|
||||
|
||||
return limit - not_sent;
|
||||
}
|
||||
|
||||
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
||||
{
|
||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
|
|
@ -1815,6 +1830,12 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
|||
struct mptcp_data_frag *dfrag;
|
||||
bool dfrag_collapsed;
|
||||
size_t psize, offset;
|
||||
u32 copy_limit;
|
||||
|
||||
/* ensure fitting the notsent_lowat() constraint */
|
||||
copy_limit = mptcp_send_limit(sk);
|
||||
if (!copy_limit)
|
||||
goto wait_for_memory;
|
||||
|
||||
/* reuse tail pfrag, if possible, or carve a new one from the
|
||||
* page allocator
|
||||
|
|
@ -1822,9 +1843,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
|||
dfrag = mptcp_pending_tail(sk);
|
||||
dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
|
||||
if (!dfrag_collapsed) {
|
||||
if (!sk_stream_memory_free(sk))
|
||||
goto wait_for_memory;
|
||||
|
||||
if (!mptcp_page_frag_refill(sk, pfrag))
|
||||
goto wait_for_memory;
|
||||
|
||||
|
|
@ -1839,6 +1857,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
|||
offset = dfrag->offset + dfrag->data_len;
|
||||
psize = pfrag->size - offset;
|
||||
psize = min_t(size_t, psize, msg_data_left(msg));
|
||||
psize = min_t(size_t, psize, copy_limit);
|
||||
total_ts = psize + frag_truesize;
|
||||
|
||||
if (!sk_wmem_schedule(sk, total_ts))
|
||||
|
|
@ -1874,7 +1893,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
|
|||
continue;
|
||||
|
||||
wait_for_memory:
|
||||
mptcp_set_nospace(sk);
|
||||
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
|
||||
__mptcp_push_pending(sk, msg->msg_flags);
|
||||
ret = sk_stream_wait_memory(sk, &timeo);
|
||||
if (ret)
|
||||
|
|
@ -3769,6 +3788,7 @@ static struct proto mptcp_prot = {
|
|||
.unhash = mptcp_unhash,
|
||||
.get_port = mptcp_get_port,
|
||||
.forward_alloc_get = mptcp_forward_alloc_get,
|
||||
.stream_memory_free = mptcp_stream_memory_free,
|
||||
.sockets_allocated = &mptcp_sockets_allocated,
|
||||
|
||||
.memory_allocated = &tcp_memory_allocated,
|
||||
|
|
@ -3942,12 +3962,12 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
|
|||
{
|
||||
struct sock *sk = (struct sock *)msk;
|
||||
|
||||
if (sk_stream_is_writeable(sk))
|
||||
if (__mptcp_stream_is_writeable(sk, 1))
|
||||
return EPOLLOUT | EPOLLWRNORM;
|
||||
|
||||
mptcp_set_nospace(sk);
|
||||
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
|
||||
if (sk_stream_is_writeable(sk))
|
||||
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
|
||||
smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */
|
||||
if (__mptcp_stream_is_writeable(sk, 1))
|
||||
return EPOLLOUT | EPOLLWRNORM;
|
||||
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -113,10 +113,9 @@
|
|||
#define MPTCP_RST_TRANSIENT BIT(0)
|
||||
|
||||
/* MPTCP socket atomic flags */
|
||||
#define MPTCP_NOSPACE 1
|
||||
#define MPTCP_WORK_RTX 2
|
||||
#define MPTCP_FALLBACK_DONE 4
|
||||
#define MPTCP_WORK_CLOSE_SUBFLOW 5
|
||||
#define MPTCP_WORK_RTX 1
|
||||
#define MPTCP_FALLBACK_DONE 2
|
||||
#define MPTCP_WORK_CLOSE_SUBFLOW 3
|
||||
|
||||
/* MPTCP socket release cb flags */
|
||||
#define MPTCP_PUSH_PENDING 1
|
||||
|
|
@ -308,6 +307,7 @@ struct mptcp_sock {
|
|||
in_accept_queue:1,
|
||||
free_first:1,
|
||||
rcvspace_init:1;
|
||||
u32 notsent_lowat;
|
||||
struct work_struct work;
|
||||
struct sk_buff *ooo_last_skb;
|
||||
struct rb_root out_of_order_queue;
|
||||
|
|
@ -808,14 +808,36 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
|
|||
READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
|
||||
}
|
||||
|
||||
/* Returns the effective notsent low-water mark for @sk: the per-socket
 * msk->notsent_lowat when it is non-zero, otherwise the network
 * namespace's ipv4.sysctl_tcp_notsent_lowat value.
 */
static inline u32 mptcp_notsent_lowat(const struct sock *sk)
|
||||
{
|
||||
struct net *net = sock_net(sk);
|
||||
u32 val;
|
||||
|
||||
val = READ_ONCE(mptcp_sk(sk)->notsent_lowat);
|
||||
/* "a ?: b" yields a when a is non-zero, else b: 0 means "use the sysctl" */
return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat);
|
||||
}
|
||||
|
||||
/* Returns true when the amount of not-yet-sent data on @sk is below the
 * notsent low-water mark. @wake is used as a left-shift count on the
 * backlog, so wake == 1 doubles it before the comparison (the backlog
 * must then be under half the mark); wake == 0 compares it as-is.
 */
static inline bool mptcp_stream_memory_free(const struct sock *sk, int wake)
|
||||
{
|
||||
const struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
u32 notsent_bytes;
|
||||
|
||||
/* both fields are read with READ_ONCE; the difference is the unsent backlog */
notsent_bytes = READ_ONCE(msk->write_seq) - READ_ONCE(msk->snd_nxt);
|
||||
return (notsent_bytes << wake) < mptcp_notsent_lowat(sk);
|
||||
}
|
||||
|
||||
/* Returns true only when both checks pass for @sk with the same @wake
 * argument: the MPTCP notsent low-water check
 * (mptcp_stream_memory_free()) and __sk_stream_is_writeable().
 */
static inline bool __mptcp_stream_is_writeable(const struct sock *sk, int wake)
|
||||
{
|
||||
/* && short-circuits: the second check is skipped if the first fails */
return mptcp_stream_memory_free(sk, wake) &&
|
||||
__sk_stream_is_writeable(sk, wake);
|
||||
}
|
||||
|
||||
static inline void mptcp_write_space(struct sock *sk)
|
||||
{
|
||||
if (sk_stream_is_writeable(sk)) {
|
||||
/* pairs with memory barrier in mptcp_poll */
|
||||
smp_mb();
|
||||
if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
|
||||
sk_stream_write_space(sk);
|
||||
}
|
||||
/* pairs with memory barrier in mptcp_poll */
|
||||
smp_mb();
|
||||
if (mptcp_stream_memory_free(sk, 1))
|
||||
sk_stream_write_space(sk);
|
||||
}
|
||||
|
||||
static inline void __mptcp_sync_sndbuf(struct sock *sk)
|
||||
|
|
|
|||
|
|
@ -624,20 +624,11 @@ static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optval,
|
||||
unsigned int optlen)
|
||||
static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow;
|
||||
struct sock *sk = (struct sock *)msk;
|
||||
int val;
|
||||
|
||||
if (optlen < sizeof(int))
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_sockptr(&val, optval, sizeof(val)))
|
||||
return -EFAULT;
|
||||
|
||||
lock_sock(sk);
|
||||
sockopt_seq_inc(msk);
|
||||
msk->cork = !!val;
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
|
|
@ -649,25 +640,15 @@ static int mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, sockptr_t optva
|
|||
}
|
||||
if (!val)
|
||||
mptcp_check_and_set_pending(sk);
|
||||
release_sock(sk);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t optval,
|
||||
unsigned int optlen)
|
||||
static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow;
|
||||
struct sock *sk = (struct sock *)msk;
|
||||
int val;
|
||||
|
||||
if (optlen < sizeof(int))
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_sockptr(&val, optval, sizeof(val)))
|
||||
return -EFAULT;
|
||||
|
||||
lock_sock(sk);
|
||||
sockopt_seq_inc(msk);
|
||||
msk->nodelay = !!val;
|
||||
mptcp_for_each_subflow(msk, subflow) {
|
||||
|
|
@ -679,8 +660,6 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t op
|
|||
}
|
||||
if (val)
|
||||
mptcp_check_and_set_pending(sk);
|
||||
release_sock(sk);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -803,25 +782,10 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
|
|||
int ret, val;
|
||||
|
||||
switch (optname) {
|
||||
case TCP_INQ:
|
||||
ret = mptcp_get_int_option(msk, optval, optlen, &val);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (val < 0 || val > 1)
|
||||
return -EINVAL;
|
||||
|
||||
lock_sock(sk);
|
||||
msk->recvmsg_inq = !!val;
|
||||
release_sock(sk);
|
||||
return 0;
|
||||
case TCP_ULP:
|
||||
return -EOPNOTSUPP;
|
||||
case TCP_CONGESTION:
|
||||
return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
|
||||
case TCP_CORK:
|
||||
return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen);
|
||||
case TCP_NODELAY:
|
||||
return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen);
|
||||
case TCP_DEFER_ACCEPT:
|
||||
/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
|
||||
mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
|
||||
|
|
@ -834,7 +798,34 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
|
|||
optval, optlen);
|
||||
}
|
||||
|
||||
return -EOPNOTSUPP;
|
||||
ret = mptcp_get_int_option(msk, optval, optlen, &val);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
lock_sock(sk);
|
||||
switch (optname) {
|
||||
case TCP_INQ:
|
||||
if (val < 0 || val > 1)
|
||||
ret = -EINVAL;
|
||||
else
|
||||
msk->recvmsg_inq = !!val;
|
||||
break;
|
||||
case TCP_NOTSENT_LOWAT:
|
||||
WRITE_ONCE(msk->notsent_lowat, val);
|
||||
mptcp_write_space(sk);
|
||||
break;
|
||||
case TCP_CORK:
|
||||
ret = __mptcp_setsockopt_sol_tcp_cork(msk, val);
|
||||
break;
|
||||
case TCP_NODELAY:
|
||||
ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val);
|
||||
break;
|
||||
default:
|
||||
ret = -ENOPROTOOPT;
|
||||
}
|
||||
|
||||
release_sock(sk);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int mptcp_setsockopt(struct sock *sk, int level, int optname,
|
||||
|
|
@ -1349,6 +1340,8 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
|
|||
return mptcp_put_int_option(msk, optval, optlen, msk->cork);
|
||||
case TCP_NODELAY:
|
||||
return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
|
||||
case TCP_NOTSENT_LOWAT:
|
||||
return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
|
||||
}
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user