From d4e85922e3e7ef2071f91f65e61629b60f3a9cf4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 14:04:13 +0100 Subject: [PATCH 1/6] mptcp: do not wait for bare sockets' timeout If the peer closes all the existing subflows for a given mptcp socket and later the application closes it, the current implementation let it survive until the timewait timeout expires. While the above is allowed by the protocol specification it consumes resources for almost no reason and additionally causes sporadic self-tests failures. Let's move the mptcp socket to the TCP_CLOSE state when there are no alive subflows at close time, so that the allocated resources will be freed immediately. Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cd6cc67c2c5..bc6c1f62a690 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2897,6 +2897,7 @@ bool __mptcp_close(struct sock *sk, long timeout) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; + int subflows_alive = 0; sk->sk_shutdown = SHUTDOWN_MASK; @@ -2922,6 +2923,8 @@ bool __mptcp_close(struct sock *sk, long timeout) struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + subflows_alive += ssk->sk_state != TCP_CLOSE; + /* since the close timeout takes precedence on the fail one, * cancel the latter */ @@ -2937,6 +2940,12 @@ bool __mptcp_close(struct sock *sk, long timeout) } sock_orphan(sk); + /* all the subflows are closed, only timeout can change the msk + * state, let's not keep resources busy for no reasons + */ + if (subflows_alive == 0) + inet_sk_state_store(sk, TCP_CLOSE); + sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); if (msk->token) From 21e43569685de4ad773fb060c11a15f3fd5e7ac4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 14:04:14 +0100 Subject: [PATCH 2/6] mptcp: fix locking for setsockopt corner-case We need to call the __mptcp_nmpc_socket(), and later subflow socket access under the msk socket lock, or e.g. a racing connect() could change the socket status under the hood, with unexpected results. Fixes: 54635bd04701 ("mptcp: add TCP_FASTOPEN_CONNECT socket option") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/sockopt.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index d4b1e6ec1b36..7f2c3727ab23 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -760,14 +760,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { + struct sock *sk = (struct sock *)msk; struct socket *sock; + int ret = -EINVAL; /* Limit to first subflow, before the connection establishment */ + lock_sock(sk); sock = __mptcp_nmpc_socket(msk); if (!sock) - return -EINVAL; + goto unlock; - return tcp_setsockopt(sock->sk, level, optname, optval, optlen); + ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); + +unlock: + release_sock(sk); + return ret; } static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, From ad2171009d968104ccda9dc517f5a3ba891515db Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 14:04:15 +0100 Subject: [PATCH 3/6] mptcp: fix locking for in-kernel listener creation For consistency, in mptcp_pm_nl_create_listen_socket(), we need to call the __mptcp_nmpc_socket() under the msk socket lock. Note that as a side effect, mptcp_subflow_create_socket() needs a 'nested' lockdep annotation, as it will acquire the subflow (kernel) socket lock under the in-kernel listener msk socket lock. The current lack of locking is almost harmless, because the relevant socket is not exposed to the user space, but in future we will add more complexity to the mentioned helper, let's play safe. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 10 ++++++---- net/mptcp/subflow.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2ea7eae43bdb..10fe9771a852 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -998,8 +998,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, { int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; - struct mptcp_sock *msk; struct socket *ssock; + struct sock *newsk; int backlog = 1024; int err; @@ -1008,11 +1008,13 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; - msk = mptcp_sk(entry->lsk->sk); - if (!msk) + newsk = entry->lsk->sk; + if (!newsk) return -EINVAL; - ssock = __mptcp_nmpc_socket(msk); + lock_sock(newsk); + ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); + release_sock(newsk); if (!ssock) return -EINVAL; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ec54413fb31f..a3e5026bee5b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1679,7 +1679,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, if (err) return err; - lock_sock(sf->sk); + lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING); /* the newly created socket has to be in the same cgroup as its parent */ mptcp_attach_cgroup(sk, sf->sk); From 1249db44a102d9d3541ed7798d4b01ffdcf03524 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 14:04:16 +0100 Subject: [PATCH 4/6] mptcp: be careful on subflow status propagation on errors Currently the subflow error report callback unconditionally propagates the fallback subflow status to the owning msk. If the msk is already orphaned, the above prevents the code from correctly tracking the msk moving to the TCP_CLOSE state and doing the appropriate cleanup. All the above causes increasing memory usage over time and sporadic self-tests failures. There is a great deal of infrastructure trying to propagate correctly the fallback subflow status to the owning mptcp socket, e.g. via mptcp_subflow_eof() and subflow_sched_work_if_closed(): in the error propagation path we need only to cope with unorphaned sockets. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/339 Fixes: 15cc10453398 ("mptcp: deliver ssk errors to msk") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a3e5026bee5b..32904c76c6a1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1399,6 +1399,7 @@ void __mptcp_error_report(struct sock *sk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int err = sock_error(ssk); + int ssk_state; if (!err) continue; @@ -1409,7 +1410,14 @@ void __mptcp_error_report(struct sock *sk) if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk)) continue; - inet_sk_state_store(sk, inet_sk_state_load(ssk)); + /* We need to propagate only transition to CLOSE state. + * Orphaned socket will see such state change via + * subflow_sched_work_if_closed() and that path will properly + * destroy the msk as needed. + */ + ssk_state = inet_sk_state_load(ssk); + if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) + inet_sk_state_store(sk, ssk_state); sk->sk_err = -err; /* This barrier is coupled with smp_rmb() in mptcp_poll() */ From a635a8c3df66ab68dc088c08a4e9e955e22c0e64 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 7 Feb 2023 14:04:17 +0100 Subject: [PATCH 5/6] selftests: mptcp: allow more slack for slow test-case A test-case is frequently failing on some extremely slow VMs. The mptcp transfer completes before the script is able to do all the required PM manipulation. Address the issue in the simplest possible way, making the transfer even more slow. Additionally dump more info in case of failures, to help debugging similar problems in the future and init dump_stats var. Fixes: e274f7154008 ("selftests: mptcp: add subflow limits test-cases") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/323 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index d11d3d566608..f8a969300ef4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -1694,6 +1694,7 @@ chk_subflow_nr() local subflow_nr=$3 local cnt1 local cnt2 + local dump_stats if [ -n "${need_title}" ]; then printf "%03u %-36s %s" "${TEST_COUNT}" "${TEST_NAME}" "${msg}" @@ -1711,7 +1712,12 @@ chk_subflow_nr() echo "[ ok ]" fi - [ "${dump_stats}" = 1 ] && ( ss -N $ns1 -tOni ; ss -N $ns1 -tOni | grep token; ip -n $ns1 mptcp endpoint ) + if [ "${dump_stats}" = 1 ]; then + ss -N $ns1 -tOni + ss -N $ns1 -tOni | grep token + ip -n $ns1 mptcp endpoint + dump_stats + fi } chk_link_usage() @@ -3069,7 +3075,7 @@ endpoint_tests() pm_nl_set_limits $ns1 1 1 pm_nl_set_limits $ns2 1 1 pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - run_tests $ns1 $ns2 10.0.1.1 4 0 0 slow & + run_tests $ns1 $ns2 10.0.1.1 4 0 0 speed_20 & wait_mpj $ns2 pm_nl_del_endpoint $ns2 2 10.0.2.2 From 070d6dafacbaa9d1f2e4e3edc263853d194af15e Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 7 Feb 2023 14:04:18 +0100 Subject: [PATCH 6/6] selftests: mptcp: stop tests earlier These 'endpoint' tests from 'mptcp_join.sh' selftest start a transfer in the background and check the status during this transfer. Once the expected events have been recorded, there is no reason to wait for the data transfer to finish. It can be stopped earlier to reduce the execution time by more than half. For these tests, the exchanged data were not verified. Errors, if any, were ignored but that's fine, plenty of other tests are looking at that. It is then OK to mute stderr now that we are sure errors will be printed (and still ignored) because the transfer is stopped before the end. Fixes: e274f7154008 ("selftests: mptcp: add subflow limits test-cases") Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index f8a969300ef4..079f8f46849d 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -498,6 +498,12 @@ kill_events_pids() kill_wait $evts_ns2_pid } +kill_tests_wait() +{ + kill -SIGUSR1 $(ip netns pids $ns2) $(ip netns pids $ns1) + wait +} + pm_nl_set_limits() { local ns=$1 @@ -3055,7 +3061,7 @@ endpoint_tests() pm_nl_set_limits $ns1 2 2 pm_nl_set_limits $ns2 2 2 pm_nl_add_endpoint $ns1 10.0.2.1 flags signal - run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow & + run_tests $ns1 $ns2 10.0.1.1 0 0 0 slow 2>/dev/null & wait_mpj $ns1 pm_nl_check_endpoint 1 "creation" \ @@ -3068,14 +3074,14 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 flags signal pm_nl_check_endpoint 0 "modif is allowed" \ $ns2 10.0.2.2 id 1 flags signal - wait + kill_tests_wait fi if reset "delete and re-add"; then pm_nl_set_limits $ns1 1 1 pm_nl_set_limits $ns2 1 1 pm_nl_add_endpoint $ns2 10.0.2.2 id 2 dev ns2eth2 flags subflow - run_tests $ns1 $ns2 10.0.1.1 4 0 0 speed_20 & + run_tests $ns1 $ns2 10.0.1.1 4 0 0 speed_20 2>/dev/null & wait_mpj $ns2 pm_nl_del_endpoint $ns2 2 10.0.2.2 @@ -3085,7 +3091,7 @@ endpoint_tests() pm_nl_add_endpoint $ns2 10.0.2.2 dev ns2eth2 flags subflow wait_mpj $ns2 chk_subflow_nr "" "after re-add" 2 - wait + kill_tests_wait fi }