Merge branch 'mptcp-pm-special-case-for-c-flag-luminar-endp'

Matthieu Baerts says:

====================
mptcp: pm: special case for c-flag + luminar endp

Here are some patches for the MPTCP PM, including some refactoring that
I thought it would be best to send at the end of a cycle to avoid
conflicts between net and net-next that could last a few weeks.

The most interesting changes are in the first and last patch, the rest
are patches refactoring the code & tests to validate the modifications.

- Patches 1 & 2: When servers set the C-flag in their MP_CAPABLE to tell
  clients not to create subflows to the initial address and port -- e.g.
  a deployment behind a L4 load balancer like a typical CDN deployment
  -- clients will not use their other endpoints when default settings
  are used. That's because the in-kernel path-manager uses the 'subflow'
  endpoints to create subflows only to the initial address and port. The
  first patch fixes that (for >=v5.14), and the second one validates it.

- Patches 3-14: various patches refactoring the code around the
  in-kernel PM (mainly): split too long functions, rename variables and
  functions to avoid confusions, reduce structure size, and compare IDs
  instead of IP addresses. Note that one patch modifies one internal
  variable used in one BPF selftest.

- Patch 15: ability to control endpoints that are used in reaction to a
  new address announced by the other peer. With that, endpoints can be
  used only once.
====================

Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-0-ad126cc47c6b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2025-09-26 17:44:14 -07:00
commit 9ccec26623
8 changed files with 441 additions and 229 deletions

View File

@ -39,14 +39,20 @@
#define MPTCP_PM_ADDR_FLAG_BACKUP _BITUL(2)
#define MPTCP_PM_ADDR_FLAG_FULLMESH _BITUL(3)
#define MPTCP_PM_ADDR_FLAG_IMPLICIT _BITUL(4)
#define MPTCP_PM_ADDR_FLAG_LAMINAR _BITUL(5)
struct mptcp_info {
__u8 mptcpi_subflows;
#define mptcpi_extra_subflows mptcpi_subflows
__u8 mptcpi_add_addr_signal;
__u8 mptcpi_add_addr_accepted;
__u8 mptcpi_subflows_max;
#define mptcpi_limit_extra_subflows mptcpi_subflows_max
__u8 mptcpi_add_addr_signal_max;
#define mptcpi_endp_signal_max mptcpi_add_addr_signal_max
__u8 mptcpi_add_addr_accepted_max;
#define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max
/* 16-bit hole that can no longer be filled */
__u32 mptcpi_flags;
__u32 mptcpi_token;
__u64 mptcpi_write_seq;
@ -54,14 +60,17 @@ struct mptcp_info {
__u64 mptcpi_rcv_nxt;
__u8 mptcpi_local_addr_used;
__u8 mptcpi_local_addr_max;
#define mptcpi_endp_subflow_max mptcpi_local_addr_max
__u8 mptcpi_csum_enabled;
/* 8-bit hole that can no longer be filled */
__u32 mptcpi_retransmits;
__u64 mptcpi_bytes_retrans;
__u64 mptcpi_bytes_sent;
__u64 mptcpi_bytes_received;
__u64 mptcpi_bytes_acked;
__u8 mptcpi_subflows_total;
__u8 reserved[3];
__u8 mptcpi_endp_laminar_max;
__u8 reserved[2];
__u32 mptcpi_last_data_sent;
__u32 mptcpi_last_data_recv;
__u32 mptcpi_last_ack_recv;

View File

@ -483,23 +483,24 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
unsigned int subflows_max;
unsigned int limit_extra_subflows;
int ret = 0;
if (mptcp_pm_is_userspace(msk)) {
if (mptcp_userspace_pm_active(msk)) {
spin_lock_bh(&pm->lock);
pm->subflows++;
pm->extra_subflows++;
spin_unlock_bh(&pm->lock);
return true;
}
return false;
}
subflows_max = mptcp_pm_get_subflows_max(msk);
limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows,
subflows_max, READ_ONCE(pm->accept_subflow));
pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk,
pm->extra_subflows, limit_extra_subflows,
READ_ONCE(pm->accept_subflow));
/* try to avoid acquiring the lock below */
if (!READ_ONCE(pm->accept_subflow))
@ -507,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
spin_lock_bh(&pm->lock);
if (READ_ONCE(pm->accept_subflow)) {
ret = pm->subflows < subflows_max;
if (ret && ++pm->subflows == subflows_max)
ret = pm->extra_subflows < limit_extra_subflows;
if (ret && ++pm->extra_subflows == limit_extra_subflows)
WRITE_ONCE(pm->accept_subflow, false);
}
spin_unlock_bh(&pm->lock);
@ -594,7 +595,7 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk,
if (mptcp_pm_is_userspace(msk)) {
if (update_subflows) {
spin_lock_bh(&pm->lock);
pm->subflows--;
pm->extra_subflows--;
spin_unlock_bh(&pm->lock);
}
return;
@ -637,9 +638,12 @@ void mptcp_pm_add_addr_received(const struct sock *ssk,
} else {
__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
}
/* id0 should not have a different address */
/* - id0 should not have a different address
* - special case for C-flag: linked to fill_local_addresses_vec()
*/
} else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) ||
(addr->id > 0 && !READ_ONCE(pm->accept_addr))) {
(addr->id > 0 && !READ_ONCE(pm->accept_addr) &&
!mptcp_pm_add_addr_c_flag_case(msk))) {
mptcp_pm_announce_addr(msk, addr, true);
mptcp_pm_add_addr_send_ack(msk);
} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
@ -1025,17 +1029,17 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk)
WRITE_ONCE(pm->pm_type, pm_type);
if (pm_type == MPTCP_PM_TYPE_KERNEL) {
bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);
bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk);
/* pm->work_pending must be only be set to 'true' when
* pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
*/
WRITE_ONCE(pm->work_pending,
(!!mptcp_pm_get_local_addr_max(msk) &&
(!!mptcp_pm_get_endp_subflow_max(msk) &&
subflows_allowed) ||
!!mptcp_pm_get_add_addr_signal_max(msk));
!!mptcp_pm_get_endp_signal_max(msk));
WRITE_ONCE(pm->accept_addr,
!!mptcp_pm_get_add_addr_accept_max(msk) &&
!!mptcp_pm_get_limit_add_addr_accepted(msk) &&
subflows_allowed);
WRITE_ONCE(pm->accept_subflow, subflows_allowed);

View File

@ -17,14 +17,14 @@ static int pm_nl_pernet_id;
struct pm_nl_pernet {
/* protects pernet updates */
spinlock_t lock;
struct list_head local_addr_list;
unsigned int addrs;
unsigned int stale_loss_cnt;
unsigned int add_addr_signal_max;
unsigned int add_addr_accept_max;
unsigned int local_addr_max;
unsigned int subflows_max;
unsigned int next_id;
struct list_head endp_list;
u8 endpoints;
u8 endp_signal_max;
u8 endp_subflow_max;
u8 endp_laminar_max;
u8 limit_add_addr_accepted;
u8 limit_extra_subflows;
u8 next_id;
DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
};
@ -46,37 +46,45 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
return pm_nl_get_pernet(genl_info_net(info));
}
unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk)
u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk)
{
const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->add_addr_signal_max);
return READ_ONCE(pernet->endp_signal_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max);
EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max);
unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk)
u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->add_addr_accept_max);
return READ_ONCE(pernet->endp_subflow_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max);
EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max);
unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk)
u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->subflows_max);
return READ_ONCE(pernet->endp_laminar_max);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max);
EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max);
unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk)
u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->local_addr_max);
return READ_ONCE(pernet->limit_add_addr_accepted);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted);
u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
return READ_ONCE(pernet->limit_extra_subflows);
}
EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows);
static bool lookup_subflow_by_daddr(const struct list_head *list,
const struct mptcp_addr_info *daddr)
@ -110,7 +118,7 @@ select_local_address(const struct pm_nl_pernet *pernet,
msk_owned_by_me(msk);
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
@ -141,7 +149,7 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk,
* Note: removal from the local address list during the msk life-cycle
* can lead to additional addresses not being announced.
*/
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
@ -159,80 +167,96 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk,
return found;
}
/* Fill all the remote addresses into the array addrs[],
* and return the array size.
*/
static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk,
struct mptcp_addr_info *local,
bool fullmesh,
struct mptcp_addr_info *addrs)
static unsigned int
fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local,
struct mptcp_addr_info *addrs)
{
bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
struct sock *sk = (struct sock *)msk, *ssk;
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info remote = { 0 };
unsigned int subflows_max;
int i = 0;
struct sock *sk = (struct sock *)msk;
if (deny_id0)
return 0;
subflows_max = mptcp_pm_get_subflows_max(msk);
mptcp_remote_address((struct sock_common *)sk, &remote);
/* Non-fullmesh endpoint, fill in the single entry
* corresponding to the primary MPC subflow remote address
if (!mptcp_pm_addr_families_match(sk, local, &remote))
return 0;
msk->pm.extra_subflows++;
*addrs = remote;
return 1;
}
static unsigned int
fill_remote_addresses_fullmesh(struct mptcp_sock *msk,
struct mptcp_addr_info *local,
struct mptcp_addr_info *addrs)
{
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
struct sock *sk = (struct sock *)msk, *ssk;
struct mptcp_subflow_context *subflow;
int i = 0;
/* Forbid creation of new subflows matching existing ones, possibly
* already created by incoming ADD_ADDR
*/
if (!fullmesh) {
if (deny_id0)
return 0;
bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
mptcp_for_each_subflow(msk, subflow)
if (READ_ONCE(subflow->local_id) == local->id)
__set_bit(subflow->remote_id, unavail_id);
if (!mptcp_pm_addr_families_match(sk, local, &remote))
return 0;
mptcp_for_each_subflow(msk, subflow) {
ssk = mptcp_subflow_tcp_sock(subflow);
mptcp_remote_address((struct sock_common *)ssk, &addrs[i]);
addrs[i].id = READ_ONCE(subflow->remote_id);
if (deny_id0 && !addrs[i].id)
continue;
msk->pm.subflows++;
addrs[i++] = remote;
} else {
DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
if (test_bit(addrs[i].id, unavail_id))
continue;
/* Forbid creation of new subflows matching existing
* ones, possibly already created by incoming ADD_ADDR
*/
bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
mptcp_for_each_subflow(msk, subflow)
if (READ_ONCE(subflow->local_id) == local->id)
__set_bit(subflow->remote_id, unavail_id);
if (!mptcp_pm_addr_families_match(sk, local, &addrs[i]))
continue;
mptcp_for_each_subflow(msk, subflow) {
ssk = mptcp_subflow_tcp_sock(subflow);
mptcp_remote_address((struct sock_common *)ssk, &addrs[i]);
addrs[i].id = READ_ONCE(subflow->remote_id);
if (deny_id0 && !addrs[i].id)
continue;
/* forbid creating multiple address towards this id */
__set_bit(addrs[i].id, unavail_id);
msk->pm.extra_subflows++;
i++;
if (test_bit(addrs[i].id, unavail_id))
continue;
if (!mptcp_pm_addr_families_match(sk, local, &addrs[i]))
continue;
if (msk->pm.subflows < subflows_max) {
/* forbid creating multiple address towards
* this id
*/
__set_bit(addrs[i].id, unavail_id);
msk->pm.subflows++;
i++;
}
}
if (msk->pm.extra_subflows >= limit_extra_subflows)
break;
}
return i;
}
/* Fill all the remote addresses into the array addrs[],
* and return the array size.
*/
static unsigned int
fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local,
bool fullmesh, struct mptcp_addr_info *addrs)
{
/* Non-fullmesh: fill in the single entry corresponding to the primary
* MPC subflow remote address, and return 1, corresponding to 1 entry.
*/
if (!fullmesh)
return fill_remote_addr(msk, local, addrs);
/* Fullmesh endpoint: fill all possible remote addresses */
return fill_remote_addresses_fullmesh(msk, local, addrs);
}
static struct mptcp_pm_addr_entry *
__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
{
struct mptcp_pm_addr_entry *entry;
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list,
list_for_each_entry_rcu(entry, &pernet->endp_list, list,
lockdep_is_held(&pernet->lock)) {
if (entry->addr.id == id)
return entry;
@ -245,7 +269,7 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
{
struct mptcp_pm_addr_entry *entry;
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list,
list_for_each_entry_rcu(entry, &pernet->endp_list, list,
lockdep_is_held(&pernet->lock)) {
if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
return entry;
@ -253,52 +277,65 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
return NULL;
}
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
struct sock *sk = (struct sock *)msk;
unsigned int add_addr_signal_max;
bool signal_and_subflow = false;
unsigned int local_addr_max;
return msk->mpc_endpoint_id == addr->id ? 0 : addr->id;
}
/* Set mpc_endpoint_id, and send MP_PRIO for ID0 if needed */
static void mptcp_mpc_endpoint_setup(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
struct mptcp_pm_addr_entry *entry;
struct mptcp_addr_info mpc_addr;
struct pm_nl_pernet *pernet;
struct mptcp_pm_local local;
unsigned int subflows_max;
pernet = pm_nl_get_pernet(sock_net(sk));
add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk);
local_addr_max = mptcp_pm_get_local_addr_max(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
bool backup = false;
/* do lazy endpoint usage accounting for the MPC subflows */
if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) {
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first);
struct mptcp_pm_addr_entry *entry;
struct mptcp_addr_info mpc_addr;
bool backup = false;
if (likely(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED)) ||
!msk->first)
return;
mptcp_local_address((struct sock_common *)msk->first, &mpc_addr);
rcu_read_lock();
entry = __lookup_addr(pernet, &mpc_addr);
if (entry) {
__clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
msk->mpc_endpoint_id = entry->addr.id;
backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
}
rcu_read_unlock();
subflow = mptcp_subflow_ctx(msk->first);
pernet = pm_nl_get_pernet_from_msk(msk);
if (backup)
mptcp_pm_send_ack(msk, subflow, true, backup);
msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
mptcp_local_address((struct sock_common *)msk->first, &mpc_addr);
rcu_read_lock();
entry = __lookup_addr(pernet, &mpc_addr);
if (entry) {
__clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
msk->mpc_endpoint_id = entry->addr.id;
backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
}
rcu_read_unlock();
/* Send MP_PRIO */
if (backup)
mptcp_pm_send_ack(msk, subflow, true, backup);
msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
}
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk);
u8 endp_signal_max = mptcp_pm_get_endp_signal_max(msk);
struct sock *sk = (struct sock *)msk;
bool signal_and_subflow = false;
struct mptcp_pm_local local;
mptcp_mpc_endpoint_setup(msk);
pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
msk->pm.local_addr_used, local_addr_max,
msk->pm.add_addr_signaled, add_addr_signal_max,
msk->pm.subflows, subflows_max);
msk->pm.local_addr_used, endp_subflow_max,
msk->pm.add_addr_signaled, endp_signal_max,
msk->pm.extra_subflows, limit_extra_subflows);
/* check first for announce */
if (msk->pm.add_addr_signaled < add_addr_signal_max) {
if (msk->pm.add_addr_signaled < endp_signal_max) {
/* due to racing events on both ends we can reach here while
* previous add address is still running: if we invoke now
* mptcp_pm_announce_addr(), that will fail and the
@ -334,8 +371,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
subflow:
/* check if should create a new subflow */
while (msk->pm.local_addr_used < local_addr_max &&
msk->pm.subflows < subflows_max) {
while (msk->pm.local_addr_used < endp_subflow_max &&
msk->pm.extra_subflows < limit_extra_subflows) {
struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
bool fullmesh;
int i, nr;
@ -377,90 +414,225 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
mptcp_pm_create_subflow_or_signal_addr(msk);
}
/* Fill all the local addresses into the array addrs[],
* and return the array size.
*/
static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals)
static unsigned int
fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk,
struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals,
bool c_flag_case)
{
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *entry;
struct mptcp_addr_info mpc_addr;
struct pm_nl_pernet *pernet;
unsigned int subflows_max;
struct mptcp_pm_local *local;
int i = 0;
pernet = pm_nl_get_pernet_from_msk(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
mptcp_local_address((struct sock_common *)msk, &mpc_addr);
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
bool is_id0;
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
continue;
if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
continue;
if (msk->pm.subflows < subflows_max) {
locals[i].addr = entry->addr;
locals[i].flags = entry->flags;
locals[i].ifindex = entry->ifindex;
local = &locals[i];
local->addr = entry->addr;
local->flags = entry->flags;
local->ifindex = entry->ifindex;
/* Special case for ID0: set the correct ID */
if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port))
locals[i].addr.id = 0;
is_id0 = local->addr.id == msk->mpc_endpoint_id;
msk->pm.subflows++;
i++;
if (c_flag_case &&
(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) {
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
if (!is_id0)
msk->pm.local_addr_used++;
}
/* Special case for ID0: set the correct ID */
if (is_id0)
local->addr.id = 0;
msk->pm.extra_subflows++;
i++;
if (msk->pm.extra_subflows >= limit_extra_subflows)
break;
}
rcu_read_unlock();
/* If the array is empty, fill in the single
* 'IPADDRANY' local address
return i;
}
static unsigned int
fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *entry;
struct mptcp_pm_local *local;
int found = 0;
/* Forbid creation of new subflows matching existing ones, possibly
* already created by 'subflow' endpoints
*/
if (!i) {
memset(&locals[i], 0, sizeof(locals[i]));
locals[i].addr.family =
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
remote->family == AF_INET6 &&
ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
#endif
remote->family;
bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote))
return 0;
if ((1 << inet_sk_state_load(ssk)) &
(TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING |
TCPF_CLOSE))
continue;
msk->pm.subflows++;
__set_bit(subflow_get_local_id(subflow), unavail_id);
}
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR))
continue;
if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
continue;
if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr),
unavail_id))
continue;
local = &locals[0];
local->addr = entry->addr;
local->flags = entry->flags;
local->ifindex = entry->ifindex;
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
if (local->addr.id != msk->mpc_endpoint_id)
msk->pm.local_addr_used++;
}
msk->pm.extra_subflows++;
found = 1;
break;
}
rcu_read_unlock();
return found;
}
static unsigned int
fill_local_addresses_vec_c_flag(struct mptcp_sock *msk,
struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals)
{
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk);
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_local *local;
int i = 0;
while (msk->pm.local_addr_used < endp_subflow_max) {
local = &locals[i];
if (!select_local_address(pernet, msk, local))
break;
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
if (!mptcp_pm_addr_families_match(sk, &local->addr, remote))
continue;
if (local->addr.id == msk->mpc_endpoint_id)
continue;
msk->pm.local_addr_used++;
msk->pm.extra_subflows++;
i++;
if (msk->pm.extra_subflows >= limit_extra_subflows)
break;
}
return i;
}
static unsigned int
fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
struct mptcp_pm_local *local)
{
struct sock *sk = (struct sock *)msk;
memset(local, 0, sizeof(*local));
local->addr.family =
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
remote->family == AF_INET6 &&
ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
#endif
remote->family;
if (!mptcp_pm_addr_families_match(sk, &local->addr, remote))
return 0;
msk->pm.extra_subflows++;
return 1;
}
/* Fill all the local addresses into the array addrs[],
* and return the array size.
*/
static unsigned int
fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
struct mptcp_pm_local *locals)
{
bool c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk);
int i;
/* If there is at least one MPTCP endpoint with a fullmesh flag */
i = fill_local_addresses_vec_fullmesh(msk, remote, locals, c_flag_case);
if (i)
return i;
/* If there is at least one MPTCP endpoint with a laminar flag */
if (mptcp_pm_get_endp_laminar_max(msk))
return fill_local_laminar_endp(msk, remote, locals);
/* Special case: peer sets the C flag, accept one ADD_ADDR if default
* limits are used -- accepting no ADD_ADDR -- and use subflow endpoints
*/
if (c_flag_case)
return fill_local_addresses_vec_c_flag(msk, remote, locals);
/* No special case: fill in the single 'IPADDRANY' local address */
return fill_local_address_any(msk, remote, &locals[0]);
}
static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
{
u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk);
u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX];
struct sock *sk = (struct sock *)msk;
unsigned int add_addr_accept_max;
struct mptcp_addr_info remote;
unsigned int subflows_max;
bool sf_created = false;
int i, nr;
add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk);
subflows_max = mptcp_pm_get_subflows_max(msk);
pr_debug("accepted %d:%d remote family %d\n",
msk->pm.add_addr_accepted, add_addr_accept_max,
msk->pm.add_addr_accepted, limit_add_addr_accepted,
msk->pm.remote.family);
remote = msk->pm.remote;
mptcp_pm_announce_addr(msk, &remote, true);
mptcp_pm_addr_send_ack(msk);
mptcp_mpc_endpoint_setup(msk);
if (lookup_subflow_by_daddr(&msk->conn_list, &remote))
return;
@ -486,8 +658,8 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
/* add_addr_accepted is not decr for ID 0 */
if (remote.id)
msk->pm.add_addr_accepted++;
if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
msk->pm.subflows >= subflows_max)
if (msk->pm.add_addr_accepted >= limit_add_addr_accepted ||
msk->pm.extra_subflows >= limit_extra_subflows)
WRITE_ONCE(msk->pm.accept_addr, false);
}
}
@ -495,10 +667,13 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id)
{
if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) {
u8 limit_add_addr_accepted =
mptcp_pm_get_limit_add_addr_accepted(msk);
/* Note: if the subflow has been closed before, this
* add_addr_accepted counter will not be decremented.
*/
if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk))
if (--msk->pm.add_addr_accepted < limit_add_addr_accepted)
WRITE_ONCE(msk->pm.accept_addr, true);
}
}
@ -523,8 +698,8 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
bool needs_id, bool replace)
{
struct mptcp_pm_addr_entry *cur, *del_entry = NULL;
unsigned int addr_max;
int ret = -EINVAL;
u8 addr_max;
spin_lock_bh(&pernet->lock);
/* to keep the code simple, don't do IDR-like allocation for address ID,
@ -532,7 +707,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
*/
if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
pernet->next_id = 1;
if (pernet->addrs >= MPTCP_PM_ADDR_MAX) {
if (pernet->endpoints >= MPTCP_PM_ADDR_MAX) {
ret = -ERANGE;
goto out;
}
@ -546,7 +721,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
*/
if (!address_use_port(entry))
entry->addr.port = 0;
list_for_each_entry(cur, &pernet->local_addr_list, list) {
list_for_each_entry(cur, &pernet->endp_list, list) {
if (mptcp_addresses_equal(&cur->addr, &entry->addr,
cur->addr.port || entry->addr.port)) {
/* allow replacing the exiting endpoint only if such
@ -571,7 +746,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
goto out;
}
pernet->addrs--;
pernet->endpoints--;
entry->addr.id = cur->addr.id;
list_del_rcu(&cur->list);
del_entry = cur;
@ -598,19 +773,23 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
pernet->next_id = entry->addr.id;
if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
addr_max = pernet->add_addr_signal_max;
WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1);
addr_max = pernet->endp_signal_max;
WRITE_ONCE(pernet->endp_signal_max, addr_max + 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
addr_max = pernet->local_addr_max;
WRITE_ONCE(pernet->local_addr_max, addr_max + 1);
addr_max = pernet->endp_subflow_max;
WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) {
addr_max = pernet->endp_laminar_max;
WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1);
}
pernet->addrs++;
pernet->endpoints++;
if (!entry->addr.port)
list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
list_add_tail_rcu(&entry->list, &pernet->endp_list);
else
list_add_rcu(&entry->list, &pernet->local_addr_list);
list_add_rcu(&entry->list, &pernet->endp_list);
ret = entry->addr.id;
out:
@ -845,12 +1024,6 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info)
return ret;
}
static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr)
{
return msk->mpc_endpoint_id == addr->id ? 0 : addr->id;
}
static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool force)
@ -969,8 +1142,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
unsigned int addr_max;
struct nlattr *attr;
u8 addr_max;
int ret;
if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR))
@ -997,15 +1170,19 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
addr_max = pernet->add_addr_signal_max;
WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1);
addr_max = pernet->endp_signal_max;
WRITE_ONCE(pernet->endp_signal_max, addr_max - 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
addr_max = pernet->local_addr_max;
WRITE_ONCE(pernet->local_addr_max, addr_max - 1);
addr_max = pernet->endp_subflow_max;
WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) {
addr_max = pernet->endp_laminar_max;
WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1);
}
pernet->addrs--;
pernet->endpoints--;
list_del_rcu(&entry->list);
__clear_bit(entry->addr.id, pernet->id_bitmap);
spin_unlock_bh(&pernet->lock);
@ -1084,9 +1261,10 @@ static void __flush_addrs(struct list_head *list)
static void __reset_counters(struct pm_nl_pernet *pernet)
{
WRITE_ONCE(pernet->add_addr_signal_max, 0);
WRITE_ONCE(pernet->local_addr_max, 0);
pernet->addrs = 0;
WRITE_ONCE(pernet->endp_signal_max, 0);
WRITE_ONCE(pernet->endp_subflow_max, 0);
WRITE_ONCE(pernet->endp_laminar_max, 0);
pernet->endpoints = 0;
}
int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info)
@ -1095,7 +1273,7 @@ int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info)
LIST_HEAD(free_list);
spin_lock_bh(&pernet->lock);
list_splice_init(&pernet->local_addr_list, &free_list);
list_splice_init(&pernet->endp_list, &free_list);
__reset_counters(pernet);
pernet->next_id = 1;
bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
@ -1181,18 +1359,18 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info)
int ret;
spin_lock_bh(&pernet->lock);
rcv_addrs = pernet->add_addr_accept_max;
rcv_addrs = pernet->limit_add_addr_accepted;
ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
if (ret)
goto unlock;
subflows = pernet->subflows_max;
subflows = pernet->limit_extra_subflows;
ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
if (ret)
goto unlock;
WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs);
WRITE_ONCE(pernet->subflows_max, subflows);
WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs);
WRITE_ONCE(pernet->limit_extra_subflows, subflows);
unlock:
spin_unlock_bh(&pernet->lock);
@ -1215,11 +1393,11 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info)
goto fail;
if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
READ_ONCE(pernet->add_addr_accept_max)))
READ_ONCE(pernet->limit_add_addr_accepted)))
goto fail;
if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
READ_ONCE(pernet->subflows_max)))
READ_ONCE(pernet->limit_extra_subflows)))
goto fail;
genlmsg_end(msg, reply);
@ -1328,7 +1506,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) ||
if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) ||
(find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) {
WRITE_ONCE(msk->pm.work_pending, false);
@ -1360,12 +1538,11 @@ static int __net_init pm_nl_init_net(struct net *net)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
INIT_LIST_HEAD_RCU(&pernet->endp_list);
/* Cit. 2 subflows ought to be enough for anybody. */
pernet->subflows_max = 2;
pernet->limit_extra_subflows = 2;
pernet->next_id = 1;
pernet->stale_loss_cnt = 4;
spin_lock_init(&pernet->lock);
/* No need to initialize other pernet fields, the struct is zeroed at
@ -1386,7 +1563,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list)
* other modifiers, also netns core already waited for a
* RCU grace period.
*/
__flush_addrs(&pernet->local_addr_list);
__flush_addrs(&pernet->endp_list);
}
}

View File

@ -419,7 +419,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
if (err)
mptcp_userspace_pm_delete_local_addr(msk, &entry);
else
msk->pm.subflows++;
msk->pm.extra_subflows++;
spin_unlock_bh(&msk->pm.lock);
create_err:

View File

@ -235,7 +235,7 @@ struct mptcp_pm_data {
u8 add_addr_accepted;
u8 local_addr_used;
u8 pm_type;
u8 subflows;
u8 extra_subflows;
u8 status;
);
@ -1180,15 +1180,16 @@ void __init mptcp_pm_userspace_register(void);
void __init mptcp_pm_nl_init(void);
void mptcp_pm_worker(struct mptcp_sock *msk);
void __mptcp_pm_kernel_worker(struct mptcp_sock *msk);
unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk);
unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk);
unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk);
unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk);
u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk);
u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk);
u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk);
u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk);
u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk);
/* called under PM lock */
static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk)
{
if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk))
if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk))
WRITE_ONCE(msk->pm.accept_subflow, true);
}
@ -1199,6 +1200,14 @@ static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
}
static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk)
{
return READ_ONCE(msk->pm.remote_deny_join_id0) &&
msk->pm.local_addr_used == 0 &&
mptcp_pm_get_limit_add_addr_accepted(msk) == 0 &&
msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk);
}
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);
static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)

View File

@ -962,7 +962,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
memset(info, 0, sizeof(*info));
info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows);
info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
@ -972,14 +972,16 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
/* The following limits only make sense for the in-kernel PM */
if (mptcp_pm_is_kernel(msk)) {
info->mptcpi_subflows_max =
mptcp_pm_get_subflows_max(msk);
info->mptcpi_add_addr_signal_max =
mptcp_pm_get_add_addr_signal_max(msk);
info->mptcpi_add_addr_accepted_max =
mptcp_pm_get_add_addr_accept_max(msk);
info->mptcpi_local_addr_max =
mptcp_pm_get_local_addr_max(msk);
info->mptcpi_limit_extra_subflows =
mptcp_pm_get_limit_extra_subflows(msk);
info->mptcpi_endp_signal_max =
mptcp_pm_get_endp_signal_max(msk);
info->mptcpi_limit_add_addr_accepted =
mptcp_pm_get_limit_add_addr_accepted(msk);
info->mptcpi_endp_subflow_max =
mptcp_pm_get_endp_subflow_max(msk);
info->mptcpi_endp_laminar_max =
mptcp_pm_get_endp_laminar_max(msk);
}
if (__mptcp_check_fallback(msk))
@ -996,7 +998,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
info->mptcpi_bytes_sent = msk->bytes_sent;
info->mptcpi_bytes_received = msk->bytes_received;
info->mptcpi_bytes_retrans = msk->bytes_retrans;
info->mptcpi_subflows_total = info->mptcpi_subflows +
info->mptcpi_subflows_total = info->mptcpi_extra_subflows +
__mptcp_has_initial_subflow(msk);
now = tcp_jiffies32;
info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent);

View File

@ -117,7 +117,7 @@ int _getsockopt_subflow(struct bpf_sockopt *ctx)
return 1;
msk = bpf_core_cast(sk, struct mptcp_sock);
if (msk->pm.subflows != 1) {
if (msk->pm.extra_subflows != 1) {
ctx->retval = -1;
return 1;
}

View File

@ -3306,6 +3306,17 @@ deny_join_id0_tests()
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 1 1 1
fi
# default limits, server deny join id 0 + signal
if reset_with_allow_join_id0 "default limits, server deny join id 0" 0 1; then
pm_nl_set_limits $ns1 0 2
pm_nl_set_limits $ns2 0 2
pm_nl_add_endpoint $ns1 10.0.2.1 flags signal
pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
pm_nl_add_endpoint $ns2 10.0.4.2 flags subflow
run_tests $ns1 $ns2 10.0.1.1
chk_join_nr 2 2 2
fi
}
fullmesh_tests()