From 5e07e672412bed473122813ab35d4f7d42fd9635 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:22 +0900 Subject: [PATCH 01/12] tcp: Use bhash2 for v4-mapped-v6 non-wildcard address. While checking port availability in bind() or listen(), we used only bhash for all v4-mapped-v6 addresses. But there is no good reason not to use bhash2 for v4-mapped-v6 non-wildcard addresses. Let's do it by returning true in inet_use_bhash2_on_bind(). Then, we also need to add a test in inet_bind2_bucket_match_addr_any() so that ::ffff:X.X.X.X will match with 0.0.0.0. Note that sk->sk_rcv_saddr is initialised for v4-mapped-v6 sk in __inet6_bind(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 7 +++++-- net/ipv4/inet_hashtables.c | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index bd325b029dd1..d48255875f60 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -159,8 +159,11 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk) if (sk->sk_family == AF_INET6) { int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - return addr_type != IPV6_ADDR_ANY && - addr_type != IPV6_ADDR_MAPPED; + if (addr_type == IPV6_ADDR_ANY) + return false; + + if (addr_type != IPV6_ADDR_MAPPED) + return true; } #endif return sk->sk_rcv_saddr != htonl(INADDR_ANY); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9ff201bc4e6d..7e8dbc5cc317 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -841,7 +841,8 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const return ipv6_addr_any(&tb->v6_rcv_saddr) || ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); - return false; + return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && + tb->rcv_saddr == 0; } if (sk->sk_family == AF_INET6) From 56f3e3f01f81dfb010598e01df033e8836f5c61a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:23 +0900 Subject: [PATCH 02/12] tcp: Rearrange tests in inet_bind2_bucket_(addr_match|match_addr_any)(). The protocol family tests in inet_bind2_bucket_addr_match() and inet_bind2_bucket_match_addr_any() are ordered as follows. if (sk->sk_family != tb2->family) else if (sk->sk_family == AF_INET6) else This patch rearranges them so that AF_INET6 socket is handled first to make the following patch tidy, where tb2->family will be removed. if (sk->sk_family == AF_INET6) else if (tb2->family == AF_INET6) else Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 7e8dbc5cc317..896fcefc06c0 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -149,18 +149,17 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family != tb2->family) { - if (sk->sk_family == AF_INET) - return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) && - tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr; + if (sk->sk_family == AF_INET6) { + if (tb2->family == AF_INET6) + return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr; } - if (sk->sk_family == AF_INET6) - return ipv6_addr_equal(&tb2->v6_rcv_saddr, - &sk->sk_v6_rcv_saddr); + if (tb2->family == AF_INET6) + return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) && + tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr; #endif return tb2->rcv_saddr == sk->sk_rcv_saddr; } @@ -836,17 +835,17 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const return false; #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family != tb->family) { - if (sk->sk_family == AF_INET) - return ipv6_addr_any(&tb->v6_rcv_saddr) || - ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); + if (sk->sk_family == AF_INET6) { + if (tb->family == AF_INET6) + return ipv6_addr_any(&tb->v6_rcv_saddr); return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && tb->rcv_saddr == 0; } - if (sk->sk_family == AF_INET6) - return ipv6_addr_any(&tb->v6_rcv_saddr); + if (tb->family == AF_INET6) + return ipv6_addr_any(&tb->v6_rcv_saddr) || + ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); #endif return tb->rcv_saddr == 0; } From 06a8c04f89949576b715f11350f1896b62cecb0a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:24 +0900 Subject: [PATCH 03/12] tcp: Save v4 address as v4-mapped-v6 in inet_bind2_bucket.v6_rcv_saddr. In bhash2, IPv4/IPv6 addresses are saved in two union members, which complicate address checks in inet_bind2_bucket_addr_match() and inet_bind2_bucket_match_addr_any() considering uninitialised memory and v4-mapped-v6 conflicts. Let's simplify that by saving IPv4 address as v4-mapped-v6 address and defining tb2.rcv_saddr as tb2.v6_rcv_saddr.s6_addr32[3]. Then, we can compare v6 address as is, and after checking v4-mapped-v6, we can compare v4 address easily. Also, we can remove tb2->family. Note these functions will be further refactored in the next patch. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 11 ++++------- include/net/ipv6.h | 5 ----- net/ipv4/inet_hashtables.c | 34 +++++++++++++++++----------------- 3 files changed, 21 insertions(+), 29 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3ecfeadbfa06..171cc235d045 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -96,14 +96,11 @@ struct inet_bind2_bucket { int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) - unsigned short family; + struct in6_addr v6_rcv_saddr; +#define rcv_saddr v6_rcv_saddr.s6_addr32[3] +#else + __be32 rcv_saddr; #endif - union { -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr v6_rcv_saddr; -#endif - __be32 rcv_saddr; - }; /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; /* List of sockets hashed to this bucket */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 78d38dd88aba..cf25ea21d770 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -784,11 +784,6 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) cpu_to_be32(0x0000ffff))) == 0UL; } -static inline bool ipv6_addr_v4mapped_any(const struct in6_addr *a) -{ - return ipv6_addr_v4mapped(a) && ipv4_is_zeronet(a->s6_addr32[3]); -} - static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 896fcefc06c0..15594424e9f5 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -110,12 +110,13 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, tb->l3mdev = l3mdev; tb->port = port; #if IS_ENABLED(CONFIG_IPV6) - tb->family = sk->sk_family; if (sk->sk_family == AF_INET6) tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; else + ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb->v6_rcv_saddr); +#else + tb->rcv_saddr = sk->sk_rcv_saddr; #endif - tb->rcv_saddr = sk->sk_rcv_saddr; INIT_HLIST_HEAD(&tb->owners); INIT_HLIST_HEAD(&tb->deathrow); hlist_add_head(&tb->node, &head->chain); @@ -149,17 +150,11 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - if (tb2->family == AF_INET6) - return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); + if (sk->sk_family == AF_INET6) + return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); - return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && - sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr; - } - - if (tb2->family == AF_INET6) - return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) && - tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr; + if (!ipv6_addr_v4mapped(&tb2->v6_rcv_saddr)) + return false; #endif return tb2->rcv_saddr == sk->sk_rcv_saddr; } @@ -836,16 +831,21 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { - if (tb->family == AF_INET6) - return ipv6_addr_any(&tb->v6_rcv_saddr); + if (ipv6_addr_any(&tb->v6_rcv_saddr)) + return true; + + if (!ipv6_addr_v4mapped(&tb->v6_rcv_saddr)) + return false; return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && tb->rcv_saddr == 0; } - if (tb->family == AF_INET6) - return ipv6_addr_any(&tb->v6_rcv_saddr) || - ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr); + if (ipv6_addr_any(&tb->v6_rcv_saddr)) + return true; + + if (!ipv6_addr_v4mapped(&tb->v6_rcv_saddr)) + return false; #endif return tb->rcv_saddr == 0; } From 5a22bba13d0162d278d6e31232259ddcce196b8e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:25 +0900 Subject: [PATCH 04/12] tcp: Save address type in inet_bind2_bucket. inet_bind2_bucket_addr_match() and inet_bind2_bucket_match_addr_any() are called for each bhash2 bucket to check conflicts. Thus, we call ipv6_addr_any() and ipv6_addr_v4mapped() over and over during bind(). Let's avoid calling them by saving the address type in inet_bind2_bucket. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 1 + net/ipv4/inet_hashtables.c | 29 +++++++++++++---------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 171cc235d045..260e673ede22 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -96,6 +96,7 @@ struct inet_bind2_bucket { int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) + unsigned short addr_type; struct in6_addr v6_rcv_saddr; #define rcv_saddr v6_rcv_saddr.s6_addr32[3] #else diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 15594424e9f5..4e39e3f905b4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -110,10 +110,14 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, tb->l3mdev = l3mdev; tb->port = port; #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) + BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); + if (sk->sk_family == AF_INET6) { + tb->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; - else + } else { + tb->addr_type = IPV6_ADDR_MAPPED; ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb->v6_rcv_saddr); + } #else tb->rcv_saddr = sk->sk_rcv_saddr; #endif @@ -153,7 +157,7 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, if (sk->sk_family == AF_INET6) return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); - if (!ipv6_addr_v4mapped(&tb2->v6_rcv_saddr)) + if (tb2->addr_type != IPV6_ADDR_MAPPED) return false; #endif return tb2->rcv_saddr == sk->sk_rcv_saddr; @@ -830,21 +834,14 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const return false; #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - if (ipv6_addr_any(&tb->v6_rcv_saddr)) - return true; - - if (!ipv6_addr_v4mapped(&tb->v6_rcv_saddr)) - return false; - - return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) && - tb->rcv_saddr == 0; - } - - if (ipv6_addr_any(&tb->v6_rcv_saddr)) + if (tb->addr_type == IPV6_ADDR_ANY) return true; - if (!ipv6_addr_v4mapped(&tb->v6_rcv_saddr)) + if (tb->addr_type != IPV6_ADDR_MAPPED) + return false; + + if (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return false; #endif return tb->rcv_saddr == 0; From 4dd7108854304bd2052fa73e6889a5b1bbaca869 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:26 +0900 Subject: [PATCH 05/12] tcp: Rename tb in inet_bind2_bucket_(init|create)(). Later, we no longer link sockets to bhash. Instead, each bhash2 bucket is linked to the corresponding bhash bucket. Then, we pass the bhash bucket to bhash2 allocation functions as tb. However, tb is already used in inet_bind2_bucket_create() and inet_bind2_bucket_init() as the bhash2 bucket. To make the following diff clear, let's use tb2 for the bhash2 bucket there. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4e39e3f905b4..0a9919755709 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -100,30 +100,30 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net tb->l3mdev == l3mdev; } -static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, +static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, struct net *net, struct inet_bind_hashbucket *head, unsigned short port, int l3mdev, const struct sock *sk) { - write_pnet(&tb->ib_net, net); - tb->l3mdev = l3mdev; - tb->port = port; + write_pnet(&tb2->ib_net, net); + tb2->l3mdev = l3mdev; + tb2->port = port; #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); if (sk->sk_family == AF_INET6) { - tb->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; + tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); + tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; } else { - tb->addr_type = IPV6_ADDR_MAPPED; - ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb->v6_rcv_saddr); + tb2->addr_type = IPV6_ADDR_MAPPED; + ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); } #else - tb->rcv_saddr = sk->sk_rcv_saddr; + tb2->rcv_saddr = sk->sk_rcv_saddr; #endif - INIT_HLIST_HEAD(&tb->owners); - INIT_HLIST_HEAD(&tb->deathrow); - hlist_add_head(&tb->node, &head->chain); + INIT_HLIST_HEAD(&tb2->owners); + INIT_HLIST_HEAD(&tb2->deathrow); + hlist_add_head(&tb2->node, &head->chain); } struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, @@ -133,12 +133,12 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, int l3mdev, const struct sock *sk) { - struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); + struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); - if (tb) - inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk); + if (tb2) + inet_bind2_bucket_init(tb2, net, head, port, l3mdev, sk); - return tb; + return tb2; } /* Caller must hold hashbucket lock for this tb with local BH disabled */ From 822fb91fc724786372eed4f4397409c70afc08b8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:27 +0900 Subject: [PATCH 06/12] tcp: Link bhash2 to bhash. bhash2 added a new member sk_bind2_node in struct sock to link sockets to bhash2 in addition to bhash. bhash is still needed to search conflicting sockets efficiently from a port for the wildcard address. However, bhash itself need not have sockets. If we link each bhash2 bucket to the corresponding bhash bucket, we can iterate the same set of the sockets from bhash2 via bhash. This patch links bhash2 to bhash only, and the actual use will be in the later patches. Finally, we will remove sk_bind2_node. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 +++- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/inet_hashtables.c | 21 +++++++++++---------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 260e673ede22..25ba471ba161 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -89,6 +89,7 @@ struct inet_bind_bucket { bool fast_ipv6_only; struct hlist_node node; struct hlist_head owners; + struct hlist_head bhash2; }; struct inet_bind2_bucket { @@ -104,6 +105,7 @@ struct inet_bind2_bucket { #endif /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; + struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; /* bhash has twsk in owners, but bhash2 has twsk in @@ -239,7 +241,7 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d48255875f60..8b29056f454d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -572,7 +572,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, - net, head2, port, l3mdev, sk); + net, head2, tb, sk); if (!tb2) goto fail_unlock; bhash2_created = true; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0a9919755709..7dc33dd1ba35 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -77,6 +77,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->owners); + INIT_HLIST_HEAD(&tb->bhash2); hlist_add_head(&tb->node, &head->chain); } return tb; @@ -103,12 +104,12 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk) { write_pnet(&tb2->ib_net, net); - tb2->l3mdev = l3mdev; - tb2->port = port; + tb2->l3mdev = tb->l3mdev; + tb2->port = tb->port; #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); if (sk->sk_family == AF_INET6) { @@ -124,19 +125,19 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, INIT_HLIST_HEAD(&tb2->owners); INIT_HLIST_HEAD(&tb2->deathrow); hlist_add_head(&tb2->node, &head->chain); + hlist_add_head(&tb2->bhash_node, &tb->bhash2); } struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, - int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk) { struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb2) - inet_bind2_bucket_init(tb2, net, head, port, l3mdev, sk); + inet_bind2_bucket_init(tb2, net, head, tb, sk); return tb2; } @@ -146,6 +147,7 @@ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_buck { if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) { __hlist_del(&tb->node); + __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); } } @@ -273,8 +275,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); if (!tb2) { tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, - net, head2, port, - l3mdev, child); + net, head2, tb, child); if (!tb2) goto error; } @@ -954,7 +955,7 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; - inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk); + inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); } sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; @@ -1101,7 +1102,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, - head2, port, l3mdev, sk); + head2, tb, sk); if (!tb2) goto error; } From 58655bc0ad7ccdd5b53319bcc091cb81b6aee7c3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:28 +0900 Subject: [PATCH 07/12] tcp: Rearrange tests in inet_csk_bind_conflict(). The following patch adds code in the !inet_use_bhash2_on_bind(sk) case in inet_csk_bind_conflict(). To avoid adding nest and make the change cleaner, this patch rearranges tests in inet_csk_bind_conflict(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 44 ++++++++++++++++----------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8b29056f454d..0b49778b425f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -242,9 +242,10 @@ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { - bool reuseport_cb_ok; - struct sock_reuseport *reuseport_cb; kuid_t uid = sock_i_uid((struct sock *)sk); + struct sock_reuseport *reuseport_cb; + bool reuseport_cb_ok; + struct sock *sk2; rcu_read_lock(); reuseport_cb = rcu_dereference(sk->sk_reuseport_cb); @@ -252,32 +253,29 @@ static int inet_csk_bind_conflict(const struct sock *sk, reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks); rcu_read_unlock(); - /* - * Unlike other sk lookup places we do not check - * for sk_net here, since _all_ the socks listed - * in tb->owners and tb2->owners list belong - * to the same net - the one this bucket belongs to. - */ - - if (!inet_use_bhash2_on_bind(sk)) { - struct sock *sk2; - - sk_for_each_bound(sk2, &tb->owners) - if (inet_bind_conflict(sk, sk2, uid, relax, - reuseport_cb_ok, reuseport_ok) && - inet_rcv_saddr_equal(sk, sk2, true)) - return true; - - return false; - } - /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if * ipv4) should have been checked already. We need to do these two * checks separately because their spinlocks have to be acquired/released * independently of each other, to prevent possible deadlocks */ - return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, - reuseport_ok); + if (inet_use_bhash2_on_bind(sk)) + return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, + reuseport_cb_ok, reuseport_ok); + + /* Unlike other sk lookup places we do not check + * for sk_net here, since _all_ the socks listed + * in tb->owners and tb2->owners list belong + * to the same net - the one this bucket belongs to. + */ + sk_for_each_bound(sk2, &tb->owners) { + if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) + continue; + + if (inet_rcv_saddr_equal(sk, sk2, true)) + return true; + } + + return false; } /* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or From b82ba728ccfecb4cee518199597598ef385b47d5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:29 +0900 Subject: [PATCH 08/12] tcp: Iterate tb->bhash2 in inet_csk_bind_conflict(). Sockets in bhash are also linked to bhash2, but TIME_WAIT sockets are linked separately in tb2->deathrow. Let's replace tb->owners iteration in inet_csk_bind_conflict() with two iterations over tb2->owners and tb2->deathrow. This can be done safely under bhash's lock because socket insertion/ deletion in bhash2 happens with bhash's lock held. Note that twsk_for_each_bound_bhash() will be removed later. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0b49778b425f..a31f302c4cc0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -236,6 +236,14 @@ static bool inet_bhash2_conflict(const struct sock *sk, return false; } +#define sk_for_each_bound_bhash(__sk, __tb2, __tb) \ + hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ + sk_for_each_bound_bhash2(sk2, &(__tb2)->owners) + +#define twsk_for_each_bound_bhash(__sk, __tb2, __tb) \ + hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ + sk_for_each_bound_bhash2(sk2, &(__tb2)->deathrow) + /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, @@ -267,7 +275,15 @@ static int inet_csk_bind_conflict(const struct sock *sk, * in tb->owners and tb2->owners list belong * to the same net - the one this bucket belongs to. */ - sk_for_each_bound(sk2, &tb->owners) { + sk_for_each_bound_bhash(sk2, tb2, tb) { + if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) + continue; + + if (inet_rcv_saddr_equal(sk, sk2, true)) + return true; + } + + twsk_for_each_bound_bhash(sk2, tb2, tb) { if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) continue; From 8002d44fe84dd7b98812aadcbc1bd897e270355c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:30 +0900 Subject: [PATCH 09/12] tcp: Check hlist_empty(&tb->bhash2) instead of hlist_empty(&tb->owners). We use hlist_empty(&tb->owners) to check if the bhash bucket has a socket. We can check the child bhash2 buckets instead. For this to work, the bhash2 bucket must be freed before the bhash bucket. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 9 ++++----- net/ipv4/inet_hashtables.c | 6 +++--- net/ipv4/inet_timewait_sock.c | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a31f302c4cc0..1bd13dcd45ae 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -474,7 +474,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, kuid_t uid = sock_i_uid(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; - if (hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->bhash2)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; @@ -566,7 +566,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) } if (!found_port) { - if (!hlist_empty(&tb->owners)) { + if (!hlist_empty(&tb->bhash2)) { if (sk->sk_reuse == SK_FORCE_REUSE || (tb->fastreuse > 0 && reuse) || sk_reuseport_match(tb, sk)) @@ -608,11 +608,10 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) fail_unlock: if (ret) { + if (bhash2_created) + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); - if (bhash2_created) - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - tb2); } if (head2_lock_acquired) spin_unlock(&head2->lock); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 7dc33dd1ba35..355cc6c0eaab 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -88,7 +88,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, */ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) { - if (hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->bhash2)) { __hlist_del(&tb->node); kmem_cache_free(cachep, tb); } @@ -195,7 +195,6 @@ static void __inet_put_port(struct sock *sk) __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { @@ -207,6 +206,7 @@ static void __inet_put_port(struct sock *sk) } spin_unlock(&head2->lock); + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } @@ -1062,7 +1062,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; - WARN_ON(hlist_empty(&tb->owners)); + WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, port, &tw)) goto ok; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index dd37a5bf6881..466d4faa9272 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -37,11 +37,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); __hlist_del(&tw->tw_bind2_node); tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); __sock_put((struct sock *)tw); } From b2cb9f9ef240b4afaa1838b74fad077f17682f61 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:31 +0900 Subject: [PATCH 10/12] tcp: Unlink sk from bhash. Now we do not use tb->owners and can unlink sockets from bhash. sk_bind_node/tw_bind_node are available for bhash2 and will be used in the following patch. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 1 - net/ipv4/inet_hashtables.c | 3 --- net/ipv4/inet_timewait_sock.c | 8 -------- 3 files changed, 12 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 25ba471ba161..98ba728aec08 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -88,7 +88,6 @@ struct inet_bind_bucket { unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; - struct hlist_head owners; struct hlist_head bhash2; }; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 355cc6c0eaab..5c3ad37624f1 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -76,7 +76,6 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; - INIT_HLIST_HEAD(&tb->owners); INIT_HLIST_HEAD(&tb->bhash2); hlist_add_head(&tb->node, &head->chain); } @@ -169,7 +168,6 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port) { inet_sk(sk)->inet_num = port; - sk_add_bind_node(sk, &tb->owners); inet_csk(sk)->icsk_bind_hash = tb; sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; @@ -192,7 +190,6 @@ static void __inet_put_port(struct sock *sk) spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; - __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 466d4faa9272..547583a87bd3 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -35,7 +35,6 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, if (!tb) return; - __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; __hlist_del(&tw->tw_bind2_node); @@ -94,12 +93,6 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, hlist_nulls_add_head_rcu(&tw->tw_node, list); } -static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind_node, list); -} - static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw, struct hlist_head *list) { @@ -133,7 +126,6 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); - inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); tw->tw_tb2 = icsk->icsk_bind2_hash; WARN_ON(!icsk->icsk_bind2_hash); From 770041d337a85923810dd3aeebea38037a28d534 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:32 +0900 Subject: [PATCH 11/12] tcp: Link sk and twsk to tb2->owners using skc_bind_node. Now we can use sk_bind_node/tw_bind_node for bhash2, which means we need not link TIME_WAIT sockets separately. The dead code and sk_bind2_node will be removed in the next patch. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 ---------- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/inet_diag.c | 2 +- net/ipv4/inet_hashtables.c | 8 ++++---- net/ipv4/inet_timewait_sock.c | 11 ++--------- 5 files changed, 9 insertions(+), 26 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index ba6642811db4..21c86e4c71fc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -873,16 +873,6 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_add_head(&sk->sk_bind_node, list); } -static inline void __sk_del_bind2_node(struct sock *sk) -{ - __hlist_del(&sk->sk_bind2_node); -} - -static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) -{ - hlist_add_head(&sk->sk_bind2_node, list); -} - #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1bd13dcd45ae..1e19f85bce20 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -219,7 +219,7 @@ static bool inet_bhash2_conflict(const struct sock *sk, struct inet_timewait_sock *tw2; struct sock *sk2; - sk_for_each_bound_bhash2(sk2, &tb2->owners) { + sk_for_each_bound(sk2, &tb2->owners) { if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, reuseport_cb_ok, reuseport_ok)) return true; @@ -238,7 +238,7 @@ static bool inet_bhash2_conflict(const struct sock *sk, #define sk_for_each_bound_bhash(__sk, __tb2, __tb) \ hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ - sk_for_each_bound_bhash2(sk2, &(__tb2)->owners) + sk_for_each_bound(sk2, &(__tb2)->owners) #define twsk_for_each_bound_bhash(__sk, __tb2, __tb) \ hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 46b13962ad02..8e6b6aa0579e 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1104,7 +1104,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, if (!net_eq(ib2_net(tb2), net)) continue; - sk_for_each_bound_bhash2(sk, &tb2->owners) { + sk_for_each_bound(sk, &tb2->owners) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 5c3ad37624f1..4ca726a71b9d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -169,8 +169,8 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, { inet_sk(sk)->inet_num = port; inet_csk(sk)->icsk_bind_hash = tb; - sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; + sk_add_bind_node(sk, &tb2->owners); } /* @@ -197,7 +197,7 @@ static void __inet_put_port(struct sock *sk) if (inet_csk(sk)->icsk_bind2_hash) { struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; - __sk_del_bind2_node(sk); + __sk_del_bind_node(sk); inet_csk(sk)->icsk_bind2_hash = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); } @@ -937,7 +937,7 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, spin_lock_bh(&head->lock); spin_lock(&head2->lock); - __sk_del_bind2_node(sk); + __sk_del_bind_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); @@ -954,8 +954,8 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); } - sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; + sk_add_bind_node(sk, &tb2->owners); spin_unlock(&head2->lock); spin_unlock_bh(&head->lock); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 547583a87bd3..5befa4de5b24 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -35,9 +35,8 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, if (!tb) return; + __sk_del_bind_node((struct sock *)tw); tw->tw_tb = NULL; - - __hlist_del(&tw->tw_bind2_node); tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); @@ -93,12 +92,6 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, hlist_nulls_add_head_rcu(&tw->tw_node, list); } -static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw, - struct hlist_head *list) -{ - hlist_add_head(&tw->tw_bind2_node, list); -} - /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it @@ -129,7 +122,7 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, tw->tw_tb2 = icsk->icsk_bind2_hash; WARN_ON(!icsk->icsk_bind2_hash); - inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow); + sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners); spin_unlock(&bhead2->lock); spin_unlock(&bhead->lock); From 8191792c18c55ee444588100cf0464bd8c0bf5f3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:33 +0900 Subject: [PATCH 12/12] tcp: Remove dead code and fields for bhash2. Now all sockets including TIME_WAIT are linked to bhash2 using sock_common.skc_bind_node. We no longer use inet_bind2_bucket.deathrow, sock.sk_bind2_node, and inet_timewait_sock.tw_bind2_node. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 ---- include/net/inet_timewait_sock.h | 4 ---- include/net/sock.h | 4 ---- net/ipv4/inet_connection_sock.c | 21 --------------------- net/ipv4/inet_hashtables.c | 3 +-- 5 files changed, 1 insertion(+), 35 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 98ba728aec08..7f1b38458743 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -107,10 +107,6 @@ struct inet_bind2_bucket { struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; - /* bhash has twsk in owners, but bhash2 has twsk in - * deathrow not to add a member in struct sock_common. - */ - struct hlist_head deathrow; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index b14999ff55db..f28da08a37b4 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -75,13 +75,9 @@ struct inet_timewait_sock { struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; - struct hlist_node tw_bind2_node; }; #define tw_tclass tw_tos -#define twsk_for_each_bound_bhash2(__tw, list) \ - hlist_for_each_entry(__tw, list, tw_bind2_node) - static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) { return (struct inet_timewait_sock *)sk; diff --git a/include/net/sock.h b/include/net/sock.h index 21c86e4c71fc..2fb3ea2d2ec3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -352,7 +352,6 @@ struct sk_filter; * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference - * @sk_bind2_node: bind node in the bhash2 table */ struct sock { /* @@ -544,7 +543,6 @@ struct sock { #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; - struct hlist_node sk_bind2_node; }; enum sk_pacing { @@ -890,8 +888,6 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) -#define sk_for_each_bound_bhash2(__sk, list) \ - hlist_for_each_entry(__sk, list, sk_bind2_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1e19f85bce20..8e2eb1793685 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -216,7 +216,6 @@ static bool inet_bhash2_conflict(const struct sock *sk, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { - struct inet_timewait_sock *tw2; struct sock *sk2; sk_for_each_bound(sk2, &tb2->owners) { @@ -225,14 +224,6 @@ static bool inet_bhash2_conflict(const struct sock *sk, return true; } - twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) { - sk2 = (struct sock *)tw2; - - if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, - reuseport_cb_ok, reuseport_ok)) - return true; - } - return false; } @@ -240,10 +231,6 @@ static bool inet_bhash2_conflict(const struct sock *sk, hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ sk_for_each_bound(sk2, &(__tb2)->owners) -#define twsk_for_each_bound_bhash(__sk, __tb2, __tb) \ - hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ - sk_for_each_bound_bhash2(sk2, &(__tb2)->deathrow) - /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb, @@ -283,14 +270,6 @@ static int inet_csk_bind_conflict(const struct sock *sk, return true; } - twsk_for_each_bound_bhash(sk2, tb2, tb) { - if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) - continue; - - if (inet_rcv_saddr_equal(sk, sk2, true)) - return true; - } - return false; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4ca726a71b9d..93e9193df544 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -122,7 +122,6 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, tb2->rcv_saddr = sk->sk_rcv_saddr; #endif INIT_HLIST_HEAD(&tb2->owners); - INIT_HLIST_HEAD(&tb2->deathrow); hlist_add_head(&tb2->node, &head->chain); hlist_add_head(&tb2->bhash_node, &tb->bhash2); } @@ -144,7 +143,7 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { - if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) { + if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb);