Merge branch 'ipv4-fib-convert-rtm_newroute-and-rtm_delroute-to-per-netns-rtnl'

Kuniyuki Iwashima says:

====================
ipv4: fib: Convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL.

Patch 1 is misc cleanup.
Patches 2 ~ 8 convert the two fib_info hash tables to per-netns.
Patches 9 ~ 12 convert rtnl_lock() to rtnl_net_lock().

v2: https://lore.kernel.org/20250226192556.21633-1-kuniyu@amazon.com
v1: https://lore.kernel.org/20250225182250.74650-1-kuniyu@amazon.com
====================

Link: https://patch.msgid.link/20250228042328.96624-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2025-03-03 15:04:14 -08:00
commit 3424291dd2
5 changed files with 159 additions and 148 deletions

View File

@ -162,6 +162,8 @@ struct fib_info {
struct fib_nh fib_nh[] __counted_by(fib_nhs);
};
int __net_init fib4_semantics_init(struct net *net);
void __net_exit fib4_semantics_exit(struct net *net);
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rule;

View File

@ -111,6 +111,9 @@ struct netns_ipv4 {
#endif
struct hlist_head *fib_table_hash;
struct sock *fibnl;
struct hlist_head *fib_info_hash;
unsigned int fib_info_hash_bits;
unsigned int fib_info_cnt;
struct sock *mc_autojoin_sk;

View File

@ -553,18 +553,16 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
const struct in_ifaddr *ifa;
struct in_device *in_dev;
in_dev = __in_dev_get_rtnl(dev);
in_dev = __in_dev_get_rtnl_net(dev);
if (!in_dev)
return -ENODEV;
*colon = ':';
rcu_read_lock();
in_dev_for_each_ifa_rcu(ifa, in_dev) {
in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) {
if (strcmp(ifa->ifa_label, devname) == 0)
break;
}
rcu_read_unlock();
if (!ifa)
return -ENODEV;
@ -635,7 +633,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
rtnl_lock();
rtnl_net_lock(net);
err = rtentry_to_fib_config(net, cmd, rt, &cfg);
if (err == 0) {
struct fib_table *tb;
@ -659,7 +657,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
/* allocated by rtentry_to_fib_config() */
kfree(cfg.fc_mx);
}
rtnl_unlock();
rtnl_net_unlock(net);
return err;
}
return -EINVAL;
@ -837,19 +835,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
}
}
if (cfg->fc_dst_len > 32) {
NL_SET_ERR_MSG(extack, "Invalid prefix length");
err = -EINVAL;
goto errout;
}
if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) {
NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length");
err = -EINVAL;
goto errout;
}
if (cfg->fc_nh_id) {
if (cfg->fc_oif || cfg->fc_gw_family ||
cfg->fc_encap || cfg->fc_mp) {
NL_SET_ERR_MSG(extack,
"Nexthop specification and nexthop id are mutually exclusive");
return -EINVAL;
err = -EINVAL;
goto errout;
}
}
if (has_gw && has_via) {
NL_SET_ERR_MSG(extack,
"Nexthop configuration can not contain both GATEWAY and VIA");
return -EINVAL;
err = -EINVAL;
goto errout;
}
if (!cfg->fc_table)
@ -872,20 +884,24 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
rtnl_net_lock(net);
if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
err = -EINVAL;
goto errout;
goto unlock;
}
tb = fib_get_table(net, cfg.fc_table);
if (!tb) {
NL_SET_ERR_MSG(extack, "FIB table does not exist");
err = -ESRCH;
goto errout;
goto unlock;
}
err = fib_table_delete(net, tb, &cfg, extack);
unlock:
rtnl_net_unlock(net);
errout:
return err;
}
@ -902,15 +918,20 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
rtnl_net_lock(net);
tb = fib_new_table(net, cfg.fc_table);
if (!tb) {
err = -ENOBUFS;
goto errout;
goto unlock;
}
err = fib_table_insert(net, tb, &cfg, extack);
if (!err && cfg.fc_type == RTN_LOCAL)
net->ipv4.fib_has_custom_local_routes = true;
unlock:
rtnl_net_unlock(net);
errout:
return err;
}
@ -1450,7 +1471,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
fib_sync_up(dev, RTNH_F_DEAD);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
rt_cache_flush(dev_net(dev));
rt_cache_flush(net);
break;
case NETDEV_DOWN:
fib_del_ifaddr(ifa, NULL);
@ -1461,7 +1482,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
*/
fib_disable_ip(dev, event, true);
} else {
rt_cache_flush(dev_net(dev));
rt_cache_flush(net);
}
break;
}
@ -1575,7 +1596,7 @@ static void ip_fib_net_exit(struct net *net)
{
int i;
ASSERT_RTNL();
ASSERT_RTNL_NET(net);
#ifdef CONFIG_IP_MULTIPLE_TABLES
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
@ -1615,9 +1636,15 @@ static int __net_init fib_net_init(struct net *net)
error = ip_fib_net_init(net);
if (error < 0)
goto out;
error = fib4_semantics_init(net);
if (error)
goto out_semantics;
error = nl_fib_lookup_init(net);
if (error < 0)
goto out_nlfl;
error = fib_proc_init(net);
if (error < 0)
goto out_proc;
@ -1627,9 +1654,11 @@ static int __net_init fib_net_init(struct net *net)
out_proc:
nl_fib_lookup_exit(net);
out_nlfl:
rtnl_lock();
fib4_semantics_exit(net);
out_semantics:
rtnl_net_lock(net);
ip_fib_net_exit(net);
rtnl_unlock();
rtnl_net_unlock(net);
goto out;
}
@ -1644,10 +1673,15 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list)
struct net *net;
rtnl_lock();
list_for_each_entry(net, net_list, exit_list)
list_for_each_entry(net, net_list, exit_list) {
__rtnl_net_lock(net);
ip_fib_net_exit(net);
__rtnl_net_unlock(net);
}
rtnl_unlock();
list_for_each_entry(net, net_list, exit_list)
fib4_semantics_exit(net);
}
static struct pernet_operations fib_net_ops = {
@ -1658,9 +1692,9 @@ static struct pernet_operations fib_net_ops = {
static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = {
{.protocol = PF_INET, .msgtype = RTM_NEWROUTE,
.doit = inet_rtm_newroute},
.doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET},
{.protocol = PF_INET, .msgtype = RTM_DELROUTE,
.doit = inet_rtm_delroute},
.doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET},
{.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib,
.flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
};

View File

@ -50,12 +50,6 @@
#include "fib_lookup.h"
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_info_hash_size;
static unsigned int fib_info_hash_bits;
static unsigned int fib_info_cnt;
/* for_nexthops and change_nexthops only used when nexthop object
* is not set in a fib_info. The logic within can reference fib_nh.
*/
@ -258,8 +252,7 @@ void fib_release_info(struct fib_info *fi)
ASSERT_RTNL();
if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
hlist_del(&fi->fib_hash);
fib_info_cnt--;
fi->fib_net->ipv4.fib_info_cnt--;
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
@ -335,11 +328,12 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
static unsigned int fib_info_hashfn_result(const struct net *net,
unsigned int val)
{
return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits);
return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits);
}
static inline unsigned int fib_info_hashfn(struct fib_info *fi)
static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi)
{
struct net *net = fi->fib_net;
unsigned int val;
val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
@ -354,7 +348,70 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi)
} endfor_nexthops(fi)
}
return fib_info_hashfn_result(fi->fib_net, val);
return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)];
}
/* Look up the prefsrc bucket for address @val in @net.
 *
 * The prefsrc buckets live in the second half of
 * net->ipv4.fib_info_hash[], i.e. starting at index
 * (1 << fib_info_hash_bits).
 */
static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net,
						    __be32 val)
{
	unsigned int bits = net->ipv4.fib_info_hash_bits;
	u32 mixed = net_hash_mix(net) ^ (__force u32)val;
	u32 idx = hash_32(mixed, bits);

	idx += 1 << bits;

	return &net->ipv4.fib_info_hash[idx];
}
static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits)
{
/* The second half is used for prefsrc */
return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head *),
GFP_KERNEL);
}
/* Release a table obtained from fib_info_hash_alloc(); forwards to kvfree(). */
static void fib_info_hash_free(struct hlist_head *head)
{
kvfree(head);
}
/* Double the fib_info hash table of @net once net->ipv4.fib_info_cnt has
 * reached the number of buckets in one half of the table.
 *
 * Both halves are rehashed into the new table: entries linked through
 * fib_hash go to fib_info_hash_bucket(), entries linked through fib_lhash
 * go to fib_info_laddrhash_bucket().
 *
 * If the larger table cannot be allocated the function returns without
 * reporting an error and the current table stays in use.
 */
static void fib_info_hash_grow(struct net *net)
{
unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits;
struct hlist_head *new_info_hash, *old_info_hash;
unsigned int i;
/* Still below the growth threshold: nothing to do. */
if (net->ipv4.fib_info_cnt < old_size)
return;
new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1);
if (!new_info_hash)
return;
/* Install the new table and bit count before rehashing: the bucket
 * helpers called below read both fields from @net to pick the
 * destination bucket.
 */
old_info_hash = net->ipv4.fib_info_hash;
net->ipv4.fib_info_hash = new_info_hash;
net->ipv4.fib_info_hash_bits += 1;
/* First half of the old table: entries chained via fib_hash. */
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &old_info_hash[i];
struct hlist_node *n;
struct fib_info *fi;
hlist_for_each_entry_safe(fi, n, head, fib_hash)
hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
}
/* Second half of the old table: entries chained via fib_lhash. */
for (i = 0; i < old_size; i++) {
struct hlist_head *lhead = &old_info_hash[old_size + i];
struct hlist_node *n;
struct fib_info *fi;
hlist_for_each_entry_safe(fi, n, lhead, fib_lhash)
hlist_add_head(&fi->fib_lhash,
fib_info_laddrhash_bucket(fi->fib_net,
fi->fib_prefsrc));
}
/* Entries were re-linked without unlinking them from the old buckets;
 * the old bucket array is discarded here.
 */
fib_info_hash_free(old_info_hash);
}
/* no metrics, only nexthop id */
@ -370,13 +427,12 @@ static struct fib_info *fib_find_info_nh(struct net *net,
(__force u32)cfg->fc_prefsrc,
cfg->fc_priority);
hash = fib_info_hashfn_result(net, hash);
head = &fib_info_hash[hash];
head = &net->ipv4.fib_info_hash[hash];
hlist_for_each_entry(fi, head, fib_hash) {
if (!net_eq(fi->fib_net, net))
continue;
if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
continue;
if (cfg->fc_protocol == fi->fib_protocol &&
cfg->fc_scope == fi->fib_scope &&
cfg->fc_prefsrc == fi->fib_prefsrc &&
@ -392,18 +448,13 @@ static struct fib_info *fib_find_info_nh(struct net *net,
static struct fib_info *fib_find_info(struct fib_info *nfi)
{
struct hlist_head *head;
struct hlist_head *head = fib_info_hash_bucket(nfi);
struct fib_info *fi;
unsigned int hash;
hash = fib_info_hashfn(nfi);
head = &fib_info_hash[hash];
hlist_for_each_entry(fi, head, fib_hash) {
if (!net_eq(fi->fib_net, nfi->fib_net))
continue;
if (fi->fib_nhs != nfi->fib_nhs)
continue;
if (nfi->fib_protocol == fi->fib_protocol &&
nfi->fib_scope == fi->fib_scope &&
nfi->fib_prefsrc == fi->fib_prefsrc &&
@ -1239,64 +1290,6 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
return err;
}
/* Return the bucket of the file-scope fib_info_laddrhash[] table for
 * address @val in @net, using the file-scope fib_info_hash_bits.
 */
static struct hlist_head *
fib_info_laddrhash_bucket(const struct net *net, __be32 val)
{
u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val,
fib_info_hash_bits);
return &fib_info_laddrhash[slot];
}
/* Switch the file-scope fib_info_hash[] / fib_info_laddrhash[] tables over
 * to @new_info_hash / @new_laddrhash, each of @new_size buckets, rehash
 * every entry into them, and free the previous tables.
 *
 * Asserts that RTNL is held.
 */
static void fib_info_hash_move(struct hlist_head *new_info_hash,
struct hlist_head *new_laddrhash,
unsigned int new_size)
{
struct hlist_head *old_info_hash, *old_laddrhash;
unsigned int old_size = fib_info_hash_size;
unsigned int i;
ASSERT_RTNL();
old_info_hash = fib_info_hash;
old_laddrhash = fib_info_laddrhash;
/* Update size and bit count first so that the hash computed below
 * for each entry is already an index into the new tables.
 */
fib_info_hash_size = new_size;
fib_info_hash_bits = ilog2(new_size);
/* fib_info_hash still points at the old table in this loop. */
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &fib_info_hash[i];
struct hlist_node *n;
struct fib_info *fi;
hlist_for_each_entry_safe(fi, n, head, fib_hash) {
struct hlist_head *dest;
unsigned int new_hash;
new_hash = fib_info_hashfn(fi);
dest = &new_info_hash[new_hash];
hlist_add_head(&fi->fib_hash, dest);
}
}
fib_info_hash = new_info_hash;
/* fib_info_laddrhash must be switched before the second loop because
 * fib_info_laddrhash_bucket() indexes that global.
 */
fib_info_laddrhash = new_laddrhash;
for (i = 0; i < old_size; i++) {
struct hlist_head *lhead = &old_laddrhash[i];
struct hlist_node *n;
struct fib_info *fi;
hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
struct hlist_head *ldest;
ldest = fib_info_laddrhash_bucket(fi->fib_net,
fi->fib_prefsrc);
hlist_add_head(&fi->fib_lhash, ldest);
}
}
kvfree(old_info_hash);
kvfree(old_laddrhash);
}
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
unsigned char scope)
{
@ -1409,32 +1402,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
}
#endif
err = -ENOBUFS;
if (fib_info_cnt >= fib_info_hash_size) {
unsigned int new_size = fib_info_hash_size << 1;
struct hlist_head *new_info_hash;
struct hlist_head *new_laddrhash;
size_t bytes;
if (!new_size)
new_size = 16;
bytes = (size_t)new_size * sizeof(struct hlist_head *);
new_info_hash = kvzalloc(bytes, GFP_KERNEL);
new_laddrhash = kvzalloc(bytes, GFP_KERNEL);
if (!new_info_hash || !new_laddrhash) {
kvfree(new_info_hash);
kvfree(new_laddrhash);
} else {
fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
}
if (!fib_info_hash_size)
goto failure;
}
fib_info_hash_grow(net);
fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
if (!fi)
if (!fi) {
err = -ENOBUFS;
goto failure;
}
fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack);
if (IS_ERR(fi->fib_metrics)) {
err = PTR_ERR(fi->fib_metrics);
@ -1571,9 +1546,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
refcount_set(&fi->fib_treeref, 1);
refcount_set(&fi->fib_clntref, 1);
fib_info_cnt++;
hlist_add_head(&fi->fib_hash,
&fib_info_hash[fib_info_hashfn(fi)]);
net->ipv4.fib_info_cnt++;
hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
if (fi->fib_prefsrc) {
struct hlist_head *head;
@ -1855,7 +1830,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local)
struct fib_info *fi;
int ret = 0;
if (!fib_info_laddrhash || local == 0)
if (!local)
return 0;
head = fib_info_laddrhash_bucket(net, local);
@ -2257,3 +2232,22 @@ void fib_select_path(struct net *net, struct fib_result *res,
fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
}
}
/* Per-netns setup of the fib_info hash table.
 *
 * Allocates the initial table (4 hash bits) and resets the entry counter.
 * Returns 0 on success or -ENOMEM if the table cannot be allocated.
 */
int __net_init fib4_semantics_init(struct net *net)
{
	const unsigned int initial_bits = 4;
	struct hlist_head *table = fib_info_hash_alloc(initial_bits);

	net->ipv4.fib_info_hash = table;
	if (table) {
		net->ipv4.fib_info_hash_bits = initial_bits;
		net->ipv4.fib_info_cnt = 0;
		return 0;
	}

	return -ENOMEM;
}
/* Per-netns teardown: free the table stored in net->ipv4.fib_info_hash. */
void __net_exit fib4_semantics_exit(struct net *net)
{
fib_info_hash_free(net->ipv4.fib_info_hash);
}

View File

@ -1187,22 +1187,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
return 0;
}
/* Check that @key/@plen describes a well-formed prefix.
 *
 * Fails when @plen exceeds KEYLENGTH, or when @plen is shorter than
 * KEYLENGTH and @key shifted left by @plen is non-zero.  On failure an
 * error message is stored in @extack and false is returned.
 */
static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
{
	bool trailing_bits;

	if (plen > KEYLENGTH) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		return false;
	}

	/* The shift is only evaluated for plen < KEYLENGTH. */
	trailing_bits = plen < KEYLENGTH && (key << plen) != 0;
	if (trailing_bits)
		NL_SET_ERR_MSG(extack,
			       "Invalid prefix for given prefix length");

	return !trailing_bits;
}
static void fib_remove_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *old);
@ -1223,9 +1207,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
key = ntohl(cfg->fc_dst);
if (!fib_valid_key_len(key, plen, extack))
return -EINVAL;
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
fi = fib_create_info(cfg, extack);
@ -1717,9 +1698,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
key = ntohl(cfg->fc_dst);
if (!fib_valid_key_len(key, plen, extack))
return -EINVAL;
l = fib_find_node(t, &tp, key);
if (!l)
return -ESRCH;