From 1c07dbb0cccfe85060b6eb089db3d6bfeb6aaf31 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:33 +0000 Subject: [PATCH 01/13] net: annotate data-races around dev->name_assign_type name_assign_type_show() runs locklessly, we should annotate accesses to dev->name_assign_type. Alternative would be to grab devnet_rename_sem semaphore from name_assign_type_show(), but this would not bring more accuracy. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/core/net-sysfs.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 2d02ca8a3da5..720bd6838212 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1228,13 +1228,13 @@ int dev_change_name(struct net_device *dev, const char *newname) dev->flags & IFF_UP ? " (while UP)" : ""); old_assign_type = dev->name_assign_type; - dev->name_assign_type = NET_NAME_RENAMED; + WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); up_write(&devnet_rename_sem); return ret; } @@ -1263,7 +1263,7 @@ int dev_change_name(struct net_device *dev, const char *newname) down_write(&devnet_rename_sem); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a09d507c5b03..f4c2b8267495 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -125,7 +125,7 @@ static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sysfs_emit(buf, fmt_dec, dev->name_assign_type); + return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); } static ssize_t name_assign_type_show(struct device *dev, @@ -135,7 +135,7 @@ static ssize_t name_assign_type_show(struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (ndev->name_assign_type != NET_NAME_UNKNOWN) + if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; From f694eee9e1c00d6ca06c5e59c04e3b6ff7d64aa9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:34 +0000 Subject: [PATCH 02/13] ip_tunnel: annotate data-races around t->parms.link t->parms.link is read locklessly, annotate these reads and opposite writes accordingly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 58de780ab0e2..9f44c49a61de 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -102,10 +102,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else - cand = t; + cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { @@ -117,9 +116,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else if (!cand) + if (!cand) cand = t; } @@ -137,9 +136,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else if (!cand) + if (!cand) cand = t; } @@ -150,9 +149,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, !(t->dev->flags & IFF_UP)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else if (!cand) + if (!cand) cand = t; } @@ -221,7 +220,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - link == t->parms.link && + link == READ_ONCE(t->parms.link) && type == t->dev->type && ip_tunnel_key_match(&t->parms, flags, key)) break; @@ -747,7 +746,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), - dev_net(dev), tunnel->parms.link, + dev_net(dev), READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) @@ -867,7 +866,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (t->parms.link != p->link || t->fwmark != fwmark) { int mtu; - t->parms.link = p->link; + WRITE_ONCE(t->parms.link, p->link); t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) @@ -1057,9 +1056,9 @@ EXPORT_SYMBOL(ip_tunnel_get_link_net); int ip_tunnel_get_iflink(const struct net_device *dev) { - struct ip_tunnel *tunnel = netdev_priv(dev); + const struct ip_tunnel *tunnel = netdev_priv(dev); - return tunnel->parms.link; + return READ_ONCE(tunnel->parms.link); } EXPORT_SYMBOL(ip_tunnel_get_iflink); From a6473fe9b623f6667af72d972b87cd9a5ff87e21 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:35 +0000 Subject: [PATCH 03/13] dev: annotate accesses to dev->link Following patch will read dev->link locklessly, annotate the write from do_setlink(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6f1c5537e842..fd47a4422a51 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2956,7 +2956,7 @@ static int do_setlink(const struct sk_buff *skb, write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; - dev->link_mode = value; + WRITE_ONCE(dev->link_mode, value); write_unlock(&dev_base_lock); } From 4d42b37def70327b2bb19f823d42289aed2cd7c7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:36 +0000 Subject: [PATCH 04/13] net: convert dev->reg_state to u8 Prepares things so that dev->reg_state reads can be lockless, by adding WRITE_ONCE() on write side. READ_ONCE()/WRITE_ONCE() do not support bitfields. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 23 ++++++++++++++--------- net/core/dev.c | 8 ++++---- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a3f9c95da51e..631124655107 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1815,6 +1815,15 @@ enum netdev_stat_type { NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ }; +enum netdev_reg_state { + NETREG_UNINITIALIZED = 0, + NETREG_REGISTERED, /* completed register_netdevice */ + NETREG_UNREGISTERING, /* called unregister_netdevice */ + NETREG_UNREGISTERED, /* completed unregister todo */ + NETREG_RELEASED, /* called free_netdev */ + NETREG_DUMMY, /* dummy device for NAPI poll */ +}; + /** * struct net_device - The DEVICE structure. * @@ -2372,13 +2381,7 @@ struct net_device { struct list_head link_watch_list; - enum { NETREG_UNINITIALIZED=0, - NETREG_REGISTERED, /* completed register_netdevice */ - NETREG_UNREGISTERING, /* called unregister_netdevice */ - NETREG_UNREGISTERED, /* completed unregister todo */ - NETREG_RELEASED, /* called free_netdev */ - NETREG_DUMMY, /* dummy device for NAPI poll */ - } reg_state:8; + u8 reg_state; bool dismantle; @@ -5254,7 +5257,9 @@ static inline const char *netdev_name(const struct net_device *dev) static inline const char *netdev_reg_state(const struct net_device *dev) { - switch (dev->reg_state) { + u8 reg_state = READ_ONCE(dev->reg_state); + + switch (reg_state) { case NETREG_UNINITIALIZED: return " (uninitialized)"; case NETREG_REGISTERED: return ""; case NETREG_UNREGISTERING: return " (unregistering)"; @@ -5263,7 +5268,7 @@ static inline const char *netdev_reg_state(const struct net_device *dev) case NETREG_DUMMY: return " (dummy)"; } - WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, dev->reg_state); + WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, reg_state); return " (unknown)"; } diff --git a/net/core/dev.c b/net/core/dev.c index 720bd6838212..9c95cae9d6ab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10339,7 +10339,7 @@ int register_netdevice(struct net_device *dev) ret = netdev_register_kobject(dev); write_lock(&dev_base_lock); - dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; + WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); write_unlock(&dev_base_lock); if (ret) goto err_uninit_notify; @@ -10630,7 +10630,7 @@ void netdev_run_todo(void) } write_lock(&dev_base_lock); - dev->reg_state = NETREG_UNREGISTERED; + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); write_unlock(&dev_base_lock); linkwatch_sync_dev(dev); } @@ -11050,7 +11050,7 @@ void free_netdev(struct net_device *dev) } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; + WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); @@ -11140,7 +11140,7 @@ void unregister_netdevice_many_notify(struct list_head *head, /* And unlink it from device chain. */ write_lock(&dev_base_lock); unlist_netdevice(dev, false); - dev->reg_state = NETREG_UNREGISTERING; + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); write_unlock(&dev_base_lock); } flush_all_backlogs(); From 12692e3df2dacf2993c56aa23b6d3de921a5bdff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:37 +0000 Subject: [PATCH 05/13] net-sysfs: convert netdev_show() to RCU Make clear dev_isalive() can be called with RCU protection. Then convert netdev_show() to RCU, to remove dev_base_lock dependency. Also add RCU to broadcast_show(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f4c2b8267495..678e4be69082 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -34,10 +34,10 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; -/* Caller holds RTNL or dev_base_lock */ +/* Caller holds RTNL, RCU or dev_base_lock */ static inline int dev_isalive(const struct net_device *dev) { - return dev->reg_state <= NETREG_REGISTERED; + return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } /* use same locking rules as GIF* ioctl's */ @@ -48,10 +48,10 @@ static ssize_t netdev_show(const struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(ndev)) ret = (*format)(ndev, buf); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } @@ -60,7 +60,7 @@ static ssize_t netdev_show(const struct device *dev, #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sysfs_emit(buf, format_string, dev->field); \ + return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -161,10 +161,13 @@ static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); + int ret = -EINVAL; + rcu_read_lock(); if (dev_isalive(ndev)) - return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); - return -EINVAL; + ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); + rcu_read_unlock(); + return ret; } static DEVICE_ATTR_RO(broadcast); From c7d52737e7ebd31cc5fef46380d94b58becf9479 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:38 +0000 Subject: [PATCH 06/13] net-sysfs: use dev_addr_sem to remove races in address_show() Using dev_base_lock is not preventing from reading garbage. Use dev_addr_sem instead. v4: place dev_addr_sem extern in net/core/dev.h (Jakub Kicinski) Link: https://lore.kernel.org/netdev/20240212175845.10f6680a@kernel.org/ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/dev.h | 3 +++ net/core/net-sysfs.c | 10 +++++++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9c95cae9d6ab..26f93446b743 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8993,7 +8993,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, } EXPORT_SYMBOL(dev_set_mac_address); -static DECLARE_RWSEM(dev_addr_sem); +DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) diff --git a/net/core/dev.h b/net/core/dev.h index a43dfe3de50e..45892267848d 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -3,6 +3,7 @@ #define _NET_CORE_DEV_H #include +#include struct net; struct net_device; @@ -46,6 +47,8 @@ extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; +extern struct rw_semaphore dev_addr_sem; + /* rtnl helpers */ extern struct list_head net_todo_list; void netdev_run_todo(void); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 678e4be69082..23ef2df549c3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -142,17 +142,21 @@ static ssize_t name_assign_type_show(struct device *dev, } static DEVICE_ATTR_RO(name_assign_type); -/* use same locking rules as GIFHWADDR ioctl's */ +/* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + down_read(&dev_addr_sem); + + rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); - read_unlock(&dev_base_lock); + rcu_read_unlock(); + + up_read(&dev_addr_sem); return ret; } static DEVICE_ATTR_RO(address); From 004d138364fd10dd5ff8ceb54cfdc2d792a7b338 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:39 +0000 Subject: [PATCH 07/13] net-sysfs: convert dev->operstate reads to lockless ones operstate_show() can omit dev_base_lock acquisition only to read dev->operstate. Annotate accesses to dev->operstate. Writers still acquire dev_base_lock for mutual exclusion. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 3 ++- net/core/link_watch.c | 4 ++-- net/core/net-sysfs.c | 4 +--- net/core/rtnetlink.c | 4 ++-- net/hsr/hsr_device.c | 10 +++++----- net/ipv6/addrconf.c | 2 +- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 5ad4abfcb7ba..2cf4fc756263 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -455,7 +455,8 @@ static int br_fill_ifinfo(struct sk_buff *skb, u32 filter_mask, const struct net_device *dev, bool getlink) { - u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + u8 operstate = netif_running(dev) ? READ_ONCE(dev->operstate) : + IF_OPER_DOWN; struct nlattr *af = NULL; struct net_bridge *br; struct ifinfomsg *hdr; diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 429571c258da..1b93e054c9a3 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -67,7 +67,7 @@ static void rfc2863_policy(struct net_device *dev) { unsigned char operstate = default_operstate(dev); - if (operstate == dev->operstate) + if (operstate == READ_ONCE(dev->operstate)) return; write_lock(&dev_base_lock); @@ -87,7 +87,7 @@ static void rfc2863_policy(struct net_device *dev) break; } - dev->operstate = operstate; + WRITE_ONCE(dev->operstate, operstate); write_unlock(&dev_base_lock); } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 23ef2df549c3..c5d164b8c6bf 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -325,11 +325,9 @@ static ssize_t operstate_show(struct device *dev, const struct net_device *netdev = to_net_dev(dev); unsigned char operstate; - read_lock(&dev_base_lock); - operstate = netdev->operstate; + operstate = READ_ONCE(netdev->operstate); if (!netif_running(netdev)) operstate = IF_OPER_DOWN; - read_unlock(&dev_base_lock); if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fd47a4422a51..43d92de8601c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -866,9 +866,9 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; } - if (dev->operstate != operstate) { + if (READ_ONCE(dev->operstate) != operstate) { write_lock(&dev_base_lock); - dev->operstate = operstate; + WRITE_ONCE(dev->operstate, operstate); write_unlock(&dev_base_lock); netdev_state_change(dev); } diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 9d71b66183da..be0e43f46556 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -31,8 +31,8 @@ static bool is_slave_up(struct net_device *dev) static void __hsr_set_operstate(struct net_device *dev, int transition) { write_lock(&dev_base_lock); - if (dev->operstate != transition) { - dev->operstate = transition; + if (READ_ONCE(dev->operstate) != transition) { + WRITE_ONCE(dev->operstate, transition); write_unlock(&dev_base_lock); netdev_state_change(dev); } else { @@ -78,14 +78,14 @@ static void hsr_check_announce(struct net_device *hsr_dev, hsr = netdev_priv(hsr_dev); - if (hsr_dev->operstate == IF_OPER_UP && old_operstate != IF_OPER_UP) { + if (READ_ONCE(hsr_dev->operstate) == IF_OPER_UP && old_operstate != IF_OPER_UP) { /* Went up */ hsr->announce_count = 0; mod_timer(&hsr->announce_timer, jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL)); } - if (hsr_dev->operstate != IF_OPER_UP && old_operstate == IF_OPER_UP) + if (READ_ONCE(hsr_dev->operstate) != IF_OPER_UP && old_operstate == IF_OPER_UP) /* Went down */ del_timer(&hsr->announce_timer); } @@ -100,7 +100,7 @@ void hsr_check_carrier_and_operstate(struct hsr_priv *hsr) /* netif_stacked_transfer_operstate() cannot be used here since * it doesn't set IF_OPER_LOWERLAYERDOWN (?) */ - old_operstate = master->dev->operstate; + old_operstate = READ_ONCE(master->dev->operstate); has_carrier = hsr_check_carrier(master); hsr_set_operstate(master, has_carrier); hsr_check_announce(master->dev, old_operstate); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ca1b719323c0..2c1ed642e3f4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6025,7 +6025,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? dev->operstate : IF_OPER_DOWN)) + netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN)) goto nla_put_failure; protoinfo = nla_nest_start_noflag(skb, IFLA_PROTINFO); if (!protoinfo) From e154bb7a6ebbe5414accb5d94dc5ba80c204ea64 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:40 +0000 Subject: [PATCH 08/13] net-sysfs: convert netstat_show() to RCU dev_get_stats() can be called from RCU, there is no need to acquire dev_base_lock. Change dev_isalive() comment to reflect we no longer use dev_base_lock from net/core/net-sysfs.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c5d164b8c6bf..946caefdd959 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -34,7 +34,7 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; -/* Caller holds RTNL, RCU or dev_base_lock */ +/* Caller holds RTNL or RCU */ static inline int dev_isalive(const struct net_device *dev) { return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; @@ -685,14 +685,14 @@ static ssize_t netstat_show(const struct device *d, WARN_ON(offset > sizeof(struct rtnl_link_stats64) || offset % sizeof(u64) != 0); - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } From 328771deab16fcac55763309bb59e28b1c050853 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:41 +0000 Subject: [PATCH 09/13] net: remove stale mentions of dev_base_lock in comments Change comments incorrectly mentioning dev_base_lock. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/netdevices.rst | 4 ++-- drivers/net/ethernet/cisco/enic/enic_main.c | 2 +- drivers/net/ethernet/nvidia/forcedeth.c | 4 ++-- drivers/net/ethernet/sfc/efx_common.c | 2 +- drivers/net/ethernet/sfc/falcon/efx.c | 2 +- drivers/net/ethernet/sfc/siena/efx_common.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 9e4cccb90b87..c2476917a6c3 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -252,8 +252,8 @@ ndo_eth_ioctl: Context: process ndo_get_stats: - Synchronization: rtnl_lock() semaphore, dev_base_lock rwlock, or RCU. - Context: atomic (can't sleep under rwlock or RCU) + Synchronization: rtnl_lock() semaphore, or RCU. + Context: atomic (can't sleep under RCU) ndo_start_xmit: Synchronization: __netif_tx_lock spinlock. diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 37bd38d772e8..d266a87297a5 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -872,7 +872,7 @@ static netdev_tx_t enic_hard_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } -/* dev_base_lock rwlock held, nominally process context */ +/* rcu_read_lock potentially held, nominally process context */ static void enic_get_stats(struct net_device *netdev, struct rtnl_link_stats64 *net_stats) { diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index 7a549b834e97..31f896c4aa26 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -1761,7 +1761,7 @@ static void nv_get_stats(int cpu, struct fe_priv *np, /* * nv_get_stats64: dev->ndo_get_stats64 function * Get latest stats value from the nic. - * Called with read_lock(&dev_base_lock) held for read - + * Called with rcu_read_lock() held - * only synchronized against unregister_netdevice. */ static void @@ -3090,7 +3090,7 @@ static void set_bufsize(struct net_device *dev) /* * nv_change_mtu: dev->change_mtu function - * Called with dev_base_lock held for read. + * Called with RTNL held for read. */ static int nv_change_mtu(struct net_device *dev, int new_mtu) { diff --git a/drivers/net/ethernet/sfc/efx_common.c b/drivers/net/ethernet/sfc/efx_common.c index 175bd9cdfdac..551f890db90a 100644 --- a/drivers/net/ethernet/sfc/efx_common.c +++ b/drivers/net/ethernet/sfc/efx_common.c @@ -595,7 +595,7 @@ void efx_stop_all(struct efx_nic *efx) efx_stop_datapath(efx); } -/* Context: process, dev_base_lock or RTNL held, non-blocking. */ +/* Context: process, rcu_read_lock or RTNL held, non-blocking. */ void efx_net_stats(struct net_device *net_dev, struct rtnl_link_stats64 *stats) { struct efx_nic *efx = efx_netdev_priv(net_dev); diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c index e001f27085c6..1cb32aedd89c 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.c +++ b/drivers/net/ethernet/sfc/falcon/efx.c @@ -2085,7 +2085,7 @@ int ef4_net_stop(struct net_device *net_dev) return 0; } -/* Context: process, dev_base_lock or RTNL held, non-blocking. */ +/* Context: process, rcu_read_lock or RTNL held, non-blocking. */ static void ef4_net_stats(struct net_device *net_dev, struct rtnl_link_stats64 *stats) { diff --git a/drivers/net/ethernet/sfc/siena/efx_common.c b/drivers/net/ethernet/sfc/siena/efx_common.c index e4b294b8e9ac..88e5bc347a44 100644 --- a/drivers/net/ethernet/sfc/siena/efx_common.c +++ b/drivers/net/ethernet/sfc/siena/efx_common.c @@ -605,7 +605,7 @@ static size_t efx_siena_update_stats_atomic(struct efx_nic *efx, u64 *full_stats return efx->type->update_stats(efx, full_stats, core_stats); } -/* Context: process, dev_base_lock or RTNL held, non-blocking. */ +/* Context: process, rcu_read_lock or RTNL held, non-blocking. */ void efx_siena_net_stats(struct net_device *net_dev, struct rtnl_link_stats64 *stats) { From 6a2968ee1ee2cc6fce30f6f5724442b34b1483b3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:42 +0000 Subject: [PATCH 10/13] net: add netdev_set_operstate() helper dev_base_lock is going away, add netdev_set_operstate() helper so that hsr does not have to know core internals. Remove dev_base_lock acquisition from rfc2863_policy() v3: use an "unsigned int" for dev->operstate, so that try_cmpxchg() can work on all arches. ( https://lore.kernel.org/oe-kbuild-all/202402081918.OLyGaea3-lkp@intel.com/ ) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- include/linux/rtnetlink.h | 2 ++ net/core/link_watch.c | 9 ++------- net/core/rtnetlink.c | 22 +++++++++++++++------- net/hsr/hsr_device.c | 22 ++++++---------------- 5 files changed, 26 insertions(+), 31 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 631124655107..697370706a82 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2258,7 +2258,7 @@ struct net_device { const struct tlsdev_ops *tlsdev_ops; #endif - unsigned char operstate; + unsigned int operstate; unsigned char link_mode; unsigned char if_port; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 21780608cf47..cdfc897f1e3c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -172,4 +172,6 @@ rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group) return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group); } +void netdev_set_operstate(struct net_device *dev, int newstate); + #endif /* __LINUX_RTNETLINK_H */ diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1b93e054c9a3..8ec35194bfcb 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -33,7 +33,7 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); -static unsigned char default_operstate(const struct net_device *dev) +static unsigned int default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; @@ -62,16 +62,13 @@ static unsigned char default_operstate(const struct net_device *dev) return IF_OPER_UP; } - static void rfc2863_policy(struct net_device *dev) { - unsigned char operstate = default_operstate(dev); + unsigned int operstate = default_operstate(dev); if (operstate == READ_ONCE(dev->operstate)) return; - write_lock(&dev_base_lock); - switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) @@ -88,8 +85,6 @@ static void rfc2863_policy(struct net_device *dev) } WRITE_ONCE(dev->operstate, operstate); - - write_unlock(&dev_base_lock); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 43d92de8601c..e484ba44f23b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -842,9 +842,22 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); +void netdev_set_operstate(struct net_device *dev, int newstate) +{ + unsigned int old = READ_ONCE(dev->operstate); + + do { + if (old == newstate) + return; + } while (!try_cmpxchg(&dev->operstate, &old, newstate)); + + netdev_state_change(dev); +} +EXPORT_SYMBOL(netdev_set_operstate); + static void set_operstate(struct net_device *dev, unsigned char transition) { - unsigned char operstate = dev->operstate; + unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: @@ -866,12 +879,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; } - if (READ_ONCE(dev->operstate) != operstate) { - write_lock(&dev_base_lock); - WRITE_ONCE(dev->operstate, operstate); - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } + netdev_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index be0e43f46556..5ef6d437db72 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -28,29 +28,19 @@ static bool is_slave_up(struct net_device *dev) return dev && is_admin_up(dev) && netif_oper_up(dev); } -static void __hsr_set_operstate(struct net_device *dev, int transition) -{ - write_lock(&dev_base_lock); - if (READ_ONCE(dev->operstate) != transition) { - WRITE_ONCE(dev->operstate, transition); - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } else { - write_unlock(&dev_base_lock); - } -} - static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) { - if (!is_admin_up(master->dev)) { - __hsr_set_operstate(master->dev, IF_OPER_DOWN); + struct net_device *dev = master->dev; + + if (!is_admin_up(dev)) { + netdev_set_operstate(dev, IF_OPER_DOWN); return; } if (has_carrier) - __hsr_set_operstate(master->dev, IF_OPER_UP); + netdev_set_operstate(dev, IF_OPER_UP); else - __hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN); + netdev_set_operstate(dev, IF_OPER_LOWERLAYERDOWN); } static bool hsr_check_carrier(struct hsr_port *master) From 2dd4d828d648e101aaf19326afcdfee8667cb185 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:43 +0000 Subject: [PATCH 11/13] net: remove dev_base_lock from do_setlink() We hold RTNL here, and dev->link_mode readers already are using READ_ONCE(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e484ba44f23b..39e66bf3e238 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2961,11 +2961,9 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; WRITE_ONCE(dev->link_mode, value); - write_unlock(&dev_base_lock); } if (tb[IFLA_VFINFO_LIST]) { From e51b962438741f5482c82fb225c1d59136f0fd87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:44 +0000 Subject: [PATCH 12/13] net: remove dev_base_lock from register_netdevice() and friends. RTNL already protects writes to dev->reg_state, we no longer need to hold dev_base_lock to protect the readers. unlist_netdevice() second argument can be removed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 26f93446b743..02cf9fd68da6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -414,7 +414,7 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev, bool lock) +static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); @@ -427,13 +427,11 @@ static void unlist_netdevice(struct net_device *dev, bool lock) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ - if (lock) - write_lock(&dev_base_lock); + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - if (lock) - write_unlock(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -10338,9 +10336,9 @@ int register_netdevice(struct net_device *dev) goto err_ifindex_release; ret = netdev_register_kobject(dev); - write_lock(&dev_base_lock); + WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); - write_unlock(&dev_base_lock); + if (ret) goto err_uninit_notify; @@ -10629,9 +10627,7 @@ void netdev_run_todo(void) continue; } - write_lock(&dev_base_lock); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); - write_unlock(&dev_base_lock); linkwatch_sync_dev(dev); } @@ -11138,10 +11134,8 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ - write_lock(&dev_base_lock); - unlist_netdevice(dev, false); + unlist_netdevice(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); - write_unlock(&dev_base_lock); } flush_all_backlogs(); @@ -11323,7 +11317,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev, true); + unlist_netdevice(dev); synchronize_net(); From 1b3ef46cb7f2618cc0b507393220a69810f6da12 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2024 06:32:45 +0000 Subject: [PATCH 13/13] net: remove dev_base_lock dev_base_lock is not needed anymore, all remaining users also hold RTNL. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- net/core/dev.c | 39 ++++----------------------------------- 2 files changed, 4 insertions(+), 37 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 697370706a82..c541550b0e6e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3077,8 +3077,6 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev); int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); -extern rwlock_t dev_base_lock; /* Device list lock */ - #define for_each_netdev(net, d) \ list_for_each_entry(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_reverse(net, d) \ diff --git a/net/core/dev.c b/net/core/dev.c index 02cf9fd68da6..d8dd293a7a27 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -168,28 +168,6 @@ static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. - * - * Pure readers hold dev_base_lock for reading, or rcu_read_lock() - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. - * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_lock); - static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ @@ -395,12 +373,10 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock(&dev_base_lock); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); @@ -427,11 +403,9 @@ static void unlist_netdevice(struct net_device *dev) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ - write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -752,9 +726,9 @@ EXPORT_SYMBOL_GPL(dev_fill_forward_path); * @net: the applicable net namespace * @name: name to find * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The + * Find an interface by name. Must be called under RTNL semaphore. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ @@ -835,8 +809,7 @@ EXPORT_SYMBOL(netdev_get_by_name); * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. + * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) @@ -1241,15 +1214,11 @@ int dev_change_name(struct net_device *dev, const char *newname) netdev_adjacent_rename_links(dev, oldname); - write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock(&dev_base_lock); synchronize_net(); - write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret);