Merge branch 'net-sysfs-remove-the-rtnl_trylock-restart_syscall-construction'

Antoine Tenart says:

====================
net-sysfs: remove the rtnl_trylock/restart_syscall construction

The series initially aimed at improving spins (and thus delays) while
accessing net sysfs under rtnl lock contention[1]. The culprit was the
trylock/restart_syscall constructions. There wasn't much interest at the
time but it got traction recently for other reasons (lowering the rtnl
lock pressure).

Since v1[2]:

- Do not export rtnl_lock_interruptible [Stephen].
- Add netdev_warn_once messages in rx_queue_add_kobject [Jakub].

Since the RFC[1]:

- Limit the breaking of the sysfs protection to sysfs_rtnl_lock() only
  as this is not needed in the whole rtnl locking section thanks to the
  additional check on dev_isalive(). This simplifies error handling as
  well as the unlocking path.
- Used an interruptible version of rtnl_lock, as done by Jakub in
  his experiments.
- Removed a WARN_ONCE_ONCE [Greg].
- Removed explicit inline markers [Stephen].

Most of the reasoning is explained in comments added in patch 1. This
was tested by stress-testing net sysfs attributes (read/write ops) while
adding/removing queues and adding/removing veths, all in parallel. I
also used an OCP single node cluster, spawning lots of pods.

[1] https://lore.kernel.org/all/20231018154804.420823-1-atenart@kernel.org/T/
[2] https://lore.kernel.org/all/20250117102612.132644-1-atenart@kernel.org/T/
====================

Link: https://patch.msgid.link/20250204170314.146022-1-atenart@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2025-02-05 17:49:10 -08:00
commit fadbe52b3b
5 changed files with 284 additions and 118 deletions

View File

@ -658,6 +658,7 @@ struct netdev_queue {
struct Qdisc __rcu *qdisc_sleeping;
#ifdef CONFIG_SYSFS
struct kobject kobj;
const struct attribute_group **groups;
#endif
unsigned long tx_maxrate;
/*

View File

@ -43,6 +43,7 @@ extern void rtnl_lock(void);
extern void rtnl_unlock(void);
extern int rtnl_trylock(void);
extern int rtnl_is_locked(void);
extern int rtnl_lock_interruptible(void);
extern int rtnl_lock_killable(void);
extern bool refcount_dec_and_rtnl_lock(refcount_t *r);

View File

@ -16,6 +16,7 @@ struct netdev_rx_queue {
struct rps_dev_flow_table __rcu *rps_flow_table;
#endif
struct kobject kobj;
const struct attribute_group **groups;
struct net_device *dev;
netdevice_tracker dev_tracker;

View File

@ -42,6 +42,87 @@ static inline int dev_isalive(const struct net_device *dev)
return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED;
}
/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active,
* when unregistering a net device and accessing associated sysfs files. The
* potential deadlock is as follow:
*
* CPU 0 CPU 1
*
* rtnl_lock vfs_read
* unregister_netdevice_many kernfs_seq_start
* device_del / kobject_put kernfs_get_active (kn->active++)
* kernfs_drain sysfs_kf_seq_show
* wait_event( rtnl_lock
* kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release
* -> waits on CPU 1 to decrease kn->active the rtnl lock.
*
* The historical fix was to use rtnl_trylock with restart_syscall to bail out
* of sysfs operations when the lock couldn't be taken. This fixed the above
* issue as it allowed CPU 1 to bail out of the ABBA situation.
*
* But it came with performances issues, as syscalls are being restarted in
* loops when there was contention on the rtnl lock, with huge slow downs in
* specific scenarios (e.g. lots of virtual interfaces created and userspace
* daemons querying their attributes).
*
* The idea below is to bail out of the active kernfs_node protection
* (kn->active) while trying to take the rtnl lock.
*
* This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The
* net device is guaranteed to be alive if this returns successfully.
*/
static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr,
struct net_device *ndev)
{
struct kernfs_node *kn;
int ret = 0;
/* First, we hold a reference to the net device as the unregistration
* path might run in parallel. This will ensure the net device and the
* associated sysfs objects won't be freed while we try to take the rtnl
* lock.
*/
dev_hold(ndev);
/* sysfs_break_active_protection was introduced to allow self-removal of
* devices and their associated sysfs files by bailing out of the
* sysfs/kernfs protection. We do this here to allow the unregistration
* path to complete in parallel. The following takes a reference on the
* kobject and the kernfs_node being accessed.
*
* This works because we hold a reference onto the net device and the
* unregistration path will wait for us eventually in netdev_run_todo
* (outside an rtnl lock section).
*/
kn = sysfs_break_active_protection(kobj, attr);
/* We can now try to take the rtnl lock. This can't deadlock us as the
* unregistration path is able to drain sysfs files (kernfs_node) thanks
* to the above dance.
*/
if (rtnl_lock_interruptible()) {
ret = -ERESTARTSYS;
goto unbreak;
}
/* Check dismantle on the device hasn't started, otherwise deny the
* operation.
*/
if (!dev_isalive(ndev)) {
rtnl_unlock();
ret = -ENODEV;
goto unbreak;
}
/* We are now sure the device dismantle hasn't started nor that it can
* start before we exit the locking section as we hold the rtnl lock.
* There's no need to keep unbreaking the sysfs protection nor to hold
* a net device reference from that point; that was only needed to take
* the rtnl lock.
*/
unbreak:
sysfs_unbreak_active_protection(kn);
dev_put(ndev);
return ret;
}
/* use same locking rules as GIF* ioctl's */
static ssize_t netdev_show(const struct device *dev,
struct device_attribute *attr, char *buf,
@ -95,14 +176,14 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
if (ret)
goto err;
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
if (ret)
goto err;
ret = (*set)(netdev, new);
if (ret == 0)
ret = len;
if (dev_isalive(netdev)) {
ret = (*set)(netdev, new);
if (ret == 0)
ret = len;
}
rtnl_unlock();
err:
return ret;
@ -220,7 +301,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
struct net_device *netdev = to_net_dev(dev);
/* The check is also done in change_carrier; this helps returning early
* without hitting the trylock/restart in netdev_store.
* without hitting the locking section in netdev_store.
*/
if (!netdev->netdev_ops->ndo_change_carrier)
return -EOPNOTSUPP;
@ -234,8 +315,9 @@ static ssize_t carrier_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
int ret = -EINVAL;
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
if (ret)
return ret;
if (netif_running(netdev)) {
/* Synchronize carrier state with link watch,
@ -245,8 +327,8 @@ static ssize_t carrier_show(struct device *dev,
ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
}
rtnl_unlock();
rtnl_unlock();
return ret;
}
static DEVICE_ATTR_RW(carrier);
@ -258,13 +340,14 @@ static ssize_t speed_show(struct device *dev,
int ret = -EINVAL;
/* The check is also done in __ethtool_get_link_ksettings; this helps
* returning early without hitting the trylock/restart below.
* returning early without hitting the locking section below.
*/
if (!netdev->ethtool_ops->get_link_ksettings)
return ret;
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
if (ret)
return ret;
if (netif_running(netdev)) {
struct ethtool_link_ksettings cmd;
@ -284,13 +367,14 @@ static ssize_t duplex_show(struct device *dev,
int ret = -EINVAL;
/* The check is also done in __ethtool_get_link_ksettings; this helps
* returning early without hitting the trylock/restart below.
* returning early without hitting the locking section below.
*/
if (!netdev->ethtool_ops->get_link_ksettings)
return ret;
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
if (ret)
return ret;
if (netif_running(netdev)) {
struct ethtool_link_ksettings cmd;
@ -490,16 +574,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
if (len > 0 && buf[len - 1] == '\n')
--count;
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
if (ret)
return ret;
if (dev_isalive(netdev)) {
ret = dev_set_alias(netdev, buf, count);
if (ret < 0)
goto err;
ret = len;
netdev_state_change(netdev);
}
ret = dev_set_alias(netdev, buf, count);
if (ret < 0)
goto err;
ret = len;
netdev_state_change(netdev);
err:
rtnl_unlock();
@ -551,24 +634,23 @@ static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
struct netdev_phys_item_id ppid;
ssize_t ret = -EINVAL;
/* The check is also done in dev_get_phys_port_id; this helps returning
* early without hitting the trylock/restart below.
* early without hitting the locking section below.
*/
if (!netdev->netdev_ops->ndo_get_phys_port_id)
return -EOPNOTSUPP;
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
if (ret)
return ret;
if (dev_isalive(netdev)) {
struct netdev_phys_item_id ppid;
ret = dev_get_phys_port_id(netdev, &ppid);
if (!ret)
ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
ret = dev_get_phys_port_id(netdev, &ppid);
if (!ret)
ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
}
rtnl_unlock();
return ret;
@ -580,24 +662,23 @@ static ssize_t phys_port_name_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
ssize_t ret = -EINVAL;
char name[IFNAMSIZ];
/* The checks are also done in dev_get_phys_port_name; this helps
* returning early without hitting the trylock/restart below.
* returning early without hitting the locking section below.
*/
if (!netdev->netdev_ops->ndo_get_phys_port_name &&
!netdev->devlink_port)
return -EOPNOTSUPP;
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
if (ret)
return ret;
if (dev_isalive(netdev)) {
char name[IFNAMSIZ];
ret = dev_get_phys_port_name(netdev, name, sizeof(name));
if (!ret)
ret = sysfs_emit(buf, "%s\n", name);
ret = dev_get_phys_port_name(netdev, name, sizeof(name));
if (!ret)
ret = sysfs_emit(buf, "%s\n", name);
}
rtnl_unlock();
return ret;
@ -608,26 +689,25 @@ static ssize_t phys_switch_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
struct netdev_phys_item_id ppid = { };
ssize_t ret = -EINVAL;
/* The checks are also done in dev_get_phys_port_name; this helps
* returning early without hitting the trylock/restart below. This works
* returning early without hitting the locking section below. This works
* because recurse is false when calling dev_get_port_parent_id.
*/
if (!netdev->netdev_ops->ndo_get_port_parent_id &&
!netdev->devlink_port)
return -EOPNOTSUPP;
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
if (ret)
return ret;
if (dev_isalive(netdev)) {
struct netdev_phys_item_id ppid = { };
ret = dev_get_port_parent_id(netdev, &ppid, false);
if (!ret)
ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
ret = dev_get_port_parent_id(netdev, &ppid, false);
if (!ret)
ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
}
rtnl_unlock();
return ret;
@ -1108,7 +1188,6 @@ static void rx_queue_get_ownership(const struct kobject *kobj,
static const struct kobj_type rx_queue_ktype = {
.sysfs_ops = &rx_queue_sysfs_ops,
.release = rx_queue_release,
.default_groups = rx_queue_default_groups,
.namespace = rx_queue_namespace,
.get_ownership = rx_queue_get_ownership,
};
@ -1131,6 +1210,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
struct kobject *kobj = &queue->kobj;
int error = 0;
/* Rx queues are cleared in rx_queue_release to allow later
* re-registration. This is triggered when their kobj refcount is
* dropped.
*
* If a queue is removed while both a read (or write) operation and a
* the re-addition of the same queue are pending (waiting on rntl_lock)
* it might happen that the re-addition will execute before the read,
* making the initial removal to never happen (queue's kobj refcount
* won't drop enough because of the pending read). In such rare case,
* return to allow the removal operation to complete.
*/
if (unlikely(kobj->state_initialized)) {
netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed");
return -EAGAIN;
}
/* Kobject_put later will trigger rx_queue_release call which
* decreases dev refcount: Take that reference here
*/
@ -1142,20 +1237,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
if (error)
goto err;
queue->groups = rx_queue_default_groups;
error = sysfs_create_groups(kobj, queue->groups);
if (error)
goto err;
if (dev->sysfs_rx_queue_group) {
error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
if (error)
goto err;
goto err_default_groups;
}
error = rx_queue_default_mask(dev, queue);
if (error)
goto err;
goto err_default_groups;
kobject_uevent(kobj, KOBJ_ADD);
return error;
err_default_groups:
sysfs_remove_groups(kobj, queue->groups);
err:
kobject_put(kobj);
return error;
@ -1200,12 +1302,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
}
while (--i >= new_num) {
struct kobject *kobj = &dev->_rx[i].kobj;
struct netdev_rx_queue *queue = &dev->_rx[i];
struct kobject *kobj = &queue->kobj;
if (!refcount_read(&dev_net(dev)->ns.count))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
sysfs_remove_groups(kobj, queue->groups);
kobject_put(kobj);
}
@ -1244,9 +1348,11 @@ static int net_rx_queue_change_owner(struct net_device *dev, int num,
*/
struct netdev_queue_attribute {
struct attribute attr;
ssize_t (*show)(struct netdev_queue *queue, char *buf);
ssize_t (*store)(struct netdev_queue *queue,
const char *buf, size_t len);
ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf);
ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, const char *buf,
size_t len);
};
#define to_netdev_queue_attr(_attr) \
container_of(_attr, struct netdev_queue_attribute, attr)
@ -1263,7 +1369,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
return attribute->show(queue, buf);
return attribute->show(kobj, attr, queue, buf);
}
static ssize_t netdev_queue_attr_store(struct kobject *kobj,
@ -1277,7 +1383,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
return attribute->store(queue, buf, count);
return attribute->store(kobj, attr, queue, buf, count);
}
static const struct sysfs_ops netdev_queue_sysfs_ops = {
@ -1285,7 +1391,8 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
.store = netdev_queue_attr_store,
};
static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout);
@ -1303,18 +1410,18 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
return i;
}
static ssize_t traffic_class_show(struct netdev_queue *queue,
char *buf)
static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
int num_tc, tc;
int index;
int num_tc, tc, index, ret;
if (!netif_is_multiqueue(dev))
return -ENOENT;
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
if (ret)
return ret;
index = get_netdev_queue_index(queue);
@ -1341,24 +1448,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
}
#ifdef CONFIG_XPS
static ssize_t tx_maxrate_show(struct netdev_queue *queue,
char *buf)
static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
return sysfs_emit(buf, "%lu\n", queue->tx_maxrate);
}
static ssize_t tx_maxrate_store(struct netdev_queue *queue,
const char *buf, size_t len)
static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, const char *buf,
size_t len)
{
struct net_device *dev = queue->dev;
int err, index = get_netdev_queue_index(queue);
struct net_device *dev = queue->dev;
u32 rate = 0;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
/* The check is also done later; this helps returning early without
* hitting the trylock/restart below.
* hitting the locking section below.
*/
if (!dev->netdev_ops->ndo_set_tx_maxrate)
return -EOPNOTSUPP;
@ -1367,18 +1475,21 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue,
if (err < 0)
return err;
if (!rtnl_trylock())
return restart_syscall();
err = sysfs_rtnl_lock(kobj, attr, dev);
if (err)
return err;
err = -EOPNOTSUPP;
if (dev->netdev_ops->ndo_set_tx_maxrate)
err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate);
rtnl_unlock();
if (!err) {
queue->tx_maxrate = rate;
rtnl_unlock();
return len;
}
rtnl_unlock();
return err;
}
@ -1422,16 +1533,17 @@ static ssize_t bql_set(const char *buf, const size_t count,
return count;
}
static ssize_t bql_show_hold_time(struct netdev_queue *queue,
char *buf)
static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
}
static ssize_t bql_set_hold_time(struct netdev_queue *queue,
const char *buf, size_t len)
static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, const char *buf,
size_t len)
{
struct dql *dql = &queue->dql;
unsigned int value;
@ -1450,15 +1562,17 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
= __ATTR(hold_time, 0644,
bql_show_hold_time, bql_set_hold_time);
static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf)
static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));
}
static ssize_t bql_set_stall_thrs(struct netdev_queue *queue,
const char *buf, size_t len)
static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, const char *buf,
size_t len)
{
struct dql *dql = &queue->dql;
unsigned int value;
@ -1484,13 +1598,15 @@ static ssize_t bql_set_stall_thrs(struct netdev_queue *queue,
static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =
__ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs);
static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf)
static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max));
}
static ssize_t bql_set_stall_max(struct netdev_queue *queue,
const char *buf, size_t len)
static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, const char *buf,
size_t len)
{
WRITE_ONCE(queue->dql.stall_max, 0);
return len;
@ -1499,7 +1615,8 @@ static ssize_t bql_set_stall_max(struct netdev_queue *queue,
static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init =
__ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max);
static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)
static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
@ -1509,8 +1626,8 @@ static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)
static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init =
__ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL);
static ssize_t bql_show_inflight(struct netdev_queue *queue,
char *buf)
static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
@ -1521,13 +1638,16 @@ static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
__ATTR(inflight, 0444, bql_show_inflight, NULL);
#define BQL_ATTR(NAME, FIELD) \
static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
char *buf) \
static ssize_t bql_show_ ## NAME(struct kobject *kobj, \
struct attribute *attr, \
struct netdev_queue *queue, char *buf) \
{ \
return bql_show(buf, queue->dql.FIELD); \
} \
\
static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
static ssize_t bql_set_ ## NAME(struct kobject *kobj, \
struct attribute *attr, \
struct netdev_queue *queue, \
const char *buf, size_t len) \
{ \
return bql_set(buf, len, &queue->dql.FIELD); \
@ -1613,19 +1733,21 @@ static ssize_t xps_queue_show(struct net_device *dev, unsigned int index,
return len < PAGE_SIZE ? len : -EINVAL;
}
static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
unsigned int index;
int len, tc;
int len, tc, ret;
if (!netif_is_multiqueue(dev))
return -ENOENT;
index = get_netdev_queue_index(queue);
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
if (ret)
return ret;
/* If queue belongs to subordinate dev use its map */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
@ -1636,18 +1758,21 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
return -EINVAL;
}
/* Make sure the subordinate device can't be freed */
get_device(&dev->dev);
/* Increase the net device refcnt to make sure it won't be freed while
* xps_queue_show is running.
*/
dev_hold(dev);
rtnl_unlock();
len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);
put_device(&dev->dev);
dev_put(dev);
return len;
}
static ssize_t xps_cpus_store(struct netdev_queue *queue,
const char *buf, size_t len)
static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, const char *buf,
size_t len)
{
struct net_device *dev = queue->dev;
unsigned int index;
@ -1671,9 +1796,10 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
return err;
}
if (!rtnl_trylock()) {
err = sysfs_rtnl_lock(kobj, attr, dev);
if (err) {
free_cpumask_var(mask);
return restart_syscall();
return err;
}
err = netif_set_xps_queue(dev, mask, index);
@ -1687,26 +1813,34 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
unsigned int index;
int tc;
int tc, ret;
index = get_netdev_queue_index(queue);
if (!rtnl_trylock())
return restart_syscall();
ret = sysfs_rtnl_lock(kobj, attr, dev);
if (ret)
return ret;
tc = netdev_txq_to_tc(dev, index);
rtnl_unlock();
if (tc < 0)
return -EINVAL;
return xps_queue_show(dev, index, tc, buf, XPS_RXQS);
/* Increase the net device refcnt to make sure it won't be freed while
* xps_queue_show is running.
*/
dev_hold(dev);
rtnl_unlock();
ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL;
dev_put(dev);
return ret;
}
static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr,
struct netdev_queue *queue, const char *buf,
size_t len)
{
struct net_device *dev = queue->dev;
@ -1730,9 +1864,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
return err;
}
if (!rtnl_trylock()) {
err = sysfs_rtnl_lock(kobj, attr, dev);
if (err) {
bitmap_free(mask);
return restart_syscall();
return err;
}
cpus_read_lock();
@ -1792,7 +1927,6 @@ static void netdev_queue_get_ownership(const struct kobject *kobj,
static const struct kobj_type netdev_queue_ktype = {
.sysfs_ops = &netdev_queue_sysfs_ops,
.release = netdev_queue_release,
.default_groups = netdev_queue_default_groups,
.namespace = netdev_queue_namespace,
.get_ownership = netdev_queue_get_ownership,
};
@ -1811,6 +1945,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
struct kobject *kobj = &queue->kobj;
int error = 0;
/* Tx queues are cleared in netdev_queue_release to allow later
* re-registration. This is triggered when their kobj refcount is
* dropped.
*
* If a queue is removed while both a read (or write) operation and a
* the re-addition of the same queue are pending (waiting on rntl_lock)
* it might happen that the re-addition will execute before the read,
* making the initial removal to never happen (queue's kobj refcount
* won't drop enough because of the pending read). In such rare case,
* return to allow the removal operation to complete.
*/
if (unlikely(kobj->state_initialized)) {
netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed");
return -EAGAIN;
}
/* Kobject_put later will trigger netdev_queue_release call
* which decreases dev refcount: Take that reference here
*/
@ -1822,15 +1972,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
if (error)
goto err;
queue->groups = netdev_queue_default_groups;
error = sysfs_create_groups(kobj, queue->groups);
if (error)
goto err;
if (netdev_uses_bql(dev)) {
error = sysfs_create_group(kobj, &dql_group);
if (error)
goto err;
goto err_default_groups;
}
kobject_uevent(kobj, KOBJ_ADD);
return 0;
err_default_groups:
sysfs_remove_groups(kobj, queue->groups);
err:
kobject_put(kobj);
return error;
@ -1885,6 +2042,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
if (netdev_uses_bql(dev))
sysfs_remove_group(&queue->kobj, &dql_group);
sysfs_remove_groups(&queue->kobj, queue->groups);
kobject_put(&queue->kobj);
}

View File

@ -80,6 +80,11 @@ void rtnl_lock(void)
}
EXPORT_SYMBOL(rtnl_lock);
int rtnl_lock_interruptible(void)
{
return mutex_lock_interruptible(&rtnl_mutex);
}
int rtnl_lock_killable(void)
{
return mutex_lock_killable(&rtnl_mutex);