net: Implement netdev_nl_queue_create_doit

Implement netdev_nl_queue_create_doit, which creates a new rx queue in a
virtual netdev and then leases it to an rx queue in a physical netdev.

Example with ynl client:

  # ynl --family netdev --output-json --do queue-create \
        --json '{"ifindex": 8, "type": "rx", "lease": {"ifindex": 4, "queue": {"type": "rx", "id": 15}}}'
  {'id': 1}

Note that the netdevice locking order is always from the virtual to
the physical device.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/20260402231031.447597-3-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Daniel Borkmann 2026-04-03 01:10:19 +02:00 committed by Jakub Kicinski
parent 7789c6bb76
commit d04686d9bc
9 changed files with 323 additions and 11 deletions

View File

@ -329,6 +329,12 @@ by setting ``request_ops_lock`` to true. Code comments and docs refer
to drivers which have ops called under the instance lock as "ops locked". to drivers which have ops called under the instance lock as "ops locked".
See also the documentation of the ``lock`` member of struct net_device. See also the documentation of the ``lock`` member of struct net_device.
There is also a case of taking two per-netdev locks in sequence when netdev
queues are leased, that is, the netdev-scope lock is taken for both the
virtual and the physical device. To prevent deadlocks, the virtual device's
lock must always be acquired before the physical device's (see
``netdev_nl_queue_create_doit``).
In the future, there will be an option for individual In the future, there will be an option for individual
drivers to opt out of using ``rtnl_lock`` and instead perform their control drivers to opt out of using ``rtnl_lock`` and instead perform their control
operations directly under the netdev instance lock. operations directly under the netdev instance lock.

View File

@ -2561,7 +2561,14 @@ struct net_device {
* Also protects some fields in: * Also protects some fields in:
* struct napi_struct, struct netdev_queue, struct netdev_rx_queue * struct napi_struct, struct netdev_queue, struct netdev_rx_queue
* *
* Ordering: take after rtnl_lock. * Ordering:
*
* - take after rtnl_lock
*
* - for the case of netdev queue leasing, the netdev-scope lock is
* taken for both the virtual and the physical device; to prevent
* deadlocks, the virtual device's lock must always be acquired
* before the physical device's (see netdev_nl_queue_create_doit)
*/ */
struct mutex lock; struct mutex lock;

View File

@ -150,6 +150,11 @@ enum {
* When NIC-wide config is changed the callback will * When NIC-wide config is changed the callback will
* be invoked for all queues. * be invoked for all queues.
* *
* @ndo_queue_create: Create a new RX queue on a virtual device that will
* be paired with a physical device's queue via leasing.
* Return the new queue id on success, negative error
* on failure.
*
* @supported_params: Bitmask of supported parameters, see QCFG_*. * @supported_params: Bitmask of supported parameters, see QCFG_*.
* *
* Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
@ -178,6 +183,8 @@ struct netdev_queue_mgmt_ops {
struct netlink_ext_ack *extack); struct netlink_ext_ack *extack);
struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev,
int idx); int idx);
int (*ndo_queue_create)(struct net_device *dev,
struct netlink_ext_ack *extack);
unsigned int supported_params; unsigned int supported_params;
}; };
@ -185,7 +192,7 @@ struct netdev_queue_mgmt_ops {
void netdev_queue_config(struct net_device *dev, int rxq, void netdev_queue_config(struct net_device *dev, int rxq,
struct netdev_queue_config *qcfg); struct netdev_queue_config *qcfg);
bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx);
/** /**
* DOC: Lockless queue stopping / waking helpers. * DOC: Lockless queue stopping / waking helpers.
@ -374,5 +381,11 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq)
}) })
struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx);
bool netdev_can_create_queue(const struct net_device *dev,
#endif struct netlink_ext_ack *extack);
bool netdev_can_lease_queue(const struct net_device *dev,
struct netlink_ext_ack *extack);
bool netdev_queue_busy(struct net_device *dev, unsigned int idx,
enum netdev_queue_type type,
struct netlink_ext_ack *extack);
#endif /* _LINUX_NET_QUEUES_H */

View File

@ -31,6 +31,14 @@ struct netdev_rx_queue {
struct napi_struct *napi; struct napi_struct *napi;
struct netdev_queue_config qcfg; struct netdev_queue_config qcfg;
struct pp_memory_provider_params mp_params; struct pp_memory_provider_params mp_params;
/* If a queue is leased, then the lease pointer is always
* valid. From the physical device it points to the virtual
* queue, and from the virtual device it points to the
* physical queue.
*/
struct netdev_rx_queue *lease;
netdevice_tracker lease_tracker;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/* /*
@ -60,5 +68,8 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue)
} }
int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
#endif struct netdev_rx_queue *rxq_src);
void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
struct netdev_rx_queue *rxq_src);
#endif /* _LINUX_NETDEV_RX_QUEUE_H */

View File

@ -1121,6 +1121,14 @@ netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
return __netdev_put_lock_ops_compat(dev, net); return __netdev_put_lock_ops_compat(dev, net);
} }
/* Release the tracker entry the caller holds for @dev, then pass the device
 * on to __netdev_put_lock() and return whatever that helper returns.
 * Callers in this file treat a NULL result as "device gone" and a non-NULL
 * result as a device they must later netdev_unlock().
 */
struct net_device *
netdev_put_lock(struct net_device *dev, struct net *net,
		netdevice_tracker *tracker)
{
	struct net_device *ret;

	netdev_tracker_free(dev, tracker);
	ret = __netdev_put_lock(dev, net);

	return ret;
}
struct net_device * struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev, netdev_xa_find_lock(struct net *net, struct net_device *dev,
unsigned long *index) unsigned long *index)

View File

@ -31,6 +31,8 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id);
struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
struct net_device *netdev_put_lock(struct net_device *dev, struct net *net,
netdevice_tracker *tracker);
struct net_device * struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev, netdev_xa_find_lock(struct net *net, struct net_device *dev,
unsigned long *index); unsigned long *index);
@ -96,6 +98,9 @@ int netdev_queue_config_validate(struct net_device *dev, int rxq_idx,
struct netdev_queue_config *qcfg, struct netdev_queue_config *qcfg,
struct netlink_ext_ack *extack); struct netlink_ext_ack *extack);
bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx);
bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx);
/* netdev management, shared between various uAPI entry points */ /* netdev management, shared between various uAPI entry points */
struct netdev_name_node { struct netdev_name_node {
struct hlist_node hlist; struct hlist_node hlist;

View File

@ -1122,7 +1122,169 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
/* Handler for the netdev-genl queue-create request.
 *
 * Asks the device named by NETDEV_A_QUEUE_IFINDEX to create a new RX queue
 * through its ndo_queue_create op and pairs that queue with an existing RX
 * queue of the device described by the NETDEV_A_QUEUE_LEASE nest. On
 * success the reply carries the new queue id in NETDEV_A_QUEUE_ID and the
 * genlmsg_reply() result is returned; otherwise a negative errno is
 * returned and the reply skb is freed.
 */
int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info)
{ {
return -EOPNOTSUPP; const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1;
int err, ifindex, ifindex_lease, queue_id, queue_id_lease;
struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)];
struct netdev_rx_queue *rxq, *rxq_lease;
struct net_device *dev, *dev_lease;
netdevice_tracker dev_tracker;
s32 netns_lease = -1;
struct nlattr *nest;
struct sk_buff *rsp;
struct net *net;
void *hdr;
/* All three top-level attributes are mandatory */
if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) ||
GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) ||
GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE))
return -EINVAL;
/* Only RX queues can be created */
if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) !=
NETDEV_QUEUE_TYPE_RX) {
NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]);
return -EINVAL;
}
ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
/* Unpack the lease nest: ifindex and queue are required, the netns id
 * is optional.
 */
nest = info->attrs[NETDEV_A_QUEUE_LEASE];
err = nla_parse_nested(ltb, lmaxtype, nest,
netdev_lease_nl_policy, info->extack);
if (err < 0)
return err;
if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) ||
NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE))
return -EINVAL;
/* Naming the lease device through a netns id requires CAP_NET_ADMIN;
 * netns_lease stays at -1 when the attribute is absent.
 */
if (ltb[NETDEV_A_LEASE_NETNS_ID]) {
if (!capable(CAP_NET_ADMIN))
return -EPERM;
netns_lease = nla_get_s32(ltb[NETDEV_A_LEASE_NETNS_ID]);
}
ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]);
/* Unpack the queue nest inside the lease: id and type are both
 * required, and the type must be RX as well.
 */
nest = ltb[NETDEV_A_LEASE_QUEUE];
err = nla_parse_nested(qtb, qmaxtype, nest,
netdev_queue_id_nl_policy, info->extack);
if (err < 0)
return err;
if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) ||
NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE))
return -EINVAL;
if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]);
return -EINVAL;
}
queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]);
/* Allocate and start the reply before any device is looked up, so
 * these failures need no device unwinding.
 */
rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!rsp)
return -ENOMEM;
hdr = genlmsg_iput(rsp, info);
if (!hdr) {
err = -EMSGSIZE;
goto err_genlmsg_free;
}
/* Locking order is always from the virtual to the physical device
* since this is also the same order when applications open the
* memory provider later on.
*/
dev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
if (!dev) {
err = -ENODEV;
goto err_genlmsg_free;
}
if (!netdev_can_create_queue(dev, info->extack)) {
err = -EINVAL;
goto err_unlock_dev;
}
/* The lease device is looked up in the requester's netns unless a
 * netns id was supplied; in that case the reference returned by
 * get_net_ns_by_id() is dropped with put_net() on every exit path.
 */
net = genl_info_net(info);
if (netns_lease >= 0) {
net = get_net_ns_by_id(net, netns_lease);
if (!net) {
err = -ENONET;
goto err_unlock_dev;
}
}
/* Take a tracked reference first so the device can be vetted, then
 * hand that reference to netdev_put_lock().
 */
dev_lease = netdev_get_by_index(net, ifindex_lease, &dev_tracker,
GFP_KERNEL);
if (!dev_lease) {
err = -ENODEV;
goto err_put_netns;
}
if (!netdev_can_lease_queue(dev_lease, info->extack)) {
netdev_put(dev_lease, &dev_tracker);
err = -EINVAL;
goto err_put_netns;
}
dev_lease = netdev_put_lock(dev_lease, net, &dev_tracker);
/* A NULL result skips the dev_lease unlock label: there is nothing
 * to unlock on this path.
 */
if (!dev_lease) {
err = -ENODEV;
goto err_put_netns;
}
if (queue_id_lease >= dev_lease->real_num_rx_queues) {
err = -ERANGE;
NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]);
goto err_unlock_dev_lease;
}
if (netdev_queue_busy(dev_lease, queue_id_lease, NETDEV_QUEUE_TYPE_RX,
info->extack)) {
err = -EBUSY;
goto err_unlock_dev_lease;
}
rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease);
/* Inspect the current last RX queue of the virtual device to see
 * which device, if any, it already leases from.
 */
rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1);
/* Leasing queues from different physical devices is currently
* not supported. Capabilities such as XDP features and DMA
* device may differ between physical devices, and computing
* a correct intersection for the virtual device is not yet
* implemented.
*/
if (rxq->lease && rxq->lease->dev != dev_lease) {
err = -EOPNOTSUPP;
NL_SET_ERR_MSG(info->extack,
"Leasing queues from different devices not supported");
goto err_unlock_dev_lease;
}
/* A non-negative return value is the id of the newly created queue */
queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev, info->extack);
if (queue_id < 0) {
err = queue_id;
goto err_unlock_dev_lease;
}
rxq = __netif_get_rx_queue(dev, queue_id);
netdev_rx_queue_lease(rxq, rxq_lease);
/* NOTE(review): the nla_put_u32() result is not checked; presumably it
 * cannot fail on a freshly allocated GENLMSG_DEFAULT_SIZE message -
 * confirm.
 */
nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id);
genlmsg_end(rsp, hdr);
/* Success: release the locks in reverse order of acquisition */
netdev_unlock(dev_lease);
netdev_unlock(dev);
if (netns_lease >= 0)
put_net(net);
return genlmsg_reply(rsp, info);
/* Error unwinding, in reverse order of acquisition */
err_unlock_dev_lease:
netdev_unlock(dev_lease);
err_put_netns:
if (netns_lease >= 0)
put_net(net);
err_unlock_dev:
netdev_unlock(dev);
err_genlmsg_free:
nlmsg_free(rsp);
return err;
} }
void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)

View File

@ -1,6 +1,10 @@
// SPDX-License-Identifier: GPL-2.0-or-later // SPDX-License-Identifier: GPL-2.0-or-later
#include <net/netdev_queues.h> #include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/xdp_sock_drv.h>
#include "dev.h"
/** /**
* netdev_queue_get_dma_dev() - get dma device for zero-copy operations * netdev_queue_get_dma_dev() - get dma device for zero-copy operations
@ -25,3 +29,61 @@ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
} }
/* Decide whether a new queue may be created on @dev.
 *
 * The device must have no parent device, must provide an ndo_queue_create
 * queue management op, and must already have at least one real RX and one
 * real TX queue. On refusal the reason is stored in @extack.
 *
 * Return: true if creation is allowed, false otherwise.
 */
bool netdev_can_create_queue(const struct net_device *dev,
			     struct netlink_ext_ack *extack)
{
	bool has_rx, has_tx;

	/* A device with a parent is reported as not being virtual */
	if (dev->dev.parent) {
		NL_SET_ERR_MSG(extack, "Device is not a virtual device");
		return false;
	}

	if (!(dev->queue_mgmt_ops && dev->queue_mgmt_ops->ndo_queue_create)) {
		NL_SET_ERR_MSG(extack, "Device does not support queue creation");
		return false;
	}

	has_rx = dev->real_num_rx_queues >= 1;
	has_tx = dev->real_num_tx_queues >= 1;
	if (!(has_rx && has_tx)) {
		NL_SET_ERR_MSG(extack, "Device must have at least one real queue");
		return false;
	}

	return true;
}
/* Decide whether queues of @dev may be leased out.
 *
 * The device must have a parent device, must still be present, and must
 * provide queue management ops. The first failing check stores its reason
 * in @extack.
 *
 * Return: true if leasing is allowed, false otherwise.
 */
bool netdev_can_lease_queue(const struct net_device *dev,
			    struct netlink_ext_ack *extack)
{
	bool allowed = false;

	if (!dev->dev.parent)
		NL_SET_ERR_MSG(extack, "Lease device is a virtual device");
	else if (!netif_device_present(dev))
		NL_SET_ERR_MSG(extack, "Lease device has been removed from the system");
	else if (!dev->queue_mgmt_ops)
		NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations");
	else
		allowed = true;

	return allowed;
}
/* Report whether queue @idx of @dev is already claimed by another user.
 *
 * Every queue type is checked for an attached AF_XDP pool. Queues that are
 * not of type NETDEV_QUEUE_TYPE_TX are additionally checked for an existing
 * lease and for an installed memory provider. When the queue is busy the
 * reason is stored in @extack.
 *
 * Return: true if the queue is in use, false if it is free.
 */
bool netdev_queue_busy(struct net_device *dev, unsigned int idx,
		       enum netdev_queue_type type,
		       struct netlink_ext_ack *extack)
{
	if (xsk_get_pool_from_qid(dev, idx)) {
		NL_SET_ERR_MSG(extack, "Device queue in use by AF_XDP");
		return true;
	}

	/* The lease and memory provider checks are skipped for TX queues */
	if (type != NETDEV_QUEUE_TYPE_TX) {
		if (netif_rxq_is_leased(dev, idx)) {
			NL_SET_ERR_MSG(extack, "Device queue in use due to queue leasing");
			return true;
		}

		if (netif_rxq_has_mp(dev, idx)) {
			NL_SET_ERR_MSG(extack, "Device queue in use by memory provider");
			return true;
		}
	}

	return false;
}

View File

@ -10,15 +10,53 @@
#include "dev.h" #include "dev.h"
#include "page_pool_priv.h" #include "page_pool_priv.h"
/* netdev_rx_queue_lease(): pair @rxq_dst and @rxq_src by pointing each
 * queue's ->lease at the other one. The devices owning both queues are
 * asserted to be locked. A tracked reference on @rxq_src's device is taken
 * here and released again by netdev_rx_queue_unlease().
 */
/* See also page_pool_is_unreadable() */ void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) struct netdev_rx_queue *rxq_src)
{ {
struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); netdev_assert_locked(rxq_src->dev);
netdev_assert_locked(rxq_dst->dev);
return !!rxq->mp_params.mp_ops; netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL);
/* Cross-link the two queues only after the reference has been taken */
WRITE_ONCE(rxq_src->lease, rxq_dst);
WRITE_ONCE(rxq_dst->lease, rxq_src);
}
/* Undo netdev_rx_queue_lease(): clear the ->lease pointer on both queues
 * and release the tracked reference held on @rxq_src's device. The devices
 * owning both queues are asserted to be locked.
 */
void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
			     struct netdev_rx_queue *rxq_src)
{
	struct net_device *dst_dev = rxq_dst->dev;
	struct net_device *src_dev = rxq_src->dev;

	netdev_assert_locked(dst_dev);
	netdev_assert_locked(src_dev);

	/* Break the cross-links before the device reference goes away */
	WRITE_ONCE(rxq_src->lease, NULL);
	WRITE_ONCE(rxq_dst->lease, NULL);

	netdev_put(src_dev, &rxq_src->lease_tracker);
}
/* Report whether RX queue @rxq_idx of @dev currently has a lease pointer
 * set. An index at or beyond real_num_rx_queues is reported as not leased.
 */
bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx)
{
	if (rxq_idx >= dev->real_num_rx_queues)
		return false;

	return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease);
}
/* Report whether RX queue @rxq_idx of @dev has mp_params.mp_ops set.
 * An index at or beyond real_num_rx_queues yields false.
 * See also page_pool_is_unreadable().
 */
bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx)
{
if (rxq_idx < dev->real_num_rx_queues)
return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops;
return false;
} }
EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
/* Report whether RX queue @rxq_idx of @dev has mp_params.mp_priv set.
 * An index at or beyond real_num_rx_queues yields false.
 */
bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx)
{
	if (rxq_idx >= dev->real_num_rx_queues)
		return false;

	return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv;
}
static int netdev_rx_queue_reconfig(struct net_device *dev, static int netdev_rx_queue_reconfig(struct net_device *dev,
unsigned int rxq_idx, unsigned int rxq_idx,
struct netdev_queue_config *qcfg_old, struct netdev_queue_config *qcfg_old,