Merge branch 'devmem-io_uring-allow-more-flexibility-for-zc-dma-devices'

Dragos Tatulea says:

====================
devmem/io_uring: allow more flexibility for ZC DMA devices

For TCP zerocopy rx (io_uring, devmem), there is an assumption that the
parent device can do DMA. However that is not always the case:
- Scalable Function netdevs [1] have the DMA device in the grandparent.
- For Multi-PF netdevs [2] queues can be associated to different DMA
  devices.

The series adds an API for getting the DMA device for a netdev queue.
Drivers that have special requirements can implement the newly added
queue management op. Otherwise the parent will still be used as before.

The series then switches io_uring zcrx and devmem over to this API and
adds an ndo_queue_get_dma_dev op for mlx5.

The last part of the series changes devmem rx bind to get the DMA device
per queue and blocks the case when multiple queues use different DMA
devices. The tx bind is left as is.

[1] Documentation/networking/device_drivers/ethernet/mellanox/mlx5/switchdev.rst
[2] Documentation/networking/multi-pf-netdev.rst
====================

Link: https://patch.msgid.link/20250827144017.1529208-2-dtatulea@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2025-08-28 16:05:34 -07:00
commit bbf02c3184
8 changed files with 163 additions and 31 deletions

View File

@ -5625,12 +5625,36 @@ static int mlx5e_queue_start(struct net_device *dev, void *newq,
return 0;
}
/*
 * Look up the DMA device used by the channel behind queue @queue_index.
 *
 * The lookup runs under priv->state_lock. Returns the channel's pdev, or
 * NULL when @queue_index is not below the number of channels in
 * priv->channels.
 */
static struct device *mlx5e_queue_get_dma_dev(struct net_device *dev,
					      int queue_index)
{
	struct mlx5e_priv *priv = netdev_priv(dev);
	struct mlx5e_channels *chs = &priv->channels;
	struct device *dma_dev = NULL;

	mutex_lock(&priv->state_lock);
	/* Only dereference the channel array for an index that is in range. */
	if (queue_index < chs->num)
		dma_dev = chs->c[queue_index]->pdev;
	mutex_unlock(&priv->state_lock);

	return dma_dev;
}
/*
 * Queue management callbacks registered for mlx5e netdevs: per-queue memory
 * allocation/free, queue start/stop and the per-queue DMA device lookup.
 */
static const struct netdev_queue_mgmt_ops mlx5e_queue_mgmt_ops = {
/* Size of the per-queue memory blob handed to the callbacks below. */
.ndo_queue_mem_size = sizeof(struct mlx5_qmgmt_data),
.ndo_queue_mem_alloc = mlx5e_queue_mem_alloc,
.ndo_queue_mem_free = mlx5e_queue_mem_free,
.ndo_queue_start = mlx5e_queue_start,
.ndo_queue_stop = mlx5e_queue_stop,
/* DMA device for zero-copy users; resolved per queue via the channel. */
.ndo_queue_get_dma_dev = mlx5e_queue_get_dma_dev,
};
static void mlx5e_build_nic_netdev(struct net_device *netdev)

View File

@ -127,6 +127,9 @@ void netdev_stat_queue_sum(struct net_device *netdev,
* @ndo_queue_stop: Stop the RX queue at the specified index. The stopped
* queue's memory is written at the specified address.
*
* @ndo_queue_get_dma_dev: Get the DMA device to be used for zero-copy
* operations on this queue. Return NULL on error.
*
* Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
* the interface is closed. @ndo_queue_start and @ndo_queue_stop will only
* be called for an interface which is open.
@ -144,6 +147,8 @@ struct netdev_queue_mgmt_ops {
int (*ndo_queue_stop)(struct net_device *dev,
void *per_queue_mem,
int idx);
struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev,
int idx);
};
/**
@ -321,4 +326,6 @@ static inline void netif_subqueue_sent(const struct net_device *dev,
get_desc, start_thrs); \
})
struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx);
#endif

View File

@ -12,6 +12,7 @@
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>
@ -599,7 +600,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
goto err;
}
ifq->dev = ifq->netdev->dev.parent;
ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, ifq->if_rxq);
if (!ifq->dev) {
ret = -EOPNOTSUPP;
goto err;

View File

@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-y += hotdata.o
obj-y += netdev_rx_queue.o
obj-y += netdev_queues.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o

View File

@ -176,6 +176,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack)
@ -188,6 +189,11 @@ net_devmem_bind_dmabuf(struct net_device *dev,
unsigned long virtual;
int err;
if (!dma_dev) {
NL_SET_ERR_MSG(extack, "Device doesn't support DMA");
return ERR_PTR(-EOPNOTSUPP);
}
dmabuf = dma_buf_get(dmabuf_fd);
if (IS_ERR(dmabuf))
return ERR_CAST(dmabuf);
@ -209,7 +215,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
binding->dmabuf = dmabuf;
binding->direction = direction;
binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev);
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");

View File

@ -85,6 +85,7 @@ struct dmabuf_genpool_chunk_owner {
void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack);
@ -170,6 +171,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov)
static inline struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd,
struct netdev_nl_sock *priv,

View File

@ -869,16 +869,79 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
return err;
}
/*
 * Translate the NETDEV_A_DMABUF_QUEUES attributes of a request into a bitmap
 * of RX queue indices.
 *
 * @info:           genl request being parsed; errors are reported through
 *                  info->extack.
 * @rxq_bitmap_len: number of valid bits in @rxq_bitmap; queue ids at or above
 *                  this value are rejected.
 * @rxq_bitmap:     bitmap in which one bit is set per requested queue id.
 *
 * Returns 0 on success, the negative error from nla_parse_nested() when a
 * nested attribute fails to parse, or -EINVAL when a queue entry lacks the id
 * or type attribute, is not of type RX, or names an out-of-range queue.
 */
static int netdev_nl_read_rxq_bitmap(struct genl_info *info,
				     u32 rxq_bitmap_len,
				     unsigned long *rxq_bitmap)
{
	struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
	struct nlattr *queue_attr;
	int remaining;

	nla_for_each_attr_type(queue_attr, NETDEV_A_DMABUF_QUEUES,
			       genlmsg_data(info->genlhdr),
			       genlmsg_len(info->genlhdr), remaining) {
		u32 queue_id;
		int ret;

		ret = nla_parse_nested(tb,
				       ARRAY_SIZE(netdev_queue_id_nl_policy) - 1,
				       queue_attr, netdev_queue_id_nl_policy,
				       info->extack);
		if (ret < 0)
			return ret;

		/* Both the queue id and the queue type are mandatory. */
		if (NL_REQ_ATTR_CHECK(info->extack, queue_attr, tb,
				      NETDEV_A_QUEUE_ID))
			return -EINVAL;
		if (NL_REQ_ATTR_CHECK(info->extack, queue_attr, tb,
				      NETDEV_A_QUEUE_TYPE))
			return -EINVAL;

		/* Only RX queues are accepted here. */
		if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
			NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
			return -EINVAL;
		}

		queue_id = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
		if (queue_id >= rxq_bitmap_len) {
			NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]);
			return -EINVAL;
		}

		bitmap_set(rxq_bitmap, queue_id, 1);
	}

	return 0;
}
/*
 * Determine the single DMA device shared by all RX queues selected in
 * @rxq_bitmap.
 *
 * @netdev:     device whose queues are inspected.
 * @rxq_bitmap: bitmap of selected RX queue indices; bits up to
 *              netdev->real_num_rx_queues are examined.
 * @extack:     receives an error message when the queues disagree.
 *
 * Returns the common DMA device, NULL when no queue is selected or none of
 * the selected queues has a usable DMA device, or ERR_PTR(-EOPNOTSUPP) when
 * two selected queues report different DMA devices.
 */
static struct device *
netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap,
		      struct netlink_ext_ack *extack)
{
	struct device *dma_dev = NULL;
	u32 rxq_idx, prev_rxq_idx = 0;
	bool have_prev = false;

	for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
		struct device *rxq_dma_dev;

		rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx);

		/*
		 * Compare against the previous queue even when that queue had
		 * no DMA device (NULL): a NULL/non-NULL pair is a mismatch too.
		 * Gating the check on dma_dev being non-NULL would let a queue
		 * without a DMA device slip through when it comes first.
		 */
		if (have_prev && rxq_dma_dev != dma_dev) {
			NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)",
					   rxq_idx, prev_rxq_idx);
			return ERR_PTR(-EOPNOTSUPP);
		}

		dma_dev = rxq_dma_dev;
		prev_rxq_idx = rxq_idx;
		have_prev = true;
	}

	return dma_dev;
}
int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
struct net_devmem_dmabuf_binding *binding;
u32 ifindex, dmabuf_fd, rxq_idx;
struct netdev_nl_sock *priv;
struct net_device *netdev;
unsigned long *rxq_bitmap;
struct device *dma_dev;
struct sk_buff *rsp;
struct nlattr *attr;
int rem, err = 0;
int err = 0;
void *hdr;
if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
@ -921,36 +984,31 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock;
}
binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
priv, info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL);
if (!rxq_bitmap) {
err = -ENOMEM;
goto err_unlock;
}
nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
genlmsg_data(info->genlhdr),
genlmsg_len(info->genlhdr), rem) {
err = nla_parse_nested(
tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr,
netdev_queue_id_nl_policy, info->extack);
if (err < 0)
goto err_unbind;
err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues,
rxq_bitmap);
if (err)
goto err_rxq_bitmap;
if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) {
err = -EINVAL;
goto err_unbind;
}
dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack);
if (IS_ERR(dma_dev)) {
err = PTR_ERR(dma_dev);
goto err_rxq_bitmap;
}
if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
err = -EINVAL;
goto err_unbind;
}
rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
dmabuf_fd, priv, info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
goto err_rxq_bitmap;
}
for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
info->extack);
if (err)
@ -964,6 +1022,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_unbind;
bitmap_free(rxq_bitmap);
netdev_unlock(netdev);
mutex_unlock(&priv->lock);
@ -972,6 +1032,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
err_unbind:
net_devmem_unbind_dmabuf(binding);
err_rxq_bitmap:
bitmap_free(rxq_bitmap);
err_unlock:
netdev_unlock(netdev);
err_unlock_sock:
@ -986,6 +1048,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
struct net_devmem_dmabuf_binding *binding;
struct netdev_nl_sock *priv;
struct net_device *netdev;
struct device *dma_dev;
u32 ifindex, dmabuf_fd;
struct sk_buff *rsp;
int err = 0;
@ -1032,8 +1095,9 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_netdev;
}
binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv,
info->extack);
dma_dev = netdev_queue_get_dma_dev(netdev, 0);
binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
dmabuf_fd, priv, info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
goto err_unlock_netdev;

27
net/core/netdev_queues.c Normal file
View File

@ -0,0 +1,27 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <net/netdev_queues.h>
/**
 * netdev_queue_get_dma_dev() - get dma device for zero-copy operations
 * @dev:	net_device
 * @idx:	queue index
 *
 * Resolve the device that zero-copy users should DMA-map against for queue
 * @idx. If the driver provides the ndo_queue_get_dma_dev queue management
 * op, its answer is used; otherwise the netdev's parent device is used.
 * A candidate without a dma_mask is treated as not available.
 *
 * Return: Device or NULL on error
 */
struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
{
	const struct netdev_queue_mgmt_ops *ops = dev->queue_mgmt_ops;
	struct device *candidate = dev->dev.parent;

	/* A driver-provided op overrides the parent-device default. */
	if (ops && ops->ndo_queue_get_dma_dev)
		candidate = ops->ndo_queue_get_dma_dev(dev, idx);

	if (!candidate || !candidate->dma_mask)
		return NULL;

	return candidate;
}