diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index aa3febdc8322..dce798d8cfe6 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -12,7 +12,7 @@ ib_core-y :=			packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
 				roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
 				multicast.o mad.o smi.o agent.o mad_rmpp.o \
 				nldev.o restrack.o counters.o ib_core_uverbs.o \
-				trace.o lag.o iter.o
+				trace.o lag.o iter.o frmr_pools.o
 
 ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
 ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/frmr_pools.c b/drivers/infiniband/core/frmr_pools.c
new file mode 100644
index 000000000000..e08c8093a468
--- /dev/null
+++ b/drivers/infiniband/core/frmr_pools.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <rdma/ib_verbs.h>
+
+#include "frmr_pools.h"
+
+static int push_handle_to_queue_locked(struct frmr_queue *queue, u32 handle)
+{
+	u32 tmp = queue->ci % NUM_HANDLES_PER_PAGE;
+	struct frmr_handles_page *page;
+
+	if (queue->ci >= queue->num_pages * NUM_HANDLES_PER_PAGE) {
+		page = kzalloc_obj(*page, GFP_ATOMIC);
+		if (!page)
+			return -ENOMEM;
+		queue->num_pages++;
+		list_add_tail(&page->list, &queue->pages_list);
+	} else {
+		page = list_last_entry(&queue->pages_list,
+				       struct frmr_handles_page, list);
+	}
+
+	page->handles[tmp] = handle;
+	queue->ci++;
+	return 0;
+}
+
+static u32 pop_handle_from_queue_locked(struct frmr_queue *queue)
+{
+	u32 tmp = (queue->ci - 1) % NUM_HANDLES_PER_PAGE;
+	struct frmr_handles_page *page;
+	u32 handle;
+
+	page = list_last_entry(&queue->pages_list, struct frmr_handles_page,
+			       list);
+	handle = page->handles[tmp];
+	queue->ci--;
+
+	if (!tmp) {
+		list_del(&page->list);
+		queue->num_pages--;
+		kfree(page);
+	}
+
+	return handle;
+}
+
+static bool pop_frmr_handles_page(struct ib_frmr_pool *pool,
+				  struct frmr_queue *queue,
+				  struct frmr_handles_page **page, u32 *count)
+{
+	spin_lock(&pool->lock);
+	if (list_empty(&queue->pages_list)) {
+		spin_unlock(&pool->lock);
+		return false;
+	}
+
+	*page = list_first_entry(&queue->pages_list, struct frmr_handles_page,
+				 list);
+	list_del(&(*page)->list);
+	queue->num_pages--;
+
+	/* If this is the last page, count may be less than
+	 * NUM_HANDLES_PER_PAGE.
+	 */
+	if (queue->ci >= NUM_HANDLES_PER_PAGE)
+		*count = NUM_HANDLES_PER_PAGE;
+	else
+		*count = queue->ci;
+
+	queue->ci -= *count;
+	spin_unlock(&pool->lock);
+	return true;
+}
+
+static void destroy_frmr_pool(struct ib_device *device,
+			      struct ib_frmr_pool *pool)
+{
+	struct ib_frmr_pools *pools = device->frmr_pools;
+	struct frmr_handles_page *page;
+	u32 count;
+
+	while (pop_frmr_handles_page(pool, &pool->queue, &page, &count)) {
+		pools->pool_ops->destroy_frmrs(device, page->handles, count);
+		kfree(page);
+	}
+
+	kfree(pool);
+}
+
+/**
+ * ib_frmr_pools_init - Initialize the FRMR pools for a device
+ * @device: The device to initialize the FRMR pools for.
+ * @pool_ops: The pool operations to use.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int ib_frmr_pools_init(struct ib_device *device,
+		       const struct ib_frmr_pool_ops *pool_ops)
+{
+	struct ib_frmr_pools *pools;
+
+	pools = kzalloc_obj(*pools, GFP_KERNEL);
+	if (!pools)
+		return -ENOMEM;
+
+	pools->rb_root = RB_ROOT;
+	rwlock_init(&pools->rb_lock);
+	pools->pool_ops = pool_ops;
+
+	device->frmr_pools = pools;
+	return 0;
+}
+EXPORT_SYMBOL(ib_frmr_pools_init);
+
+/**
+ * ib_frmr_pools_cleanup - Clean up the FRMR pools for a device
+ * @device: The device to clean up the FRMR pools for.
+ *
+ * Call this only after all FRMR handles have been pushed back to their pools
+ * and no other FRMR operations can run in parallel. This guarantee is what
+ * lets the pop and push paths avoid extra synchronization.
+ */
+void ib_frmr_pools_cleanup(struct ib_device *device)
+{
+	struct ib_frmr_pools *pools = device->frmr_pools;
+	struct ib_frmr_pool *pool, *next;
+
+	if (!pools)
+		return;
+
+	rbtree_postorder_for_each_entry_safe(pool, next, &pools->rb_root, node)
+		destroy_frmr_pool(device, pool);
+
+	kfree(pools);
+	device->frmr_pools = NULL;
+}
+EXPORT_SYMBOL(ib_frmr_pools_cleanup);
+
+static inline int compare_keys(struct ib_frmr_key *key1,
+			       struct ib_frmr_key *key2)
+{
+	int res;
+
+	res = cmp_int(key1->ats, key2->ats);
+	if (res)
+		return res;
+
+	res = cmp_int(key1->access_flags, key2->access_flags);
+	if (res)
+		return res;
+
+	res = cmp_int(key1->vendor_key, key2->vendor_key);
+	if (res)
+		return res;
+
+	res = cmp_int(key1->kernel_vendor_key, key2->kernel_vendor_key);
+	if (res)
+		return res;
+
+	/*
+	 * Allow reusing handles that support more DMA blocks than requested,
+	 * up to twice the requested number.
+	 */
+	res = cmp_int(key1->num_dma_blocks, key2->num_dma_blocks);
+	if (res > 0) {
+		if (key1->num_dma_blocks - key2->num_dma_blocks <
+		    key2->num_dma_blocks)
+			return 0;
+	}
+
+	return res;
+}
+
+static int frmr_pool_cmp_find(const void *key, const struct rb_node *node)
+{
+	struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node);
+
+	return compare_keys(&pool->key, (struct ib_frmr_key *)key);
+}
+
+static int frmr_pool_cmp_add(struct rb_node *new, const struct rb_node *node)
+{
+	struct ib_frmr_pool *new_pool =
+		rb_entry(new, struct ib_frmr_pool, node);
+	struct ib_frmr_pool *pool = rb_entry(node, struct ib_frmr_pool, node);
+
+	return compare_keys(&pool->key, &new_pool->key);
+}
+
+static struct ib_frmr_pool *ib_frmr_pool_find(struct ib_frmr_pools *pools,
+					      struct ib_frmr_key *key)
+{
+	struct ib_frmr_pool *pool;
+	struct rb_node *node;
+
+	/* The find operation is done under the read lock for performance.
+	 * Threads that race here, both fail to find a pool and then try to
+	 * create the same pool are handled in create_frmr_pool().
+	 */
+	read_lock(&pools->rb_lock);
+	node = rb_find(key, &pools->rb_root, frmr_pool_cmp_find);
+	pool = rb_entry_safe(node, struct ib_frmr_pool, node);
+	read_unlock(&pools->rb_lock);
+
+	return pool;
+}
+
+static struct ib_frmr_pool *create_frmr_pool(struct ib_device *device,
+					     struct ib_frmr_key *key)
+{
+	struct ib_frmr_pools *pools = device->frmr_pools;
+	struct ib_frmr_pool *pool;
+	struct rb_node *existing;
+
+	pool = kzalloc_obj(*pool, GFP_KERNEL);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(&pool->key, key, sizeof(*key));
+	INIT_LIST_HEAD(&pool->queue.pages_list);
+	spin_lock_init(&pool->lock);
+
+	write_lock(&pools->rb_lock);
+	existing = rb_find_add(&pool->node, &pools->rb_root, frmr_pool_cmp_add);
+	write_unlock(&pools->rb_lock);
+
+	/* If a different thread has already created the pool, return it.
+	 * The insert is done under the write lock, so the pool cannot be
+	 * inserted twice.
+	 */
+	if (existing) {
+		kfree(pool);
+		return rb_entry(existing, struct ib_frmr_pool, node);
+	}
+
+	return pool;
+}
+
+static int get_frmr_from_pool(struct ib_device *device,
+			      struct ib_frmr_pool *pool, struct ib_mr *mr)
+{
+	struct ib_frmr_pools *pools = device->frmr_pools;
+	u32 handle;
+	int err;
+
+	spin_lock(&pool->lock);
+	if (pool->queue.ci == 0) {
+		spin_unlock(&pool->lock);
+		err = pools->pool_ops->create_frmrs(device, &pool->key,
+						    &handle, 1);
+		if (err)
+			return err;
+	} else {
+		handle = pop_handle_from_queue_locked(&pool->queue);
+		spin_unlock(&pool->lock);
+	}
+
+	mr->frmr.pool = pool;
+	mr->frmr.handle = handle;
+
+	return 0;
+}
+
+/**
+ * ib_frmr_pool_pop - Pop an FRMR handle from the matching pool
+ * @device: The device to pop the FRMR handle from.
+ * @mr: The MR to store the popped FRMR handle in.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr)
+{
+	struct ib_frmr_pools *pools = device->frmr_pools;
+	struct ib_frmr_pool *pool;
+
+	WARN_ON_ONCE(!device->frmr_pools);
+	pool = ib_frmr_pool_find(pools, &mr->frmr.key);
+	if (!pool) {
+		pool = create_frmr_pool(device, &mr->frmr.key);
+		if (IS_ERR(pool))
+			return PTR_ERR(pool);
+	}
+
+	return get_frmr_from_pool(device, pool, mr);
+}
+EXPORT_SYMBOL(ib_frmr_pool_pop);
+
+/**
+ * ib_frmr_pool_push - Push an FRMR handle back to its pool
+ * @device: The device to push the FRMR handle to.
+ * @mr: The MR containing the FRMR handle to push back to the pool.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr)
+{
+	struct ib_frmr_pool *pool = mr->frmr.pool;
+	int ret;
+
+	spin_lock(&pool->lock);
+	ret = push_handle_to_queue_locked(&pool->queue, mr->frmr.handle);
+	spin_unlock(&pool->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_frmr_pool_push);
diff --git a/drivers/infiniband/core/frmr_pools.h b/drivers/infiniband/core/frmr_pools.h
new file mode 100644
index 000000000000..0433db5061bd
--- /dev/null
+++ b/drivers/infiniband/core/frmr_pools.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+ *
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#ifndef RDMA_CORE_FRMR_POOLS_H
+#define RDMA_CORE_FRMR_POOLS_H
+
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <rdma/frmr_pools.h>
+
+#define NUM_HANDLES_PER_PAGE \
+	((PAGE_SIZE - sizeof(struct list_head)) / sizeof(u32))
+
+struct frmr_handles_page {
+	struct list_head list;
+	u32 handles[NUM_HANDLES_PER_PAGE];
+};
+
+/* FRMR queue holds a list of frmr_handles_page.
+ * num_pages: number of pages in the queue.
+ * ci: current index into the handles array across all pages.
+ */
+struct frmr_queue {
+	struct list_head pages_list;
+	u32 num_pages;
+	unsigned long ci;
+};
+
+struct ib_frmr_pool {
+	struct rb_node node;
+	struct ib_frmr_key key; /* Pool key */
+
+	/* Protect access to the queue */
+	spinlock_t lock;
+	struct frmr_queue queue;
+};
+
+struct ib_frmr_pools {
+	struct rb_root rb_root;
+	rwlock_t rb_lock;
+	const struct ib_frmr_pool_ops *pool_ops;
+};
+
+#endif /* RDMA_CORE_FRMR_POOLS_H */
diff --git a/include/rdma/frmr_pools.h b/include/rdma/frmr_pools.h
new file mode 100644
index 000000000000..9ef41eb43e4b
--- /dev/null
+++ b/include/rdma/frmr_pools.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+ *
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#ifndef FRMR_POOLS_H
+#define FRMR_POOLS_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+struct ib_device;
+struct ib_mr;
+
+struct ib_frmr_key {
+	u64 vendor_key;
+	/* A pool with a non-zero kernel_vendor_key is a kernel-only pool. */
+	u64 kernel_vendor_key;
+	size_t num_dma_blocks;
+	int access_flags;
+	u8 ats:1;
+};
+
+struct ib_frmr_pool_ops {
+	int (*create_frmrs)(struct ib_device *device, struct ib_frmr_key *key,
+			    u32 *handles, u32 count);
+	void (*destroy_frmrs)(struct ib_device *device, u32 *handles,
+			      u32 count);
+};
+
+int ib_frmr_pools_init(struct ib_device *device,
+		       const struct ib_frmr_pool_ops *pool_ops);
+void ib_frmr_pools_cleanup(struct ib_device *device);
+int ib_frmr_pool_pop(struct ib_device *device, struct ib_mr *mr);
+int ib_frmr_pool_push(struct ib_device *device, struct ib_mr *mr);
+
+#endif /* FRMR_POOLS_H */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 1b77fd88d0fb..ba34b131e9be 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -44,6 +44,7 @@
 #include <rdma/restrack.h>
 #include <rdma/signature.h>
 #include <uapi/rdma/rdma_user_ioctl.h>
+#include <rdma/frmr_pools.h>
 #include <uapi/rdma/ib_user_ioctl_verbs.h>
 
 #define IB_FW_VERSION_NAME_MAX	ETHTOOL_FWVERS_LEN
@@ -1905,6 +1906,11 @@ struct ib_mr {
 	struct ib_dm *dm;
 	struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */
 	struct ib_dmah *dmah;
+	struct {
+		struct ib_frmr_pool *pool;
+		struct ib_frmr_key key;
+		u32 handle;
+	} frmr;
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
@@ -2907,6 +2913,8 @@ struct ib_device {
 	struct list_head subdev_list;
 	enum rdma_nl_name_assign_type name_assign_type;
+
+	struct ib_frmr_pools *frmr_pools;
 };
 
 static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size,
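
A note on the queue sizing above: NUM_HANDLES_PER_PAGE in drivers/infiniband/core/frmr_pools.h packs as many u32 handles as fit in one page after the list_head link. With 4 KiB pages and 64-bit pointers that is (4096 - 16) / 4 = 1020 handles per frmr_handles_page, so a pool caching on the order of a thousand handles costs a single page of metadata.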
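
For driver authors, the intended wiring looks roughly like the sketch below. This is not part of the patch: my_destroy_frmrs(), my_create_frmrs(), my_frmr_pool_ops and the my_alloc_hw_mkey()/my_free_hw_mkey() helpers are hypothetical driver-side names; only ib_frmr_pool_ops, ib_frmr_pools_init() and ib_frmr_pools_cleanup() come from this series.

#include <rdma/ib_verbs.h>
#include <rdma/frmr_pools.h>

/* Hypothetical driver callbacks: create/destroy 'count' hardware FRMR
 * handles matching 'key'. my_alloc_hw_mkey()/my_free_hw_mkey() are
 * stand-ins for real device-specific MKey programming.
 */
static void my_destroy_frmrs(struct ib_device *device, u32 *handles, u32 count)
{
	u32 i;

	for (i = 0; i < count; i++)
		my_free_hw_mkey(device, handles[i]);
}

static int my_create_frmrs(struct ib_device *device, struct ib_frmr_key *key,
			   u32 *handles, u32 count)
{
	u32 i;

	for (i = 0; i < count; i++) {
		int err = my_alloc_hw_mkey(device, key->num_dma_blocks,
					   key->access_flags, &handles[i]);

		if (err) {
			my_destroy_frmrs(device, handles, i);
			return err;
		}
	}
	return 0;
}

static const struct ib_frmr_pool_ops my_frmr_pool_ops = {
	.create_frmrs = my_create_frmrs,
	.destroy_frmrs = my_destroy_frmrs,
};

static int my_init_device(struct ib_device *ibdev)
{
	return ib_frmr_pools_init(ibdev, &my_frmr_pool_ops);
}

static void my_teardown_device(struct ib_device *ibdev)
{
	/* All handles must already be pushed back; see the comment on
	 * ib_frmr_pools_cleanup() above.
	 */
	ib_frmr_pools_cleanup(ibdev);
}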
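
The per-registration flow then pairs pop with push. Another hedged sketch: which key fields a consumer fills, the invalidation step, and freeing the handle when a push fails to cache it are assumptions about how a user of this API would behave, not code from the series.

/* Take a cached handle (or have the pool lazily create one) for a fast
 * registration covering 'num_dma_blocks' DMA blocks.
 */
static int my_fast_reg(struct ib_device *ibdev, struct ib_mr *mr,
		       size_t num_dma_blocks, int access_flags)
{
	int err;

	/* The key selects the pool; per compare_keys(), a pool whose
	 * handles cover up to twice the requested blocks may match.
	 */
	mr->frmr.key.num_dma_blocks = num_dma_blocks;
	mr->frmr.key.access_flags = access_flags;

	err = ib_frmr_pool_pop(ibdev, mr);
	if (err)
		return err;

	/* ... program mr->frmr.handle and post the registration work ... */
	return 0;
}

static void my_fast_unreg(struct ib_device *ibdev, struct ib_mr *mr)
{
	/* ... invalidate the MKey behind mr->frmr.handle ... */

	/* Cache the handle for reuse. Push can fail only on the GFP_ATOMIC
	 * page allocation; the handle is then not cached, so release it.
	 */
	if (ib_frmr_pool_push(ibdev, mr))
		my_free_hw_mkey(ibdev, mr->frmr.handle);
}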