RDMA v6.16 merge window pull request

Usual collection of driver fixes:
 
 - Small bug fixes and cleansup in hfi, hns, rxe, mlx5, mana siw
 
 - Further ODP functionality in rxe
 
 - Remote access MRs in mana, along with more page sizes
 
 - Improve CM scalability with a rwlock around the agent
 
 - More trace points for hns
 
 - ODP hmm conversion to the new two step dma API
 
 - Support the ethernet HW device in mana as well as the RNIC
 
 - Cleanups:
  * Use secs_to_jiffies() when appropriate
  * Use ERR_CAST() instead of naked casts
  * Don't use %pK in printk
  * Unusued functions removed
  * Allocation type matching
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRRRCHOFoQz/8F5bUaFwuHvBreFYQUCaDm95gAKCRCFwuHvBreF
 YXJxAQCZ+p+mxt0rTeVI2j6YQ26thuvb/tH0Upu8epgdQ3T/ZgD/YOHBC6OrXWJa
 Uz6BTiyz/xiyMtJLTD4kEiG2o74J1gE=
 =DNQC
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma updates from Jason Gunthorpe:
 "Usual collection of driver fixes:

   - Small bug fixes and cleansup in hfi, hns, rxe, mlx5, mana siw

   - Further ODP functionality in rxe

   - Remote access MRs in mana, along with more page sizes

   - Improve CM scalability with a rwlock around the agent

   - More trace points for hns

   - ODP hmm conversion to the new two step dma API

   - Support the ethernet HW device in mana as well as the RNIC

   - Cleanups:
       - Use secs_to_jiffies() when appropriate
       - Use ERR_CAST() instead of naked casts
       - Don't use %pK in printk
       - Unusued functions removed
       - Allocation type matching"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (57 commits)
  RDMA/cma: Fix hang when cma_netevent_callback fails to queue_work
  RDMA/bnxt_re: Support extended stats for Thor2 VF
  RDMA/hns: Fix endian issue in trace events
  RDMA/mlx5: Avoid flexible array warning
  IB/cm: Remove dead code and adjust naming
  RDMA/core: Avoid hmm_dma_map_alloc() for virtual DMA devices
  RDMA/rxe: Break endless pagefault loop for RO pages
  RDMA/bnxt_re: Fix return code of bnxt_re_configure_cc
  RDMA/bnxt_re: Fix missing error handling for tx_queue
  RDMA/bnxt_re: Fix incorrect display of inactivity_cp in debugfs output
  RDMA/mlx5: Add support for 200Gbps per lane speeds
  RDMA/mlx5: Remove the redundant MLX5_IB_STAGE_UAR stage
  RDMA/iwcm: Fix use-after-free of work objects after cm_id destruction
  net: mana: Add support for auxiliary device servicing events
  RDMA/mana_ib: unify mana_ib functions to support any gdma device
  RDMA/mana_ib: Add support of mana_ib for RNIC and ETH nic
  net: mana: Probe rdma device in mana driver
  RDMA/siw: replace redundant ternary operator with just rv
  RDMA/umem: Separate implicit ODP initialization from explicit ODP
  RDMA/core: Convert UMEM ODP DMA mapping to caching IOVA and page linkage
  ...
This commit is contained in:
Linus Torvalds 2025-05-30 10:18:56 -07:00
commit dd91b5e1d6
76 changed files with 1481 additions and 769 deletions

View File

@ -36,6 +36,7 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CM_DESTROY_ID_WAIT_TIMEOUT 10000 /* msecs */
#define CM_DIRECT_RETRY_CTX ((void *) 1UL)
#define CM_MRA_SETTING 24 /* 4.096us * 2^24 = ~68.7 seconds */
static const char * const ibcm_rej_reason_strs[] = {
[IB_CM_REJ_NO_QP] = "no QP",
@ -167,7 +168,7 @@ struct cm_port {
struct cm_device {
struct kref kref;
struct list_head list;
spinlock_t mad_agent_lock;
rwlock_t mad_agent_lock;
struct ib_device *ib_device;
u8 ack_delay;
int going_down;
@ -241,7 +242,6 @@ struct cm_id_private {
u8 initiator_depth;
u8 retry_count;
u8 rnr_retry_count;
u8 service_timeout;
u8 target_ack_delay;
struct list_head work_list;
@ -285,7 +285,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return ERR_PTR(-EINVAL);
spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
mad_agent = cm_id_priv->av.port->mad_agent;
if (!mad_agent) {
m = ERR_PTR(-EINVAL);
@ -311,7 +311,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
m->ah = ah;
out:
spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return m;
}
@ -1297,10 +1297,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
if (!cm_id_priv->av.port)
return cpu_to_be64(low_tid);
spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
if (cm_id_priv->av.port->mad_agent)
hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
return cpu_to_be64(hi_tid | low_tid);
}
@ -1872,7 +1872,7 @@ static void cm_process_work(struct cm_id_private *cm_id_priv,
static void cm_format_mra(struct cm_mra_msg *mra_msg,
struct cm_id_private *cm_id_priv,
enum cm_msg_response msg_mraed, u8 service_timeout,
enum cm_msg_response msg_mraed,
const void *private_data, u8 private_data_len)
{
cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
@ -1881,7 +1881,7 @@ static void cm_format_mra(struct cm_mra_msg *mra_msg,
be32_to_cpu(cm_id_priv->id.local_id));
IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg,
be32_to_cpu(cm_id_priv->id.remote_id));
IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout);
IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, CM_MRA_SETTING);
if (private_data && private_data_len)
IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data,
@ -1960,7 +1960,7 @@ static void cm_dup_req_handler(struct cm_work *work,
switch (cm_id_priv->id.state) {
case IB_CM_MRA_REQ_SENT:
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
CM_MSG_RESPONSE_REQ,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
break;
@ -2454,7 +2454,7 @@ static void cm_dup_rep_handler(struct cm_work *work)
cm_id_priv->private_data_len);
else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
CM_MSG_RESPONSE_REP,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
else
@ -3094,26 +3094,13 @@ static int cm_rej_handler(struct cm_work *work)
return -EINVAL;
}
int ib_send_cm_mra(struct ib_cm_id *cm_id,
u8 service_timeout,
const void *private_data,
u8 private_data_len)
int ib_prepare_cm_mra(struct ib_cm_id *cm_id)
{
struct cm_id_private *cm_id_priv;
struct ib_mad_send_buf *msg;
enum ib_cm_state cm_state;
enum ib_cm_lap_state lap_state;
enum cm_msg_response msg_response;
void *data;
unsigned long flags;
int ret;
if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
return -EINVAL;
data = cm_copy_private_data(private_data, private_data_len);
if (IS_ERR(data))
return PTR_ERR(data);
int ret = 0;
cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@ -3122,58 +3109,33 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id,
case IB_CM_REQ_RCVD:
cm_state = IB_CM_MRA_REQ_SENT;
lap_state = cm_id->lap_state;
msg_response = CM_MSG_RESPONSE_REQ;
break;
case IB_CM_REP_RCVD:
cm_state = IB_CM_MRA_REP_SENT;
lap_state = cm_id->lap_state;
msg_response = CM_MSG_RESPONSE_REP;
break;
case IB_CM_ESTABLISHED:
if (cm_id->lap_state == IB_CM_LAP_RCVD) {
cm_state = cm_id->state;
lap_state = IB_CM_MRA_LAP_SENT;
msg_response = CM_MSG_RESPONSE_OTHER;
break;
}
fallthrough;
default:
trace_icm_send_mra_unknown_err(&cm_id_priv->id);
trace_icm_prepare_mra_unknown_err(&cm_id_priv->id);
ret = -EINVAL;
goto error_unlock;
}
if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
msg = cm_alloc_msg(cm_id_priv);
if (IS_ERR(msg)) {
ret = PTR_ERR(msg);
goto error_unlock;
}
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
msg_response, service_timeout,
private_data, private_data_len);
trace_icm_send_mra(cm_id);
ret = ib_post_send_mad(msg, NULL);
if (ret)
goto error_free_msg;
}
cm_id->state = cm_state;
cm_id->lap_state = lap_state;
cm_id_priv->service_timeout = service_timeout;
cm_set_private_data(cm_id_priv, data, private_data_len);
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
return 0;
cm_set_private_data(cm_id_priv, NULL, 0);
error_free_msg:
cm_free_msg(msg);
error_unlock:
spin_unlock_irqrestore(&cm_id_priv->lock, flags);
kfree(data);
return ret;
}
EXPORT_SYMBOL(ib_send_cm_mra);
EXPORT_SYMBOL(ib_prepare_cm_mra);
static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
{
@ -3377,7 +3339,6 @@ static int cm_lap_handler(struct cm_work *work)
cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
CM_MSG_RESPONSE_OTHER,
cm_id_priv->service_timeout,
cm_id_priv->private_data,
cm_id_priv->private_data_len);
spin_unlock_irq(&cm_id_priv->lock);
@ -3786,7 +3747,8 @@ static void cm_process_send_error(struct cm_id_private *cm_id_priv,
spin_lock_irq(&cm_id_priv->lock);
if (msg != cm_id_priv->msg) {
spin_unlock_irq(&cm_id_priv->lock);
cm_free_priv_msg(msg);
cm_free_msg(msg);
cm_deref_id(cm_id_priv);
return;
}
cm_free_priv_msg(msg);
@ -4378,7 +4340,7 @@ static int cm_add_one(struct ib_device *ib_device)
return -ENOMEM;
kref_init(&cm_dev->kref);
spin_lock_init(&cm_dev->mad_agent_lock);
rwlock_init(&cm_dev->mad_agent_lock);
cm_dev->ib_device = ib_device;
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
@ -4494,9 +4456,9 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
* The above ensures no call paths from the work are running,
* the remaining paths all take the mad_agent_lock.
*/
spin_lock(&cm_dev->mad_agent_lock);
write_lock(&cm_dev->mad_agent_lock);
port->mad_agent = NULL;
spin_unlock(&cm_dev->mad_agent_lock);
write_unlock(&cm_dev->mad_agent_lock);
ib_unregister_mad_agent(mad_agent);
ib_port_unregister_client_groups(ib_device, i,
cm_counter_groups);

View File

@ -229,7 +229,7 @@ DEFINE_CM_ERR_EVENT(send_drep);
DEFINE_CM_ERR_EVENT(dreq_unknown);
DEFINE_CM_ERR_EVENT(send_unknown_rej);
DEFINE_CM_ERR_EVENT(rej_unknown);
DEFINE_CM_ERR_EVENT(send_mra_unknown);
DEFINE_CM_ERR_EVENT(prepare_mra_unknown);
DEFINE_CM_ERR_EVENT(mra_unknown);
DEFINE_CM_ERR_EVENT(qp_init);
DEFINE_CM_ERR_EVENT(qp_rtr);

View File

@ -46,7 +46,6 @@ MODULE_LICENSE("Dual BSD/GPL");
#define CMA_CM_RESPONSE_TIMEOUT 20
#define CMA_MAX_CM_RETRIES 15
#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
#define CMA_IBOE_PACKET_LIFETIME 16
#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
@ -146,19 +145,6 @@ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id)
}
EXPORT_SYMBOL(rdma_iw_cm_id);
/**
* rdma_res_to_id() - return the rdma_cm_id pointer for this restrack.
* @res: rdma resource tracking entry pointer
*/
struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res)
{
struct rdma_id_private *id_priv =
container_of(res, struct rdma_id_private, res);
return &id_priv->id;
}
EXPORT_SYMBOL(rdma_res_to_id);
static int cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device, void *client_data);
@ -2214,8 +2200,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id,
case IB_CM_REP_RECEIVED:
if (state == RDMA_CM_CONNECT &&
(id_priv->id.qp_type != IB_QPT_UD)) {
trace_cm_send_mra(id_priv);
ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
trace_cm_prepare_mra(id_priv);
ib_prepare_cm_mra(cm_id);
}
if (id_priv->id.qp) {
event.status = cma_rep_recv(id_priv);
@ -2476,8 +2462,8 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT &&
conn_id->id.qp_type != IB_QPT_UD) {
trace_cm_send_mra(cm_id->context);
ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
trace_cm_prepare_mra(cm_id->context);
ib_prepare_cm_mra(cm_id);
}
mutex_unlock(&conn_id->handler_mutex);
@ -5245,7 +5231,8 @@ static int cma_netevent_callback(struct notifier_block *self,
neigh->ha, ETH_ALEN))
continue;
cma_id_get(current_id);
queue_work(cma_wq, &current_id->id.net_work);
if (!queue_work(cma_wq, &current_id->id.net_work))
cma_id_put(current_id);
}
out:
spin_unlock_irqrestore(&id_table_lock, flags);

View File

@ -55,7 +55,7 @@ DECLARE_EVENT_CLASS(cma_fsm_class,
DEFINE_CMA_FSM_EVENT(send_rtu);
DEFINE_CMA_FSM_EVENT(send_rej);
DEFINE_CMA_FSM_EVENT(send_mra);
DEFINE_CMA_FSM_EVENT(prepare_mra);
DEFINE_CMA_FSM_EVENT(send_sidr_req);
DEFINE_CMA_FSM_EVENT(send_sidr_rep);
DEFINE_CMA_FSM_EVENT(disconnect);

View File

@ -368,12 +368,9 @@ EXPORT_SYMBOL(iw_cm_disconnect);
/*
* CM_ID <-- DESTROYING
*
* Clean up all resources associated with the connection and release
* the initial reference taken by iw_create_cm_id.
*
* Returns true if and only if the last cm_id_priv reference has been dropped.
* Clean up all resources associated with the connection.
*/
static bool destroy_cm_id(struct iw_cm_id *cm_id)
static void destroy_cm_id(struct iw_cm_id *cm_id)
{
struct iwcm_id_private *cm_id_priv;
struct ib_qp *qp;
@ -442,20 +439,22 @@ static bool destroy_cm_id(struct iw_cm_id *cm_id)
iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
}
return iwcm_deref_id(cm_id_priv);
}
/*
* This function is only called by the application thread and cannot
* be called by the event thread. The function will wait for all
* references to be released on the cm_id and then kfree the cm_id
* object.
* Destroy cm_id. If the cm_id still has other references, wait for all
* references to be released on the cm_id and then release the initial
* reference taken by iw_create_cm_id.
*/
void iw_destroy_cm_id(struct iw_cm_id *cm_id)
{
if (!destroy_cm_id(cm_id))
struct iwcm_id_private *cm_id_priv;
cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
destroy_cm_id(cm_id);
if (refcount_read(&cm_id_priv->refcount) > 1)
flush_workqueue(iwcm_wq);
iwcm_deref_id(cm_id_priv);
}
EXPORT_SYMBOL(iw_destroy_cm_id);
@ -1035,8 +1034,10 @@ static void cm_work_handler(struct work_struct *_work)
if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
ret = process_event(cm_id_priv, &levent);
if (ret)
WARN_ON_ONCE(destroy_cm_id(&cm_id_priv->id));
if (ret) {
destroy_cm_id(&cm_id_priv->id);
WARN_ON_ONCE(iwcm_deref_id(cm_id_priv));
}
} else
pr_debug("dropping event %d\n", levent.event);
if (iwcm_deref_id(cm_id_priv))

View File

@ -158,7 +158,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
recv_wc->recv_buf.grh, agent->port_num);
if (IS_ERR(ah))
return (void *) ah;
return ERR_CAST(ah);
hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,

View File

@ -41,67 +41,72 @@
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pagemap.h>
#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
const struct mmu_interval_notifier_ops *ops)
static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
{
int ret;
umem_odp->is_implicit_odp = 1;
umem_odp->umem.is_odp = 1;
mutex_init(&umem_odp->umem_mutex);
}
static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
const struct mmu_interval_notifier_ops *ops)
{
struct ib_device *dev = umem_odp->umem.ibdev;
size_t page_size = 1UL << umem_odp->page_shift;
struct hmm_dma_map *map;
unsigned long start;
unsigned long end;
size_t nr_entries;
int ret = 0;
umem_odp->umem.is_odp = 1;
mutex_init(&umem_odp->umem_mutex);
if (!umem_odp->is_implicit_odp) {
size_t page_size = 1UL << umem_odp->page_shift;
unsigned long start;
unsigned long end;
size_t ndmas, npfns;
start = ALIGN_DOWN(umem_odp->umem.address, page_size);
if (check_add_overflow(umem_odp->umem.address,
(unsigned long)umem_odp->umem.length, &end))
return -EOVERFLOW;
end = ALIGN(end, page_size);
if (unlikely(end < page_size))
return -EOVERFLOW;
start = ALIGN_DOWN(umem_odp->umem.address, page_size);
if (check_add_overflow(umem_odp->umem.address,
(unsigned long)umem_odp->umem.length,
&end))
return -EOVERFLOW;
end = ALIGN(end, page_size);
if (unlikely(end < page_size))
return -EOVERFLOW;
nr_entries = (end - start) >> PAGE_SHIFT;
if (!(nr_entries * PAGE_SIZE / page_size))
return -EINVAL;
ndmas = (end - start) >> umem_odp->page_shift;
if (!ndmas)
return -EINVAL;
npfns = (end - start) >> PAGE_SHIFT;
umem_odp->pfn_list = kvcalloc(
npfns, sizeof(*umem_odp->pfn_list),
GFP_KERNEL | __GFP_NOWARN);
if (!umem_odp->pfn_list)
return -ENOMEM;
umem_odp->dma_list = kvcalloc(
ndmas, sizeof(*umem_odp->dma_list),
GFP_KERNEL | __GFP_NOWARN);
if (!umem_odp->dma_list) {
map = &umem_odp->map;
if (ib_uses_virt_dma(dev)) {
map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
GFP_KERNEL | __GFP_NOWARN);
if (!map->pfn_list)
ret = -ENOMEM;
goto out_pfn_list;
}
} else
ret = hmm_dma_map_alloc(dev->dma_device, map,
(end - start) >> PAGE_SHIFT,
1 << umem_odp->page_shift);
if (ret)
return ret;
ret = mmu_interval_notifier_insert(&umem_odp->notifier,
umem_odp->umem.owning_mm,
start, end - start, ops);
if (ret)
goto out_dma_list;
}
ret = mmu_interval_notifier_insert(&umem_odp->notifier,
umem_odp->umem.owning_mm, start,
end - start, ops);
if (ret)
goto out_free_map;
return 0;
out_dma_list:
kvfree(umem_odp->dma_list);
out_pfn_list:
kvfree(umem_odp->pfn_list);
out_free_map:
if (ib_uses_virt_dma(dev))
kfree(map->pfn_list);
else
hmm_dma_map_free(dev->dma_device, map);
return ret;
}
@ -120,7 +125,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
{
struct ib_umem *umem;
struct ib_umem_odp *umem_odp;
int ret;
if (access & IB_ACCESS_HUGETLB)
return ERR_PTR(-EINVAL);
@ -132,16 +136,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
umem->ibdev = device;
umem->writable = ib_access_writable(access);
umem->owning_mm = current->mm;
umem_odp->is_implicit_odp = 1;
umem_odp->page_shift = PAGE_SHIFT;
umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
ret = ib_init_umem_odp(umem_odp, NULL);
if (ret) {
put_pid(umem_odp->tgid);
kfree(umem_odp);
return ERR_PTR(ret);
}
ib_init_umem_implicit_odp(umem_odp);
return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
@ -262,74 +260,41 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
}
EXPORT_SYMBOL(ib_umem_odp_get);
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
{
struct ib_device *dev = umem_odp->umem.ibdev;
/*
* Ensure that no more pages are mapped in the umem.
*
* It is the driver's responsibility to ensure, before calling us,
* that the hardware will not attempt to access the MR any more.
*/
if (!umem_odp->is_implicit_odp) {
mutex_lock(&umem_odp->umem_mutex);
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
mutex_unlock(&umem_odp->umem_mutex);
mmu_interval_notifier_remove(&umem_odp->notifier);
kvfree(umem_odp->dma_list);
kvfree(umem_odp->pfn_list);
}
mutex_lock(&umem_odp->umem_mutex);
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
ib_umem_end(umem_odp));
mutex_unlock(&umem_odp->umem_mutex);
mmu_interval_notifier_remove(&umem_odp->notifier);
if (ib_uses_virt_dma(dev))
kfree(umem_odp->map.pfn_list);
else
hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
if (!umem_odp->is_implicit_odp)
ib_umem_odp_free(umem_odp);
put_pid(umem_odp->tgid);
kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);
/*
* Map for DMA and insert a single page into the on-demand paging page tables.
*
* @umem: the umem to insert the page to.
* @dma_index: index in the umem to add the dma to.
* @page: the page struct to map and add.
* @access_mask: access permissions needed for this page.
*
* The function returns -EFAULT if the DMA mapping operation fails.
*
*/
static int ib_umem_odp_map_dma_single_page(
struct ib_umem_odp *umem_odp,
unsigned int dma_index,
struct page *page,
u64 access_mask)
{
struct ib_device *dev = umem_odp->umem.ibdev;
dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
if (*dma_addr) {
/*
* If the page is already dma mapped it means it went through
* a non-invalidating trasition, like read-only to writable.
* Resync the flags.
*/
*dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
return 0;
}
*dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
DMA_BIDIRECTIONAL);
if (ib_dma_mapping_error(dev, *dma_addr)) {
*dma_addr = 0;
return -EFAULT;
}
umem_odp->npages++;
*dma_addr |= access_mask;
return 0;
}
/**
* ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
*
* Maps the range passed in the argument to DMA addresses.
* The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
* Upon success the ODP MR will be locked to let caller complete its device
* page table update.
*
@ -357,9 +322,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
struct hmm_range range = {};
unsigned long timeout;
if (access_mask == 0)
return -EINVAL;
if (user_virt < ib_umem_start(umem_odp) ||
user_virt + bcnt > ib_umem_end(umem_odp))
return -EFAULT;
@ -385,11 +347,11 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
if (fault) {
range.default_flags = HMM_PFN_REQ_FAULT;
if (access_mask & ODP_WRITE_ALLOWED_BIT)
if (access_mask & HMM_PFN_WRITE)
range.default_flags |= HMM_PFN_REQ_WRITE;
}
range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
retry:
@ -417,22 +379,17 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
for (pfn_index = 0; pfn_index < num_pfns;
pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
if (fault) {
/*
* Since we asked for hmm_range_fault() to populate
* pages it shouldn't return an error entry on success.
*/
WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
} else {
if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
WARN_ON(umem_odp->dma_list[dma_index]);
continue;
}
access_mask = ODP_READ_ALLOWED_BIT;
if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
access_mask |= ODP_WRITE_ALLOWED_BIT;
}
/*
* Since we asked for hmm_range_fault() to populate
* pages it shouldn't return an error entry on success.
*/
WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
continue;
if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
continue;
hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
/* If a hugepage was detected and ODP wasn't set for, the umem
@ -445,15 +402,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
__func__, hmm_order, page_shift);
break;
}
ret = ib_umem_odp_map_dma_single_page(
umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
access_mask);
if (ret < 0) {
ibdev_dbg(umem_odp->umem.ibdev,
"ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
break;
}
}
/* upon success lock should stay on hold for the callee */
if (!ret)
@ -473,45 +421,38 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
u64 bound)
{
dma_addr_t dma_addr;
dma_addr_t dma;
int idx;
u64 addr;
struct ib_device *dev = umem_odp->umem.ibdev;
u64 addr;
lockdep_assert_held(&umem_odp->umem_mutex);
virt = max_t(u64, virt, ib_umem_start(umem_odp));
bound = min_t(u64, bound, ib_umem_end(umem_odp));
for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
dma = umem_odp->dma_list[idx];
u64 offset = addr - ib_umem_start(umem_odp);
size_t idx = offset >> umem_odp->page_shift;
unsigned long pfn = umem_odp->map.pfn_list[idx];
/* The access flags guaranteed a valid DMA address in case was NULL */
if (dma) {
unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
goto clear;
dma_addr = dma & ODP_DMA_ADDR_MASK;
ib_dma_unmap_page(dev, dma_addr,
BIT(umem_odp->page_shift),
DMA_BIDIRECTIONAL);
if (dma & ODP_WRITE_ALLOWED_BIT) {
struct page *head_page = compound_head(page);
/*
* set_page_dirty prefers being called with
* the page lock. However, MMU notifiers are
* called sometimes with and sometimes without
* the lock. We rely on the umem_mutex instead
* to prevent other mmu notifiers from
* continuing and allowing the page mapping to
* be removed.
*/
set_page_dirty(head_page);
}
umem_odp->dma_list[idx] = 0;
umem_odp->npages--;
if (pfn & HMM_PFN_WRITE) {
struct page *page = hmm_pfn_to_page(pfn);
struct page *head_page = compound_head(page);
/*
* set_page_dirty prefers being called with
* the page lock. However, MMU notifiers are
* called sometimes with and sometimes without
* the lock. We rely on the umem_mutex instead
* to prevent other mmu notifiers from
* continuing and allowing the page mapping to
* be removed.
*/
set_page_dirty(head_page);
}
umem_odp->npages--;
clear:
umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

View File

@ -193,7 +193,7 @@ _ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
fd, attrs);
if (IS_ERR(uobj))
return (void *)uobj;
return ERR_CAST(uobj);
uverbs_uobject_get(uobj);
uobj_put_read(uobj);

View File

@ -572,7 +572,7 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
GFP_KERNEL : GFP_ATOMIC);
if (IS_ERR(slave)) {
rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
return (void *)slave;
return ERR_CAST(slave);
}
ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave);
rdma_lag_put_ah_roce_slave(slave);

View File

@ -170,6 +170,9 @@ static int map_cc_config_offset_gen0_ext0(u32 offset, struct bnxt_qplib_cc_param
case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TCP_CP:
*val = ccparam->tcp_cp;
break;
case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP:
*val = ccparam->inact_th;
break;
default:
return -EINVAL;
}
@ -203,7 +206,7 @@ static ssize_t bnxt_re_cc_config_get(struct file *filp, char __user *buffer,
return simple_read_from_buffer(buffer, usr_buf_len, ppos, (u8 *)(buf), rc);
}
static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offset, u32 val)
static int bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offset, u32 val)
{
u32 modify_mask;
@ -247,7 +250,9 @@ static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offs
ccparam->tcp_cp = val;
break;
case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TX_QUEUE:
return -EOPNOTSUPP;
case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_INACTIVITY_CP:
ccparam->inact_th = val;
break;
case CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TIME_PER_PHASE:
ccparam->time_pph = val;
@ -258,17 +263,20 @@ static void bnxt_re_fill_gen0_ext0(struct bnxt_qplib_cc_param *ccparam, u32 offs
}
ccparam->mask = modify_mask;
return 0;
}
static int bnxt_re_configure_cc(struct bnxt_re_dev *rdev, u32 gen_ext, u32 offset, u32 val)
{
struct bnxt_qplib_cc_param ccparam = { };
int rc;
/* Supporting only Gen 0 now */
if (gen_ext == CC_CONFIG_GEN0_EXT0)
bnxt_re_fill_gen0_ext0(&ccparam, offset, val);
else
return -EINVAL;
if (gen_ext != CC_CONFIG_GEN0_EXT0)
return -EOPNOTSUPP;
rc = bnxt_re_fill_gen0_ext0(&ccparam, offset, val);
if (rc)
return rc;
bnxt_qplib_modify_cc(&rdev->qplib_res, &ccparam);
return 0;

View File

@ -1113,7 +1113,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION;
if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE)
qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_VARIABLE_SIZED_WQE_ENABLED;
if (_is_ext_stats_supported(res->dattr->dev_cap_flags) && !res->is_vf)
if (bnxt_ext_stats_supported(res->cctx, res->dattr->dev_cap_flags, res->is_vf))
qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED;
req.qp_flags = cpu_to_le32(qp_flags);

View File

@ -846,7 +846,12 @@ int bnxt_qplib_qext_stat(struct bnxt_qplib_rcfw *rcfw, u32 fid,
req.resp_size = sbuf.size / BNXT_QPLIB_CMDQE_UNITS;
req.resp_addr = cpu_to_le64(sbuf.dma_addr);
req.function_id = cpu_to_le32(fid);
if (bnxt_qplib_is_chip_gen_p7(rcfw->res->cctx) && rcfw->res->is_vf)
req.function_id =
cpu_to_le32(CMDQ_QUERY_ROCE_STATS_EXT_VF_VALID |
(fid << CMDQ_QUERY_ROCE_STATS_EXT_VF_NUM_SFT));
else
req.function_id = cpu_to_le32(fid);
req.flags = cpu_to_le16(CMDQ_QUERY_ROCE_STATS_EXT_FLAGS_FUNCTION_ID);
bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req),

View File

@ -124,7 +124,6 @@ struct opa_mad_notice_attr {
} __packed ntc_2048;
};
u8 class_data[];
};
#define IB_VLARB_LOWPRI_0_31 1

View File

@ -1361,16 +1361,6 @@ void sc_flush(struct send_context *sc)
sc_wait_for_packet_egress(sc, 1);
}
/* drop all packets on the context, no waiting until they are sent */
void sc_drop(struct send_context *sc)
{
if (!sc)
return;
dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n",
__func__, sc->sw_index, sc->hw_context);
}
/*
* Start the software reaction to a context halt or SPC freeze:
* - mark the context as halted or frozen

View File

@ -246,7 +246,6 @@ void sc_disable(struct send_context *sc);
int sc_restart(struct send_context *sc);
void sc_return_credits(struct send_context *sc);
void sc_flush(struct send_context *sc);
void sc_drop(struct send_context *sc);
void sc_stop(struct send_context *sc, int bit);
struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
pio_release_cb cb, void *arg);

View File

@ -1520,24 +1520,6 @@ void sdma_all_running(struct hfi1_devdata *dd)
}
}
/**
* sdma_all_idle() - called when the link goes down
* @dd: hfi1_devdata
*
* This routine moves all engines to the idle state.
*/
void sdma_all_idle(struct hfi1_devdata *dd)
{
struct sdma_engine *sde;
unsigned int i;
/* idle all engines */
for (i = 0; i < dd->num_sdma; ++i) {
sde = &dd->per_sdma[i];
sdma_process_event(sde, sdma_event_e70_go_idle);
}
}
/**
* sdma_start() - called to kick off state processing for all engines
* @dd: hfi1_devdata

View File

@ -373,7 +373,6 @@ void sdma_start(struct hfi1_devdata *dd);
void sdma_exit(struct hfi1_devdata *dd);
void sdma_clean(struct hfi1_devdata *dd, size_t num_engines);
void sdma_all_running(struct hfi1_devdata *dd);
void sdma_all_idle(struct hfi1_devdata *dd);
void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle);
void sdma_freeze(struct hfi1_devdata *dd);
void sdma_unfreeze(struct hfi1_devdata *dd);

View File

@ -53,7 +53,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
int ret = 0;
fd->entry_to_rb = kcalloc(uctxt->expected_count,
sizeof(struct rb_node *),
sizeof(*fd->entry_to_rb),
GFP_KERNEL);
if (!fd->entry_to_rb)
return -ENOMEM;

View File

@ -4,6 +4,7 @@
#
ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
ccflags-y += -I $(src)
hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \

View File

@ -33,7 +33,6 @@
#include <linux/pci.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include "hnae3.h"
#include "hns_roce_device.h"
#include "hns_roce_hw_v2.h"

View File

@ -1027,6 +1027,26 @@ struct hns_roce_dev {
atomic64_t *dfx_cnt;
};
enum hns_roce_trace_type {
TRACE_SQ,
TRACE_RQ,
TRACE_SRQ,
};
static inline const char *trace_type_to_str(enum hns_roce_trace_type type)
{
switch (type) {
case TRACE_SQ:
return "SQ";
case TRACE_RQ:
return "RQ";
case TRACE_SRQ:
return "SRQ";
default:
return "UNKNOWN";
}
}
static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
{
return container_of(ib_dev, struct hns_roce_dev, ib_dev);

View File

@ -43,13 +43,15 @@
#include <rdma/ib_umem.h>
#include <rdma/uverbs_ioctl.h>
#include "hnae3.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_cmd.h"
#include "hns_roce_hem.h"
#include "hns_roce_hw_v2.h"
#define CREATE_TRACE_POINTS
#include "hns_roce_trace.h"
enum {
CMD_RST_PRC_OTHERS,
CMD_RST_PRC_SUCCESS,
@ -738,6 +740,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
else
ret = set_ud_wqe(qp, wr, wqe, &sge_idx, owner_bit);
trace_hns_sq_wqe(qp->qpn, wqe_idx, wqe, 1 << qp->sq.wqe_shift,
wr->wr_id, TRACE_SQ);
if (unlikely(ret)) {
*bad_wr = wr;
goto out;
@ -807,6 +811,9 @@ static void fill_rq_wqe(struct hns_roce_qp *hr_qp, const struct ib_recv_wr *wr,
wqe = hns_roce_get_recv_wqe(hr_qp, wqe_idx);
fill_recv_sge_to_wqe(wr, wqe, max_sge, hr_qp->rq.rsv_sge);
trace_hns_rq_wqe(hr_qp->qpn, wqe_idx, wqe, 1 << hr_qp->rq.wqe_shift,
wr->wr_id, TRACE_RQ);
}
static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
@ -943,7 +950,7 @@ static void fill_wqe_idx(struct hns_roce_srq *srq, unsigned int wqe_idx)
static void update_srq_db(struct hns_roce_srq *srq)
{
struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
struct hns_roce_v2_db db;
struct hns_roce_v2_db db = {};
hr_reg_write(&db, DB_TAG, srq->srqn);
hr_reg_write(&db, DB_CMD, HNS_ROCE_V2_SRQ_DB);
@ -984,6 +991,9 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
fill_recv_sge_to_wqe(wr, wqe, max_sge, srq->rsv_sge);
fill_wqe_idx(srq, wqe_idx);
srq->wrid[wqe_idx] = wr->wr_id;
trace_hns_srq_wqe(srq->srqn, wqe_idx, wqe, 1 << srq->wqe_shift,
wr->wr_id, TRACE_SRQ);
}
if (likely(nreq)) {
@ -1311,6 +1321,8 @@ static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev,
tail = csq->head;
for (i = 0; i < num; i++) {
trace_hns_cmdq_req(hr_dev, &desc[i]);
csq->desc[csq->head++] = desc[i];
if (csq->head == csq->desc_num)
csq->head = 0;
@ -1325,6 +1337,8 @@ static int __hns_roce_cmq_send_one(struct hns_roce_dev *hr_dev,
if (hns_roce_cmq_csq_done(hr_dev)) {
ret = 0;
for (i = 0; i < num; i++) {
trace_hns_cmdq_resp(hr_dev, &csq->desc[tail]);
/* check the result of hardware write back */
desc_ret = le16_to_cpu(csq->desc[tail++].retval);
if (tail == csq->desc_num)
@ -4302,8 +4316,7 @@ static inline int get_pdn(struct ib_pd *ib_pd)
}
static void modify_qp_reset_to_init(struct ib_qp *ibqp,
struct hns_roce_v2_qp_context *context,
struct hns_roce_v2_qp_context *qpc_mask)
struct hns_roce_v2_qp_context *context)
{
struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
@ -5122,7 +5135,7 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
memset(qpc_mask, 0, hr_dev->caps.qpc_sz);
modify_qp_reset_to_init(ibqp, context, qpc_mask);
modify_qp_reset_to_init(ibqp, context);
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) {
modify_qp_init_to_init(ibqp, context, qpc_mask);
} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
@ -5313,6 +5326,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp,
return;
spin_lock_irqsave(&hr_qp->sq.lock, sq_flag);
trace_hns_sq_flush_cqe(hr_qp->qpn, hr_qp->sq.head, TRACE_SQ);
hr_reg_write(context, QPC_SQ_PRODUCER_IDX, hr_qp->sq.head);
hr_reg_clear(qpc_mask, QPC_SQ_PRODUCER_IDX);
hr_qp->state = IB_QPS_ERR;
@ -5322,6 +5336,7 @@ static void v2_set_flushed_fields(struct ib_qp *ibqp,
return;
spin_lock_irqsave(&hr_qp->rq.lock, rq_flag);
trace_hns_rq_flush_cqe(hr_qp->qpn, hr_qp->rq.head, TRACE_RQ);
hr_reg_write(context, QPC_RQ_PRODUCER_IDX, hr_qp->rq.head);
hr_reg_clear(qpc_mask, QPC_RQ_PRODUCER_IDX);
spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag);
@ -6248,6 +6263,7 @@ static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
eq->sub_type = sub_type;
++eq->cons_index;
aeqe_found = IRQ_HANDLED;
trace_hns_ae_info(event_type, aeqe, eq->eqe_size);
atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_AEQE_CNT]);

View File

@ -34,6 +34,7 @@
#define _HNS_ROCE_HW_V2_H
#include <linux/bitops.h>
#include "hnae3.h"
#define HNS_ROCE_V2_MAX_RC_INL_INN_SZ 32
#define HNS_ROCE_V2_MTT_ENTRY_SZ 64

View File

@ -37,7 +37,6 @@
#include <rdma/ib_smi.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_cache.h>
#include "hnae3.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_hem.h"

View File

@ -38,6 +38,7 @@
#include "hns_roce_device.h"
#include "hns_roce_cmd.h"
#include "hns_roce_hem.h"
#include "hns_roce_trace.h"
static u32 hw_index_to_key(int ind)
{
@ -159,6 +160,7 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev,
if (IS_ERR(mailbox))
return PTR_ERR(mailbox);
trace_hns_mr(mr);
if (mr->type != MR_TYPE_FRMR)
ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr);
else
@ -1146,6 +1148,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
struct ib_device *ibdev = &hr_dev->ib_dev;
int ret;
trace_hns_buf_attr(buf_attr);
/* The caller has its own buffer list and invokes the hns_roce_mtr_map()
* to finish the MTT configuration.
*/

View File

@ -4,7 +4,6 @@
#include <rdma/rdma_cm.h>
#include <rdma/restrack.h>
#include <uapi/rdma/rdma_netlink.h>
#include "hnae3.h"
#include "hns_roce_common.h"
#include "hns_roce_device.h"
#include "hns_roce_hw_v2.h"

View File

@ -0,0 +1,216 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Copyright (c) 2025 Hisilicon Limited.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM hns_roce
#if !defined(__HNS_ROCE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define __HNS_ROCE_TRACE_H
#include <linux/tracepoint.h>
#include <linux/string_choices.h>
#include "hns_roce_device.h"
#include "hns_roce_hw_v2.h"
DECLARE_EVENT_CLASS(flush_head_template,
TP_PROTO(unsigned long qpn, u32 pi,
enum hns_roce_trace_type type),
TP_ARGS(qpn, pi, type),
TP_STRUCT__entry(__field(unsigned long, qpn)
__field(u32, pi)
__field(enum hns_roce_trace_type, type)
),
TP_fast_assign(__entry->qpn = qpn;
__entry->pi = pi;
__entry->type = type;
),
TP_printk("%s 0x%lx flush head 0x%x.",
trace_type_to_str(__entry->type),
__entry->qpn, __entry->pi)
);
DEFINE_EVENT(flush_head_template, hns_sq_flush_cqe,
TP_PROTO(unsigned long qpn, u32 pi,
enum hns_roce_trace_type type),
TP_ARGS(qpn, pi, type));
DEFINE_EVENT(flush_head_template, hns_rq_flush_cqe,
TP_PROTO(unsigned long qpn, u32 pi,
enum hns_roce_trace_type type),
TP_ARGS(qpn, pi, type));
#define MAX_SGE_PER_WQE 64
#define MAX_WQE_SIZE (MAX_SGE_PER_WQE * HNS_ROCE_SGE_SIZE)
DECLARE_EVENT_CLASS(wqe_template,
TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len,
u64 id, enum hns_roce_trace_type type),
TP_ARGS(qpn, idx, wqe, len, id, type),
TP_STRUCT__entry(__field(unsigned long, qpn)
__field(u32, idx)
__array(u32, wqe,
MAX_WQE_SIZE / sizeof(__le32))
__field(u32, len)
__field(u64, id)
__field(enum hns_roce_trace_type, type)
),
TP_fast_assign(__entry->qpn = qpn;
__entry->idx = idx;
__entry->id = id;
__entry->len = len / sizeof(__le32);
__entry->type = type;
for (int i = 0; i < __entry->len; i++)
__entry->wqe[i] = le32_to_cpu(((__le32 *)wqe)[i]);
),
TP_printk("%s 0x%lx wqe(0x%x/0x%llx): %s",
trace_type_to_str(__entry->type),
__entry->qpn, __entry->idx, __entry->id,
__print_array(__entry->wqe, __entry->len,
sizeof(__le32)))
);
DEFINE_EVENT(wqe_template, hns_sq_wqe,
TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id,
enum hns_roce_trace_type type),
TP_ARGS(qpn, idx, wqe, len, id, type));
DEFINE_EVENT(wqe_template, hns_rq_wqe,
TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id,
enum hns_roce_trace_type type),
TP_ARGS(qpn, idx, wqe, len, id, type));
DEFINE_EVENT(wqe_template, hns_srq_wqe,
TP_PROTO(unsigned long qpn, u32 idx, void *wqe, u32 len, u64 id,
enum hns_roce_trace_type type),
TP_ARGS(qpn, idx, wqe, len, id, type));
TRACE_EVENT(hns_ae_info,
TP_PROTO(int event_type, void *aeqe, unsigned int len),
TP_ARGS(event_type, aeqe, len),
TP_STRUCT__entry(__field(int, event_type)
__array(u32, aeqe,
HNS_ROCE_V3_EQE_SIZE / sizeof(__le32))
__field(u32, len)
),
TP_fast_assign(__entry->event_type = event_type;
__entry->len = len / sizeof(__le32);
for (int i = 0; i < __entry->len; i++)
__entry->aeqe[i] = le32_to_cpu(((__le32 *)aeqe)[i]);
),
TP_printk("event %2d aeqe: %s", __entry->event_type,
__print_array(__entry->aeqe, __entry->len, sizeof(__le32)))
);
TRACE_EVENT(hns_mr,
TP_PROTO(struct hns_roce_mr *mr),
TP_ARGS(mr),
TP_STRUCT__entry(__field(u64, iova)
__field(u64, size)
__field(u32, key)
__field(u32, pd)
__field(u32, pbl_hop_num)
__field(u32, npages)
__field(int, type)
__field(int, enabled)
),
TP_fast_assign(__entry->iova = mr->iova;
__entry->size = mr->size;
__entry->key = mr->key;
__entry->pd = mr->pd;
__entry->pbl_hop_num = mr->pbl_hop_num;
__entry->npages = mr->npages;
__entry->type = mr->type;
__entry->enabled = mr->enabled;
),
TP_printk("iova:0x%llx, size:%llu, key:%u, pd:%u, pbl_hop:%u, npages:%u, type:%d, status:%d",
__entry->iova, __entry->size, __entry->key,
__entry->pd, __entry->pbl_hop_num, __entry->npages,
__entry->type, __entry->enabled)
);
TRACE_EVENT(hns_buf_attr,
TP_PROTO(struct hns_roce_buf_attr *attr),
TP_ARGS(attr),
TP_STRUCT__entry(__field(unsigned int, region_count)
__field(unsigned int, region0_size)
__field(int, region0_hopnum)
__field(unsigned int, region1_size)
__field(int, region1_hopnum)
__field(unsigned int, region2_size)
__field(int, region2_hopnum)
__field(unsigned int, page_shift)
__field(bool, mtt_only)
),
TP_fast_assign(__entry->region_count = attr->region_count;
__entry->region0_size = attr->region[0].size;
__entry->region0_hopnum = attr->region[0].hopnum;
__entry->region1_size = attr->region[1].size;
__entry->region1_hopnum = attr->region[1].hopnum;
__entry->region2_size = attr->region[2].size;
__entry->region2_hopnum = attr->region[2].hopnum;
__entry->page_shift = attr->page_shift;
__entry->mtt_only = attr->mtt_only;
),
TP_printk("rg cnt:%u, pg_sft:0x%x, mtt_only:%s, rg 0 (sz:%u, hop:%u), rg 1 (sz:%u, hop:%u), rg 2 (sz:%u, hop:%u)\n",
__entry->region_count, __entry->page_shift,
str_yes_no(__entry->mtt_only),
__entry->region0_size, __entry->region0_hopnum,
__entry->region1_size, __entry->region1_hopnum,
__entry->region2_size, __entry->region2_hopnum)
);
DECLARE_EVENT_CLASS(cmdq,
TP_PROTO(struct hns_roce_dev *hr_dev,
struct hns_roce_cmq_desc *desc),
TP_ARGS(hr_dev, desc),
TP_STRUCT__entry(__string(dev_name, dev_name(hr_dev->dev))
__field(u16, opcode)
__field(u16, flag)
__field(u16, retval)
__array(u32, data, 6)
),
TP_fast_assign(__assign_str(dev_name);
__entry->opcode = le16_to_cpu(desc->opcode);
__entry->flag = le16_to_cpu(desc->flag);
__entry->retval = le16_to_cpu(desc->retval);
for (int i = 0; i < 6; i++)
__entry->data[i] = le32_to_cpu(desc->data[i]);
),
TP_printk("%s cmdq opcode:0x%x, flag:0x%x, retval:0x%x, data:%s\n",
__get_str(dev_name), __entry->opcode,
__entry->flag, __entry->retval,
__print_array(__entry->data, 6, sizeof(__le32)))
);
DEFINE_EVENT(cmdq, hns_cmdq_req,
TP_PROTO(struct hns_roce_dev *hr_dev,
struct hns_roce_cmq_desc *desc),
TP_ARGS(hr_dev, desc));
DEFINE_EVENT(cmdq, hns_cmdq_resp,
TP_PROTO(struct hns_roce_dev *hr_dev,
struct hns_roce_cmq_desc *desc),
TP_ARGS(hr_dev, desc));
#endif /* __HNS_ROCE_TRACE_H */
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE hns_roce_trace
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#include <trace/define_trace.h>

View File

@ -3131,7 +3131,7 @@ int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp,
writel(0, cqp->dev->hw_regs[IRDMA_CCQPSTATUS]);
ibdev_dbg(to_ibdev(cqp->dev),
"WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%pK] cqp[%p] polarity[x%04x]\n",
"WQE: sq_size[%04d] hw_sq_size[%04d] sq_base[%p] sq_pa[%p] cqp[%p] polarity[x%04x]\n",
cqp->sq_size, cqp->hw_sq_size, cqp->sq_base,
(u64 *)(uintptr_t)cqp->sq_pa, cqp, cqp->polarity);
return 0;

View File

@ -108,7 +108,7 @@ static int add_sd_direct(struct irdma_hmc_pble_rsrc *pble_rsrc,
chunk->vaddr = sd_entry->u.bp.addr.va + offset;
chunk->fpm_addr = pble_rsrc->next_fpm_addr;
ibdev_dbg(to_ibdev(dev),
"PBLE: chunk_size[%lld] = 0x%llx vaddr=0x%pK fpm_addr = %llx\n",
"PBLE: chunk_size[%lld] = 0x%llx vaddr=0x%p fpm_addr = %llx\n",
chunk->size, chunk->size, chunk->vaddr, chunk->fpm_addr);
return 0;

View File

@ -15,14 +15,12 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct ib_device *ibdev = ibcq->device;
struct mana_ib_create_cq ucmd = {};
struct mana_ib_dev *mdev;
struct gdma_context *gc;
bool is_rnic_cq;
u32 doorbell;
u32 buf_size;
int err;
mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
gc = mdev_to_gc(mdev);
cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors;
cq->cq_handle = INVALID_MANA_HANDLE;
@ -65,7 +63,7 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
ibdev_dbg(ibdev, "Failed to create kernel queue for create cq, %d\n", err);
return err;
}
doorbell = gc->mana_ib.doorbell;
doorbell = mdev->gdma_dev->doorbell;
}
if (is_rnic_cq) {

View File

@ -101,103 +101,95 @@ static int mana_ib_probe(struct auxiliary_device *adev,
const struct auxiliary_device_id *id)
{
struct mana_adev *madev = container_of(adev, struct mana_adev, adev);
struct gdma_context *gc = madev->mdev->gdma_context;
struct mana_context *mc = gc->mana.driver_data;
struct gdma_dev *mdev = madev->mdev;
struct net_device *ndev;
struct mana_context *mc;
struct mana_ib_dev *dev;
u8 mac_addr[ETH_ALEN];
int ret;
mc = mdev->driver_data;
dev = ib_alloc_device(mana_ib_dev, ib_dev);
if (!dev)
return -ENOMEM;
ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops);
dev->ib_dev.phys_port_cnt = mc->num_ports;
ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev,
mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt);
dev->ib_dev.node_type = RDMA_NODE_IB_CA;
/*
* num_comp_vectors needs to set to the max MSIX index
* when interrupts and event queues are implemented
*/
dev->ib_dev.num_comp_vectors = mdev->gdma_context->max_num_queues;
dev->ib_dev.dev.parent = mdev->gdma_context->dev;
ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker);
if (!ndev) {
ret = -ENODEV;
ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1");
goto free_ib_device;
}
ether_addr_copy(mac_addr, ndev->dev_addr);
addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr);
ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1);
/* mana_get_primary_netdev() returns ndev with refcount held */
netdev_put(ndev, &dev->dev_tracker);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret);
goto free_ib_device;
}
ret = mana_gd_register_device(&mdev->gdma_context->mana_ib);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to register device, ret %d",
ret);
goto free_ib_device;
}
dev->gdma_dev = &mdev->gdma_context->mana_ib;
dev->nb.notifier_call = mana_ib_netdev_event;
ret = register_netdevice_notifier(&dev->nb);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d",
ret);
goto deregister_device;
}
ret = mana_ib_gd_query_adapter_caps(dev);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d",
ret);
goto deregister_net_notifier;
}
ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops);
ret = mana_ib_create_eqs(dev);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret);
goto deregister_net_notifier;
}
ret = mana_ib_gd_create_rnic_adapter(dev);
if (ret)
goto destroy_eqs;
dev->ib_dev.num_comp_vectors = gc->max_num_queues;
dev->ib_dev.dev.parent = gc->dev;
dev->gdma_dev = mdev;
xa_init_flags(&dev->qp_table_wq, XA_FLAGS_LOCK_IRQ);
ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d",
ret);
goto destroy_rnic;
if (mana_ib_is_rnic(dev)) {
dev->ib_dev.phys_port_cnt = 1;
ndev = mana_get_primary_netdev(mc, 0, &dev->dev_tracker);
if (!ndev) {
ret = -ENODEV;
ibdev_err(&dev->ib_dev, "Failed to get netdev for IB port 1");
goto free_ib_device;
}
ether_addr_copy(mac_addr, ndev->dev_addr);
addrconf_addr_eui48((u8 *)&dev->ib_dev.node_guid, ndev->dev_addr);
ret = ib_device_set_netdev(&dev->ib_dev, ndev, 1);
/* mana_get_primary_netdev() returns ndev with refcount held */
netdev_put(ndev, &dev->dev_tracker);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to set ib netdev, ret %d", ret);
goto free_ib_device;
}
dev->nb.notifier_call = mana_ib_netdev_event;
ret = register_netdevice_notifier(&dev->nb);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to register net notifier, %d",
ret);
goto free_ib_device;
}
ret = mana_ib_gd_query_adapter_caps(dev);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to query device caps, ret %d", ret);
goto deregister_net_notifier;
}
ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops);
ret = mana_ib_create_eqs(dev);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to create EQs, ret %d", ret);
goto deregister_net_notifier;
}
ret = mana_ib_gd_create_rnic_adapter(dev);
if (ret)
goto destroy_eqs;
ret = mana_ib_gd_config_mac(dev, ADDR_OP_ADD, mac_addr);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to add Mac address, ret %d", ret);
goto destroy_rnic;
}
} else {
dev->ib_dev.phys_port_cnt = mc->num_ports;
ret = mana_eth_query_adapter_caps(dev);
if (ret) {
ibdev_err(&dev->ib_dev, "Failed to query ETH device caps, ret %d", ret);
goto free_ib_device;
}
}
dev->av_pool = dma_pool_create("mana_ib_av", mdev->gdma_context->dev,
MANA_AV_BUFFER_SIZE, MANA_AV_BUFFER_SIZE, 0);
dev->av_pool = dma_pool_create("mana_ib_av", gc->dev, MANA_AV_BUFFER_SIZE,
MANA_AV_BUFFER_SIZE, 0);
if (!dev->av_pool) {
ret = -ENOMEM;
goto destroy_rnic;
}
ret = ib_register_device(&dev->ib_dev, "mana_%d",
mdev->gdma_context->dev);
ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev,
mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt);
ret = ib_register_device(&dev->ib_dev, mana_ib_is_rnic(dev) ? "mana_%d" : "manae_%d",
gc->dev);
if (ret)
goto deallocate_pool;
@ -208,15 +200,16 @@ static int mana_ib_probe(struct auxiliary_device *adev,
deallocate_pool:
dma_pool_destroy(dev->av_pool);
destroy_rnic:
xa_destroy(&dev->qp_table_wq);
mana_ib_gd_destroy_rnic_adapter(dev);
if (mana_ib_is_rnic(dev))
mana_ib_gd_destroy_rnic_adapter(dev);
destroy_eqs:
mana_ib_destroy_eqs(dev);
if (mana_ib_is_rnic(dev))
mana_ib_destroy_eqs(dev);
deregister_net_notifier:
unregister_netdevice_notifier(&dev->nb);
deregister_device:
mana_gd_deregister_device(dev->gdma_dev);
if (mana_ib_is_rnic(dev))
unregister_netdevice_notifier(&dev->nb);
free_ib_device:
xa_destroy(&dev->qp_table_wq);
ib_dealloc_device(&dev->ib_dev);
return ret;
}
@ -227,25 +220,24 @@ static void mana_ib_remove(struct auxiliary_device *adev)
ib_unregister_device(&dev->ib_dev);
dma_pool_destroy(dev->av_pool);
if (mana_ib_is_rnic(dev)) {
mana_ib_gd_destroy_rnic_adapter(dev);
mana_ib_destroy_eqs(dev);
unregister_netdevice_notifier(&dev->nb);
}
xa_destroy(&dev->qp_table_wq);
mana_ib_gd_destroy_rnic_adapter(dev);
mana_ib_destroy_eqs(dev);
unregister_netdevice_notifier(&dev->nb);
mana_gd_deregister_device(dev->gdma_dev);
ib_dealloc_device(&dev->ib_dev);
}
static const struct auxiliary_device_id mana_id_table[] = {
{
.name = "mana.rdma",
},
{ .name = "mana.rdma", },
{ .name = "mana.eth", },
{},
};
MODULE_DEVICE_TABLE(auxiliary, mana_id_table);
static struct auxiliary_driver mana_driver = {
.name = "rdma",
.probe = mana_ib_probe,
.remove = mana_ib_remove,
.id_table = mana_id_table,

View File

@ -4,6 +4,7 @@
*/
#include "mana_ib.h"
#include "linux/pci.h"
void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
u32 port)
@ -243,7 +244,6 @@ void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_queue_type type,
struct mana_ib_queue *queue)
{
struct gdma_context *gc = mdev_to_gc(mdev);
struct gdma_queue_spec spec = {};
int err;
@ -252,7 +252,7 @@ int mana_ib_create_kernel_queue(struct mana_ib_dev *mdev, u32 size, enum gdma_qu
spec.type = type;
spec.monitor_avl_buf = false;
spec.queue_size = size;
err = mana_gd_create_mana_wq_cq(&gc->mana_ib, &spec, &queue->kmem);
err = mana_gd_create_mana_wq_cq(mdev->gdma_dev, &spec, &queue->kmem);
if (err)
return err;
/* take ownership into mana_ib from mana */
@ -479,7 +479,7 @@ int mana_ib_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
{
unsigned long page_sz;
page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, virt);
page_sz = ib_umem_find_best_pgsz(umem, dev->adapter_caps.page_size_cap, virt);
if (!page_sz) {
ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n");
return -EINVAL;
@ -494,7 +494,7 @@ int mana_ib_create_zero_offset_dma_region(struct mana_ib_dev *dev, struct ib_ume
unsigned long page_sz;
/* Hardware requires dma region to align to chosen page size */
page_sz = ib_umem_find_best_pgoff(umem, PAGE_SZ_BM, 0);
page_sz = ib_umem_find_best_pgoff(umem, dev->adapter_caps.page_size_cap, 0);
if (!page_sz) {
ibdev_dbg(&dev->ib_dev, "Failed to find page size.\n");
return -EINVAL;
@ -551,6 +551,7 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num,
struct ib_port_immutable *immutable)
{
struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
struct ib_port_attr attr;
int err;
@ -560,10 +561,12 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num,
immutable->pkey_tbl_len = attr.pkey_tbl_len;
immutable->gid_tbl_len = attr.gid_tbl_len;
immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
if (port_num == 1) {
immutable->core_cap_flags |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
if (mana_ib_is_rnic(dev)) {
immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
immutable->max_mad_size = IB_MGMT_MAD_SIZE;
} else {
immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
}
return 0;
@ -572,12 +575,14 @@ int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num,
int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
struct ib_udata *uhw)
{
struct mana_ib_dev *dev = container_of(ibdev,
struct mana_ib_dev, ib_dev);
struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
struct pci_dev *pdev = to_pci_dev(mdev_to_gc(dev)->dev);
memset(props, 0, sizeof(*props));
props->vendor_id = pdev->vendor;
props->vendor_part_id = dev->gdma_dev->dev_id.type;
props->max_mr_size = MANA_IB_MAX_MR_SIZE;
props->page_size_cap = PAGE_SZ_BM;
props->page_size_cap = dev->adapter_caps.page_size_cap;
props->max_qp = dev->adapter_caps.max_qp_count;
props->max_qp_wr = dev->adapter_caps.max_qp_wr;
props->device_cap_flags = IB_DEVICE_RC_RNR_NAK_GEN;
@ -596,6 +601,8 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
props->max_ah = INT_MAX;
props->max_pkeys = 1;
props->local_ca_ack_delay = MANA_CA_ACK_DELAY;
if (!mana_ib_is_rnic(dev))
props->raw_packet_caps = IB_RAW_PACKET_CAP_IP_CSUM;
return 0;
}
@ -603,6 +610,7 @@ int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
int mana_ib_query_port(struct ib_device *ibdev, u32 port,
struct ib_port_attr *props)
{
struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
struct net_device *ndev = mana_ib_get_netdev(ibdev, port);
if (!ndev)
@ -623,7 +631,7 @@ int mana_ib_query_port(struct ib_device *ibdev, u32 port,
props->active_width = IB_WIDTH_4X;
props->active_speed = IB_SPEED_EDR;
props->pkey_tbl_len = 1;
if (port == 1) {
if (mana_ib_is_rnic(dev)) {
props->gid_tbl_len = 16;
props->port_cap_flags = IB_PORT_CM_SUP;
props->ip_gids = true;
@ -696,6 +704,41 @@ int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *dev)
caps->max_recv_sge_count = resp.max_recv_sge_count;
caps->feature_flags = resp.feature_flags;
caps->page_size_cap = PAGE_SZ_BM;
if (mdev_to_gc(dev)->pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB)
caps->page_size_cap |= (SZ_4M | SZ_1G | SZ_2G);
return 0;
}
int mana_eth_query_adapter_caps(struct mana_ib_dev *dev)
{
struct mana_ib_adapter_caps *caps = &dev->adapter_caps;
struct gdma_query_max_resources_resp resp = {};
struct gdma_general_req req = {};
int err;
mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES,
sizeof(req), sizeof(resp));
err = mana_gd_send_request(mdev_to_gc(dev), sizeof(req), &req, sizeof(resp), &resp);
if (err) {
ibdev_err(&dev->ib_dev,
"Failed to query adapter caps err %d", err);
return err;
}
caps->max_qp_count = min_t(u32, resp.max_sq, resp.max_rq);
caps->max_cq_count = resp.max_cq;
caps->max_mr_count = resp.max_mst;
caps->max_pd_count = 0x6000;
caps->max_qp_wr = min_t(u32,
0x100000 / GDMA_MAX_SQE_SIZE,
0x100000 / GDMA_MAX_RQE_SIZE);
caps->max_send_sge_count = 30;
caps->max_recv_sge_count = 15;
caps->page_size_cap = PAGE_SZ_BM;
return 0;
}
@ -740,7 +783,7 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
spec.eq.msix_index = 0;
err = mana_gd_create_mana_eq(&gc->mana_ib, &spec, &mdev->fatal_err_eq);
err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->fatal_err_eq);
if (err)
return err;
@ -791,7 +834,7 @@ int mana_ib_gd_create_rnic_adapter(struct mana_ib_dev *mdev)
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_ADAPTER, sizeof(req), sizeof(resp));
req.hdr.req.msg_version = GDMA_MESSAGE_V2;
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.notify_eq_id = mdev->fatal_err_eq->id;
if (mdev->adapter_caps.feature_flags & MANA_IB_FEATURE_CLIENT_ERROR_CQE_SUPPORT)
@ -816,7 +859,7 @@ int mana_ib_gd_destroy_rnic_adapter(struct mana_ib_dev *mdev)
gc = mdev_to_gc(mdev);
mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_ADAPTER, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
@ -843,7 +886,7 @@ int mana_ib_gd_add_gid(const struct ib_gid_attr *attr, void **context)
}
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.op = ADDR_OP_ADD;
req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4;
@ -873,7 +916,7 @@ int mana_ib_gd_del_gid(const struct ib_gid_attr *attr, void **context)
}
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_IP_ADDR, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.op = ADDR_OP_REMOVE;
req.sgid_type = (ntype == RDMA_NETWORK_IPV6) ? SGID_TYPE_IPV6 : SGID_TYPE_IPV4;
@ -896,7 +939,7 @@ int mana_ib_gd_config_mac(struct mana_ib_dev *mdev, enum mana_ib_addr_op op, u8
int err;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CONFIG_MAC_ADDR, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.op = op;
copy_in_reverse(req.mac_addr, mac, ETH_ALEN);
@ -917,8 +960,11 @@ int mana_ib_gd_create_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq, u32 do
struct mana_rnic_create_cq_req req = {};
int err;
if (!mdev->eqs)
return -EINVAL;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_CQ, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.gdma_region = cq->queue.gdma_region;
req.eq_id = mdev->eqs[cq->comp_vector]->id;
@ -950,7 +996,7 @@ int mana_ib_gd_destroy_cq(struct mana_ib_dev *mdev, struct mana_ib_cq *cq)
return 0;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_CQ, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.cq_handle = cq->cq_handle;
@ -976,7 +1022,7 @@ int mana_ib_gd_create_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp,
int err, i;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_RC_QP, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.pd_handle = pd->pd_handle;
req.send_cq_handle = send_cq->cq_handle;
@ -1012,7 +1058,7 @@ int mana_ib_gd_destroy_rc_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp)
int err;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_RC_QP, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.rc_qp_handle = qp->qp_handle;
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
@ -1035,7 +1081,7 @@ int mana_ib_gd_create_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp,
int err, i;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_CREATE_UD_QP, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.pd_handle = pd->pd_handle;
req.send_cq_handle = send_cq->cq_handle;
@ -1070,7 +1116,7 @@ int mana_ib_gd_destroy_ud_qp(struct mana_ib_dev *mdev, struct mana_ib_qp *qp)
int err;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_DESTROY_UD_QP, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.qp_handle = qp->qp_handle;
err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);

View File

@ -60,6 +60,7 @@ struct mana_ib_adapter_caps {
u32 max_recv_sge_count;
u32 max_inline_data_size;
u64 feature_flags;
u64 page_size_cap;
};
struct mana_ib_queue {
@ -543,6 +544,11 @@ static inline void mana_put_qp_ref(struct mana_ib_qp *qp)
complete(&qp->free);
}
static inline bool mana_ib_is_rnic(struct mana_ib_dev *mdev)
{
return mdev->gdma_dev->dev_id.type == GDMA_DEVICE_MANA_IB;
}
static inline struct net_device *mana_ib_get_netdev(struct ib_device *ibdev, u32 port)
{
struct mana_ib_dev *mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
@ -642,6 +648,7 @@ int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext);
int mana_ib_gd_query_adapter_caps(struct mana_ib_dev *mdev);
int mana_eth_query_adapter_caps(struct mana_ib_dev *mdev);
int mana_ib_create_eqs(struct mana_ib_dev *mdev);

View File

@ -5,8 +5,8 @@
#include "mana_ib.h"
#define VALID_MR_FLAGS \
(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ)
#define VALID_MR_FLAGS (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ |\
IB_ACCESS_REMOTE_ATOMIC | IB_ZERO_BASED)
#define VALID_DMA_MR_FLAGS (IB_ACCESS_LOCAL_WRITE)
@ -24,6 +24,9 @@ mana_ib_verbs_to_gdma_access_flags(int access_flags)
if (access_flags & IB_ACCESS_REMOTE_READ)
flags |= GDMA_ACCESS_FLAG_REMOTE_READ;
if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
flags |= GDMA_ACCESS_FLAG_REMOTE_ATOMIC;
return flags;
}
@ -48,7 +51,10 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
req.gva.virtual_address = mr_params->gva.virtual_address;
req.gva.access_flags = mr_params->gva.access_flags;
break;
case GDMA_MR_TYPE_ZBVA:
req.zbva.dma_region_handle = mr_params->zbva.dma_region_handle;
req.zbva.access_flags = mr_params->zbva.access_flags;
break;
default:
ibdev_dbg(&dev->ib_dev,
"invalid param (GDMA_MR_TYPE) passed, type %d\n",
@ -144,11 +150,18 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
dma_region_handle);
mr_params.pd_handle = pd->pd_handle;
mr_params.mr_type = GDMA_MR_TYPE_GVA;
mr_params.gva.dma_region_handle = dma_region_handle;
mr_params.gva.virtual_address = iova;
mr_params.gva.access_flags =
mana_ib_verbs_to_gdma_access_flags(access_flags);
if (access_flags & IB_ZERO_BASED) {
mr_params.mr_type = GDMA_MR_TYPE_ZBVA;
mr_params.zbva.dma_region_handle = dma_region_handle;
mr_params.zbva.access_flags =
mana_ib_verbs_to_gdma_access_flags(access_flags);
} else {
mr_params.mr_type = GDMA_MR_TYPE_GVA;
mr_params.gva.dma_region_handle = dma_region_handle;
mr_params.gva.virtual_address = iova;
mr_params.gva.access_flags =
mana_ib_verbs_to_gdma_access_flags(access_flags);
}
err = mana_ib_gd_create_mr(dev, mr, &mr_params);
if (err)

View File

@ -635,7 +635,6 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd,
{
struct mana_ib_dev *mdev = container_of(ibpd->device, struct mana_ib_dev, ib_dev);
struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
struct gdma_context *gc = mdev_to_gc(mdev);
u32 doorbell, queue_size;
int i, err;
@ -654,7 +653,7 @@ static int mana_ib_create_ud_qp(struct ib_qp *ibqp, struct ib_pd *ibpd,
goto destroy_queues;
}
}
doorbell = gc->mana_ib.doorbell;
doorbell = mdev->gdma_dev->doorbell;
err = create_shadow_queue(&qp->shadow_rq, attr->cap.max_recv_wr,
sizeof(struct ud_rq_shadow_wqe));
@ -736,7 +735,7 @@ static int mana_ib_gd_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
int err;
mana_gd_init_req_hdr(&req.hdr, MANA_IB_SET_QP_STATE, sizeof(req), sizeof(resp));
req.hdr.dev_id = gc->mana_ib.dev_id;
req.hdr.dev_id = mdev->gdma_dev->dev_id;
req.adapter = mdev->adapter_handle;
req.qp_handle = qp->qp_handle;
req.qp_state = attr->qp_state;

View File

@ -43,7 +43,7 @@
#define MAX_VFS 80
#define MAX_PEND_REQS_PER_FUNC 4
#define MAD_TIMEOUT_MS 2000
#define MAD_TIMEOUT_SEC 2
#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg)
#define mcg_error(fmt, arg...) pr_err(fmt, ##arg)
@ -270,7 +270,7 @@ static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad
if (!ret) {
/* calls mlx4_ib_mcg_timeout_handler */
queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
msecs_to_jiffies(MAD_TIMEOUT_MS));
secs_to_jiffies(MAD_TIMEOUT_SEC));
}
return ret;
@ -309,7 +309,7 @@ static int send_leave_to_wire(struct mcast_group *group, u8 join_state)
if (!ret) {
/* calls mlx4_ib_mcg_timeout_handler */
queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
msecs_to_jiffies(MAD_TIMEOUT_MS));
secs_to_jiffies(MAD_TIMEOUT_SEC));
}
return ret;
@ -1091,7 +1091,7 @@ static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy
for (i = 0; i < MAX_VFS; ++i)
clean_vf_mcast(ctx, i);
end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000);
end = jiffies + secs_to_jiffies(MAD_TIMEOUT_SEC + 3);
do {
count = 0;
mutex_lock(&ctx->mcg_table_lock);

View File

@ -1645,11 +1645,6 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
}
enum {
LEFTOVERS_MC,
LEFTOVERS_UC,
};
static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
struct mlx5_ib_flow_prio *ft_prio,
struct ib_flow_attr *flow_attr,
@ -1659,43 +1654,32 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de
struct mlx5_ib_flow_handler *handler = NULL;
static struct {
struct ib_flow_attr flow_attr;
struct ib_flow_spec_eth eth_flow;
} leftovers_specs[] = {
[LEFTOVERS_MC] = {
.flow_attr = {
.num_of_specs = 1,
.size = sizeof(leftovers_specs[0])
},
.eth_flow = {
.type = IB_FLOW_SPEC_ETH,
.size = sizeof(struct ib_flow_spec_eth),
.mask = {.dst_mac = {0x1} },
.val = {.dst_mac = {0x1} }
}
},
[LEFTOVERS_UC] = {
.flow_attr = {
.num_of_specs = 1,
.size = sizeof(leftovers_specs[0])
},
.eth_flow = {
.type = IB_FLOW_SPEC_ETH,
.size = sizeof(struct ib_flow_spec_eth),
.mask = {.dst_mac = {0x1} },
.val = {.dst_mac = {} }
}
}
};
struct ib_flow_attr flow_attr;
} leftovers_wc = { .flow_attr = { .num_of_specs = 1,
.size = sizeof(leftovers_wc) },
.eth_flow = {
.type = IB_FLOW_SPEC_ETH,
.size = sizeof(struct ib_flow_spec_eth),
.mask = { .dst_mac = { 0x1 } },
.val = { .dst_mac = { 0x1 } } } };
handler = create_flow_rule(dev, ft_prio,
&leftovers_specs[LEFTOVERS_MC].flow_attr,
dst);
static struct {
struct ib_flow_spec_eth eth_flow;
struct ib_flow_attr flow_attr;
} leftovers_uc = { .flow_attr = { .num_of_specs = 1,
.size = sizeof(leftovers_uc) },
.eth_flow = {
.type = IB_FLOW_SPEC_ETH,
.size = sizeof(struct ib_flow_spec_eth),
.mask = { .dst_mac = { 0x1 } },
.val = { .dst_mac = {} } } };
handler = create_flow_rule(dev, ft_prio, &leftovers_wc.flow_attr, dst);
if (!IS_ERR(handler) &&
flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
handler_ucast = create_flow_rule(dev, ft_prio,
&leftovers_specs[LEFTOVERS_UC].flow_attr,
dst);
&leftovers_uc.flow_attr, dst);
if (IS_ERR(handler_ucast)) {
mlx5_del_flow_rules(handler->rule);
ft_prio->refcount--;

View File

@ -485,6 +485,10 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_2X;
*active_speed = IB_SPEED_NDR;
break;
case MLX5E_PROT_MASK(MLX5E_200GAUI_1_200GBASE_CR1_KR1):
*active_width = IB_WIDTH_1X;
*active_speed = IB_SPEED_XDR;
break;
case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_HDR;
@ -493,10 +497,18 @@ static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_NDR;
break;
case MLX5E_PROT_MASK(MLX5E_400GAUI_2_400GBASE_CR2_KR2):
*active_width = IB_WIDTH_2X;
*active_speed = IB_SPEED_XDR;
break;
case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8):
*active_width = IB_WIDTH_8X;
*active_speed = IB_SPEED_NDR;
break;
case MLX5E_PROT_MASK(MLX5E_800GAUI_4_800GBASE_CR4_KR4):
*active_width = IB_WIDTH_4X;
*active_speed = IB_SPEED_XDR;
break;
default:
return -EINVAL;
}
@ -4422,17 +4434,6 @@ static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
mlx5_core_native_port_num(dev->mdev) - 1);
}
static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
{
dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
}
static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
{
mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
}
static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
{
int err;
@ -4662,9 +4663,6 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_UAR,
mlx5_ib_stage_uar_init,
mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),
@ -4722,9 +4720,6 @@ const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
mlx5_ib_stage_cong_debugfs_init,
mlx5_ib_stage_cong_debugfs_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_UAR,
mlx5_ib_stage_uar_init,
mlx5_ib_stage_uar_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_BFREG,
mlx5_ib_stage_bfrag_init,
mlx5_ib_stage_bfrag_cleanup),

View File

@ -351,6 +351,7 @@ struct mlx5_ib_flow_db {
#define MLX5_IB_UPD_XLT_PD BIT(4)
#define MLX5_IB_UPD_XLT_ACCESS BIT(5)
#define MLX5_IB_UPD_XLT_INDIRECT BIT(6)
#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7)
/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
*
@ -1005,7 +1006,6 @@ enum mlx5_ib_stages {
MLX5_IB_STAGE_ODP,
MLX5_IB_STAGE_COUNTERS,
MLX5_IB_STAGE_CONG_DEBUGFS,
MLX5_IB_STAGE_UAR,
MLX5_IB_STAGE_BFREG,
MLX5_IB_STAGE_PRE_IB_REG_UMR,
MLX5_IB_STAGE_WHITELIST_UID,
@ -1473,8 +1473,8 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev);
void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags);
int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags);
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
enum ib_uverbs_advise_mr_advice advice,
@ -1495,8 +1495,11 @@ static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
{
return 0;
}
static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags) {}
static inline int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
{
return -EOPNOTSUPP;
}
static inline int
mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,

View File

@ -525,7 +525,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
ent->fill_to_high_water = false;
if (ent->pending)
queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
msecs_to_jiffies(1000));
secs_to_jiffies(1));
else
mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
}
@ -576,7 +576,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
"add keys command failed, err %d\n",
err);
queue_delayed_work(cache->wq, &ent->dwork,
msecs_to_jiffies(1000));
secs_to_jiffies(1));
}
}
} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
@ -2051,7 +2051,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
ent->in_use--;
if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
msecs_to_jiffies(30 * 1000));
secs_to_jiffies(30));
ent->tmp_cleanup_scheduled = true;
}
spin_unlock_irq(&ent->mkeys_queue.lock);

View File

@ -34,6 +34,9 @@
#include <linux/kernel.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>
#include "mlx5_ib.h"
#include "cmd.h"
@ -158,41 +161,50 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
}
}
static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
if (umem_dma & ODP_READ_ALLOWED_BIT)
mtt_entry |= MLX5_IB_MTT_READ;
if (umem_dma & ODP_WRITE_ALLOWED_BIT)
mtt_entry |= MLX5_IB_MTT_WRITE;
return mtt_entry;
}
static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
static int populate_mtt(__be64 *pas, size_t start, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
{
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
dma_addr_t pa;
bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
struct pci_p2pdma_map_state p2pdma_state = {};
struct ib_device *dev = odp->umem.ibdev;
size_t i;
if (flags & MLX5_IB_UPD_XLT_ZAP)
return;
return 0;
for (i = 0; i < nentries; i++) {
pa = odp->dma_list[idx + i];
pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
unsigned long pfn = odp->map.pfn_list[start + i];
dma_addr_t dma_addr;
pfn = odp->map.pfn_list[start + i];
if (!(pfn & HMM_PFN_VALID))
/* ODP initialization */
continue;
dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map,
start + i, &p2pdma_state);
if (ib_dma_mapping_error(dev, dma_addr))
return -EFAULT;
dma_addr |= MLX5_IB_MTT_READ;
if ((pfn & HMM_PFN_WRITE) && !downgrade)
dma_addr |= MLX5_IB_MTT_WRITE;
pas[i] = cpu_to_be64(dma_addr);
odp->npages++;
}
return 0;
}
void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
struct mlx5_ib_mr *mr, int flags)
{
if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
populate_klm(xlt, idx, nentries, mr, flags);
return 0;
} else {
populate_mtt(xlt, idx, nentries, mr, flags);
return populate_mtt(xlt, idx, nentries, mr, flags);
}
}
@ -303,8 +315,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
* estimate the cost of another UMR vs. the cost of bigger
* UMR.
*/
if (umem_odp->dma_list[idx] &
(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) {
if (!in_block) {
blk_start_idx = idx;
in_block = 1;
@ -687,7 +698,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
{
int page_shift, ret, np;
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
u64 access_mask;
u64 access_mask = 0;
u64 start_idx;
bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
@ -695,12 +706,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
if (flags & MLX5_PF_FLAGS_ENABLE)
xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
if (flags & MLX5_PF_FLAGS_DOWNGRADE)
xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
page_shift = odp->page_shift;
start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT;
if (odp->umem.writable && !downgrade)
access_mask |= ODP_WRITE_ALLOWED_BIT;
access_mask |= HMM_PFN_WRITE;
np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
if (np < 0)

View File

@ -21,8 +21,10 @@ mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn)
spin_lock_irqsave(&table->lock, flags);
common = radix_tree_lookup(&table->tree, rsn);
if (common)
if (common && !common->invalid)
refcount_inc(&common->refcount);
else
common = NULL;
spin_unlock_irqrestore(&table->lock, flags);
@ -178,6 +180,18 @@ static int create_resource_common(struct mlx5_ib_dev *dev,
return 0;
}
static void modify_resource_common_state(struct mlx5_ib_dev *dev,
struct mlx5_core_qp *qp,
bool invalid)
{
struct mlx5_qp_table *table = &dev->qp_table;
unsigned long flags;
spin_lock_irqsave(&table->lock, flags);
qp->common.invalid = invalid;
spin_unlock_irqrestore(&table->lock, flags);
}
static void destroy_resource_common(struct mlx5_ib_dev *dev,
struct mlx5_core_qp *qp)
{
@ -609,8 +623,20 @@ int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen,
int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev,
struct mlx5_core_qp *rq)
{
int ret;
/* The rq destruction can be called again in case it fails, hence we
* mark the common resource as invalid and only once FW destruction
* is completed successfully we actually destroy the resources.
*/
modify_resource_common_state(dev, rq, true);
ret = destroy_rq_tracked(dev, rq->qpn, rq->uid);
if (ret) {
modify_resource_common_state(dev, rq, false);
return ret;
}
destroy_resource_common(dev, rq);
return destroy_rq_tracked(dev, rq->qpn, rq->uid);
return 0;
}
static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid)

View File

@ -840,7 +840,17 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
size_to_map = npages * desc_size;
dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
/*
* npages is the maximum number of pages to map, but we
* can't guarantee that all pages are actually mapped.
*
* For example, if page is p2p of type which is not supported
* for mapping, the number of pages mapped will be less than
* requested.
*/
err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
if (err)
return err;
dma_sync_single_for_device(ddev, sg.addr, sg.length,
DMA_TO_DEVICE);
sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);

View File

@ -144,7 +144,7 @@ static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order)
buddy->max_order = max_order;
spin_lock_init(&buddy->lock);
buddy->bits = kcalloc(buddy->max_order + 1, sizeof(long *),
buddy->bits = kcalloc(buddy->max_order + 1, sizeof(*buddy->bits),
GFP_KERNEL);
buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free,
GFP_KERNEL);

View File

@ -56,7 +56,7 @@ static int usnic_uiom_dma_fault(struct iommu_domain *domain,
unsigned long iova, int flags,
void *token)
{
usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n",
usnic_err("Device %s iommu fault domain 0x%p va 0x%lx flags 0x%x\n",
dev_name(dev),
domain, iova, flags);
return -ENOSYS;

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config RDMA_RXE
tristate "Software RDMA over Ethernet (RoCE) driver"
depends on INET && PCI && INFINIBAND
depends on INET && PCI && INFINIBAND && 64BIT
depends on INFINIBAND_VIRT_DMA
select NET_UDP_TUNNEL
select CRC32

View File

@ -101,6 +101,8 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
}
}

View File

@ -70,9 +70,9 @@ int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma,
void *addr, int length, enum rxe_mr_copy_dir dir);
int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
int sg_nents, unsigned int *sg_offset);
int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val);
int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val);
enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
enum rxe_mr_lookup_type type);
int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length);
@ -193,13 +193,16 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
/* rxe_odp.c */
extern const struct mmu_interval_notifier_ops rxe_mn_ops;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
#if defined CONFIG_INFINIBAND_ON_DEMAND_PAGING
int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
u64 iova, int access_flags, struct rxe_mr *mr);
int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
enum rxe_mr_copy_dir dir);
int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val);
enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val);
int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
unsigned int length);
enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline int
rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
@ -212,9 +215,19 @@ static inline int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
{
return -EOPNOTSUPP;
}
static inline int
static inline enum resp_states
rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val)
u64 compare, u64 swap_add, u64 *orig_val)
{
return RESPST_ERR_UNSUPPORTED_OPCODE;
}
static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
unsigned int length)
{
return -EOPNOTSUPP;
}
static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
u64 iova, u64 value)
{
return RESPST_ERR_UNSUPPORTED_OPCODE;
}

View File

@ -424,7 +424,7 @@ int copy_data(
return err;
}
int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
{
unsigned int page_offset;
unsigned long index;
@ -433,16 +433,6 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
int err;
u8 *va;
/* mr must be valid even if length is zero */
if (WARN_ON(!mr))
return -EINVAL;
if (length == 0)
return 0;
if (mr->ibmr.type == IB_MR_TYPE_DMA)
return -EFAULT;
err = mr_check_range(mr, iova, length);
if (err)
return err;
@ -454,7 +444,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
if (!page)
return -EFAULT;
bytes = min_t(unsigned int, length,
mr_page_size(mr) - page_offset);
mr_page_size(mr) - page_offset);
va = kmap_local_page(page);
arch_wb_cache_pmem(va + page_offset, bytes);
@ -468,11 +458,33 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
return 0;
}
int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length)
{
int err;
/* mr must be valid even if length is zero */
if (WARN_ON(!mr))
return -EINVAL;
if (length == 0)
return 0;
if (mr->ibmr.type == IB_MR_TYPE_DMA)
return -EFAULT;
if (is_odp_mr(mr))
err = rxe_odp_flush_pmem_iova(mr, start, length);
else
err = rxe_mr_flush_pmem_iova(mr, start, length);
return err;
}
/* Guarantee atomicity of atomic operations at the machine level. */
DEFINE_SPINLOCK(atomic_ops_lock);
int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val)
enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val)
{
unsigned int page_offset;
struct page *page;
@ -524,27 +536,15 @@ int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
kunmap_local(va);
return 0;
return RESPST_NONE;
}
#if defined CONFIG_64BIT
/* only implemented or called for 64 bit architectures */
int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
{
unsigned int page_offset;
struct page *page;
u64 *va;
/* ODP is not supported right now. WIP. */
if (is_odp_mr(mr))
return RESPST_ERR_UNSUPPORTED_OPCODE;
/* See IBA oA19-28 */
if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
rxe_dbg_mr(mr, "mr not in valid state\n");
return RESPST_ERR_RKEY_VIOLATION;
}
if (mr->ibmr.type == IB_MR_TYPE_DMA) {
page_offset = iova & (PAGE_SIZE - 1);
page = ib_virt_dma_to_page(iova);
@ -572,20 +572,12 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
}
va = kmap_local_page(page);
/* Do atomic write after all prior operations have completed */
smp_store_release(&va[page_offset >> 3], value);
kunmap_local(va);
return 0;
return RESPST_NONE;
}
#else
int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
{
return RESPST_ERR_UNSUPPORTED_OPCODE;
}
#endif
int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
{

View File

@ -4,6 +4,7 @@
*/
#include <linux/hmm.h>
#include <linux/libnvdimm.h>
#include <rdma/ib_umem_odp.h>
@ -26,7 +27,7 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
start = max_t(u64, ib_umem_start(umem_odp), range->start);
end = min_t(u64, ib_umem_end(umem_odp), range->end);
/* update umem_odp->dma_list */
/* update umem_odp->map.pfn_list */
ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
mutex_unlock(&umem_odp->umem_mutex);
@ -44,12 +45,11 @@ static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcn
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
u64 access_mask;
u64 access_mask = 0;
int np;
access_mask = ODP_READ_ALLOWED_BIT;
if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
access_mask |= ODP_WRITE_ALLOWED_BIT;
access_mask |= HMM_PFN_WRITE;
/*
* ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
@ -124,8 +124,8 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
return err;
}
static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
u64 iova, int length, u32 perm)
static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, u64 iova,
int length)
{
bool need_fault = false;
u64 addr;
@ -137,7 +137,7 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
while (addr < iova + length) {
idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
if (!(umem_odp->dma_list[idx] & perm)) {
if (!(umem_odp->map.pfn_list[idx] & HMM_PFN_VALID)) {
need_fault = true;
break;
}
@ -147,23 +147,28 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
return need_fault;
}
static unsigned long rxe_odp_iova_to_index(struct ib_umem_odp *umem_odp, u64 iova)
{
return (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
}
static unsigned long rxe_odp_iova_to_page_offset(struct ib_umem_odp *umem_odp, u64 iova)
{
return iova & (BIT(umem_odp->page_shift) - 1);
}
static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags)
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
bool need_fault;
u64 perm;
int err;
if (unlikely(length < 1))
return -EINVAL;
perm = ODP_READ_ALLOWED_BIT;
if (!(flags & RXE_PAGEFAULT_RDONLY))
perm |= ODP_WRITE_ALLOWED_BIT;
mutex_lock(&umem_odp->umem_mutex);
need_fault = rxe_check_pagefault(umem_odp, iova, length, perm);
need_fault = rxe_check_pagefault(umem_odp, iova, length);
if (need_fault) {
mutex_unlock(&umem_odp->umem_mutex);
@ -173,7 +178,7 @@ static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u
if (err < 0)
return err;
need_fault = rxe_check_pagefault(umem_odp, iova, length, perm);
need_fault = rxe_check_pagefault(umem_odp, iova, length);
if (need_fault)
return -EFAULT;
}
@ -190,13 +195,13 @@ static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
size_t offset;
u8 *user_va;
idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
offset = iova & (BIT(umem_odp->page_shift) - 1);
idx = rxe_odp_iova_to_index(umem_odp, iova);
offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
while (length > 0) {
u8 *src, *dest;
page = hmm_pfn_to_page(umem_odp->pfn_list[idx]);
page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
user_va = kmap_local_page(page);
if (!user_va)
return -EFAULT;
@ -255,8 +260,9 @@ int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
return err;
}
static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val)
static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova,
int opcode, u64 compare,
u64 swap_add, u64 *orig_val)
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
unsigned int page_offset;
@ -277,9 +283,9 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
return RESPST_ERR_RKEY_VIOLATION;
}
idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
page_offset = iova & (BIT(umem_odp->page_shift) - 1);
page = hmm_pfn_to_page(umem_odp->pfn_list[idx]);
idx = rxe_odp_iova_to_index(umem_odp, iova);
page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
if (!page)
return RESPST_ERR_RKEY_VIOLATION;
@ -304,11 +310,11 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
kunmap_local(va);
return 0;
return RESPST_NONE;
}
int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val)
enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
u64 compare, u64 swap_add, u64 *orig_val)
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
int err;
@ -324,3 +330,91 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
return err;
}
int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
unsigned int length)
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
unsigned int page_offset;
unsigned long index;
struct page *page;
unsigned int bytes;
int err;
u8 *va;
err = rxe_odp_map_range_and_lock(mr, iova, length,
RXE_PAGEFAULT_DEFAULT);
if (err)
return err;
while (length > 0) {
index = rxe_odp_iova_to_index(umem_odp, iova);
page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
if (!page) {
mutex_unlock(&umem_odp->umem_mutex);
return -EFAULT;
}
bytes = min_t(unsigned int, length,
mr_page_size(mr) - page_offset);
va = kmap_local_page(page);
arch_wb_cache_pmem(va + page_offset, bytes);
kunmap_local(va);
length -= bytes;
iova += bytes;
page_offset = 0;
}
mutex_unlock(&umem_odp->umem_mutex);
return 0;
}
enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
{
struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
unsigned int page_offset;
unsigned long index;
struct page *page;
int err;
u64 *va;
/* See IBA oA19-28 */
err = mr_check_range(mr, iova, sizeof(value));
if (unlikely(err)) {
rxe_dbg_mr(mr, "iova out of range\n");
return RESPST_ERR_RKEY_VIOLATION;
}
err = rxe_odp_map_range_and_lock(mr, iova, sizeof(value),
RXE_PAGEFAULT_DEFAULT);
if (err)
return RESPST_ERR_RKEY_VIOLATION;
page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
index = rxe_odp_iova_to_index(umem_odp, iova);
page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
if (!page) {
mutex_unlock(&umem_odp->umem_mutex);
return RESPST_ERR_RKEY_VIOLATION;
}
/* See IBA A19.4.2 */
if (unlikely(page_offset & 0x7)) {
mutex_unlock(&umem_odp->umem_mutex);
rxe_dbg_mr(mr, "misaligned address\n");
return RESPST_ERR_MISALIGNED_ATOMIC;
}
va = kmap_local_page(page);
/* Do atomic write after all prior operations have completed */
smp_store_release(&va[page_offset >> 3], value);
kunmap_local(va);
mutex_unlock(&umem_odp->umem_mutex);
return RESPST_NONE;
}

View File

@ -53,12 +53,9 @@ enum rxe_device_param {
| IB_DEVICE_MEM_WINDOW
| IB_DEVICE_FLUSH_GLOBAL
| IB_DEVICE_FLUSH_PERSISTENT
#ifdef CONFIG_64BIT
| IB_DEVICE_MEM_WINDOW_TYPE_2B
| IB_DEVICE_ATOMIC_WRITE,
#else
| IB_DEVICE_MEM_WINDOW_TYPE_2B,
#endif /* CONFIG_64BIT */
RXE_MAX_SGE = 32,
RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) +
sizeof(struct ib_sge) * RXE_MAX_SGE,

View File

@ -811,7 +811,12 @@ static void rxe_qp_do_cleanup(struct work_struct *work)
spin_unlock_irqrestore(&qp->state_lock, flags);
qp->qp_timeout_jiffies = 0;
if (qp_type(qp) == IB_QPT_RC) {
/* In the function timer_setup, .function is initialized. If .function
* is NULL, it indicates the function timer_setup is not called, the
* timer is not initialized. Or else, the timer is initialized.
*/
if (qp_type(qp) == IB_QPT_RC && qp->retrans_timer.function &&
qp->rnr_nak_timer.function) {
timer_delete_sync(&qp->retrans_timer);
timer_delete_sync(&qp->rnr_nak_timer);
}

View File

@ -649,10 +649,6 @@ static enum resp_states process_flush(struct rxe_qp *qp,
struct rxe_mr *mr = qp->resp.mr;
struct resp_res *res = qp->resp.res;
/* ODP is not supported right now. WIP. */
if (is_odp_mr(mr))
return RESPST_ERR_UNSUPPORTED_OPCODE;
/* oA19-14, oA19-15 */
if (res && res->replay)
return RESPST_ACKNOWLEDGE;
@ -753,7 +749,16 @@ static enum resp_states atomic_write_reply(struct rxe_qp *qp,
value = *(u64 *)payload_addr(pkt);
iova = qp->resp.va + qp->resp.offset;
err = rxe_mr_do_atomic_write(mr, iova, value);
/* See IBA oA19-28 */
if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
rxe_dbg_mr(mr, "mr not in valid state\n");
return RESPST_ERR_RKEY_VIOLATION;
}
if (is_odp_mr(mr))
err = rxe_odp_do_atomic_write(mr, iova, value);
else
err = rxe_mr_do_atomic_write(mr, iova, value);
if (err)
return err;

View File

@ -85,17 +85,17 @@ static bool is_done(struct rxe_task *task)
/* do_task is a wrapper for the three tasks (requester,
* completer, responder) and calls them in a loop until
* they return a non-zero value. It is called either
* directly by rxe_run_task or indirectly if rxe_sched_task
* schedules the task. They must call __reserve_if_idle to
* move the task to busy before calling or scheduling.
* The task can also be moved to drained or invalid
* by calls to rxe_cleanup_task or rxe_disable_task.
* In that case tasks which get here are not executed but
* just flushed. The tasks are designed to look to see if
* there is work to do and then do part of it before returning
* here with a return value of zero until all the work
* has been consumed then it returns a non-zero value.
* they return a non-zero value. It is called indirectly
* when rxe_sched_task schedules the task. They must
* call __reserve_if_idle to move the task to busy before
* calling or scheduling. The task can also be moved to
* drained or invalid by calls to rxe_cleanup_task or
* rxe_disable_task. In that case tasks which get here
* are not executed but just flushed. The tasks are
* designed to look to see if there is work to do and
* then do part of it before returning here with a return
* value of zero until all the work has been consumed then
* it returns a non-zero value.
* The number of times the task can be run is limited by
* max iterations so one task cannot hold the cpu forever.
* If the limit is hit and work remains the task is rescheduled.
@ -234,24 +234,6 @@ void rxe_cleanup_task(struct rxe_task *task)
spin_unlock_irqrestore(&task->lock, flags);
}
/* run the task inline if it is currently idle
* cannot call do_task holding the lock
*/
void rxe_run_task(struct rxe_task *task)
{
unsigned long flags;
bool run;
WARN_ON(rxe_read(task->qp) <= 0);
spin_lock_irqsave(&task->lock, flags);
run = __reserve_if_idle(task);
spin_unlock_irqrestore(&task->lock, flags);
if (run)
do_task(task);
}
/* schedule the task to run later as a work queue entry.
* the queue_work call can be called holding
* the lock.

View File

@ -47,8 +47,6 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp,
/* cleanup task */
void rxe_cleanup_task(struct rxe_task *task);
void rxe_run_task(struct rxe_task *task);
void rxe_sched_task(struct rxe_task *task);
/* keep a task from scheduling */

View File

@ -718,7 +718,7 @@ static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len)
"MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__)
#define siw_dbg_cep(cep, fmt, ...) \
ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%pK] %s: " fmt, \
ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%p] %s: " fmt, \
cep, __func__, ##__VA_ARGS__)
void siw_cq_flush(struct siw_cq *cq);

View File

@ -72,7 +72,7 @@ int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
wc->opcode = map_wc_opcode[cqe->opcode];
wc->status = map_cqe_status[cqe->status].ib;
siw_dbg_cq(cq,
"idx %u, type %d, flags %2x, id 0x%pK\n",
"idx %u, type %d, flags %2x, id 0x%p\n",
cq->cq_get % cq->num_cqe, cqe->opcode,
cqe->flags, (void *)(uintptr_t)cqe->id);
} else {

View File

@ -17,30 +17,6 @@
/* Stag lookup is based on its index part only (24 bits). */
#define SIW_STAG_MAX_INDEX 0x00ffffff
/*
* The code avoids special Stag of zero and tries to randomize
* STag values between 1 and SIW_STAG_MAX_INDEX.
*/
int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
{
struct xa_limit limit = XA_LIMIT(1, SIW_STAG_MAX_INDEX);
u32 id, next;
get_random_bytes(&next, 4);
next &= SIW_STAG_MAX_INDEX;
if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
GFP_KERNEL) < 0)
return -ENOMEM;
/* Set the STag index part */
m->stag = id << 8;
siw_dbg_mem(m, "new MEM object\n");
return 0;
}
/*
* siw_mem_id2obj()
*
@ -181,10 +157,10 @@ int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
*/
if (addr < mem->va || addr + len > mem->va + mem->len) {
siw_dbg_pd(pd, "MEM interval len %d\n", len);
siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
siw_dbg_pd(pd, "[0x%p, 0x%p] out of bounds\n",
(void *)(uintptr_t)addr,
(void *)(uintptr_t)(addr + len));
siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
siw_dbg_pd(pd, "[0x%p, 0x%p] STag=0x%08x\n",
(void *)(uintptr_t)mem->va,
(void *)(uintptr_t)(mem->va + mem->len),
mem->stag);

View File

@ -12,7 +12,6 @@ void siw_umem_release(struct siw_umem *umem);
struct siw_pbl *siw_pbl_alloc(u32 num_buf);
dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx);
struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index);
int siw_mem_add(struct siw_device *sdev, struct siw_mem *m);
int siw_invalidate_stag(struct ib_pd *pd, u32 stag);
int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
enum ib_access_flags perms, int len);

View File

@ -38,7 +38,7 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
p = siw_get_upage(umem, dest_addr);
if (unlikely(!p)) {
pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n",
__func__, qp_id(rx_qp(srx)),
(void *)(uintptr_t)dest_addr,
(void *)(uintptr_t)umem->fp_addr);
@ -51,7 +51,7 @@ static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
pg_off = dest_addr & ~PAGE_MASK;
bytes = min(len, (int)PAGE_SIZE - pg_off);
siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes);
dest = kmap_atomic(p);
rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
@ -105,11 +105,11 @@ static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
{
int rv;
siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len);
rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
if (unlikely(rv)) {
pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n",
qp_id(rx_qp(srx)), __func__, len, kva, rv);
return rv;

View File

@ -936,7 +936,7 @@ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
rv = -EINVAL;
break;
}
siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n",
sqe->opcode, sqe->flags,
(void *)(uintptr_t)sqe->id);
@ -1102,7 +1102,7 @@ int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
siw_dbg_qp(qp, "error %d\n", rv);
*bad_wr = wr;
}
return rv > 0 ? 0 : rv;
return rv;
}
int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
@ -1332,7 +1332,7 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
struct siw_device *sdev = to_siw_dev(pd->device);
int rv;
siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
siw_dbg_pd(pd, "start: 0x%p, va: 0x%p, len: %llu\n",
(void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
(unsigned long long)len);
@ -1525,7 +1525,7 @@ int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
mem->len = base_mr->length;
mem->va = base_mr->iova;
siw_dbg_mem(mem,
"%llu bytes, start 0x%pK, %u SLE to %u entries\n",
"%llu bytes, start 0x%p, %u SLE to %u entries\n",
mem->len, (void *)(uintptr_t)mem->va, num_sle,
pbl->num_buf);
}

View File

@ -391,6 +391,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
case GDMA_EQE_HWC_INIT_EQ_ID_DB:
case GDMA_EQE_HWC_INIT_DATA:
case GDMA_EQE_HWC_INIT_DONE:
case GDMA_EQE_HWC_SOC_SERVICE:
case GDMA_EQE_RNIC_QP_FATAL:
if (!eq->eq.callback)
break;
@ -964,6 +965,7 @@ int mana_gd_verify_vf_version(struct pci_dev *pdev)
err, resp.hdr.status);
return err ? err : -EPROTO;
}
gc->pf_cap_flags1 = resp.pf_cap_flags1;
if (resp.pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) {
err = mana_gd_query_hwc_timeout(pdev, &hwc->hwc_timeout);
if (err) {
@ -1004,7 +1006,6 @@ int mana_gd_register_device(struct gdma_dev *gd)
return 0;
}
EXPORT_SYMBOL_NS(mana_gd_register_device, "NET_MANA");
int mana_gd_deregister_device(struct gdma_dev *gd)
{
@ -1035,7 +1036,6 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
return err;
}
EXPORT_SYMBOL_NS(mana_gd_deregister_device, "NET_MANA");
u32 mana_gd_wq_avail_space(struct gdma_queue *wq)
{
@ -1469,10 +1469,14 @@ static int mana_gd_setup(struct pci_dev *pdev)
mana_gd_init_registers(pdev);
mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base);
gc->service_wq = alloc_ordered_workqueue("gdma_service_wq", 0);
if (!gc->service_wq)
return -ENOMEM;
err = mana_gd_setup_irqs(pdev);
if (err) {
dev_err(gc->dev, "Failed to setup IRQs: %d\n", err);
return err;
goto free_workqueue;
}
err = mana_hwc_create_channel(gc);
@ -1498,6 +1502,8 @@ static int mana_gd_setup(struct pci_dev *pdev)
mana_hwc_destroy_channel(gc);
remove_irq:
mana_gd_remove_irqs(pdev);
free_workqueue:
destroy_workqueue(gc->service_wq);
dev_err(&pdev->dev, "%s failed (error %d)\n", __func__, err);
return err;
}
@ -1509,6 +1515,8 @@ static void mana_gd_cleanup(struct pci_dev *pdev)
mana_hwc_destroy_channel(gc);
mana_gd_remove_irqs(pdev);
destroy_workqueue(gc->service_wq);
dev_dbg(&pdev->dev, "mana gdma cleanup successful\n");
}
@ -1578,8 +1586,14 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
if (err)
goto cleanup_gd;
err = mana_rdma_probe(&gc->mana_ib);
if (err)
goto cleanup_mana;
return 0;
cleanup_mana:
mana_remove(&gc->mana, false);
cleanup_gd:
mana_gd_cleanup(pdev);
unmap_bar:
@ -1607,6 +1621,7 @@ static void mana_gd_remove(struct pci_dev *pdev)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, false);
mana_gd_cleanup(pdev);
@ -1630,6 +1645,7 @@ static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
mana_gd_cleanup(pdev);
@ -1654,6 +1670,10 @@ static int mana_gd_resume(struct pci_dev *pdev)
if (err)
return err;
err = mana_rdma_probe(&gc->mana_ib);
if (err)
return err;
return 0;
}
@ -1664,6 +1684,7 @@ static void mana_gd_shutdown(struct pci_dev *pdev)
dev_info(&pdev->dev, "Shutdown was called\n");
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
mana_gd_cleanup(pdev);

View File

@ -112,11 +112,13 @@ static void mana_hwc_handle_resp(struct hw_channel_context *hwc, u32 resp_len,
static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
struct gdma_event *event)
{
union hwc_init_soc_service_type service_data;
struct hw_channel_context *hwc = ctx;
struct gdma_dev *gd = hwc->gdma_dev;
union hwc_init_type_data type_data;
union hwc_init_eq_id_db eq_db;
u32 type, val;
int ret;
switch (event->type) {
case GDMA_EQE_HWC_INIT_EQ_ID_DB:
@ -199,7 +201,24 @@ static void mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self,
}
break;
case GDMA_EQE_HWC_SOC_SERVICE:
service_data.as_uint32 = event->details[0];
type = service_data.type;
switch (type) {
case GDMA_SERVICE_TYPE_RDMA_SUSPEND:
case GDMA_SERVICE_TYPE_RDMA_RESUME:
ret = mana_rdma_service_event(gd->gdma_context, type);
if (ret)
dev_err(hwc->dev, "Failed to schedule adev service event: %d\n",
ret);
break;
default:
dev_warn(hwc->dev, "Received unknown SOC service type %u\n", type);
break;
}
break;
default:
dev_warn(hwc->dev, "Received unknown gdma event %u\n", event->type);
/* Ignore unknown events, which should never happen. */

View File

@ -2950,7 +2950,7 @@ static void remove_adev(struct gdma_dev *gd)
gd->adev = NULL;
}
static int add_adev(struct gdma_dev *gd)
static int add_adev(struct gdma_dev *gd, const char *name)
{
struct auxiliary_device *adev;
struct mana_adev *madev;
@ -2966,7 +2966,7 @@ static int add_adev(struct gdma_dev *gd)
goto idx_fail;
adev->id = ret;
adev->name = "rdma";
adev->name = name;
adev->dev.parent = gd->gdma_context->dev;
adev->dev.release = adev_release;
madev->mdev = gd;
@ -2998,6 +2998,70 @@ static int add_adev(struct gdma_dev *gd)
return ret;
}
static void mana_rdma_service_handle(struct work_struct *work)
{
struct mana_service_work *serv_work =
container_of(work, struct mana_service_work, work);
struct gdma_dev *gd = serv_work->gdma_dev;
struct device *dev = gd->gdma_context->dev;
int ret;
if (READ_ONCE(gd->rdma_teardown))
goto out;
switch (serv_work->event) {
case GDMA_SERVICE_TYPE_RDMA_SUSPEND:
if (!gd->adev || gd->is_suspended)
break;
remove_adev(gd);
gd->is_suspended = true;
break;
case GDMA_SERVICE_TYPE_RDMA_RESUME:
if (!gd->is_suspended)
break;
ret = add_adev(gd, "rdma");
if (ret)
dev_err(dev, "Failed to add adev on resume: %d\n", ret);
else
gd->is_suspended = false;
break;
default:
dev_warn(dev, "unknown adev service event %u\n",
serv_work->event);
break;
}
out:
kfree(serv_work);
}
int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event)
{
struct gdma_dev *gd = &gc->mana_ib;
struct mana_service_work *serv_work;
if (gd->dev_id.type != GDMA_DEVICE_MANA_IB) {
/* RDMA device is not detected on pci */
return 0;
}
serv_work = kzalloc(sizeof(*serv_work), GFP_ATOMIC);
if (!serv_work)
return -ENOMEM;
serv_work->event = event;
serv_work->gdma_dev = gd;
INIT_WORK(&serv_work->work, mana_rdma_service_handle);
queue_work(gc->service_wq, &serv_work->work);
return 0;
}
int mana_probe(struct gdma_dev *gd, bool resuming)
{
struct gdma_context *gc = gd->gdma_context;
@ -3085,7 +3149,7 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
}
}
err = add_adev(gd);
err = add_adev(gd, "eth");
out:
if (err) {
mana_remove(gd, false);
@ -3159,6 +3223,44 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
dev_dbg(dev, "%s succeeded\n", __func__);
}
int mana_rdma_probe(struct gdma_dev *gd)
{
int err = 0;
if (gd->dev_id.type != GDMA_DEVICE_MANA_IB) {
/* RDMA device is not detected on pci */
return err;
}
err = mana_gd_register_device(gd);
if (err)
return err;
err = add_adev(gd, "rdma");
if (err)
mana_gd_deregister_device(gd);
return err;
}
void mana_rdma_remove(struct gdma_dev *gd)
{
struct gdma_context *gc = gd->gdma_context;
if (gd->dev_id.type != GDMA_DEVICE_MANA_IB) {
/* RDMA device is not detected on pci */
return;
}
WRITE_ONCE(gd->rdma_teardown, true);
flush_workqueue(gc->service_wq);
if (gd->adev)
remove_adev(gd);
mana_gd_deregister_device(gd);
}
struct net_device *mana_get_primary_netdev(struct mana_context *ac,
u32 port_index,
netdevice_tracker *tracker)

33
include/linux/hmm-dma.h Normal file
View File

@ -0,0 +1,33 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Copyright (c) 2024 NVIDIA Corporation & Affiliates */
#ifndef LINUX_HMM_DMA_H
#define LINUX_HMM_DMA_H
#include <linux/dma-mapping.h>
struct dma_iova_state;
struct pci_p2pdma_map_state;
/*
* struct hmm_dma_map - array of PFNs and DMA addresses
*
* @state: DMA IOVA state
* @pfns: array of PFNs
* @dma_list: array of DMA addresses
* @dma_entry_size: size of each DMA entry in the array
*/
struct hmm_dma_map {
struct dma_iova_state state;
unsigned long *pfn_list;
dma_addr_t *dma_list;
size_t dma_entry_size;
};
int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
size_t nr_entries, size_t dma_entry_size);
void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map);
dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
size_t idx,
struct pci_p2pdma_map_state *p2pdma_state);
bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx);
#endif /* LINUX_HMM_DMA_H */

View File

@ -23,6 +23,10 @@ struct mmu_interval_notifier;
* HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID)
* HMM_PFN_ERROR - accessing the pfn is impossible and the device should
* fail. ie poisoned memory, special pages, no vma, etc
* HMM_PFN_P2PDMA - P2P page
* HMM_PFN_P2PDMA_BUS - Bus mapped P2P transfer
* HMM_PFN_DMA_MAPPED - Flag preserved on input-to-output transformation
* to mark that page is already DMA mapped
*
* On input:
* 0 - Return the current state of the page, do not fault it.
@ -36,13 +40,21 @@ enum hmm_pfn_flags {
HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1),
HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2),
HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3),
HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8),
/*
* Sticky flags, carried from input to output,
* don't forget to update HMM_PFN_INOUT_FLAGS
*/
HMM_PFN_DMA_MAPPED = 1UL << (BITS_PER_LONG - 4),
HMM_PFN_P2PDMA = 1UL << (BITS_PER_LONG - 5),
HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 11),
/* Input flags */
HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
HMM_PFN_FLAGS = 0xFFUL << HMM_PFN_ORDER_SHIFT,
HMM_PFN_FLAGS = ~((1UL << HMM_PFN_ORDER_SHIFT) - 1),
};
/*
@ -57,6 +69,14 @@ static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn)
return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS);
}
/*
* hmm_pfn_to_phys() - return physical address pointed to by a device entry
*/
static inline phys_addr_t hmm_pfn_to_phys(unsigned long hmm_pfn)
{
return __pfn_to_phys(hmm_pfn & ~HMM_PFN_FLAGS);
}
/*
* hmm_pfn_to_map_order() - return the CPU mapping size order
*

View File

@ -398,6 +398,7 @@ struct mlx5_core_rsc_common {
enum mlx5_res_type res;
refcount_t refcount;
struct completion free;
bool invalid;
};
struct mlx5_uars_page {

View File

@ -60,6 +60,7 @@ enum gdma_eqe_type {
GDMA_EQE_HWC_INIT_DONE = 131,
GDMA_EQE_HWC_SOC_RECONFIG = 132,
GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133,
GDMA_EQE_HWC_SOC_SERVICE = 134,
GDMA_EQE_RNIC_QP_FATAL = 176,
};
@ -70,6 +71,18 @@ enum {
GDMA_DEVICE_MANA_IB = 3,
};
enum gdma_service_type {
GDMA_SERVICE_TYPE_NONE = 0,
GDMA_SERVICE_TYPE_RDMA_SUSPEND = 1,
GDMA_SERVICE_TYPE_RDMA_RESUME = 2,
};
struct mana_service_work {
struct work_struct work;
struct gdma_dev *gdma_dev;
enum gdma_service_type event;
};
struct gdma_resource {
/* Protect the bitmap */
spinlock_t lock;
@ -224,6 +237,8 @@ struct gdma_dev {
void *driver_data;
struct auxiliary_device *adev;
bool is_suspended;
bool rdma_teardown;
};
/* MANA_PAGE_SIZE is the DMA unit */
@ -407,6 +422,10 @@ struct gdma_context {
/* Azure RDMA adapter */
struct gdma_dev mana_ib;
u64 pf_cap_flags1;
struct workqueue_struct *service_wq;
};
static inline bool mana_gd_is_mana(struct gdma_dev *gd)
@ -553,6 +572,7 @@ enum {
*/
#define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2)
#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3)
#define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4)
#define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5)
/* Driver can handle holes (zeros) in the device list */
@ -707,20 +727,6 @@ struct gdma_query_hwc_timeout_resp {
u32 reserved;
};
enum atb_page_size {
ATB_PAGE_SIZE_4K,
ATB_PAGE_SIZE_8K,
ATB_PAGE_SIZE_16K,
ATB_PAGE_SIZE_32K,
ATB_PAGE_SIZE_64K,
ATB_PAGE_SIZE_128K,
ATB_PAGE_SIZE_256K,
ATB_PAGE_SIZE_512K,
ATB_PAGE_SIZE_1M,
ATB_PAGE_SIZE_2M,
ATB_PAGE_SIZE_MAX,
};
enum gdma_mr_access_flags {
GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0),
GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1),
@ -815,6 +821,8 @@ enum gdma_mr_type {
* address that is set up in the MST
*/
GDMA_MR_TYPE_GVA = 2,
/* Guest zero-based address MRs */
GDMA_MR_TYPE_ZBVA = 4,
};
struct gdma_create_mr_params {
@ -826,6 +834,10 @@ struct gdma_create_mr_params {
u64 virtual_address;
enum gdma_mr_access_flags access_flags;
} gva;
struct {
u64 dma_region_handle;
enum gdma_mr_access_flags access_flags;
} zbva;
};
};
@ -841,7 +853,10 @@ struct gdma_create_mr_request {
u64 virtual_address;
enum gdma_mr_access_flags access_flags;
} gva;
struct {
u64 dma_region_handle;
enum gdma_mr_access_flags access_flags;
} zbva;
};
u32 reserved_2;
};/* HW DATA */
@ -893,4 +908,6 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle);
void mana_register_debugfs(void);
void mana_unregister_debugfs(void);
int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event);
#endif /* _GDMA_H */

View File

@ -49,6 +49,15 @@ union hwc_init_type_data {
};
}; /* HW DATA */
union hwc_init_soc_service_type {
u32 as_uint32;
struct {
u32 value : 28;
u32 type : 4;
};
}; /* HW DATA */
struct hwc_rx_oob {
u32 type : 6;
u32 eom : 1;

View File

@ -489,6 +489,9 @@ int mana_detach(struct net_device *ndev, bool from_close);
int mana_probe(struct gdma_dev *gd, bool resuming);
void mana_remove(struct gdma_dev *gd, bool suspending);
int mana_rdma_probe(struct gdma_dev *gd);
void mana_rdma_remove(struct gdma_dev *gd);
void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev);
int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames,
u32 flags);

View File

@ -480,23 +480,12 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id,
const void *private_data,
u8 private_data_len);
#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */
/**
* ib_send_cm_mra - Sends a message receipt acknowledgement to a connection
* message.
* ib_prepare_cm_mra - Prepares to send a message receipt acknowledgment to a
connection message in case duplicates are received.
* @cm_id: Connection identifier associated with the connection message.
* @service_timeout: The lower 5-bits specify the maximum time required for
* the sender to reply to the connection message. The upper 3-bits
* specify additional control flags.
* @private_data: Optional user-defined private data sent with the
* message receipt acknowledgement.
* @private_data_len: Size of the private data buffer, in bytes.
*/
int ib_send_cm_mra(struct ib_cm_id *cm_id,
u8 service_timeout,
const void *private_data,
u8 private_data_len);
int ib_prepare_cm_mra(struct ib_cm_id *cm_id);
/**
* ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning

View File

@ -8,23 +8,17 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
#include <linux/hmm-dma.h>
struct ib_umem_odp {
struct ib_umem umem;
struct mmu_interval_notifier notifier;
struct pid *tgid;
/* An array of the pfns included in the on-demand paging umem. */
unsigned long *pfn_list;
struct hmm_dma_map map;
/*
* An array with DMA addresses mapped for pfns in pfn_list.
* The lower two bits designate access permissions.
* See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT.
*/
dma_addr_t *dma_list;
/*
* The umem_mutex protects the page_list and dma_list fields of an ODP
* The umem_mutex protects the page_list field of an ODP
* umem, allowing only a single thread to map/unmap pages. The mutex
* also protects access to the mmu notifier counters.
*/
@ -67,19 +61,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
umem_odp->page_shift;
}
/*
* The lower 2 bits of the DMA address signal the R/W permissions for
* the entry. To upgrade the permissions, provide the appropriate
* bitmask to the map_dma_pages function.
*
* Be aware that upgrading a mapped address might result in change of
* the DMA address for the page.
*/
#define ODP_READ_ALLOWED_BIT (1<<0ULL)
#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_umem_odp *

View File

@ -314,17 +314,19 @@ enum ib_atomic_cap {
};
enum ib_odp_general_cap_bits {
IB_ODP_SUPPORT = 1 << 0,
IB_ODP_SUPPORT_IMPLICIT = 1 << 1,
IB_ODP_SUPPORT = IB_UVERBS_ODP_SUPPORT,
IB_ODP_SUPPORT_IMPLICIT = IB_UVERBS_ODP_SUPPORT_IMPLICIT,
};
enum ib_odp_transport_cap_bits {
IB_ODP_SUPPORT_SEND = 1 << 0,
IB_ODP_SUPPORT_RECV = 1 << 1,
IB_ODP_SUPPORT_WRITE = 1 << 2,
IB_ODP_SUPPORT_READ = 1 << 3,
IB_ODP_SUPPORT_ATOMIC = 1 << 4,
IB_ODP_SUPPORT_SRQ_RECV = 1 << 5,
IB_ODP_SUPPORT_SEND = IB_UVERBS_ODP_SUPPORT_SEND,
IB_ODP_SUPPORT_RECV = IB_UVERBS_ODP_SUPPORT_RECV,
IB_ODP_SUPPORT_WRITE = IB_UVERBS_ODP_SUPPORT_WRITE,
IB_ODP_SUPPORT_READ = IB_UVERBS_ODP_SUPPORT_READ,
IB_ODP_SUPPORT_ATOMIC = IB_UVERBS_ODP_SUPPORT_ATOMIC,
IB_ODP_SUPPORT_SRQ_RECV = IB_UVERBS_ODP_SUPPORT_SRQ_RECV,
IB_ODP_SUPPORT_FLUSH = IB_UVERBS_ODP_SUPPORT_FLUSH,
IB_ODP_SUPPORT_ATOMIC_WRITE = IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE,
};
struct ib_odp_caps {

View File

@ -388,6 +388,5 @@ void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid,
union ib_gid *dgid);
struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *cm_id);
struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res);
#endif /* RDMA_CM_H */

View File

@ -233,6 +233,22 @@ struct ib_uverbs_ex_query_device {
__u32 reserved;
};
enum ib_uverbs_odp_general_cap_bits {
IB_UVERBS_ODP_SUPPORT = 1 << 0,
IB_UVERBS_ODP_SUPPORT_IMPLICIT = 1 << 1,
};
enum ib_uverbs_odp_transport_cap_bits {
IB_UVERBS_ODP_SUPPORT_SEND = 1 << 0,
IB_UVERBS_ODP_SUPPORT_RECV = 1 << 1,
IB_UVERBS_ODP_SUPPORT_WRITE = 1 << 2,
IB_UVERBS_ODP_SUPPORT_READ = 1 << 3,
IB_UVERBS_ODP_SUPPORT_ATOMIC = 1 << 4,
IB_UVERBS_ODP_SUPPORT_SRQ_RECV = 1 << 5,
IB_UVERBS_ODP_SUPPORT_FLUSH = 1 << 6,
IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE = 1 << 7,
};
struct ib_uverbs_odp_caps {
__aligned_u64 general_caps;
struct {

262
mm/hmm.c
View File

@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@ -39,13 +41,21 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
enum {
/* These flags are carried from input-to-output */
HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
HMM_PFN_P2PDMA_BUS,
};
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, unsigned long cpu_flags)
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++)
range->hmm_pfns[i] = cpu_flags;
for (; addr < end; addr += PAGE_SIZE, i++) {
range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
range->hmm_pfns[i] |= cpu_flags;
}
return 0;
}
@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
hmm_pfns[i] = pfn | cpu_flags;
for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
hmm_pfns[i] |= pfn | cpu_flags;
}
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long cpu_flags;
pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
uint64_t new_pfn_flags = 0;
if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
*hmm_pfn = 0;
return 0;
goto out;
}
if (!pte_present(pte)) {
@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
*hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
return 0;
new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
goto out;
}
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (!required_fault) {
*hmm_pfn = 0;
return 0;
}
if (!required_fault)
goto out;
if (!non_swap_entry(entry))
goto fault;
@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_unmap(ptep);
return -EFAULT;
}
*hmm_pfn = HMM_PFN_ERROR;
return 0;
new_pfn_flags = HMM_PFN_ERROR;
goto out;
}
*hmm_pfn = pte_pfn(pte) | cpu_flags;
new_pfn_flags = pte_pfn(pte) | cpu_flags;
out:
*hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags;
return 0;
fault:
@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn)
hmm_pfns[i] = pfn | cpu_flags;
for (i = 0; i < npages; ++i, ++pfn) {
hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
hmm_pfns[i] |= pfn | cpu_flags;
}
goto out_unlock;
}
@ -507,8 +521,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
range->hmm_pfns[i] = pfn | cpu_flags;
for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
range->hmm_pfns[i] |= pfn | cpu_flags;
}
spin_unlock(ptl);
return 0;
@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
/**
* hmm_dma_map_alloc - Allocate HMM map structure
* @dev: device to allocate structure for
* @map: HMM map to allocate
* @nr_entries: number of entries in the map
* @dma_entry_size: size of the DMA entry in the map
*
* Allocate the HMM map structure and all the lists it contains.
* Return 0 on success, -ENOMEM on failure.
*/
int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
size_t nr_entries, size_t dma_entry_size)
{
bool dma_need_sync = false;
bool use_iova;
WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
/*
* The HMM API violates our normal DMA buffer ownership rules and can't
* transfer buffer ownership. The dma_addressing_limited() check is a
* best approximation to ensure no swiotlb buffering happens.
*/
#ifdef CONFIG_DMA_NEED_SYNC
dma_need_sync = !dev->dma_skip_sync;
#endif /* CONFIG_DMA_NEED_SYNC */
if (dma_need_sync || dma_addressing_limited(dev))
return -EOPNOTSUPP;
map->dma_entry_size = dma_entry_size;
map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
GFP_KERNEL | __GFP_NOWARN);
if (!map->pfn_list)
return -ENOMEM;
use_iova = dma_iova_try_alloc(dev, &map->state, 0,
nr_entries * PAGE_SIZE);
if (!use_iova && dma_need_unmap(dev)) {
map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
GFP_KERNEL | __GFP_NOWARN);
if (!map->dma_list)
goto err_dma;
}
return 0;
err_dma:
kvfree(map->pfn_list);
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
/**
* hmm_dma_map_free - iFree HMM map structure
* @dev: device to free structure from
* @map: HMM map containing the various lists and state
*
* Free the HMM map structure and all the lists it contains.
*/
void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
{
if (dma_use_iova(&map->state))
dma_iova_free(dev, &map->state);
kvfree(map->pfn_list);
kvfree(map->dma_list);
}
EXPORT_SYMBOL_GPL(hmm_dma_map_free);
/**
* hmm_dma_map_pfn - Map a physical HMM page to DMA address
* @dev: Device to map the page for
* @map: HMM map
* @idx: Index into the PFN and dma address arrays
* @p2pdma_state: PCI P2P state.
*
* dma_alloc_iova() allocates IOVA based on the size specified by their use in
* iova->size. Call this function after IOVA allocation to link whole @page
* to get the DMA address. Note that very first call to this function
* will have @offset set to 0 in the IOVA space allocated from
* dma_alloc_iova(). For subsequent calls to this function on same @iova,
* @offset needs to be advanced by the caller with the size of previous
* page that was linked + DMA address returned for the previous page that was
* linked by this function.
*/
dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
size_t idx,
struct pci_p2pdma_map_state *p2pdma_state)
{
struct dma_iova_state *state = &map->state;
dma_addr_t *dma_addrs = map->dma_list;
unsigned long *pfns = map->pfn_list;
struct page *page = hmm_pfn_to_page(pfns[idx]);
phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
size_t offset = idx * map->dma_entry_size;
unsigned long attrs = 0;
dma_addr_t dma_addr;
int ret;
if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
!(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
/*
* We are in this flow when there is a need to resync flags,
* for example when page was already linked in prefetch call
* with READ flag and now we need to add WRITE flag
*
* This page was already programmed to HW and we don't want/need
* to unlink and link it again just to resync flags.
*/
if (dma_use_iova(state))
return state->addr + offset;
/*
* Without dma_need_unmap, the dma_addrs array is NULL, thus we
* need to regenerate the address below even if there already
* was a mapping. But !dma_need_unmap implies that the
* mapping stateless, so this is fine.
*/
if (dma_need_unmap(dev))
return dma_addrs[idx];
/* Continue to remapping */
}
switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
case PCI_P2PDMA_MAP_NONE:
break;
case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
attrs |= DMA_ATTR_SKIP_CPU_SYNC;
pfns[idx] |= HMM_PFN_P2PDMA;
break;
case PCI_P2PDMA_MAP_BUS_ADDR:
pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
default:
return DMA_MAPPING_ERROR;
}
if (dma_use_iova(state)) {
ret = dma_iova_link(dev, state, paddr, offset,
map->dma_entry_size, DMA_BIDIRECTIONAL,
attrs);
if (ret)
goto error;
ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
if (ret) {
dma_iova_unlink(dev, state, offset, map->dma_entry_size,
DMA_BIDIRECTIONAL, attrs);
goto error;
}
dma_addr = state->addr + offset;
} else {
if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
goto error;
dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
DMA_BIDIRECTIONAL);
if (dma_mapping_error(dev, dma_addr))
goto error;
if (dma_need_unmap(dev))
dma_addrs[idx] = dma_addr;
}
pfns[idx] |= HMM_PFN_DMA_MAPPED;
return dma_addr;
error:
pfns[idx] &= ~HMM_PFN_P2PDMA;
return DMA_MAPPING_ERROR;
}
EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
/**
* hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
* @dev: Device to unmap the page from
* @map: HMM map
* @idx: Index of the PFN to unmap
*
* Returns true if the PFN was mapped and has been unmapped, false otherwise.
*/
bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
{
const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
struct dma_iova_state *state = &map->state;
dma_addr_t *dma_addrs = map->dma_list;
unsigned long *pfns = map->pfn_list;
unsigned long attrs = 0;
if ((pfns[idx] & valid_dma) != valid_dma)
return false;
if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
; /* no need to unmap bus address P2P mappings */
else if (dma_use_iova(state)) {
if (pfns[idx] & HMM_PFN_P2PDMA)
attrs |= DMA_ATTR_SKIP_CPU_SYNC;
dma_iova_unlink(dev, state, idx * map->dma_entry_size,
map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
} else if (dma_need_unmap(dev))
dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
DMA_BIDIRECTIONAL);
pfns[idx] &=
~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
return true;
}
EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);